QCM6490 SSR 记述（一）：项目场景、问题描述、原因分析、解决方案

140 阅读 0 评论 93 点赞

我是靠谱客的博主辛勤柠檬，这篇文章主要介绍QCM6490 SSR 记述（一）项目场景：问题描述原因分析：解决方案：，现在分享给大家，希望可以做个参考。

项目场景：

modem 子系统crash导致系统crash，偶现。且SSR已经关闭。

如果disable_restart_work 设置为DISABLE_SSR，那么不管什么（wlan adsp-audio/sensor modem etc）触发了SSR，都不会重启

复制代码

1
2
3
4
5
/* Magic value: storing it in disable_restart_work switches SSR off. */
#define DISABLE_SSR 0x9889deed
/* If set to 0x9889deed, call to subsystem_restart_dev() returns immediately */
/* Previous declaration, without an initializer: static uint disable_restart_work; */
/* Initialized to DISABLE_SSR here, so restart requests are ignored from the start. */
static uint disable_restart_work = DISABLE_SSR;

常用的打开log:

复制代码

1
2
3
adb shell "echo 'file subsystem_restart.c +p' > /sys/kernel/debug/dynamic_debug/control"  
adb shell "echo 'file subsys-pil-tz.c +p' > /sys/kernel/debug/dynamic_debug/control"

问题描述

1.有以下几种情况会导致系统重启/crash:

复制代码

//subsystem_restart_dev
//01.If a system reboot/shutdown is underway ignore subsystem errors.
//However, print a message so that we know that a subsystem behaved unexpectedly here.
/*
 * extern declaration of the system_state variable together with its
 * enum system_states type. The SSR checks quoted in this listing test it
 * against SYSTEM_RESTART and SYSTEM_POWER_OFF.
 */
extern enum system_states {
  	SYSTEM_BOOTING,
  	SYSTEM_SCHEDULING,
  	SYSTEM_RUNNING,
  	SYSTEM_HALT,
  	SYSTEM_POWER_OFF,
  	SYSTEM_RESTART,
  	SYSTEM_SUSPEND,
  } system_state;
  if (system_state == SYSTEM_RESTART
		|| system_state == SYSTEM_POWER_OFF) {
		pr_err("%s crashed during a system poweroff/shutdown.n", name);
		return -EBUSY;
}

//02.disable_restart_work = DISABLE_SSR;直接跳过

if (disable_restart_work == DISABLE_SSR) {
		pr_err("subsys-restart: Ignoring restart request for %sn",
									name);
		return 0;
	}

//03.restart_level
	switch (dev->restart_level) {

case RESET_SUBSYS_COUPLED://related 已经确认是这里
		__subsystem_restart_dev(dev);
		break;
	case RESET_SOC://system
		__pm_stay_awake(dev->ssr_wlock);
		schedule_work(&dev->device_restart_work);
		return 0;
	default:
		panic("subsys-restart: Unknown restart level!n");
		break;
	}
//__subsystem_restart_dev 
//04.正常的情况下，应该是track->p_state为SUBSYS_NORMAL；dev->track.state为SUBSYS_ONLINE；否则系统重启
	if (track->p_state != SUBSYS_CRASHED &&
					dev->track.state == SUBSYS_ONLINE) {
		if (track->p_state != SUBSYS_RESTARTING) {
			track->p_state = SUBSYS_CRASHED;
			__pm_stay_awake(dev->ssr_wlock);
			queue_work(ssr_wq, &dev->work);//触发子系统重启
		} else {
			pr_err("Subsystem %s crashed during SSR!", name);
		}
	} else
		WARN(dev->track.state == SUBSYS_OFFLINE,
			"SSR aborted: %s subsystem not onlinen", name);

//	INIT_WORK(&subsys->work, subsystem_restart_wq_func);
//05.再次检测系统状态,系统关机重启abort SSR 
	if (system_state == SYSTEM_RESTART
		|| system_state == SYSTEM_POWER_OFF) {
		WARN(1, "SSR aborted: %s, system reboot/shutdown is under wayn",
			desc->name);
		pr_err("SSR aborted: %s, system reboot/shutdown is under wayn",
			desc->name);
		return;
	}
//06.子系统没有起来，abort SSR
	if (dev->track.state == SUBSYS_OFFLINE) {
		mutex_unlock(&track->lock);
		WARN(1, "SSR aborted: %s subsystem not onlinen", desc->name);
		pr_err("SSR aborted: %s subsystem not onlinen",
			desc->name);
		return;
	}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
//subsystem_restart_dev
//01.If a system reboot/shutdown is underway ignore subsystem errors.
//However, print a message so that we know that a subsystem behaved unexpectedly here.
/*
 * extern declaration of the system_state variable together with its
 * enum system_states type. The SSR checks quoted in this listing test it
 * against SYSTEM_RESTART and SYSTEM_POWER_OFF.
 */
extern enum system_states {
  	SYSTEM_BOOTING,
  	SYSTEM_SCHEDULING,
  	SYSTEM_RUNNING,
  	SYSTEM_HALT,
  	SYSTEM_POWER_OFF,
  	SYSTEM_RESTART,
  	SYSTEM_SUSPEND,
  } system_state;
  if (system_state == SYSTEM_RESTART
		|| system_state == SYSTEM_POWER_OFF) {
		pr_err("%s crashed during a system poweroff/shutdown.n", name);
		return -EBUSY;
}


//02.disable_restart_work = DISABLE_SSR;直接跳过


	if (disable_restart_work == DISABLE_SSR) {
		pr_err("subsys-restart: Ignoring restart request for %sn",
									name);
		return 0;
	}


//03.restart_level
	switch (dev->restart_level) {

	case RESET_SUBSYS_COUPLED://related 已经确认是这里
		__subsystem_restart_dev(dev);
		break;
	case RESET_SOC://system
		__pm_stay_awake(dev->ssr_wlock);
		schedule_work(&dev->device_restart_work);
		return 0;
	default:
		panic("subsys-restart: Unknown restart level!n");
		break;
	}
//__subsystem_restart_dev 
//04.正常的情况下，应该是track->p_state为SUBSYS_NORMAL；dev->track.state为SUBSYS_ONLINE；否则系统重启
	if (track->p_state != SUBSYS_CRASHED &&
					dev->track.state == SUBSYS_ONLINE) {
		if (track->p_state != SUBSYS_RESTARTING) {
			track->p_state = SUBSYS_CRASHED;
			__pm_stay_awake(dev->ssr_wlock);
			queue_work(ssr_wq, &dev->work);//触发子系统重启
		} else {
			pr_err("Subsystem %s crashed during SSR!", name);
		}
	} else
		WARN(dev->track.state == SUBSYS_OFFLINE,
			"SSR aborted: %s subsystem not onlinen", name);

//	INIT_WORK(&subsys->work, subsystem_restart_wq_func);
//05.再次检测系统状态,系统关机重启abort SSR 
	if (system_state == SYSTEM_RESTART
		|| system_state == SYSTEM_POWER_OFF) {
		WARN(1, "SSR aborted: %s, system reboot/shutdown is under wayn",
			desc->name);
		pr_err("SSR aborted: %s, system reboot/shutdown is under wayn",
			desc->name);
		return;
	}
//06.子系统没有起来，abort SSR
	if (dev->track.state == SUBSYS_OFFLINE) {
		mutex_unlock(&track->lock);
		WARN(1, "SSR aborted: %s subsystem not onlinen", desc->name);
		pr_err("SSR aborted: %s subsystem not onlinen",
			desc->name);
		return;
	}

2.首先肯定是wlan adsp(audio sensor) modem 子系统异常触发中断或者直接进入下面的函数：
subsystem_restart_dev

3.内核中有许多地方调用类似BUG（）的语句，它非常像一个内核运行时的断言，意味着本来不该执行到BUG（）这条语句，一旦执行即抛出Oops。 BUG（）的定义为：

复制代码

1
2
3
4
5
/*
 * BUG() - report an internal error that must never happen, then panic.
 *
 * Prints the file, line and function where the macro was expanded and then
 * calls panic("BUG!"). Every line of the multi-line definition must end in
 * a backslash continuation, and the message must end in "\n"; the
 * do { ... } while (0) wrapper lets BUG(); be used as a single statement.
 */
#define BUG() do { \
	printk("BUG at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
	panic("BUG!"); \
} while (0)

BUG（）还有一个变体叫BUG_ON（），它的内部会引用BUG（）

复制代码

1
2
/*
 * BUG_ON() - invoke BUG() when @condition evaluates to true.
 *
 * The condition is wrapped in unlikely(); the do { ... } while (0) wrapper
 * lets BUG_ON(x); be used as a single statement.
 */
#define BUG_ON(condition)			\
	do {					\
		if (unlikely(condition))	\
			BUG();			\
	} while (0)

其中的panic（）定义在kernel/panic.c中，会导致内核崩溃，并打印Oops。
内核还有一个稍微弱一些的WARN_ON（）：当括号中的条件成立时，内核会打印栈回溯，但不会panic（）。它表示内核抛出一个警告，暗示发生了某种不太合理的事情。

4.CONFIG_SETUP_SSR_NOTIF_TIMEOUTS 这个宏控可以关闭，没有什么影响

原因分析：

目前出现一个问题：modem子系统重启偶现不生效，系统仍然panic。

我这边加了个延时，手动触发modem crash，可以模拟出“Subsystem modem crashed during SSR!”：这是因为前一次modem子系统重启尚未完成，又触发了下一次modem子系统重启，此时p_state为SUBSYS_RESTARTING，从而导致panic。正常情况下不应该出现这么频繁的子系统重启；为防止这种情况，可以加个标志位，等上一次modem子系统重启完成之后，才进行下一次子系统重启。

模拟方法：

复制代码

/*
 * subsystem_restart_wq_func() - work handler that runs one SSR sequence.
 * @work: work item embedded in the struct subsys_device being restarted.
 *
 * Operates on the device itself, or on every device in its SoC restart
 * order when one is set. The sequence is: notify + shut down, ramdump
 * notification, mark the tracking state SUBSYS_RESTARTING, collect
 * ramdumps, free memory, notify + power up. On exit the tracking state is
 * set back to SUBSYS_NORMAL and __pm_relax() is called on dev->ssr_wlock.
 *
 * meig_work_flag is a vendor-added guard: a request that arrives while a
 * previous sequence is still marked in flight is dropped. The flag is
 * cleared on every exit path taken after it was set; otherwise an aborted
 * restart would leave it at 1 and all later requests would be dropped.
 *
 * NOTE(review): meig_work_flag is tested and set here without a lock or an
 * atomic operation, and it appears to be a single file-level variable rather
 * than per-device state - confirm against its declaration.
 */
static void subsystem_restart_wq_func(struct work_struct *work)
{
	struct subsys_device *dev = container_of(work,
						struct subsys_device, work);
	struct subsys_device **list;
	struct subsys_desc *desc = dev->desc;
	struct subsys_soc_restart_order *order = dev->restart_order;
	struct subsys_tracking *track;
	unsigned int count;
	unsigned long flags;
	int ret;

	/*
	 * It's OK to not take the registration lock at this point.
	 * This is because the subsystem list inside the relevant
	 * restart order is not being traversed.
	 */
	if (order) {
		list = order->subsys_ptrs;
		count = order->count;
		track = &order->track;
	} else {
		list = &dev;
		count = 1;
		track = &dev->track;
	}

	/* Drop this request while a previous restart sequence is in flight. */
	if (meig_work_flag == 1) {
		pr_err("wait complete at the last time\n");
		return;
	}
	meig_work_flag = 1;

	/*
	 * If a system reboot/shutdown is under way, ignore subsystem errors.
	 * However, print a message so that we know that a subsystem behaved
	 * unexpectedly here.
	 */
	if (system_state == SYSTEM_RESTART
		|| system_state == SYSTEM_POWER_OFF) {
		WARN(1, "SSR aborted: %s, system reboot/shutdown is under way\n",
			desc->name);
		pr_err("SSR aborted: %s, system reboot/shutdown is under way\n",
			desc->name);
		/* Nothing was started: release the in-flight guard. */
		meig_work_flag = 0;
		return;
	}

	mutex_lock(&track->lock);
	do_epoch_check(dev);

	if (dev->track.state == SUBSYS_OFFLINE) {
		mutex_unlock(&track->lock);
		WARN(1, "SSR aborted: %s subsystem not online\n", desc->name);
		pr_err("SSR aborted: %s subsystem not online\n",
			desc->name);
		/* Nothing was started: release the in-flight guard. */
		meig_work_flag = 0;
		return;
	}

	/*
	 * It's necessary to take the registration lock because the subsystem
	 * list in the SoC restart order will be traversed and it shouldn't be
	 * changed until _this_ restart sequence completes.
	 */
	mutex_lock(&soc_order_reg_lock);

	pr_err("[%s:%d]: Starting restart sequence for %s\n",
			current->comm, current->pid, desc->name);
	notify_each_subsys_device(list, count, SUBSYS_BEFORE_SHUTDOWN, NULL);
	ret = for_each_subsys_device(list, count, NULL, subsystem_shutdown);
	if (ret)
		goto err;
	notify_each_subsys_device(list, count, SUBSYS_AFTER_SHUTDOWN, NULL);

	notify_each_subsys_device(list, count, SUBSYS_RAMDUMP_NOTIFICATION,
									NULL);

	spin_lock_irqsave(&track->s_lock, flags);
	track->p_state = SUBSYS_RESTARTING;
	spin_unlock_irqrestore(&track->s_lock, flags);

	/* Collect ram dumps for all subsystems in order here */
	for_each_subsys_device(list, count, NULL, subsystem_ramdump);

	for_each_subsys_device(list, count, NULL, subsystem_free_memory);

	notify_each_subsys_device(list, count, SUBSYS_BEFORE_POWERUP, NULL);
	ret = for_each_subsys_device(list, count, NULL, subsystem_powerup);
	if (ret)
		goto err;
	notify_each_subsys_device(list, count, SUBSYS_AFTER_POWERUP, NULL);

	pr_err("[%s:%d]: Restart sequence for %s completed.\n",
			current->comm, current->pid, desc->name);

	/*
	 * Reproduction aid (added for the experiment, not part of the normal
	 * flow): keep the sequence "in flight" for 3 s while p_state is still
	 * SUBSYS_RESTARTING. Sending "send_data 75 37 03 00 00" from QXDM
	 * several times during this window crashes the modem again and hits
	 * panic("Subsystem %s crashed during SSR!", name).
	 */
	msleep(3000);

err:
	/* Reset subsys count */
	if (ret)
		dev->count = 0;

	mutex_unlock(&soc_order_reg_lock);
	mutex_unlock(&track->lock);

	spin_lock_irqsave(&track->s_lock, flags);
	/* Leftover debug trace marking the end of the sequence. */
	pr_err("zhanghong 6666666666666\n");
	track->p_state = SUBSYS_NORMAL;
	meig_work_flag = 0;
	__pm_relax(dev->ssr_wlock);
	spin_unlock_irqrestore(&track->s_lock, flags);
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/*
 * subsystem_restart_wq_func() - work handler that runs one SSR sequence.
 * @work: work item embedded in the struct subsys_device being restarted.
 *
 * Operates on the device itself, or on every device in its SoC restart
 * order when one is set. The sequence is: notify + shut down, ramdump
 * notification, mark the tracking state SUBSYS_RESTARTING, collect
 * ramdumps, free memory, notify + power up. On exit the tracking state is
 * set back to SUBSYS_NORMAL and __pm_relax() is called on dev->ssr_wlock.
 *
 * meig_work_flag is a vendor-added guard: a request that arrives while a
 * previous sequence is still marked in flight is dropped. The flag is
 * cleared on every exit path taken after it was set; otherwise an aborted
 * restart would leave it at 1 and all later requests would be dropped.
 *
 * NOTE(review): meig_work_flag is tested and set here without a lock or an
 * atomic operation, and it appears to be a single file-level variable rather
 * than per-device state - confirm against its declaration.
 */
static void subsystem_restart_wq_func(struct work_struct *work)
{
	struct subsys_device *dev = container_of(work,
						struct subsys_device, work);
	struct subsys_device **list;
	struct subsys_desc *desc = dev->desc;
	struct subsys_soc_restart_order *order = dev->restart_order;
	struct subsys_tracking *track;
	unsigned int count;
	unsigned long flags;
	int ret;

	/*
	 * It's OK to not take the registration lock at this point.
	 * This is because the subsystem list inside the relevant
	 * restart order is not being traversed.
	 */
	if (order) {
		list = order->subsys_ptrs;
		count = order->count;
		track = &order->track;
	} else {
		list = &dev;
		count = 1;
		track = &dev->track;
	}

	/* Drop this request while a previous restart sequence is in flight. */
	if (meig_work_flag == 1) {
		pr_err("wait complete at the last time\n");
		return;
	}
	meig_work_flag = 1;

	/*
	 * If a system reboot/shutdown is under way, ignore subsystem errors.
	 * However, print a message so that we know that a subsystem behaved
	 * unexpectedly here.
	 */
	if (system_state == SYSTEM_RESTART
		|| system_state == SYSTEM_POWER_OFF) {
		WARN(1, "SSR aborted: %s, system reboot/shutdown is under way\n",
			desc->name);
		pr_err("SSR aborted: %s, system reboot/shutdown is under way\n",
			desc->name);
		/* Nothing was started: release the in-flight guard. */
		meig_work_flag = 0;
		return;
	}

	mutex_lock(&track->lock);
	do_epoch_check(dev);

	if (dev->track.state == SUBSYS_OFFLINE) {
		mutex_unlock(&track->lock);
		WARN(1, "SSR aborted: %s subsystem not online\n", desc->name);
		pr_err("SSR aborted: %s subsystem not online\n",
			desc->name);
		/* Nothing was started: release the in-flight guard. */
		meig_work_flag = 0;
		return;
	}

	/*
	 * It's necessary to take the registration lock because the subsystem
	 * list in the SoC restart order will be traversed and it shouldn't be
	 * changed until _this_ restart sequence completes.
	 */
	mutex_lock(&soc_order_reg_lock);

	pr_err("[%s:%d]: Starting restart sequence for %s\n",
			current->comm, current->pid, desc->name);
	notify_each_subsys_device(list, count, SUBSYS_BEFORE_SHUTDOWN, NULL);
	ret = for_each_subsys_device(list, count, NULL, subsystem_shutdown);
	if (ret)
		goto err;
	notify_each_subsys_device(list, count, SUBSYS_AFTER_SHUTDOWN, NULL);

	notify_each_subsys_device(list, count, SUBSYS_RAMDUMP_NOTIFICATION,
									NULL);

	spin_lock_irqsave(&track->s_lock, flags);
	track->p_state = SUBSYS_RESTARTING;
	spin_unlock_irqrestore(&track->s_lock, flags);

	/* Collect ram dumps for all subsystems in order here */
	for_each_subsys_device(list, count, NULL, subsystem_ramdump);

	for_each_subsys_device(list, count, NULL, subsystem_free_memory);

	notify_each_subsys_device(list, count, SUBSYS_BEFORE_POWERUP, NULL);
	ret = for_each_subsys_device(list, count, NULL, subsystem_powerup);
	if (ret)
		goto err;
	notify_each_subsys_device(list, count, SUBSYS_AFTER_POWERUP, NULL);

	pr_err("[%s:%d]: Restart sequence for %s completed.\n",
			current->comm, current->pid, desc->name);

	/*
	 * Reproduction aid (added for the experiment, not part of the normal
	 * flow): keep the sequence "in flight" for 3 s while p_state is still
	 * SUBSYS_RESTARTING. Sending "send_data 75 37 03 00 00" from QXDM
	 * several times during this window crashes the modem again and hits
	 * panic("Subsystem %s crashed during SSR!", name).
	 */
	msleep(3000);

err:
	/* Reset subsys count */
	if (ret)
		dev->count = 0;

	mutex_unlock(&soc_order_reg_lock);
	mutex_unlock(&track->lock);

	spin_lock_irqsave(&track->s_lock, flags);
	/* Leftover debug trace marking the end of the sequence. */
	pr_err("zhanghong 6666666666666\n");
	track->p_state = SUBSYS_NORMAL;
	meig_work_flag = 0;
	__pm_relax(dev->ssr_wlock);
	spin_unlock_irqrestore(&track->s_lock, flags);
}

在这里插入图片描述

规避方案：
1.subsystem_restart_wq_func 函数开始的地方加个延时，等待上一次modem重启完成
2.将 panic("Subsystem %s crashed during SSR!", name); 改为仅打印日志

解决方案：

1.如何设置系统restart_level为related
device/qcom/common/rootdir/Android.mk

复制代码

1
2
3
4
5
6
7
8
9
#<!-- Enable SSR for user version[Solution]Add use/debug control for init.qcom.rc. 
#LOCAL_SRC_FILES    := etc/init.qcom.rc
ifeq ($(TARGET_BUILD_VARIANT),user)
  LOCAL_SRC_FILES    := etc/init.qcom.user.rc
else
  LOCAL_SRC_FILES    := etc/init.qcom.rc
endif
#END-->

device/qcom/common/rootdir/etc/init.qcom.user.rc

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
    #sensors log dir
    mkdir /data/vendor/sensors
    chown system system /data/vendor/sensors

#<!-- Enable SSR Add use/debug control for init.qcom.rc. 
    write /sys/bus/msm_subsys/devices/subsys0/restart_level related
    write /sys/bus/msm_subsys/devices/subsys1/restart_level related
    write /sys/bus/msm_subsys/devices/subsys2/restart_level related
    write /sys/bus/msm_subsys/devices/subsys3/restart_level related
#end-->

# msm specific files that need to be created on /data
on post-fs-data
    mkdir /data/vendor/misc 01771 system system