项目场景:
modem 子系统crash导致系统crash,偶现。且SSR已经关闭。
如果disable_restart_work 设置为DISABLE_SSR,那么不管什么(wlan adsp-audio/sensor modem etc)触发了SSR,都不会重启
1
2
3
4
5#define DISABLE_SSR 0x9889deed /* If set to 0x9889deed, call to subsystem_restart_dev() returns immediately */ //static uint disable_restart_work; static uint disable_restart_work = DISABLE_SSR;
常用的打开log:
1
2
# Turn on dynamic-debug output for the SSR core and the PIL-TZ loader.
adb shell "echo 'file subsystem_restart.c +p' > /sys/kernel/debug/dynamic_debug/control"
adb shell "echo 'file subsys-pil-tz.c +p' > /sys/kernel/debug/dynamic_debug/control"
问题描述
1.有以下几种情况会导致系统重启/crash:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78//subsystem_restart_dev //01.If a system reboot/shutdown is underway ignore subsystem errors. //However, print a message so that we know that a subsystem behaved unexpectedly here. extern enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, SYSTEM_RESTART, SYSTEM_SUSPEND, } system_state; if (system_state == SYSTEM_RESTART || system_state == SYSTEM_POWER_OFF) { pr_err("%s crashed during a system poweroff/shutdown.n", name); return -EBUSY; } //02.disable_restart_work = DISABLE_SSR;直接跳过 if (disable_restart_work == DISABLE_SSR) { pr_err("subsys-restart: Ignoring restart request for %sn", name); return 0; } //03.restart_level switch (dev->restart_level) { case RESET_SUBSYS_COUPLED://related 已经确认是这里 __subsystem_restart_dev(dev); break; case RESET_SOC://system __pm_stay_awake(dev->ssr_wlock); schedule_work(&dev->device_restart_work); return 0; default: panic("subsys-restart: Unknown restart level!n"); break; } //__subsystem_restart_dev //04.正常的情况下,应该是track->p_state为SUBSYS_NORMAL;dev->track.state为SUBSYS_ONLINE;否则系统重启 if (track->p_state != SUBSYS_CRASHED && dev->track.state == SUBSYS_ONLINE) { if (track->p_state != SUBSYS_RESTARTING) { track->p_state = SUBSYS_CRASHED; __pm_stay_awake(dev->ssr_wlock); queue_work(ssr_wq, &dev->work);//触发子系统重启 } else { pr_err("Subsystem %s crashed during SSR!", name); } } else WARN(dev->track.state == SUBSYS_OFFLINE, "SSR aborted: %s subsystem not onlinen", name); // INIT_WORK(&subsys->work, subsystem_restart_wq_func); //05.再次检测系统状态,系统关机重启abort SSR if (system_state == SYSTEM_RESTART || system_state == SYSTEM_POWER_OFF) { WARN(1, "SSR aborted: %s, system reboot/shutdown is under wayn", desc->name); pr_err("SSR aborted: %s, system reboot/shutdown is under wayn", desc->name); return; } //06.子系统没有起来,abort SSR if (dev->track.state == SUBSYS_OFFLINE) { mutex_unlock(&track->lock); WARN(1, "SSR aborted: %s subsystem not onlinen", desc->name); pr_err("SSR aborted: %s subsystem not onlinen", 
desc->name); return; }
2.首先肯定是wlan adsp(audio sensor) modem 子系统异常触发中断或者直接进入下面的函数:
subsystem_restart_dev
3.内核中有许多地方调用类似BUG()的语句,它非常像一个内核运行时的断言,意味着本来不该执行到BUG()这条语句,一旦执行即抛出Oops。 BUG()的定义为:
1
2
3
4
/*
 * BUG() - report the current file, line and function, then panic.
 *
 * Wrapped in do { } while (0) so the two statements behave as a single
 * statement at the call site (e.g. as the body of an unbraced if).
 */
#define BUG() do { \
	printk("BUG at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
	panic("BUG!"); \
} while (0)
BUG()还有一个变体叫BUG_ON(),它的内部会引用BUG()
1
/*
 * BUG_ON(condition) - invoke BUG() when @condition is true.
 *
 * The condition is passed through unlikely(); the do { } while (0) wrapper
 * keeps the macro safe to use as a single statement.
 */
#define BUG_ON(condition) do { \
	if (unlikely(condition)) \
		BUG(); \
} while (0)
其中的panic()定义在kernel/panic.c中,会导致内核崩溃,并打印Oops。
内核还有个稍微弱一些的WARN_ON(),在括号中的条件成立时,内核会打印栈回溯,但是不会panic(),表示内核抛出一个警告,暗示某种不太合理的事情发生了。
4.CONFIG_SETUP_SSR_NOTIF_TIMEOUTS 这个宏控可以关闭,没有什么影响
原因分析:
目前出现一个问题,modem子系统重启偶现不生效;还是panic.
我这边加了个延时,手动触发modem crash 可以模拟出来“Subsystem modem crashed during SSR!”:这是因为前一次modem 子系统重启未完成又触发了下一次modem 子系统重启,使得p_state为SUBSYS_RESTARTING从而导致panic;正常情况下是不应该出现这么频繁的子系统重启的,为防止这种情况,可以加个标志位,等待上一次modem 子系统重启完成才会进行下一次子系统重启。
模拟方法:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114static void subsystem_restart_wq_func(struct work_struct *work) { struct subsys_device *dev = container_of(work, struct subsys_device, work); struct subsys_device **list; struct subsys_desc *desc = dev->desc; struct subsys_soc_restart_order *order = dev->restart_order; struct subsys_tracking *track; unsigned int count; unsigned long flags; int ret; /* * It's OK to not take the registration lock at this point. * This is because the subsystem list inside the relevant * restart order is not being traversed. */ if (order) { list = order->subsys_ptrs; count = order->count; track = &order->track; } else { list = &dev; count = 1; track = &dev->track; } /* * If a system reboot/shutdown is under way, ignore subsystem errors. * However, print a message so that we know that a subsystem behaved * unexpectedly here. */ if(meig_work_flag==1){ pr_err("wait complete at the last timen"); return; } meig_work_flag=1; if (system_state == SYSTEM_RESTART || system_state == SYSTEM_POWER_OFF) { WARN(1, "SSR aborted: %s, system reboot/shutdown is under wayn", desc->name); pr_err("SSR aborted: %s, system reboot/shutdown is under wayn", desc->name); return; } mutex_lock(&track->lock); do_epoch_check(dev); if (dev->track.state == SUBSYS_OFFLINE) { mutex_unlock(&track->lock); WARN(1, "SSR aborted: %s subsystem not onlinen", desc->name); pr_err("SSR aborted: %s subsystem not onlinen", desc->name); return; } /* * It's necessary to take the registration lock because the subsystem * list in the SoC restart order will be traversed and it shouldn't be * changed until _this_ restart sequence completes. 
*/ mutex_lock(&soc_order_reg_lock); pr_err("[%s:%d]: Starting restart sequence for %sn", current->comm, current->pid, desc->name); notify_each_subsys_device(list, count, SUBSYS_BEFORE_SHUTDOWN, NULL); ret = for_each_subsys_device(list, count, NULL, subsystem_shutdown); if (ret) goto err; notify_each_subsys_device(list, count, SUBSYS_AFTER_SHUTDOWN, NULL); notify_each_subsys_device(list, count, SUBSYS_RAMDUMP_NOTIFICATION, NULL); spin_lock_irqsave(&track->s_lock, flags); track->p_state = SUBSYS_RESTARTING; spin_unlock_irqrestore(&track->s_lock, flags); //msleep(3000); /* Collect ram dumps for all subsystems in order here */ for_each_subsys_device(list, count, NULL, subsystem_ramdump); for_each_subsys_device(list, count, NULL, subsystem_free_memory); notify_each_subsys_device(list, count, SUBSYS_BEFORE_POWERUP, NULL); ret = for_each_subsys_device(list, count, NULL, subsystem_powerup); if (ret) goto err; notify_each_subsys_device(list, count, SUBSYS_AFTER_POWERUP, NULL); pr_err("[%s:%d]: Restart sequence for %s completed.n", current->comm, current->pid, desc->name); + msleep(3000);//加个延时,通过QXDM发送命令send_data 75 37 03 00 00 触发modem死机;多次发送,即可出现panic("Subsystem %s crashed during SSR!", name); err: /* Reset subsys count */ if (ret) dev->count = 0; //msleep(9000); mutex_unlock(&soc_order_reg_lock); mutex_unlock(&track->lock); spin_lock_irqsave(&track->s_lock, flags); pr_err("zhanghong 6666666666666n"); track->p_state = SUBSYS_NORMAL; meig_work_flag=0; __pm_relax(dev->ssr_wlock); spin_unlock_irqrestore(&track->s_lock, flags); }
规避方案:
1.subsystem_restart_wq_func 函数开始的地方加个延时,等待上一次modem重启完成
2.panic("Subsystem %s crashed during SSR!", name); 改为仅打印
解决方案:
1.如何设置系统restart_level为related
device/qcom/common/rootdir/Android.mk
1
2
3
4
5
6
7
8
#<!-- Enable SSR for user version[Solution]Add use/debug control for init.qcom.rc.
# Original line, replaced by the per-variant selection below:
#LOCAL_SRC_FILES := etc/init.qcom.rc
# user builds install init.qcom.user.rc; every other variant keeps init.qcom.rc.
ifeq ($(TARGET_BUILD_VARIANT),user)
LOCAL_SRC_FILES := etc/init.qcom.user.rc
else
LOCAL_SRC_FILES := etc/init.qcom.rc
endif
#END-->
device/qcom/common/rootdir/etc/init.qcom.user.rc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#sensors log dir
mkdir /data/vendor/sensors
chown system system /data/vendor/sensors

#<!-- Enable SSR Add use/debug control for init.qcom.rc.
# Set restart_level to "related" for subsys0..subsys3.
write /sys/bus/msm_subsys/devices/subsys0/restart_level related
write /sys/bus/msm_subsys/devices/subsys1/restart_level related
write /sys/bus/msm_subsys/devices/subsys2/restart_level related
write /sys/bus/msm_subsys/devices/subsys3/restart_level related
#end-->

# msm specific files that need to be created on /data
on post-fs-data
    mkdir /data/vendor/misc 01771 system system
"qcom,ignore-ssr-failure" can be added in the following node of dtsi
pil_modem: qcom,mss@4080000
modem 如何和AP通,待更新。。。
最后
以上就是辛勤柠檬最近收集整理的关于QCM6490 SSR 记述(一)项目场景:问题描述原因分析:解决方案:的全部内容,更多相关QCM6490内容请搜索靠谱客的其他文章。
发表评论 取消回复