概述
项目场景:
modem 子系统crash导致系统crash,偶现。且SSR已经关闭。
如果disable_restart_work 设置为DISABLE_SSR,那么不管什么(wlan adsp-audio/sensor modem etc)触发了SSR,都不会重启
/* Magic value: when disable_restart_work holds it, restart requests are ignored. */
#define DISABLE_SSR 0x9889deed
/* If set to 0x9889deed, call to subsystem_restart_dev() returns immediately */
/* Variant without the initializer (commented out here): static uint disable_restart_work; */
static uint disable_restart_work = DISABLE_SSR;
常用的打开log:
adb shell "echo 'file subsystem_restart.c +p' > /sys/kernel/debug/dynamic_debug/control"
adb shell "echo 'file subsys-pil-tz.c +p' > /sys/kernel/debug/dynamic_debug/control"
问题描述
1.有以下几种情况会导致系统重启/crash:
//subsystem_restart_dev
//01.If a system reboot/shutdown is underway ignore subsystem errors.
//However, print a message so that we know that a subsystem behaved unexpectedly here.
/*
 * Global system state. The SSR checks quoted in this article compare
 * system_state against SYSTEM_RESTART and SYSTEM_POWER_OFF to decide
 * whether a subsystem restart request should be aborted.
 */
extern enum system_states {
SYSTEM_BOOTING,
SYSTEM_SCHEDULING,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,
SYSTEM_RESTART,
SYSTEM_SUSPEND,
} system_state;
if (system_state == SYSTEM_RESTART
|| system_state == SYSTEM_POWER_OFF) {
pr_err("%s crashed during a system poweroff/shutdown.\n", name);
return -EBUSY;
}
//02.disable_restart_work = DISABLE_SSR;直接跳过
if (disable_restart_work == DISABLE_SSR) {
pr_err("subsys-restart: Ignoring restart request for %s\n",
name);
return 0;
}
//03.restart_level
switch (dev->restart_level) {
case RESET_SUBSYS_COUPLED://related 已经确认是这里
__subsystem_restart_dev(dev);
break;
case RESET_SOC://system
__pm_stay_awake(dev->ssr_wlock);
schedule_work(&dev->device_restart_work);
return 0;
default:
panic("subsys-restart: Unknown restart level!\n");
break;
}
//__subsystem_restart_dev
//04.正常的情况下,应该是track->p_state为SUBSYS_NORMAL;dev->track.state为SUBSYS_ONLINE;否则系统重启
if (track->p_state != SUBSYS_CRASHED &&
dev->track.state == SUBSYS_ONLINE) {
if (track->p_state != SUBSYS_RESTARTING) {
track->p_state = SUBSYS_CRASHED;
__pm_stay_awake(dev->ssr_wlock);
queue_work(ssr_wq, &dev->work);//触发子系统重启
} else {
pr_err("Subsystem %s crashed during SSR!", name);
}
} else
WARN(dev->track.state == SUBSYS_OFFLINE,
"SSR aborted: %s subsystem not online\n", name);
// INIT_WORK(&subsys->work, subsystem_restart_wq_func);
//05.再次检测系统状态,系统关机重启abort SSR
if (system_state == SYSTEM_RESTART
|| system_state == SYSTEM_POWER_OFF) {
WARN(1, "SSR aborted: %s, system reboot/shutdown is under way\n",
desc->name);
pr_err("SSR aborted: %s, system reboot/shutdown is under way\n",
desc->name);
return;
}
//06.子系统没有起来,abort SSR
if (dev->track.state == SUBSYS_OFFLINE) {
mutex_unlock(&track->lock);
WARN(1, "SSR aborted: %s subsystem not online\n", desc->name);
pr_err("SSR aborted: %s subsystem not online\n",
desc->name);
return;
}
2.首先肯定是wlan adsp(audio sensor) modem 子系统异常触发中断或者直接进入下面的函数:
subsystem_restart_dev
3.内核中有许多地方调用类似BUG()的语句,它非常像一个内核运行时的断言,意味着本来不该执行到BUG()这条语句,一旦执行即抛出Oops。 BUG()的定义为:
/*
 * BUG() - report the source location that was reached and then panic.
 *
 * Each line of the macro body ends with a line-continuation backslash so the
 * whole do { ... } while (0) statement belongs to the #define; without them
 * the definition would end after "do {". The do/while(0) wrapper lets
 * BUG(); be used as a single statement, e.g. as the body of an if.
 */
#define BUG() do { \
	printk("BUG at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
	panic("BUG!"); \
} while (0)
BUG()还有一个变体叫BUG_ON(),它的内部会引用BUG()
/*
 * BUG_ON() - invoke BUG() when @condition evaluates to true.
 * The condition is wrapped in unlikely(), and the do/while(0) wrapper keeps
 * the macro usable as a single statement.
 */
#define BUG_ON(condition) \
	do { \
		if (unlikely(condition)) \
			BUG(); \
	} while (0)
其中的panic()定义在kernel/panic.c中,会导致内核崩溃,并打印Oops。
内核还有个稍微弱一些的WARN_ON(),在括号中的条件成立时,内核会打印栈回溯,但是不会panic(),表示内核抛出一个警告,暗示某种不太合理的事情发生了。
4.CONFIG_SETUP_SSR_NOTIF_TIMEOUTS 这个宏控可以关闭,没有什么影响
原因分析:
目前出现一个问题,modem子系统重启偶现不生效;还是panic.
我这边加了个延时,手动触发modem crash 可以模拟出来“Subsystem modem crashed during SSR!”:这是因为前一次modem 子系统重启未完成又触发了下一次modem 子系统重启,使得p_state为SUBSYS_RESTARTING从而导致panic;正常情况下是不应该出现这么频繁的子系统重启的,为防止这种情况,可以加个标志位,等待上一次modem 子系统重启完成才会进行下一次子系统重启。
模拟方法:
/*
 * subsystem_restart_wq_func() - work handler that runs one restart sequence.
 * @work: work item embedded in a struct subsys_device (see container_of()).
 *
 * Shuts down, ram-dumps, frees and powers up every device in the restart
 * list (the whole restart order when one is attached, otherwise just the
 * device that owns @work), sending the matching notifications around each
 * step. When the sequence ends, track->p_state goes back to SUBSYS_NORMAL
 * and dev->ssr_wlock is relaxed.
 *
 * meig_work_flag is a "restart in progress" marker: a request that finds it
 * set is dropped with a log message. The flag is cleared on every path that
 * set it, including the abort paths, so one aborted restart cannot block all
 * later ones.
 *
 * NOTE(review): the test and the set of meig_work_flag are separate,
 * unlocked operations and the flag is declared outside this function;
 * confirm no two work items can run this check concurrently.
 */
static void subsystem_restart_wq_func(struct work_struct *work)
{
	struct subsys_device *dev = container_of(work,
						struct subsys_device, work);
	struct subsys_device **list;
	struct subsys_desc *desc = dev->desc;
	struct subsys_soc_restart_order *order = dev->restart_order;
	struct subsys_tracking *track;
	unsigned int count;
	unsigned long flags;
	int ret;

	/*
	 * It's OK to not take the registration lock at this point.
	 * This is because the subsystem list inside the relevant
	 * restart order is not being traversed.
	 */
	if (order) {
		list = order->subsys_ptrs;
		count = order->count;
		track = &order->track;
	} else {
		list = &dev;
		count = 1;
		track = &dev->track;
	}

	/*
	 * A previous restart has not finished yet: drop this request.
	 * NOTE(review): this path returns without touching track->p_state
	 * or dev->ssr_wlock; confirm that is intended.
	 */
	if (meig_work_flag == 1) {
		pr_err("wait complete at the last time\n");
		return;
	}
	meig_work_flag = 1;

	/*
	 * If a system reboot/shutdown is under way, ignore subsystem errors.
	 * However, print a message so that we know that a subsystem behaved
	 * unexpectedly here.
	 */
	if (system_state == SYSTEM_RESTART
		|| system_state == SYSTEM_POWER_OFF) {
		WARN(1, "SSR aborted: %s, system reboot/shutdown is under way\n",
			desc->name);
		pr_err("SSR aborted: %s, system reboot/shutdown is under way\n",
			desc->name);
		/* Nothing was started: release the in-progress marker. */
		meig_work_flag = 0;
		return;
	}

	mutex_lock(&track->lock);
	do_epoch_check(dev);

	if (dev->track.state == SUBSYS_OFFLINE) {
		mutex_unlock(&track->lock);
		WARN(1, "SSR aborted: %s subsystem not online\n", desc->name);
		pr_err("SSR aborted: %s subsystem not online\n",
			desc->name);
		/* Nothing was started: release the in-progress marker. */
		meig_work_flag = 0;
		return;
	}

	/*
	 * It's necessary to take the registration lock because the subsystem
	 * list in the SoC restart order will be traversed and it shouldn't be
	 * changed until _this_ restart sequence completes.
	 */
	mutex_lock(&soc_order_reg_lock);

	pr_err("[%s:%d]: Starting restart sequence for %s\n",
		current->comm, current->pid, desc->name);
	notify_each_subsys_device(list, count, SUBSYS_BEFORE_SHUTDOWN, NULL);
	ret = for_each_subsys_device(list, count, NULL, subsystem_shutdown);
	if (ret)
		goto err;
	notify_each_subsys_device(list, count, SUBSYS_AFTER_SHUTDOWN, NULL);

	notify_each_subsys_device(list, count, SUBSYS_RAMDUMP_NOTIFICATION,
				NULL);

	spin_lock_irqsave(&track->s_lock, flags);
	track->p_state = SUBSYS_RESTARTING;
	spin_unlock_irqrestore(&track->s_lock, flags);

	/* Collect ram dumps for all subsystems in order here */
	for_each_subsys_device(list, count, NULL, subsystem_ramdump);

	for_each_subsys_device(list, count, NULL, subsystem_free_memory);

	notify_each_subsys_device(list, count, SUBSYS_BEFORE_POWERUP, NULL);
	ret = for_each_subsys_device(list, count, NULL, subsystem_powerup);
	if (ret)
		goto err;
	notify_each_subsys_device(list, count, SUBSYS_AFTER_POWERUP, NULL);

	pr_err("[%s:%d]: Restart sequence for %s completed.\n",
		current->comm, current->pid, desc->name);

	/*
	 * Reproduction aid: with this delay in place, sending the QXDM command
	 * "send_data 75 37 03 00 00" several times crashes the modem repeatedly
	 * and reaches the "Subsystem %s crashed during SSR!" path.
	 * Other delay positions tried while reproducing: 3 s right after
	 * p_state is set to SUBSYS_RESTARTING, and 9 s just before the locks
	 * below are released.
	 */
	msleep(3000);

err:
	/* Reset subsys count */
	if (ret)
		dev->count = 0;

	mutex_unlock(&soc_order_reg_lock);
	mutex_unlock(&track->lock);

	spin_lock_irqsave(&track->s_lock, flags);
	pr_err("zhanghong 6666666666666\n");
	track->p_state = SUBSYS_NORMAL;
	/* Cleared under s_lock, together with the p_state reset. */
	meig_work_flag = 0;
	__pm_relax(dev->ssr_wlock);
	spin_unlock_irqrestore(&track->s_lock, flags);
}
规避方案:
1.subsystem_restart_wq_func 函数开始的地方加个延时,等待上一次modem重启完成
2.panic("Subsystem %s crashed during SSR!", name); 改为仅打印
解决方案:
1.如何设置系统restart_level为related
device/qcom/common/rootdir/Android.mk
#<!-- Enable SSR for user version [Solution] Add user/debug control for init.qcom.rc.
#LOCAL_SRC_FILES := etc/init.qcom.rc
ifeq ($(TARGET_BUILD_VARIANT),user)
LOCAL_SRC_FILES := etc/init.qcom.user.rc
else
LOCAL_SRC_FILES := etc/init.qcom.rc
endif
#END-->
device/qcom/common/rootdir/etc/init.qcom.user.rc
#sensors log dir
mkdir /data/vendor/sensors
chown system system /data/vendor/sensors
#<!-- Enable SSR: add user/debug control for init.qcom.rc.
write /sys/bus/msm_subsys/devices/subsys0/restart_level related
write /sys/bus/msm_subsys/devices/subsys1/restart_level related
write /sys/bus/msm_subsys/devices/subsys2/restart_level related
write /sys/bus/msm_subsys/devices/subsys3/restart_level related
#end-->
# msm specific files that need to be created on /data
on post-fs-data
mkdir /data/vendor/misc 01771 system system
“qcom,ignore-ssr-failure” can be added in the following node of dtsi
pil_modem: qcom,mss@4080000
modem 如何和AP通,待更新。。。
最后
以上就是辛勤柠檬为你收集整理的QCM6490 SSR 记述(一)项目场景:问题描述原因分析:解决方案:的全部内容,希望文章能够帮你解决QCM6490 SSR 记述(一)项目场景:问题描述原因分析:解决方案:所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复