An update on this topic.
When we updated the kernel config with:
CONFIG_NR_CPUS=2
CONFIG_SCHED_MC=n
CONFIG_SCHED_SMT=n
We were able to reproduce the crash on every boot. This made the situation easier to debug.
Running the kernel under kgdb, we managed to get more details on the crash:
0xffff8000090b8ef8 in dev_gpd_data (dev=0xffff000827349000) at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/include/linux/pm_domain.h:223
223 return to_gpd_data(dev->power.subsys_data->domain_data);
(gdb) bt full
#0 0xffff8000090b8ef8 in dev_gpd_data (dev=0xffff000827349000)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/include/linux/pm_domain.h:223
No locals.
#1 genpd_drop_performance_state (dev=0xffff000827349000)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/drivers/base/power/domain.c:439
gpd_data = <optimized out>
prev_state = <optimized out>
gpd_data = <optimized out>
prev_state = <optimized out>
#2 genpd_runtime_suspend (dev=0xffff000827349000)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/drivers/base/power/domain.c:989
genpd = 0xffff0008109bd880
suspend_ok = <optimized out>
gpd_data = 0xffff000827c05b80
td = 0xffff000827c05b98
runtime_pm = <optimized out>
time_start = 92688223125
elapsed_ns = <optimized out>
ret = <optimized out>
__func__ = "genpd_runtime_suspend"
#3 0xffff800008818328 in __rpm_callback (cb=0xffff8000090b8d7c <genpd_runtime_suspend>, dev=0xffff000827349000)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/drivers/base/power/runtime.c:398
retval = 0
idx = <optimized out>
use_links = false
#4 0xffff80000881849c in rpm_callback (cb=cb@entry=0xffff8000090b8d7c <genpd_runtime_suspend>, dev=dev@entry=0xffff000827349000)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/drivers/base/power/runtime.c:525
retval = <optimized out>
#5 0xffff800008819b70 in rpm_suspend (dev=dev@entry=0xffff000827349000, rpmflags=rpmflags@entry=10)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/drivers/base/power/runtime.c:665
callback = 0xffff8000090b8d7c <genpd_runtime_suspend>
parent = 0x0
retval = <optimized out>
repeat = <optimized out>
#6 0xffff80000881a2d4 in pm_runtime_work (work=<optimized out>)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/drivers/base/power/runtime.c:960
dev = 0xffff000827349000
req = <optimized out>
#7 0xffff80000806e3c0 in process_one_work (worker=worker@entry=0xffff000810a71700, work=0xffff000827349190)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/kernel/workqueue.c:2306
pwq = 0xffff0008ff3da400
pool = 0xffff0008ff3d4cc0
cpu_intensive = false
work_data = 18446462637374809093
collision = 0x0
#8 0xffff80000806ea40 in worker_thread (__worker=0xffff000810a71700)
at /home/vik/Workspace/V4.0/2171-platform-linux-imx6/01-Yocto/build-imx8/workspace/sources/linux-imx/kernel/workqueue.c:2453
work = <optimized out>
worker = 0xffff000810a71700
Due to stack corruption we couldn't get more details, but the backtrace helped us add some additional pointer/value checks in genpd_drop_performance_state(). We created this patch:
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 7bef4b6638a2..09891e892866 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -436,7 +436,18 @@ static int genpd_set_performance_state(struct device *dev, unsigned int state)
static int genpd_drop_performance_state(struct device *dev)
{
- unsigned int prev_state = dev_gpd_data(dev)->performance_state;
+ struct generic_pm_domain_data *gpd_data = NULL;
+ unsigned int prev_state = 0;
+
+ if (dev->power.subsys_data && dev->power.subsys_data->domain_data)
+ gpd_data = dev_gpd_data(dev);
+ else
+ return 0;
+
+ if (gpd_data)
+ prev_state = gpd_data->performance_state;
+ else
+ return 0;
if (!genpd_set_performance_state(dev, 0))
return prev_state;
--
2.29.0
After applying this patch, we did not see the crash in more than 3000 reboots.