Hello all,
My company is facing a problem while running our highly accelerated testing on a handful of T2080-based systems.
Background: we have successfully delivered a dozen or so of these systems, all of which cycled through this highly accelerated test set without issue.
The issue we are seeing is a machine check after a power cycle of the unit. This issue doesn't happen on every power cycle. During the testing, the ambient temperature is ramped either from high temp to low temp or vice versa (I believe at 15 - 30 degrees per minute). When the temp has settled at high or low, the test waits a handful of seconds and then power cycles the unit. Our T2080 is running in a hypervisor environment (Topaz based), with a Linux partition (cores 0-5) and an RTEMS partition (cores 6-7). The Linux partition is the manager of the RTEMS partition and starts it during Linux boot.
The Linux partition has completed system init and has begun launching our custom applications when we receive the machine check (output below).
I have inspected the output and do not see anything SW related that would have caused the machine check. The documentation leads me to believe this is something internal to the SoC, and that I have no ability to correct it or determine the root cause.
Is there a better way to determine the cause (some additional registers to inspect, some hardware to be instrumented, etc.)?
One last note: once this machine check has occurred, the system will start up correctly on the next power cycle without any change to the physical environment.
At the point when the machine check happens, the Linux partition is running and the RTEMS side was previously launched and is starting up.
[0] Recursive machine check
[0] powerpc_mchk_interrupt: machine check interrupt!
[0] mcheck registers: mcsr = 80000000, mcssr0 = 4009c0, mcsrr1 = 80001000
[0] machine check
[0] NIP 0x004009c0 MSR 0x80001000 LR 0xc00000000001aaa0 ESR 0x00000080 EXC 1
CTR 0xc00000000001aa7c CR 0x22000088 XER 0x00000000 DEAR 0x8000080080901120 PIR 0
Prev trap level 0
r00 0x00000001 r01 0x004816e0 r02 0x0048d880 r03 0xc000000000bdfd00
r04 0xc00000003fff5000 r05 0xc00000000001aaa0 r06 0xc00000000001aa7c r07 0x100000000000000
r08 0x00000000 r09 0xc000000000b43518 r10 0x00000001 r11 0xc000000000bdc000
r12 0xc000000000bdfbc8 r13 0x00478b40 r14 0xc000000000bdc000 r15 0xc000000000be91c8
r16 0xc000000000b1691f r17 0xc0000000007addc8 r18 0xc000000000bdc040 r19 0xc000000000bdc040
r20 0x00000001 r21 0xc000000000bdc040 r22 0xc000000000acf4c8 r23 0xc000000000bdc000
r24 0xc000000000bdc040 r25 0xc000000000bdc000 r26 0xc000000000bdc000 r27 0xc000000000bdc000
r28 0x00080000 r29 0xc000000000bdc000 r30 0xc000000000bdc000 r31 0xc000000000be9138
[0] sp 4816e0 0 0 400a84 0
[0] Traceback:
[1] doorbell critical
[1] NIP 0xc00000000001aac4 MSR 0x90029000 LR 0xc00000000001aaa0 ESR 0x00000080 EXC 37
CTR 0xc00000000001aa7c CR 0x22000088 XER 0x00000000 DEAR 0x800003fffff2d058 PIR 1
Prev trap level 0
r00 0x00000001 r01 0xc0000001730fbd40 r02 0xc000000000be1a00 r03 0x00000000
r04 0x00000000 r05 0x00000001 r06 0x17db51000 r07 0x100000000000000
r08 0x140000000000000 r09 0xc000000000b43518 r10 0x00000001 r11 0xc0000001730f8000
r12 0xc0000001730fbc08 r13 0xc00000003fff56c0 r14 0xc0000001730f8000 r15 0xc000000000be91c8
r16 0xc000000000b1691f r17 0xc0000000007addc8 r18 0xc0000001730f8040 r19 0xc0000001730f8040
r20 0x00000001 r21 0xc0000001730f8040 r22 0xc000000000acf4c8 r23 0xc0000001730f8000
r24 0xc0000001730f8040 r25 0xc0000001730f8000 r26 0xc0000001730f8000 r27 0xc0000001730f8000
r28 0x00080000 r29 0xc0000001730f8000 r30 0xc0000001730f8000 r31 0xc000000000be9138
[2] doorbell critical
[2] NIP 0xc00000000001aac4 MSR 0x90029000 LR 0xc00000000001aaa0 ESR 0x00000080 EXC 37
CTR 0xc00000000001aa7c CR 0x22000028 XER 0x00000000 DEAR 0x3fff86af7ae8 PIR 2
Prev trap level 0
r00 0x00000001 r01 0xc0000001730ffd40 r02 0xc000000000be1a00 r03 0x00000000
r04 0xc0000001730f6150 r05 0x00000002 r06 0x17db71000 r07 0x100000000000000
r08 0x140000000000000 r09 0xc000000000b43518 r10 0x00000001 r11 0xc0000001730fc000
r12 0x84a28e22 r13 0xc00000003fff5d80 r14 0xc0000001730fc000 r15 0xc000000000be91c8
r16 0xc000000000b1691f r17 0xc0000000007addc8 r18 0xc0000001730fc040 r19 0xc0000001730fc040
r20 0x00000001 r21 0xc0000001730fc040 r22 0xc000000000acf4c8 r23 0xc0000001730fc000
r24 0xc0000001730fc040 r25 0xc0000001730fc000 r26 0xc0000001730fc000 r27 0xc0000001730fc000
r28 0x00080000 r29 0xc0000001730fc000 r30 0xc0000001730fc000 r31 0xc000000000be9138
[3] doorbell critical
[3] NIP 0xc000000000173b00 MSR 0x90029000 LR 0xc000000000173be4 ESR 0x00000080 EXC 37
CTR 0xc0000000002a8f70 CR 0x24000222 XER 0x20000000 DEAR 0x80000800809e8008 PIR 3
Prev trap level 0
r00 0xc000000000173be4 r01 0xc00000016e98fb50 r02 0xc000000000be1a00 r03 0x00000001
r04 0x00003e40 r05 0xc000000000b1c340 r06 0x00000000 r07 0xe000000000000000
r08 0x00000000 r09 0x00000000 r10 0x00000000 r11 0x00000c00
r12 0x24000224 r13 0xc00000003fff6440 r14 0x1000d3e8 r15 0x00000000
r16 0x00000000 r17 0x1000d380 r18 0x1000d438 r19 0x00000001
r20 0x10021420 r21 0x00000001 r22 0x1000cf18 r23 0x100225c8
r24 0x100e5c48 r25 0x101025c8 r26 0xc0000001732da410 r27 0xc000000172358798
r28 0xc000000000b3b160 r29 0xc0000001723587f0 r30 0xc00000007a39def0 r31 0xc00000007a39ddf0
[4] doorbell critical
[4] NIP 0xc00000000001aac4 MSR 0x90029000 LR 0xc00000000001aaa0 ESR 0x00000080 EXC 37
CTR 0xc00000000001aa7c CR 0x22000028 XER 0x00000000 DEAR 0x80000800809e0008 PIR 4
Prev trap level 0
r00 0x00000001 r01 0xc00000017310bd40 r02 0xc000000000be1a00 r03 0x00000000
r04 0xc000000173100190 r05 0x00000004 r06 0x17dbb1000 r07 0x100000000000000
r08 0x140000000000000 r09 0xc000000000b43518 r10 0x00000001 r11 0xc000000173108000
r12 0x84adbe82 r13 0xc00000003fff6b00 r14 0xc000000173108000 r15 0xc000000000be91c8
r16 0xc000000000b1691f r17 0xc0000000007addc8 r18 0xc000000173108040 r19 0xc000000173108040
r20 0x00000001 r21 0xc000000173108040 r22 0xc000000000acf4c8 r23 0xc000000173108000
r24 0xc000000173108040 r25 0xc000000173108000 r26 0xc000000173108000 r27 0xc000000173108000
r28 0x00080000 r29 0xc000000173108000 r30 0xc000000173108000 r31 0xc000000000be9138
[5] doorbell critical
[5] NIP 0xc00000000001aac4 MSR 0x90029000 LR 0xc00000000001aaa0 ESR 0x00000080 EXC 37
CTR 0xc00000000001aa7c CR 0x22000028 XER 0x00000000 DEAR 0x8000080080010e04 PIR 5
Prev trap level 0
r00 0x00000001 r01 0xc000000173113d40 r02 0xc000000000be1a00 r03 0x00000000
r04 0xc00000017310cd50 r05 0x00000005 r06 0x17dbd1000 r07 0x100000000000000
r08 0x140000000000000 r09 0xc000000000b43518 r10 0x00000001 r11 0xc000000173110000
r12 0x84a28e82 r13 0xc00000003fff71c0 r14 0xc000000173110000 r15 0xc000000000be91c8
r16 0xc000000000b1691f r17 0xc0000000007addc8 r18 0xc000000173110040 r19 0xc000000173110040
r20 0x00000001 r21 0xc000000173110040 r22 0xc000000000acf4c8 r23 0xc000000173110000
r24 0xc000000173110040 r25 0xc000000173110000 r26 0xc000000173110000 r27 0xc000000173110000
r28 0x00080000 r29 0xc000000173110000 r30 0xc000000173110000 r31 0xc000000000be9138
[7] doorbell critical
[7] NIP 0x00421090 MSR 0x80029002 LR 0x004210a4 ESR 0x00000000 EXC 37
CTR 0x00000000 CR 0x20000088 XER 0x00000000 DEAR 0x00000000 PIR 7
Prev trap level 0
r00 0x00400500 r01 0x006411a0 r02 0x0048d880 r03 0x7e3bd040
r04 0x7e3bd040 r05 0x00000000 r06 0x00000000 r07 0x0049d880
r08 0x00000000 r09 0x00000000 r10 0x00000020 r11 0x00000000
r12 0x00000000 r13 0x004e8280 r14 0x00000000 r15 0x00000000
r16 0x00000000 r17 0x00000000 r18 0x00000000 r19 0x00000000
r20 0x00000000 r21 0x00000000 r22 0x00000000 r23 0x00000000
r24 0x00000007 r25 0x00472260 r26 0x00000001 r27 0x00641250
r28 0x00000040 r29 0x0068d278 r30 0x00000001 r31 0x0063d3e0
[7] sp 6411a0 641250 0 0 0
[7] Traceback: 0xfffffffffffffffc
[6] doorbell critical
[6] NIP 0x0011f3ac MSR 0x1200a200 LR 0x00137394 ESR 0x00000000 EXC 37
CTR 0x0011f3a0 CR 0x20000008 XER 0x00000000 DEAR 0x14041000 PIR 6
Prev trap level 0
r00 0x00137374 r01 0x05089bf0 r02 0x00000000 r03 0x1204a200
r04 0x050694c0 r05 0x0500a198 r06 0x1200a200 r07 0x0500a1a0
r08 0x0500a1a4 r09 0x0011f3a0 r10 0x09010001 r11 0x00000000
r12 0x20000002 r13 0x050375d8 r14 0x00000000 r15 0x00000000
r16 0x00000000 r17 0x00000000 r18 0x00000000 r19 0x00000000
r20 0x00000000 r21 0x00000000 r22 0x00000000 r23 0x00000000
r24 0x00000000 r25 0x00000000 r26 0x00000000 r27 0x00000000
r28 0x00000000 r29 0x00000000 r30 0x00000000 r31 0x050694c0