powerpc/powernv: Invoke opal_cec_reboot2() on unrecoverable machine check errors.

On non-recoverable MCE errors in kernel space, Linux kernel panics
and system reboots. On BMC based system opal-prd runs as a daemon
in the host. Hence, kernel crash may prevent opal-prd to detect and
analyze this MCE error. This may land us in a situation where the faulty
memory never gets de-configured and Linux would keep hitting same MCE error
again and again. If this happens in early stage of kernel initialization,
then Linux will keep crashing and rebooting in a loop.

This patch fixes this issue by invoking new opal_cec_reboot2() call with
reboot type OPAL_REBOOT_PLATFORM_ERROR to inform BMC/OCC about this
error, so that BMC can collect relevant data for error analysis and
decide what component to de-configure before rebooting.

This patch is dependent on OPAL patchset posted on skiboot mailing list
at https://lists.ozlabs.org/pipermail/skiboot/2015-July/001771.html that
introduces opal_cec_reboot2() opal call.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
Mahesh Salgaonkar 2015-07-31 21:24:38 +05:30 committed by Michael Ellerman
parent 1852ae276b
commit e784b6499d
4 changed files with 45 additions and 1 deletions

View File

@ -154,7 +154,8 @@
#define OPAL_FLASH_WRITE 111 #define OPAL_FLASH_WRITE 111
#define OPAL_FLASH_ERASE 112 #define OPAL_FLASH_ERASE 112
#define OPAL_PRD_MSG 113 #define OPAL_PRD_MSG 113
#define OPAL_LAST 113 #define OPAL_CEC_REBOOT2 116
#define OPAL_LAST 116
/* Device tree flags */ /* Device tree flags */
@ -857,6 +858,12 @@ enum OpalSysCooling {
OPAL_SYSCOOL_INSF = 0x0001, /* System insufficient cooling */ OPAL_SYSCOOL_INSF = 0x0001, /* System insufficient cooling */
}; };
/* Argument to OPAL_CEC_REBOOT2() */
enum {
OPAL_REBOOT_NORMAL = 0,
OPAL_REBOOT_PLATFORM_ERROR = 1,
};
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
#endif /* __OPAL_API_H */ #endif /* __OPAL_API_H */

View File

@ -44,6 +44,7 @@ int64_t opal_tpo_write(uint64_t token, uint32_t year_mon_day,
uint32_t hour_min); uint32_t hour_min);
int64_t opal_cec_power_down(uint64_t request); int64_t opal_cec_power_down(uint64_t request);
int64_t opal_cec_reboot(void); int64_t opal_cec_reboot(void);
int64_t opal_cec_reboot2(uint32_t reboot_type, char *diag);
int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset); int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset);
int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset); int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset);
int64_t opal_handle_interrupt(uint64_t isn, __be64 *outstanding_event_mask); int64_t opal_handle_interrupt(uint64_t isn, __be64 *outstanding_event_mask);

View File

@ -202,6 +202,7 @@ OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2);
OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);

View File

@ -441,6 +441,7 @@ static int opal_recover_mce(struct pt_regs *regs,
int opal_machine_check(struct pt_regs *regs) int opal_machine_check(struct pt_regs *regs)
{ {
struct machine_check_event evt; struct machine_check_event evt;
int ret;
if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return 0; return 0;
@ -455,6 +456,40 @@ int opal_machine_check(struct pt_regs *regs)
if (opal_recover_mce(regs, &evt)) if (opal_recover_mce(regs, &evt))
return 1; return 1;
/*
* Unrecovered machine check, we are heading to panic path.
*
* We may have hit this MCE in very early stage of kernel
* initialization even before opal-prd has started running. If
* this is the case then this MCE error may go un-noticed or
* un-analyzed if we go down panic path. We need to inform
* BMC/OCC about this error so that they can collect relevant
* data for error analysis before rebooting.
* Use opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR) to do so.
* This function may not return on BMC based system.
*/
ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR,
"Unrecoverable Machine Check exception");
if (ret == OPAL_UNSUPPORTED) {
pr_emerg("Reboot type %d not supported\n",
OPAL_REBOOT_PLATFORM_ERROR);
}
/*
* We reached here. There can be three possibilities:
* 1. We are running on a firmware level that do not support
* opal_cec_reboot2()
* 2. We are running on a firmware level that do not support
* OPAL_REBOOT_PLATFORM_ERROR reboot type.
* 3. We are running on FSP based system that does not need opal
* to trigger checkstop explicitly for error analysis. The FSP
* PRD component would have already got notified about this
* error through other channels.
*
* In any case, let us just fall through. We anyway heading
* down to panic path.
*/
return 0; return 0;
} }