- Enable additional logging mode on older Xeons (Tony Luck)
- Pass error records logged by firmware through the MCE decoding chain to provide human-readable error descriptions instead of raw values (Smita Koralahalli) - Some #MC handler fixes (Gabriele Paoloni) - The usual small fixes and cleanups all over. -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAl/XSJcACgkQEsHwGGHe VUob6hAAgSJq1IcftZR4DSk/Mlrt0x4orDNCmoGhxAlOT7ryiidYhXuKV2tvWloA 3v7X5E0r9CroS3PMghQtVOD7qJMjNAWKun5C6zPhLkIeV+CvcLAHhfShlcyWhJ76 PQaHHSxRQGEh5M2Xcp26kdwqrARbhcl66ukBvFNpiNUkLH+robDmIazraI5a2bV/ 9azNoVUZBSXQoJYpPz/tBTxu2EToj/xrMVIZg1OPHR5cxtOwUSZCr8V69KGK4onm avYQY8TSCDxsG1VcywYzBNi2W6lKs2EFlhCVZBLCz0NIkZCYTXErI4OsuExMufLu t17sAfHjQg+SxEcL5pc+iQkr9i0LLnujKz+Cl0ShtRk6SER0U+9pc/yf0wQSGDhB AZz87z+a6+r4pxdTSclOkpQCAfRR+pWjNwA5dyi6/72Qbqi6lmwKWDPJnnyq0YS4 UZI01zjs7ir93nS1zwJcekFOJCSTsb6XmhEgMVlpw+YoZHaOki1KJMCU0kIgZt8O YlEniP/DdXBS0mflOJQnoes7XrcIWVqWEubeRZdoWYnC07hNmdg4XJ0c3Skx8ZW+ gL8kt4pDWlnKHlTlhtgocG3H5BkMazrYEmbograc/Oe8lkr9ESqIS7yS1l8lM7Z6 i0HXATcdvDHV0AqW/uoNczXpck4x8xrahIzyPqAve2G15XEIgq4= =D9fV -----END PGP SIGNATURE----- Merge tag 'ras_updates_for_v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull x86 RAS updates from Borislav Petkov: - Enable additional logging mode on older Xeons (Tony Luck) - Pass error records logged by firmware through the MCE decoding chain to provide human-readable error descriptions instead of raw values (Smita Koralahalli) - Some #MC handler fixes (Gabriele Paoloni) - The usual small fixes and cleanups all over. 
* tag 'ras_updates_for_v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Rename kill_it to kill_current_task x86/mce: Remove redundant call to irq_work_queue() x86/mce: Panic for LMCE only if mca_cfg.tolerant < 3 x86/mce: Move the mce_panic() call and 'kill_it' assignments to the right places x86/mce, cper: Pass x86 CPER through the MCA handling chain x86/mce: Use "safe" MSR functions when enabling additional error logging x86/mce: Correct the detection of invalid notifier priorities x86/mce: Assign boolean values to a bool variable x86/mce: Enable additional error logging on certain Intel CPUs x86/mce: Remove unneeded break
This commit is contained in:
commit
2b34233ce2
@ -159,6 +159,8 @@ static inline u64 x86_default_get_root_pointer(void)
|
||||
extern int x86_acpi_numa_init(void);
|
||||
#endif /* CONFIG_ACPI_NUMA */
|
||||
|
||||
struct cper_ia_proc_ctx;
|
||||
|
||||
#ifdef CONFIG_ACPI_APEI
|
||||
static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
|
||||
{
|
||||
@ -177,6 +179,15 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
|
||||
*/
|
||||
return PAGE_KERNEL_NOENC;
|
||||
}
|
||||
|
||||
int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
|
||||
u64 lapic_id);
|
||||
#else
|
||||
/*
 * Stub used on the #else side of the surrounding preprocessor conditional.
 * It ignores both arguments and always returns -EINVAL.
 */
static inline int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
|
||||
u64 lapic_id)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
#endif
|
||||
|
||||
#define ACPI_TABLE_UPGRADE_MAX_PHYS (max_low_pfn_mapped << PAGE_SHIFT)
|
||||
|
@ -177,7 +177,8 @@ enum mce_notifier_prios {
|
||||
MCE_PRIO_EXTLOG,
|
||||
MCE_PRIO_UC,
|
||||
MCE_PRIO_EARLY,
|
||||
MCE_PRIO_CEC
|
||||
MCE_PRIO_CEC,
|
||||
MCE_PRIO_HIGHEST = MCE_PRIO_CEC
|
||||
};
|
||||
|
||||
struct notifier_block;
|
||||
@ -198,16 +199,22 @@ static inline void enable_copy_mc_fragile(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
struct cper_ia_proc_ctx;
|
||||
|
||||
#ifdef CONFIG_X86_MCE
|
||||
int mcheck_init(void);
|
||||
void mcheck_cpu_init(struct cpuinfo_x86 *c);
|
||||
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
|
||||
void mcheck_vendor_init_severity(void);
|
||||
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
|
||||
u64 lapic_id);
|
||||
#else
|
||||
/*
 * Stubs for the #else side of CONFIG_X86_MCE: the mcheck_*() helpers do
 * nothing (mcheck_init() returns 0) and apei_smca_report_x86_error()
 * always returns -EINVAL.
 */
static inline int mcheck_init(void) { return 0; }
|
||||
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
|
||||
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
|
||||
static inline void mcheck_vendor_init_severity(void) {}
|
||||
static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
|
||||
u64 lapic_id) { return -EINVAL; }
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_ANCIENT_MCE
|
||||
|
@ -139,6 +139,7 @@
|
||||
#define MSR_IA32_MCG_CAP 0x00000179
|
||||
#define MSR_IA32_MCG_STATUS 0x0000017a
|
||||
#define MSR_IA32_MCG_CTL 0x0000017b
|
||||
#define MSR_ERROR_CONTROL 0x0000017f
|
||||
#define MSR_IA32_MCG_EXT_CTL 0x000004d0
|
||||
|
||||
#define MSR_OFFCORE_RSP_0 0x000001a6
|
||||
|
@ -43,3 +43,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
|
||||
apei_mce_report_mem_error(sev, mem_err);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Arch hook for an x86 CPER processor context: passes @ctx_info and
 * @lapic_id unchanged to apei_smca_report_x86_error() and returns its
 * result.
 */
int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
|
||||
{
|
||||
return apei_smca_report_x86_error(ctx_info, lapic_id);
|
||||
}
|
||||
|
@ -51,6 +51,67 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
|
||||
|
||||
/*
 * apei_smca_report_x86_error - build an MCE record from a CPER register
 * array and hand it to mce_log().
 *
 * @ctx_info: context header; the 64-bit register values are read from the
 *            memory immediately following it.
 * @lapic_id: local APIC ID supplied by the caller, compared against each
 *            possible CPU's initial_apicid.
 *
 * Returns 0 once the record has been passed to mce_log(), or -EINVAL when
 * the boot CPU lacks X86_FEATURE_SMCA, when msr_addr fails the MCi_STATUS
 * check, or when reg_arr_size is smaller than 48 bytes.
 */
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
|
||||
{
|
||||
/* The register values start right after the context header. */
const u64 *i_mce = ((const u64 *) (ctx_info + 1));
|
||||
unsigned int cpu;
|
||||
struct mce m;
|
||||

||||
if (!boot_cpu_has(X86_FEATURE_SMCA))
|
||||
return -EINVAL;
|
||||

||||
/*
|
||||
* The starting address of the register array extracted from BERT must
|
||||
* match with the first expected register in the register layout of
|
||||
* SMCA address space. This address corresponds to the bank's MCA_STATUS
|
||||
* register.
|
||||
*
|
||||
* Match any MCi_STATUS register by turning off bank numbers.
|
||||
*/
|
||||
if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) !=
|
||||
MSR_AMD64_SMCA_MC0_STATUS)
|
||||
return -EINVAL;
|
||||

||||
/*
|
||||
* The register array size must be large enough to include all the
|
||||
* SMCA registers which need to be extracted.
|
||||
*
|
||||
* The number of registers in the register array is determined by
|
||||
* Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
|
||||
* The register layout is fixed and currently the raw data in the
|
||||
* register array includes 6 SMCA registers which the kernel can
|
||||
* extract.
|
||||
*
* 6 registers of 8 bytes each give the 48-byte minimum checked below.
*/
|
||||
if (ctx_info->reg_arr_size < 48)
|
||||
return -EINVAL;
|
||||

||||
mce_setup(&m);
|
||||

||||
/* Both fields keep -1 if no possible CPU matches lapic_id below. */
m.extcpu = -1;
|
||||
m.socketid = -1;
|
||||

||||
for_each_possible_cpu(cpu) {
|
||||
if (cpu_data(cpu).initial_apicid == lapic_id) {
|
||||
m.extcpu = cpu;
|
||||
m.socketid = cpu_data(m.extcpu).phys_proc_id;
|
||||
break;
|
||||
}
|
||||
}
|
||||

||||
m.apicid = lapic_id;
|
||||
/* Bits 11:4 of the MSR address are used as the bank number. */
m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
|
||||
m.status = *i_mce;
|
||||
m.addr = *(i_mce + 1);
|
||||
m.misc = *(i_mce + 2);
|
||||
/* Skipping MCA_CONFIG (index 3 of the register array) */
|
||||
m.ipid = *(i_mce + 4);
|
||||
m.synd = *(i_mce + 5);
|
||||

||||
mce_log(&m);
|
||||

||||
return 0;
|
||||
}
|
||||
|
||||
#define CPER_CREATOR_MCE \
|
||||
GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
|
||||
0x64, 0x90, 0xb8, 0x9d)
|
||||
|
@ -162,7 +162,8 @@ EXPORT_SYMBOL_GPL(mce_log);
|
||||
|
||||
void mce_register_decode_chain(struct notifier_block *nb)
|
||||
{
|
||||
if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
|
||||
if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
|
||||
nb->priority > MCE_PRIO_HIGHEST))
|
||||
return;
|
||||
|
||||
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
|
||||
@ -1265,14 +1266,14 @@ static void kill_me_maybe(struct callback_head *cb)
|
||||
}
|
||||
}
|
||||
|
||||
static void queue_task_work(struct mce *m, int kill_it)
|
||||
static void queue_task_work(struct mce *m, int kill_current_task)
|
||||
{
|
||||
current->mce_addr = m->addr;
|
||||
current->mce_kflags = m->kflags;
|
||||
current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
|
||||
current->mce_whole_page = whole_page(m);
|
||||
|
||||
if (kill_it)
|
||||
if (kill_current_task)
|
||||
current->mce_kill_me.func = kill_me_now;
|
||||
else
|
||||
current->mce_kill_me.func = kill_me_maybe;
|
||||
@ -1320,10 +1321,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
int no_way_out = 0;
|
||||
|
||||
/*
|
||||
* If kill_it gets set, there might be a way to recover from this
|
||||
* If kill_current_task is not set, there might be a way to recover from this
|
||||
* error.
|
||||
*/
|
||||
int kill_it = 0;
|
||||
int kill_current_task = 0;
|
||||
|
||||
/*
|
||||
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
|
||||
@ -1350,8 +1351,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
* severity is MCE_AR_SEVERITY we have other options.
|
||||
*/
|
||||
if (!(m.mcgstatus & MCG_STATUS_RIPV))
|
||||
kill_it = 1;
|
||||
|
||||
kill_current_task = (cfg->tolerant == 3) ? 0 : 1;
|
||||
/*
|
||||
* Check if this MCE is signaled to only this logical processor,
|
||||
* on Intel, Zhaoxin only.
|
||||
@ -1368,7 +1368,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
* to see it will clear it.
|
||||
*/
|
||||
if (lmce) {
|
||||
if (no_way_out)
|
||||
if (no_way_out && cfg->tolerant < 3)
|
||||
mce_panic("Fatal local machine check", &m, msg);
|
||||
} else {
|
||||
order = mce_start(&no_way_out);
|
||||
@ -1387,6 +1387,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
if (mce_end(order) < 0) {
|
||||
if (!no_way_out)
|
||||
no_way_out = worst >= MCE_PANIC_SEVERITY;
|
||||
|
||||
if (no_way_out && cfg->tolerant < 3)
|
||||
mce_panic("Fatal machine check on current CPU", &m, msg);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
@ -1403,19 +1406,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If tolerant is at an insane level we drop requests to kill
|
||||
* processes and continue even when there is no way out.
|
||||
*/
|
||||
if (cfg->tolerant == 3)
|
||||
kill_it = 0;
|
||||
else if (no_way_out)
|
||||
mce_panic("Fatal machine check on current CPU", &m, msg);
|
||||
|
||||
if (worst > 0)
|
||||
irq_work_queue(&mce_irq_work);
|
||||
|
||||
if (worst != MCE_AR_SEVERITY && !kill_it)
|
||||
if (worst != MCE_AR_SEVERITY && !kill_current_task)
|
||||
goto out;
|
||||
|
||||
/* Fault was in user mode and we need to take some action */
|
||||
@ -1423,7 +1414,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
/* If this triggers there is no way to recover. Die hard. */
|
||||
BUG_ON(!on_thread_stack() || !user_mode(regs));
|
||||
|
||||
queue_task_work(&m, kill_it);
|
||||
queue_task_work(&m, kill_current_task);
|
||||
|
||||
} else {
|
||||
/*
|
||||
@ -1441,7 +1432,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
}
|
||||
|
||||
if (m.kflags & MCE_IN_KERNEL_COPYIN)
|
||||
queue_task_work(&m, kill_it);
|
||||
queue_task_work(&m, kill_current_task);
|
||||
}
|
||||
out:
|
||||
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
|
||||
@ -1583,7 +1574,7 @@ static void __mcheck_cpu_mce_banks_init(void)
|
||||
* __mcheck_cpu_init_clear_banks() does the final bank setup.
|
||||
*/
|
||||
b->ctl = -1ULL;
|
||||
b->init = 1;
|
||||
b->init = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1764,7 +1755,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
|
||||
*/
|
||||
|
||||
if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
|
||||
mce_banks[0].init = 0;
|
||||
mce_banks[0].init = false;
|
||||
|
||||
/*
|
||||
* All newer Intel systems support MCE broadcasting. Enable
|
||||
@ -1813,11 +1804,9 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
|
||||
case X86_VENDOR_INTEL:
|
||||
intel_p5_mcheck_init(c);
|
||||
return 1;
|
||||
break;
|
||||
case X86_VENDOR_CENTAUR:
|
||||
winchip_mcheck_init(c);
|
||||
return 1;
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
@ -509,12 +509,33 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Enable additional error logs from the integrated
|
||||
* memory controller on processors that support this.
|
||||
*
* Only the models listed in the switch are touched; for any other
* c->x86_model the function returns without accessing MSR_ERROR_CONTROL.
*/
|
||||
static void intel_imc_init(struct cpuinfo_x86 *c)
|
||||
{
|
||||
u64 error_control;
|
||||

||||
switch (c->x86_model) {
|
||||
case INTEL_FAM6_SANDYBRIDGE_X:
|
||||
case INTEL_FAM6_IVYBRIDGE_X:
|
||||
case INTEL_FAM6_HASWELL_X:
|
||||
/* Give up quietly if the MSR cannot be read. */
if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
|
||||
return;
|
||||
/* Set bit 1 and write the value back; the write result is not checked. */
error_control |= 2;
|
||||
wrmsrl_safe(MSR_ERROR_CONTROL, error_control);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Calls each Intel MCE feature init helper below, in the order listed.
 * Only intel_init_thermal(), intel_ppin_init() and intel_imc_init()
 * receive @c; the CMCI and LMCE helpers take no arguments.
 */
void mce_intel_feature_init(struct cpuinfo_x86 *c)
|
||||
{
|
||||
intel_init_thermal(c);
|
||||
intel_init_cmci();
|
||||
intel_init_lmce();
|
||||
intel_ppin_init(c);
|
||||
intel_imc_init(c);
|
||||
}
|
||||
|
||||
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
|
||||
|
@ -2,6 +2,7 @@
|
||||
// Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
#include <linux/cper.h>
|
||||
#include <linux/acpi.h>
|
||||
|
||||
/*
|
||||
* We don't need a "CPER_IA" prefix since these are all locally defined.
|
||||
@ -347,9 +348,13 @@ void cper_print_proc_ia(const char *pfx, const struct cper_sec_proc_ia *proc)
|
||||
ctx_info->mm_reg_addr);
|
||||
}
|
||||
|
||||
printk("%sRegister Array:\n", newpfx);
|
||||
print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, groupsize,
|
||||
(ctx_info + 1), ctx_info->reg_arr_size, 0);
|
||||
if (ctx_info->reg_ctx_type != CTX_TYPE_MSR ||
|
||||
arch_apei_report_x86_error(ctx_info, proc->lapic_id)) {
|
||||
printk("%sRegister Array:\n", newpfx);
|
||||
print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16,
|
||||
groupsize, (ctx_info + 1),
|
||||
ctx_info->reg_arr_size, 0);
|
||||
}
|
||||
|
||||
ctx_info = (struct cper_ia_proc_ctx *)((long)ctx_info + size);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user