x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers

Starting with Zen4, AMD's Scalable MCA systems incorporate two new registers:
MCA_SYND1 and MCA_SYND2.

These registers hold supplemental error information in addition to the
existing MCA_SYND register. The data within these registers is considered
valid if MCA_STATUS[SyndV] is set.

Userspace error decoding tools like rasdaemon gather related hardware error
information through the tracepoints.

Therefore, export these two registers through the mce_record tracepoint so
that tools like rasdaemon can parse them and output the supplemental error
information, such as the FRU text, contained in them.

  [ bp: Massage. ]

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Link: https://lore.kernel.org/r/20241022194158.110073-4-avadhut.naik@amd.com
This commit is contained in:
Avadhut Naik 2024-10-22 19:36:29 +00:00 committed by Borislav Petkov (AMD)
parent e52750fb14
commit d4fca1358e
6 changed files with 46 additions and 7 deletions

View File

@ -122,6 +122,9 @@
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
/* Registers MISC2 to MISC4 are at offsets B to D. */
/* Bank 0 addresses of the supplemental syndrome registers MCA_SYND1 and MCA_SYND2 (offsets E and F). */
#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e
#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f
/* Per-bank accessors: bank x's register is the bank 0 address plus 0x10 * x. */
#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
@ -132,6 +135,8 @@
/*
 * Per-bank SMCA register addresses: bank x's register is the bank 0 address
 * plus 0x10 * x. Every macro argument is parenthesized so that compound
 * expressions (e.g. "a | b" or "c ? 1 : 0") expand with the intended
 * precedence.
 */
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
/* y is the register offset relative to MISC1 (y == 0 selects MISC1 itself). */
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + (y)) + (0x10*(x)))
#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))
/* Shift x right by 16 bits and keep only the bits selected by mask. */
#define XEC(x, mask) (((x) >> 16) & (mask))
@ -190,9 +195,25 @@ enum mce_notifier_prios {
/**
 * struct mce_hw_err - Hardware Error Record.
 * @m: Machine Check record.
 * @vendor: Vendor-specific error information.
 *
 * Vendor-specific fields should not be added to struct mce. Instead, vendors
 * should export their vendor-specific data through their structure in the
 * vendor union below.
 *
 * AMD's vendor data is parsed by error decoding tools for supplemental error
 * information. Thus, the current offsets of existing fields must be
 * maintained. Only add new fields at the end of AMD's vendor structure.
 */
struct mce_hw_err {
struct mce m;
/* One member per vendor; only AMD currently has vendor-specific data here. */
union vendor_info {
struct {
u64 synd1; /* MCA_SYND1 MSR */
u64 synd2; /* MCA_SYND2 MSR */
} amd;
} vendor;
};
/* Get the enclosing struct mce_hw_err from a pointer to its embedded struct mce member 'm'. */
#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m)

View File

@ -8,7 +8,8 @@
/*
* Fields are zero when not available. Also, this struct is shared with
* userspace mcelog and thus must keep existing fields at current offsets.
* Only add new fields to the end of the structure
* Only add new, shared fields to the end of the structure.
* Do not add vendor-specific fields.
*/
struct mce {
__u64 status; /* Bank's MCi_STATUS MSR */

View File

@ -797,8 +797,11 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
if (mce_flags.smca) {
rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
if (m->status & MCI_STATUS_SYNDV)
if (m->status & MCI_STATUS_SYNDV) {
rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
}
}
mce_log(&err);

View File

@ -202,6 +202,10 @@ static void __print_mce(struct mce_hw_err *err)
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
if (err->vendor.amd.synd1)
pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
if (err->vendor.amd.synd2)
pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
@ -678,8 +682,11 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
if (m->status & MCI_STATUS_SYNDV)
if (m->status & MCI_STATUS_SYNDV) {
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
}
}
}

View File

@ -793,6 +793,7 @@ static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
struct mce *m = (struct mce *)data;
struct mce_hw_err *err = to_mce_hw_err(m);
unsigned int fam = x86_family(m->cpuid);
int ecc;
@ -850,8 +851,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (boot_cpu_has(X86_FEATURE_SMCA)) {
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
if (m->status & MCI_STATUS_SYNDV)
pr_cont(", Syndrome: 0x%016llx", m->synd);
if (m->status & MCI_STATUS_SYNDV) {
pr_cont(", Syndrome: 0x%016llx\n", m->synd);
pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
err->vendor.amd.synd1, err->vendor.amd.synd2);
}
pr_cont("\n");

View File

@ -43,6 +43,7 @@ TRACE_EVENT(mce_record,
__field( u8, bank )
__field( u8, cpuvendor )
__field( u32, microcode )
__dynamic_array(u8, v_data, sizeof(err->vendor))
),
TP_fast_assign(
@ -65,9 +66,10 @@ TRACE_EVENT(mce_record,
__entry->bank = err->m.bank;
__entry->cpuvendor = err->m.cpuvendor;
__entry->microcode = err->m.microcode;
memcpy(__get_dynamic_array(v_data), &err->vendor, sizeof(err->vendor));
),
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x",
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s",
__entry->cpu,
__entry->mcgcap, __entry->mcgstatus,
__entry->bank, __entry->status,
@ -83,7 +85,8 @@ TRACE_EVENT(mce_record,
__entry->walltime,
__entry->socketid,
__entry->apicid,
__entry->microcode)
__entry->microcode,
__print_dynamic_array(v_data, sizeof(u8)))
);
#endif /* _TRACE_MCE_H */