mirror of
https://github.com/torvalds/linux.git
synced 2024-11-24 21:21:41 +00:00
x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers
Starting with Zen4, AMD's Scalable MCA systems incorporate two new registers: MCA_SYND1 and MCA_SYND2. These registers will include supplemental error information in addition to the existing MCA_SYND register. The data within these registers is considered valid if MCA_STATUS[SyndV] is set. Userspace error decoding tools like rasdaemon gather related hardware error information through the tracepoints. Therefore, export these two registers through the mce_record tracepoint so that tools like rasdaemon can parse them and output the supplemental error information like FRU text contained in them. [ bp: Massage. ] Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com> Signed-off-by: Avadhut Naik <avadhut.naik@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com> Link: https://lore.kernel.org/r/20241022194158.110073-4-avadhut.naik@amd.com
This commit is contained in:
parent
e52750fb14
commit
d4fca1358e
@ -122,6 +122,9 @@
|
||||
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
|
||||
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
|
||||
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
|
||||
/* Registers MISC2 to MISC4 are at offsets B to D. */
|
||||
#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e
|
||||
#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f
|
||||
#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
|
||||
@ -132,6 +135,8 @@
|
||||
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
|
||||
/*
 * MSR address of a MISC register of SMCA bank x: y is the register offset
 * relative to MISC1 (y = 0 selects MISC1), and consecutive banks are 0x10
 * MSRs apart. Both arguments are parenthesized so that compound expressions
 * (e.g. "cond ? a : b") expand with the intended precedence.
 */
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + (y)) + (0x10*(x)))
|
||||
#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))
|
||||
|
||||
/*
 * Shift x right by 16 bits and keep only the bits selected by mask.
 * mask is parenthesized so that a compound mask such as (A | B) is applied
 * as a whole rather than being split by operator precedence.
 */
#define XEC(x, mask) (((x) >> 16) & (mask))
|
||||
|
||||
@ -190,9 +195,25 @@ enum mce_notifier_prios {
|
||||
/**
|
||||
* struct mce_hw_err - Hardware Error Record.
|
||||
* @m: Machine Check record.
|
||||
* @vendor: Vendor-specific error information.
|
||||
*
|
||||
* Vendor-specific fields should not be added to struct mce. Instead, vendors
|
||||
* should export their vendor-specific data through their structure in the
|
||||
* vendor union below.
|
||||
*
|
||||
* AMD's vendor data is parsed by error decoding tools for supplemental error
|
||||
* information. Thus, current offsets of existing fields must be maintained.
|
||||
* Only add new fields at the end of AMD's vendor structure.
|
||||
*/
|
||||
struct mce_hw_err {
|
||||
struct mce m;
|
||||
|
||||
/* Vendor payload; the mce_record trace event copies sizeof(err->vendor) bytes of it. */
union vendor_info {
|
||||
/* AMD SMCA: filled from the bank's SYND1/SYND2 MSRs when MCI_STATUS_SYNDV is set. */
struct {
|
||||
u64 synd1; /* MCA_SYND1 MSR */
|
||||
u64 synd2; /* MCA_SYND2 MSR */
|
||||
} amd;
|
||||
} vendor;
|
||||
};
|
||||
|
||||
/*
 * Map a pointer to the embedded 'm' member back to its containing
 * struct mce_hw_err. NOTE(review): only meaningful when @mce really is the
 * 'm' member of a struct mce_hw_err - confirm at each call site.
 */
#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m)
|
||||
|
@ -8,7 +8,8 @@
|
||||
/*
|
||||
* Fields are zero when not available. Also, this struct is shared with
|
||||
* userspace mcelog and thus must keep existing fields at current offsets.
|
||||
* Only add new fields to the end of the structure
|
||||
* Only add new, shared fields to the end of the structure.
|
||||
* Do not add vendor-specific fields.
|
||||
*/
|
||||
struct mce {
|
||||
__u64 status; /* Bank's MCi_STATUS MSR */
|
||||
|
@ -797,8 +797,11 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
|
||||
if (mce_flags.smca) {
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
|
||||
|
||||
if (m->status & MCI_STATUS_SYNDV)
|
||||
if (m->status & MCI_STATUS_SYNDV) {
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
|
||||
}
|
||||
}
|
||||
|
||||
mce_log(&err);
|
||||
|
@ -202,6 +202,10 @@ static void __print_mce(struct mce_hw_err *err)
|
||||
if (mce_flags.smca) {
|
||||
if (m->synd)
|
||||
pr_cont("SYND %llx ", m->synd);
|
||||
if (err->vendor.amd.synd1)
|
||||
pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
|
||||
if (err->vendor.amd.synd2)
|
||||
pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
|
||||
if (m->ipid)
|
||||
pr_cont("IPID %llx ", m->ipid);
|
||||
}
|
||||
@ -678,8 +682,11 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
|
||||
if (mce_flags.smca) {
|
||||
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
|
||||
|
||||
if (m->status & MCI_STATUS_SYNDV)
|
||||
if (m->status & MCI_STATUS_SYNDV) {
|
||||
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
|
||||
err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
|
||||
err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -793,6 +793,7 @@ static int
|
||||
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
|
||||
{
|
||||
struct mce *m = (struct mce *)data;
|
||||
struct mce_hw_err *err = to_mce_hw_err(m);
|
||||
unsigned int fam = x86_family(m->cpuid);
|
||||
int ecc;
|
||||
|
||||
@ -850,8 +851,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
|
||||
if (boot_cpu_has(X86_FEATURE_SMCA)) {
|
||||
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
|
||||
|
||||
if (m->status & MCI_STATUS_SYNDV)
|
||||
pr_cont(", Syndrome: 0x%016llx", m->synd);
|
||||
if (m->status & MCI_STATUS_SYNDV) {
|
||||
pr_cont(", Syndrome: 0x%016llx\n", m->synd);
|
||||
pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
|
||||
err->vendor.amd.synd1, err->vendor.amd.synd2);
|
||||
}
|
||||
|
||||
pr_cont("\n");
|
||||
|
||||
|
@ -43,6 +43,7 @@ TRACE_EVENT(mce_record,
|
||||
__field( u8, bank )
|
||||
__field( u8, cpuvendor )
|
||||
__field( u32, microcode )
|
||||
__dynamic_array(u8, v_data, sizeof(err->vendor))
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@ -65,9 +66,10 @@ TRACE_EVENT(mce_record,
|
||||
__entry->bank = err->m.bank;
|
||||
__entry->cpuvendor = err->m.cpuvendor;
|
||||
__entry->microcode = err->m.microcode;
|
||||
memcpy(__get_dynamic_array(v_data), &err->vendor, sizeof(err->vendor));
|
||||
),
|
||||
|
||||
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x",
|
||||
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s",
|
||||
__entry->cpu,
|
||||
__entry->mcgcap, __entry->mcgstatus,
|
||||
__entry->bank, __entry->status,
|
||||
@ -83,7 +85,8 @@ TRACE_EVENT(mce_record,
|
||||
__entry->walltime,
|
||||
__entry->socketid,
|
||||
__entry->apicid,
|
||||
__entry->microcode)
|
||||
__entry->microcode,
|
||||
__print_dynamic_array(v_data, sizeof(u8)))
|
||||
);
|
||||
|
||||
#endif /* _TRACE_MCE_H */
|
||||
|
Loading…
Reference in New Issue
Block a user