From 0b746e8c1e1e3fcc9e4036efe8d3ea3fd3e5d4c3 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 28 Oct 2021 17:56:56 +0000 Subject: [PATCH 01/24] x86/MCE/AMD, EDAC/amd64: Move address translation to AMD64 EDAC The address translation code used for current AMD systems is non-architectural. So move it to EDAC. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211028175728.121452-2-yazen.ghannam@amd.com --- arch/x86/include/asm/mce.h | 3 - arch/x86/kernel/cpu/mce/amd.c | 200 ---------------------------------- drivers/edac/amd64_edac.c | 199 +++++++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+), 203 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8f6395d9e209..d58f4f2e006f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -345,7 +345,6 @@ extern int mce_threshold_create_device(unsigned int cpu); extern int mce_threshold_remove_device(unsigned int cpu); void mce_amd_feature_init(struct cpuinfo_x86 *c); -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); enum smca_bank_types smca_get_bank_type(unsigned int bank); #else @@ -353,8 +352,6 @@ static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } -static inline int -umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index fc85eb17cb6d..2f351838d5f7 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -689,206 +689,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) -{ - u64 dram_base_addr, dram_limit_addr, dram_hole_base; - /* We start from the normalized address */ - u64 ret_addr = norm_addr; - - u32 tmp; - - u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; - u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; - u8 intlv_addr_sel, intlv_addr_bit; - u8 num_intlv_bits, hashed_bit; - u8 lgcy_mmio_hole_en, base = 0; - u8 cs_mask, cs_id = 0; - bool hash_enabled = false; - - /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ - if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) - goto out_err; - - /* Remove HiAddrOffset from normalized address, if enabled: */ - if (tmp & BIT(0)) { - u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; - - if (norm_addr >= hi_addr_offset) { - ret_addr -= hi_addr_offset; - base = 1; - } - } - - /* Read D18F0x110 (DramBaseAddress). */ - if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) - goto out_err; - - /* Check if address range is valid. */ - if (!(tmp & BIT(0))) { - pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", - __func__, tmp); - goto out_err; - } - - lgcy_mmio_hole_en = tmp & BIT(1); - intlv_num_chan = (tmp >> 4) & 0xF; - intlv_addr_sel = (tmp >> 8) & 0x7; - dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; - - /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ - if (intlv_addr_sel > 3) { - pr_err("%s: Invalid interleave address select %d.\n", - __func__, intlv_addr_sel); - goto out_err; - } - - /* Read D18F0x114 (DramLimitAddress). */ - if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) - goto out_err; - - intlv_num_sockets = (tmp >> 8) & 0x1; - intlv_num_dies = (tmp >> 10) & 0x3; - dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); - - intlv_addr_bit = intlv_addr_sel + 8; - - /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ - switch (intlv_num_chan) { - case 0: intlv_num_chan = 0; break; - case 1: intlv_num_chan = 1; break; - case 3: intlv_num_chan = 2; break; - case 5: intlv_num_chan = 3; break; - case 7: intlv_num_chan = 4; break; - - case 8: intlv_num_chan = 1; - hash_enabled = true; - break; - default: - pr_err("%s: Invalid number of interleaved channels %d.\n", - __func__, intlv_num_chan); - goto out_err; - } - - num_intlv_bits = intlv_num_chan; - - if (intlv_num_dies > 2) { - pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", - __func__, intlv_num_dies); - goto out_err; - } - - num_intlv_bits += intlv_num_dies; - - /* Add a bit if sockets are interleaved. */ - num_intlv_bits += intlv_num_sockets; - - /* Assert num_intlv_bits <= 4 */ - if (num_intlv_bits > 4) { - pr_err("%s: Invalid interleave bits %d.\n", - __func__, num_intlv_bits); - goto out_err; - } - - if (num_intlv_bits > 0) { - u64 temp_addr_x, temp_addr_i, temp_addr_y; - u8 die_id_bit, sock_id_bit, cs_fabric_id; - - /* - * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. - * This is the fabric id for this coherent slave. Use - * umc/channel# as instance id of the coherent slave - * for FICAA. - */ - if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) - goto out_err; - - cs_fabric_id = (tmp >> 8) & 0xFF; - die_id_bit = 0; - - /* If interleaved over more than 1 channel: */ - if (intlv_num_chan) { - die_id_bit = intlv_num_chan; - cs_mask = (1 << die_id_bit) - 1; - cs_id = cs_fabric_id & cs_mask; - } - - sock_id_bit = die_id_bit; - - /* Read D18F1x208 (SystemFabricIdMask). */ - if (intlv_num_dies || intlv_num_sockets) - if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) - goto out_err; - - /* If interleaved over more than 1 die. */ - if (intlv_num_dies) { - sock_id_bit = die_id_bit + intlv_num_dies; - die_id_shift = (tmp >> 24) & 0xF; - die_id_mask = (tmp >> 8) & 0xFF; - - cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; - } - - /* If interleaved over more than 1 socket. */ - if (intlv_num_sockets) { - socket_id_shift = (tmp >> 28) & 0xF; - socket_id_mask = (tmp >> 16) & 0xFF; - - cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; - } - - /* - * The pre-interleaved address consists of XXXXXXIIIYYYYY - * where III is the ID for this CS, and XXXXXXYYYYY are the - * address bits from the post-interleaved address. - * "num_intlv_bits" has been calculated to tell us how many "I" - * bits there are. "intlv_addr_bit" tells us how many "Y" bits - * there are (where "I" starts). - */ - temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); - temp_addr_i = (cs_id << intlv_addr_bit); - temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; - ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; - } - - /* Add dram base address */ - ret_addr += dram_base_addr; - - /* If legacy MMIO hole enabled */ - if (lgcy_mmio_hole_en) { - if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) - goto out_err; - - dram_hole_base = tmp & GENMASK(31, 24); - if (ret_addr >= dram_hole_base) - ret_addr += (BIT_ULL(32) - dram_hole_base); - } - - if (hash_enabled) { - /* Save some parentheses and grab ls-bit at the end. */ - hashed_bit = (ret_addr >> 12) ^ - (ret_addr >> 18) ^ - (ret_addr >> 21) ^ - (ret_addr >> 30) ^ - cs_id; - - hashed_bit &= BIT(0); - - if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) - ret_addr ^= BIT(intlv_addr_bit); - } - - /* Is calculated system address is above DRAM limit address? */ - if (ret_addr > dram_limit_addr) - goto out_err; - - *sys_addr = ret_addr; - return 0; - -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); - bool amd_mce_is_memory_error(struct mce *m) { /* ErrCodeExt[20:16] */ diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 4fce75013674..d2ad9f06abb7 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -988,6 +988,205 @@ static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr) return csrow; } +static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) +{ + u64 dram_base_addr, dram_limit_addr, dram_hole_base; + /* We start from the normalized address */ + u64 ret_addr = norm_addr; + + u32 tmp; + + u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; + u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; + u8 intlv_addr_sel, intlv_addr_bit; + u8 num_intlv_bits, hashed_bit; + u8 lgcy_mmio_hole_en, base = 0; + u8 cs_mask, cs_id = 0; + bool hash_enabled = false; + + /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ + if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) + goto out_err; + + /* Remove HiAddrOffset from normalized address, if enabled: */ + if (tmp & BIT(0)) { + u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; + + if (norm_addr >= hi_addr_offset) { + ret_addr -= hi_addr_offset; + base = 1; + } + } + + /* Read D18F0x110 (DramBaseAddress). */ + if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) + goto out_err; + + /* Check if address range is valid. */ + if (!(tmp & BIT(0))) { + pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", + __func__, tmp); + goto out_err; + } + + lgcy_mmio_hole_en = tmp & BIT(1); + intlv_num_chan = (tmp >> 4) & 0xF; + intlv_addr_sel = (tmp >> 8) & 0x7; + dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; + + /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ + if (intlv_addr_sel > 3) { + pr_err("%s: Invalid interleave address select %d.\n", + __func__, intlv_addr_sel); + goto out_err; + } + + /* Read D18F0x114 (DramLimitAddress). */ + if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) + goto out_err; + + intlv_num_sockets = (tmp >> 8) & 0x1; + intlv_num_dies = (tmp >> 10) & 0x3; + dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); + + intlv_addr_bit = intlv_addr_sel + 8; + + /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ + switch (intlv_num_chan) { + case 0: intlv_num_chan = 0; break; + case 1: intlv_num_chan = 1; break; + case 3: intlv_num_chan = 2; break; + case 5: intlv_num_chan = 3; break; + case 7: intlv_num_chan = 4; break; + + case 8: intlv_num_chan = 1; + hash_enabled = true; + break; + default: + pr_err("%s: Invalid number of interleaved channels %d.\n", + __func__, intlv_num_chan); + goto out_err; + } + + num_intlv_bits = intlv_num_chan; + + if (intlv_num_dies > 2) { + pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", + __func__, intlv_num_dies); + goto out_err; + } + + num_intlv_bits += intlv_num_dies; + + /* Add a bit if sockets are interleaved. */ + num_intlv_bits += intlv_num_sockets; + + /* Assert num_intlv_bits <= 4 */ + if (num_intlv_bits > 4) { + pr_err("%s: Invalid interleave bits %d.\n", + __func__, num_intlv_bits); + goto out_err; + } + + if (num_intlv_bits > 0) { + u64 temp_addr_x, temp_addr_i, temp_addr_y; + u8 die_id_bit, sock_id_bit, cs_fabric_id; + + /* + * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. + * This is the fabric id for this coherent slave. Use + * umc/channel# as instance id of the coherent slave + * for FICAA. + */ + if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) + goto out_err; + + cs_fabric_id = (tmp >> 8) & 0xFF; + die_id_bit = 0; + + /* If interleaved over more than 1 channel: */ + if (intlv_num_chan) { + die_id_bit = intlv_num_chan; + cs_mask = (1 << die_id_bit) - 1; + cs_id = cs_fabric_id & cs_mask; + } + + sock_id_bit = die_id_bit; + + /* Read D18F1x208 (SystemFabricIdMask). */ + if (intlv_num_dies || intlv_num_sockets) + if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) + goto out_err; + + /* If interleaved over more than 1 die. */ + if (intlv_num_dies) { + sock_id_bit = die_id_bit + intlv_num_dies; + die_id_shift = (tmp >> 24) & 0xF; + die_id_mask = (tmp >> 8) & 0xFF; + + cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; + } + + /* If interleaved over more than 1 socket. */ + if (intlv_num_sockets) { + socket_id_shift = (tmp >> 28) & 0xF; + socket_id_mask = (tmp >> 16) & 0xFF; + + cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; + } + + /* + * The pre-interleaved address consists of XXXXXXIIIYYYYY + * where III is the ID for this CS, and XXXXXXYYYYY are the + * address bits from the post-interleaved address. + * "num_intlv_bits" has been calculated to tell us how many "I" + * bits there are. "intlv_addr_bit" tells us how many "Y" bits + * there are (where "I" starts). + */ + temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); + temp_addr_i = (cs_id << intlv_addr_bit); + temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; + ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; + } + + /* Add dram base address */ + ret_addr += dram_base_addr; + + /* If legacy MMIO hole enabled */ + if (lgcy_mmio_hole_en) { + if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) + goto out_err; + + dram_hole_base = tmp & GENMASK(31, 24); + if (ret_addr >= dram_hole_base) + ret_addr += (BIT_ULL(32) - dram_hole_base); + } + + if (hash_enabled) { + /* Save some parentheses and grab ls-bit at the end. */ + hashed_bit = (ret_addr >> 12) ^ + (ret_addr >> 18) ^ + (ret_addr >> 21) ^ + (ret_addr >> 30) ^ + cs_id; + + hashed_bit &= BIT(0); + + if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) + ret_addr ^= BIT(intlv_addr_bit); + } + + /* Is calculated system address is above DRAM limit address? */ + if (ret_addr > dram_limit_addr) + goto out_err; + + *sys_addr = ret_addr; + return 0; + +out_err: + return -EINVAL; +} + static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16); /* From b3218ae47771f943b3e222f35fc46afacba39929 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 28 Oct 2021 17:56:57 +0000 Subject: [PATCH 02/24] x86/amd_nb, EDAC/amd64: Move DF Indirect Read to AMD64 EDAC df_indirect_read() is used only for address translation. Move it to EDAC along with the translation code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211028175728.121452-3-yazen.ghannam@amd.com --- arch/x86/include/asm/amd_nb.h | 1 - arch/x86/kernel/amd_nb.c | 49 +--------------------------------- drivers/edac/amd64_edac.c | 50 +++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 455066a06f60..00d1a400b7a1 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -24,7 +24,6 @@ extern int amd_set_subcaches(int, unsigned long); extern int amd_smn_read(u16 node, u32 address, u32 *value); extern int amd_smn_write(u16 node, u32 address, u32 value); -extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo); struct amd_l3_cache { unsigned indices; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index c92c9c774c0e..f814d5fc333e 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -29,7 +29,7 @@ #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e -/* Protect the PCI config register pairs used for SMN and DF indirect access. */ +/* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); static u32 *flush_words; @@ -182,53 +182,6 @@ int amd_smn_write(u16 node, u32 address, u32 value) } EXPORT_SYMBOL_GPL(amd_smn_write); -/* - * Data Fabric Indirect Access uses FICAA/FICAD. - * - * Fabric Indirect Configuration Access Address (FICAA): Constructed based - * on the device's Instance Id and the PCI function and register offset of - * the desired register. - * - * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO - * and FICAD HI registers but so far we only need the LO register. - */ -int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) -{ - struct pci_dev *F4; - u32 ficaa; - int err = -ENODEV; - - if (node >= amd_northbridges.num) - goto out; - - F4 = node_to_amd_nb(node)->link; - if (!F4) - goto out; - - ficaa = 1; - ficaa |= reg & 0x3FC; - ficaa |= (func & 0x7) << 11; - ficaa |= instance_id << 16; - - mutex_lock(&smn_mutex); - - err = pci_write_config_dword(F4, 0x5C, ficaa); - if (err) { - pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); - goto out_unlock; - } - - err = pci_read_config_dword(F4, 0x98, lo); - if (err) - pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); - -out_unlock: - mutex_unlock(&smn_mutex); - -out: - return err; -} -EXPORT_SYMBOL_GPL(amd_df_indirect_read); int amd_cache_northbridges(void) { diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index d2ad9f06abb7..034d9863bcf9 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -988,6 +988,56 @@ static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr) return csrow; } +/* Protect the PCI config register pairs used for DF indirect access. */ +static DEFINE_MUTEX(df_indirect_mutex); + +/* + * Data Fabric Indirect Access uses FICAA/FICAD. + * + * Fabric Indirect Configuration Access Address (FICAA): Constructed based + * on the device's Instance Id and the PCI function and register offset of + * the desired register. + * + * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO + * and FICAD HI registers but so far we only need the LO register. + */ +static int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + struct pci_dev *F4; + u32 ficaa; + int err = -ENODEV; + + if (node >= amd_nb_num()) + goto out; + + F4 = node_to_amd_nb(node)->link; + if (!F4) + goto out; + + ficaa = 1; + ficaa |= reg & 0x3FC; + ficaa |= (func & 0x7) << 11; + ficaa |= instance_id << 16; + + mutex_lock(&df_indirect_mutex); + + err = pci_write_config_dword(F4, 0x5C, ficaa); + if (err) { + pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); + goto out_unlock; + } + + err = pci_read_config_dword(F4, 0x98, lo); + if (err) + pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); + +out_unlock: + mutex_unlock(&df_indirect_mutex); + +out: + return err; +} + static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { u64 dram_base_addr, dram_limit_addr, dram_hole_base; From 448c3d6085b71aad58cd515469560ee76c982007 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 28 Oct 2021 17:56:58 +0000 Subject: [PATCH 03/24] EDAC/amd64: Allow for DF Indirect Broadcast reads The DF Indirect Access method allows for "Broadcast" accesses in which case no specific instance is targeted. Add support using a reserved instance ID of 0xFF to indicate a broadcast access. Set the FICAA register appropriately. Define helpers functions for instance and broadcast reads and use them where appropriate. Drop the "amd_" prefix since these functions are all static. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211028175728.121452-4-yazen.ghannam@amd.com --- drivers/edac/amd64_edac.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 034d9863bcf9..d41b9a02cc7d 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1000,8 +1000,11 @@ static DEFINE_MUTEX(df_indirect_mutex); * * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO * and FICAD HI registers but so far we only need the LO register. + * + * Use Instance Id 0xFF to indicate a broadcast read. */ -static int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +#define DF_BROADCAST 0xFF +static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) { struct pci_dev *F4; u32 ficaa; @@ -1014,7 +1017,7 @@ static int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 if (!F4) goto out; - ficaa = 1; + ficaa = (instance_id == DF_BROADCAST) ? 0 : 1; ficaa |= reg & 0x3FC; ficaa |= (func & 0x7) << 11; ficaa |= instance_id << 16; @@ -1038,6 +1041,16 @@ out: return err; } +static int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + return __df_indirect_read(node, func, reg, instance_id, lo); +} + +static int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo) +{ + return __df_indirect_read(node, func, reg, DF_BROADCAST, lo); +} + static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { u64 dram_base_addr, dram_limit_addr, dram_hole_base; @@ -1055,7 +1068,7 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr bool hash_enabled = false; /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ - if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) + if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &tmp)) goto out_err; /* Remove HiAddrOffset from normalized address, if enabled: */ @@ -1069,7 +1082,7 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr } /* Read D18F0x110 (DramBaseAddress). */ - if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) + if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &tmp)) goto out_err; /* Check if address range is valid. */ @@ -1092,7 +1105,7 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr } /* Read D18F0x114 (DramLimitAddress). */ - if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) + if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &tmp)) goto out_err; intlv_num_sockets = (tmp >> 8) & 0x1; @@ -1148,7 +1161,7 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr * umc/channel# as instance id of the coherent slave * for FICAA. */ - if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) + if (df_indirect_read_instance(nid, 0, 0x50, umc, &tmp)) goto out_err; cs_fabric_id = (tmp >> 8) & 0xFF; @@ -1165,7 +1178,7 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr /* Read D18F1x208 (SystemFabricIdMask). */ if (intlv_num_dies || intlv_num_sockets) - if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) + if (df_indirect_read_broadcast(nid, 1, 0x208, &tmp)) goto out_err; /* If interleaved over more than 1 die. */ @@ -1204,7 +1217,7 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr /* If legacy MMIO hole enabled */ if (lgcy_mmio_hole_en) { - if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) + if (df_indirect_read_broadcast(nid, 0, 0x104, &tmp)) goto out_err; dram_hole_base = tmp & GENMASK(31, 24); From 70aeb807cf8649dedbcd59b70dfc38fb89bdf1bd Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 28 Oct 2021 17:56:59 +0000 Subject: [PATCH 04/24] EDAC/amd64: Add context struct Define an address translation context struct. This will hold values that will be passed between multiple functions. Save return address, Node ID, and the Instance ID number to start. Currently, the UMC number is used as the Instance ID, but future DF versions may use another value. Also include a "tmp" field to use when reading registers. This is to avoid having to define a temporary variable in multiple functions. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211028175728.121452-5-yazen.ghannam@amd.com --- drivers/edac/amd64_edac.c | 97 ++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index d41b9a02cc7d..ca0c67bc25c6 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1051,13 +1051,16 @@ static int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo) return __df_indirect_read(node, func, reg, DF_BROADCAST, lo); } +struct addr_ctx { + u64 ret_addr; + u32 tmp; + u16 nid; + u8 inst_id; +}; + static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { u64 dram_base_addr, dram_limit_addr, dram_hole_base; - /* We start from the normalized address */ - u64 ret_addr = norm_addr; - - u32 tmp; u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; @@ -1067,35 +1070,45 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr u8 cs_mask, cs_id = 0; bool hash_enabled = false; + struct addr_ctx ctx; + + memset(&ctx, 0, sizeof(ctx)); + + /* Start from the normalized address */ + ctx.ret_addr = norm_addr; + + ctx.nid = nid; + ctx.inst_id = umc; + /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ - if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &tmp)) + if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &ctx.tmp)) goto out_err; /* Remove HiAddrOffset from normalized address, if enabled: */ - if (tmp & BIT(0)) { - u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; + if (ctx.tmp & BIT(0)) { + u64 hi_addr_offset = (ctx.tmp & GENMASK_ULL(31, 20)) << 8; if (norm_addr >= hi_addr_offset) { - ret_addr -= hi_addr_offset; + ctx.ret_addr -= hi_addr_offset; base = 1; } } /* Read D18F0x110 (DramBaseAddress). */ - if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &tmp)) + if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &ctx.tmp)) goto out_err; /* Check if address range is valid. */ - if (!(tmp & BIT(0))) { + if (!(ctx.tmp & BIT(0))) { pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", - __func__, tmp); + __func__, ctx.tmp); goto out_err; } - lgcy_mmio_hole_en = tmp & BIT(1); - intlv_num_chan = (tmp >> 4) & 0xF; - intlv_addr_sel = (tmp >> 8) & 0x7; - dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; + lgcy_mmio_hole_en = ctx.tmp & BIT(1); + intlv_num_chan = (ctx.tmp >> 4) & 0xF; + intlv_addr_sel = (ctx.tmp >> 8) & 0x7; + dram_base_addr = (ctx.tmp & GENMASK_ULL(31, 12)) << 16; /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ if (intlv_addr_sel > 3) { @@ -1105,12 +1118,12 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr } /* Read D18F0x114 (DramLimitAddress). */ - if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &tmp)) + if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &ctx.tmp)) goto out_err; - intlv_num_sockets = (tmp >> 8) & 0x1; - intlv_num_dies = (tmp >> 10) & 0x3; - dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); + intlv_num_sockets = (ctx.tmp >> 8) & 0x1; + intlv_num_dies = (ctx.tmp >> 10) & 0x3; + dram_limit_addr = ((ctx.tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); intlv_addr_bit = intlv_addr_sel + 8; @@ -1161,10 +1174,10 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr * umc/channel# as instance id of the coherent slave * for FICAA. */ - if (df_indirect_read_instance(nid, 0, 0x50, umc, &tmp)) + if (df_indirect_read_instance(nid, 0, 0x50, umc, &ctx.tmp)) goto out_err; - cs_fabric_id = (tmp >> 8) & 0xFF; + cs_fabric_id = (ctx.tmp >> 8) & 0xFF; die_id_bit = 0; /* If interleaved over more than 1 channel: */ @@ -1178,22 +1191,22 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr /* Read D18F1x208 (SystemFabricIdMask). */ if (intlv_num_dies || intlv_num_sockets) - if (df_indirect_read_broadcast(nid, 1, 0x208, &tmp)) + if (df_indirect_read_broadcast(nid, 1, 0x208, &ctx.tmp)) goto out_err; /* If interleaved over more than 1 die. */ if (intlv_num_dies) { sock_id_bit = die_id_bit + intlv_num_dies; - die_id_shift = (tmp >> 24) & 0xF; - die_id_mask = (tmp >> 8) & 0xFF; + die_id_shift = (ctx.tmp >> 24) & 0xF; + die_id_mask = (ctx.tmp >> 8) & 0xFF; cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; } /* If interleaved over more than 1 socket. */ if (intlv_num_sockets) { - socket_id_shift = (tmp >> 28) & 0xF; - socket_id_mask = (tmp >> 16) & 0xFF; + socket_id_shift = (ctx.tmp >> 28) & 0xF; + socket_id_mask = (ctx.tmp >> 16) & 0xFF; cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; } @@ -1206,44 +1219,44 @@ static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr * bits there are. "intlv_addr_bit" tells us how many "Y" bits * there are (where "I" starts). */ - temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); + temp_addr_y = ctx.ret_addr & GENMASK_ULL(intlv_addr_bit - 1, 0); temp_addr_i = (cs_id << intlv_addr_bit); - temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; - ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; + temp_addr_x = (ctx.ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; + ctx.ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; } /* Add dram base address */ - ret_addr += dram_base_addr; + ctx.ret_addr += dram_base_addr; /* If legacy MMIO hole enabled */ if (lgcy_mmio_hole_en) { - if (df_indirect_read_broadcast(nid, 0, 0x104, &tmp)) + if (df_indirect_read_broadcast(nid, 0, 0x104, &ctx.tmp)) goto out_err; - dram_hole_base = tmp & GENMASK(31, 24); - if (ret_addr >= dram_hole_base) - ret_addr += (BIT_ULL(32) - dram_hole_base); + dram_hole_base = ctx.tmp & GENMASK(31, 24); + if (ctx.ret_addr >= dram_hole_base) + ctx.ret_addr += (BIT_ULL(32) - dram_hole_base); } if (hash_enabled) { /* Save some parentheses and grab ls-bit at the end. */ - hashed_bit = (ret_addr >> 12) ^ - (ret_addr >> 18) ^ - (ret_addr >> 21) ^ - (ret_addr >> 30) ^ + hashed_bit = (ctx.ret_addr >> 12) ^ + (ctx.ret_addr >> 18) ^ + (ctx.ret_addr >> 21) ^ + (ctx.ret_addr >> 30) ^ cs_id; hashed_bit &= BIT(0); - if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) - ret_addr ^= BIT(intlv_addr_bit); + if (hashed_bit != ((ctx.ret_addr >> intlv_addr_bit) & BIT(0))) + ctx.ret_addr ^= BIT(intlv_addr_bit); } /* Is calculated system address is above DRAM limit address? */ - if (ret_addr > dram_limit_addr) + if (ctx.ret_addr > dram_limit_addr) goto out_err; - *sys_addr = ret_addr; + *sys_addr = ctx.ret_addr; return 0; out_err: From 2322b532ad9043336f36ba2634ab3620d5d5b4f5 Mon Sep 17 00:00:00 2001 From: Zhaolong Zhang Date: Tue, 9 Nov 2021 19:23:45 +0800 Subject: [PATCH 05/24] x86/mce: Get rid of cpu_missing Get rid of cpu_missing because 7bb39313cd62 ("x86/mce: Make mce_timed_out() identify holdout CPUs") provides a more detailed message about which CPUs are missing. Suggested-by: Borislav Petkov Signed-off-by: Zhaolong Zhang Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211109112345.2673403-1-zhangzl2013@126.com --- arch/x86/kernel/cpu/mce/core.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6ed365337a3b..30de00fe0d7a 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -99,7 +99,6 @@ struct mca_config mca_cfg __read_mostly = { static DEFINE_PER_CPU(struct mce, mces_seen); static unsigned long mce_need_notify; -static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -314,8 +313,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) if (!apei_err) apei_err = apei_write_mce(final); } - if (cpu_missing) - pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { @@ -891,7 +888,6 @@ static int mce_timed_out(u64 *t, const char *msg) cpumask_pr_args(&mce_missing_cpus)); mce_panic(msg, NULL, NULL); } - cpu_missing = 1; return 1; } *t -= SPINUNIT; @@ -2702,7 +2698,6 @@ struct dentry *mce_get_debugfs_dir(void) static void mce_reset(void) { - cpu_missing = 0; atomic_set(&mce_fake_panicked, 0); atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); From e48d008bd13eaa9068ff59c65275d17516179f7b Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 4 Nov 2021 16:58:41 -0500 Subject: [PATCH 06/24] x86/mce/inject: Check if a bank is populated before injecting The MCA_IPID register uniquely identifies a bank's type on Scalable MCA (SMCA) systems. When an MCA bank is not populated, the MCA_IPID register will read as zero and writes to it will be ignored. On a hw-type error injection (injection which writes the actual MCA registers in an attempt to cause a real MCE) check the value of this register before trying to inject the error. Do not impose any limitations on a sw injection and allow the user to test out all the decoding paths without relying on the available hardware, as its purpose is to just test the code. [ bp: Heavily massage. ] Link: https://lkml.kernel.org/r/20211019233641.140275-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Smita Koralahalli Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211104215846.254012-2-Smita.KoralahalliChannabasappa@amd.com --- arch/x86/kernel/cpu/mce/inject.c | 42 +++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 0bfc14041bbb..725e8e4331a9 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -74,7 +74,6 @@ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); MCE_INJECT_SET(synd); -MCE_INJECT_SET(ipid); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -95,6 +94,20 @@ DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +/* Use the user provided IPID value on a sw injection. */ +static int inj_ipid_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + if (inj_type == SW_INJ) + m->ipid = val; + } + + return 0; +} + DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); static void setup_inj_struct(struct mce *m) @@ -577,6 +590,33 @@ static int inj_bank_set(void *data, u64 val) } m->bank = val; + + /* + * sw-only injection allows to write arbitrary values into the MCA + * registers because it tests only the decoding paths. + */ + if (inj_type == SW_INJ) + goto inject; + + /* + * Read IPID value to determine if a bank is populated on the target + * CPU. + */ + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + u64 ipid; + + if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + pr_err("Error reading IPID on CPU%d\n", m->extcpu); + return -EINVAL; + } + + if (!ipid) { + pr_err("Cannot inject into unpopulated bank %llu\n", val); + return -ENODEV; + } + } + +inject: do_inject(); /* Reset injection struct */ From 1e56279a49168f24b2b58e843ae92976f90a98a6 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 4 Nov 2021 16:58:42 -0500 Subject: [PATCH 07/24] x86/mce/inject: Set the valid bit in MCA_STATUS before error injection MCA handlers check the valid bit in each status register (MCA_STATUS[Val]) and continue processing the error only if the valid bit is set. Set the valid bit unconditionally in the corresponding MCA_STATUS register and correct any Val=0 injections made by the user as such errors will get ignored and such injections will be largely pointless. Signed-off-by: Smita Koralahalli Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211104215846.254012-3-Smita.KoralahalliChannabasappa@amd.com --- arch/x86/kernel/cpu/mce/inject.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 725e8e4331a9..6eac840c64bb 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -503,6 +503,8 @@ static void do_inject(void) i_mce.tsc = rdtsc_ordered(); + i_mce.status |= MCI_STATUS_VAL; + if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; From cd5e0d1fc93a2218fbfb2b614f06e04e218ca948 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 1 Nov 2021 15:34:48 +0100 Subject: [PATCH 08/24] x86/mce: Do not use memset to clear the banks bitmaps The bitmap is a single unsigned long so no need for the function call. No functional changes. Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-2-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 30de00fe0d7a..7c264ee18c95 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1336,8 +1336,8 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs) noinstr void do_machine_check(struct pt_regs *regs) { int worst = 0, order, no_way_out, kill_current_task, lmce; - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - DECLARE_BITMAP(toclear, MAX_NR_BANKS); + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; + DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; struct mca_config *cfg = &mca_cfg; struct mce m, *final; char *msg = NULL; @@ -1381,7 +1381,6 @@ noinstr void do_machine_check(struct pt_regs *regs) final = this_cpu_ptr(&mces_seen); *final = m; - memset(valid_banks, 0, sizeof(valid_banks)); no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); From ad669ec16afeb0edb7d5bc4f92fdd059565281d2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Oct 2021 20:06:57 +0200 Subject: [PATCH 09/24] x86/mce: Remove function-local cpus variables Use num_online_cpus() directly. No functional changes. Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-3-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7c264ee18c95..f15efa242f44 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -985,7 +985,6 @@ static atomic_t global_nwo; static int mce_start(int *no_way_out) { int order; - int cpus = num_online_cpus(); u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) @@ -1002,7 +1001,7 @@ static int mce_start(int *no_way_out) /* * Wait for everyone. */ - while (atomic_read(&mce_callin) != cpus) { + while (atomic_read(&mce_callin) != num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Not all CPUs entered broadcast exception handler")) { atomic_set(&global_nwo, 0); @@ -1066,14 +1065,11 @@ static int mce_end(int order) atomic_inc(&mce_executing); if (order == 1) { - /* CHECKME: Can this race with a parallel hotplug? */ - int cpus = num_online_cpus(); - /* * Monarch: Wait for everyone to go through their scanning * loops. */ - while (atomic_read(&mce_executing) <= cpus) { + while (atomic_read(&mce_executing) <= num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Monarch CPU unable to finish machine check processing")) goto reset; From 88f66a42353717e7fac31caf04f0acd2d7fbbf54 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 6 Oct 2021 00:55:38 +0200 Subject: [PATCH 10/24] x86/mce: Use mce_rdmsrl() in severity checking code MCA has its own special MSR accessors. Use them. No functional changes. Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-4-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 2 +- arch/x86/kernel/cpu/mce/internal.h | 2 ++ arch/x86/kernel/cpu/mce/severity.c | 8 +++----- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f15efa242f44..87a277fc80b4 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -362,7 +362,7 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) } /* MSR access wrappers used for error injection */ -static noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrl(u32 msr) { DECLARE_ARGS(val, low, high); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index acd61c41846c..52c633950b38 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -207,4 +207,6 @@ static inline void pentium_machine_check(struct pt_regs *regs) {} static inline void winchip_machine_check(struct pt_regs *regs) {} #endif +noinstr u64 mce_rdmsrl(u32 msr); + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index bb019a594a2c..00e97ebbd94a 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -288,8 +288,7 @@ static int error_context(struct mce *m, struct pt_regs *regs) static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) { - u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - u32 low, high; + u64 mcx_cfg; /* * We need to look at the following bits: @@ -300,11 +299,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) if (!mce_flags.succor) return MCE_PANIC_SEVERITY; - if (rdmsr_safe(addr, &low, &high)) - return MCE_PANIC_SEVERITY; + mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank)); /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ - if ((low & MCI_CONFIG_MCAX) && + if ((mcx_cfg & MCI_CONFIG_MCAX) && (m->status & MCI_STATUS_TCC) && (err_ctx == IN_KERNEL)) return MCE_PANIC_SEVERITY; From 487d654db3edacc31dee86b10258cc740640fad8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Oct 2021 19:54:47 +0200 Subject: [PATCH 11/24] x86/mce: Remove noinstr annotation from mce_setup() Instead, sandwitch around the call which is done in noinstr context and mark the caller - mce_gather_info() - as noinstr. Also, document what the whole instrumentation strategy with #MC is going to be in the future and where it all is supposed to be going to. Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-5-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 87a277fc80b4..c11fc8d6a4c5 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -127,7 +127,7 @@ static struct irq_work mce_irq_work; BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ -noinstr void mce_setup(struct mce *m) +void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); @@ -430,9 +430,15 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) { + /* + * Enable instrumentation around mce_setup() which calls external + * facilities. + */ + instrumentation_begin(); mce_setup(m); + instrumentation_end(); m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { @@ -1312,11 +1318,11 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs) } /* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. + * The actual machine check handler. This only handles real exceptions when + * something got corrupted coming in through int 18. * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even + * This is executed in #MC context not subject to normal locking rules. + * This implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! * * On Intel systems this is entered on all CPUs in parallel through @@ -1328,6 +1334,14 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs) * issues: if the machine check was due to a failure of the memory * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. + * + * Currently, the #MC handler calls out to a number of external facilities + * and, therefore, allows instrumentation around them. The optimal thing to + * have would be to do the absolutely minimal work required in #MC context + * and have instrumentation disabled only around that. Further processing can + * then happen in process context where instrumentation is allowed. Achieving + * that requires careful auditing and modifications. Until then, the code + * allows instrumentation temporarily, where required. * */ noinstr void do_machine_check(struct pt_regs *regs) { From 4fbce464db81a42f9a57ee242d6150ec7f996415 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 13 Oct 2021 09:07:19 +0200 Subject: [PATCH 12/24] x86/mce: Allow instrumentation during task work queueing Fixes vmlinux.o: warning: objtool: do_machine_check()+0xdb1: call to queue_task_work() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-6-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index c11fc8d6a4c5..d788ccc5a35f 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1459,6 +1459,14 @@ noinstr void do_machine_check(struct pt_regs *regs) if (worst != MCE_AR_SEVERITY && !kill_current_task) goto out; + /* + * Enable instrumentation around the external facilities like + * task_work_add() (via queue_task_work()), fixup_exception() etc. + * For now, that is. Fixing this properly would need a lot more involved + * reorganization. + */ + instrumentation_begin(); + /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ @@ -1487,6 +1495,9 @@ noinstr void do_machine_check(struct pt_regs *regs) if (m.kflags & MCE_IN_KERNEL_COPYIN) queue_task_work(&m, msg, kill_me_never); } + + instrumentation_end(); + out: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } From 0a5b288e85bbef5227bb6397e31fcf1d7ba9142a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 13 Oct 2021 09:47:50 +0200 Subject: [PATCH 13/24] x86/mce: Prevent severity computation from being instrumented Mark all the MCE severity computation logic noinstr and allow instrumentation when it "calls out". Fixes vmlinux.o: warning: objtool: do_machine_check()+0xc5d: call to mce_severity() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-7-bp@alien8.de --- arch/x86/kernel/cpu/mce/severity.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 00e97ebbd94a..a32646769705 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -263,24 +263,36 @@ static bool is_copy_from_user(struct pt_regs *regs) * distinguish an exception taken in user from from one * taken in the kernel. */ -static int error_context(struct mce *m, struct pt_regs *regs) +static noinstr int error_context(struct mce *m, struct pt_regs *regs) { + int fixup_type; + bool copy_user; + if ((m->cs & 3) == 3) return IN_USER; + if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; - switch (ex_get_fixup_type(m->ip)) { + /* Allow instrumentation around external facilities usage. */ + instrumentation_begin(); + fixup_type = ex_get_fixup_type(m->ip); + copy_user = is_copy_from_user(regs); + instrumentation_end(); + + switch (fixup_type) { case EX_TYPE_UACCESS: case EX_TYPE_COPY: - if (!regs || !is_copy_from_user(regs)) + if (!regs || !copy_user) return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; fallthrough; + case EX_TYPE_FAULT_MCE_SAFE: case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + default: return IN_KERNEL; } @@ -315,8 +327,8 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) * See AMD Error Scope Hierarchy table in a newer BKDG. For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" */ -static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, - char **msg, bool is_excp) +static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, + char **msg, bool is_excp) { enum context ctx = error_context(m, regs); @@ -368,8 +380,8 @@ static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, return MCE_KEEP_SEVERITY; } -static int mce_severity_intel(struct mce *m, struct pt_regs *regs, - int tolerant, char **msg, bool is_excp) +static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m, regs); @@ -405,8 +417,8 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs, } } -int mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, - bool is_excp) +int noinstr mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, + bool is_excp) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) From 3c7ce80a818fa7950be123cac80cd078e5ac1013 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 1 Nov 2021 13:39:35 +0100 Subject: [PATCH 14/24] x86/mce: Mark mce_panic() noinstr And allow instrumentation inside it because it does calls to other facilities which will not be tagged noinstr. Fixes vmlinux.o: warning: objtool: do_machine_check()+0xc73: call to mce_panic() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-8-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index d788ccc5a35f..ec0f7bb53477 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -266,11 +266,17 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) { - int apei_err = 0; struct llist_node *pending; struct mce_evt_llist *l; + int apei_err = 0; + + /* + * Allow instrumentation around external facilities usage. Not that it + * matters a whole lot since the machine is going to panic anyway. + */ + instrumentation_begin(); if (!fake_panic) { /* @@ -285,7 +291,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) } else { /* Don't log too much for fake panic */ if (atomic_inc_return(&mce_fake_panicked) > 1) - return; + goto out; } pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ @@ -321,6 +327,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); + +out: + instrumentation_end(); } /* Support code for software error injection */ From b4813539d37fa31fed62cdfab7bd2dd8929c5b2e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 1 Nov 2021 16:43:33 +0100 Subject: [PATCH 15/24] x86/mce: Mark mce_end() noinstr It is called by the #MC handler which is noinstr. Fixes vmlinux.o: warning: objtool: do_machine_check()+0xbd6: call to memset() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-9-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ec0f7bb53477..4bdcca8493ca 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1064,10 +1064,13 @@ static int mce_start(int *no_way_out) * Synchronize between CPUs after main scanning loop. * This invokes the bulk of the Monarch processing. */ -static int mce_end(int order) +static noinstr int mce_end(int order) { - int ret = -1; u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int ret = -1; + + /* Allow instrumentation around external facilities. */ + instrumentation_begin(); if (!timeout) goto reset; @@ -1108,7 +1111,8 @@ static int mce_end(int order) /* * Don't reset anything. That's done by the Monarch. */ - return 0; + ret = 0; + goto out; } /* @@ -1124,6 +1128,10 @@ reset: * Let others run again. */ atomic_set(&mce_executing, 0); + +out: + instrumentation_end(); + return ret; } From db6c996d6ce45dfb44891f0824a65ecec216f47a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Nov 2021 11:14:48 +0100 Subject: [PATCH 16/24] x86/mce: Mark mce_read_aux() noinstr Fixes vmlinux.o: warning: objtool: do_machine_check()+0x681: call to mce_read_aux() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-10-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4bdcca8493ca..53b4cfc72c8e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -648,7 +648,7 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); From 75581a203e63210aabb1336c8c9cb65a7858b596 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Nov 2021 08:14:44 +0100 Subject: [PATCH 17/24] x86/mce: Move the tainting outside of the noinstr region add_taint() is yet another external facility which the #MC handler calls. Move that tainting call into the instrumentation-allowed part of the handler. Fixes vmlinux.o: warning: objtool: do_machine_check()+0x617: call to add_taint() leaves .noinstr.text section While at it, allow instrumentation around the mce_log() call. Fixes vmlinux.o: warning: objtool: do_machine_check()+0x690: call to mce_log() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-11-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 41 +++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 53b4cfc72c8e..044c94b090bf 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1180,13 +1180,14 @@ static noinstr bool mce_check_crashing_cpu(void) return false; } -static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, - int no_way_out, int *worst) +static __always_inline int +__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, int no_way_out, + int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; - int severity, i; + int severity, i, taint = 0; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { __clear_bit(i, toclear); @@ -1213,7 +1214,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin continue; /* Set taint even when machine check was not enabled. */ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + taint++; severity = mce_severity(m, regs, cfg->tolerant, NULL, true); @@ -1236,7 +1237,13 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin /* assuming valid severity level != 0 */ m->severity = severity; + /* + * Enable instrumentation around the mce_log() call which is + * done in #MC context, where instrumentation is disabled. + */ + instrumentation_begin(); mce_log(m); + instrumentation_end(); if (severity > *worst) { *final = *m; @@ -1246,6 +1253,8 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin /* mce_clear_state will clear *final, save locally for use later */ *m = *final; + + return taint; } static void kill_me_now(struct callback_head *ch) @@ -1362,7 +1371,7 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs) */ noinstr void do_machine_check(struct pt_regs *regs) { - int worst = 0, order, no_way_out, kill_current_task, lmce; + int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; struct mca_config *cfg = &mca_cfg; @@ -1441,7 +1450,7 @@ noinstr void do_machine_check(struct pt_regs *regs) order = mce_start(&no_way_out); } - __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1473,17 +1482,19 @@ noinstr void do_machine_check(struct pt_regs *regs) } } - if (worst != MCE_AR_SEVERITY && !kill_current_task) - goto out; - /* - * Enable instrumentation around the external facilities like - * task_work_add() (via queue_task_work()), fixup_exception() etc. - * For now, that is. Fixing this properly would need a lot more involved - * reorganization. + * Enable instrumentation around the external facilities like task_work_add() + * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this + * properly would need a lot more involved reorganization. */ instrumentation_begin(); + if (taint) + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + if (worst != MCE_AR_SEVERITY && !kill_current_task) + goto out; + /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ @@ -1513,9 +1524,9 @@ noinstr void do_machine_check(struct pt_regs *regs) queue_task_work(&m, msg, kill_me_never); } +out: instrumentation_end(); -out: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); From edb3d07e2403abd13fc664e8b6f23ea7efb52747 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Nov 2021 22:25:12 +0100 Subject: [PATCH 18/24] x86/mce: Mark mce_timed_out() noinstr Fixes vmlinux.o: warning: objtool: do_machine_check()+0x482: call to mce_timed_out() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-12-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 044c94b090bf..7023d6550688 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -883,8 +883,13 @@ static cpumask_t mce_missing_cpus = CPU_MASK_ALL; /* * Check if a timeout waiting for other CPUs happened. */ -static int mce_timed_out(u64 *t, const char *msg) +static noinstr int mce_timed_out(u64 *t, const char *msg) { + int ret = 0; + + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); + /* * The others already did panic for some reason. * Bail out like in a timeout. @@ -903,12 +908,17 @@ static int mce_timed_out(u64 *t, const char *msg) cpumask_pr_args(&mce_missing_cpus)); mce_panic(msg, NULL, NULL); } - return 1; + ret = 1; + goto out; } *t -= SPINUNIT; + out: touch_nmi_watchdog(); - return 0; + + instrumentation_end(); + + return ret; } /* From e3d72e8eee53c55835ab4664920d83400ff5d6c2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Nov 2021 10:30:29 +0100 Subject: [PATCH 19/24] x86/mce: Mark mce_start() noinstr Fixes vmlinux.o: warning: objtool: do_machine_check()+0x4ae: call to __const_udelay() leaves .noinstr.text section Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208111343.8130-13-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7023d6550688..5818b837fd4d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1007,13 +1007,13 @@ static atomic_t global_nwo; * in the entry order. * TBD double check parallel CPU hotunplug */ -static int mce_start(int *no_way_out) +static noinstr int mce_start(int *no_way_out) { - int order; u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int order, ret = -1; if (!timeout) - return -1; + return ret; atomic_add(*no_way_out, &global_nwo); /* @@ -1023,6 +1023,9 @@ static int mce_start(int *no_way_out) order = atomic_inc_return(&mce_callin); cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); + /* * Wait for everyone. */ @@ -1030,7 +1033,7 @@ static int mce_start(int *no_way_out) if (mce_timed_out(&timeout, "Timeout: Not all CPUs entered broadcast exception handler")) { atomic_set(&global_nwo, 0); - return -1; + goto out; } ndelay(SPINUNIT); } @@ -1056,7 +1059,7 @@ static int mce_start(int *no_way_out) if (mce_timed_out(&timeout, "Timeout: Subject CPUs unable to finish machine check processing")) { atomic_set(&global_nwo, 0); - return -1; + goto out; } ndelay(SPINUNIT); } @@ -1067,7 +1070,12 @@ static int mce_start(int *no_way_out) */ *no_way_out = atomic_read(&global_nwo); - return order; + ret = order; + +out: + instrumentation_end(); + + return ret; } /* From 1acd85feba81084fcef00b73fc1601e42b77c5d8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 17 Dec 2021 16:49:25 +0100 Subject: [PATCH 20/24] x86/mce: Check regs before accessing it Commit in Fixes accesses pt_regs before checking whether it is NULL or not. Make sure the NULL pointer check happens first. Fixes: 0a5b288e85bb ("x86/mce: Prevent severity computation from being instrumented") Reported-by: Dan Carpenter Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20211217102029.GA29708@kili --- arch/x86/kernel/cpu/mce/severity.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index a32646769705..7aa2bda93cbb 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -222,6 +222,9 @@ static bool is_copy_from_user(struct pt_regs *regs) struct insn insn; int ret; + if (!regs) + return false; + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return false; @@ -283,7 +286,7 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) switch (fixup_type) { case EX_TYPE_UACCESS: case EX_TYPE_COPY: - if (!regs || !copy_user) + if (!copy_user) return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; fallthrough; From 5176a93ab27aef1b9f4496fc68e6c303a011d7cc Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 16 Dec 2021 16:29:04 +0000 Subject: [PATCH 21/24] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Add HWID and McaType values for new SMCA bank types, and add their error descriptions to edac_mce_amd. The "PHY" bank types all have the same error descriptions, and the NBIF and SHUB bank types have the same error descriptions. So reuse the same arrays where appropriate. [ bp: Remove useless comments over hwid types. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211216162905.4132657-2-yazen.ghannam@amd.com --- arch/x86/include/asm/mce.h | 7 ++ arch/x86/kernel/cpu/mce/amd.c | 21 ++++-- drivers/edac/mce_amd.c | 135 ++++++++++++++++++++++++++++++++-- 3 files changed, 151 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index d58f4f2e006f..52d2b35cac2d 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -313,12 +313,19 @@ enum smca_bank_types { SMCA_SMU, /* System Management Unit */ SMCA_SMU_V2, SMCA_MP5, /* Microprocessor 5 Unit */ + SMCA_MPDMA, /* MPDMA Unit */ SMCA_NBIO, /* Northbridge IO Unit */ SMCA_PCIE, /* PCI Express Unit */ SMCA_PCIE_V2, SMCA_XGMI_PCS, /* xGMI PCS Unit */ + SMCA_NBIF, /* NBIF Unit */ + SMCA_SHUB, /* System HUB Unit */ + SMCA_SATA, /* SATA Unit */ + SMCA_USB, /* USB Unit */ + SMCA_GMI_PCS, /* GMI PCS Unit */ SMCA_XGMI_PHY, /* xGMI PHY Unit */ SMCA_WAFL_PHY, /* WAFL PHY Unit */ + SMCA_GMI_PHY, /* GMI PHY Unit */ N_SMCA_BANK_TYPES }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 2f351838d5f7..9407cdc1e081 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -95,11 +95,18 @@ static struct smca_bank_name smca_names[] = { [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" }, [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, + [SMCA_NBIF] = { "nbif", "NBIF Unit" }, + [SMCA_SHUB] = { "shub", "System Hub Unit" }, + [SMCA_SATA] = { "sata", "SATA Unit" }, + [SMCA_USB] = { "usb", "USB Unit" }, + [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" }, [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, + [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" }, }; static const char *smca_get_name(enum smca_bank_types t) @@ -174,6 +181,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Microprocessor 5 Unit MCA type */ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, + /* MPDMA MCA type */ + { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) }, + /* Northbridge IO Unit MCA type */ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, @@ -181,14 +191,15 @@ static struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) }, - /* xGMI PCS MCA type */ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) }, - - /* xGMI PHY MCA type */ + { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) }, + { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, + { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, + { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, - - /* WAFL PHY MCA type */ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, + { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 67dbf4c31271..cfd3f7ae9251 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -399,6 +399,63 @@ static const char * const smca_mp5_mce_desc[] = { "Instruction Tag Cache Bank B ECC or parity error", }; +static const char * const smca_mpdma_mce_desc[] = { + "Main SRAM [31:0] bank ECC or parity error", + "Main SRAM [63:32] bank ECC or parity error", + "Main SRAM [95:64] bank ECC or parity error", + "Main SRAM [127:96] bank ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", + "MPDMA TVF DVSEC Memory ECC or parity error", + "MPDMA TVF MMIO Mailbox0 ECC or parity error", + "MPDMA TVF MMIO Mailbox1 ECC or parity error", + "MPDMA TVF Doorbell Memory ECC or parity error", + "MPDMA TVF SDP Slave Memory 0 ECC or parity error", + "MPDMA TVF SDP Slave Memory 1 ECC or parity error", + "MPDMA TVF SDP Slave Memory 2 ECC or parity error", + "MPDMA TVF SDP Master Memory 0 ECC or parity error", + "MPDMA TVF SDP Master Memory 1 ECC or parity error", + "MPDMA TVF SDP Master Memory 2 ECC or parity error", + "MPDMA TVF SDP Master Memory 3 ECC or parity error", + "MPDMA TVF SDP Master Memory 4 ECC or parity error", + "MPDMA TVF SDP Master Memory 5 ECC or parity error", + "MPDMA TVF SDP Master Memory 6 ECC or parity error", + "MPDMA PTE Command FIFO ECC or parity error", + "MPDMA PTE Hub Data FIFO ECC or parity error", + "MPDMA PTE Internal Data FIFO ECC or parity error", + "MPDMA PTE Command Memory DMA ECC or parity error", + "MPDMA PTE Command Memory Internal ECC or parity error", + "MPDMA PTE DMA Completion FIFO ECC or parity error", + "MPDMA PTE Tablewalk Completion FIFO ECC or parity error", + "MPDMA PTE Descriptor Completion FIFO ECC or parity error", + "MPDMA PTE ReadOnly Completion FIFO ECC or parity error", + "MPDMA PTE DirectWrite Completion FIFO ECC or parity error", + "SDP Watchdog Timer expired", +}; + static const char * const smca_nbio_mce_desc[] = { "ECC or Parity error", "PCIE error", @@ -448,7 +505,7 @@ static const char * const smca_xgmipcs_mce_desc[] = { "Rx Replay Timeout Error", "LinkSub Tx Timeout Error", "LinkSub Rx Timeout Error", - "Rx CMD Pocket Error", + "Rx CMD Packet Error", }; static const char * const smca_xgmiphy_mce_desc[] = { @@ -458,11 +515,66 @@ static const char * const smca_xgmiphy_mce_desc[] = { "PHY APB error", }; -static const char * const smca_waflphy_mce_desc[] = { - "RAM ECC Error", - "ARC instruction buffer parity error", - "ARC data buffer parity error", - "PHY APB error", +static const char * const smca_nbif_mce_desc[] = { + "Timeout error from GMI", + "SRAM ECC error", + "NTB Error Event", + "SDP Parity error", +}; + +static const char * const smca_sata_mce_desc[] = { + "Parity error for port 0", + "Parity error for port 1", + "Parity error for port 2", + "Parity error for port 3", + "Parity error for port 4", + "Parity error for port 5", + "Parity error for port 6", + "Parity error for port 7", +}; + +static const char * const smca_usb_mce_desc[] = { + "Parity error or ECC error for S0 RAM0", + "Parity error or ECC error for S0 RAM1", + "Parity error or ECC error for S0 RAM2", + "Parity error for PHY RAM0", + "Parity error for PHY RAM1", + "AXI Slave Response error", +}; + +static const char * const smca_gmipcs_mce_desc[] = { + "Data Loss Error", + "Training Error", + "Replay Parity Error", + "Rx Fifo Underflow Error", + "Rx Fifo Overflow Error", + "CRC Error", + "BER Exceeded Error", + "Tx Fifo Underflow Error", + "Replay Buffer Parity Error", + "Tx Overflow Error", + "Replay Fifo Overflow Error", + "Replay Fifo Underflow Error", + "Elastic Fifo Overflow Error", + "Deskew Error", + "Offline Error", + "Data Startup Limit Error", + "FC Init Timeout Error", + "Recovery Timeout Error", + "Ready Serial Timeout Error", + "Ready Serial Attempt Error", + "Recovery Attempt Error", + "Recovery Relock Attempt Error", + "Deskew Abort Error", + "Rx Buffer Error", + "Rx LFDS Fifo Overflow Error", + "Rx LFDS Fifo Underflow Error", + "LinkSub Tx Timeout Error", + "LinkSub Rx Timeout Error", + "Rx CMD Packet Error", + "LFDS Training Timeout Error", + "LFDS FC Init Timeout Error", + "Data Loss Error", }; struct smca_mce_desc { @@ -490,12 +602,21 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, + [SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) }, [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) }, [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) }, + /* NBIF and SHUB have the same error descriptions, for now. */ + [SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) }, + [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) }, + [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) }, + [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) }, + [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) }, + /* All the PHY bank types have the same error descriptions, for now. */ [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, - [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) }, + [SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, + [SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, }; static bool f12h_mc0_mce(u16 ec, u8 xec) From 91f75eb481cfaee5c4ed8fb5214bf2fbfa04bd7b Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 16 Dec 2021 16:29:05 +0000 Subject: [PATCH 22/24] x86/MCE/AMD, EDAC/mce_amd: Support non-uniform MCA bank type enumeration AMD systems currently lay out MCA bank types such that the type of bank number "i" is either the same across all CPUs or is Reserved/Read-as-Zero. For example: Bank # | CPUx | CPUy 0 LS LS 1 RAZ UMC 2 CS CS 3 SMU RAZ Future AMD systems will lay out MCA bank types such that the type of bank number "i" may be different across CPUs. For example: Bank # | CPUx | CPUy 0 LS LS 1 RAZ UMC 2 CS NBIO 3 SMU RAZ Change the structures that cache MCA bank types to be per-CPU and update smca_get_bank_type() to handle this change. Move some SMCA-specific structures to amd.c from mce.h, since they no longer need to be global. Break out the "count" for bank types from struct smca_hwid, since this should provide a per-CPU count rather than a system-wide count. Apply the "const" qualifier to the struct smca_hwid_mcatypes array. The values in this array should not change at runtime. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211216162905.4132657-3-yazen.ghannam@amd.com --- arch/x86/include/asm/mce.h | 18 +------- arch/x86/kernel/cpu/mce/amd.c | 59 +++++++++++++++---------- drivers/edac/mce_amd.c | 11 +---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 4 files changed, 39 insertions(+), 51 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 52d2b35cac2d..cc73061e7255 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -329,22 +329,6 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) - -struct smca_hwid { - unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ - u32 hwid_mcatype; /* (hwid,mcatype) tuple */ - u8 count; /* Number of instances. */ -}; - -struct smca_bank { - struct smca_hwid *hwid; - u32 id; /* Value of MCA_IPID[InstanceId]. */ - u8 sysfs_id; /* Value used for sysfs name. */ -}; - -extern struct smca_bank smca_banks[MAX_NR_BANKS]; - extern const char *smca_get_long_name(enum smca_bank_types t); extern bool amd_mce_is_memory_error(struct mce *m); @@ -352,7 +336,7 @@ extern int mce_threshold_create_device(unsigned int cpu); extern int mce_threshold_remove_device(unsigned int cpu); void mce_amd_feature_init(struct cpuinfo_x86 *c); -enum smca_bank_types smca_get_bank_type(unsigned int bank); +enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank); #else static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 9407cdc1e081..a1e2f41796dc 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -71,6 +71,22 @@ static const char * const smca_umc_block_names[] = { "misc_umc" }; +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) + +struct smca_hwid { + unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ + u32 hwid_mcatype; /* (hwid,mcatype) tuple */ +}; + +struct smca_bank { + const struct smca_hwid *hwid; + u32 id; /* Value of MCA_IPID[InstanceId]. */ + u8 sysfs_id; /* Value used for sysfs name. */ +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); +static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts); + struct smca_bank_name { const char *name; /* Short name for sysfs */ const char *long_name; /* Long name for pretty-printing */ @@ -126,14 +142,14 @@ const char *smca_get_long_name(enum smca_bank_types t) } EXPORT_SYMBOL_GPL(smca_get_long_name); -enum smca_bank_types smca_get_bank_type(unsigned int bank) +enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank) { struct smca_bank *b; if (bank >= MAX_NR_BANKS) return N_SMCA_BANK_TYPES; - b = &smca_banks[bank]; + b = &per_cpu(smca_banks, cpu)[bank]; if (!b->hwid) return N_SMCA_BANK_TYPES; @@ -141,7 +157,7 @@ enum smca_bank_types smca_get_bank_type(unsigned int bank) } EXPORT_SYMBOL_GPL(smca_get_bank_type); -static struct smca_hwid smca_hwid_mcatypes[] = { +static const struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype } */ /* Reserved type */ @@ -202,9 +218,6 @@ static struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) }, }; -struct smca_bank smca_banks[MAX_NR_BANKS]; -EXPORT_SYMBOL_GPL(smca_banks); - /* * In SMCA enabled processors, we can have multiple banks for a given IP type. * So to define a unique name for each bank, we use a temp c-string to append @@ -260,8 +273,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) static void smca_configure(unsigned int bank, unsigned int cpu) { + u8 *bank_counts = this_cpu_ptr(smca_bank_counts); + const struct smca_hwid *s_hwid; unsigned int i, hwid_mcatype; - struct smca_hwid *s_hwid; u32 high, low; u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); @@ -297,10 +311,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) smca_set_misc_banks_map(bank, cpu); - /* Return early if this bank was already initialized. */ - if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0) - return; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -311,10 +321,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { - smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = low; - smca_banks[bank].sysfs_id = s_hwid->count++; + this_cpu_ptr(smca_banks)[bank].hwid = s_hwid; + this_cpu_ptr(smca_banks)[bank].id = low; + this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++; break; } } @@ -600,7 +611,7 @@ out: bool amd_filter_mce(struct mce *m) { - enum smca_bank_types bank_type = smca_get_bank_type(m->bank); + enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank); struct cpuinfo_x86 *c = &boot_cpu_data; /* See Family 17h Models 10h-2Fh Erratum #1114. */ @@ -638,7 +649,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) } else if (c->x86 == 0x17 && (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { - if (smca_get_bank_type(bank) != SMCA_IF) + if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF) return; msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); @@ -706,7 +717,7 @@ bool amd_mce_is_memory_error(struct mce *m) u8 xec = (m->status >> 16) & 0x1f; if (mce_flags.smca) - return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; + return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -1022,7 +1033,7 @@ static struct kobj_type threshold_ktype = { .release = threshold_block_release, }; -static const char *get_name(unsigned int bank, struct threshold_block *b) +static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b) { enum smca_bank_types bank_type; @@ -1033,7 +1044,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - bank_type = smca_get_bank_type(bank); + bank_type = smca_get_bank_type(cpu, bank); if (bank_type >= N_SMCA_BANK_TYPES) return NULL; @@ -1043,12 +1054,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } - if (smca_banks[bank].hwid->count == 1) + if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1) return smca_get_name(bank_type); snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_get_name(bank_type), - smca_banks[bank].sysfs_id); + "%s_%u", smca_get_name(bank_type), + per_cpu(smca_banks, cpu)[bank].sysfs_id); return buf_mcatype; } @@ -1104,7 +1115,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb else tb->blocks = b; - err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b)); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) goto out_free; recurse: @@ -1159,7 +1170,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, struct device *dev = this_cpu_read(mce_device); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = get_name(bank, NULL); + const char *name = get_name(cpu, bank, NULL); int err = 0; if (!dev) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index cfd3f7ae9251..cc5c63feb26a 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1166,20 +1166,13 @@ static void decode_mc6_mce(struct mce *m) /* Decode errors according to Scalable MCA specification */ static void decode_smca_error(struct mce *m) { - struct smca_hwid *hwid; - enum smca_bank_types bank_type; + enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank); const char *ip_name; u8 xec = XEC(m->status, xec_mask); - if (m->bank >= ARRAY_SIZE(smca_banks)) + if (bank_type >= N_SMCA_BANK_TYPES) return; - hwid = smca_banks[m->bank].hwid; - if (!hwid) - return; - - bank_type = hwid->bank_type; - if (bank_type == SMCA_RESERVED) { pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank); return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 08133de21fdd..75dad0214dc7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2647,7 +2647,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb, * and error occurred in DramECC (Extended error code = 0) then only * process the error, else bail out. */ - if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) && + if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && (XEC(m->status, 0x3f) == 0x0))) return NOTIFY_DONE; From de768416b203ac84e02a757b782a32efb388476f Mon Sep 17 00:00:00 2001 From: Zhang Zixun Date: Mon, 27 Dec 2021 22:02:49 +0100 Subject: [PATCH 23/24] x86/mce/inject: Avoid out-of-bounds write when setting flags A contrived zero-length write, for example, by using write(2): ... ret = write(fd, str, 0); ... to the "flags" file causes: BUG: KASAN: stack-out-of-bounds in flags_write Write of size 1 at addr ffff888019be7ddf by task writefile/3787 CPU: 4 PID: 3787 Comm: writefile Not tainted 5.16.0-rc7+ #12 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 due to accessing buf one char before its start. Prevent such out-of-bounds access. [ bp: Productize into a proper patch. Link below is the next best thing because the original mail didn't get archived on lore. ] Fixes: 0451d14d0561 ("EDAC, mce_amd_inj: Modify flags attribute to use string arguments") Signed-off-by: Zhang Zixun Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/linux-edac/YcnePfF1OOqoQwrX@zn.tnic/ --- arch/x86/kernel/cpu/mce/inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 6eac840c64bb..5fbd7ffb3233 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -363,7 +363,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, char buf[MAX_FLAG_OPT_SIZE], *__buf; int err; - if (cnt > MAX_FLAG_OPT_SIZE) + if (!cnt || cnt > MAX_FLAG_OPT_SIZE) return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) From 3376136300a00df9a864b88fa969177d6c3be8e5 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 23 Dec 2021 12:07:01 -0800 Subject: [PATCH 24/24] x86/mce: Reduce number of machine checks taken during recovery When any of the copy functions in arch/x86/lib/copy_user_64.S take a fault, the fixup code copies the remaining byte count from %ecx to %edx and unconditionally jumps to .Lcopy_user_handle_tail to continue the copy in case any more bytes can be copied. If the fault was #PF this may copy more bytes (because the page fault handler might have fixed the fault). But when the fault is a machine check the original copy code will have copied all the way to the poisoned cache line. So .Lcopy_user_handle_tail will just take another machine check for no good reason. Every code path to .Lcopy_user_handle_tail comes from an exception fixup path, so add a check there to check the trap type (in %eax) and simply return the count of remaining bytes if the trap was a machine check. Doing this reduces the number of machine checks taken during synthetic tests from four to three. As well as reducing the number of machine checks, this also allows Skylake generation Xeons to recover some cases that currently fail. The is because REP; MOVSB is only recoverable when source and destination are well aligned and the byte count is large. That useless call to .Lcopy_user_handle_tail may violate one or more of these conditions and generate a fatal machine check. [ Tony: Add more details to commit message. ] [ bp: Fixup comment. Also, another tip patchset which is adding straight-line speculation mitigation changes the "ret" instruction to an all-caps macro "RET". But, since gas is case-insensitive, use "RET" in the newly added asm block already in order to simplify tip branch merging on its way upstream. ] Signed-off-by: Youquan Song Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/YcTW5dh8yTGucDd+@agluck-desk2.amr.corp.intel.com --- arch/x86/lib/copy_user_64.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 2797e630b9b1..e73b76e5bc09 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -225,6 +225,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Don't try to copy the tail if machine check happened * * Input: + * eax trap number written by ex_handler_copy() * rdi destination * rsi source * rdx count @@ -233,12 +234,20 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * eax uncopied bytes or 0 if successful. */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) + cmp $X86_TRAP_MC,%eax + je 3f + movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret +3: + movl %edx,%eax + ASM_CLAC + RET + _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail)