From d19faf0e49eb6fe90e218b9ccfdabd61dc968b41 Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Tue, 13 Jul 2021 12:21:30 +0530 Subject: [PATCH 1/7] EDAC/amd64: Use DEVICE_ATTR helper macros Instead of "open coding" DEVICE_ATTR, use the corresponding helper macros DEVICE_ATTR_{RW,RO,WO} in amd64_edac.c. Some function names needed to be changed to match the device conventions _show and _store, but the functionality itself is unchanged. The devices using EDAC_DCT_ATTR_SHOW() are left unchanged. Reviewed-by: Yazen Ghannam Signed-off-by: Dwaipayan Ray Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210713065130.2151-1-dwaipayanray1@gmail.com --- drivers/edac/amd64_edac.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index f0d8f60acee1..99b06a3e8fb1 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -571,8 +571,8 @@ EDAC_DCT_ATTR_SHOW(dbam0); EDAC_DCT_ATTR_SHOW(top_mem); EDAC_DCT_ATTR_SHOW(top_mem2); -static ssize_t hole_show(struct device *dev, struct device_attribute *mattr, - char *data) +static ssize_t dram_hole_show(struct device *dev, struct device_attribute *mattr, + char *data) { struct mem_ctl_info *mci = to_mci(dev); @@ -593,7 +593,7 @@ static DEVICE_ATTR(dhar, S_IRUGO, dhar_show, NULL); static DEVICE_ATTR(dbam, S_IRUGO, dbam0_show, NULL); static DEVICE_ATTR(topmem, S_IRUGO, top_mem_show, NULL); static DEVICE_ATTR(topmem2, S_IRUGO, top_mem2_show, NULL); -static DEVICE_ATTR(dram_hole, S_IRUGO, hole_show, NULL); +static DEVICE_ATTR_RO(dram_hole); static struct attribute *dbg_attrs[] = { &dev_attr_dhar.attr, @@ -802,16 +802,11 @@ static ssize_t inject_write_store(struct device *dev, * update NUM_INJ_ATTRS in case you add new members */ -static DEVICE_ATTR(inject_section, S_IRUGO | S_IWUSR, - inject_section_show, inject_section_store); -static DEVICE_ATTR(inject_word, S_IRUGO | S_IWUSR, - inject_word_show, inject_word_store); -static 
DEVICE_ATTR(inject_ecc_vector, S_IRUGO | S_IWUSR, - inject_ecc_vector_show, inject_ecc_vector_store); -static DEVICE_ATTR(inject_write, S_IWUSR, - NULL, inject_write_store); -static DEVICE_ATTR(inject_read, S_IWUSR, - NULL, inject_read_store); +static DEVICE_ATTR_RW(inject_section); +static DEVICE_ATTR_RW(inject_word); +static DEVICE_ATTR_RW(inject_ecc_vector); +static DEVICE_ATTR_WO(inject_write); +static DEVICE_ATTR_WO(inject_read); static struct attribute *inj_attrs[] = { &dev_attr_inject_section.attr, From e1ca90b7cc5cb5d3a38321cbb65ad36a59fcb574 Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Wed, 30 Jun 2021 20:58:24 +0530 Subject: [PATCH 2/7] EDAC/mc: Add new HBM2 memory type Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for the new HBM2 (High Bandwidth Memory Gen 2) memory type. Reviewed-by: Yazen Ghannam Signed-off-by: Muralidhara M K Signed-off-by: Naveen Krishna Chatradhi Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210630152828.162659-4-nchatrad@amd.com --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index f6d462d0be2d..2c5975674723 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -166,6 +166,7 @@ const char * const edac_mem_types[] = { [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", [MEM_WIO2] = "Wide-IO-2", + [MEM_HBM2] = "High-bandwidth-memory-Gen2", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index 76d3562d3006..4207d06996a4 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -184,6 +184,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM * @MEM_WIO2: Wide I/O 2. + * @MEM_HBM2: High bandwidth Memory Gen 2. 
*/ enum mem_type { MEM_EMPTY = 0, @@ -212,6 +213,7 @@ enum mem_type { MEM_DDR5, MEM_NVDIMM, MEM_WIO2, + MEM_HBM2, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -239,6 +241,7 @@ enum mem_type { #define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) +#define MEM_FLAG_HBM2 BIT(MEM_HBM2) /** * enum edac_type - Error Detection and Correction capabilities and mode From 767f4b620edadac579c9b8b6660761d4285fa6f9 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Mon, 28 Jun 2021 12:27:40 -0500 Subject: [PATCH 3/7] EDAC/mce_amd: Do not load edac_mce_amd module on guests Hypervisors likely do not expose the SMCA feature to the guest and loading this module leads to false warnings. This module should not be loaded in guests to begin with, but people tend to do so, especially when testing kernels in VMs. And then they complain about those false warnings. Do the practical thing and do not load this module when running as a guest to avoid all that complaining. [ bp: Rewrite commit message. 
] Suggested-by: Borislav Petkov Signed-off-by: Smita Koralahalli Signed-off-by: Borislav Petkov Reviewed-by: Yazen Ghannam Tested-by: Kim Phillips Link: https://lkml.kernel.org/r/20210628172740.245689-1-Smita.KoralahalliChannabasappa@amd.com --- drivers/edac/mce_amd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 27d56920b469..67dbf4c31271 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1246,6 +1246,9 @@ static int __init mce_amd_init(void) c->x86_vendor != X86_VENDOR_HYGON) return -ENODEV; + if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return -ENODEV; + if (boot_cpu_has(X86_FEATURE_SMCA)) { xec_mask = 0x3f; goto out; From 7d07deb3b838ae93994003cf824515acb352eef3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 1 Jun 2021 11:27:04 +0200 Subject: [PATCH 4/7] EDAC/altera: Skip defining unused structures for specific configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Altera EDAC driver has several features conditionally built depending on Kconfig options. The edac_device_prv_data structures are conditionally used in of_device_id tables. They reference other functions and structures which can be defined as __maybe_unused. 
Silence build warnings like: drivers/edac/altera_edac.c:643:37: warning: ‘altr_edac_device_inject_fops’ defined but not used [-Wunused-const-variable=] Reported-by: kernel test robot Signed-off-by: Krzysztof Kozlowski Signed-off-by: Borislav Petkov Acked-by: Dinh Nguyen Link: https://lkml.kernel.org/r/20210601092704.203555-1-krzysztof.kozlowski@canonical.com --- drivers/edac/altera_edac.c | 44 ++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 61c21bd880a4..2949edb93454 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -539,10 +539,18 @@ module_platform_driver(altr_edac_driver); * trigger testing are different for each memory. */ +#ifdef CONFIG_EDAC_ALTERA_OCRAM static const struct edac_device_prv_data ocramecc_data; +#endif +#ifdef CONFIG_EDAC_ALTERA_L2C static const struct edac_device_prv_data l2ecc_data; +#endif +#ifdef CONFIG_EDAC_ALTERA_OCRAM static const struct edac_device_prv_data a10_ocramecc_data; +#endif +#ifdef CONFIG_EDAC_ALTERA_L2C static const struct edac_device_prv_data a10_l2ecc_data; +#endif static irqreturn_t altr_edac_device_handler(int irq, void *dev_id) { @@ -569,9 +577,9 @@ static irqreturn_t altr_edac_device_handler(int irq, void *dev_id) return ret_value; } -static ssize_t altr_edac_device_trig(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t __maybe_unused +altr_edac_device_trig(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { u32 *ptemp, i, error_mask; @@ -640,27 +648,27 @@ static ssize_t altr_edac_device_trig(struct file *file, return count; } -static const struct file_operations altr_edac_device_inject_fops = { +static const struct file_operations altr_edac_device_inject_fops __maybe_unused = { .open = simple_open, .write = altr_edac_device_trig, .llseek = generic_file_llseek, }; -static ssize_t 
altr_edac_a10_device_trig(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos); +static ssize_t __maybe_unused +altr_edac_a10_device_trig(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos); -static const struct file_operations altr_edac_a10_device_inject_fops = { +static const struct file_operations altr_edac_a10_device_inject_fops __maybe_unused = { .open = simple_open, .write = altr_edac_a10_device_trig, .llseek = generic_file_llseek, }; -static ssize_t altr_edac_a10_device_trig2(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos); +static ssize_t __maybe_unused +altr_edac_a10_device_trig2(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos); -static const struct file_operations altr_edac_a10_device_inject2_fops = { +static const struct file_operations altr_edac_a10_device_inject2_fops __maybe_unused = { .open = simple_open, .write = altr_edac_a10_device_trig2, .llseek = generic_file_llseek, @@ -1697,9 +1705,9 @@ MODULE_DEVICE_TABLE(of, altr_edac_a10_device_of_match); * Based on xgene_edac.c peripheral code. */ -static ssize_t altr_edac_a10_device_trig(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t __maybe_unused +altr_edac_a10_device_trig(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { struct edac_device_ctl_info *edac_dci = file->private_data; struct altr_edac_device_dev *drvdata = edac_dci->pvt_info; @@ -1729,9 +1737,9 @@ static ssize_t altr_edac_a10_device_trig(struct file *file, * slightly. A few Arria10 peripherals can use this injection function. * Inject the error into the memory and then readback to trigger the IRQ. 
*/ -static ssize_t altr_edac_a10_device_trig2(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t __maybe_unused +altr_edac_a10_device_trig2(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { struct edac_device_ctl_info *edac_dci = file->private_data; struct altr_edac_device_dev *drvdata = edac_dci->pvt_info; From fd07a4a0d30b5468a1f4a0739e34f5f014df7d44 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 20 Jul 2021 09:30:09 -0700 Subject: [PATCH 5/7] EDAC/skx_common: Set the memory type correctly for HBM memory Set the memory type to MEM_HBM2 if it's managed by the HBM2 memory controller. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210720163009.GA1417532@agluck-desk2.amr.corp.intel.com --- drivers/edac/skx_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 5e83f59bef8a..f9120e36bf3a 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -345,7 +345,10 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, rows = numrow(mtr); cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { + if (imc->hbm_mc) { + banks = 32; + mtype = MEM_HBM2; + } else if (cfg->support_ddr5 && (amap & 0x8)) { banks = 32; mtype = MEM_DDR5; } else { From 2294a7299f5e51667b841f63c6d69474491753fb Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Wed, 18 Aug 2021 10:57:00 -0700 Subject: [PATCH 6/7] EDAC/i10nm: Fix NVDIMM detection MCDDRCFG is a per-channel register and uses bit{0,1} to indicate the NVDIMM presence on DIMM slot{0,1}. Current i10nm_edac driver wrongly uses MCDDRCFG as per-DIMM register and fails to detect the NVDIMM. Fix it by reading MCDDRCFG as per-channel register and using its bit{0,1} to check whether the NVDIMM is populated on DIMM slot{0,1}. 
Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors") Reported-by: Fan Du Tested-by: Wen Jin Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210818175701.1611513-2-tony.luck@intel.com --- drivers/edac/i10nm_base.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 6ce0ed2ffaaf..b4a024cb8b97 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -33,9 +33,9 @@ #define I10NM_GET_DIMMMTR(m, i, j) \ readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ (i) * (m)->chan_mmio_sz + (j) * 4) -#define I10NM_GET_MCDDRTCFG(m, i, j) \ +#define I10NM_GET_MCDDRTCFG(m, i) \ readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ - (i) * (m)->chan_mmio_sz + (j) * 4) + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ (i) * (m)->chan_mmio_sz) @@ -321,10 +321,10 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, ndimms = 0; amap = I10NM_GET_AMAP(imc, i); + mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); for (j = 0; j < imc->num_dimms; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); - mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j); edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n", mtr, mcddrtcfg, imc->mc, i, j); From cf4e6d52f58399c777276172ec250502e19d5e63 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Wed, 18 Aug 2021 10:57:01 -0700 Subject: [PATCH 7/7] EDAC/i10nm: Retrieve and print retry_rd_err_log registers Retrieve and print retry_rd_err_log registers like the earlier change: commit e80634a75aba ("EDAC, skx: Retrieve and print retry_rd_err_log registers") This is a little trickier than on Skylake because of potential interference with BIOS use of the same registers. The default behavior is to ignore these registers. 
A module parameter retry_rd_err_log(default=0) controls the mode of operation: - 0=off : Default. - 1=bios : Linux doesn't reset any control bits, but just reports values. This is "no harm" mode, but it may miss reporting some data. - 2=linux: Linux tries to take control and resets mode bits, clears valid/UC bits after reading. This should be more reliable (especially if BIOS interference is reduced by disabling eMCA reporting mode in BIOS setup). Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Youquan Song Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210818175701.1611513-3-tony.luck@intel.com --- drivers/edac/i10nm_base.c | 146 ++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_base.c | 3 +- drivers/edac/skx_common.c | 4 +- drivers/edac/skx_common.h | 7 +- 4 files changed, 157 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index b4a024cb8b97..83345bfac246 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -42,6 +42,12 @@ #define I10NM_GET_AMAP(m, i) \ readl((m)->mbase + ((m)->hbm_mc ? 
0x814 : 0x20814) + \ (i) * (m)->chan_mmio_sz) +#define I10NM_GET_REG32(m, i, offset) \ + readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_GET_REG64(m, i, offset) \ + readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_SET_REG32(m, i, offset, v) \ + writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) @@ -58,8 +64,125 @@ #define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) #define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) +#define RETRY_RD_ERR_LOG_UC BIT(1) +#define RETRY_RD_ERR_LOG_NOOVER BIT(14) +#define RETRY_RD_ERR_LOG_EN BIT(15) +#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) +#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) + static struct list_head *i10nm_edac_list; +static struct res_config *res_cfg; +static int retry_rd_err_log; + +static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; + +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable) +{ + u32 s, d; + + if (!imc->mbase) + return; + + s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]); + + if (enable) { + /* Save default configurations */ + imc->chan[chan].retry_rd_err_log_s = s; + imc->chan[chan].retry_rd_err_log_d = d; + + s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + s |= RETRY_RD_ERR_LOG_EN; + d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + d |= RETRY_RD_ERR_LOG_EN; + } else { + /* Restore default configurations */ + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) + s |= RETRY_RD_ERR_LOG_UC; + if 
(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER) + s |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN)) + s &= ~RETRY_RD_ERR_LOG_EN; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC) + d |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER) + d |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) + d &= ~RETRY_RD_ERR_LOG_EN; + } + + I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d); +} + +static void enable_retry_rd_err_log(bool enable) +{ + struct skx_dev *d; + int i, j; + + edac_dbg(2, "\n"); + + list_for_each_entry(d, i10nm_edac_list, list) + for (i = 0; i < I10NM_NUM_IMC; i++) + for (j = 0; j < I10NM_NUM_CHANNELS; j++) + __enable_retry_rd_err_log(&d->imc[i], j, enable); +} + +static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, + int len, bool scrub_err) +{ + struct skx_imc *imc = &res->dev->imc[res->imc]; + u32 log0, log1, log2, log3, log4; + u32 corr0, corr1, corr2, corr3; + u64 log2a, log5; + u32 *offsets; + int n; + + if (!imc->mbase) + return; + + offsets = scrub_err ? 
res_cfg->offsets_scrub : res_cfg->offsets_demand; + + log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); + log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); + log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]); + log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); + log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + + if (res_cfg->type == SPR) { + log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]", + log0, log1, log2a, log3, log4, log5); + } else { + log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", + log0, log1, log2, log3, log4, log5); + } + + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + + if (len - n > 0) + snprintf(msg + n, len - n, + " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", + corr0 & 0xffff, corr0 >> 16, + corr1 & 0xffff, corr1 >> 16, + corr2 & 0xffff, corr2 >> 16, + corr3 & 0xffff, corr3 >> 16); + + /* Clear status bits */ + if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } +} + static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, unsigned int dev, unsigned int fun) { @@ -263,6 +386,8 @@ static struct res_config i10nm_cfg0 = { .ddr_chan_mmio_sz = 0x4000, .sad_all_devfn = PCI_DEVFN(29, 0), .sad_all_offset = 0x108, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config i10nm_cfg1 = { @@ -272,6 +397,8 @@ static struct res_config i10nm_cfg1 = { .ddr_chan_mmio_sz = 0x4000, .sad_all_devfn = PCI_DEVFN(29, 0), .sad_all_offset = 0x108, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = 
offsets_demand_icx, }; static struct res_config spr_cfg = { @@ -283,6 +410,8 @@ static struct res_config spr_cfg = { .support_ddr5 = true, .sad_all_devfn = PCI_DEVFN(10, 0), .sad_all_offset = 0x300, + .offsets_scrub = offsets_scrub_spr, + .offsets_demand = offsets_demand_spr, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -422,6 +551,7 @@ static int __init i10nm_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + res_cfg = cfg; rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm); if (rc) @@ -486,6 +616,12 @@ static int __init i10nm_init(void) mce_register_decode_chain(&i10nm_mce_dec); setup_i10nm_debug(); + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, show_retry_rd_err_log); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(true); + } + i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); return 0; @@ -497,6 +633,13 @@ fail: static void __exit i10nm_exit(void) { edac_dbg(2, "\n"); + + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, NULL); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(false); + } + teardown_i10nm_debug(); mce_unregister_decode_chain(&i10nm_mce_dec); skx_adxl_put(); @@ -506,5 +649,8 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +module_param(retry_rd_err_log, int, 0444); +MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)"); + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors"); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 4dbd46575bfb..1abc020d49ab 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -230,7 +230,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) #define 
SKX_ILV_TARGET(tgt) ((tgt) & 7) static void skx_show_retry_rd_err_log(struct decoded_addr *res, - char *msg, int len) + char *msg, int len, + bool scrub_err) { u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index f9120e36bf3a..19c17c5198c5 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -532,6 +532,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); + bool scrub_err = false; bool recoverable; int len; u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); @@ -583,6 +584,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; case 4: optype = "memory scrubbing error"; + scrub_err = true; break; default: optype = "reserved"; @@ -605,7 +607,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } if (skx_show_retry_rd_err_log) - skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); + skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err); edac_dbg(0, "%s\n", skx_msg); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 01f67e731766..03ac067a80b9 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -80,6 +80,8 @@ struct skx_dev { struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; + u32 retry_rd_err_log_s; + u32 retry_rd_err_log_d; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -150,12 +152,15 @@ struct res_config { /* SAD device number and function number */ unsigned int sad_all_devfn; int sad_all_offset; + /* Offsets of retry_rd_err_log registers */ + u32 *offsets_scrub; + u32 *offsets_demand; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); -typedef void (*skx_show_retry_log_f)(struct 
decoded_addr *res, char *msg, int len); +typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err); int __init skx_adxl_get(void); void __exit skx_adxl_put(void);