From 1c38bdc96941fcf8ce9c32d1c45eca5547abd45e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 25 Jul 2019 12:13:11 -0400 Subject: [PATCH 01/19] MAINTAINERS: update EDAC entry to reflect current tree and maintainers Tony will start to officially maintain EDAC trees. Also, we'll be using a single tree for the EDAC development. Signed-off-by: Mauro Carvalho Chehab Acked-by: Borislav Petkov Acked-by: Tony Luck --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..7c22905b5aba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5781,10 +5781,10 @@ F: drivers/edac/thunderx_edac* EDAC-CORE M: Borislav Petkov M: Mauro Carvalho Chehab +M: Tony Luck R: James Morse L: linux-edac@vger.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git for-next -T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac.git linux_next +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next S: Supported F: Documentation/admin-guide/ras.rst F: Documentation/driver-api/edac.rst From 3123c5c4ca157edc518102ee5385f60cb90f93f5 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Fri, 12 Jul 2019 13:28:43 -0500 Subject: [PATCH 02/19] edac: altera: Move Stratix10 SDRAM ECC to peripheral ARM32 SoCFPGAs had separate IRQs for SDRAM. ARM64 SoCFPGAs send all DBEs to SError so filtering by source is necessary. The Stratix10 SDRAM ECC is a better match with the generic Altera peripheral ECC framework because the linked list can be searched to find the ECC block offset and printout the DBE Address. Signed-off-by: Thor Thayer Acked-by: James Morse Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/altera_edac.c | 54 +++++++++++++++++++++++++++++++++++--- drivers/edac/altera_edac.h | 25 +++++++++++++++++- 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index c2e693e34d43..09a80b53acea 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -222,7 +222,6 @@ static unsigned long get_total_mem(void) static const struct of_device_id altr_sdram_ctrl_of_match[] = { { .compatible = "altr,sdram-edac", .data = &c5_data}, { .compatible = "altr,sdram-edac-a10", .data = &a10_data}, - { .compatible = "altr,sdram-edac-s10", .data = &a10_data}, {}, }; MODULE_DEVICE_TABLE(of, altr_sdram_ctrl_of_match); @@ -1170,6 +1169,24 @@ static int __init __maybe_unused altr_init_a10_ecc_device_type(char *compat) return 0; } +/*********************** SDRAM EDAC Device Functions *********************/ + +#ifdef CONFIG_EDAC_ALTERA_SDRAM + +static const struct edac_device_prv_data s10_sdramecc_data = { + .setup = altr_check_ecc_deps, + .ce_clear_mask = ALTR_S10_ECC_SERRPENA, + .ue_clear_mask = ALTR_S10_ECC_DERRPENA, + .ecc_enable_mask = ALTR_S10_ECC_EN, + .ecc_en_ofst = ALTR_S10_ECC_CTRL_SDRAM_OFST, + .ce_set_mask = ALTR_S10_ECC_TSERRA, + .ue_set_mask = ALTR_S10_ECC_TDERRA, + .set_err_ofst = ALTR_S10_ECC_INTTEST_OFST, + .ecc_irq_handler = altr_edac_a10_ecc_irq, + .inject_fops = &altr_edac_a10_device_inject_fops, +}; +#endif /* CONFIG_EDAC_ALTERA_SDRAM */ + /*********************** OCRAM EDAC Device Functions *********************/ #ifdef CONFIG_EDAC_ALTERA_OCRAM @@ -1758,6 +1775,9 @@ static const struct of_device_id altr_edac_a10_device_of_match[] = { #endif #ifdef CONFIG_EDAC_ALTERA_SDMMC { .compatible = "altr,socfpga-sdmmc-ecc", .data = &a10_sdmmcecca_data }, +#endif +#ifdef CONFIG_EDAC_ALTERA_SDRAM + { .compatible = "altr,sdram-edac-s10", .data = &s10_sdramecc_data }, #endif {}, }; @@ -1889,6 +1909,10 @@ static int validate_parent_available(struct device_node *np) struct device_node *parent; int ret = 0; + /* SDRAM must be present for Linux (implied parent) */ + if (of_device_is_compatible(np, "altr,sdram-edac-s10")) + return 0; + /* Ensure parent device is enabled if parent node exists */ parent = of_parse_phandle(np, "altr,ecc-parent", 0); if (parent && !of_device_is_available(parent)) @@ -1898,6 +1922,22 @@ static int validate_parent_available(struct device_node *np) return ret; } +static int get_s10_sdram_edac_resource(struct device_node *np, + struct resource *res) +{ + struct device_node *parent; + int ret; + + parent = of_parse_phandle(np, "altr,sdr-syscon", 0); + if (!parent) + return -ENODEV; + + ret = of_address_to_resource(parent, 0, res); + of_node_put(parent); + + return ret; +} + static int altr_edac_a10_device_add(struct altr_arria10_edac *edac, struct device_node *np) { @@ -1925,7 +1965,11 @@ static int altr_edac_a10_device_add(struct altr_arria10_edac *edac, if (!devres_open_group(edac->dev, altr_edac_a10_device_add, GFP_KERNEL)) return -ENOMEM; - rc = of_address_to_resource(np, 0, &res); + if (of_device_is_compatible(np, "altr,sdram-edac-s10")) + rc = get_s10_sdram_edac_resource(np, &res); + else + rc = of_address_to_resource(np, 0, &res); + if (rc < 0) { edac_printk(KERN_ERR, EDAC_DEVICE, "%s: no resource address\n", ecc_name); @@ -2231,13 +2275,15 @@ static int altr_edac_a10_probe(struct platform_device *pdev) of_device_is_compatible(child, "altr,socfpga-dma-ecc") || of_device_is_compatible(child, "altr,socfpga-usb-ecc") || of_device_is_compatible(child, "altr,socfpga-qspi-ecc") || +#ifdef CONFIG_EDAC_ALTERA_SDRAM + of_device_is_compatible(child, "altr,sdram-edac-s10") || +#endif of_device_is_compatible(child, "altr,socfpga-sdmmc-ecc")) altr_edac_a10_device_add(edac, child); #ifdef CONFIG_EDAC_ALTERA_SDRAM - else if ((of_device_is_compatible(child, "altr,sdram-edac-a10")) || - (of_device_is_compatible(child, "altr,sdram-edac-s10"))) + else if (of_device_is_compatible(child, "altr,sdram-edac-a10")) of_platform_populate(pdev->dev.of_node, altr_sdram_ctrl_of_match, NULL, &pdev->dev); diff --git a/drivers/edac/altera_edac.h b/drivers/edac/altera_edac.h index 55654cc4bcdf..3727e72c8c2e 100644 --- a/drivers/edac/altera_edac.h +++ b/drivers/edac/altera_edac.h @@ -289,6 +289,29 @@ struct altr_sdram_mc_data { #define ALTR_A10_ECC_INIT_WATCHDOG_10US 10000 /************* Stratix10 Defines **************/ +#define ALTR_S10_ECC_CTRL_SDRAM_OFST 0x00 +#define ALTR_S10_ECC_EN BIT(0) + +#define ALTR_S10_ECC_ERRINTEN_OFST 0x10 +#define ALTR_S10_ECC_ERRINTENS_OFST 0x14 +#define ALTR_S10_ECC_ERRINTENR_OFST 0x18 +#define ALTR_S10_ECC_SERRINTEN BIT(0) + +#define ALTR_S10_ECC_INTMODE_OFST 0x1C +#define ALTR_S10_ECC_INTMODE BIT(0) + +#define ALTR_S10_ECC_INTSTAT_OFST 0x20 +#define ALTR_S10_ECC_SERRPENA BIT(0) +#define ALTR_S10_ECC_DERRPENA BIT(8) +#define ALTR_S10_ECC_ERRPENA_MASK (ALTR_S10_ECC_SERRPENA | \ + ALTR_S10_ECC_DERRPENA) + +#define ALTR_S10_ECC_INTTEST_OFST 0x24 +#define ALTR_S10_ECC_TSERRA BIT(0) +#define ALTR_S10_ECC_TDERRA BIT(8) +#define ALTR_S10_ECC_TSERRB BIT(16) +#define ALTR_S10_ECC_TDERRB BIT(24) + #define ALTR_S10_DERR_ADDRA_OFST 0x2C /* Stratix10 ECC Manager Defines */ @@ -300,7 +323,7 @@ struct altr_sdram_mc_data { #define S10_SYSMGR_UE_ADDR_OFST 0x224 #define S10_DDR0_IRQ_MASK BIT(16) -#define S10_DBE_IRQ_MASK 0x3FE +#define S10_DBE_IRQ_MASK 0x3FFFE /* Define ECC Block Offsets for peripherals */ #define ECC_BLK_ADDRESS_OFST 0x40 From 3724ace582d9f675134985727fd5e9811f23c059 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 24 Jun 2019 15:08:55 +0000 Subject: [PATCH 03/19] EDAC/mc: Fix grain_bits calculation The grain in EDAC is defined as "minimum granularity for an error report, in bytes". The following calculation of the grain_bits in edac_mc is wrong: grain_bits = fls_long(e->grain) + 1; Where grain_bits is defined as: grain = 1 << grain_bits Example: grain = 8 # 64 bit (8 bytes) grain_bits = fls_long(8) + 1 grain_bits = 4 + 1 = 5 grain = 1 << grain_bits grain = 1 << 5 = 32 Replace it with the correct calculation: grain_bits = fls_long(e->grain - 1); The example gives now: grain_bits = fls_long(8 - 1) grain_bits = fls_long(7) grain_bits = 3 grain = 1 << 3 = 8 Also, check if the hardware reports a reasonable grain != 0 and fallback with a warning to 1 byte granularity otherwise. [ bp: massage a bit. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190624150758.6695-2-rrichter@marvell.com --- drivers/edac/edac_mc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 64922c8fa7e3..d899d86897d0 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1235,9 +1235,13 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, if (p > e->location) *(p - 1) = '\0'; - /* Report the error via the trace interface */ - grain_bits = fls_long(e->grain) + 1; + /* Sanity-check driver-supplied grain value. */ + if (WARN_ON_ONCE(!e->grain)) + e->grain = 1; + grain_bits = fls_long(e->grain - 1); + + /* Report the error via the trace interface */ if (IS_ENABLED(CONFIG_RAS)) trace_mc_event(type, e->msg, e->label, e->error_count, mci->mc_idx, e->top_layer, e->mid_layer, From 8faa1cf6ed82f33009f63986c3776cc48af1b7b2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 24 Jun 2019 16:47:17 +0300 Subject: [PATCH 04/19] EDAC/altera: Use the proper type for the IRQ status bits Smatch complains about the cast of a u32 pointer to unsigned long: drivers/edac/altera_edac.c:1878 altr_edac_a10_irq_handler() warn: passing casted pointer '&irq_status' to 'find_first_bit()' This code wouldn't work on a 64 bit big endian system because it would read past the end of &irq_status. [ bp: massage. ] Fixes: 13ab8448d2c9 ("EDAC, altera: Add ECC Manager IRQ controller support") Signed-off-by: Dan Carpenter Signed-off-by: Borislav Petkov Reviewed-by: Thor Thayer Cc: James Morse Cc: kernel-janitors@vger.kernel.org Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190624134717.GA1754@mwanda --- drivers/edac/altera_edac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 09a80b53acea..fbda4b876afd 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -1886,6 +1886,7 @@ static void altr_edac_a10_irq_handler(struct irq_desc *desc) struct altr_arria10_edac *edac = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); int irq = irq_desc_get_irq(desc); + unsigned long bits; dberr = (irq == edac->db_irq) ? 1 : 0; sm_offset = dberr ? A10_SYSMGR_ECC_INTSTAT_DERR_OFST : @@ -1895,7 +1896,8 @@ static void altr_edac_a10_irq_handler(struct irq_desc *desc) regmap_read(edac->ecc_mgr_map, sm_offset, &irq_status); - for_each_set_bit(bit, (unsigned long *)&irq_status, 32) { + bits = irq_status; + for_each_set_bit(bit, &bits, 32) { irq = irq_linear_revmap(edac->domain, dberr * 32 + bit); if (irq) generic_handle_irq(irq); From 82413e562ea6eadfb6de946dcc6f74af31d64e7f Mon Sep 17 00:00:00 2001 From: Shravan Kumar Ramani Date: Tue, 25 Jun 2019 15:13:59 -0400 Subject: [PATCH 05/19] EDAC, mellanox: Add ECC support for BlueField DDR4 Add ECC support for Mellanox BlueField SoC DDR controller. This requires SMC to the running Arm Trusted Firmware to report what is the current memory configuration. Reviewed-by: James Morse Signed-off-by: Shravan Kumar Ramani Signed-off-by: Mauro Carvalho Chehab --- MAINTAINERS | 5 + drivers/edac/Kconfig | 7 + drivers/edac/Makefile | 1 + drivers/edac/bluefield_edac.c | 356 ++++++++++++++++++++++++++++++++++ 4 files changed, 369 insertions(+) create mode 100644 drivers/edac/bluefield_edac.c diff --git a/MAINTAINERS b/MAINTAINERS index 7c22905b5aba..97912d8333a9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5757,6 +5757,11 @@ S: Supported F: drivers/edac/aspeed_edac.c F: Documentation/devicetree/bindings/edac/aspeed-sdram-edac.txt +EDAC-BLUEFIELD +M: Shravan Kumar Ramani +S: Supported +F: drivers/edac/bluefield_edac.c + EDAC-CALXEDA M: Robert Richter L: linux-edac@vger.kernel.org diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 200c04ce5b0e..2a2603bfb918 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -510,4 +510,11 @@ config EDAC_ASPEED First, ECC must be configured in the bootloader. Then, this driver will expose error counters via the EDAC kernel framework. +config EDAC_BLUEFIELD + tristate "Mellanox BlueField Memory ECC" + depends on ARM64 && ((MELLANOX_PLATFORM && ACPI) || COMPILE_TEST) + help + Support for error detection and correction on the + Mellanox BlueField SoCs. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 165ca65e1a3a..d265ff9311f0 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -85,3 +85,4 @@ obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o obj-$(CONFIG_EDAC_TI) += ti_edac.o obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o +obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o diff --git a/drivers/edac/bluefield_edac.c b/drivers/edac/bluefield_edac.c new file mode 100644 index 000000000000..e4736eb37bfb --- /dev/null +++ b/drivers/edac/bluefield_edac.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Bluefield-specific EDAC driver. + * + * Copyright (c) 2019 Mellanox Technologies. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "edac_module.h" + +#define DRIVER_NAME "bluefield-edac" + +/* + * Mellanox BlueField EMI (External Memory Interface) register definitions. + */ + +#define MLXBF_ECC_CNT 0x340 +#define MLXBF_ECC_CNT__SERR_CNT GENMASK(15, 0) +#define MLXBF_ECC_CNT__DERR_CNT GENMASK(31, 16) + +#define MLXBF_ECC_ERR 0x348 +#define MLXBF_ECC_ERR__SECC BIT(0) +#define MLXBF_ECC_ERR__DECC BIT(16) + +#define MLXBF_ECC_LATCH_SEL 0x354 +#define MLXBF_ECC_LATCH_SEL__START BIT(24) + +#define MLXBF_ERR_ADDR_0 0x358 + +#define MLXBF_ERR_ADDR_1 0x37c + +#define MLXBF_SYNDROM 0x35c +#define MLXBF_SYNDROM__DERR BIT(0) +#define MLXBF_SYNDROM__SERR BIT(1) +#define MLXBF_SYNDROM__SYN GENMASK(25, 16) + +#define MLXBF_ADD_INFO 0x364 +#define MLXBF_ADD_INFO__ERR_PRANK GENMASK(9, 8) + +#define MLXBF_EDAC_MAX_DIMM_PER_MC 2 +#define MLXBF_EDAC_ERROR_GRAIN 8 + +/* + * Request MLNX_SIP_GET_DIMM_INFO + * + * Retrieve information about DIMM on a certain slot. + * + * Call register usage: + * a0: MLNX_SIP_GET_DIMM_INFO + * a1: (Memory controller index) << 16 | (Dimm index in memory controller) + * a2-7: not used. + * + * Return status: + * a0: MLXBF_DIMM_INFO defined below describing the DIMM. + * a1-3: not used. + */ +#define MLNX_SIP_GET_DIMM_INFO 0x82000008 + +/* Format for the SMC response about the memory information */ +#define MLXBF_DIMM_INFO__SIZE_GB GENMASK_ULL(15, 0) +#define MLXBF_DIMM_INFO__IS_RDIMM BIT(16) +#define MLXBF_DIMM_INFO__IS_LRDIMM BIT(17) +#define MLXBF_DIMM_INFO__IS_NVDIMM BIT(18) +#define MLXBF_DIMM_INFO__RANKS GENMASK_ULL(23, 21) +#define MLXBF_DIMM_INFO__PACKAGE_X GENMASK_ULL(31, 24) + +struct bluefield_edac_priv { + int dimm_ranks[MLXBF_EDAC_MAX_DIMM_PER_MC]; + void __iomem *emi_base; + int dimm_per_mc; +}; + +static u64 smc_call1(u64 smc_op, u64 smc_arg) +{ + struct arm_smccc_res res; + + arm_smccc_smc(smc_op, smc_arg, 0, 0, 0, 0, 0, 0, &res); + + return res.a0; +} + +/* + * Gather the ECC information from the External Memory Interface registers + * and report it to the edac handler. + */ +static void bluefield_gather_report_ecc(struct mem_ctl_info *mci, + int error_cnt, + int is_single_ecc) +{ + struct bluefield_edac_priv *priv = mci->pvt_info; + u32 dram_additional_info, err_prank, edea0, edea1; + u32 ecc_latch_select, dram_syndrom, serr, derr, syndrom; + enum hw_event_mc_err_type ecc_type; + u64 ecc_dimm_addr; + int ecc_dimm; + + ecc_type = is_single_ecc ? HW_EVENT_ERR_CORRECTED : + HW_EVENT_ERR_UNCORRECTED; + + /* + * Tell the External Memory Interface to populate the relevant + * registers with information about the last ECC error occurrence. + */ + ecc_latch_select = MLXBF_ECC_LATCH_SEL__START; + writel(ecc_latch_select, priv->emi_base + MLXBF_ECC_LATCH_SEL); + + /* + * Verify that the ECC reported info in the registers is of the + * same type as the one asked to report. If not, just report the + * error without the detailed information. + */ + dram_syndrom = readl(priv->emi_base + MLXBF_SYNDROM); + serr = FIELD_GET(MLXBF_SYNDROM__SERR, dram_syndrom); + derr = FIELD_GET(MLXBF_SYNDROM__DERR, dram_syndrom); + syndrom = FIELD_GET(MLXBF_SYNDROM__SYN, dram_syndrom); + + if ((is_single_ecc && !serr) || (!is_single_ecc && !derr)) { + edac_mc_handle_error(ecc_type, mci, error_cnt, 0, 0, 0, + 0, 0, -1, mci->ctl_name, ""); + return; + } + + dram_additional_info = readl(priv->emi_base + MLXBF_ADD_INFO); + err_prank = FIELD_GET(MLXBF_ADD_INFO__ERR_PRANK, dram_additional_info); + + ecc_dimm = (err_prank >= 2 && priv->dimm_ranks[0] <= 2) ? 1 : 0; + + edea0 = readl(priv->emi_base + MLXBF_ERR_ADDR_0); + edea1 = readl(priv->emi_base + MLXBF_ERR_ADDR_1); + + ecc_dimm_addr = ((u64)edea1 << 32) | edea0; + + edac_mc_handle_error(ecc_type, mci, error_cnt, + PFN_DOWN(ecc_dimm_addr), + offset_in_page(ecc_dimm_addr), + syndrom, ecc_dimm, 0, 0, mci->ctl_name, ""); +} + +static void bluefield_edac_check(struct mem_ctl_info *mci) +{ + struct bluefield_edac_priv *priv = mci->pvt_info; + u32 ecc_count, single_error_count, double_error_count, ecc_error = 0; + + /* + * The memory controller might not be initialized by the firmware + * when there isn't memory, which may lead to bad register readings. + */ + if (mci->edac_cap == EDAC_FLAG_NONE) + return; + + ecc_count = readl(priv->emi_base + MLXBF_ECC_CNT); + single_error_count = FIELD_GET(MLXBF_ECC_CNT__SERR_CNT, ecc_count); + double_error_count = FIELD_GET(MLXBF_ECC_CNT__DERR_CNT, ecc_count); + + if (single_error_count) { + ecc_error |= MLXBF_ECC_ERR__SECC; + + bluefield_gather_report_ecc(mci, single_error_count, 1); + } + + if (double_error_count) { + ecc_error |= MLXBF_ECC_ERR__DECC; + + bluefield_gather_report_ecc(mci, double_error_count, 0); + } + + /* Write to clear reported errors. */ + if (ecc_count) + writel(ecc_error, priv->emi_base + MLXBF_ECC_ERR); +} + +/* Initialize the DIMMs information for the given memory controller. */ +static void bluefield_edac_init_dimms(struct mem_ctl_info *mci) +{ + struct bluefield_edac_priv *priv = mci->pvt_info; + int mem_ctrl_idx = mci->mc_idx; + struct dimm_info *dimm; + u64 smc_info, smc_arg; + int is_empty = 1, i; + + for (i = 0; i < priv->dimm_per_mc; i++) { + dimm = mci->dimms[i]; + + smc_arg = mem_ctrl_idx << 16 | i; + smc_info = smc_call1(MLNX_SIP_GET_DIMM_INFO, smc_arg); + + if (!FIELD_GET(MLXBF_DIMM_INFO__SIZE_GB, smc_info)) { + dimm->mtype = MEM_EMPTY; + continue; + } + + is_empty = 0; + + dimm->edac_mode = EDAC_SECDED; + + if (FIELD_GET(MLXBF_DIMM_INFO__IS_NVDIMM, smc_info)) + dimm->mtype = MEM_NVDIMM; + else if (FIELD_GET(MLXBF_DIMM_INFO__IS_LRDIMM, smc_info)) + dimm->mtype = MEM_LRDDR4; + else if (FIELD_GET(MLXBF_DIMM_INFO__IS_RDIMM, smc_info)) + dimm->mtype = MEM_RDDR4; + else + dimm->mtype = MEM_DDR4; + + dimm->nr_pages = + FIELD_GET(MLXBF_DIMM_INFO__SIZE_GB, smc_info) * + (SZ_1G / PAGE_SIZE); + dimm->grain = MLXBF_EDAC_ERROR_GRAIN; + + /* Mem controller for BlueField only supports x4, x8 and x16 */ + switch (FIELD_GET(MLXBF_DIMM_INFO__PACKAGE_X, smc_info)) { + case 4: + dimm->dtype = DEV_X4; + break; + case 8: + dimm->dtype = DEV_X8; + break; + case 16: + dimm->dtype = DEV_X16; + break; + default: + dimm->dtype = DEV_UNKNOWN; + } + + priv->dimm_ranks[i] = + FIELD_GET(MLXBF_DIMM_INFO__RANKS, smc_info); + } + + if (is_empty) + mci->edac_cap = EDAC_FLAG_NONE; + else + mci->edac_cap = EDAC_FLAG_SECDED; +} + +static int bluefield_edac_mc_probe(struct platform_device *pdev) +{ + struct bluefield_edac_priv *priv; + struct device *dev = &pdev->dev; + struct edac_mc_layer layers[1]; + struct mem_ctl_info *mci; + struct resource *emi_res; + unsigned int mc_idx, dimm_count; + int rc, ret; + + /* Read the MSS (Memory SubSystem) index from ACPI table. */ + if (device_property_read_u32(dev, "mss_number", &mc_idx)) { + dev_warn(dev, "bf_edac: MSS number unknown\n"); + return -EINVAL; + } + + /* Read the DIMMs per MC from ACPI table. */ + if (device_property_read_u32(dev, "dimm_per_mc", &dimm_count)) { + dev_warn(dev, "bf_edac: DIMMs per MC unknown\n"); + return -EINVAL; + } + + if (dimm_count > MLXBF_EDAC_MAX_DIMM_PER_MC) { + dev_warn(dev, "bf_edac: DIMMs per MC not valid\n"); + return -EINVAL; + } + + emi_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!emi_res) + return -EINVAL; + + layers[0].type = EDAC_MC_LAYER_SLOT; + layers[0].size = dimm_count; + layers[0].is_virt_csrow = true; + + mci = edac_mc_alloc(mc_idx, ARRAY_SIZE(layers), layers, sizeof(*priv)); + if (!mci) + return -ENOMEM; + + priv = mci->pvt_info; + + priv->dimm_per_mc = dimm_count; + priv->emi_base = devm_ioremap_resource(dev, emi_res); + if (IS_ERR(priv->emi_base)) { + dev_err(dev, "failed to map EMI IO resource\n"); + ret = PTR_ERR(priv->emi_base); + goto err; + } + + mci->pdev = dev; + mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_RDDR4 | + MEM_FLAG_LRDDR4 | MEM_FLAG_NVDIMM; + mci->edac_ctl_cap = EDAC_FLAG_SECDED; + + mci->mod_name = DRIVER_NAME; + mci->ctl_name = "BlueField_Memory_Controller"; + mci->dev_name = dev_name(dev); + mci->edac_check = bluefield_edac_check; + + /* Initialize mci with the actual populated DIMM information. */ + bluefield_edac_init_dimms(mci); + + platform_set_drvdata(pdev, mci); + + /* Register with EDAC core */ + rc = edac_mc_add_mc(mci); + if (rc) { + dev_err(dev, "failed to register with EDAC core\n"); + ret = rc; + goto err; + } + + /* Only POLL mode supported so far. */ + edac_op_state = EDAC_OPSTATE_POLL; + + return 0; + +err: + edac_mc_free(mci); + + return ret; + +} + +static int bluefield_edac_mc_remove(struct platform_device *pdev) +{ + struct mem_ctl_info *mci = platform_get_drvdata(pdev); + + edac_mc_del_mc(&pdev->dev); + edac_mc_free(mci); + + return 0; +} + +static const struct acpi_device_id bluefield_mc_acpi_ids[] = { + {"MLNXBF08", 0}, + {} +}; + +MODULE_DEVICE_TABLE(acpi, bluefield_mc_acpi_ids); + +static struct platform_driver bluefield_edac_mc_driver = { + .driver = { + .name = DRIVER_NAME, + .acpi_match_table = bluefield_mc_acpi_ids, + }, + .probe = bluefield_edac_mc_probe, + .remove = bluefield_edac_mc_remove, +}; + +module_platform_driver(bluefield_edac_mc_driver); + +MODULE_DESCRIPTION("Mellanox BlueField memory edac driver"); +MODULE_AUTHOR("Mellanox Technologies"); +MODULE_LICENSE("GPL v2"); From 29a3388bfcce7a6d087051376ea02bf8326a957b Mon Sep 17 00:00:00 2001 From: Stephen Douthit Date: Fri, 9 Aug 2019 14:18:02 +0000 Subject: [PATCH 06/19] EDAC, pnd2: Fix ioremap() size in dnv_rd_reg() Depending on how BIOS has marked the reserved region containing the 32KB MCHBAR you can get warnings like: resource sanity check: requesting [mem 0xfed10000-0xfed1ffff], which spans more than reserved [mem 0xfed10000-0xfed17fff] caller dnv_rd_reg+0xc8/0x240 [pnd2_edac] mapping multiple BARs Not all of the mmio regions used in dnv_rd_reg() are the same size. The MCHBAR window is 32KB and the sideband ports are 64KB. Pass the correct size to ioremap() depending on which resource we're reading from. Signed-off-by: Stephen Douthit Signed-off-by: Tony Luck --- drivers/edac/pnd2_edac.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index ca25f8fe57ef..1ad538baaa4a 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -260,11 +260,14 @@ static u64 get_sideband_reg_base_addr(void) } } +#define DNV_MCHBAR_SIZE 0x8000 +#define DNV_SB_PORT_SIZE 0x10000 static int dnv_rd_reg(int port, int off, int op, void *data, size_t sz, char *name) { struct pci_dev *pdev; char *base; u64 addr; + unsigned long size; if (op == 4) { pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x1980, NULL); @@ -279,15 +282,17 @@ static int dnv_rd_reg(int port, int off, int op, void *data, size_t sz, char *na addr = get_mem_ctrl_hub_base_addr(); if (!addr) return -ENODEV; + size = DNV_MCHBAR_SIZE; } else { /* MMIO via sideband register base address */ addr = get_sideband_reg_base_addr(); if (!addr) return -ENODEV; addr += (port << 16); + size = DNV_SB_PORT_SIZE; } - base = ioremap((resource_size_t)addr, 0x10000); + base = ioremap((resource_size_t)addr, size); if (!base) return -ENODEV; From 718d58514ebc5740f80460ae7b73c57a01eac4c2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 24 Jun 2019 15:09:13 +0000 Subject: [PATCH 07/19] EDAC/mc: Cleanup _edac_mc_free() code Remove needless and boilerplate variable declarations. No functional changes. [ bp: Add newlines for better readability. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Cc: linux-edac Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190624150758.6695-10-rrichter@marvell.com --- drivers/edac/edac_mc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index d899d86897d0..c68f62ab54b0 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -275,28 +275,27 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems) static void _edac_mc_free(struct mem_ctl_info *mci) { - int i, chn, row; struct csrow_info *csr; - const unsigned int tot_dimms = mci->tot_dimms; - const unsigned int tot_channels = mci->num_cschannel; - const unsigned int tot_csrows = mci->nr_csrows; + int i, chn, row; if (mci->dimms) { - for (i = 0; i < tot_dimms; i++) + for (i = 0; i < mci->tot_dimms; i++) kfree(mci->dimms[i]); kfree(mci->dimms); } + if (mci->csrows) { - for (row = 0; row < tot_csrows; row++) { + for (row = 0; row < mci->nr_csrows; row++) { csr = mci->csrows[row]; - if (csr) { - if (csr->channels) { - for (chn = 0; chn < tot_channels; chn++) - kfree(csr->channels[chn]); - kfree(csr->channels); - } - kfree(csr); + if (!csr) + continue; + + if (csr->channels) { + for (chn = 0; chn < mci->num_cschannel; chn++) + kfree(csr->channels[chn]); + kfree(csr->channels); } + kfree(csr); } kfree(mci->csrows); } From d971e28e2ce4696fcc32998c8aced5e47701fffe Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Wed, 21 Aug 2019 23:59:55 +0000 Subject: [PATCH 08/19] EDAC/amd64: Support more than two controllers for chip selects handling The struct chip_select array that's used for saving chip select bases and masks is fixed at length of two. There should be one struct chip_select for each controller, so this array should be increased to support systems that may have more than two controllers. Increase the size of the struct chip_select array to eight, which is the largest number of controllers per die currently supported on AMD systems. Fix number of DIMMs and Chip Select bases/masks on Family17h, because AMD Family 17h systems support 2 DIMMs, 4 CS bases, and 2 CS masks per channel. Also, carve out the Family 17h+ reading of the bases/masks into a separate function. This effectively reverts the original bases/masks reading code to before Family 17h support was added. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190821235938.118710-2-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 123 +++++++++++++++++++++----------------- drivers/edac/amd64_edac.h | 5 +- 2 files changed, 71 insertions(+), 57 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 873437be86d9..dd60cf5a3d96 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -810,7 +810,7 @@ static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) edac_printk(KERN_DEBUG, EDAC_MC, "UMC%d chip selects:\n", ctrl); - for (dimm = 0; dimm < 4; dimm++) { + for (dimm = 0; dimm < 2; dimm++) { size0 = 0; cs0 = dimm * 2; @@ -942,89 +942,102 @@ static void prep_chip_selects(struct amd64_pvt *pvt) } else if (pvt->fam == 0x15 && pvt->model == 0x30) { pvt->csels[0].b_cnt = pvt->csels[1].b_cnt = 4; pvt->csels[0].m_cnt = pvt->csels[1].m_cnt = 2; + } else if (pvt->fam >= 0x17) { + int umc; + + for_each_umc(umc) { + pvt->csels[umc].b_cnt = 4; + pvt->csels[umc].m_cnt = 2; + } + } else { pvt->csels[0].b_cnt = pvt->csels[1].b_cnt = 8; pvt->csels[0].m_cnt = pvt->csels[1].m_cnt = 4; } } +static void read_umc_base_mask(struct amd64_pvt *pvt) +{ + u32 umc_base_reg, umc_mask_reg; + u32 base_reg, mask_reg; + u32 *base, *mask; + int cs, umc; + + for_each_umc(umc) { + umc_base_reg = get_umc_base(umc) + UMCCH_BASE_ADDR; + + for_each_chip_select(cs, umc, pvt) { + base = &pvt->csels[umc].csbases[cs]; + + base_reg = umc_base_reg + (cs * 4); + + if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) + edac_dbg(0, " DCSB%d[%d]=0x%08x reg: 0x%x\n", + umc, cs, *base, base_reg); + } + + umc_mask_reg = get_umc_base(umc) + UMCCH_ADDR_MASK; + + for_each_chip_select_mask(cs, umc, pvt) { + mask = &pvt->csels[umc].csmasks[cs]; + + mask_reg = umc_mask_reg + (cs * 4); + + if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) + edac_dbg(0, " DCSM%d[%d]=0x%08x reg: 0x%x\n", + umc, cs, *mask, mask_reg); + } + } +} + /* * Function 2 Offset F10_DCSB0; read in the DCS Base and DCS Mask registers */ static void read_dct_base_mask(struct amd64_pvt *pvt) { - int base_reg0, base_reg1, mask_reg0, mask_reg1, cs; + int cs; prep_chip_selects(pvt); - if (pvt->umc) { - base_reg0 = get_umc_base(0) + UMCCH_BASE_ADDR; - base_reg1 = get_umc_base(1) + UMCCH_BASE_ADDR; - mask_reg0 = get_umc_base(0) + UMCCH_ADDR_MASK; - mask_reg1 = get_umc_base(1) + UMCCH_ADDR_MASK; - } else { - base_reg0 = DCSB0; - base_reg1 = DCSB1; - mask_reg0 = DCSM0; - mask_reg1 = DCSM1; - } + if (pvt->umc) + return read_umc_base_mask(pvt); for_each_chip_select(cs, 0, pvt) { - int reg0 = base_reg0 + (cs * 4); - int reg1 = base_reg1 + (cs * 4); + int reg0 = DCSB0 + (cs * 4); + int reg1 = DCSB1 + (cs * 4); u32 *base0 = &pvt->csels[0].csbases[cs]; u32 *base1 = &pvt->csels[1].csbases[cs]; - if (pvt->umc) { - if (!amd_smn_read(pvt->mc_node_id, reg0, base0)) - edac_dbg(0, " DCSB0[%d]=0x%08x reg: 0x%x\n", - cs, *base0, reg0); + if (!amd64_read_dct_pci_cfg(pvt, 0, reg0, base0)) + edac_dbg(0, " DCSB0[%d]=0x%08x reg: F2x%x\n", + cs, *base0, reg0); - if (!amd_smn_read(pvt->mc_node_id, reg1, base1)) - edac_dbg(0, " DCSB1[%d]=0x%08x reg: 0x%x\n", - cs, *base1, reg1); - } else { - if (!amd64_read_dct_pci_cfg(pvt, 0, reg0, base0)) - edac_dbg(0, " DCSB0[%d]=0x%08x reg: F2x%x\n", - cs, *base0, reg0); + if (pvt->fam == 0xf) + continue; - if (pvt->fam == 0xf) - continue; - - if (!amd64_read_dct_pci_cfg(pvt, 1, reg0, base1)) - edac_dbg(0, " DCSB1[%d]=0x%08x reg: F2x%x\n", - cs, *base1, (pvt->fam == 0x10) ? reg1 - : reg0); - } + if (!amd64_read_dct_pci_cfg(pvt, 1, reg0, base1)) + edac_dbg(0, " DCSB1[%d]=0x%08x reg: F2x%x\n", + cs, *base1, (pvt->fam == 0x10) ? reg1 + : reg0); } for_each_chip_select_mask(cs, 0, pvt) { - int reg0 = mask_reg0 + (cs * 4); - int reg1 = mask_reg1 + (cs * 4); + int reg0 = DCSM0 + (cs * 4); + int reg1 = DCSM1 + (cs * 4); u32 *mask0 = &pvt->csels[0].csmasks[cs]; u32 *mask1 = &pvt->csels[1].csmasks[cs]; - if (pvt->umc) { - if (!amd_smn_read(pvt->mc_node_id, reg0, mask0)) - edac_dbg(0, " DCSM0[%d]=0x%08x reg: 0x%x\n", - cs, *mask0, reg0); + if (!amd64_read_dct_pci_cfg(pvt, 0, reg0, mask0)) + edac_dbg(0, " DCSM0[%d]=0x%08x reg: F2x%x\n", + cs, *mask0, reg0); - if (!amd_smn_read(pvt->mc_node_id, reg1, mask1)) - edac_dbg(0, " DCSM1[%d]=0x%08x reg: 0x%x\n", - cs, *mask1, reg1); - } else { - if (!amd64_read_dct_pci_cfg(pvt, 0, reg0, mask0)) - edac_dbg(0, " DCSM0[%d]=0x%08x reg: F2x%x\n", - cs, *mask0, reg0); + if (pvt->fam == 0xf) + continue; - if (pvt->fam == 0xf) - continue; - - if (!amd64_read_dct_pci_cfg(pvt, 1, reg0, mask1)) - edac_dbg(0, " DCSM1[%d]=0x%08x reg: F2x%x\n", - cs, *mask1, (pvt->fam == 0x10) ? reg1 - : reg0); - } + if (!amd64_read_dct_pci_cfg(pvt, 1, reg0, mask1)) + edac_dbg(0, " DCSM1[%d]=0x%08x reg: F2x%x\n", + cs, *mask1, (pvt->fam == 0x10) ? reg1 + : reg0); } } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 8f66472f7adc..4dce6a2ac75f 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -96,6 +96,7 @@ /* Hardware limit on ChipSelect rows per MC and processors per system */ #define NUM_CHIPSELECTS 8 #define DRAM_RANGES 8 +#define NUM_CONTROLLERS 8 #define ON true #define OFF false @@ -351,8 +352,8 @@ struct amd64_pvt { u32 dbam0; /* DRAM Base Address Mapping reg for DCT0 */ u32 dbam1; /* DRAM Base Address Mapping reg for DCT1 */ - /* one for each DCT */ - struct chip_select csels[2]; + /* one for each DCT/UMC */ + struct chip_select csels[NUM_CONTROLLERS]; /* DRAM base and limit pairs F1x[78,70,68,60,58,50,48,40] */ struct dram_range ranges[DRAM_RANGES]; From f8be8e5680225ac9caf07d4545f8529b7395327f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Wed, 21 Aug 2019 23:59:56 +0000 Subject: [PATCH 09/19] EDAC/amd64: Recognize DRAM device type ECC capability AMD Family 17h systems support x4 and x16 DRAM devices. However, the device type is not checked when setting mci.edac_ctl_cap. Set the appropriate capability flag based on the device type. Default to x8 DRAM device when neither the x4 or x16 bits are set. [ bp: reverse cpk_en check to save an indentation level. ] Fixes: 2d09d8f301f5 ("EDAC, amd64: Determine EDAC MC capabilities on Fam17h") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190821235938.118710-3-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index dd60cf5a3d96..ffe56a8fe39d 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3150,12 +3150,15 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid) static inline void f17h_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt) { - u8 i, ecc_en = 1, cpk_en = 1; + u8 i, ecc_en = 1, cpk_en = 1, dev_x4 = 1, dev_x16 = 1; for_each_umc(i) { if (pvt->umc[i].sdp_ctrl & UMC_SDP_INIT) { ecc_en &= !!(pvt->umc[i].umc_cap_hi & UMC_ECC_ENABLED); cpk_en &= !!(pvt->umc[i].umc_cap_hi & UMC_ECC_CHIPKILL_CAP); + + dev_x4 &= !!(pvt->umc[i].dimm_cfg & BIT(6)); + dev_x16 &= !!(pvt->umc[i].dimm_cfg & BIT(7)); } } @@ -3163,8 +3166,15 @@ f17h_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt) if (ecc_en) { mci->edac_ctl_cap |= EDAC_FLAG_SECDED; - if (cpk_en) + if (!cpk_en) + return; + + if (dev_x4) mci->edac_ctl_cap |= EDAC_FLAG_S4ECD4ED; + else if (dev_x16) + mci->edac_ctl_cap |= EDAC_FLAG_S16ECD16ED; + else + mci->edac_ctl_cap |= EDAC_FLAG_S8ECD8ED; } } From 353a1fcb8f9e5857c0fb720b9e57a86c1fb7c17e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Wed, 21 Aug 2019 23:59:57 +0000 Subject: [PATCH 10/19] EDAC/amd64: Initialize DIMM info for systems with more than two channels Currently, the DIMM info for AMD Family 17h systems is initialized in init_csrows(). This function is shared with legacy systems, and it has a limit of two channel support. This prevents initialization of the DIMM info for a number of ranks, so there will be missing ranks in the EDAC sysfs. Create a new init_csrows_df() for Family17h+ and revert init_csrows() back to pre-Family17h support. Loop over all channels in the new function in order to support systems with more than two channels. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190821235938.118710-4-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 66 ++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index ffe56a8fe39d..6f67c48b66cd 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2837,6 +2837,49 @@ static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig) return nr_pages; } +static int init_csrows_df(struct mem_ctl_info *mci) +{ + struct amd64_pvt *pvt = mci->pvt_info; + enum edac_type edac_mode = EDAC_NONE; + enum dev_type dev_type = DEV_UNKNOWN; + struct dimm_info *dimm; + int empty = 1; + u8 umc, cs; + + if (mci->edac_ctl_cap & EDAC_FLAG_S16ECD16ED) { + edac_mode = EDAC_S16ECD16ED; + dev_type = DEV_X16; + } else if (mci->edac_ctl_cap & EDAC_FLAG_S8ECD8ED) { + edac_mode = EDAC_S8ECD8ED; + dev_type = DEV_X8; + } else if (mci->edac_ctl_cap & EDAC_FLAG_S4ECD4ED) { + edac_mode = EDAC_S4ECD4ED; + dev_type = DEV_X4; + } else if (mci->edac_ctl_cap & EDAC_FLAG_SECDED) { + edac_mode = EDAC_SECDED; + } + + for_each_umc(umc) { + for_each_chip_select(cs, umc, pvt) { + if (!csrow_enabled(cs, umc, pvt)) + continue; + + empty = 0; + dimm = mci->csrows[cs]->channels[umc]->dimm; + + edac_dbg(1, "MC node: %d, csrow: %d\n", + pvt->mc_node_id, cs); + + dimm->nr_pages = get_csrow_nr_pages(pvt, umc, cs); + dimm->mtype = pvt->dram_type; + dimm->edac_mode = edac_mode; + dimm->dtype = dev_type; + } + } + + return empty; +} + /* * Initialize the array of csrow attribute instances, based on the values * from pci config hardware registers. @@ -2851,15 +2894,16 @@ static int init_csrows(struct mem_ctl_info *mci) int nr_pages = 0; u32 val; - if (!pvt->umc) { - amd64_read_pci_cfg(pvt->F3, NBCFG, &val); + if (pvt->umc) + return init_csrows_df(mci); - pvt->nbcfg = val; + amd64_read_pci_cfg(pvt->F3, NBCFG, &val); - edac_dbg(0, "node %d, NBCFG=0x%08x[ChipKillEccCap: %d|DramEccEn: %d]\n", - pvt->mc_node_id, val, - !!(val & NBCFG_CHIPKILL), !!(val & NBCFG_ECC_ENABLE)); - } + pvt->nbcfg = val; + + edac_dbg(0, "node %d, NBCFG=0x%08x[ChipKillEccCap: %d|DramEccEn: %d]\n", + pvt->mc_node_id, val, + !!(val & NBCFG_CHIPKILL), !!(val & NBCFG_ECC_ENABLE)); /* * We iterate over DCT0 here but we look at DCT1 in parallel, if needed. @@ -2896,13 +2940,7 @@ static int init_csrows(struct mem_ctl_info *mci) edac_dbg(1, "Total csrow%d pages: %u\n", i, nr_pages); /* Determine DIMM ECC mode: */ - if (pvt->umc) { - if (mci->edac_ctl_cap & EDAC_FLAG_S4ECD4ED) - edac_mode = EDAC_S4ECD4ED; - else if (mci->edac_ctl_cap & EDAC_FLAG_SECDED) - edac_mode = EDAC_SECDED; - - } else if (pvt->nbcfg & NBCFG_ECC_ENABLE) { + if (pvt->nbcfg & NBCFG_ECC_ENABLE) { edac_mode = (pvt->nbcfg & NBCFG_CHIPKILL) ? EDAC_S4ECD4ED : EDAC_SECDED; From e53a3b267fb0a79db9ca1f1e08b97889b22013e6 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Wed, 21 Aug 2019 23:59:59 +0000 Subject: [PATCH 11/19] EDAC/amd64: Find Chip Select memory size using Address Mask Chip Select memory size reporting on AMD Family 17h was recently fixed in order to account for interleaving. However, the current method is not robust. The Chip Select Address Mask can be used to find the memory size. There are a couple of cases. 1) For single-rank and dual-rank non-interleaved, use the address mask plus 1 as the size. 2) For dual-rank interleaved, do #1 but "de-interleave" the address mask first. Always "de-interleave" the address mask in order to simplify the code flow. Bit mask manipulation is necessary to check for interleaving, so just go ahead and do the de-interleaving. In the non-interleaved case, the original and de-interleaved address masks will be the same. To de-interleave the mask, count the number of zero bits in the middle of the mask and swap them with the most significant bits. For example, Original=0xFFFF9FE, De-interleaved=0x3FFFFFE Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190821235938.118710-5-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 114 +++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 44 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 6f67c48b66cd..8f300c0714b8 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -788,51 +788,39 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) (dclr & BIT(15)) ? "yes" : "no"); } -/* - * The Address Mask should be a contiguous set of bits in the non-interleaved - * case. So to check for CS interleaving, find the most- and least-significant - * bits of the mask, generate a contiguous bitmask, and compare the two. - */ -static bool f17_cs_interleaved(struct amd64_pvt *pvt, u8 ctrl, int cs) +#define CS_EVEN_PRIMARY BIT(0) +#define CS_ODD_PRIMARY BIT(1) + +#define CS_EVEN CS_EVEN_PRIMARY +#define CS_ODD CS_ODD_PRIMARY + +static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) { - u32 mask = pvt->csels[ctrl].csmasks[cs >> 1]; - u32 msb = fls(mask) - 1, lsb = ffs(mask) - 1; - u32 test_mask = GENMASK(msb, lsb); + int cs_mode = 0; - edac_dbg(1, "mask=0x%08x test_mask=0x%08x\n", mask, test_mask); + if (csrow_enabled(2 * dimm, ctrl, pvt)) + cs_mode |= CS_EVEN_PRIMARY; - return mask ^ test_mask; + if (csrow_enabled(2 * dimm + 1, ctrl, pvt)) + cs_mode |= CS_ODD_PRIMARY; + + return cs_mode; } static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) { - int dimm, size0, size1, cs0, cs1; + int dimm, size0, size1, cs0, cs1, cs_mode; edac_printk(KERN_DEBUG, EDAC_MC, "UMC%d chip selects:\n", ctrl); for (dimm = 0; dimm < 2; dimm++) { - size0 = 0; cs0 = dimm * 2; - - if (csrow_enabled(cs0, ctrl, pvt)) - size0 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, cs0); - - size1 = 0; cs1 = dimm * 2 + 1; - if (csrow_enabled(cs1, ctrl, pvt)) { - /* - * CS interleaving is only supported if both CSes have - * the same amount of memory. Because they are - * interleaved, it will look like both CSes have the - * full amount of memory. Save the size for both as - * half the amount we found on CS0, if interleaved. - */ - if (f17_cs_interleaved(pvt, ctrl, cs1)) - size1 = size0 = (size0 >> 1); - else - size1 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, cs1); - } + cs_mode = f17_get_cs_mode(dimm, ctrl, pvt); + + size0 = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs0); + size1 = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs1); amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", cs0, size0, @@ -1569,18 +1557,54 @@ static int f16_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct, return ddr3_cs_size(cs_mode, false); } -static int f17_base_addr_to_cs_size(struct amd64_pvt *pvt, u8 umc, +static int f17_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, unsigned int cs_mode, int csrow_nr) { - u32 base_addr = pvt->csels[umc].csbases[csrow_nr]; + u32 addr_mask_orig, addr_mask_deinterleaved; + u32 msb, weight, num_zero_bits; + int dimm, size = 0; - /* Each mask is used for every two base addresses. */ - u32 addr_mask = pvt->csels[umc].csmasks[csrow_nr >> 1]; + /* No Chip Selects are enabled. */ + if (!cs_mode) + return size; - /* Register [31:1] = Address [39:9]. Size is in kBs here. */ - u32 size = ((addr_mask >> 1) - (base_addr >> 1) + 1) >> 1; + /* Requested size of an even CS but none are enabled. */ + if (!(cs_mode & CS_EVEN) && !(csrow_nr & 1)) + return size; - edac_dbg(1, "BaseAddr: 0x%x, AddrMask: 0x%x\n", base_addr, addr_mask); + /* Requested size of an odd CS but none are enabled. */ + if (!(cs_mode & CS_ODD) && (csrow_nr & 1)) + return size; + + /* + * There is one mask per DIMM, and two Chip Selects per DIMM. + * CS0 and CS1 -> DIMM0 + * CS2 and CS3 -> DIMM1 + */ + dimm = csrow_nr >> 1; + + addr_mask_orig = pvt->csels[umc].csmasks[dimm]; + + /* + * The number of zero bits in the mask is equal to the number of bits + * in a full mask minus the number of bits in the current mask. + * + * The MSB is the number of bits in the full mask because BIT[0] is + * always 0. + */ + msb = fls(addr_mask_orig) - 1; + weight = hweight_long(addr_mask_orig); + num_zero_bits = msb - weight; + + /* Take the number of zero bits off from the top of the mask. */ + addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1); + + edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm); + edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig); + edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved); + + /* Register [31:1] = Address [39:9]. Size is in kBs here. */ + size = (addr_mask_deinterleaved >> 2) + 1; /* Return size in MBs. */ return size >> 10; @@ -2245,7 +2269,7 @@ static struct amd64_family_type family_types[] = { .f6_id = PCI_DEVICE_ID_AMD_17H_DF_F6, .ops = { .early_channel_count = f17_early_channel_count, - .dbam_to_cs = f17_base_addr_to_cs_size, + .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F17_M10H_CPUS] = { @@ -2254,7 +2278,7 @@ static struct amd64_family_type family_types[] = { .f6_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F6, .ops = { .early_channel_count = f17_early_channel_count, - .dbam_to_cs = f17_base_addr_to_cs_size, + .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F17_M30H_CPUS] = { @@ -2263,7 +2287,7 @@ static struct amd64_family_type family_types[] = { .f6_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F6, .ops = { .early_channel_count = f17_early_channel_count, - .dbam_to_cs = f17_base_addr_to_cs_size, + .dbam_to_cs = f17_addr_mask_to_cs_size, } }, }; @@ -2822,10 +2846,12 @@ static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig) int csrow_nr = csrow_nr_orig; u32 cs_mode, nr_pages; - if (!pvt->umc) + if (!pvt->umc) { csrow_nr >>= 1; - - cs_mode = DBAM_DIMM(csrow_nr, dbam); + cs_mode = DBAM_DIMM(csrow_nr, dbam); + } else { + cs_mode = f17_get_cs_mode(csrow_nr >> 1, dct, pvt); + } nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, csrow_nr); nr_pages <<= 20 - PAGE_SHIFT; From 8a2eaab7daf03b23ac902481218034ae2fae5e16 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 22 Aug 2019 00:00:00 +0000 Subject: [PATCH 12/19] EDAC/amd64: Decode syndrome before translating address AMD Family 17h systems currently require address translation in order to report the system address of a DRAM ECC error. This is currently done before decoding the syndrome information. The syndrome information does not depend on the address translation, so the proper EDAC csrow/channel reporting can function without the address. However, the syndrome information will not be decoded if the address translation fails. Decode the syndrome information before doing the address translation. The syndrome information is architecturally defined in MCA_SYND and can be considered robust. The address translation is system-specific and may fail on newer systems without proper updates to the translation algorithm. Fixes: 713ad54675fd ("EDAC, amd64: Define and register UMC error decode function") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190821235938.118710-6-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 8f300c0714b8..e4bc48201b0c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2574,13 +2574,6 @@ static void decode_umc_error(int node_id, struct mce *m) err.channel = find_umc_channel(m); - if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) { - err.err_code = ERR_NORM_ADDR; - goto log_error; - } - - error_address_to_page_and_offset(sys_addr, &err); - if (!(m->status & MCI_STATUS_SYNDV)) { err.err_code = ERR_SYND; goto log_error; @@ -2597,6 +2590,13 @@ static void decode_umc_error(int node_id, struct mce *m) err.csrow = m->synd & 0x7; + if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) { + err.err_code = ERR_NORM_ADDR; + goto log_error; + } + + error_address_to_page_and_offset(sys_addr, &err); + log_error: __log_ecc_error(mci, &err, ecc_type); } From 7574729e91468d568cc198de438feb35ef04f41a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 22 Aug 2019 00:00:01 +0000 Subject: [PATCH 13/19] EDAC/amd64: Cache secondary Chip Select registers AMD Family 17h systems have a set of secondary Chip Select Base Addresses and Address Masks. These do not represent unique Chip Selects, rather they are used in conjunction with the primary Chip Select registers in certain cases. Cache these secondary Chip Select registers for future use. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190821235938.118710-7-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 23 ++++++++++++++++++++--- drivers/edac/amd64_edac.h | 4 ++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e4bc48201b0c..23251bba8eb6 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -946,34 +946,51 @@ static void prep_chip_selects(struct amd64_pvt *pvt) static void read_umc_base_mask(struct amd64_pvt *pvt) { - u32 umc_base_reg, umc_mask_reg; - u32 base_reg, mask_reg; - u32 *base, *mask; + u32 umc_base_reg, umc_base_reg_sec; + u32 umc_mask_reg, umc_mask_reg_sec; + u32 base_reg, base_reg_sec; + u32 mask_reg, mask_reg_sec; + u32 *base, *base_sec; + u32 *mask, *mask_sec; int cs, umc; for_each_umc(umc) { umc_base_reg = get_umc_base(umc) + UMCCH_BASE_ADDR; + umc_base_reg_sec = get_umc_base(umc) + UMCCH_BASE_ADDR_SEC; for_each_chip_select(cs, umc, pvt) { base = &pvt->csels[umc].csbases[cs]; + base_sec = &pvt->csels[umc].csbases_sec[cs]; base_reg = umc_base_reg + (cs * 4); + base_reg_sec = umc_base_reg_sec + (cs * 4); if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) edac_dbg(0, " DCSB%d[%d]=0x%08x reg: 0x%x\n", umc, cs, *base, base_reg); + + if (!amd_smn_read(pvt->mc_node_id, base_reg_sec, base_sec)) + edac_dbg(0, " DCSB_SEC%d[%d]=0x%08x reg: 0x%x\n", + umc, cs, *base_sec, base_reg_sec); } umc_mask_reg = get_umc_base(umc) + UMCCH_ADDR_MASK; + umc_mask_reg_sec = get_umc_base(umc) + UMCCH_ADDR_MASK_SEC; for_each_chip_select_mask(cs, umc, pvt) { mask = &pvt->csels[umc].csmasks[cs]; + mask_sec = &pvt->csels[umc].csmasks_sec[cs]; mask_reg = umc_mask_reg + (cs * 4); + mask_reg_sec = umc_mask_reg_sec + (cs * 4); if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) edac_dbg(0, " DCSM%d[%d]=0x%08x reg: 0x%x\n", umc, cs, *mask, mask_reg); + + if (!amd_smn_read(pvt->mc_node_id, mask_reg_sec, mask_sec)) + edac_dbg(0, " DCSM_SEC%d[%d]=0x%08x reg: 0x%x\n", + umc, cs, *mask_sec, mask_reg_sec); } } } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 4dce6a2ac75f..68f12de6e654 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -259,7 +259,9 @@ /* UMC CH register offsets */ #define UMCCH_BASE_ADDR 0x0 +#define UMCCH_BASE_ADDR_SEC 0x10 #define UMCCH_ADDR_MASK 0x20 +#define UMCCH_ADDR_MASK_SEC 0x28 #define UMCCH_ADDR_CFG 0x30 #define UMCCH_DIMM_CFG 0x80 #define UMCCH_UMC_CFG 0x100 @@ -312,9 +314,11 @@ struct dram_range { /* A DCT chip selects collection */ struct chip_select { u32 csbases[NUM_CHIPSELECTS]; + u32 csbases_sec[NUM_CHIPSELECTS]; u8 b_cnt; u32 csmasks[NUM_CHIPSELECTS]; + u32 csmasks_sec[NUM_CHIPSELECTS]; u8 m_cnt; }; From 81f5090db843be897414418c24fe472fa6e082b6 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 22 Aug 2019 00:00:02 +0000 Subject: [PATCH 14/19] EDAC/amd64: Support asymmetric dual-rank DIMMs Future AMD systems will support asymmetric dual-rank DIMMs. These are DIMMs where the ranks are of different sizes. The even rank will use the Primary Even Chip Select registers and the odd rank will use the Secondary Odd Chip Select registers. Recognize if a Secondary Odd Chip Select is being used. Use the Secondary Odd Address Mask when calculating the chip select size. [ bp: move csrow_sec_enabled() to the header, fix CS_ODD define and tone-down the capitalized words spelling. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Tony Luck Link: https://lkml.kernel.org/r/20190821235938.118710-8-Yazen.Ghannam@amd.com --- drivers/edac/amd64_edac.c | 16 +++++++++++++--- drivers/edac/amd64_edac.h | 3 ++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 23251bba8eb6..18ba9c898389 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -790,9 +790,11 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) #define CS_EVEN_PRIMARY BIT(0) #define CS_ODD_PRIMARY BIT(1) +#define CS_EVEN_SECONDARY BIT(2) +#define CS_ODD_SECONDARY BIT(3) -#define CS_EVEN CS_EVEN_PRIMARY -#define CS_ODD CS_ODD_PRIMARY +#define CS_EVEN (CS_EVEN_PRIMARY | CS_EVEN_SECONDARY) +#define CS_ODD (CS_ODD_PRIMARY | CS_ODD_SECONDARY) static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) { @@ -804,6 +806,10 @@ static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) if (csrow_enabled(2 * dimm + 1, ctrl, pvt)) cs_mode |= CS_ODD_PRIMARY; + /* Asymmetric dual-rank DIMM support. */ + if (csrow_sec_enabled(2 * dimm + 1, ctrl, pvt)) + cs_mode |= CS_ODD_SECONDARY; + return cs_mode; } @@ -1600,7 +1606,11 @@ static int f17_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, */ dimm = csrow_nr >> 1; - addr_mask_orig = pvt->csels[umc].csmasks[dimm]; + /* Asymmetric dual-rank DIMM support. */ + if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY)) + addr_mask_orig = pvt->csels[umc].csmasks_sec[dimm]; + else + addr_mask_orig = pvt->csels[umc].csmasks[dimm]; /* * The number of zero bits in the mask is equal to the number of bits diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 68f12de6e654..8addc4d95577 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -169,7 +169,8 @@ #define DCSM0 0x60 #define DCSM1 0x160 -#define csrow_enabled(i, dct, pvt) ((pvt)->csels[(dct)].csbases[(i)] & DCSB_CS_ENABLE) +#define csrow_enabled(i, dct, pvt) ((pvt)->csels[(dct)].csbases[(i)] & DCSB_CS_ENABLE) +#define csrow_sec_enabled(i, dct, pvt) ((pvt)->csels[(dct)].csbases_sec[(i)] & DCSB_CS_ENABLE) #define DRAM_CONTROL 0x78 From d55c79ac86f78fce3c224bda2b383edf96bb6438 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Sep 2019 12:33:41 +0000 Subject: [PATCH 15/19] EDAC: Prefer 'unsigned int' to bare use of 'unsigned' Use of 'unsigned int' instead of bare use of 'unsigned'. Fix this for edac_mc*, ghes and the i5100 driver as reported by checkpatch.pl. While at it, struct member dev_ch_attribute->channel is always used as unsigned int. Change type to unsigned int to avoid type casts. [ bp: Massage. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20190902123216.9809-2-rrichter@marvell.com --- drivers/edac/edac_mc.c | 20 ++++++++++---------- drivers/edac/edac_mc.h | 6 +++--- drivers/edac/edac_mc_sysfs.c | 8 ++++---- drivers/edac/ghes_edac.c | 2 +- drivers/edac/i5100_edac.c | 16 +++++++++------- include/linux/edac.h | 10 +++++----- 6 files changed, 32 insertions(+), 30 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index c68f62ab54b0..e6fd079783bd 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -114,8 +114,8 @@ static const struct kernel_param_ops edac_report_ops = { module_param_cb(edac_report, &edac_report_ops, &edac_report, 0644); -unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf, - unsigned len) +unsigned int edac_dimm_info_location(struct dimm_info *dimm, char *buf, + unsigned int len) { struct mem_ctl_info *mci = dimm->mci; int i, n, count = 0; @@ -236,9 +236,9 @@ EXPORT_SYMBOL_GPL(edac_mem_types); * At return, the pointer 'p' will be incremented to be used on a next call * to this function. */ -void *edac_align_ptr(void **p, unsigned size, int n_elems) +void *edac_align_ptr(void **p, unsigned int size, int n_elems) { - unsigned align, r; + unsigned int align, r; void *ptr = *p; *p += size * n_elems; @@ -302,10 +302,10 @@ static void _edac_mc_free(struct mem_ctl_info *mci) kfree(mci); } -struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, - unsigned n_layers, +struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, + unsigned int n_layers, struct edac_mc_layer *layers, - unsigned sz_pvt) + unsigned int sz_pvt) { struct mem_ctl_info *mci; struct edac_mc_layer *layer; @@ -313,9 +313,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, struct rank_info *chan; struct dimm_info *dimm; u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; - unsigned pos[EDAC_MAX_LAYERS]; - unsigned size, tot_dimms = 1, count = 1; - unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0; + unsigned int pos[EDAC_MAX_LAYERS]; + unsigned int size, tot_dimms = 1, count = 1; + unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0; void *pvt, *p, *ptr = NULL; int i, j, row, chn, n, len, off; bool per_rank = false; diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h index 4165e15995ad..02aac5c61d00 100644 --- a/drivers/edac/edac_mc.h +++ b/drivers/edac/edac_mc.h @@ -122,10 +122,10 @@ do { \ * On success, return a pointer to struct mem_ctl_info pointer; * %NULL otherwise */ -struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, - unsigned n_layers, +struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, + unsigned int n_layers, struct edac_mc_layer *layers, - unsigned sz_pvt); + unsigned int sz_pvt); /** * edac_get_owner - Return the owner's mod_name of EDAC MC diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 4386ea4b9b5a..4eb8c5ceb973 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -131,7 +131,7 @@ static const char * const edac_caps[] = { struct dev_ch_attribute { struct device_attribute attr; - int channel; + unsigned int channel; }; #define DEVICE_CHANNEL(_name, _mode, _show, _store, _var) \ @@ -200,7 +200,7 @@ static ssize_t channel_dimm_label_show(struct device *dev, char *data) { struct csrow_info *csrow = to_csrow(dev); - unsigned chan = to_channel(mattr); + unsigned int chan = to_channel(mattr); struct rank_info *rank = csrow->channels[chan]; /* if field has not been initialized, there is nothing to send */ @@ -216,7 +216,7 @@ static ssize_t channel_dimm_label_store(struct device *dev, const char *data, size_t count) { struct csrow_info *csrow = to_csrow(dev); - unsigned chan = to_channel(mattr); + unsigned int chan = to_channel(mattr); struct rank_info *rank = csrow->channels[chan]; size_t copy_count = count; @@ -240,7 +240,7 @@ static ssize_t channel_ce_count_show(struct device *dev, struct device_attribute *mattr, char *data) { struct csrow_info *csrow = to_csrow(dev); - unsigned chan = to_channel(mattr); + unsigned int chan = to_channel(mattr); struct rank_info *rank = csrow->channels[chan]; return sprintf(data, "%u\n", rank->ce_count); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 7f19f1c672c3..d413a0bdc9ad 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -68,7 +68,7 @@ struct memdev_dmi_entry { struct ghes_edac_dimm_fill { struct mem_ctl_info *mci; - unsigned count; + unsigned int count; }; static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index b506eef6b146..251f2b692785 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -417,7 +417,8 @@ static const char *i5100_err_msg(unsigned err) } /* convert csrow index into a rank (per channel -- 0..5) */ -static int i5100_csrow_to_rank(const struct mem_ctl_info *mci, int csrow) +static unsigned int i5100_csrow_to_rank(const struct mem_ctl_info *mci, + unsigned int csrow) { const struct i5100_priv *priv = mci->pvt_info; @@ -425,7 +426,8 @@ static int i5100_csrow_to_rank(const struct mem_ctl_info *mci, int csrow) } /* convert csrow index into a channel (0..1) */ -static int i5100_csrow_to_chan(const struct mem_ctl_info *mci, int csrow) +static unsigned int i5100_csrow_to_chan(const struct mem_ctl_info *mci, + unsigned int csrow) { const struct i5100_priv *priv = mci->pvt_info; @@ -653,11 +655,11 @@ static struct pci_dev *pci_get_device_func(unsigned vendor, return ret; } -static unsigned long i5100_npages(struct mem_ctl_info *mci, int csrow) +static unsigned long i5100_npages(struct mem_ctl_info *mci, unsigned int csrow) { struct i5100_priv *priv = mci->pvt_info; - const unsigned chan_rank = i5100_csrow_to_rank(mci, csrow); - const unsigned chan = i5100_csrow_to_chan(mci, csrow); + const unsigned int chan_rank = i5100_csrow_to_rank(mci, csrow); + const unsigned int chan = i5100_csrow_to_chan(mci, csrow); unsigned addr_lines; /* dimm present? */ @@ -852,8 +854,8 @@ static void i5100_init_csrows(struct mem_ctl_info *mci) for (i = 0; i < mci->tot_dimms; i++) { struct dimm_info *dimm; const unsigned long npages = i5100_npages(mci, i); - const unsigned chan = i5100_csrow_to_chan(mci, i); - const unsigned rank = i5100_csrow_to_rank(mci, i); + const unsigned int chan = i5100_csrow_to_chan(mci, i); + const unsigned int rank = i5100_csrow_to_rank(mci, i); if (!npages) continue; diff --git a/include/linux/edac.h b/include/linux/edac.h index 342dabda9c7e..c19483b90079 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -440,7 +440,7 @@ struct dimm_info { char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */ /* Memory location data */ - unsigned location[EDAC_MAX_LAYERS]; + unsigned int location[EDAC_MAX_LAYERS]; struct mem_ctl_info *mci; /* the parent */ @@ -451,7 +451,7 @@ struct dimm_info { u32 nr_pages; /* number of pages on this dimm */ - unsigned csrow, cschannel; /* Points to the old API data */ + unsigned int csrow, cschannel; /* Points to the old API data */ u16 smbios_handle; /* Handle for SMBIOS type 17 */ }; @@ -597,7 +597,7 @@ struct mem_ctl_info { unsigned long page); int mc_idx; struct csrow_info **csrows; - unsigned nr_csrows, num_cschannel; + unsigned int nr_csrows, num_cschannel; /* * Memory Controller hierarchy @@ -608,14 +608,14 @@ struct mem_ctl_info { * of the recent drivers enumerate memories per DIMM, instead. * When the memory controller is per rank, csbased is true. */ - unsigned n_layers; + unsigned int n_layers; struct edac_mc_layer *layers; bool csbased; /* * DIMM info. Will eventually remove the entire csrows_info some day */ - unsigned tot_dimms; + unsigned int tot_dimms; struct dimm_info **dimms; /* From 644110e17d26e51f9f33830753cc899500d5d384 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Sep 2019 12:33:45 +0000 Subject: [PATCH 16/19] EDAC/mc_sysfs: Remove pointless gotos Use direct returns instead of gotos. Error handling code becomes smaller and better readable. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20190902123216.9809-4-rrichter@marvell.com --- drivers/edac/edac_mc_sysfs.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 4eb8c5ceb973..309fc24339b0 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -938,7 +938,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, if (err < 0) { edac_dbg(1, "failure: create device %s\n", dev_name(&mci->dev)); put_device(&mci->dev); - goto out; + return err; } /* @@ -987,7 +987,6 @@ fail_unregister_dimm: } device_unregister(&mci->dev); -out: return err; } @@ -1044,10 +1043,8 @@ int __init edac_mc_sysfs_init(void) int err; mci_pdev = kzalloc(sizeof(*mci_pdev), GFP_KERNEL); - if (!mci_pdev) { - err = -ENOMEM; - goto out; - } + if (!mci_pdev) + return -ENOMEM; mci_pdev->bus = edac_get_sysfs_subsys(); mci_pdev->type = &mc_attr_type; @@ -1055,17 +1052,14 @@ int __init edac_mc_sysfs_init(void) dev_set_name(mci_pdev, "mc"); err = device_add(mci_pdev); - if (err < 0) - goto out_put_device; + if (err < 0) { + put_device(mci_pdev); + return err; + } edac_dbg(0, "device %s created\n", dev_name(mci_pdev)); return 0; - - out_put_device: - put_device(mci_pdev); - out: - return err; } void edac_mc_sysfs_exit(void) From e701f412030ec3783f1c30c7741492693d6213e3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Sep 2019 12:33:47 +0000 Subject: [PATCH 17/19] EDAC/mc_sysfs: Make debug messages consistent Debug messages are inconsistently used in the error handlers. Some lack an error message, some are called regardless of the return status, messages for the same error are at different locations in the code depending on the error code. This happens esp. near put_device() calls. Make those debug messages more consistent. Additionally, unify the error messages to have the same terms for the same operations of the device. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20190902123216.9809-5-rrichter@marvell.com --- drivers/edac/edac_mc_sysfs.c | 64 +++++++++++++++++------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 309fc24339b0..32d016f1ecd1 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -278,7 +278,7 @@ static void csrow_attr_release(struct device *dev) { struct csrow_info *csrow = container_of(dev, struct csrow_info, dev); - edac_dbg(1, "Releasing csrow device %s\n", dev_name(dev)); + edac_dbg(1, "device %s released\n", dev_name(dev)); kfree(csrow); } @@ -414,14 +414,16 @@ static int edac_create_csrow_object(struct mem_ctl_info *mci, dev_set_name(&csrow->dev, "csrow%d", index); dev_set_drvdata(&csrow->dev, csrow); - edac_dbg(0, "creating (virtual) csrow node %s\n", - dev_name(&csrow->dev)); - err = device_add(&csrow->dev); - if (err) + if (err) { + edac_dbg(1, "failure: create device %s\n", dev_name(&csrow->dev)); put_device(&csrow->dev); + return err; + } - return err; + edac_dbg(0, "device %s created\n", dev_name(&csrow->dev)); + + return 0; } /* Create a CSROW object under specifed edac_mc_device */ @@ -435,12 +437,8 @@ static int edac_create_csrow_objects(struct mem_ctl_info *mci) if (!nr_pages_per_csrow(csrow)) continue; err = edac_create_csrow_object(mci, mci->csrows[i], i); - if (err < 0) { - edac_dbg(1, - "failure: create csrow objects for csrow %d\n", - i); + if (err < 0) goto error; - } } return 0; @@ -624,7 +622,7 @@ static void dimm_attr_release(struct device *dev) { struct dimm_info *dimm = container_of(dev, struct dimm_info, dev); - edac_dbg(1, "Releasing dimm device %s\n", dev_name(dev)); + edac_dbg(1, "device %s released\n", dev_name(dev)); kfree(dimm); } @@ -653,12 +651,21 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci, pm_runtime_forbid(&mci->dev); err = device_add(&dimm->dev); - if (err) + if (err) { + edac_dbg(1, "failure: create device %s\n", dev_name(&dimm->dev)); put_device(&dimm->dev); + return err; + } - edac_dbg(0, "created rank/dimm device %s\n", dev_name(&dimm->dev)); + if (IS_ENABLED(CONFIG_EDAC_DEBUG)) { + char location[80]; - return err; + edac_dimm_info_location(dimm, location, sizeof(location)); + edac_dbg(0, "device %s created at location %s\n", + dev_name(&dimm->dev), location); + } + + return 0; } /* @@ -901,7 +908,7 @@ static void mci_attr_release(struct device *dev) { struct mem_ctl_info *mci = container_of(dev, struct mem_ctl_info, dev); - edac_dbg(1, "Releasing csrow device %s\n", dev_name(dev)); + edac_dbg(1, "device %s released\n", dev_name(dev)); kfree(mci); } @@ -933,7 +940,6 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, dev_set_drvdata(&mci->dev, mci); pm_runtime_forbid(&mci->dev); - edac_dbg(0, "creating device %s\n", dev_name(&mci->dev)); err = device_add(&mci->dev); if (err < 0) { edac_dbg(1, "failure: create device %s\n", dev_name(&mci->dev)); @@ -941,6 +947,8 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, return err; } + edac_dbg(0, "device %s created\n", dev_name(&mci->dev)); + /* * Create the dimm/rank devices */ @@ -950,22 +958,9 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, if (!dimm->nr_pages) continue; -#ifdef CONFIG_EDAC_DEBUG - edac_dbg(1, "creating dimm%d, located at ", i); - if (edac_debug_level >= 1) { - int lay; - for (lay = 0; lay < mci->n_layers; lay++) - printk(KERN_CONT "%s %d ", - edac_layer_name[mci->layers[lay].type], - dimm->location[lay]); - printk(KERN_CONT "\n"); - } -#endif err = edac_create_dimm_object(mci, dimm, i); - if (err) { - edac_dbg(1, "failure: create dimm %d obj\n", i); + if (err) goto fail_unregister_dimm; - } } #ifdef CONFIG_EDAC_LEGACY_SYSFS @@ -1010,14 +1005,14 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) struct dimm_info *dimm = mci->dimms[i]; if (dimm->nr_pages == 0) continue; - edac_dbg(0, "removing device %s\n", dev_name(&dimm->dev)); + edac_dbg(1, "unregistering device %s\n", dev_name(&dimm->dev)); device_unregister(&dimm->dev); } } void edac_unregister_sysfs(struct mem_ctl_info *mci) { - edac_dbg(1, "Unregistering device %s\n", dev_name(&mci->dev)); + edac_dbg(1, "unregistering device %s\n", dev_name(&mci->dev)); device_unregister(&mci->dev); } @@ -1028,7 +1023,7 @@ static void mc_attr_release(struct device *dev) * parent device, used to create the /sys/devices/mc sysfs node. * So, there are no attributes on it. */ - edac_dbg(1, "Releasing device %s\n", dev_name(dev)); + edac_dbg(1, "device %s released\n", dev_name(dev)); kfree(dev); } @@ -1053,6 +1048,7 @@ int __init edac_mc_sysfs_init(void) err = device_add(mci_pdev); if (err < 0) { + edac_dbg(1, "failure: create device %s\n", dev_name(mci_pdev)); put_device(mci_pdev); return err; } From 116085e589006f5b608be8e45a0ad390aba92839 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Sep 2019 12:33:49 +0000 Subject: [PATCH 18/19] MAINTAINERS: Add Robert as a EDAC reviewer I did some significant work with code in edac_mc.c and ghes_edac.c already, so I guess I can probably help out a bit as code reviewer here. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20190902123216.9809-6-rrichter@marvell.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 97912d8333a9..79d2ae8519e3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5788,6 +5788,7 @@ M: Borislav Petkov M: Mauro Carvalho Chehab M: Tony Luck R: James Morse +R: Robert Richter L: linux-edac@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next S: Supported From 3e443eb353eda6f4b4796e07f2599683fa752f1d Mon Sep 17 00:00:00 2001 From: Isaac Vaughn Date: Fri, 6 Sep 2019 23:21:38 +0000 Subject: [PATCH 19/19] EDAC/amd64: Add PCI device IDs for family 17h, model 70h Add the new Family 17h Model 70h PCI IDs (device 18h functions 0 and 6) to the AMD64 EDAC module. [ bp: s/f17_base_addr_to_cs_size/f17_addr_mask_to_cs_size/g ] Signed-off-by: Isaac Vaughn Signed-off-by: Borislav Petkov Cc: James Morse Cc: linux-edac@vger.kernel.org Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Tony Luck Link: https://lkml.kernel.org/r/20190906192131.8ced0ca112146f32d82b6cae@knights.ucf.edu --- drivers/edac/amd64_edac.c | 13 +++++++++++++ drivers/edac/amd64_edac.h | 3 +++ 2 files changed, 16 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 18ba9c898389..c1d4536ae466 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2317,6 +2317,15 @@ static struct amd64_family_type family_types[] = { .dbam_to_cs = f17_addr_mask_to_cs_size, } }, + [F17_M70H_CPUS] = { + .ctl_name = "F17h_M70h", + .f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0, + .f6_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F6, + .ops = { + .early_channel_count = f17_early_channel_count, + .dbam_to_cs = f17_addr_mask_to_cs_size, + } + }, }; /* @@ -3355,6 +3364,10 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) fam_type = &family_types[F17_M30H_CPUS]; pvt->ops = &family_types[F17_M30H_CPUS].ops; break; + } else if (pvt->model >= 0x70 && pvt->model <= 0x7f) { + fam_type = &family_types[F17_M70H_CPUS]; + pvt->ops = &family_types[F17_M70H_CPUS].ops; + break; } /* fall through */ case 0x18: diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 8addc4d95577..8c3cda81e619 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -120,6 +120,8 @@ #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490 #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F6 0x1496 +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440 +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F6 0x1446 /* * Function 1 - Address Map @@ -289,6 +291,7 @@ enum amd_families { F17_CPUS, F17_M10H_CPUS, F17_M30H_CPUS, + F17_M70H_CPUS, NUM_FAMILIES, };