From bea1bfd5b7226ac7c3f93f76be89221f33222a34 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 12 Feb 2020 13:03:40 +0100 Subject: [PATCH 01/16] EDAC/mc: Change mci device removal to use put_device() There are dimm and csrow devices linked to the mci device esp. to show up in sysfs. It must be granted that children devices are removed before its mci parent. Thus, the release functions must be called in the correct order and may not miss any child before releasing its parent. In the current implementation this is only granted by the correct order of release functions. A much better approach is to use put_device() that releases the device only after all users are gone. It is the recommended way to release a device and free its memory. The function uses the device's refcount and only frees it if there are no users of it anymore such as children. So implement a mci_release() function to remove mci devices, use put_device() to free them and early initialize the mci device right after its struct has been allocated. Change the release function so that it can be universally used no matter if the device is registered or not. Since subsequent dimm and csrow sysfs links are implemented as children devices, their refcounts will keep the parent mci device from being removed as long as sysfs entries exist and until all users have been unregistered in edac_remove_sysfs_mci_device(). Remove edac_unregister_sysfs() and merge mci sysfs removal into edac_remove_sysfs_mci_device(). There is only a single instance now that removes the sysfs entries. The function can now be used in the error paths for cleanup. Also, create device release functions for all involved devices (dev->release), remove device_type release functions (dev_type-> release) and also use dev->init_name instead of dev_set_name(). [ bp: Massage commit message and comments. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200212120340.4764-5-rrichter@marvell.com --- drivers/edac/edac_mc.c | 12 +++-- drivers/edac/edac_mc_sysfs.c | 90 +++++++++++++++--------------------- drivers/edac/edac_module.h | 1 - 3 files changed, 47 insertions(+), 56 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 69e0d90460e6..64785e644482 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -278,6 +278,12 @@ void *edac_align_ptr(void **p, unsigned int size, int n_elems) static void _edac_mc_free(struct mem_ctl_info *mci) { + put_device(&mci->dev); +} + +static void mci_release(struct device *dev) +{ + struct mem_ctl_info *mci = container_of(dev, struct mem_ctl_info, dev); struct csrow_info *csr; int i, chn, row; @@ -371,6 +377,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, if (mci == NULL) return NULL; + mci->dev.release = mci_release; + device_initialize(&mci->dev); + /* Adjust pointers so they point within the memory we just allocated * rather than an imaginary chunk of memory located at address 0. */ @@ -505,9 +514,6 @@ void edac_mc_free(struct mem_ctl_info *mci) { edac_dbg(1, "\n"); - if (device_is_registered(&mci->dev)) - edac_unregister_sysfs(mci); - _edac_mc_free(mci); } EXPORT_SYMBOL_GPL(edac_mc_free); diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index c70ec0a306d8..11e1b436f116 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -274,14 +274,8 @@ static const struct attribute_group *csrow_attr_groups[] = { NULL }; -static void csrow_attr_release(struct device *dev) -{ - /* release device with _edac_mc_free() */ -} - static const struct device_type csrow_attr_type = { .groups = csrow_attr_groups, - .release = csrow_attr_release, }; /* @@ -387,6 +381,14 @@ static const struct attribute_group *csrow_dev_groups[] = { NULL }; +static void csrow_release(struct device *dev) +{ + /* + * Nothing to do, just unregister sysfs here. The mci + * device owns the data and will also release it. + */ +} + static inline int nr_pages_per_csrow(struct csrow_info *csrow) { int chan, nr_pages = 0; @@ -405,6 +407,7 @@ static int edac_create_csrow_object(struct mem_ctl_info *mci, csrow->dev.type = &csrow_attr_type; csrow->dev.groups = csrow_dev_groups; + csrow->dev.release = csrow_release; device_initialize(&csrow->dev); csrow->dev.parent = &mci->dev; csrow->mci = mci; @@ -441,10 +444,8 @@ static int edac_create_csrow_objects(struct mem_ctl_info *mci) error: for (--i; i >= 0; i--) { - csrow = mci->csrows[i]; - if (!nr_pages_per_csrow(csrow)) - continue; - device_unregister(&mci->csrows[i]->dev); + if (device_is_registered(&mci->csrows[i]->dev)) + device_unregister(&mci->csrows[i]->dev); } return err; @@ -453,15 +454,13 @@ error: static void edac_delete_csrow_objects(struct mem_ctl_info *mci) { int i; - struct csrow_info *csrow; - for (i = mci->nr_csrows - 1; i >= 0; i--) { - csrow = mci->csrows[i]; - if (!nr_pages_per_csrow(csrow)) - continue; - device_unregister(&mci->csrows[i]->dev); + for (i = 0; i < mci->nr_csrows; i++) { + if (device_is_registered(&mci->csrows[i]->dev)) + device_unregister(&mci->csrows[i]->dev); } } + #endif /* @@ -602,16 +601,18 @@ static const struct attribute_group *dimm_attr_groups[] = { NULL }; -static void dimm_attr_release(struct device *dev) -{ - /* release device with _edac_mc_free() */ -} - static const struct device_type dimm_attr_type = { .groups = dimm_attr_groups, - .release = dimm_attr_release, }; +static void dimm_release(struct device *dev) +{ + /* + * Nothing to do, just unregister sysfs here. The mci + * device owns the data and will also release it. + */ +} + /* Create a DIMM object under specifed memory controller device */ static int edac_create_dimm_object(struct mem_ctl_info *mci, struct dimm_info *dimm) @@ -620,6 +621,7 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci, dimm->mci = mci; dimm->dev.type = &dimm_attr_type; + dimm->dev.release = dimm_release; device_initialize(&dimm->dev); dimm->dev.parent = &mci->dev; @@ -884,14 +886,8 @@ static const struct attribute_group *mci_attr_groups[] = { NULL }; -static void mci_attr_release(struct device *dev) -{ - /* release device with _edac_mc_free() */ -} - static const struct device_type mci_attr_type = { .groups = mci_attr_groups, - .release = mci_attr_release, }; /* @@ -910,8 +906,6 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, /* get the /sys/devices/system/edac subsys reference */ mci->dev.type = &mci_attr_type; - device_initialize(&mci->dev); - mci->dev.parent = mci_pdev; mci->dev.groups = groups; dev_set_name(&mci->dev, "mc%d", mci->mc_idx); @@ -921,7 +915,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, err = device_add(&mci->dev); if (err < 0) { edac_dbg(1, "failure: create device %s\n", dev_name(&mci->dev)); - put_device(&mci->dev); + /* no put_device() here, free mci with _edac_mc_free() */ return err; } @@ -937,24 +931,20 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, err = edac_create_dimm_object(mci, dimm); if (err) - goto fail_unregister_dimm; + goto fail; } #ifdef CONFIG_EDAC_LEGACY_SYSFS err = edac_create_csrow_objects(mci); if (err < 0) - goto fail_unregister_dimm; + goto fail; #endif edac_create_debugfs_nodes(mci); return 0; -fail_unregister_dimm: - mci_for_each_dimm(mci, dimm) { - if (device_is_registered(&dimm->dev)) - device_unregister(&dimm->dev); - } - device_unregister(&mci->dev); +fail: + edac_remove_sysfs_mci_device(mci); return err; } @@ -966,6 +956,9 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) { struct dimm_info *dimm; + if (!device_is_registered(&mci->dev)) + return; + edac_dbg(0, "\n"); #ifdef CONFIG_EDAC_DEBUG @@ -976,17 +969,14 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) #endif mci_for_each_dimm(mci, dimm) { - if (dimm->nr_pages == 0) + if (!device_is_registered(&dimm->dev)) continue; edac_dbg(1, "unregistering device %s\n", dev_name(&dimm->dev)); device_unregister(&dimm->dev); } -} -void edac_unregister_sysfs(struct mem_ctl_info *mci) -{ - edac_dbg(1, "unregistering device %s\n", dev_name(&mci->dev)); - device_unregister(&mci->dev); + /* only remove the device, but keep mci */ + device_del(&mci->dev); } static void mc_attr_release(struct device *dev) @@ -1000,9 +990,6 @@ static void mc_attr_release(struct device *dev) kfree(dev); } -static const struct device_type mc_attr_type = { - .release = mc_attr_release, -}; /* * Init/exit code for the module. Basically, creates/removes /sys/class/rc */ @@ -1015,11 +1002,10 @@ int __init edac_mc_sysfs_init(void) return -ENOMEM; mci_pdev->bus = edac_get_sysfs_subsys(); - mci_pdev->type = &mc_attr_type; - device_initialize(mci_pdev); - dev_set_name(mci_pdev, "mc"); + mci_pdev->release = mc_attr_release; + mci_pdev->init_name = "mc"; - err = device_add(mci_pdev); + err = device_register(mci_pdev); if (err < 0) { edac_dbg(1, "failure: create device %s\n", dev_name(mci_pdev)); put_device(mci_pdev); diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index 388427d378b1..aa1f91688eb8 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -28,7 +28,6 @@ void edac_mc_sysfs_exit(void); extern int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, const struct attribute_group **groups); extern void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci); -void edac_unregister_sysfs(struct mem_ctl_info *mci); extern int edac_get_log_ue(void); extern int edac_get_log_ce(void); extern int edac_get_panic_on_ue(void); From aad28c6f6b092b7382ee8dfb455772f7018a4cd1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Jan 2020 09:02:49 +0000 Subject: [PATCH 02/16] EDAC/mc: Split edac_mc_alloc() into smaller functions edac_mc_alloc() is huge. Factor out code by moving it to the two new functions edac_mc_alloc_csrows() and edac_mc_alloc_dimms(). Do not move code yet for better review. [ bp: sort local args in reversed fir tree order. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-2-rrichter@marvell.com --- drivers/edac/edac_mc.c | 237 +++++++++++++++++++++++------------------ 1 file changed, 136 insertions(+), 101 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 64785e644482..71777b4798a4 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -311,6 +311,9 @@ static void mci_release(struct device *dev) kfree(mci); } +static int edac_mc_alloc_csrows(struct mem_ctl_info *mci); +static int edac_mc_alloc_dimms(struct mem_ctl_info *mci); + struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, unsigned int n_layers, struct edac_mc_layer *layers, @@ -318,15 +321,11 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, { struct mem_ctl_info *mci; struct edac_mc_layer *layer; - struct csrow_info *csr; - struct rank_info *chan; - struct dimm_info *dimm; u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; - unsigned int pos[EDAC_MAX_LAYERS]; unsigned int idx, size, tot_dimms = 1, count = 1; unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0; - void *pvt, *p, *ptr = NULL; - int i, j, row, chn, n, len; + void *pvt, *ptr = NULL; + int i; bool per_rank = false; if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0)) @@ -401,103 +400,11 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, mci->num_cschannel = tot_channels; mci->csbased = per_rank; - /* - * Alocate and fill the csrow/channels structs - */ - mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL); - if (!mci->csrows) - goto error; - for (row = 0; row < tot_csrows; row++) { - csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL); - if (!csr) - goto error; - mci->csrows[row] = csr; - csr->csrow_idx = row; - csr->mci = mci; - csr->nr_channels = tot_channels; - csr->channels = kcalloc(tot_channels, sizeof(*csr->channels), - GFP_KERNEL); - if (!csr->channels) - goto error; - - for (chn = 0; chn < tot_channels; chn++) { - chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL); - if (!chan) - goto error; - csr->channels[chn] = chan; - chan->chan_idx = chn; - chan->csrow = csr; - } - } - - /* - * Allocate and fill the dimm structs - */ - mci->dimms = kcalloc(tot_dimms, sizeof(*mci->dimms), GFP_KERNEL); - if (!mci->dimms) + if (edac_mc_alloc_csrows(mci)) goto error; - memset(&pos, 0, sizeof(pos)); - row = 0; - chn = 0; - for (idx = 0; idx < tot_dimms; idx++) { - chan = mci->csrows[row]->channels[chn]; - - dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL); - if (!dimm) - goto error; - mci->dimms[idx] = dimm; - dimm->mci = mci; - dimm->idx = idx; - - /* - * Copy DIMM location and initialize it. - */ - len = sizeof(dimm->label); - p = dimm->label; - n = snprintf(p, len, "mc#%u", mc_num); - p += n; - len -= n; - for (j = 0; j < n_layers; j++) { - n = snprintf(p, len, "%s#%u", - edac_layer_name[layers[j].type], - pos[j]); - p += n; - len -= n; - dimm->location[j] = pos[j]; - - if (len <= 0) - break; - } - - /* Link it to the csrows old API data */ - chan->dimm = dimm; - dimm->csrow = row; - dimm->cschannel = chn; - - /* Increment csrow location */ - if (layers[0].is_virt_csrow) { - chn++; - if (chn == tot_channels) { - chn = 0; - row++; - } - } else { - row++; - if (row == tot_csrows) { - row = 0; - chn++; - } - } - - /* Increment dimm location */ - for (j = n_layers - 1; j >= 0; j--) { - pos[j]++; - if (pos[j] < layers[j].size) - break; - pos[j] = 0; - } - } + if (edac_mc_alloc_dimms(mci)) + goto error; mci->op_state = OP_ALLOC; @@ -510,6 +417,134 @@ error: } EXPORT_SYMBOL_GPL(edac_mc_alloc); +static int edac_mc_alloc_csrows(struct mem_ctl_info *mci) +{ + unsigned int tot_channels = mci->num_cschannel; + unsigned int tot_csrows = mci->nr_csrows; + unsigned int row, chn; + + /* + * Alocate and fill the csrow/channels structs + */ + mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL); + if (!mci->csrows) + return -ENOMEM; + + for (row = 0; row < tot_csrows; row++) { + struct csrow_info *csr; + + csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL); + if (!csr) + return -ENOMEM; + + mci->csrows[row] = csr; + csr->csrow_idx = row; + csr->mci = mci; + csr->nr_channels = tot_channels; + csr->channels = kcalloc(tot_channels, sizeof(*csr->channels), + GFP_KERNEL); + if (!csr->channels) + return -ENOMEM; + + for (chn = 0; chn < tot_channels; chn++) { + struct rank_info *chan; + + chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL); + if (!chan) + return -ENOMEM; + + csr->channels[chn] = chan; + chan->chan_idx = chn; + chan->csrow = csr; + } + } + + return 0; +} + +static int edac_mc_alloc_dimms(struct mem_ctl_info *mci) +{ + unsigned int pos[EDAC_MAX_LAYERS]; + unsigned int row, chn, idx; + int layer; + void *p; + + /* + * Allocate and fill the dimm structs + */ + mci->dimms = kcalloc(mci->tot_dimms, sizeof(*mci->dimms), GFP_KERNEL); + if (!mci->dimms) + return -ENOMEM; + + memset(&pos, 0, sizeof(pos)); + row = 0; + chn = 0; + for (idx = 0; idx < mci->tot_dimms; idx++) { + struct dimm_info *dimm; + struct rank_info *chan; + int n, len; + + chan = mci->csrows[row]->channels[chn]; + + dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL); + if (!dimm) + return -ENOMEM; + mci->dimms[idx] = dimm; + dimm->mci = mci; + dimm->idx = idx; + + /* + * Copy DIMM location and initialize it. + */ + len = sizeof(dimm->label); + p = dimm->label; + n = snprintf(p, len, "mc#%u", mci->mc_idx); + p += n; + len -= n; + for (layer = 0; layer < mci->n_layers; layer++) { + n = snprintf(p, len, "%s#%u", + edac_layer_name[mci->layers[layer].type], + pos[layer]); + p += n; + len -= n; + dimm->location[layer] = pos[layer]; + + if (len <= 0) + break; + } + + /* Link it to the csrows old API data */ + chan->dimm = dimm; + dimm->csrow = row; + dimm->cschannel = chn; + + /* Increment csrow location */ + if (mci->layers[0].is_virt_csrow) { + chn++; + if (chn == mci->num_cschannel) { + chn = 0; + row++; + } + } else { + row++; + if (row == mci->nr_csrows) { + row = 0; + chn++; + } + } + + /* Increment dimm location */ + for (layer = mci->n_layers - 1; layer >= 0; layer--) { + pos[layer]++; + if (pos[layer] < mci->layers[layer].size) + break; + pos[layer] = 0; + } + } + + return 0; +} + void edac_mc_free(struct mem_ctl_info *mci) { edac_dbg(1, "\n"); From 1f27c79062385af38c7c075f0fc174f98164ab2e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Jan 2020 09:02:52 +0000 Subject: [PATCH 03/16] EDAC/mc: Reorder functions edac_mc_alloc*() Reorder the new created functions edac_mc_alloc_csrows() and edac_mc_alloc_dimms() and move them before edac_mc_alloc(). No further code changes. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-3-rrichter@marvell.com --- drivers/edac/edac_mc.c | 209 ++++++++++++++++++++--------------------- 1 file changed, 103 insertions(+), 106 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 71777b4798a4..25130789e996 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -311,112 +311,6 @@ static void mci_release(struct device *dev) kfree(mci); } -static int edac_mc_alloc_csrows(struct mem_ctl_info *mci); -static int edac_mc_alloc_dimms(struct mem_ctl_info *mci); - -struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, - unsigned int n_layers, - struct edac_mc_layer *layers, - unsigned int sz_pvt) -{ - struct mem_ctl_info *mci; - struct edac_mc_layer *layer; - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; - unsigned int idx, size, tot_dimms = 1, count = 1; - unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0; - void *pvt, *ptr = NULL; - int i; - bool per_rank = false; - - if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0)) - return NULL; - - /* - * Calculate the total amount of dimms and csrows/cschannels while - * in the old API emulation mode - */ - for (idx = 0; idx < n_layers; idx++) { - tot_dimms *= layers[idx].size; - - if (layers[idx].is_virt_csrow) - tot_csrows *= layers[idx].size; - else - tot_channels *= layers[idx].size; - - if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT) - per_rank = true; - } - - /* Figure out the offsets of the various items from the start of an mc - * structure. We want the alignment of each item to be at least as - * stringent as what the compiler would provide if we could simply - * hardcode everything into a single struct. - */ - mci = edac_align_ptr(&ptr, sizeof(*mci), 1); - layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers); - for (i = 0; i < n_layers; i++) { - count *= layers[i].size; - edac_dbg(4, "errcount layer %d size %d\n", i, count); - ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); - ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); - tot_errcount += 2 * count; - } - - edac_dbg(4, "allocating %d error counters\n", tot_errcount); - pvt = edac_align_ptr(&ptr, sz_pvt, 1); - size = ((unsigned long)pvt) + sz_pvt; - - edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n", - size, - tot_dimms, - per_rank ? "ranks" : "dimms", - tot_csrows * tot_channels); - - mci = kzalloc(size, GFP_KERNEL); - if (mci == NULL) - return NULL; - - mci->dev.release = mci_release; - device_initialize(&mci->dev); - - /* Adjust pointers so they point within the memory we just allocated - * rather than an imaginary chunk of memory located at address 0. - */ - layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer)); - for (i = 0; i < n_layers; i++) { - mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i])); - mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i])); - } - pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; - - /* setup index and various internal pointers */ - mci->mc_idx = mc_num; - mci->tot_dimms = tot_dimms; - mci->pvt_info = pvt; - mci->n_layers = n_layers; - mci->layers = layer; - memcpy(mci->layers, layers, sizeof(*layer) * n_layers); - mci->nr_csrows = tot_csrows; - mci->num_cschannel = tot_channels; - mci->csbased = per_rank; - - if (edac_mc_alloc_csrows(mci)) - goto error; - - if (edac_mc_alloc_dimms(mci)) - goto error; - - mci->op_state = OP_ALLOC; - - return mci; - -error: - _edac_mc_free(mci); - - return NULL; -} -EXPORT_SYMBOL_GPL(edac_mc_alloc); - static int edac_mc_alloc_csrows(struct mem_ctl_info *mci) { unsigned int tot_channels = mci->num_cschannel; @@ -545,6 +439,109 @@ static int edac_mc_alloc_dimms(struct mem_ctl_info *mci) return 0; } +struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, + unsigned int n_layers, + struct edac_mc_layer *layers, + unsigned int sz_pvt) +{ + struct mem_ctl_info *mci; + struct edac_mc_layer *layer; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; + unsigned int idx, size, tot_dimms = 1, count = 1; + unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0; + void *pvt, *ptr = NULL; + int i; + bool per_rank = false; + + if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0)) + return NULL; + + /* + * Calculate the total amount of dimms and csrows/cschannels while + * in the old API emulation mode + */ + for (idx = 0; idx < n_layers; idx++) { + tot_dimms *= layers[idx].size; + + if (layers[idx].is_virt_csrow) + tot_csrows *= layers[idx].size; + else + tot_channels *= layers[idx].size; + + if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT) + per_rank = true; + } + + /* Figure out the offsets of the various items from the start of an mc + * structure. We want the alignment of each item to be at least as + * stringent as what the compiler would provide if we could simply + * hardcode everything into a single struct. + */ + mci = edac_align_ptr(&ptr, sizeof(*mci), 1); + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers); + for (i = 0; i < n_layers; i++) { + count *= layers[i].size; + edac_dbg(4, "errcount layer %d size %d\n", i, count); + ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); + ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); + tot_errcount += 2 * count; + } + + edac_dbg(4, "allocating %d error counters\n", tot_errcount); + pvt = edac_align_ptr(&ptr, sz_pvt, 1); + size = ((unsigned long)pvt) + sz_pvt; + + edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n", + size, + tot_dimms, + per_rank ? "ranks" : "dimms", + tot_csrows * tot_channels); + + mci = kzalloc(size, GFP_KERNEL); + if (mci == NULL) + return NULL; + + mci->dev.release = mci_release; + device_initialize(&mci->dev); + + /* Adjust pointers so they point within the memory we just allocated + * rather than an imaginary chunk of memory located at address 0. + */ + layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer)); + for (i = 0; i < n_layers; i++) { + mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i])); + mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i])); + } + pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; + + /* setup index and various internal pointers */ + mci->mc_idx = mc_num; + mci->tot_dimms = tot_dimms; + mci->pvt_info = pvt; + mci->n_layers = n_layers; + mci->layers = layer; + memcpy(mci->layers, layers, sizeof(*layer) * n_layers); + mci->nr_csrows = tot_csrows; + mci->num_cschannel = tot_channels; + mci->csbased = per_rank; + + if (edac_mc_alloc_csrows(mci)) + goto error; + + if (edac_mc_alloc_dimms(mci)) + goto error; + + mci->op_state = OP_ALLOC; + + return mci; + +error: + _edac_mc_free(mci); + + return NULL; +} +EXPORT_SYMBOL_GPL(edac_mc_alloc); + void edac_mc_free(struct mem_ctl_info *mci) { edac_dbg(1, "\n"); From 672ef0e5684048a00aeb923b10131275ea688543 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Jan 2020 09:02:54 +0000 Subject: [PATCH 04/16] EDAC: Store error type in struct edac_raw_error_desc Store the error type in struct edac_raw_error_desc. This makes the type parameter of edac_raw_mc_handle_error() obsolete. [ kernel-doc typo ] Reported-by: kbuild test robot Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-4-rrichter@marvell.com --- drivers/edac/edac_mc.c | 10 +++++----- drivers/edac/edac_mc.h | 4 +--- drivers/edac/ghes_edac.c | 11 +++++------ include/linux/edac.h | 2 ++ 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 25130789e996..8d8c0997d41a 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1084,8 +1084,7 @@ static void edac_ue_error(struct mem_ctl_info *mci, edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count); } -void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type, - struct mem_ctl_info *mci, +void edac_raw_mc_handle_error(struct mem_ctl_info *mci, struct edac_raw_error_desc *e) { char detail[80]; @@ -1100,14 +1099,14 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type, /* Report the error via the trace interface */ if (IS_ENABLED(CONFIG_RAS)) - trace_mc_event(type, e->msg, e->label, e->error_count, + trace_mc_event(e->type, e->msg, e->label, e->error_count, mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, grain_bits, e->syndrome, e->other_detail); /* Memory type dependent details about the error */ - if (type == HW_EVENT_ERR_CORRECTED) { + if (e->type == HW_EVENT_ERR_CORRECTED) { snprintf(detail, sizeof(detail), "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx", e->page_frame_number, e->offset_in_page, @@ -1152,6 +1151,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, /* Fills the error report buffer */ memset(e, 0, sizeof (*e)); e->error_count = error_count; + e->type = type; e->top_layer = top_layer; e->mid_layer = mid_layer; e->low_layer = low_layer; @@ -1282,6 +1282,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, if (p > e->location) *(p - 1) = '\0'; - edac_raw_mc_handle_error(type, mci, e); + edac_raw_mc_handle_error(mci, e); } EXPORT_SYMBOL_GPL(edac_mc_handle_error); diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h index 02aac5c61d00..5d78be774f9e 100644 --- a/drivers/edac/edac_mc.h +++ b/drivers/edac/edac_mc.h @@ -212,7 +212,6 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, * edac_raw_mc_handle_error() - Reports a memory event to userspace without * doing anything to discover the error location. * - * @type: severity of the error (CE/UE/Fatal) * @mci: a struct mem_ctl_info pointer * @e: error description * @@ -220,8 +219,7 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, * only be called directly when the hardware error come directly from BIOS, * like in the case of APEI GHES driver. */ -void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type, - struct mem_ctl_info *mci, +void edac_raw_mc_handle_error(struct mem_ctl_info *mci, struct edac_raw_error_desc *e); /** diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index b99080d8a10c..7c3e5264a41e 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -201,7 +201,6 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) { - enum hw_event_mc_err_type type; struct edac_raw_error_desc *e; struct mem_ctl_info *mci; struct ghes_edac_pvt *pvt; @@ -240,17 +239,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) switch (sev) { case GHES_SEV_CORRECTED: - type = HW_EVENT_ERR_CORRECTED; + e->type = HW_EVENT_ERR_CORRECTED; break; case GHES_SEV_RECOVERABLE: - type = HW_EVENT_ERR_UNCORRECTED; + e->type = HW_EVENT_ERR_UNCORRECTED; break; case GHES_SEV_PANIC: - type = HW_EVENT_ERR_FATAL; + e->type = HW_EVENT_ERR_FATAL; break; default: case GHES_SEV_NO: - type = HW_EVENT_ERR_INFO; + e->type = HW_EVENT_ERR_INFO; } edac_dbg(1, "error validation_bits: 0x%08llx\n", @@ -442,7 +441,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) if (p > pvt->other_detail) *(p - 1) = '\0'; - edac_raw_mc_handle_error(type, mci, e); + edac_raw_mc_handle_error(mci, e); unlock: spin_unlock_irqrestore(&ghes_lock, flags); diff --git a/include/linux/edac.h b/include/linux/edac.h index cc31b9742684..6703eb492cd2 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -442,6 +442,7 @@ struct errcount_attribute_data { * struct edac_raw_error_desc - Raw error report structure * @grain: minimum granularity for an error report, in bytes * @error_count: number of errors of the same type + * @type: severity of the error (CE/UE/Fatal) * @top_layer: top layer of the error (layer[0]) * @mid_layer: middle layer of the error (layer[1]) * @low_layer: low layer of the error (layer[2]) @@ -462,6 +463,7 @@ struct edac_raw_error_desc { long grain; u16 error_count; + enum hw_event_mc_err_type type; int top_layer; int mid_layer; int low_layer; From 91b327f6728b0c4dfa089f11a474789854baa0b1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Jan 2020 09:02:56 +0000 Subject: [PATCH 05/16] EDAC/mc: Determine mci pointer from the error descriptor Each struct mci has its own error descriptor. Create a function error_desc_to_mci() to determine the corresponding mci from an error descriptor. This removes @mci from the parameter list of edac_raw_mc_handle_error() as the mci pointer does not need to be passed any longer. [ bp: Massage commit message. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-5-rrichter@marvell.com --- drivers/edac/edac_mc.c | 11 ++++++++--- drivers/edac/edac_mc.h | 4 +--- drivers/edac/ghes_edac.c | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 8d8c0997d41a..bc8d1d7b21a0 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -55,6 +55,11 @@ static LIST_HEAD(mc_devices); */ static const char *edac_mc_owner; +static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e) +{ + return container_of(e, struct mem_ctl_info, error_desc); +} + int edac_get_report_status(void) { return edac_report; @@ -1084,9 +1089,9 @@ static void edac_ue_error(struct mem_ctl_info *mci, edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count); } -void edac_raw_mc_handle_error(struct mem_ctl_info *mci, - struct edac_raw_error_desc *e) +void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) { + struct mem_ctl_info *mci = error_desc_to_mci(e); char detail[80]; int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; u8 grain_bits; @@ -1282,6 +1287,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, if (p > e->location) *(p - 1) = '\0'; - edac_raw_mc_handle_error(mci, e); + edac_raw_mc_handle_error(e); } EXPORT_SYMBOL_GPL(edac_mc_handle_error); diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h index 5d78be774f9e..881b00eadf7a 100644 --- a/drivers/edac/edac_mc.h +++ b/drivers/edac/edac_mc.h @@ -212,15 +212,13 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, * edac_raw_mc_handle_error() - Reports a memory event to userspace without * doing anything to discover the error location. * - * @mci: a struct mem_ctl_info pointer * @e: error description * * This raw function is used internally by edac_mc_handle_error(). It should * only be called directly when the hardware error come directly from BIOS, * like in the case of APEI GHES driver. */ -void edac_raw_mc_handle_error(struct mem_ctl_info *mci, - struct edac_raw_error_desc *e); +void edac_raw_mc_handle_error(struct edac_raw_error_desc *e); /** * edac_mc_handle_error() - Reports a memory event to userspace. diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 7c3e5264a41e..bef8a428c429 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -441,7 +441,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) if (p > pvt->other_detail) *(p - 1) = '\0'; - edac_raw_mc_handle_error(mci, e); + edac_raw_mc_handle_error(e); unlock: spin_unlock_irqrestore(&ghes_lock, flags); From 6334dc4e3ff53031a2d522b826c4fab92cfdea93 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 14 Feb 2020 15:17:56 +0100 Subject: [PATCH 06/16] EDAC/mc: Carve out error increment into a separate function Carve out the error_count increment into a separate function edac_inc_csrow(). This better separates code and reduces the indentation level. Implementation note: The function edac_inc_csrow() counts the same as before, ->ce_count is only incremented if row >= 0. This is esp. true for the case of (!e->enable_per_layer_report). Here, a DIMM was not found, variable row still has a value of -1 and ->ce_count is not incremented. [ bp: Massage commit message. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200214141757.8976-1-rrichter@marvell.com --- drivers/edac/edac_mc.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index bc8d1d7b21a0..35d282492505 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1089,6 +1089,26 @@ static void edac_ue_error(struct mem_ctl_info *mci, edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count); } +static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan) +{ + struct mem_ctl_info *mci = error_desc_to_mci(e); + enum hw_event_mc_err_type type = e->type; + u16 count = e->error_count; + + if (row < 0) + return; + + edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan); + + if (type == HW_EVENT_ERR_CORRECTED) { + mci->csrows[row]->ce_count += count; + if (chan >= 0) + mci->csrows[row]->channels[chan]->ce_count += count; + } else { + mci->csrows[row]->ue_count += count; + } +} + void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) { struct mem_ctl_info *mci = error_desc_to_mci(e); @@ -1256,22 +1276,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, chan = -2; } - if (!e->enable_per_layer_report) { + if (!e->enable_per_layer_report) strcpy(e->label, "any memory"); - } else { - edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan); - if (p == e->label) - strcpy(e->label, "unknown memory"); - if (type == HW_EVENT_ERR_CORRECTED) { - if (row >= 0) { - mci->csrows[row]->ce_count += error_count; - if (chan >= 0) - mci->csrows[row]->channels[chan]->ce_count += error_count; - } - } else - if (row >= 0) - mci->csrows[row]->ue_count += error_count; - } + else if (!*e->label) + strcpy(e->label, "unknown memory"); + + edac_inc_csrow(e, row, chan); /* Fill the RAM location data */ p = e->location; From 65bb4d1af92cf007adc0a0c59dadcc393c5cada6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Jan 2020 09:03:00 +0000 Subject: [PATCH 07/16] EDAC/mc: Report "unknown memory" on too many DIMM labels found There is a limitation to report only EDAC_MAX_LABELS in e->label of the error descriptor. This is to prevent a potential string overflow. The current implementation falls back to "any memory" in this case and also stops all further processing to find a unique row and channel of the possible error location. Reporting "any memory" is wrong as the memory controller reported an error location for one of the layers. Instead, report "unknown memory" and also do not break early in the loop to further check row and channel for uniqueness. [ bp: Massage commit message. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-7-rrichter@marvell.com --- drivers/edac/edac_mc.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 35d282492505..7a1445feb59d 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1243,20 +1243,21 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, * channel/memory controller/... may be affected. * Also, don't show errors for empty DIMM slots. */ - if (!e->enable_per_layer_report || !dimm->nr_pages) + if (!dimm->nr_pages) continue; - if (n_labels >= EDAC_MAX_LABELS) { - e->enable_per_layer_report = false; - break; - } n_labels++; - if (p != e->label) { - strcpy(p, OTHER_LABEL); - p += strlen(OTHER_LABEL); + if (n_labels > EDAC_MAX_LABELS) { + p = e->label; + *p = '\0'; + } else { + if (p != e->label) { + strcpy(p, OTHER_LABEL); + p += strlen(OTHER_LABEL); + } + strcpy(p, dimm->label); + p += strlen(p); } - strcpy(p, dimm->label); - p += strlen(p); /* * get csrow/channel of the DIMM, in order to allow From 67792cf9583c7816667c6b90007b5840f1b471f4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Jan 2020 09:03:02 +0000 Subject: [PATCH 08/16] EDAC/mc: Remove enable_per_layer_report function argument Many functions carry the enable_per_layer_report argument. This is a bool value indicating the error information contains some location data where the error occurred. This can easily being determined by checking the pos[] array for values. Negative values indicate there is no location available. So if the top layer is negative, the error location is unknown. Just check if the top layer is negative and remove enable_per_layer_report as function argument and also from struct edac_raw_error_desc. [ bp: Reflow comments to 80 columns, while at it. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-8-rrichter@marvell.com --- drivers/edac/edac_mc.c | 40 ++++++++++++++++++---------------------- drivers/edac/ghes_edac.c | 5 +---- include/linux/edac.h | 3 --- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 7a1445feb59d..1867f11a3438 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -946,7 +946,6 @@ const char *edac_layer_name[] = { EXPORT_SYMBOL_GPL(edac_layer_name); static void edac_inc_ce_error(struct mem_ctl_info *mci, - bool enable_per_layer_report, const int pos[EDAC_MAX_LAYERS], const u16 count) { @@ -954,7 +953,7 @@ static void edac_inc_ce_error(struct mem_ctl_info *mci, mci->ce_mc += count; - if (!enable_per_layer_report) { + if (pos[0] < 0) { mci->ce_noinfo_count += count; return; } @@ -971,7 +970,6 @@ static void edac_inc_ce_error(struct mem_ctl_info *mci, } static void edac_inc_ue_error(struct mem_ctl_info *mci, - bool enable_per_layer_report, const int pos[EDAC_MAX_LAYERS], const u16 count) { @@ -979,7 +977,7 @@ static void edac_inc_ue_error(struct mem_ctl_info *mci, mci->ue_mc += count; - if (!enable_per_layer_report) { + if (pos[0] < 0) { mci->ue_noinfo_count += count; return; } @@ -1003,7 +1001,6 @@ static void edac_ce_error(struct mem_ctl_info *mci, const char *label, const char *detail, const char *other_detail, - const bool enable_per_layer_report, const unsigned long page_frame_number, const unsigned long offset_in_page, long grain) @@ -1026,7 +1023,7 @@ static void edac_ce_error(struct mem_ctl_info *mci, error_count, msg, msg_aux, label, location, detail); } - edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count); + edac_inc_ce_error(mci, pos, error_count); if (mci->scrub_mode == SCRUB_SW_SRC) { /* @@ -1056,8 +1053,7 @@ static void edac_ue_error(struct mem_ctl_info *mci, const char *location, const char *label, const char *detail, - const char *other_detail, - const bool enable_per_layer_report) + const char *other_detail) { char *msg_aux = ""; @@ -1086,7 +1082,7 @@ static void edac_ue_error(struct mem_ctl_info *mci, msg, msg_aux, label, location, detail); } - edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count); + edac_inc_ue_error(mci, pos, error_count); } static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan) @@ -1136,16 +1132,16 @@ void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx", e->page_frame_number, e->offset_in_page, e->grain, e->syndrome); - edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label, - detail, e->other_detail, e->enable_per_layer_report, + edac_ce_error(mci, e->error_count, pos, e->msg, e->location, + e->label, detail, e->other_detail, e->page_frame_number, e->offset_in_page, e->grain); } else { snprintf(detail, sizeof(detail), "page:0x%lx offset:0x%lx grain:%ld", e->page_frame_number, e->offset_in_page, e->grain); - edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label, - detail, e->other_detail, e->enable_per_layer_report); + edac_ue_error(mci, e->error_count, pos, e->msg, e->location, + e->label, detail, e->other_detail); } @@ -1170,6 +1166,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer }; int i, n_labels = 0; struct edac_raw_error_desc *e = &mci->error_desc; + bool any_memory = true; edac_dbg(3, "MC%d\n", mci->mc_idx); @@ -1187,10 +1184,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, e->other_detail = other_detail; /* - * Check if the event report is consistent and if the memory - * location is known. If it is known, enable_per_layer_report will be - * true, the DIMM(s) label info will be filled and the per-layer - * error counters will be incremented. + * Check if the event report is consistent and if the memory location is + * known. If it is, the DIMM(s) label info will be filled and the + * per-layer error counters will be incremented. */ for (i = 0; i < mci->n_layers; i++) { if (pos[i] >= (int)mci->layers[i].size) { @@ -1208,7 +1204,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, pos[i] = -1; } if (pos[i] >= 0) - e->enable_per_layer_report = true; + any_memory = false; } /* @@ -1239,9 +1235,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, /* * If the error is memory-controller wide, there's no need to - * seek for the affected DIMMs because the whole - * channel/memory controller/... may be affected. - * Also, don't show errors for empty DIMM slots. + * seek for the affected DIMMs because the whole channel/memory + * controller/... may be affected. Also, don't show errors for + * empty DIMM slots. */ if (!dimm->nr_pages) continue; @@ -1277,7 +1273,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, chan = -2; } - if (!e->enable_per_layer_report) + if (any_memory) strcpy(e->label, "any memory"); else if (!*e->label) strcpy(e->label, "unknown memory"); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index bef8a428c429..cb3dab56a875 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -355,11 +355,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) mem_err->mem_dev_handle); index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle); - if (index >= 0) { + if (index >= 0) e->top_layer = index; - e->enable_per_layer_report = true; - } - } if (p > e->location) *(p - 1) = '\0'; diff --git a/include/linux/edac.h b/include/linux/edac.h index 6703eb492cd2..815f246e0abd 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -454,8 +454,6 @@ struct errcount_attribute_data { * @location: location of the error * @label: label of the affected DIMM(s) * @other_detail: other driver-specific detail about the error - * @enable_per_layer_report: if false, the error affects all layers - * (typically, a memory controller error) */ struct edac_raw_error_desc { char location[LOCATION_SIZE]; @@ -472,7 +470,6 @@ struct edac_raw_error_desc { unsigned long syndrome; const char *msg; const char *other_detail; - bool enable_per_layer_report; }; /* MEMORY controller information structure From 6ab76179adb82006aea05f438a0601561c162b8f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Jan 2020 09:03:04 +0000 Subject: [PATCH 09/16] EDAC/mc: Pass the error descriptor to error reporting functions Most arguments of error reporting functions are already stored in the struct edac_raw_error_desc error descriptor. Pass the error descriptor to the functions and reduce the functions' argument list. [ bp: Sort function args in reverse fir tree order. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-9-rrichter@marvell.com --- drivers/edac/edac_mc.c | 100 +++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 58 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 1867f11a3438..120c8c1a9d3a 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -945,16 +945,16 @@ const char *edac_layer_name[] = { }; EXPORT_SYMBOL_GPL(edac_layer_name); -static void edac_inc_ce_error(struct mem_ctl_info *mci, - const int pos[EDAC_MAX_LAYERS], - const u16 count) +static void edac_inc_ce_error(struct edac_raw_error_desc *e) { + int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; + struct mem_ctl_info *mci = error_desc_to_mci(e); int i, index = 0; - mci->ce_mc += count; + mci->ce_mc += e->error_count; if (pos[0] < 0) { - mci->ce_noinfo_count += count; + mci->ce_noinfo_count += e->error_count; return; } @@ -962,23 +962,23 @@ static void edac_inc_ce_error(struct mem_ctl_info *mci, if (pos[i] < 0) break; index += pos[i]; - mci->ce_per_layer[i][index] += count; + mci->ce_per_layer[i][index] += e->error_count; if (i < mci->n_layers - 1) index *= mci->layers[i + 1].size; } } -static void edac_inc_ue_error(struct mem_ctl_info *mci, - const int pos[EDAC_MAX_LAYERS], - const u16 count) +static void edac_inc_ue_error(struct edac_raw_error_desc *e) { + int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; + struct mem_ctl_info *mci = error_desc_to_mci(e); int i, index = 0; - mci->ue_mc += count; + mci->ue_mc += e->error_count; if (pos[0] < 0) { - mci->ue_noinfo_count += count; + mci->ue_noinfo_count += e->error_count; return; } @@ -986,44 +986,37 @@ static void edac_inc_ue_error(struct mem_ctl_info *mci, if (pos[i] < 0) break; index += pos[i]; - mci->ue_per_layer[i][index] += count; + mci->ue_per_layer[i][index] += e->error_count; if (i < mci->n_layers - 1) index *= mci->layers[i + 1].size; } } -static void edac_ce_error(struct mem_ctl_info *mci, - const u16 error_count, - const int pos[EDAC_MAX_LAYERS], - const char *msg, - const char *location, - const char *label, - const char *detail, - const char *other_detail, - const unsigned long page_frame_number, - const unsigned long offset_in_page, - long grain) +static void edac_ce_error(struct edac_raw_error_desc *e, + const char *detail) { + struct mem_ctl_info *mci = error_desc_to_mci(e); unsigned long remapped_page; char *msg_aux = ""; - if (*msg) + if (*e->msg) msg_aux = " "; if (edac_mc_get_log_ce()) { - if (other_detail && *other_detail) + if (e->other_detail && *e->other_detail) edac_mc_printk(mci, KERN_WARNING, "%d CE %s%son %s (%s %s - %s)\n", - error_count, msg, msg_aux, label, - location, detail, other_detail); + e->error_count, e->msg, msg_aux, e->label, + e->location, detail, e->other_detail); else edac_mc_printk(mci, KERN_WARNING, "%d CE %s%son %s (%s %s)\n", - error_count, msg, msg_aux, label, - location, detail); + e->error_count, e->msg, msg_aux, e->label, + e->location, detail); } - edac_inc_ce_error(mci, pos, error_count); + + edac_inc_ce_error(e); if (mci->scrub_mode == SCRUB_SW_SRC) { /* @@ -1038,51 +1031,46 @@ static void edac_ce_error(struct mem_ctl_info *mci, * be scrubbed. */ remapped_page = mci->ctl_page_to_phys ? - mci->ctl_page_to_phys(mci, page_frame_number) : - page_frame_number; + mci->ctl_page_to_phys(mci, e->page_frame_number) : + e->page_frame_number; - edac_mc_scrub_block(remapped_page, - offset_in_page, grain); + edac_mc_scrub_block(remapped_page, e->offset_in_page, e->grain); } } -static void edac_ue_error(struct mem_ctl_info *mci, - const u16 error_count, - const int pos[EDAC_MAX_LAYERS], - const char *msg, - const char *location, - const char *label, - const char *detail, - const char *other_detail) +static void edac_ue_error(struct edac_raw_error_desc *e, + const char *detail) { + struct mem_ctl_info *mci = error_desc_to_mci(e); char *msg_aux = ""; - if (*msg) + if (*e->msg) msg_aux = " "; if (edac_mc_get_log_ue()) { - if (other_detail && *other_detail) + if (e->other_detail && *e->other_detail) edac_mc_printk(mci, KERN_WARNING, "%d UE %s%son %s (%s %s - %s)\n", - error_count, msg, msg_aux, label, - location, detail, other_detail); + e->error_count, e->msg, msg_aux, e->label, + e->location, detail, e->other_detail); else edac_mc_printk(mci, KERN_WARNING, "%d UE %s%son %s (%s %s)\n", - error_count, msg, msg_aux, label, - location, detail); + e->error_count, e->msg, msg_aux, e->label, + e->location, detail); } if (edac_mc_get_panic_on_ue()) { - if (other_detail && *other_detail) + if (e->other_detail && *e->other_detail) panic("UE %s%son %s (%s%s - %s)\n", - msg, msg_aux, label, location, detail, other_detail); + e->msg, msg_aux, e->label, e->location, detail, + e->other_detail); else panic("UE %s%son %s (%s%s)\n", - msg, msg_aux, label, location, detail); + e->msg, msg_aux, e->label, e->location, detail); } - edac_inc_ue_error(mci, pos, error_count); + edac_inc_ue_error(e); } static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan) @@ -1109,7 +1097,6 @@ void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) { struct mem_ctl_info *mci = error_desc_to_mci(e); char detail[80]; - int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; u8 grain_bits; /* Sanity-check driver-supplied grain value. */ @@ -1132,16 +1119,13 @@ void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx", e->page_frame_number, e->offset_in_page, e->grain, e->syndrome); - edac_ce_error(mci, e->error_count, pos, e->msg, e->location, - e->label, detail, e->other_detail, - e->page_frame_number, e->offset_in_page, e->grain); + edac_ce_error(e, detail); } else { snprintf(detail, sizeof(detail), "page:0x%lx offset:0x%lx grain:%ld", e->page_frame_number, e->offset_in_page, e->grain); - edac_ue_error(mci, e->error_count, pos, e->msg, e->location, - e->label, detail, e->other_detail); + edac_ue_error(e, detail); } From 1853ee72997645e1aea85e20b94f7dcc55636887 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 23 Jan 2020 09:03:06 +0000 Subject: [PATCH 10/16] EDAC/mc: Remove detail[] string and cleanup error string generation The error descriptor is passed to the error reporting functions, so the error details can be directly generated there. Move string generation from edac_raw_mc_handle_error() to edac_ce_error() and edac_ue_error(). The intermediate detail[] string can be removed then. Also, cleanup the string generation by switching to a single variant only using the ternary operator. [ bp: put ternary operators on a separate line for better readability and use the short-form "inline if" in edac_mc_handle_error(). ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-10-rrichter@marvell.com --- drivers/edac/edac_mc.c | 90 +++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 58 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 120c8c1a9d3a..80629c702a4f 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -993,27 +993,20 @@ static void edac_inc_ue_error(struct edac_raw_error_desc *e) } } -static void edac_ce_error(struct edac_raw_error_desc *e, - const char *detail) +static void edac_ce_error(struct edac_raw_error_desc *e) { struct mem_ctl_info *mci = error_desc_to_mci(e); unsigned long remapped_page; - char *msg_aux = ""; - - if (*e->msg) - msg_aux = " "; if (edac_mc_get_log_ce()) { - if (e->other_detail && *e->other_detail) - edac_mc_printk(mci, KERN_WARNING, - "%d CE %s%son %s (%s %s - %s)\n", - e->error_count, e->msg, msg_aux, e->label, - e->location, detail, e->other_detail); - else - edac_mc_printk(mci, KERN_WARNING, - "%d CE %s%son %s (%s %s)\n", - e->error_count, e->msg, msg_aux, e->label, - e->location, detail); + edac_mc_printk(mci, KERN_WARNING, + "%d CE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx%s%s)\n", + e->error_count, e->msg, + *e->msg ? " " : "", + e->label, e->location, e->page_frame_number, e->offset_in_page, + e->grain, e->syndrome, + *e->other_detail ? " - " : "", + e->other_detail); } edac_inc_ce_error(e); @@ -1038,36 +1031,29 @@ static void edac_ce_error(struct edac_raw_error_desc *e, } } -static void edac_ue_error(struct edac_raw_error_desc *e, - const char *detail) +static void edac_ue_error(struct edac_raw_error_desc *e) { struct mem_ctl_info *mci = error_desc_to_mci(e); - char *msg_aux = ""; - - if (*e->msg) - msg_aux = " "; if (edac_mc_get_log_ue()) { - if (e->other_detail && *e->other_detail) - edac_mc_printk(mci, KERN_WARNING, - "%d UE %s%son %s (%s %s - %s)\n", - e->error_count, e->msg, msg_aux, e->label, - e->location, detail, e->other_detail); - else - edac_mc_printk(mci, KERN_WARNING, - "%d UE %s%son %s (%s %s)\n", - e->error_count, e->msg, msg_aux, e->label, - e->location, detail); + edac_mc_printk(mci, KERN_WARNING, + "%d UE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld%s%s)\n", + e->error_count, e->msg, + *e->msg ? " " : "", + e->label, e->location, e->page_frame_number, e->offset_in_page, + e->grain, + *e->other_detail ? " - " : "", + e->other_detail); } if (edac_mc_get_panic_on_ue()) { - if (e->other_detail && *e->other_detail) - panic("UE %s%son %s (%s%s - %s)\n", - e->msg, msg_aux, e->label, e->location, detail, - e->other_detail); - else - panic("UE %s%son %s (%s%s)\n", - e->msg, msg_aux, e->label, e->location, detail); + panic("UE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld%s%s)\n", + e->msg, + *e->msg ? " " : "", + e->label, e->location, e->page_frame_number, e->offset_in_page, + e->grain, + *e->other_detail ? " - " : "", + e->other_detail); } edac_inc_ue_error(e); @@ -1096,7 +1082,6 @@ static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan) void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) { struct mem_ctl_info *mci = error_desc_to_mci(e); - char detail[80]; u8 grain_bits; /* Sanity-check driver-supplied grain value. */ @@ -1113,22 +1098,10 @@ void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, grain_bits, e->syndrome, e->other_detail); - /* Memory type dependent details about the error */ - if (e->type == HW_EVENT_ERR_CORRECTED) { - snprintf(detail, sizeof(detail), - "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx", - e->page_frame_number, e->offset_in_page, - e->grain, e->syndrome); - edac_ce_error(e, detail); - } else { - snprintf(detail, sizeof(detail), - "page:0x%lx offset:0x%lx grain:%ld", - e->page_frame_number, e->offset_in_page, e->grain); - - edac_ue_error(e, detail); - } - - + if (e->type == HW_EVENT_ERR_CORRECTED) + edac_ce_error(e); + else + edac_ue_error(e); } EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error); @@ -1164,8 +1137,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, e->page_frame_number = page_frame_number; e->offset_in_page = offset_in_page; e->syndrome = syndrome; - e->msg = msg; - e->other_detail = other_detail; + /* need valid strings here for both: */ + e->msg = msg ?: ""; + e->other_detail = other_detail ?: ""; /* * Check if the event report is consistent and if the memory location is From 4aa92c86463273b673e4170c60cb78e2625781eb Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 17 Feb 2020 12:30:23 +0100 Subject: [PATCH 11/16] EDAC/mc: Remove per layer counters Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it turns out that only the leaves in the memory hierarchy are consumed (in sysfs), but not the intermediate layers, e.g.: count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx]; These unused counters only add complexity, remove them. The error counter values are directly stored in struct dimm_info now. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Acked-by: Aristeu Rozanski Link: https://lkml.kernel.org/r/20200123090210.26933-11-rrichter@marvell.com --- drivers/edac/edac_mc.c | 67 +++++++++--------------------------- drivers/edac/edac_mc_sysfs.c | 20 +++++------ include/linux/edac.h | 4 ++- 3 files changed, 27 insertions(+), 64 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 80629c702a4f..75ede27bdf6a 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -451,11 +451,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, { struct mem_ctl_info *mci; struct edac_mc_layer *layer; - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; - unsigned int idx, size, tot_dimms = 1, count = 1; - unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0; + unsigned int idx, size, tot_dimms = 1; + unsigned int tot_csrows = 1, tot_channels = 1; void *pvt, *ptr = NULL; - int i; bool per_rank = false; if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0)) @@ -482,19 +480,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, * stringent as what the compiler would provide if we could simply * hardcode everything into a single struct. */ - mci = edac_align_ptr(&ptr, sizeof(*mci), 1); - layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers); - for (i = 0; i < n_layers; i++) { - count *= layers[i].size; - edac_dbg(4, "errcount layer %d size %d\n", i, count); - ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); - ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); - tot_errcount += 2 * count; - } - - edac_dbg(4, "allocating %d error counters\n", tot_errcount); - pvt = edac_align_ptr(&ptr, sz_pvt, 1); - size = ((unsigned long)pvt) + sz_pvt; + mci = edac_align_ptr(&ptr, sizeof(*mci), 1); + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers); + pvt = edac_align_ptr(&ptr, sz_pvt, 1); + size = ((unsigned long)pvt) + sz_pvt; edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n", size, @@ -513,10 +502,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, * rather than an imaginary chunk of memory located at address 0. */ layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer)); - for (i = 0; i < n_layers; i++) { - mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i])); - mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i])); - } pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; /* setup index and various internal pointers */ @@ -949,48 +934,28 @@ static void edac_inc_ce_error(struct edac_raw_error_desc *e) { int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; struct mem_ctl_info *mci = error_desc_to_mci(e); - int i, index = 0; + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]); mci->ce_mc += e->error_count; - if (pos[0] < 0) { + if (dimm) + dimm->ce_count += e->error_count; + else mci->ce_noinfo_count += e->error_count; - return; - } - - for (i = 0; i < mci->n_layers; i++) { - if (pos[i] < 0) - break; - index += pos[i]; - mci->ce_per_layer[i][index] += e->error_count; - - if (i < mci->n_layers - 1) - index *= mci->layers[i + 1].size; - } } static void edac_inc_ue_error(struct edac_raw_error_desc *e) { int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; struct mem_ctl_info *mci = error_desc_to_mci(e); - int i, index = 0; + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]); mci->ue_mc += e->error_count; - if (pos[0] < 0) { + if (dimm) + dimm->ue_count += e->error_count; + else mci->ue_noinfo_count += e->error_count; - return; - } - - for (i = 0; i < mci->n_layers; i++) { - if (pos[i] < 0) - break; - index += pos[i]; - mci->ue_per_layer[i][index] += e->error_count; - - if (i < mci->n_layers - 1) - index *= mci->layers[i + 1].size; - } } static void edac_ce_error(struct edac_raw_error_desc *e) @@ -1143,8 +1108,8 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, /* * Check if the event report is consistent and if the memory location is - * known. If it is, the DIMM(s) label info will be filled and the - * per-layer error counters will be incremented. + * known. If it is, the DIMM(s) label info will be filled and the DIMM's + * error counters will be incremented. */ for (i = 0; i < mci->n_layers; i++) { if (pos[i] >= (int)mci->layers[i].size) { diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 11e1b436f116..4e6aca595133 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -551,10 +551,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev, char *data) { struct dimm_info *dimm = to_dimm(dev); - u32 count; - count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx]; - return sprintf(data, "%u\n", count); + return sprintf(data, "%u\n", dimm->ce_count); } static ssize_t dimmdev_ue_count_show(struct device *dev, @@ -562,10 +560,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev, char *data) { struct dimm_info *dimm = to_dimm(dev); - u32 count; - count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx]; - return sprintf(data, "%u\n", count); + return sprintf(data, "%u\n", dimm->ue_count); } /* dimm/rank attribute files */ @@ -661,7 +657,9 @@ static ssize_t mci_reset_counters_store(struct device *dev, const char *data, size_t count) { struct mem_ctl_info *mci = to_mci(dev); - int cnt, row, chan, i; + struct dimm_info *dimm; + int row, chan; + mci->ue_mc = 0; mci->ce_mc = 0; mci->ue_noinfo_count = 0; @@ -677,11 +675,9 @@ static ssize_t mci_reset_counters_store(struct device *dev, ri->channels[chan]->ce_count = 0; } - cnt = 1; - for (i = 0; i < mci->n_layers; i++) { - cnt *= mci->layers[i].size; - memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32)); - memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32)); + mci_for_each_dimm(mci, dimm) { + dimm->ue_count = 0; + dimm->ce_count = 0; } mci->start_time = jiffies; diff --git a/include/linux/edac.h b/include/linux/edac.h index 815f246e0abd..0f20b986b0ab 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -383,6 +383,9 @@ struct dimm_info { unsigned int csrow, cschannel; /* Points to the old API data */ u16 smbios_handle; /* Handle for SMBIOS type 17 */ + + u32 ce_count; + u32 ue_count; }; /** @@ -559,7 +562,6 @@ struct mem_ctl_info { */ u32 ce_noinfo_count, ue_noinfo_count; u32 ue_mc, ce_mc; - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; struct completion complete; From 52cff04a81e9571735976e4d70984e89a7ee3885 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 17 Feb 2020 08:46:27 -0500 Subject: [PATCH 12/16] EDAC/mce_amd: Print !SMCA processor warning only once This warning is output for every virtual CPU in a guest on an EPYC 2 system because kvm doesn't enable SMCA. Once is enough too. [ bp: Massage. ] Signed-off-by: Prarit Bhargava Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200217134627.19765-1-prarit@redhat.com --- drivers/edac/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index ea980c556f2e..8874b7722b2f 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1239,7 +1239,7 @@ static int __init mce_amd_init(void) case 0x17: case 0x18: - pr_warn("Decoding supported only on Scalable MCA processors.\n"); + pr_warn_once("Decoding supported only on Scalable MCA processors.\n"); return -EINVAL; default: From db6c122b8fb16a346b87fbcb4987515429e5e7cf Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Mon, 27 Jan 2020 08:23:08 -0800 Subject: [PATCH 13/16] dt-bindings: edac: Dmc-520.yaml Add the device tree bindings for the EDAC driver dmc520_edac. Signed-off-by: Lei Wang Signed-off-by: Shiping Ji Signed-off-by: Borislav Petkov Reviewed-by: James Morse Reviewed-by: Rob Herring Link: https://lkml.kernel.org/r/5354a9c3-5b5a-486a-9d19-fa9be169faef@gmail.com --- .../devicetree/bindings/edac/dmc-520.yaml | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 Documentation/devicetree/bindings/edac/dmc-520.yaml diff --git a/Documentation/devicetree/bindings/edac/dmc-520.yaml b/Documentation/devicetree/bindings/edac/dmc-520.yaml new file mode 100644 index 000000000000..9272d2bd8634 --- /dev/null +++ b/Documentation/devicetree/bindings/edac/dmc-520.yaml @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/edac/dmc-520.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: ARM DMC-520 EDAC bindings + +maintainers: + - Lei Wang + +description: |+ + DMC-520 node is defined to describe DRAM error detection and correction. + + https://static.docs.arm.com/100000/0200/corelink_dmc520_trm_100000_0200_01_en.pdf + +properties: + compatible: + items: + - const: brcm,dmc-520 + - const: arm,dmc-520 + + reg: + maxItems: 1 + + interrupts: + minItems: 1 + maxItems: 10 + + interrupt-names: + minItems: 1 + maxItems: 10 + items: + enum: + - ram_ecc_errc + - ram_ecc_errd + - dram_ecc_errc + - dram_ecc_errd + - failed_access + - failed_prog + - link_err + - temperature_event + - arch_fsm + - phy_request + +required: + - compatible + - reg + - interrupts + - interrupt-names + +examples: + - | + dmc0: dmc@200000 { + compatible = "brcm,dmc-520", "arm,dmc-520"; + reg = <0x200000 0x80000>; + interrupts = <0x0 0x349 0x4>, <0x0 0x34B 0x4>; + interrupt-names = "dram_ecc_errc", "dram_ecc_errd"; + }; From 1088750d78392cf79c113082ffbf955384737e9c Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Wed, 22 Jan 2020 16:31:14 -0800 Subject: [PATCH 14/16] EDAC: Add EDAC driver for DMC520 The driver supports error detection and correction on devices with an ARM DMC-520 memory controller. Signed-off-by: Lei Wang Signed-off-by: Shiping Ji Signed-off-by: Borislav Petkov Reviewed-by: James Morse Link: https://lkml.kernel.org/r/83b48c70-dc06-d0d4-cae9-a2187fca628b@gmail.com --- MAINTAINERS | 6 + drivers/edac/Kconfig | 7 + drivers/edac/Makefile | 1 + drivers/edac/dmc520_edac.c | 656 +++++++++++++++++++++++++++++++++++++ 4 files changed, 670 insertions(+) create mode 100644 drivers/edac/dmc520_edac.c diff --git a/MAINTAINERS b/MAINTAINERS index a0d86490c2c6..8456c8ecb073 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5998,6 +5998,12 @@ F: Documentation/driver-api/edac.rst F: drivers/edac/ F: include/linux/edac.h +EDAC-DMC520 +M: Lei Wang +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/edac/dmc520_edac.c + EDAC-E752X M: Mark Gross L: linux-edac@vger.kernel.org diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index b3c99bb5fe77..fe2eb892a1bd 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -523,4 +523,11 @@ config EDAC_BLUEFIELD Support for error detection and correction on the Mellanox BlueField SoCs. +config EDAC_DMC520 + tristate "ARM DMC-520 ECC" + depends on ARM64 + help + Support for error detection and correction on the + SoCs with ARM DMC-520 DRAM controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index d77200c9680b..269e15118cea 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -87,3 +87,4 @@ obj-$(CONFIG_EDAC_TI) += ti_edac.o obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o +obj-$(CONFIG_EDAC_DMC520) += dmc520_edac.o diff --git a/drivers/edac/dmc520_edac.c b/drivers/edac/dmc520_edac.c new file mode 100644 index 000000000000..fc1153ab1ebb --- /dev/null +++ b/drivers/edac/dmc520_edac.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * EDAC driver for DMC-520 memory controller. + * + * The driver supports 10 interrupt lines, + * though only dram_ecc_errc and dram_ecc_errd are currently handled. + * + * Authors: Rui Zhao + * Lei Wang + * Shiping Ji + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "edac_mc.h" + +/* DMC-520 registers */ +#define REG_OFFSET_FEATURE_CONFIG 0x130 +#define REG_OFFSET_ECC_ERRC_COUNT_31_00 0x158 +#define REG_OFFSET_ECC_ERRC_COUNT_63_32 0x15C +#define REG_OFFSET_ECC_ERRD_COUNT_31_00 0x160 +#define REG_OFFSET_ECC_ERRD_COUNT_63_32 0x164 +#define REG_OFFSET_INTERRUPT_CONTROL 0x500 +#define REG_OFFSET_INTERRUPT_CLR 0x508 +#define REG_OFFSET_INTERRUPT_STATUS 0x510 +#define REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 0x528 +#define REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 0x52C +#define REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00 0x530 +#define REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32 0x534 +#define REG_OFFSET_ADDRESS_CONTROL_NOW 0x1010 +#define REG_OFFSET_MEMORY_TYPE_NOW 0x1128 +#define REG_OFFSET_SCRUB_CONTROL0_NOW 0x1170 +#define REG_OFFSET_FORMAT_CONTROL 0x18 + +/* DMC-520 types, masks and bitfields */ +#define RAM_ECC_INT_CE_BIT BIT(0) +#define RAM_ECC_INT_UE_BIT BIT(1) +#define DRAM_ECC_INT_CE_BIT BIT(2) +#define DRAM_ECC_INT_UE_BIT BIT(3) +#define FAILED_ACCESS_INT_BIT BIT(4) +#define FAILED_PROG_INT_BIT BIT(5) +#define LINK_ERR_INT_BIT BIT(6) +#define TEMPERATURE_EVENT_INT_BIT BIT(7) +#define ARCH_FSM_INT_BIT BIT(8) +#define PHY_REQUEST_INT_BIT BIT(9) +#define MEMORY_WIDTH_MASK GENMASK(1, 0) +#define SCRUB_TRIGGER0_NEXT_MASK GENMASK(1, 0) +#define REG_FIELD_DRAM_ECC_ENABLED GENMASK(1, 0) +#define REG_FIELD_MEMORY_TYPE GENMASK(2, 0) +#define REG_FIELD_DEVICE_WIDTH GENMASK(9, 8) +#define REG_FIELD_ADDRESS_CONTROL_COL GENMASK(2, 0) +#define REG_FIELD_ADDRESS_CONTROL_ROW GENMASK(10, 8) +#define REG_FIELD_ADDRESS_CONTROL_BANK GENMASK(18, 16) +#define REG_FIELD_ADDRESS_CONTROL_RANK GENMASK(25, 24) +#define REG_FIELD_ERR_INFO_LOW_VALID BIT(0) +#define REG_FIELD_ERR_INFO_LOW_COL GENMASK(10, 1) +#define REG_FIELD_ERR_INFO_LOW_ROW GENMASK(28, 11) +#define REG_FIELD_ERR_INFO_LOW_RANK GENMASK(31, 29) +#define REG_FIELD_ERR_INFO_HIGH_BANK GENMASK(3, 0) +#define REG_FIELD_ERR_INFO_HIGH_VALID BIT(31) + +#define DRAM_ADDRESS_CONTROL_MIN_COL_BITS 8 +#define DRAM_ADDRESS_CONTROL_MIN_ROW_BITS 11 + +#define DMC520_SCRUB_TRIGGER_ERR_DETECT 2 +#define DMC520_SCRUB_TRIGGER_IDLE 3 + +/* Driver settings */ +/* + * The max-length message would be: "rank:7 bank:15 row:262143 col:1023". + * Max length is 34. Using a 40-size buffer is enough. + */ +#define DMC520_MSG_BUF_SIZE 40 +#define EDAC_MOD_NAME "dmc520-edac" +#define EDAC_CTL_NAME "dmc520" + +/* the data bus width for the attached memory chips. */ +enum dmc520_mem_width { + MEM_WIDTH_X32 = 2, + MEM_WIDTH_X64 = 3 +}; + +/* memory type */ +enum dmc520_mem_type { + MEM_TYPE_DDR3 = 1, + MEM_TYPE_DDR4 = 2 +}; + +/* memory device width */ +enum dmc520_dev_width { + DEV_WIDTH_X4 = 0, + DEV_WIDTH_X8 = 1, + DEV_WIDTH_X16 = 2 +}; + +struct ecc_error_info { + u32 col; + u32 row; + u32 bank; + u32 rank; +}; + +/* The interrupt config */ +struct dmc520_irq_config { + char *name; + int mask; +}; + +/* The interrupt mappings */ +static struct dmc520_irq_config dmc520_irq_configs[] = { + { + .name = "ram_ecc_errc", + .mask = RAM_ECC_INT_CE_BIT + }, + { + .name = "ram_ecc_errd", + .mask = RAM_ECC_INT_UE_BIT + }, + { + .name = "dram_ecc_errc", + .mask = DRAM_ECC_INT_CE_BIT + }, + { + .name = "dram_ecc_errd", + .mask = DRAM_ECC_INT_UE_BIT + }, + { + .name = "failed_access", + .mask = FAILED_ACCESS_INT_BIT + }, + { + .name = "failed_prog", + .mask = FAILED_PROG_INT_BIT + }, + { + .name = "link_err", + .mask = LINK_ERR_INT_BIT + }, + { + .name = "temperature_event", + .mask = TEMPERATURE_EVENT_INT_BIT + }, + { + .name = "arch_fsm", + .mask = ARCH_FSM_INT_BIT + }, + { + .name = "phy_request", + .mask = PHY_REQUEST_INT_BIT + } +}; + +#define NUMBER_OF_IRQS ARRAY_SIZE(dmc520_irq_configs) + +/* + * The EDAC driver private data. + * error_lock is to protect concurrent writes to the mci->error_desc through + * edac_mc_handle_error(). + */ +struct dmc520_edac { + void __iomem *reg_base; + spinlock_t error_lock; + u32 mem_width_in_bytes; + int irqs[NUMBER_OF_IRQS]; + int masks[NUMBER_OF_IRQS]; +}; + +static int dmc520_mc_idx; + +static u32 dmc520_read_reg(struct dmc520_edac *pvt, u32 offset) +{ + return readl(pvt->reg_base + offset); +} + +static void dmc520_write_reg(struct dmc520_edac *pvt, u32 val, u32 offset) +{ + writel(val, pvt->reg_base + offset); +} + +static u32 dmc520_calc_dram_ecc_error(u32 value) +{ + u32 total = 0; + + /* Each rank's error counter takes one byte. */ + while (value > 0) { + total += (value & 0xFF); + value >>= 8; + } + return total; +} + +static u32 dmc520_get_dram_ecc_error_count(struct dmc520_edac *pvt, + bool is_ce) +{ + u32 reg_offset_low, reg_offset_high; + u32 err_low, err_high; + u32 err_count; + + reg_offset_low = is_ce ? REG_OFFSET_ECC_ERRC_COUNT_31_00 : + REG_OFFSET_ECC_ERRD_COUNT_31_00; + reg_offset_high = is_ce ? REG_OFFSET_ECC_ERRC_COUNT_63_32 : + REG_OFFSET_ECC_ERRD_COUNT_63_32; + + err_low = dmc520_read_reg(pvt, reg_offset_low); + err_high = dmc520_read_reg(pvt, reg_offset_high); + /* Reset error counters */ + dmc520_write_reg(pvt, 0, reg_offset_low); + dmc520_write_reg(pvt, 0, reg_offset_high); + + err_count = dmc520_calc_dram_ecc_error(err_low) + + dmc520_calc_dram_ecc_error(err_high); + + return err_count; +} + +static void dmc520_get_dram_ecc_error_info(struct dmc520_edac *pvt, + bool is_ce, + struct ecc_error_info *info) +{ + u32 reg_offset_low, reg_offset_high; + u32 reg_val_low, reg_val_high; + bool valid; + + reg_offset_low = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 : + REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00; + reg_offset_high = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 : + REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32; + + reg_val_low = dmc520_read_reg(pvt, reg_offset_low); + reg_val_high = dmc520_read_reg(pvt, reg_offset_high); + + valid = (FIELD_GET(REG_FIELD_ERR_INFO_LOW_VALID, reg_val_low) != 0) && + (FIELD_GET(REG_FIELD_ERR_INFO_HIGH_VALID, reg_val_high) != 0); + + if (valid) { + info->col = FIELD_GET(REG_FIELD_ERR_INFO_LOW_COL, reg_val_low); + info->row = FIELD_GET(REG_FIELD_ERR_INFO_LOW_ROW, reg_val_low); + info->rank = FIELD_GET(REG_FIELD_ERR_INFO_LOW_RANK, reg_val_low); + info->bank = FIELD_GET(REG_FIELD_ERR_INFO_HIGH_BANK, reg_val_high); + } else { + memset(info, 0, sizeof(*info)); + } +} + +static bool dmc520_is_ecc_enabled(void __iomem *reg_base) +{ + u32 reg_val = readl(reg_base + REG_OFFSET_FEATURE_CONFIG); + + return FIELD_GET(REG_FIELD_DRAM_ECC_ENABLED, reg_val); +} + +static enum scrub_type dmc520_get_scrub_type(struct dmc520_edac *pvt) +{ + enum scrub_type type = SCRUB_NONE; + u32 reg_val, scrub_cfg; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_SCRUB_CONTROL0_NOW); + scrub_cfg = FIELD_GET(SCRUB_TRIGGER0_NEXT_MASK, reg_val); + + if (scrub_cfg == DMC520_SCRUB_TRIGGER_ERR_DETECT || + scrub_cfg == DMC520_SCRUB_TRIGGER_IDLE) + type = SCRUB_HW_PROG; + + return type; +} + +/* Get the memory data bus width, in number of bytes. */ +static u32 dmc520_get_memory_width(struct dmc520_edac *pvt) +{ + enum dmc520_mem_width mem_width_field; + u32 mem_width_in_bytes = 0; + u32 reg_val; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_FORMAT_CONTROL); + mem_width_field = FIELD_GET(MEMORY_WIDTH_MASK, reg_val); + + if (mem_width_field == MEM_WIDTH_X32) + mem_width_in_bytes = 4; + else if (mem_width_field == MEM_WIDTH_X64) + mem_width_in_bytes = 8; + return mem_width_in_bytes; +} + +static enum mem_type dmc520_get_mtype(struct dmc520_edac *pvt) +{ + enum mem_type mt = MEM_UNKNOWN; + enum dmc520_mem_type type; + u32 reg_val; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_MEMORY_TYPE_NOW); + type = FIELD_GET(REG_FIELD_MEMORY_TYPE, reg_val); + + switch (type) { + case MEM_TYPE_DDR3: + mt = MEM_DDR3; + break; + + case MEM_TYPE_DDR4: + mt = MEM_DDR4; + break; + } + + return mt; +} + +static enum dev_type dmc520_get_dtype(struct dmc520_edac *pvt) +{ + enum dmc520_dev_width device_width; + enum dev_type dt = DEV_UNKNOWN; + u32 reg_val; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_MEMORY_TYPE_NOW); + device_width = FIELD_GET(REG_FIELD_DEVICE_WIDTH, reg_val); + + switch (device_width) { + case DEV_WIDTH_X4: + dt = DEV_X4; + break; + + case DEV_WIDTH_X8: + dt = DEV_X8; + break; + + case DEV_WIDTH_X16: + dt = DEV_X16; + break; + } + + return dt; +} + +static u32 dmc520_get_rank_count(void __iomem *reg_base) +{ + u32 reg_val, rank_bits; + + reg_val = readl(reg_base + REG_OFFSET_ADDRESS_CONTROL_NOW); + rank_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_RANK, reg_val); + + return BIT(rank_bits); +} + +static u64 dmc520_get_rank_size(struct dmc520_edac *pvt) +{ + u32 reg_val, col_bits, row_bits, bank_bits; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_ADDRESS_CONTROL_NOW); + + col_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_COL, reg_val) + + DRAM_ADDRESS_CONTROL_MIN_COL_BITS; + row_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_ROW, reg_val) + + DRAM_ADDRESS_CONTROL_MIN_ROW_BITS; + bank_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_BANK, reg_val); + + return (u64)pvt->mem_width_in_bytes << (col_bits + row_bits + bank_bits); +} + +static void dmc520_handle_dram_ecc_errors(struct mem_ctl_info *mci, + bool is_ce) +{ + struct dmc520_edac *pvt = mci->pvt_info; + char message[DMC520_MSG_BUF_SIZE]; + struct ecc_error_info info; + u32 cnt; + + dmc520_get_dram_ecc_error_info(pvt, is_ce, &info); + + cnt = dmc520_get_dram_ecc_error_count(pvt, is_ce); + if (!cnt) + return; + + snprintf(message, ARRAY_SIZE(message), + "rank:%d bank:%d row:%d col:%d", + info.rank, info.bank, + info.row, info.col); + + spin_lock(&pvt->error_lock); + edac_mc_handle_error((is_ce ? HW_EVENT_ERR_CORRECTED : + HW_EVENT_ERR_UNCORRECTED), + mci, cnt, 0, 0, 0, info.rank, -1, -1, + message, ""); + spin_unlock(&pvt->error_lock); +} + +static irqreturn_t dmc520_edac_dram_ecc_isr(int irq, struct mem_ctl_info *mci, + bool is_ce) +{ + struct dmc520_edac *pvt = mci->pvt_info; + u32 i_mask; + + i_mask = is_ce ? DRAM_ECC_INT_CE_BIT : DRAM_ECC_INT_UE_BIT; + + dmc520_handle_dram_ecc_errors(mci, is_ce); + + dmc520_write_reg(pvt, i_mask, REG_OFFSET_INTERRUPT_CLR); + + return IRQ_HANDLED; +} + +static irqreturn_t dmc520_edac_dram_all_isr(int irq, struct mem_ctl_info *mci, + u32 irq_mask) +{ + struct dmc520_edac *pvt = mci->pvt_info; + irqreturn_t irq_ret = IRQ_NONE; + u32 status; + + status = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_STATUS); + + if ((irq_mask & DRAM_ECC_INT_CE_BIT) && + (status & DRAM_ECC_INT_CE_BIT)) + irq_ret = dmc520_edac_dram_ecc_isr(irq, mci, true); + + if ((irq_mask & DRAM_ECC_INT_UE_BIT) && + (status & DRAM_ECC_INT_UE_BIT)) + irq_ret = dmc520_edac_dram_ecc_isr(irq, mci, false); + + return irq_ret; +} + +static irqreturn_t dmc520_isr(int irq, void *data) +{ + struct mem_ctl_info *mci = data; + struct dmc520_edac *pvt = mci->pvt_info; + u32 mask = 0; + int idx; + + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + if (pvt->irqs[idx] == irq) { + mask = pvt->masks[idx]; + break; + } + } + return dmc520_edac_dram_all_isr(irq, mci, mask); +} + +static void dmc520_init_csrow(struct mem_ctl_info *mci) +{ + struct dmc520_edac *pvt = mci->pvt_info; + struct csrow_info *csi; + struct dimm_info *dimm; + u32 pages_per_rank; + enum dev_type dt; + enum mem_type mt; + int row, ch; + u64 rs; + + dt = dmc520_get_dtype(pvt); + mt = dmc520_get_mtype(pvt); + rs = dmc520_get_rank_size(pvt); + pages_per_rank = rs >> PAGE_SHIFT; + + for (row = 0; row < mci->nr_csrows; row++) { + csi = mci->csrows[row]; + + for (ch = 0; ch < csi->nr_channels; ch++) { + dimm = csi->channels[ch]->dimm; + dimm->grain = pvt->mem_width_in_bytes; + dimm->dtype = dt; + dimm->mtype = mt; + dimm->edac_mode = EDAC_FLAG_SECDED; + dimm->nr_pages = pages_per_rank / csi->nr_channels; + } + } +} + +static int dmc520_edac_probe(struct platform_device *pdev) +{ + bool registered[NUMBER_OF_IRQS] = { false }; + int irqs[NUMBER_OF_IRQS] = { -ENXIO }; + int masks[NUMBER_OF_IRQS] = { 0 }; + struct edac_mc_layer layers[1]; + struct dmc520_edac *pvt = NULL; + struct mem_ctl_info *mci; + void __iomem *reg_base; + u32 irq_mask_all = 0; + struct resource *res; + struct device *dev; + int ret, idx, irq; + u32 reg_val; + + /* Parse the device node */ + dev = &pdev->dev; + + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + irq = platform_get_irq_byname(pdev, dmc520_irq_configs[idx].name); + irqs[idx] = irq; + masks[idx] = dmc520_irq_configs[idx].mask; + if (irq >= 0) { + irq_mask_all |= dmc520_irq_configs[idx].mask; + edac_dbg(0, "Discovered %s, irq: %d.\n", dmc520_irq_configs[idx].name, irq); + } + } + + if (!irq_mask_all) { + edac_printk(KERN_ERR, EDAC_MOD_NAME, + "At least one valid interrupt line is expected.\n"); + return -EINVAL; + } + + /* Initialize dmc520 edac */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + reg_base = devm_ioremap_resource(dev, res); + if (IS_ERR(reg_base)) + return PTR_ERR(reg_base); + + if (!dmc520_is_ecc_enabled(reg_base)) + return -ENXIO; + + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = dmc520_get_rank_count(reg_base); + layers[0].is_virt_csrow = true; + + mci = edac_mc_alloc(dmc520_mc_idx++, ARRAY_SIZE(layers), layers, sizeof(*pvt)); + if (!mci) { + edac_printk(KERN_ERR, EDAC_MOD_NAME, + "Failed to allocate memory for mc instance\n"); + ret = -ENOMEM; + goto err; + } + + pvt = mci->pvt_info; + + pvt->reg_base = reg_base; + spin_lock_init(&pvt->error_lock); + memcpy(pvt->irqs, irqs, sizeof(irqs)); + memcpy(pvt->masks, masks, sizeof(masks)); + + platform_set_drvdata(pdev, mci); + + mci->pdev = dev; + mci->mtype_cap = MEM_FLAG_DDR3 | MEM_FLAG_DDR4; + mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; + mci->edac_cap = EDAC_FLAG_SECDED; + mci->scrub_cap = SCRUB_FLAG_HW_SRC; + mci->scrub_mode = dmc520_get_scrub_type(pvt); + mci->ctl_name = EDAC_CTL_NAME; + mci->dev_name = dev_name(mci->pdev); + mci->mod_name = EDAC_MOD_NAME; + + edac_op_state = EDAC_OPSTATE_INT; + + pvt->mem_width_in_bytes = dmc520_get_memory_width(pvt); + + dmc520_init_csrow(mci); + + /* Clear interrupts, not affecting other unrelated interrupts */ + reg_val = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_CONTROL); + dmc520_write_reg(pvt, reg_val & (~irq_mask_all), + REG_OFFSET_INTERRUPT_CONTROL); + dmc520_write_reg(pvt, irq_mask_all, REG_OFFSET_INTERRUPT_CLR); + + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + irq = irqs[idx]; + if (irq >= 0) { + ret = devm_request_irq(&pdev->dev, irq, + dmc520_isr, IRQF_SHARED, + dev_name(&pdev->dev), mci); + if (ret < 0) { + edac_printk(KERN_ERR, EDAC_MC, + "Failed to request irq %d\n", irq); + goto err; + } + registered[idx] = true; + } + } + + /* Reset DRAM CE/UE counters */ + if (irq_mask_all & DRAM_ECC_INT_CE_BIT) + dmc520_get_dram_ecc_error_count(pvt, true); + + if (irq_mask_all & DRAM_ECC_INT_UE_BIT) + dmc520_get_dram_ecc_error_count(pvt, false); + + ret = edac_mc_add_mc(mci); + if (ret) { + edac_printk(KERN_ERR, EDAC_MOD_NAME, + "Failed to register with EDAC core\n"); + goto err; + } + + /* Enable interrupts, not affecting other unrelated interrupts */ + dmc520_write_reg(pvt, reg_val | irq_mask_all, + REG_OFFSET_INTERRUPT_CONTROL); + + return 0; + +err: + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + if (registered[idx]) + devm_free_irq(&pdev->dev, pvt->irqs[idx], mci); + } + if (mci) + edac_mc_free(mci); + + return ret; +} + +static int dmc520_edac_remove(struct platform_device *pdev) +{ + u32 reg_val, idx, irq_mask_all = 0; + struct mem_ctl_info *mci; + struct dmc520_edac *pvt; + + mci = platform_get_drvdata(pdev); + pvt = mci->pvt_info; + + /* Disable interrupts */ + reg_val = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_CONTROL); + dmc520_write_reg(pvt, reg_val & (~irq_mask_all), + REG_OFFSET_INTERRUPT_CONTROL); + + /* free irq's */ + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + if (pvt->irqs[idx] >= 0) { + irq_mask_all |= pvt->masks[idx]; + devm_free_irq(&pdev->dev, pvt->irqs[idx], mci); + } + } + + edac_mc_del_mc(&pdev->dev); + edac_mc_free(mci); + + return 0; +} + +static const struct of_device_id dmc520_edac_driver_id[] = { + { .compatible = "arm,dmc-520", }, + { /* end of table */ } +}; + +MODULE_DEVICE_TABLE(of, dmc520_edac_driver_id); + +static struct platform_driver dmc520_edac_driver = { + .driver = { + .name = "dmc520", + .of_match_table = dmc520_edac_driver_id, + }, + + .probe = dmc520_edac_probe, + .remove = dmc520_edac_remove +}; + +module_platform_driver(dmc520_edac_driver); + +MODULE_AUTHOR("Rui Zhao "); +MODULE_AUTHOR("Lei Wang "); +MODULE_AUTHOR("Shiping Ji "); +MODULE_DESCRIPTION("DMC-520 ECC driver"); +MODULE_LICENSE("GPL v2"); From 2fb3f6e12510b02cf17b7c394fc34359bb4bfd32 Mon Sep 17 00:00:00 2001 From: Sherry Sun Date: Mon, 16 Mar 2020 21:34:39 +0800 Subject: [PATCH 15/16] EDAC/synopsys: Do not dump uninitialized pinf->col On the ZynqMP platform, zynqmp_get_error_info() is used to read out error information. In this function, the pinf->col parameter is not used (it is only used by the Zynq platform's zynq_get_error_info()). So there's no need to print pinf->col on ZynqMP. In order to differentiate on which platform handle_error() is executed, use DDR_ECC_INTR_SUPPORT as the check condition to distinguish between Zynq and ZynqMP platforms. [ bp: Massage. ] Fixes: b500b4a029d57 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller") Signed-off-by: Sherry Sun Signed-off-by: Borislav Petkov Reviewed-by: Manish Narani Link: https://lkml.kernel.org/r/1584365679-27443-1-git-send-email-sherry.sun@nxp.com --- drivers/edac/synopsys_edac.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 880ffd833718..12211dc040e8 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -477,16 +477,16 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p) if (p->ce_cnt) { pinf = &p->ceinfo; - if (!priv->p_data->quirks) { + if (priv->p_data->quirks & DDR_ECC_INTR_SUPPORT) { snprintf(priv->message, SYNPS_EDAC_MSG_SIZE, - "DDR ECC error type:%s Row %d Bank %d Col %d Bit Position: %d Data: 0x%08x", - "CE", pinf->row, pinf->bank, pinf->col, + "DDR ECC error type:%s Row %d Bank %d BankGroup Number %d Block Number %d Bit Position: %d Data: 0x%08x", + "CE", pinf->row, pinf->bank, + pinf->bankgrpnr, pinf->blknr, pinf->bitpos, pinf->data); } else { snprintf(priv->message, SYNPS_EDAC_MSG_SIZE, - "DDR ECC error type:%s Row %d Bank %d Col %d BankGroup Number %d Block Number %d Bit Position: %d Data: 0x%08x", + "DDR ECC error type:%s Row %d Bank %d Col %d Bit Position: %d Data: 0x%08x", "CE", pinf->row, pinf->bank, pinf->col, - pinf->bankgrpnr, pinf->blknr, pinf->bitpos, pinf->data); } @@ -497,15 +497,15 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p) if (p->ue_cnt) { pinf = &p->ueinfo; - if (!priv->p_data->quirks) { + if (priv->p_data->quirks & DDR_ECC_INTR_SUPPORT) { snprintf(priv->message, SYNPS_EDAC_MSG_SIZE, - "DDR ECC error type :%s Row %d Bank %d Col %d ", - "UE", pinf->row, pinf->bank, pinf->col); + "DDR ECC error type :%s Row %d Bank %d BankGroup Number %d Block Number %d", + "UE", pinf->row, pinf->bank, + pinf->bankgrpnr, pinf->blknr); } else { snprintf(priv->message, SYNPS_EDAC_MSG_SIZE, - "DDR ECC error type :%s Row %d Bank %d Col %d BankGroup Number %d Block Number %d", - "UE", pinf->row, pinf->bank, pinf->col, - pinf->bankgrpnr, pinf->blknr); + "DDR ECC error type :%s Row %d Bank %d Col %d ", + "UE", pinf->row, pinf->bank, pinf->col); } edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, From 215a423cc0799b054bdabbc5ba68a7178924621d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Mar 2020 08:17:28 +0100 Subject: [PATCH 16/16] EDAC/armada_xp: Use scnprintf() for avoiding potential buffer overflow Since snprintf() returns the would-be-output size instead of the actual output size, the succeeding calls may go beyond the given buffer limit. Fix it by replacing with scnprintf(). Signed-off-by: Takashi Iwai Signed-off-by: Borislav Petkov Reviewed-by: Jan Luebbe Link: https://lkml.kernel.org/r/20200311071728.4541-1-tiwai@suse.de --- drivers/edac/armada_xp_edac.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/edac/armada_xp_edac.c b/drivers/edac/armada_xp_edac.c index 7f227bdcbc84..a7502ebe9bdc 100644 --- a/drivers/edac/armada_xp_edac.c +++ b/drivers/edac/armada_xp_edac.c @@ -429,26 +429,26 @@ static void aurora_l2_check(struct edac_device_ctl_info *dci) src = (attr_cap & AURORA_ERR_ATTR_SRC_MSK) >> AURORA_ERR_ATTR_SRC_OFF; if (src <= 3) - len += snprintf(msg+len, size-len, "src=CPU%d ", src); + len += scnprintf(msg+len, size-len, "src=CPU%d ", src); else - len += snprintf(msg+len, size-len, "src=IO "); + len += scnprintf(msg+len, size-len, "src=IO "); txn = (attr_cap & AURORA_ERR_ATTR_TXN_MSK) >> AURORA_ERR_ATTR_TXN_OFF; switch (txn) { case 0: - len += snprintf(msg+len, size-len, "txn=Data-Read "); + len += scnprintf(msg+len, size-len, "txn=Data-Read "); break; case 1: - len += snprintf(msg+len, size-len, "txn=Isn-Read "); + len += scnprintf(msg+len, size-len, "txn=Isn-Read "); break; case 2: - len += snprintf(msg+len, size-len, "txn=Clean-Flush "); + len += scnprintf(msg+len, size-len, "txn=Clean-Flush "); break; case 3: - len += snprintf(msg+len, size-len, "txn=Eviction "); + len += scnprintf(msg+len, size-len, "txn=Eviction "); break; case 4: - len += snprintf(msg+len, size-len, + len += scnprintf(msg+len, size-len, "txn=Read-Modify-Write "); break; } @@ -456,19 +456,19 @@ static void aurora_l2_check(struct edac_device_ctl_info *dci) err = (attr_cap & AURORA_ERR_ATTR_ERR_MSK) >> AURORA_ERR_ATTR_ERR_OFF; switch (err) { case 0: - len += snprintf(msg+len, size-len, "err=CorrECC "); + len += scnprintf(msg+len, size-len, "err=CorrECC "); break; case 1: - len += snprintf(msg+len, size-len, "err=UnCorrECC "); + len += scnprintf(msg+len, size-len, "err=UnCorrECC "); break; case 2: - len += snprintf(msg+len, size-len, "err=TagParity "); + len += scnprintf(msg+len, size-len, "err=TagParity "); break; } - len += snprintf(msg+len, size-len, "addr=0x%x ", addr_cap & AURORA_ERR_ADDR_CAP_ADDR_MASK); - len += snprintf(msg+len, size-len, "index=0x%x ", (way_cap & AURORA_ERR_WAY_IDX_MSK) >> AURORA_ERR_WAY_IDX_OFF); - len += snprintf(msg+len, size-len, "way=0x%x", (way_cap & AURORA_ERR_WAY_CAP_WAY_MASK) >> AURORA_ERR_WAY_CAP_WAY_OFFSET); + len += scnprintf(msg+len, size-len, "addr=0x%x ", addr_cap & AURORA_ERR_ADDR_CAP_ADDR_MASK); + len += scnprintf(msg+len, size-len, "index=0x%x ", (way_cap & AURORA_ERR_WAY_IDX_MSK) >> AURORA_ERR_WAY_IDX_OFF); + len += scnprintf(msg+len, size-len, "way=0x%x", (way_cap & AURORA_ERR_WAY_CAP_WAY_MASK) >> AURORA_ERR_WAY_CAP_WAY_OFFSET); /* clear error capture registers */ writel(AURORA_ERR_ATTR_CAP_VALID, drvdata->base + AURORA_ERR_ATTR_CAP_REG);