Option to let the bios set per-bank CMCI thresholds so they can
filter noisy error sources at a fine grained level based on platform
specific knowledge.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.11 (GNU/Linux)
 
 iQIcBAABAgAGBQJQZI6dAAoJEKurIx+X31iB2EQP/jnUqMWjmgwqpwFwag8ogrXq
 92EU9phTnyQFCQULpIPSPlKgqU9WqPdPUq3c31ZOE6QKx7OhCfj6N5ZBGRgYhhGO
 zvBuj6cpGfmW448cVQnjhZ9uiIfPZPMXgGun+b1cYI4rDcjtBScNB6D5SuCPtWyx
 pisLOzvbd/Vnhd7XsLnFtKFSZ764vh2+yLhFLH6rOjUqyv2gHjAkqdJKor1ofzJ2
 akwtOJcvm43MyxXOnJeENJ1tjx4DNp+/dl8dIEO/AvBy+zasfpkNNFn1nlMx7t9w
 H2KbSRUkmQ4sXZ7VTrIIYw79GtNTL8okYVy3G5k39ISIp4ZT8IYf4YwosuAUmpaL
 ivaKm/ifpd5yW661izIJuQOJqv/cZYfXJJpinGsTLvIMJyZpNcq3aC5k0Chyr51p
 L+gYJK6xIQQm/NA6Zsc6KPKcKHSoVhdsSnIRmOCdK8V/SpNbw7P3vVwKUzyATmEV
 f+2Dp24EsgmARmn9oKl7Xyj/jy8fcSovOs6bmG5oDi7aj0IiSIe894k+MiQY7c5W
 a5WWkn0KXzcVMPm19MFP9CLGYSFLLOKGbROzfRLbCgbvD1Ev+o9Ycg1tdMsBA3cX
 L8i6Vf1A9h7WgofX3LynC8hywQUUTJC5SZqlan0rY9JG8+VOR0V6XFVccX65Fxgl
 lv4KYq2zsleo3ntfBE0E
 =n9qI
 -----END PGP SIGNATURE-----

Merge tag 'please-pull-naveen' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/mce

Pull MCE updates from Tony Luck:

 "Option to let the bios set per-bank CMCI thresholds so they can
  filter noisy error sources at a fine grained level based on platform
  specific knowledge."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2012-09-29 10:16:29 +02:00
commit 39ba5010d3
4 changed files with 50 additions and 3 deletions

View File

@ -50,6 +50,13 @@ Machine check
monarchtimeout: monarchtimeout:
Sets the time in us to wait for other CPUs on machine checks. 0 Sets the time in us to wait for other CPUs on machine checks. 0
to disable. to disable.
mce=bios_cmci_threshold
Don't overwrite the bios-set CMCI threshold. This boot option
prevents Linux from overwriting the CMCI threshold set by the
bios. Without this option, Linux always sets the CMCI
threshold to 1. Enabling this may make memory predictive failure
analysis less effective if the bios sets thresholds for memory
errors since we will not see details for all errors.
nomce (for compatibility with i386): same as mce=off nomce (for compatibility with i386): same as mce=off

View File

@ -161,6 +161,7 @@ DECLARE_PER_CPU(struct device *, mce_device);
#ifdef CONFIG_X86_MCE_INTEL #ifdef CONFIG_X86_MCE_INTEL
extern int mce_cmci_disabled; extern int mce_cmci_disabled;
extern int mce_ignore_ce; extern int mce_ignore_ce;
extern int mce_bios_cmci_threshold;
void mce_intel_feature_init(struct cpuinfo_x86 *c); void mce_intel_feature_init(struct cpuinfo_x86 *c);
void cmci_clear(void); void cmci_clear(void);
void cmci_reenable(void); void cmci_reenable(void);

View File

@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly; int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly; int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly; int mce_ser __read_mostly;
int mce_bios_cmci_threshold __read_mostly;
struct mce_bank *mce_banks __read_mostly; struct mce_bank *mce_banks __read_mostly;
@ -1946,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = {
* check, or 0 to not wait * check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting. * mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
*/ */
static int __init mcheck_enable(char *str) static int __init mcheck_enable(char *str)
{ {
@ -1965,6 +1967,8 @@ static int __init mcheck_enable(char *str)
mce_ignore_ce = 1; mce_ignore_ce = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b'); mce_bootlog = (str[0] == 'b');
else if (!strcmp(str, "bios_cmci_threshold"))
mce_bios_cmci_threshold = 1;
else if (isdigit(str[0])) { else if (isdigit(str[0])) {
get_option(&str, &tolerant); get_option(&str, &tolerant);
if (*str == ',') { if (*str == ',') {
@ -2205,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
&mce_cmci_disabled &mce_cmci_disabled
}; };
/*
 * Device attribute "bios_cmci_threshold" backed by the
 * mce_bios_cmci_threshold flag. It is created with mode 0444 and a NULL
 * store callback, so it can only be read; the show callback is
 * device_show_int. The flag itself is set to 1 by mcheck_enable() when
 * the "mce=bios_cmci_threshold" boot option is given.
 */
static struct dev_ext_attribute dev_attr_bios_cmci_threshold = {
__ATTR(bios_cmci_threshold, 0444, device_show_int, NULL),
&mce_bios_cmci_threshold
};
static struct device_attribute *mce_device_attrs[] = { static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr, &dev_attr_tolerant.attr,
&dev_attr_check_interval.attr, &dev_attr_check_interval.attr,
@ -2213,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = {
&dev_attr_dont_log_ce.attr, &dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr, &dev_attr_ignore_ce.attr,
&dev_attr_cmci_disabled.attr, &dev_attr_cmci_disabled.attr,
&dev_attr_bios_cmci_threshold.attr,
NULL NULL
}; };

View File

@ -181,10 +181,12 @@ static void cmci_discover(int banks)
unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
unsigned long flags; unsigned long flags;
int i; int i;
int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags); raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) { for (i = 0; i < banks; i++) {
u64 val; u64 val;
int bios_zero_thresh = 0;
if (test_bit(i, owned)) if (test_bit(i, owned))
continue; continue;
@ -198,8 +200,20 @@ static void cmci_discover(int banks)
continue; continue;
} }
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; if (!mce_bios_cmci_threshold) {
val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
val |= CMCI_THRESHOLD;
} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
* If bios_cmci_threshold boot option was specified
* but the threshold is zero, we'll try to initialize
* it to 1.
*/
bios_zero_thresh = 1;
val |= CMCI_THRESHOLD;
}
val |= MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val); wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val);
@ -207,11 +221,26 @@ static void cmci_discover(int banks)
if (val & MCI_CTL2_CMCI_EN) { if (val & MCI_CTL2_CMCI_EN) {
set_bit(i, owned); set_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks)); __clear_bit(i, __get_cpu_var(mce_poll_banks));
/*
* We are able to set thresholds for some banks that
* had a threshold of 0. This means the BIOS has not
* set the thresholds properly or does not work with
* this boot option. Note down now and report later.
*/
if (mce_bios_cmci_threshold && bios_zero_thresh &&
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
bios_wrong_thresh = 1;
} else { } else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
} }
} }
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mce_bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
pr_info_once(
"bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
}
} }
/* /*
@ -249,7 +278,7 @@ void cmci_clear(void)
continue; continue;
/* Disable CMCI */ /* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val);
val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val); wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned)); __clear_bit(i, __get_cpu_var(mce_banks_owned));
} }