linux/arch/x86/kernel/cpu/mcheck/mce_intel.c
Tony Luck 4670a300a2 x86/mce: Make cmci_discover() quiet
cmci_discover() works out which machine check banks support CMCI, and
which of those are shared by multiple logical processors. It uses this
information to ensure that exactly one cpu is designated the owner of
each bank so that when interrupts are broadcast to multiple cpus, only one
of them will look in a shared bank to log the error and clear the bank.

At boot time cmci_discover() performs this task silently. But during
certain cpu hotplug operations it prints out a set of summary lines
like this:

CPU 35 MCA banks CMCI:0 CMCI:1 CMCI:3 CMCI:5 CMCI:6 CMCI:7 CMCI:8 CMCI:9 CMCI:10 CMCI:11
CPU 1 MCA banks CMCI:0 CMCI:1 CMCI:3
CPU 39 MCA banks CMCI:0 CMCI:1 CMCI:3
CPU 38 MCA banks CMCI:0 CMCI:1 CMCI:3
CPU 32 MCA banks CMCI:0 CMCI:1 CMCI:3
CPU 37 MCA banks CMCI:0 CMCI:1 CMCI:3
CPU 36 MCA banks CMCI:0 CMCI:1 CMCI:3
CPU 34 MCA banks CMCI:0 CMCI:1 CMCI:3

The value of these messages seems very low. A user might painstakingly
cross-check against the data sheet for a processor to ensure that all
CMCI supported banks are correctly reported, but this seems improbable.
If users really wanted to do this, we should print the information at
boot time too.

Remove the messages.

Signed-off-by: Tony Luck <tony.luck@intel.com>
2012-08-09 10:59:21 -07:00

217 lines
5.1 KiB
C

/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
* Copyright (C) 2008, 2009 Intel Corporation
* Author: Andi Kleen
*/
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
* Normally we pick those up using a regular polling timer.
* Also supports reliable discovery of shared banks.
*/
/*
* Per-CPU bitmap of the banks this CPU has claimed for CMCI. Bits are set
* in cmci_discover() and cleared in cmci_clear(); the interrupt handler and
* cmci_recheck() hand it to machine_check_poll().
*/
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other. cmci_clear() takes it as well
* while it releases banks.
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
/*
* Value OR-ed into a bank's CTL2 MSR, after the threshold field has been
* cleared, when cmci_discover() enables CMCI for that bank.
*/
#define CMCI_THRESHOLD 1
/*
 * Decide whether CMCI can be used on this CPU.
 *
 * @banks: out parameter; receives the bank count taken from the low byte
 *         of MCG_CAP, clamped to MAX_NR_BANKS. It is written only when
 *         none of the early checks bail out.
 *
 * Returns 1 if MCG_CAP has MCG_CMCI_P set, 0 otherwise (including every
 * early bail-out).
 */
static int cmci_supported(int *banks)
{
	u64 mcg_cap;

	/* Either global flag being set rules CMCI out. */
	if (mce_cmci_disabled || mce_ignore_ce)
		return 0;

	/*
	 * Checking the vendor is not strictly required, but the initial
	 * initialization is keyed on the vendor, and this guarantees that
	 * none of the backdoors are entered otherwise.
	 */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 0;

	/* A local APIC with a maximum LVT index of at least 6 is required. */
	if (!cpu_has_apic)
		return 0;
	if (lapic_get_maxlvt() < 6)
		return 0;

	rdmsrl(MSR_IA32_MCG_CAP, mcg_cap);
	*banks = min_t(unsigned, MAX_NR_BANKS, mcg_cap & 0xff);

	return (mcg_cap & MCG_CMCI_P) ? 1 : 0;
}
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
* This could in theory increase the threshold under high load,
* but doesn't for now.
*
* intel_init_cmci() installs this function as mce_threshold_vector.
*/
static void intel_threshold_interrupt(void)
{
/* Poll only the banks recorded in this CPU's mce_banks_owned bitmap. */
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
/* NOTE(review): presumably announces newly logged events - confirm against mce_notify_irq(). */
mce_notify_irq();
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
* banks.
*
* @banks: number of banks to scan; banks 0 .. banks-1 are examined.
*
* Banks already present in this CPU's mce_banks_owned bitmap are skipped.
* For every other bank the CTL2 MSR is read: if CMCI_EN is already set the
* bank is treated as claimed elsewhere and is removed from this CPU's
* mce_poll_banks. Otherwise CMCI_EN and CMCI_THRESHOLD are written and the
* MSR is read back; if the enable bit stuck, the bank is marked as owned
* and removed from mce_poll_banks.
*
* The whole scan runs under cmci_discover_lock with interrupts disabled.
*/
static void cmci_discover(int banks)
{
unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
unsigned long flags;
int i;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
/* This CPU already owns the bank: nothing to do. */
if (test_bit(i, owned))
continue;
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
if (val & MCI_CTL2_CMCI_EN) {
/* The bit was tested clear just above, so this leaves it clear. */
clear_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
/* Replace the threshold field with CMCI_THRESHOLD and request CMCI. */
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Read back to see whether the write took effect. */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
set_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
} else {
/* No CMCI for this bank: warn if it is missing from the poll set. */
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
/*
 * Poll all CMCI-owned banks of this CPU once, in case an event was
 * missed while initialization was still in progress.
 *
 * Does nothing when MCE is unavailable on this CPU or CMCI is not
 * supported. The poll itself runs with local interrupts disabled.
 */
void cmci_recheck(void)
{
	unsigned long irq_state;
	int nr_banks;	/* filled in by cmci_supported(), otherwise unused */

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	if (!cmci_supported(&nr_banks))
		return;

	local_irq_save(irq_state);
	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
	local_irq_restore(irq_state);
}
/*
 * Give up every bank this CPU owns when the CPU goes down: CMCI is
 * switched off in the bank's CTL2 MSR and the bank is dropped from the
 * per-CPU ownership bitmap, so that another CPU can claim it on
 * rediscovery.
 *
 * Runs under cmci_discover_lock with interrupts disabled.
 */
void cmci_clear(void)
{
	unsigned long *owned;
	unsigned long irq_state;
	int nr_banks;
	int bank;

	if (!cmci_supported(&nr_banks))
		return;

	raw_spin_lock_irqsave(&cmci_discover_lock, irq_state);

	/* Look the bitmap up only once we hold the lock with IRQs off. */
	owned = (void *)&__get_cpu_var(mce_banks_owned);

	for (bank = 0; bank < nr_banks; bank++) {
		u64 ctl2;

		if (test_bit(bank, owned)) {
			/* Clear both the enable bit and the threshold field. */
			rdmsrl(MSR_IA32_MCx_CTL2(bank), ctl2);
			ctl2 &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
			wrmsrl(MSR_IA32_MCx_CTL2(bank), ctl2);

			__clear_bit(bank, owned);
		}
	}

	raw_spin_unlock_irqrestore(&cmci_discover_lock, irq_state);
}
/*
 * After a CPU has gone down, visit every other online CPU and run CMCI
 * discovery there again.
 *
 * @dying: number of the CPU that is going away; it is skipped.
 *
 * Must run in process context: the current task is moved from CPU to
 * CPU by changing its allowed-CPU mask, which is put back at the end.
 * Returns silently if CMCI is unsupported or the mask allocation fails.
 */
void cmci_rediscover(int dying)
{
	cpumask_var_t saved_mask;
	int nr_banks;
	int cpu;

	if (!cmci_supported(&nr_banks))
		return;
	if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL))
		return;

	/* Remember the caller's affinity so it can be restored afterwards. */
	cpumask_copy(saved_mask, &current->cpus_allowed);

	for_each_online_cpu(cpu) {
		/* Skip the dying CPU and any CPU we fail to move to. */
		if (cpu == dying ||
		    set_cpus_allowed_ptr(current, cpumask_of(cpu)))
			continue;

		/* Query again here: not every CPU need have the same banks. */
		if (!cmci_supported(&nr_banks))
			continue;

		cmci_discover(nr_banks);
	}

	set_cpus_allowed_ptr(current, saved_mask);
	free_cpumask_var(saved_mask);
}
/*
 * A CPU-down attempt failed: turn CMCI back on for this CPU by running
 * bank discovery again. Does nothing when CMCI is not supported.
 */
void cmci_reenable(void)
{
	int nr_banks;

	if (!cmci_supported(&nr_banks))
		return;

	cmci_discover(nr_banks);
}
/*
 * Bring up CMCI on the current CPU: install the threshold interrupt
 * handler, claim banks, program the CMCI LVT entry and finally poll the
 * owned banks once. Does nothing when CMCI is not supported.
 */
static void intel_init_cmci(void)
{
	int nr_banks;

	if (!cmci_supported(&nr_banks))
		return;

	/* The handler is installed before any bank gets CMCI enabled. */
	mce_threshold_vector = intel_threshold_interrupt;
	cmci_discover(nr_banks);

	/*
	 * On CPU #0 the APIC is still disabled at this point. That is
	 * fine, since only the vector is being set up here. The banks of
	 * CPU #0 are checked once more later on, so that no event gets
	 * missed.
	 */
	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR | APIC_DM_FIXED);

	cmci_recheck();
}
/*
* Entry point for the Intel specific MCE features in this file.
* Calls intel_init_thermal() with @c, then intel_init_cmci().
*
* @c: CPU descriptor; only forwarded to intel_init_thermal().
*/
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
}