mirror of
https://github.com/torvalds/linux.git
synced 2025-01-01 07:42:07 +00:00
55babd8f41
On Intel systems corrected machine check interrupts (CMCI) may be sent to multiple logical processors; possibly to all processors on the affected socket (SDM Volume 3B "15.5.1 CMCI Local APIC Interface"). This means that a persistent error (such as a stuck bit in ECC memory) may cause a storm of interrupts that greatly hinders or prevents forward progress (probably on many processors). To solve this we keep track of the rate at which each processor sees CMCI. If we exceed a threshold, we disable CMCI delivery and switch to polling the machine check banks. If the storm subsides (none of the affected processors see any more errors for a complete poll interval) we re-enable CMCI. [Tony: Added console messages when storm begins/ends and increased storm threshold from 5 to 15 so we have a few more logged entries before we disable interrupts and start dropping reports] Signed-off-by: Chen Gong <gong.chen@linux.intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Chen Gong <gong.chen@linux.intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
66 lines
1.5 KiB
C
66 lines
1.5 KiB
C
#include <linux/device.h>
|
|
#include <asm/mce.h>
|
|
|
|
/*
 * Severity grades for a machine check event.
 *
 * No explicit values are assigned, so the enumerators are numbered 0..6 in
 * declaration order and can be compared numerically.
 * NOTE(review): the order looks like "least severe first" - confirm against
 * the code that grades events before relying on relational comparisons.
 */
enum severity_level {
	MCE_NO_SEVERITY,
	MCE_KEEP_SEVERITY,
	MCE_SOME_SEVERITY,
	MCE_AO_SEVERITY,
	MCE_UC_SEVERITY,
	MCE_AR_SEVERITY,
	MCE_PANIC_SEVERITY,
};
|
|
|
|
#define ATTR_LEN 16
|
|
|
|
/* One object for each MCE bank, shared by all CPUs */
|
|
struct mce_bank {
|
|
u64 ctl; /* subevents to enable */
|
|
unsigned char init; /* initialise bank? */
|
|
struct device_attribute attr; /* device attribute */
|
|
char attrname[ATTR_LEN]; /* attribute name */
|
|
};
|
|
|
|
/*
 * Shared MCE declarations; every definition lives outside this header.
 *
 * mce_severity() takes a machine check record @a, a @tolerant setting and an
 * out-parameter @msg, and returns an int.
 * NOTE(review): the return value is presumably one of enum severity_level and
 * @msg presumably receives a descriptive string - confirm in the definition.
 *
 * mce_get_debugfs_dir() takes no arguments and returns a dentry pointer.
 * NOTE(review): presumably the MCE debugfs directory; whether it can be NULL
 * is not visible here.
 */
int mce_severity(struct mce *a, int tolerant, char **msg);
struct dentry *mce_get_debugfs_dir(void);

extern int mce_ser;

/*
 * NOTE(review): presumably an array of per-bank objects (struct mce_bank is
 * described as "one object for each MCE bank"); its length is not visible here.
 */
extern struct mce_bank *mce_banks;
|
|
|
|
/*
 * Intel-specific hooks.
 *
 * With CONFIG_X86_MCE_INTEL set these are real functions defined elsewhere.
 * Without it:
 *  - mce_intel_adjust_timer is a textual alias for mce_adjust_timer_default,
 *    which this header does not declare; it must be visible wherever the
 *    alias is used;
 *  - mce_intel_cmci_poll() and mce_intel_hcpu_update() are empty inline stubs,
 *    so callers need no #ifdef of their own.
 */
#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
#endif

/*
 * Declared unconditionally; defined elsewhere.
 * NOTE(review): the unit of @interval is not visible from this header.
 */
void mce_timer_kick(unsigned long interval);
|
|
|
|
#ifdef CONFIG_ACPI_APEI
|
|
int apei_write_mce(struct mce *m);
|
|
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
|
|
int apei_check_mce(void);
|
|
int apei_clear_mce(u64 record_id);
|
|
#else
|
|
/*
 * Fallback used when CONFIG_ACPI_APEI is not set: nothing can be written,
 * so @m is ignored and -EINVAL is always returned.
 */
static inline int apei_write_mce(struct mce *m)
{
	return -EINVAL;
}
|
|
/*
 * Fallback used when CONFIG_ACPI_APEI is not set: always returns 0 and
 * touches neither @m nor @record_id.
 */
static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
	return 0;
}
|
|
/* Fallback used when CONFIG_ACPI_APEI is not set: always returns 0. */
static inline int apei_check_mce(void)
{
	return 0;
}
|
|
/*
 * Fallback used when CONFIG_ACPI_APEI is not set: there is nothing to clear,
 * so @record_id is ignored and -EINVAL is always returned.
 */
static inline int apei_clear_mce(u64 record_id)
{
	return -EINVAL;
}
|
|
#endif
|