A couple of issues here: 1) MCE_LOG_LEN is only 32 - so we may have more pending records than will fit in the buffer on high core count CPUs. 2) During a panic we may have a lot of duplicate records because multiple logical CPUs may have seen and logged the same error because some banks are shared. Switch to using the genpool to look for the pending records. Squeeze out duplicated records. Signed-off-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Borislav Petkov <bp@suse.de> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Ashok Raj <ashok.raj@intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-edac <linux-edac@vger.kernel.org> Link: http://lkml.kernel.org/r/1462019637-16474-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar <mingo@kernel.org>
		
			
				
	
	
		
			99 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			99 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #include <linux/device.h>
 | |
| #include <asm/mce.h>
 | |
| 
 | |
/*
 * Severity grades that can be attached to a machine check event.
 *
 * The enumerators are numbered implicitly starting at zero, so their
 * relative order determines their numeric values; do not reorder them.
 * NOTE(review): presumably ordered from least to most severe - confirm
 * against the code that grades events.
 */
enum severity_level {
	MCE_NO_SEVERITY,
	MCE_DEFERRED_SEVERITY,
	/* UCNA is an alias: it shares its numeric value with DEFERRED. */
	MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
	MCE_KEEP_SEVERITY,
	MCE_SOME_SEVERITY,
	MCE_AO_SEVERITY,
	MCE_UC_SEVERITY,
	MCE_AR_SEVERITY,
	MCE_PANIC_SEVERITY,
};
 | |
| 
 | |
| extern struct atomic_notifier_head x86_mce_decoder_chain;
 | |
| 
 | |
| #define ATTR_LEN		16
 | |
/*
 * Initial check interval: 5 minutes.  The replacement list is parenthesized
 * so the macro expands as a single value in any expression; without the
 * parentheses "x / INITIAL_CHECK_INTERVAL" would parse as "(x / 5) * 60".
 */
#define INITIAL_CHECK_INTERVAL	(5 * 60)
 | |
| 
 | |
| /* One object for each MCE bank, shared by all CPUs */
 | |
| struct mce_bank {
 | |
| 	u64			ctl;			/* subevents to enable */
 | |
| 	unsigned char init;				/* initialise bank? */
 | |
| 	struct device_attribute attr;			/* device attribute */
 | |
| 	char			attrname[ATTR_LEN];	/* attribute name */
 | |
| };
 | |
| 
 | |
| struct mce_evt_llist {
 | |
| 	struct llist_node llnode;
 | |
| 	struct mce mce;
 | |
| };
 | |
| 
 | |
| void mce_gen_pool_process(void);
 | |
| bool mce_gen_pool_empty(void);
 | |
| int mce_gen_pool_add(struct mce *mce);
 | |
| int mce_gen_pool_init(void);
 | |
| struct llist_node *mce_gen_pool_prepare_records(void);
 | |
| 
 | |
| extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
 | |
| struct dentry *mce_get_debugfs_dir(void);
 | |
| 
 | |
| extern struct mce_bank *mce_banks;
 | |
| extern mce_banks_t mce_banks_ce_disabled;
 | |
| 
 | |
#ifdef CONFIG_X86_MCE_INTEL
/*
 * Intel-specific hooks. Only the prototypes live here; the definitions are
 * not part of this header.
 */
unsigned long cmci_intel_adjust_timer(unsigned long interval);
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
/*
 * Fallbacks used when CONFIG_X86_MCE_INTEL is not set: timer adjustment is
 * redirected to mce_adjust_timer_default and the remaining hooks collapse
 * into stubs that do nothing.
 */
# define cmci_intel_adjust_timer mce_adjust_timer_default

/* Stub: always reports false. */
static inline bool mce_intel_cmci_poll(void)
{
	return false;
}

/* Stub: no-op. */
static inline void mce_intel_hcpu_update(unsigned long cpu)
{
}

/* Stub: no-op. */
static inline void cmci_disable_bank(int bank)
{
}
#endif
 | |
| 
 | |
| void mce_timer_kick(unsigned long interval);
 | |
| 
 | |
| #ifdef CONFIG_ACPI_APEI
 | |
| int apei_write_mce(struct mce *m);
 | |
| ssize_t apei_read_mce(struct mce *m, u64 *record_id);
 | |
| int apei_check_mce(void);
 | |
| int apei_clear_mce(u64 record_id);
 | |
| #else
 | |
| static inline int apei_write_mce(struct mce *m)
 | |
| {
 | |
| 	return -EINVAL;
 | |
| }
 | |
| static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
 | |
| {
 | |
| 	return 0;
 | |
| }
 | |
| static inline int apei_check_mce(void)
 | |
| {
 | |
| 	return 0;
 | |
| }
 | |
| static inline int apei_clear_mce(u64 record_id)
 | |
| {
 | |
| 	return -EINVAL;
 | |
| }
 | |
| #endif
 | |
| 
 | |
| void mce_inject_log(struct mce *m);
 | |
| 
 | |
| /*
 | |
|  * We consider records to be equivalent if bank+status+addr+misc all match.
 | |
|  * This is only used when the system is going down because of a fatal error
 | |
|  * to avoid cluttering the console log with essentially repeated information.
 | |
|  * In normal processing all errors seen are logged.
 | |
|  */
 | |
| static inline bool mce_cmp(struct mce *m1, struct mce *m2)
 | |
| {
 | |
| 	return m1->bank != m2->bank ||
 | |
| 		m1->status != m2->status ||
 | |
| 		m1->addr != m2->addr ||
 | |
| 		m1->misc != m2->misc;
 | |
| }
 |