linux/arch/x86/kernel/nmi.c
Steven Rostedt c7d65a78fc x86: Remove cmpxchg from i386 NMI nesting code
I've been informed by someone on LWN called 'slashdot' that
some i386 machines do not support a true cmpxchg. The cmpxchg
used by the i386 NMI nesting code must be a true cmpxchg as
disabling interrupts will not work for NMIs (which is the work
around for i386s that do not have a true cmpxchg).

This 'slashdot' character also suggested a fix to the issue.
As the state of the nesting NMIs goes as follows:

  NOT_RUNNING -> EXECUTING
  EXECUTING   -> NOT_RUNNING
  EXECUTING   -> LATCHED
  LATCHED     -> EXECUTING

Having these states as enum values of:

  NOT_RUNNING = 0
  EXECUTING   = 1
  LATCHED     = 2

Instead of a cmpxchg to make EXECUTING -> NOT_RUNNING a
dec_and_test() would work as well. If the dec_and_test brings
the state to NOT_RUNNING, that is the same as a cmpxchg
succeeding to change EXECUTING to NOT_RUNNING. If a nested NMI
were to come in and change it to LATCHED, the dec_and_test() would
convert the state to EXECUTING (what we want it to be in such a
case anyway).

I asked 'slashdot' to post this as a patch, but it never came to
be. I decided to do the work instead.

Thanks to H. Peter Anvin for suggesting to use this_cpu_dec_and_return()
instead of local_dec_and_test(&__get_cpu_var()).

Link: http://lwn.net/Articles/484932/

Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2012-06-08 18:48:05 -04:00

500 lines
14 KiB
C

/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
* Copyright (C) 2011 Don Zickus Red Hat, Inc.
*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
/*
* Handle hardware traps and faults.
*/
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/nmi.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/slab.h>
#include <linux/export.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
#include <linux/atomic.h>
#include <asm/traps.h>
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
struct nmi_desc {
spinlock_t lock;
struct list_head head;
};
static struct nmi_desc nmi_desc[NMI_MAX] =
{
{
.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
.head = LIST_HEAD_INIT(nmi_desc[0].head),
},
{
.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
.head = LIST_HEAD_INIT(nmi_desc[1].head),
},
{
.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
.head = LIST_HEAD_INIT(nmi_desc[2].head),
},
{
.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
.head = LIST_HEAD_INIT(nmi_desc[3].head),
},
};
struct nmi_stats {
unsigned int normal;
unsigned int unknown;
unsigned int external;
unsigned int swallow;
};
static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
static int ignore_nmis;
int unknown_nmi_panic;
/*
* Prevent NMI reason port (0x61) being accessed simultaneously, can
* only be used in NMI handler.
*/
static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
static int __init setup_unknown_nmi_panic(char *str)
{
unknown_nmi_panic = 1;
return 1;
}
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
int handled=0;
rcu_read_lock();
/*
* NMIs are edge-triggered, which means if you have enough
* of them concurrently, you can lose some because only one
* can be latched at any given time. Walk the whole list
* to handle those situations.
*/
list_for_each_entry_rcu(a, &desc->head, list)
handled += a->handler(type, regs);
rcu_read_unlock();
/* return total number of NMI events handled */
return handled;
}
int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
struct nmi_desc *desc = nmi_to_desc(type);
unsigned long flags;
if (!action->handler)
return -EINVAL;
spin_lock_irqsave(&desc->lock, flags);
/*
* most handlers of type NMI_UNKNOWN never return because
* they just assume the NMI is theirs. Just a sanity check
* to manage expectations
*/
WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
/*
* some handlers need to be executed first otherwise a fake
* event confuses some handlers (kdump uses this flag)
*/
if (action->flags & NMI_FLAG_FIRST)
list_add_rcu(&action->list, &desc->head);
else
list_add_tail_rcu(&action->list, &desc->head);
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
EXPORT_SYMBOL(__register_nmi_handler);
void unregister_nmi_handler(unsigned int type, const char *name)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *n;
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
list_for_each_entry_rcu(n, &desc->head, list) {
/*
* the name passed in to describe the nmi handler
* is used as the lookup key
*/
if (!strcmp(n->name, name)) {
WARN(in_nmi(),
"Trying to free NMI (%s) from NMI context!\n", n->name);
list_del_rcu(&n->list);
break;
}
}
spin_unlock_irqrestore(&desc->lock, flags);
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
static __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
/* check to see if anyone registered against these types of errors */
if (nmi_handle(NMI_SERR, regs, false))
return;
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
/*
* On some machines, PCI SERR line is used to report memory
* errors. EDAC makes use of it.
*/
#if defined(CONFIG_EDAC)
if (edac_handler_set()) {
edac_atomic_assert_error();
return;
}
#endif
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
/* Clear and disable the PCI SERR error line. */
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
outb(reason, NMI_REASON_PORT);
}
static __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
/* check to see if anyone registered against these types of errors */
if (nmi_handle(NMI_IO_CHECK, regs, false))
return;
pr_emerg(
"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
show_regs(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
/* Re-enable the IOCK line, wait for a few seconds */
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
i = 20000;
while (--i) {
touch_nmi_watchdog();
udelay(100);
}
reason &= ~NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
}
static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
/*
* Use 'false' as back-to-back NMIs are dealt with one level up.
* Of course this makes having multiple 'unknown' handlers useless
* as only the first one is ever run (unless it can actually determine
* if it caused the NMI)
*/
handled = nmi_handle(NMI_UNKNOWN, regs, false);
if (handled) {
__this_cpu_add(nmi_stats.unknown, handled);
return;
}
__this_cpu_add(nmi_stats.unknown, 1);
pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
reason, smp_processor_id());
pr_emerg("Do you have a strange power saving mode enabled?\n");
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
}
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
static __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
bool b2b = false;
/*
* CPU-specific NMI must be processed before non-CPU-specific
* NMI, otherwise we may lose it, because the CPU-specific
* NMI can not be detected/processed on other CPUs.
*/
/*
* Back-to-back NMIs are interesting because they can either
* be two NMI or more than two NMIs (any thing over two is dropped
* due to NMI being edge-triggered). If this is the second half
* of the back-to-back NMI, assume we dropped things and process
* more handlers. Otherwise reset the 'swallow' NMI behaviour
*/
if (regs->ip == __this_cpu_read(last_nmi_rip))
b2b = true;
else
__this_cpu_write(swallow_nmi, false);
__this_cpu_write(last_nmi_rip, regs->ip);
handled = nmi_handle(NMI_LOCAL, regs, b2b);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
/*
* There are cases when a NMI handler handles multiple
* events in the current NMI. One of these events may
* be queued for in the next NMI. Because the event is
* already handled, the next NMI will result in an unknown
* NMI. Instead lets flag this for a potential NMI to
* swallow.
*/
if (handled > 1)
__this_cpu_write(swallow_nmi, true);
return;
}
/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
raw_spin_lock(&nmi_reason_lock);
reason = x86_platform.get_nmi_reason();
if (reason & NMI_REASON_MASK) {
if (reason & NMI_REASON_SERR)
pci_serr_error(reason, regs);
else if (reason & NMI_REASON_IOCHK)
io_check_error(reason, regs);
#ifdef CONFIG_X86_32
/*
* Reassert NMI in case it became active
* meanwhile as it's edge-triggered:
*/
reassert_nmi();
#endif
__this_cpu_add(nmi_stats.external, 1);
raw_spin_unlock(&nmi_reason_lock);
return;
}
raw_spin_unlock(&nmi_reason_lock);
/*
* Only one NMI can be latched at a time. To handle
* this we may process multiple nmi handlers at once to
* cover the case where an NMI is dropped. The downside
* to this approach is we may process an NMI prematurely,
* while its real NMI is sitting latched. This will cause
* an unknown NMI on the next run of the NMI processing.
*
* We tried to flag that condition above, by setting the
* swallow_nmi flag when we process more than one event.
* This condition is also only present on the second half
* of a back-to-back NMI, so we flag that condition too.
*
* If both are true, we assume we already processed this
* NMI previously and we swallow it. Otherwise we reset
* the logic.
*
* There are scenarios where we may accidentally swallow
* a 'real' unknown NMI. For example, while processing
* a perf NMI another perf NMI comes in along with a
* 'real' unknown NMI. These two NMIs get combined into
* one (as descibed above). When the next NMI gets
* processed, it will be flagged by perf as handled, but
* noone will know that there was a 'real' unknown NMI sent
* also. As a result it gets swallowed. Or if the first
* perf NMI returns two events handled then the second
* NMI will get eaten by the logic below, again losing a
* 'real' unknown NMI. But this is the best we can do
* for now.
*/
if (b2b && __this_cpu_read(swallow_nmi))
__this_cpu_add(nmi_stats.swallow, 1);
else
unknown_nmi_error(reason, regs);
}
/*
* NMIs can hit breakpoints which will cause it to lose its
* NMI context with the CPU when the breakpoint does an iret.
*/
#ifdef CONFIG_X86_32
/*
* For i386, NMIs use the same stack as the kernel, and we can
* add a workaround to the iret problem in C (preventing nested
* NMIs if an NMI takes a trap). Simply have 3 states the NMI
* can be in:
*
* 1) not running
* 2) executing
* 3) latched
*
* When no NMI is in progress, it is in the "not running" state.
* When an NMI comes in, it goes into the "executing" state.
* Normally, if another NMI is triggered, it does not interrupt
* the running NMI and the HW will simply latch it so that when
* the first NMI finishes, it will restart the second NMI.
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
* If an NMI hits a breakpoint that executes an iret, another
* NMI can preempt it. We do not want to allow this new NMI
* to run, but we want to execute it when the first one finishes.
* We set the state to "latched", and the exit of the first NMI will
* perform a dec_return, if the result is zero (NOT_RUNNING), then
* it will simply exit the NMI handler. If not, the dec_return
* would have set the state to NMI_EXECUTING (what we want it to
* be when we are running). In this case, we simply jump back
* to rerun the NMI handler again, and restart the 'latched' NMI.
*
* No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING
* and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
* at this point.
*/
enum nmi_states {
NMI_NOT_RUNNING = 0,
NMI_EXECUTING,
NMI_LATCHED,
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
#define nmi_nesting_preprocess(regs) \
do { \
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
this_cpu_write(nmi_state, NMI_LATCHED); \
return; \
} \
this_cpu_write(nmi_state, NMI_EXECUTING); \
} while (0); \
nmi_restart:
#define nmi_nesting_postprocess() \
do { \
if (this_cpu_dec_return(nmi_state)) \
goto nmi_restart; \
} while (0)
#else /* x86_64 */
/*
* In x86_64 things are a bit more difficult. This has the same problem
* where an NMI hitting a breakpoint that calls iret will remove the
* NMI context, allowing a nested NMI to enter. What makes this more
* difficult is that both NMIs and breakpoints have their own stack.
* When a new NMI or breakpoint is executed, the stack is set to a fixed
* point. If an NMI is nested, it will have its stack set at that same
* fixed address that the first NMI had, and will start corrupting the
* stack. This is handled in entry_64.S, but the same problem exists with
* the breakpoint stack.
*
* If a breakpoint is being processed, and the debug stack is being used,
* if an NMI comes in and also hits a breakpoint, the stack pointer
* will be set to the same fixed address as the breakpoint that was
* interrupted, causing that stack to be corrupted. To handle this case,
* check if the stack that was interrupted is the debug stack, and if
* so, change the IDT so that new breakpoints will use the current stack
* and not switch to the fixed address. On return of the NMI, switch back
* to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
static inline void nmi_nesting_preprocess(struct pt_regs *regs)
{
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
* change the IDT such that breakpoints that happen here
* continue to use the NMI stack.
*/
if (unlikely(is_debug_stack(regs->sp))) {
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
}
static inline void nmi_nesting_postprocess(void)
{
if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
this_cpu_write(update_debug_stack, 0);
}
}
#endif
dotraplinkage notrace __kprobes void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_nesting_preprocess(regs);
nmi_enter();
inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
nmi_exit();
/* On i386, may loop back to preprocess */
nmi_nesting_postprocess();
}
void stop_nmi(void)
{
ignore_nmis++;
}
void restart_nmi(void)
{
ignore_nmis--;
}
/* reset the back-to-back NMI logic */
void local_touch_nmi(void)
{
__this_cpu_write(last_nmi_rip, 0);
}