panic, kexec: make __crash_kexec() NMI safe

Attempting to get a crash dump out of a debug PREEMPT_RT kernel via an NMI
panic() doesn't work.  The cause of that lies in the PREEMPT_RT definition
of mutex_trylock():

	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
		return 0;

This prevents an nmi_panic() from executing the main body of
__crash_kexec() which does the actual kexec into the kdump kernel.  The
warning and return are explained by:

  6ce47fd961 ("rtmutex: Warn if trylock is called from hard/softirq context")
  [...]
  The reasons for this are:

      1) There is a potential deadlock in the slowpath

      2) Another cpu which blocks on the rtmutex will boost the task
	 which allegedly locked the rtmutex, but that cannot work
	 because the hard/softirq context borrows the task context.

Furthermore, grabbing the lock isn't NMI safe, so do away with kexec_mutex
and replace it with an atomic variable.  This is somewhat overzealous as
*some* callsites could keep using a mutex (e.g.  the sysfs-facing ones
like crash_shrink_memory()), but this has the benefit of involving a
single unified lock and preventing any future NMI-related surprises.

Tested by triggering NMI panics via:

  $ echo 1 > /proc/sys/kernel/panic_on_unrecovered_nmi
  $ echo 1 > /proc/sys/kernel/unknown_nmi_panic
  $ echo 1 > /proc/sys/kernel/panic

  $ ipmitool power diag

Link: https://lkml.kernel.org/r/20220630223258.4144112-3-vschneid@redhat.com
Fixes: 6ce47fd961 ("rtmutex: Warn if trylock is called from hard/softirq context")
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Juri Lelli <jlelli@redhat.com>
Cc: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Valentin Schneider 2022-06-30 23:32:58 +01:00 committed by Andrew Morton
parent 7bb5da0d49
commit 05c6257433
4 changed files with 30 additions and 20 deletions

View File

@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
/* /*
* Because we write directly to the reserved memory region when loading * Because we write directly to the reserved memory region when loading
* crash kernels we need a mutex here to prevent multiple crash kernels * crash kernels we need a serialization here to prevent multiple crash
* from attempting to load simultaneously, and to prevent a crash kernel * kernels from attempting to load simultaneously.
* from loading over the top of a in use crash kernel.
*
* KISS: always take the mutex.
*/ */
if (!mutex_trylock(&kexec_mutex)) if (!kexec_trylock())
return -EBUSY; return -EBUSY;
if (flags & KEXEC_ON_CRASH) { if (flags & KEXEC_ON_CRASH) {
@ -165,7 +162,7 @@ out:
kimage_free(image); kimage_free(image);
out_unlock: out_unlock:
mutex_unlock(&kexec_mutex); kexec_unlock();
return ret; return ret;
} }

View File

@ -46,7 +46,7 @@
#include <crypto/hash.h> #include <crypto/hash.h>
#include "kexec_internal.h" #include "kexec_internal.h"
DEFINE_MUTEX(kexec_mutex); atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Per cpu memory for storing cpu states in case of system crash. */ /* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes; note_buf_t __percpu *crash_notes;
@ -959,7 +959,7 @@ late_initcall(kexec_core_sysctl_init);
*/ */
void __noclone __crash_kexec(struct pt_regs *regs) void __noclone __crash_kexec(struct pt_regs *regs)
{ {
/* Take the kexec_mutex here to prevent sys_kexec_load /* Take the kexec_lock here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel * running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu. * we are using after a panic on a different cpu.
* *
@ -967,7 +967,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
* of memory the xchg(&kexec_crash_image) would be * of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory... * sufficient. But since I reuse the memory...
*/ */
if (mutex_trylock(&kexec_mutex)) { if (kexec_trylock()) {
if (kexec_crash_image) { if (kexec_crash_image) {
struct pt_regs fixed_regs; struct pt_regs fixed_regs;
@ -976,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs); machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image); machine_kexec(kexec_crash_image);
} }
mutex_unlock(&kexec_mutex); kexec_unlock();
} }
} }
STACK_FRAME_NON_STANDARD(__crash_kexec); STACK_FRAME_NON_STANDARD(__crash_kexec);
@ -1008,13 +1008,13 @@ ssize_t crash_get_memory_size(void)
{ {
ssize_t size = 0; ssize_t size = 0;
if (!mutex_trylock(&kexec_mutex)) if (!kexec_trylock())
return -EBUSY; return -EBUSY;
if (crashk_res.end != crashk_res.start) if (crashk_res.end != crashk_res.start)
size = resource_size(&crashk_res); size = resource_size(&crashk_res);
mutex_unlock(&kexec_mutex); kexec_unlock();
return size; return size;
} }
@ -1025,7 +1025,7 @@ int crash_shrink_memory(unsigned long new_size)
unsigned long old_size; unsigned long old_size;
struct resource *ram_res; struct resource *ram_res;
if (!mutex_trylock(&kexec_mutex)) if (!kexec_trylock())
return -EBUSY; return -EBUSY;
if (kexec_crash_image) { if (kexec_crash_image) {
@ -1064,7 +1064,7 @@ int crash_shrink_memory(unsigned long new_size)
insert_resource(&iomem_resource, ram_res); insert_resource(&iomem_resource, ram_res);
unlock: unlock:
mutex_unlock(&kexec_mutex); kexec_unlock();
return ret; return ret;
} }
@ -1136,7 +1136,7 @@ int kernel_kexec(void)
{ {
int error = 0; int error = 0;
if (!mutex_trylock(&kexec_mutex)) if (!kexec_trylock())
return -EBUSY; return -EBUSY;
if (!kexec_image) { if (!kexec_image) {
error = -EINVAL; error = -EINVAL;
@ -1212,6 +1212,6 @@ int kernel_kexec(void)
#endif #endif
Unlock: Unlock:
mutex_unlock(&kexec_mutex); kexec_unlock();
return error; return error;
} }

View File

@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
image = NULL; image = NULL;
if (!mutex_trylock(&kexec_mutex)) if (!kexec_trylock())
return -EBUSY; return -EBUSY;
dest_image = &kexec_image; dest_image = &kexec_image;
@ -411,7 +411,7 @@ out:
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres(); arch_kexec_protect_crashkres();
mutex_unlock(&kexec_mutex); kexec_unlock();
kimage_free(image); kimage_free(image);
return ret; return ret;
} }

View File

@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image, int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end); unsigned long start, unsigned long end);
extern struct mutex kexec_mutex; /*
* Whatever is used to serialize accesses to the kexec_crash_image needs to be
* NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
* "simple" atomic variable that is acquired with a cmpxchg().
*/
extern atomic_t __kexec_lock;
static inline bool kexec_trylock(void)
{
return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
}
static inline void kexec_unlock(void)
{
atomic_set_release(&__kexec_lock, 0);
}
#ifdef CONFIG_KEXEC_FILE #ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h> #include <linux/purgatory.h>