mirror of
https://github.com/torvalds/linux.git
synced 2024-12-15 23:51:46 +00:00
ccd5b32351
The following commit:39a0526fb3
("x86/mm: Factor out LDT init from context init") renamed init_new_context() to init_new_context_ldt() and added a new init_new_context() which calls init_new_context_ldt(). However, the error code of init_new_context_ldt() was ignored. Consequently, if a memory allocation in alloc_ldt_struct() failed during a fork(), the ->context.ldt of the new task remained the same as that of the old task (due to the memcpy() in dup_mm()). ldt_struct's are not intended to be shared, so a use-after-free occurred after one task exited. Fix the bug by making init_new_context() pass through the error code of init_new_context_ldt(). This bug was found by syzkaller, which encountered the following splat: BUG: KASAN: use-after-free in free_ldt_struct.part.2+0x10a/0x150 arch/x86/kernel/ldt.c:116 Read of size 4 at addr ffff88006d2cb7c8 by task kworker/u9:0/3710 CPU: 1 PID: 3710 Comm: kworker/u9:0 Not tainted 4.13.0-rc4-next-20170811 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x24e/0x340 mm/kasan/report.c:409 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429 free_ldt_struct.part.2+0x10a/0x150 arch/x86/kernel/ldt.c:116 free_ldt_struct arch/x86/kernel/ldt.c:173 [inline] destroy_context_ldt+0x60/0x80 arch/x86/kernel/ldt.c:171 destroy_context arch/x86/include/asm/mmu_context.h:157 [inline] __mmdrop+0xe9/0x530 kernel/fork.c:889 mmdrop include/linux/sched/mm.h:42 [inline] exec_mmap fs/exec.c:1061 [inline] flush_old_exec+0x173c/0x1ff0 fs/exec.c:1291 load_elf_binary+0x81f/0x4ba0 fs/binfmt_elf.c:855 search_binary_handler+0x142/0x6b0 fs/exec.c:1652 exec_binprm fs/exec.c:1694 [inline] do_execveat_common.isra.33+0x1746/0x22e0 fs/exec.c:1816 do_execve+0x31/0x40 fs/exec.c:1860 call_usermodehelper_exec_async+0x457/0x8f0 kernel/umh.c:100 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 Allocated by task 3700: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kmem_cache_alloc_trace+0x136/0x750 mm/slab.c:3627 kmalloc include/linux/slab.h:493 [inline] alloc_ldt_struct+0x52/0x140 arch/x86/kernel/ldt.c:67 write_ldt+0x7b7/0xab0 arch/x86/kernel/ldt.c:277 sys_modify_ldt+0x1ef/0x240 arch/x86/kernel/ldt.c:307 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 3700: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kfree+0xca/0x250 mm/slab.c:3820 free_ldt_struct.part.2+0xdd/0x150 arch/x86/kernel/ldt.c:121 free_ldt_struct arch/x86/kernel/ldt.c:173 [inline] destroy_context_ldt+0x60/0x80 arch/x86/kernel/ldt.c:171 destroy_context arch/x86/include/asm/mmu_context.h:157 [inline] __mmdrop+0xe9/0x530 kernel/fork.c:889 mmdrop include/linux/sched/mm.h:42 [inline] __mmput kernel/fork.c:916 [inline] mmput+0x541/0x6e0 kernel/fork.c:927 copy_process.part.36+0x22e1/0x4af0 kernel/fork.c:1931 copy_process kernel/fork.c:1546 [inline] _do_fork+0x1ef/0xfb0 kernel/fork.c:2025 SYSC_clone kernel/fork.c:2135 [inline] SyS_clone+0x37/0x50 kernel/fork.c:2129 do_syscall_64+0x26c/0x8c0 arch/x86/entry/common.c:287 return_from_SYSCALL_64+0x0/0x7a Here is a C reproducer: #include <asm/ldt.h> #include <pthread.h> #include <signal.h> #include <stdlib.h> #include <sys/syscall.h> #include <sys/wait.h> #include <unistd.h> static void *fork_thread(void *_arg) { fork(); } int main(void) { struct user_desc desc = { .entry_number = 8191 }; syscall(__NR_modify_ldt, 1, &desc, sizeof(desc)); for (;;) { if (fork() == 0) { pthread_t t; srand(getpid()); pthread_create(&t, NULL, fork_thread, NULL); usleep(rand() % 10000); syscall(__NR_exit_group, 0); } wait(NULL); } } Note: the reproducer takes advantage of the fact that alloc_ldt_struct() may use vmalloc() to allocate a large ->entries array, and after commit:5d17a73a2e
("vmalloc: back off when the current task is killed") it is possible for userspace to fail a task's vmalloc() by sending a fatal signal, e.g. via exit_group(). It would be more difficult to reproduce this bug on kernels without that commit. This bug only affected kernels with CONFIG_MODIFY_LDT_SYSCALL=y. Signed-off-by: Eric Biggers <ebiggers@google.com> Acked-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: <stable@vger.kernel.org> [v4.6+] Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Fixes:39a0526fb3
("x86/mm: Factor out LDT init from context init") Link: http://lkml.kernel.org/r/20170824175029.76040-1-ebiggers3@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
301 lines
8.2 KiB
C
301 lines
8.2 KiB
C
#ifndef _ASM_X86_MMU_CONTEXT_H
|
|
#define _ASM_X86_MMU_CONTEXT_H
|
|
|
|
#include <asm/desc.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/mm_types.h>
|
|
#include <linux/pkeys.h>
|
|
|
|
#include <trace/events/tlb.h>
|
|
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/paravirt.h>
|
|
#include <asm/mpx.h>
|
|
#ifndef CONFIG_PARAVIRT
|
|
static inline void paravirt_activate_mm(struct mm_struct *prev,
|
|
struct mm_struct *next)
|
|
{
|
|
}
|
|
#endif /* !CONFIG_PARAVIRT */
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
extern struct static_key rdpmc_always_available;
|
|
|
|
static inline void load_mm_cr4(struct mm_struct *mm)
|
|
{
|
|
if (static_key_false(&rdpmc_always_available) ||
|
|
atomic_read(&mm->context.perf_rdpmc_allowed))
|
|
cr4_set_bits(X86_CR4_PCE);
|
|
else
|
|
cr4_clear_bits(X86_CR4_PCE);
|
|
}
|
|
#else
|
|
static inline void load_mm_cr4(struct mm_struct *mm) {}
|
|
#endif
|
|
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
/*
|
|
* ldt_structs can be allocated, used, and freed, but they are never
|
|
* modified while live.
|
|
*/
|
|
struct ldt_struct {
|
|
/*
|
|
* Xen requires page-aligned LDTs with special permissions. This is
|
|
* needed to prevent us from installing evil descriptors such as
|
|
* call gates. On native, we could merge the ldt_struct and LDT
|
|
* allocations, but it's not worth trying to optimize.
|
|
*/
|
|
struct desc_struct *entries;
|
|
unsigned int nr_entries;
|
|
};
|
|
|
|
/*
|
|
* Used for LDT copy/destruction.
|
|
*/
|
|
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
|
|
void destroy_context_ldt(struct mm_struct *mm);
|
|
#else /* CONFIG_MODIFY_LDT_SYSCALL */
|
|
static inline int init_new_context_ldt(struct task_struct *tsk,
|
|
struct mm_struct *mm)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline void destroy_context_ldt(struct mm_struct *mm) {}
|
|
#endif
|
|
|
|
static inline void load_mm_ldt(struct mm_struct *mm)
|
|
{
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
struct ldt_struct *ldt;
|
|
|
|
/* lockless_dereference synchronizes with smp_store_release */
|
|
ldt = lockless_dereference(mm->context.ldt);
|
|
|
|
/*
|
|
* Any change to mm->context.ldt is followed by an IPI to all
|
|
* CPUs with the mm active. The LDT will not be freed until
|
|
* after the IPI is handled by all such CPUs. This means that,
|
|
* if the ldt_struct changes before we return, the values we see
|
|
* will be safe, and the new values will be loaded before we run
|
|
* any user code.
|
|
*
|
|
* NB: don't try to convert this to use RCU without extreme care.
|
|
* We would still need IRQs off, because we don't want to change
|
|
* the local LDT after an IPI loaded a newer value than the one
|
|
* that we can see.
|
|
*/
|
|
|
|
if (unlikely(ldt))
|
|
set_ldt(ldt->entries, ldt->nr_entries);
|
|
else
|
|
clear_LDT();
|
|
#else
|
|
clear_LDT();
|
|
#endif
|
|
}
|
|
|
|
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
|
|
{
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
/*
|
|
* Load the LDT if either the old or new mm had an LDT.
|
|
*
|
|
* An mm will never go from having an LDT to not having an LDT. Two
|
|
* mms never share an LDT, so we don't gain anything by checking to
|
|
* see whether the LDT changed. There's also no guarantee that
|
|
* prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
|
|
* then prev->context.ldt will also be non-NULL.
|
|
*
|
|
* If we really cared, we could optimize the case where prev == next
|
|
* and we're exiting lazy mode. Most of the time, if this happens,
|
|
* we don't actually need to reload LDTR, but modify_ldt() is mostly
|
|
* used by legacy code and emulators where we don't need this level of
|
|
* performance.
|
|
*
|
|
* This uses | instead of || because it generates better code.
|
|
*/
|
|
if (unlikely((unsigned long)prev->context.ldt |
|
|
(unsigned long)next->context.ldt))
|
|
load_mm_ldt(next);
|
|
#endif
|
|
|
|
DEBUG_LOCKS_WARN_ON(preemptible());
|
|
}
|
|
|
|
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
|
{
|
|
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
|
|
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
|
|
}
|
|
|
|
static inline int init_new_context(struct task_struct *tsk,
|
|
struct mm_struct *mm)
|
|
{
|
|
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
|
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
|
|
/* pkey 0 is the default and always allocated */
|
|
mm->context.pkey_allocation_map = 0x1;
|
|
/* -1 means unallocated or invalid */
|
|
mm->context.execute_only_pkey = -1;
|
|
}
|
|
#endif
|
|
return init_new_context_ldt(tsk, mm);
|
|
}
|
|
static inline void destroy_context(struct mm_struct *mm)
|
|
{
|
|
destroy_context_ldt(mm);
|
|
}
|
|
|
|
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
|
struct task_struct *tsk);
|
|
|
|
extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|
struct task_struct *tsk);
|
|
#define switch_mm_irqs_off switch_mm_irqs_off
|
|
|
|
#define activate_mm(prev, next) \
|
|
do { \
|
|
paravirt_activate_mm((prev), (next)); \
|
|
switch_mm((prev), (next), NULL); \
|
|
} while (0);
|
|
|
|
#ifdef CONFIG_X86_32
|
|
#define deactivate_mm(tsk, mm) \
|
|
do { \
|
|
lazy_load_gs(0); \
|
|
} while (0)
|
|
#else
|
|
#define deactivate_mm(tsk, mm) \
|
|
do { \
|
|
load_gs_index(0); \
|
|
loadsegment(fs, 0); \
|
|
} while (0)
|
|
#endif
|
|
|
|
static inline void arch_dup_mmap(struct mm_struct *oldmm,
|
|
struct mm_struct *mm)
|
|
{
|
|
paravirt_arch_dup_mmap(oldmm, mm);
|
|
}
|
|
|
|
static inline void arch_exit_mmap(struct mm_struct *mm)
|
|
{
|
|
paravirt_arch_exit_mmap(mm);
|
|
}
|
|
|
|
#ifdef CONFIG_X86_64
|
|
static inline bool is_64bit_mm(struct mm_struct *mm)
|
|
{
|
|
return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
|
|
!(mm->context.ia32_compat == TIF_IA32);
|
|
}
|
|
#else
|
|
static inline bool is_64bit_mm(struct mm_struct *mm)
|
|
{
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
static inline void arch_bprm_mm_init(struct mm_struct *mm,
|
|
struct vm_area_struct *vma)
|
|
{
|
|
mpx_mm_init(mm);
|
|
}
|
|
|
|
static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long start, unsigned long end)
|
|
{
|
|
/*
|
|
* mpx_notify_unmap() goes and reads a rarely-hot
|
|
* cacheline in the mm_struct. That can be expensive
|
|
* enough to be seen in profiles.
|
|
*
|
|
* The mpx_notify_unmap() call and its contents have been
|
|
* observed to affect munmap() performance on hardware
|
|
* where MPX is not present.
|
|
*
|
|
* The unlikely() optimizes for the fast case: no MPX
|
|
* in the CPU, or no MPX use in the process. Even if
|
|
* we get this wrong (in the unlikely event that MPX
|
|
* is widely enabled on some system) the overhead of
|
|
* MPX itself (reading bounds tables) is expected to
|
|
* overwhelm the overhead of getting this unlikely()
|
|
* consistently wrong.
|
|
*/
|
|
if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
|
|
mpx_notify_unmap(mm, vma, start, end);
|
|
}
|
|
|
|
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
|
static inline int vma_pkey(struct vm_area_struct *vma)
|
|
{
|
|
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
|
|
VM_PKEY_BIT2 | VM_PKEY_BIT3;
|
|
|
|
return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
|
|
}
|
|
#else
|
|
static inline int vma_pkey(struct vm_area_struct *vma)
|
|
{
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* We only want to enforce protection keys on the current process
|
|
* because we effectively have no access to PKRU for other
|
|
* processes or any way to tell *which * PKRU in a threaded
|
|
* process we could use.
|
|
*
|
|
* So do not enforce things if the VMA is not from the current
|
|
* mm, or if we are in a kernel thread.
|
|
*/
|
|
static inline bool vma_is_foreign(struct vm_area_struct *vma)
|
|
{
|
|
if (!current->mm)
|
|
return true;
|
|
/*
|
|
* Should PKRU be enforced on the access to this VMA? If
|
|
* the VMA is from another process, then PKRU has no
|
|
* relevance and should not be enforced.
|
|
*/
|
|
if (current->mm != vma->vm_mm)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
|
|
bool write, bool execute, bool foreign)
|
|
{
|
|
/* pkeys never affect instruction fetches */
|
|
if (execute)
|
|
return true;
|
|
/* allow access if the VMA is not one from this process */
|
|
if (foreign || vma_is_foreign(vma))
|
|
return true;
|
|
return __pkru_allows_pkey(vma_pkey(vma), write);
|
|
}
|
|
|
|
|
|
/*
|
|
* This can be used from process context to figure out what the value of
|
|
* CR3 is without needing to do a (slow) __read_cr3().
|
|
*
|
|
* It's intended to be used for code like KVM that sneakily changes CR3
|
|
* and needs to restore it. It needs to be used very carefully.
|
|
*/
|
|
static inline unsigned long __get_current_cr3_fast(void)
|
|
{
|
|
unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
|
|
|
|
/* For now, be very restrictive about when this can be called. */
|
|
VM_WARN_ON(in_nmi() || preemptible());
|
|
|
|
VM_BUG_ON(cr3 != __read_cr3());
|
|
return cr3;
|
|
}
|
|
|
|
#endif /* _ASM_X86_MMU_CONTEXT_H */
|