From 81d30225bc0c246b53270eb90b23cfbb941a186d Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Mon, 1 Apr 2019 19:40:45 +0800 Subject: [PATCH 1/6] x86/vdso: Remove hpet_page from vDSO This trivial cleanup finalizes the removal of vDSO HPET support. Fixes: 1ed95e52d902 ("x86/vdso: Remove direct HPET access through the vDSO") Signed-off-by: Jia Zhang Signed-off-by: Thomas Gleixner Cc: luto@kernel.org Cc: bp@alien8.de Link: https://lkml.kernel.org/r/20190401114045.7280-1-zhang.jia@linux.alibaba.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vdso2c.c | 3 --- arch/x86/include/asm/vdso.h | 1 - 2 files changed, 4 deletions(-) diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 8e470b018512..3a4d8d4d39f8 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -73,14 +73,12 @@ const char *outfilename; enum { sym_vvar_start, sym_vvar_page, - sym_hpet_page, sym_pvclock_page, sym_hvclock_page, }; const int special_pages[] = { sym_vvar_page, - sym_hpet_page, sym_pvclock_page, sym_hvclock_page, }; @@ -93,7 +91,6 @@ struct vdso_sym { struct vdso_sym required_syms[] = { [sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, - [sym_hpet_page] = {"hpet_page", true}, [sym_pvclock_page] = {"pvclock_page", true}, [sym_hvclock_page] = {"hvclock_page", true}, {"VDSO32_NOTE_MASK", true}, diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 27566e57e87d..230474e2ddb5 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -19,7 +19,6 @@ struct vdso_image { long sym_vvar_start; /* Negative offset to the vvar area */ long sym_vvar_page; - long sym_hpet_page; long sym_pvclock_page; long sym_hvclock_page; long sym_VDSO32_NOTE_MASK; From 0e72499c3cc0cead32f88b94a02204d2b80768bf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Mar 2019 17:47:41 -0700 Subject: [PATCH 2/6] x86/kprobes: Make trampoline_handler() global and visible This function is referenced from 
assembler, so in LTO it needs to be global and visible to not be optimized away. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20190330004743.29541-7-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index cf52ee0d8711..9e4fa2484d10 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -768,7 +768,7 @@ static struct kprobe kretprobe_kprobe = { /* * Called from kretprobe_trampoline */ -static __used void *trampoline_handler(struct pt_regs *regs) +__used __visible void *trampoline_handler(struct pt_regs *regs) { struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; From eccd906484d1cd4b5da00f093d678badb6f48f28 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 17 Apr 2019 15:41:17 +0000 Subject: [PATCH 3/6] x86/mm: Do not use set_{pud, pmd}_safe() when splitting a large page The commit 0a9fe8ca844d ("x86/mm: Validate kernel_physical_mapping_init() PTE population") triggers this warning in SEV guests: WARNING: CPU: 0 PID: 0 at arch/x86/include/asm/pgalloc.h:87 phys_pmd_init+0x30d/0x386 Call Trace: kernel_physical_mapping_init+0xce/0x259 early_set_memory_enc_dec+0x10f/0x160 kvm_smp_prepare_boot_cpu+0x71/0x9d start_kernel+0x1c9/0x50b secondary_startup_64+0xa4/0xb0 A SEV guest calls kernel_physical_mapping_init() to clear the encryption mask from an existing mapping. While doing so, it also splits large pages into smaller. To split a page, kernel_physical_mapping_init() allocates a new page and updates the existing entry. The set_{pud,pmd}_safe() helpers trigger a warning when updating an entry with a page in the present state. Add a new kernel_physical_mapping_change() helper which uses the non-safe variants of set_{pmd,pud,p4d}() and {pmd,pud,p4d}_populate() routines when updating the entry. 
Since kernel_physical_mapping_change() may replace an existing entry with a new entry, the caller is responsible to flush the TLB at the end. Change early_set_memory_enc_dec() to use kernel_physical_mapping_change() when it wants to clear the memory encryption mask from the page table entry. [ bp: - massage commit message. - flesh out comment according to dhansen's request. - align function arguments at opening brace. ] Fixes: 0a9fe8ca844d ("x86/mm: Validate kernel_physical_mapping_init() PTE population") Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Dan Williams Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Kirill A . Shutemov" Cc: Thomas Gleixner Cc: Thomas Lendacky Cc: x86-ml Link: https://lkml.kernel.org/r/20190417154102.22613-1-brijesh.singh@amd.com --- arch/x86/mm/init_64.c | 144 +++++++++++++++++++++++++++----------- arch/x86/mm/mem_encrypt.c | 10 ++- arch/x86/mm/mm_internal.h | 3 + 3 files changed, 114 insertions(+), 43 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bccff68e3267..5cd125bd2a85 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -58,6 +58,37 @@ #include "ident_map.c" +#define DEFINE_POPULATE(fname, type1, type2, init) \ +static inline void fname##_init(struct mm_struct *mm, \ + type1##_t *arg1, type2##_t *arg2, bool init) \ +{ \ + if (init) \ + fname##_safe(mm, arg1, arg2); \ + else \ + fname(mm, arg1, arg2); \ +} + +DEFINE_POPULATE(p4d_populate, p4d, pud, init) +DEFINE_POPULATE(pgd_populate, pgd, p4d, init) +DEFINE_POPULATE(pud_populate, pud, pmd, init) +DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init) + +#define DEFINE_ENTRY(type1, type2, init) \ +static inline void set_##type1##_init(type1##_t *arg1, \ + type2##_t arg2, bool init) \ +{ \ + if (init) \ + set_##type1##_safe(arg1, arg2); \ + else \ + set_##type1(arg1, arg2); \ +} + +DEFINE_ENTRY(p4d, p4d, init) +DEFINE_ENTRY(pud, pud, init) 
+DEFINE_ENTRY(pmd, pmd, init) +DEFINE_ENTRY(pte, pte, init) + + /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move @@ -414,7 +445,7 @@ void __init cleanup_highmap(void) */ static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, - pgprot_t prot) + pgprot_t prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -432,7 +463,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pte_safe(pte, __pte(0)); + set_pte_init(pte, __pte(0), init); continue; } @@ -452,7 +483,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; - set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init); paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } @@ -468,7 +499,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, */ static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, - unsigned long page_size_mask, pgprot_t prot) + unsigned long page_size_mask, pgprot_t prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -487,7 +518,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pmd_safe(pmd, __pmd(0)); + set_pmd_init(pmd, __pmd(0), init); continue; } @@ -496,7 +527,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); paddr_last = 
phys_pte_init(pte, paddr, - paddr_end, prot); + paddr_end, prot, + init); spin_unlock(&init_mm.page_table_lock); continue; } @@ -524,19 +556,20 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte_safe((pte_t *)pmd, - pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + set_pte_init((pte_t *)pmd, + pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE)), + init); spin_unlock(&init_mm.page_table_lock); paddr_last = paddr_next; continue; } pte = alloc_low_page(); - paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); + paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel_safe(&init_mm, pmd, pte); + pmd_populate_kernel_init(&init_mm, pmd, pte, init); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -551,7 +584,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, */ static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, - unsigned long page_size_mask) + unsigned long page_size_mask, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -573,7 +606,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pud_safe(pud, __pud(0)); + set_pud_init(pud, __pud(0), init); continue; } @@ -583,7 +616,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pmd_init(pmd, paddr, paddr_end, page_size_mask, - prot); + prot, init); continue; } /* @@ -610,9 +643,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte_safe((pte_t *)pud, - pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE)); + set_pte_init((pte_t *)pud, + pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE), + init); 
spin_unlock(&init_mm.page_table_lock); paddr_last = paddr_next; continue; @@ -620,10 +654,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, pmd = alloc_low_page(); paddr_last = phys_pmd_init(pmd, paddr, paddr_end, - page_size_mask, prot); + page_size_mask, prot, init); spin_lock(&init_mm.page_table_lock); - pud_populate_safe(&init_mm, pud, pmd); + pud_populate_init(&init_mm, pud, pmd, init); spin_unlock(&init_mm.page_table_lock); } @@ -634,14 +668,15 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, static unsigned long __meminit phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, - unsigned long page_size_mask) + unsigned long page_size_mask, bool init) { unsigned long paddr_next, paddr_last = paddr_end; unsigned long vaddr = (unsigned long)__va(paddr); int i = p4d_index(vaddr); if (!pgtable_l5_enabled()) - return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); + return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, + page_size_mask, init); for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { p4d_t *p4d; @@ -657,39 +692,34 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_p4d_safe(p4d, __p4d(0)); + set_p4d_init(p4d, __p4d(0), init); continue; } if (!p4d_none(*p4d)) { pud = pud_offset(p4d, 0); - paddr_last = phys_pud_init(pud, paddr, - paddr_end, - page_size_mask); + paddr_last = phys_pud_init(pud, paddr, paddr_end, + page_size_mask, init); continue; } pud = alloc_low_page(); paddr_last = phys_pud_init(pud, paddr, paddr_end, - page_size_mask); + page_size_mask, init); spin_lock(&init_mm.page_table_lock); - p4d_populate_safe(&init_mm, p4d, pud); + p4d_populate_init(&init_mm, p4d, pud, init); spin_unlock(&init_mm.page_table_lock); } return paddr_last; } -/* - * Create page table mapping for the physical memory for specific 
physical - * addresses. The virtual and physical addresses have to be aligned on PMD level - * down. It returns the last physical address mapped. - */ -unsigned long __meminit -kernel_physical_mapping_init(unsigned long paddr_start, - unsigned long paddr_end, - unsigned long page_size_mask) +static unsigned long __meminit +__kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask, + bool init) { bool pgd_changed = false; unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; @@ -709,19 +739,22 @@ kernel_physical_mapping_init(unsigned long paddr_start, p4d = (p4d_t *)pgd_page_vaddr(*pgd); paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), - page_size_mask); + page_size_mask, + init); continue; } p4d = alloc_low_page(); paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), - page_size_mask); + page_size_mask, init); spin_lock(&init_mm.page_table_lock); if (pgtable_l5_enabled()) - pgd_populate_safe(&init_mm, pgd, p4d); + pgd_populate_init(&init_mm, pgd, p4d, init); else - p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); + p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr), + (pud_t *) p4d, init); + spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -732,6 +765,37 @@ kernel_physical_mapping_init(unsigned long paddr_start, return paddr_last; } + +/* + * Create page table mapping for the physical memory for specific physical + * addresses. Note that it can only be used to populate non-present entries. + * The virtual and physical addresses have to be aligned on PMD level + * down. It returns the last physical address mapped. 
+ */ +unsigned long __meminit +kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask) +{ + return __kernel_physical_mapping_init(paddr_start, paddr_end, + page_size_mask, true); +} + +/* + * This function is similar to kernel_physical_mapping_init() above with the + * exception that it uses set_{pud,pmd}() instead of the set_{pud,pmd}_safe() + * when updating the mapping. The caller is responsible to flush the TLBs after + * the function returns. + */ +unsigned long __meminit +kernel_physical_mapping_change(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask) +{ + return __kernel_physical_mapping_init(paddr_start, paddr_end, + page_size_mask, false); +} + #ifndef CONFIG_NUMA void __init initmem_init(void) { diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 385afa2b9e17..51f50a7a07ef 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -301,9 +301,13 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr, else split_page_size_mask = 1 << PG_LEVEL_2M; - kernel_physical_mapping_init(__pa(vaddr & pmask), - __pa((vaddr_end & pmask) + psize), - split_page_size_mask); + /* + * kernel_physical_mapping_change() does not flush the TLBs, so + * a TLB flush is required after we exit from the for loop. 
+ */ + kernel_physical_mapping_change(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); } ret = 0; diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 319bde386d5f..eeae142062ed 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -13,6 +13,9 @@ void early_ioremap_page_table_range_init(void); unsigned long kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); +unsigned long kernel_physical_mapping_change(unsigned long start, + unsigned long end, + unsigned long page_size_mask); void zone_sizes_init(void); extern int after_bootmem; From 409ca45526a428620d8efb362ccfd4b1e6b80642 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 12 May 2019 21:52:01 +0900 Subject: [PATCH 4/6] x86/kconfig: Disable CONFIG_GENERIC_HWEIGHT and remove __HAVE_ARCH_SW_HWEIGHT Remove an unnecessary arch complication: arch/x86/include/asm/arch_hweight.h uses __sw_hweight{32,64} as alternatives, and they are implemented in arch/x86/lib/hweight.S x86 does not rely on the generic C implementation lib/hweight.c at all, so CONFIG_GENERIC_HWEIGHT should be disabled. __HAVE_ARCH_SW_HWEIGHT is not necessary either. No change in functionality intended. 
Signed-off-by: Masahiro Yamada Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Uros Bizjak Link: http://lkml.kernel.org/r/1557665521-17570-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 --- arch/x86/include/asm/arch_hweight.h | 2 -- lib/hweight.c | 4 ---- 3 files changed, 9 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0a3cc347143f..de071d7e67b6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -261,9 +261,6 @@ config GENERIC_BUG config GENERIC_BUG_RELATIVE_POINTERS bool -config GENERIC_HWEIGHT - def_bool y - config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index fc0693569f7a..ba88edd0d58b 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -12,8 +12,6 @@ #define REG_OUT "a" #endif -#define __HAVE_ARCH_SW_HWEIGHT - static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res; diff --git a/lib/hweight.c b/lib/hweight.c index 7660d88fd496..c94586b62551 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -10,7 +10,6 @@ * The Hamming Weight of a number is the total number of bits set in it. 
*/ -#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -27,7 +26,6 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); -#endif unsigned int __sw_hweight16(unsigned int w) { @@ -46,7 +44,6 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); -#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -69,4 +66,3 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); -#endif From 88640e1dcd089879530a49a8d212d1814678dfe7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 14 May 2019 13:24:39 -0700 Subject: [PATCH 5/6] x86/speculation/mds: Revert CPU buffer clear on double fault exit The double fault ESPFIX path doesn't return to user mode at all -- it returns back to the kernel by simulating a #GP fault. prepare_exit_to_usermode() will run on the way out of general_protection before running user code. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: Jon Masters Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 04dcbdb80578 ("x86/speculation/mds: Clear CPU buffers on exit to user") Link: http://lkml.kernel.org/r/ac97612445c0a44ee10374f6ea79c222fe22a5c4.1557865329.git.luto@kernel.org Signed-off-by: Ingo Molnar --- Documentation/x86/mds.rst | 7 ------- arch/x86/kernel/traps.c | 8 -------- 2 files changed, 15 deletions(-) diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 534e9baa4e1d..0dc812bb9249 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -158,13 +158,6 @@ Mitigation points mitigated on the return from do_nmi() to provide almost complete coverage. - - Double fault (#DF): - - A double fault is usually fatal, but the ESPFIX workaround, which can - be triggered from user space through modify_ldt(2) is a recoverable - double fault. 
#DF uses the paranoid exit path, so explicit mitigation - in the double fault handler is required. - - Machine Check Exception (#MC): Another corner case is a #MC which hits between the CPU buffer clear diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7de466eb960b..8b6d03e55d2f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include #include @@ -368,13 +367,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; - /* - * This situation can be triggered by userspace via - * modify_ldt(2) and the return does not take the regular - * user space exit, so a CPU buffer clear is required when - * MDS mitigation is enabled. - */ - mds_user_clear_cpu_buffers(); return; } #endif From 9d8d0294e78a164d407133dea05caf4b84247d6a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 14 May 2019 13:24:40 -0700 Subject: [PATCH 6/6] x86/speculation/mds: Improve CPU buffer clear documentation On x86_64, all returns to usermode go through prepare_exit_to_usermode(), with the sole exception of do_nmi(). This even includes machine checks -- this was added several years ago to support MCE recovery. Update the documentation. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: Jon Masters Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 04dcbdb80578 ("x86/speculation/mds: Clear CPU buffers on exit to user") Link: http://lkml.kernel.org/r/999fa9e126ba6a48e9d214d2f18dbde5c62ac55c.1557865329.git.luto@kernel.org Signed-off-by: Ingo Molnar --- Documentation/x86/mds.rst | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 0dc812bb9249..5d4330be200f 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -142,38 +142,13 @@ Mitigation points mds_user_clear. The mitigation is invoked in prepare_exit_to_usermode() which covers - most of the kernel to user space transitions. There are a few exceptions - which are not invoking prepare_exit_to_usermode() on return to user - space. These exceptions use the paranoid exit code. + all but one of the kernel to user space transitions. The exception + is when we return from a Non Maskable Interrupt (NMI), which is + handled directly in do_nmi(). - - Non Maskable Interrupt (NMI): - - Access to sensible data like keys, credentials in the NMI context is - mostly theoretical: The CPU can do prefetching or execute a - misspeculated code path and thereby fetching data which might end up - leaking through a buffer. - - But for mounting other attacks the kernel stack address of the task is - already valuable information. So in full mitigation mode, the NMI is - mitigated on the return from do_nmi() to provide almost complete - coverage. - - - Machine Check Exception (#MC): - - Another corner case is a #MC which hits between the CPU buffer clear - invocation and the actual return to user. As this still is in kernel - space it takes the paranoid exit path which does not clear the CPU - buffers. 
So the #MC handler repopulates the buffers to some - extent. Machine checks are not reliably controllable and the window is - extremly small so mitigation would just tick a checkbox that this - theoretical corner case is covered. To keep the amount of special - cases small, ignore #MC. - - - Debug Exception (#DB): - - This takes the paranoid exit path only when the INT1 breakpoint is in - kernel space. #DB on a user space address takes the regular exit path, - so no extra mitigation required. + (The reason that NMI is special is that prepare_exit_to_usermode() can + enable IRQs. In NMI context, NMIs are blocked, and we don't want to + enable IRQs with NMIs blocked.) 2. C-State transition