From 8e41f8726dcf423621e2b6938d015b9796f6f676 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 27 May 2019 14:10:57 -0700 Subject: [PATCH 01/15] mm/vmalloc: Fix calculation of direct map addr range The calculation of the direct map address range to flush was wrong. This could cause the RO direct map alias to not get flushed. Today this shouldn't be a problem because this flush is only needed on x86 right now and the spurious fault handler will fix cached RO->RW translations. In the future though, it could cause the permissions to remain RO in the TLB for the direct map alias, and then the page would return from the page allocator to some other component as RO and cause a crash. So fix the address range calculation so the flush will include the direct map range. Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: David S. Miller Cc: Linus Torvalds Cc: Meelis Roos Cc: Nadav Amit Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 868b104d7379 ("mm/vmalloc: Add flag for freeing of special permsissions") Link: https://lkml.kernel.org/r/20190527211058.2729-2-rick.p.edgecombe@intel.com Signed-off-by: Ingo Molnar --- mm/vmalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7350a124524b..93b2dca2aadb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2123,7 +2123,6 @@ static inline void set_area_direct_map(const struct vm_struct *area, /* Handle removing and resetting vm mappings related to the vm_struct. */ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { - unsigned long addr = (unsigned long)area->addr; unsigned long start = ULONG_MAX, end = 0; int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int i; @@ -2135,8 +2134,8 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * execute permissions, without leaving a RW+X window. */ if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { - set_memory_nx(addr, area->nr_pages); - set_memory_rw(addr, area->nr_pages); + set_memory_nx((unsigned long)area->addr, area->nr_pages); + set_memory_rw((unsigned long)area->addr, area->nr_pages); } remove_vm_area(area->addr); @@ -2160,9 +2159,10 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i++) { - if (page_address(area->pages[i])) { + unsigned long addr = (unsigned long)page_address(area->pages[i]); + if (addr) { start = min(addr, start); - end = max(addr, end); + end = max(addr + PAGE_SIZE, end); } } From 31e67340cc65edfd9dac5ef26f81de8414ce5906 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 27 May 2019 14:10:58 -0700 Subject: [PATCH 02/15] mm/vmalloc: Avoid rare case of flushing TLB with weird arguments In a rare case, flush_tlb_kernel_range() could be called with a start higher than the end. In vm_remove_mappings(), in case page_address() returns 0 for all pages (for example they were all in highmem), _vm_unmap_aliases() will be called with start = ULONG_MAX, end = 0 and flush = 1. If at the same time, the vmalloc purge operation is triggered by something else while the current operation is between remove_vm_area() and _vm_unmap_aliases(), then the vm mapping just removed will be already purged. In this case the call of vm_unmap_aliases() may not find any other mappings to flush and so end up flushing start = ULONG_MAX, end = 0.
So only set flush = true if we find something in the direct mapping that we need to flush, and this way this can't happen. Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: David S. Miller Cc: Linus Torvalds Cc: Meelis Roos Cc: Nadav Amit Cc: Peter Zijlstra Cc: Andrew Morton Cc: Thomas Gleixner Fixes: 868b104d7379 ("mm/vmalloc: Add flag for freeing of special permsissions") Link: https://lkml.kernel.org/r/20190527211058.2729-3-rick.p.edgecombe@intel.com Signed-off-by: Ingo Molnar --- mm/vmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 93b2dca2aadb..4c9e150e5ad3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2125,6 +2125,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { unsigned long start = ULONG_MAX, end = 0; int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; + int flush_dmap = 0; int i; /* @@ -2163,6 +2164,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) if (addr) { start = min(addr, start); end = max(addr + PAGE_SIZE, end); + flush_dmap = 1; } } @@ -2172,7 +2174,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * reset the direct map permissions to the default. */ set_area_direct_map(area, set_direct_map_invalid_noflush); - _vm_unmap_aliases(start, end, 1); + _vm_unmap_aliases(start, end, flush_dmap); set_area_direct_map(area, set_direct_map_default_noflush); } From e35faeb64146f2015f2aec14b358ae508e4066db Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 3 Jun 2019 06:41:20 -0700 Subject: [PATCH 03/15] x86/CPU: Add more Icelake model numbers Add the CPUID model numbers of Icelake (ICL) desktop and server processors to the Intel family list. [ Qiuxu: Sort the macros by model number. ] Signed-off-by: Kan Liang Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Shevchenko Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Qiuxu Zhuo Cc: Rajneesh Bhardwaj Cc: rui.zhang@intel.com Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: https://lkml.kernel.org/r/20190603134122.13853-1-kan.liang@linux.intel.com --- arch/x86/include/asm/intel-family.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 9f15384c504a..310118805f57 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -52,6 +52,9 @@ #define INTEL_FAM6_CANNONLAKE_MOBILE 0x66 +#define INTEL_FAM6_ICELAKE_X 0x6A +#define INTEL_FAM6_ICELAKE_XEON_D 0x6C +#define INTEL_FAM6_ICELAKE_DESKTOP 0x7D #define INTEL_FAM6_ICELAKE_MOBILE 0x7E /* "Small Core" Processors (Atom) */ From b81ff1013eb8eef2934ca7e8cf53d553c1029e84 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 29 May 2019 09:25:40 +0200 Subject: [PATCH 04/15] x86/fpu: Use fault_in_pages_writeable() for pre-faulting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit d9c9ce34ed5c8 ("x86/fpu: Fault-in user stack if copy_fpstate_to_sigframe() fails") get_user_pages_unlocked() pre-faults user's memory if a write generates a page fault while the handler is disabled. This works in general and uncovered a bug as reported by Mike Rapoport¹. It has been pointed out that this function may be fragile and a simple pre-fault as in fault_in_pages_writeable() would be a better solution. 
Better as in taste and simplicity: that write (as performed by the alternative function) performs exactly the same faulting of memory as before. This was suggested by Hugh Dickins and Andrew Morton. Use fault_in_pages_writeable() for pre-faulting user's stack. [ bigeasy: Write commit message. ] [ bp: Massage some. ] ¹ https://lkml.kernel.org/r/1557844195-18882-1-git-send-email-rppt@linux.ibm.com Fixes: d9c9ce34ed5c8 ("x86/fpu: Fault-in user stack if copy_fpstate_to_sigframe() fails") Suggested-by: Andrew Morton Signed-off-by: Hugh Dickins Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Tested-by: Chris Wilson Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: linux-mm Cc: Mike Rapoport Cc: Pavel Machek Cc: Rik van Riel Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190529072540.g46j4kfeae37a3iu@linutronix.de Link: https://lkml.kernel.org/r/1557844195-18882-1-git-send-email-rppt@linux.ibm.com --- arch/x86/kernel/fpu/signal.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 5a8d118bc423..060d6188b453 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -189,15 +190,7 @@ retry: fpregs_unlock(); if (ret) { - int aligned_size; - int nr_pages; - - aligned_size = offset_in_page(buf_fx) + fpu_user_xstate_size; - nr_pages = DIV_ROUND_UP(aligned_size, PAGE_SIZE); - - ret = get_user_pages_unlocked((unsigned long)buf_fx, nr_pages, - NULL, FOLL_WRITE); - if (ret == nr_pages) + if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) goto retry; return -EFAULT; } From 00e5a2bbcc31d5fea853f8daeba0f06c1c88c3ff Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 23 May 2019 10:57:44 +0800 Subject: [PATCH 05/15] x86/mm/KASLR: Compute the size of the vmemmap section properly The size of the vmemmap section is hardcoded to 1 TB to support the maximum amount of system RAM in 4-level paging mode - 64 TB. However, 1 TB is not enough for vmemmap in 5-level paging mode. Assuming the size of struct page is 64 Bytes, to support 4 PB system RAM in 5-level, 64 TB of vmemmap area is needed: 4 * 1000^5 PB / 4096 bytes page size * 64 bytes per page struct / 1000^4 TB = 62.5 TB. This hardcoding may cause vmemmap to corrupt the following cpu_entry_area section, if KASLR puts vmemmap very close to it and the actual vmemmap size is bigger than 1 TB. So calculate the actual size of the vmemmap region needed and then align it up to 1 TB boundary. In 4-level paging mode it is always 1 TB. In 5-level it's adjusted on demand. The current code reserves 0.5 PB for vmemmap on 5-level. With this change, the space can be saved and thus used to increase entropy for the randomization. [ bp: Spell out how the 64 TB needed for vmemmap is computed and massage commit message. ] Fixes: eedb92abb9bb ("x86/mm: Make virtual memory layout dynamic for CONFIG_X86_5LEVEL=y") Signed-off-by: Baoquan He Signed-off-by: Borislav Petkov Reviewed-by: Kees Cook Acked-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: kirill.shutemov@linux.intel.com Cc: Peter Zijlstra Cc: stable Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190523025744.3756-1-bhe@redhat.com --- arch/x86/mm/kaslr.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index dc3f058bdf9b..dc6182eecefa 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -52,7 +52,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { &page_offset_base, 0 }, { &vmalloc_base, 0 }, - { &vmemmap_base, 1 }, + { &vmemmap_base, 0 }, }; /* Get size in bytes used by the memory region */ @@ -78,6 +78,7 @@ void __init kernel_randomize_memory(void) unsigned long rand, memory_tb; struct rnd_state rand_state; unsigned long remain_entropy; + unsigned long vmemmap_size; vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; vaddr = vaddr_start; @@ -109,6 +110,14 @@ void __init kernel_randomize_memory(void) if (memory_tb < kaslr_regions[0].size_tb) kaslr_regions[0].size_tb = memory_tb; + /* + * Calculate the vmemmap region size in TBs, aligned to a TB + * boundary. + */ + vmemmap_size = (kaslr_regions[0].size_tb << (TB_SHIFT - PAGE_SHIFT)) * + sizeof(struct page); + kaslr_regions[2].size_tb = DIV_ROUND_UP(vmemmap_size, 1UL << TB_SHIFT); + /* Calculate entropy available between regions */ remain_entropy = vaddr_end - vaddr_start; for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) From aab8445c4e1cceeb3f739352041ec1c2586bc923 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 7 Jun 2019 16:29:16 +0200 Subject: [PATCH 06/15] x86/fpu: Update kernel's FPU state before using for the fsave header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 39388e80f9b0c ("x86/fpu: Don't save fxregs for ia32 frames in copy_fpstate_to_sigframe()") I removed the statement | if (ia32_fxstate) | copy_fxregs_to_kernel(fpu); and argued that it was wrongly merged because the content was already saved in kernel's state. This was wrong: It is required to write it back because it is only saved on the user-stack and save_fsave_header() reads it from task's FPU-state. I missed that part… Save x87 FPU state unless thread's FPU registers are already up to date. Fixes: 39388e80f9b0c ("x86/fpu: Don't save fxregs for ia32 frames in copy_fpstate_to_sigframe()") Reported-by: Eric Biggers Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Tested-by: Eric Biggers Cc: Andy Lutomirski Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190607142915.y52mfmgk5lvhll7n@linutronix.de --- arch/x86/kernel/fpu/signal.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 060d6188b453..0071b794ed19 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -62,6 +62,11 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + copy_fxregs_to_kernel(&tsk->thread.fpu); + fpregs_unlock(); + convert_from_fxsr(&env, tsk); if (__copy_to_user(buf, &env, sizeof(env)) || From 87d3aa28f345bea77c396855fa5d5fec4c24461f Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 3 Jun 2019 18:25:31 +0100 Subject: [PATCH 07/15] x86/resctrl: Don't stop walking closids when a locksetup group is found When a new control group is created __init_one_rdt_domain() walks all the other closids to calculate the sets of used and unused bits. If it discovers a pseudo_locksetup group, it breaks out of the loop. This means any later closid doesn't get its used bits added to used_b. These bits will then get set in unused_b, and added to the new control group's configuration, even if they were marked as exclusive for a later closid. When encountering a pseudo_locksetup group, we should continue. This is because "a resource group enters 'pseudo-locked' mode after the schemata is written while the resource group is in 'pseudo-locksetup' mode." When we find a pseudo_locksetup group, its configuration is expected to be overwritten, we can skip it. Fixes: dfe9674b04ff6 ("x86/intel_rdt: Enable entering of pseudo-locksetup mode") Signed-off-by: James Morse Signed-off-by: Thomas Gleixner Acked-by: Reinette Chatre Cc: Fenghua Yu Cc: Borislav Petkov Cc: H Peter Avin Cc: Link: https://lkml.kernel.org/r/20190603172531.178830-1-james.morse@arm.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 333c177a2471..869cbef5da81 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2542,7 +2542,12 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - break; + /* + * ctrl values for locksetup aren't relevant + * until the schemata is written, and the mode + * becomes RDT_MODE_PSEUDO_LOCKED. + */ + continue; /* * If CDP is active include peer domain's * usage to ensure there is no overlap From c7563e62a6d720aa3b068e26ddffab5f0df29263 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 10 Jun 2019 13:15:44 -0400 Subject: [PATCH 08/15] x86/resctrl: Prevent NULL pointer dereference when local MBM is disabled Booting with kernel parameter "rdt=cmt,mbmtotal,memlocal,l3cat,mba" and executing "mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl" results in a NULL pointer dereference on systems which do not have local MBM support enabled.. 
BUG: kernel NULL pointer dereference, address: 0000000000000020 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 722 Comm: kworker/0:3 Not tainted 5.2.0-0.rc3.git0.1.el7_UNSUPPORTED.x86_64 #2 Workqueue: events mbm_handle_overflow RIP: 0010:mbm_handle_overflow+0x150/0x2b0 Only enter the bandwidth update loop if the system has local MBM enabled. Fixes: de73f38f7680 ("x86/intel_rdt/mba_sc: Feedback loop to dynamically update mem bandwidth") Signed-off-by: Prarit Bhargava Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Reinette Chatre Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190610171544.13474-1-prarit@redhat.com --- arch/x86/kernel/cpu/resctrl/monitor.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 1573a0a6b525..ff6e8e561405 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -368,6 +368,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) struct list_head *head; struct rdtgroup *entry; + if (!is_mbm_local_enabled()) + return; + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; closid = rgrp->closid; rmid = rgrp->mon.rmid; From 71ab8323cc357c68985a2d6fc6cfc22b1dbbc1c3 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 31 May 2019 12:47:54 -0700 Subject: [PATCH 09/15] x86/kgdb: Return 0 from kgdb_arch_set_breakpoint() err must be nonzero in order to reach text_poke(), which caused kgdb to fail to set breakpoints: (gdb) break __x64_sys_sync Breakpoint 1 at 0xffffffff81288910: file ../fs/sync.c, line 124. (gdb) c Continuing. Warning: Cannot insert breakpoint 1. Cannot access memory at address 0xffffffff81288910 Command aborted. Fixes: 86a22057127d ("x86/kgdb: Avoid redundant comparison of patched code") Signed-off-by: Matt Mullins Signed-off-by: Borislav Petkov Reviewed-by: Nadav Amit Cc: Andy Lutomirski Cc: Christophe Leroy Cc: Daniel Thompson Cc: Douglas Anderson Cc: "Gustavo A. R. Silva" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Peter Zijlstra (Intel)" Cc: Rick Edgecombe Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190531194755.6320-1-mmullins@fb.com --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 9a8c1648fc9a..6690c5652aeb 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -758,7 +758,7 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) BREAK_INSTR_SIZE); bpt->type = BP_POKE_BREAKPOINT; - return err; + return 0; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) From 8d3289f2fa1e0c7e2f72c7352f1efb75d2ad7c76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Jun 2019 19:54:12 +0200 Subject: [PATCH 10/15] x86/fpu: Don't use current->mm to check for a kthread current->mm can be non-NULL if a kthread calls use_mm(). Check for PF_KTHREAD instead to decide when to store user mode FP state. Fixes: 2722146eb784 ("x86/fpu: Remove fpu->initialized") Reported-by: Peter Zijlstra Signed-off-by: Christoph Hellwig Signed-off-by: Borislav Petkov Acked-by: Sebastian Andrzej Siewior Cc: Aubrey Li Cc: Dave Hansen Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Nicolai Stange Cc: Rik van Riel Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190604175411.GA27477@lst.de --- arch/x86/include/asm/fpu/internal.h | 6 +++--- arch/x86/kernel/fpu/core.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 9e27fa05a7ae..4c95c365058a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -536,7 +536,7 @@ static inline void __fpregs_load_activate(void) struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); - if (WARN_ON_ONCE(current->mm == NULL)) + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return; if (!fpregs_state_valid(fpu, cpu)) { @@ -567,11 +567,11 @@ static inline void __fpregs_load_activate(void) * otherwise. * * The FPU context is only stored/restored for a user task and - * ->mm is used to distinguish between kernel and user threads. + * PF_KTHREAD is used to distinguish between kernel and user threads. */ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU) && current->mm) { + if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 466fca686fb9..649fbc3fcf9f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -102,7 +102,7 @@ static void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (current->mm) { + if (!(current->flags & PF_KTHREAD)) { if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); /* From f3176ec9420de0c385023afa3e4970129444ac2f Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 14 Jun 2019 17:31:49 +0300 Subject: [PATCH 11/15] x86/kasan: Fix boot with 5-level paging and KASAN Since commit d52888aa2753 ("x86/mm: Move LDT remap out of KASLR region on 5-level paging") kernel doesn't boot with KASAN on 5-level paging machines. The bug is actually in early_p4d_offset() and introduced by commit 12a8cc7fcf54 ("x86/kasan: Use the same shadow offset for 4- and 5-level paging") early_p4d_offset() tries to convert pgd_val(*pgd) value to a physical address. This doesn't make sense because pgd_val() already contains the physical address. It did work prior to commit d52888aa2753 because the result of "__pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK" was the same as "pgd_val(*pgd) & PTE_PFN_MASK". __pa_nodebug() just set some high bits which were masked out by applying PTE_PFN_MASK. After the change of the PAGE_OFFSET offset in commit d52888aa2753 __pa_nodebug(pgd_val(*pgd)) started to return a value with more high bits set and PTE_PFN_MASK wasn't enough to mask out all of them. So it returns a wrong not even canonical address and crashes on the attempt to dereference it. Switch back to pgd_val() & PTE_PFN_MASK to cure the issue. Fixes: 12a8cc7fcf54 ("x86/kasan: Use the same shadow offset for 4- and 5-level paging") Reported-by: Kirill A. Shutemov Signed-off-by: Andrey Ryabinin Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: kasan-dev@googlegroups.com Cc: stable@vger.kernel.org Cc: Link: https://lkml.kernel.org/r/20190614143149.2227-1-aryabinin@virtuozzo.com --- arch/x86/mm/kasan_init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8dc0fc0b1382..296da58f3013 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -199,7 +199,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) if (!pgtable_l5_enabled()) return (p4d_t *)pgd; - p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; + p4d = pgd_val(*pgd) & PTE_PFN_MASK; p4d += __START_KERNEL_map - phys_base; return (p4d_t *)p4d + p4d_index(addr); } From 78f4e932f7760d965fb1569025d1576ab77557c5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 13 Jun 2019 15:49:02 +0200 Subject: [PATCH 12/15] x86/microcode, cpuhotplug: Add a microcode loader CPU hotplug callback Adric Blake reported the following warning during suspend-resume: Enabling non-boot CPUs ... x86: Booting SMP configuration: smpboot: Booting Node 0 Processor 1 APIC 0x2 unchecked MSR access error: WRMSR to 0x10f (tried to write 0x0000000000000000) \ at rIP: 0xffffffff8d267924 (native_write_msr+0x4/0x20) Call Trace: intel_set_tfa intel_pmu_cpu_starting ? x86_pmu_dead_cpu x86_pmu_starting_cpu cpuhp_invoke_callback ? _raw_spin_lock_irqsave notify_cpu_starting start_secondary secondary_startup_64 microcode: sig=0x806ea, pf=0x80, revision=0x96 microcode: updated to revision 0xb4, date = 2019-04-01 CPU1 is up The MSR in question is MSR_TFA_RTM_FORCE_ABORT and that MSR is emulated by microcode. The log above shows that the microcode loader callback happens after the PMU restoration, leading to the conjecture that because the microcode hasn't been updated yet, that MSR is not present yet, leading to the #GP. Add a microcode loader-specific hotplug vector which comes before the PERF vectors and thus executes earlier and makes sure the MSR is present. 
Fixes: 400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort") Reported-by: Adric Blake Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Cc: x86@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=203637 --- arch/x86/kernel/cpu/microcode/core.c | 2 +- include/linux/cpuhotplug.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 70a04436380e..a813987b5552 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -872,7 +872,7 @@ int __init microcode_init(void) goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); - cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", + cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6a381594608c..5c6062206760 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -101,6 +101,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, + CPUHP_AP_MICROCODE_LOADER, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, From 5423f5ce5ca410b3646f355279e4e937d452e622 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 Jun 2019 22:31:40 +0200 Subject: [PATCH 13/15] x86/microcode: Fix the microcode load on CPU hotplug for real A recent change moved the microcode loader hotplug callback into the early startup phase which is running with interrupts disabled. It missed that the callbacks invoke sysfs functions which might sleep causing nice 'might sleep' splats with proper debugging enabled. Split the callbacks and only load the microcode in the early startup phase and move the sysfs handling back into the later threaded and preemptible bringup phase where it was before. Fixes: 78f4e932f776 ("x86/microcode, cpuhotplug: Add a microcode loader CPU hotplug callback") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: stable@vger.kernel.org Cc: x86-ml Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1906182228350.1766@nanos.tec.linutronix.de --- arch/x86/kernel/cpu/microcode/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index a813987b5552..cb0fdcaf1415 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -789,13 +789,16 @@ static struct syscore_ops mc_syscore_ops = { .resume = mc_bp_resume, }; -static int mc_cpu_online(unsigned int cpu) +static int mc_cpu_starting(unsigned int cpu) { - struct device *dev; - - dev = get_cpu_device(cpu); microcode_update_cpu(cpu); pr_debug("CPU%d added\n", cpu); + return 0; +} + +static int mc_cpu_online(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); @@ -872,7 +875,9 @@ int __init microcode_init(void) goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); - cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:online", + cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", + mc_cpu_starting, NULL); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); From 32f010deab575199df4ebe7b6aec20c17bb7eccd Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 19 Jun 2019 13:27:16 -0700 Subject: [PATCH 14/15] x86/resctrl: Prevent possible overrun during bitmap operations While the DOC at the beginning of lib/bitmap.c explicitly states that "The number of valid bits in a given bitmap does _not_ need to be an exact multiple of BITS_PER_LONG.", some of the bitmap operations do indeed access BITS_PER_LONG portions of the provided bitmap no matter the size of the provided bitmap. For example, if find_first_bit() is provided with an 8 bit bitmap the operation will access BITS_PER_LONG bits from the provided bitmap. While the operation ensures that these extra bits do not affect the result, the memory is still accessed. The capacity bitmasks (CBMs) are typically stored in u32 since they can never exceed 32 bits. A few instances exist where a bitmap_* operation is performed on a CBM by simply pointing the bitmap operation to the stored u32 value. The consequence of this pattern is that some bitmap_* operations will access out-of-bounds memory when interacting with the provided CBM. This same issue has previously been addressed with commit 49e00eee0061 ("x86/intel_rdt: Fix out-of-bounds memory access in CBM tests") but at that time not all instances of the issue were fixed. Fix this by using an unsigned long to store the capacity bitmask data that is passed to bitmap functions. Fixes: e651901187ab ("x86/intel_rdt: Introduce "bit_usage" to display cache allocations details") Fixes: f4e80d67a527 ("x86/intel_rdt: Resctrl files reflect pseudo-locked information") Fixes: 95f0b77efa57 ("x86/intel_rdt: Initialize new resource group with sane defaults") Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Cc: Fenghua Yu Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: stable Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: https://lkml.kernel.org/r/58c9b6081fd9bf599af0dfc01a6fdd335768efef.1560975645.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 35 ++++++++++++-------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 869cbef5da81..f9d8ed6ab03b 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -804,8 +804,12 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - u32 sw_shareable = 0, hw_shareable = 0; - u32 exclusive = 0, pseudo_locked = 0; + /* + * Use unsigned long even though only 32 bits are used to ensure + * test_bit() is used safely. + */ + unsigned long sw_shareable = 0, hw_shareable = 0; + unsigned long exclusive = 0, pseudo_locked = 0; struct rdt_domain *dom; int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; @@ -850,10 +854,10 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } for (i = r->cache.cbm_len - 1; i >= 0; i--) { pseudo_locked = dom->plr ? dom->plr->cbm : 0; - hwb = test_bit(i, (unsigned long *)&hw_shareable); - swb = test_bit(i, (unsigned long *)&sw_shareable); - excl = test_bit(i, (unsigned long *)&exclusive); - psl = test_bit(i, (unsigned long *)&pseudo_locked); + hwb = test_bit(i, &hw_shareable); + swb = test_bit(i, &sw_shareable); + excl = test_bit(i, &exclusive); + psl = test_bit(i, &pseudo_locked); if (hwb && swb) seq_putc(seq, 'X'); else if (hwb && !swb) @@ -2494,26 +2498,19 @@ out_destroy: */ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) { - /* - * Convert the u32 _val to an unsigned long required by all the bit - * operations within this function. No more than 32 bits of this - * converted value can be accessed because all bit operations are - * additionally provided with cbm_len that is initialized during - * hardware enumeration using five bits from the EAX register and - * thus never can exceed 32 bits. - */ - unsigned long *val = (unsigned long *)_val; + unsigned long val = *_val; unsigned int cbm_len = r->cache.cbm_len; unsigned long first_bit, zero_bit; - if (*val == 0) + if (val == 0) return; - first_bit = find_first_bit(val, cbm_len); - zero_bit = find_next_zero_bit(val, cbm_len, first_bit); + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); /* Clear any remaining bits to ensure contiguous region */ - bitmap_clear(val, zero_bit, cbm_len - zero_bit); + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); + *_val = (u32)val; } /* From ea136a112d89bade596314a1ae49f748902f4727 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 19 Jun 2019 19:14:46 +0100 Subject: [PATCH 15/15] x86/apic: Fix integer overflow on 10 bit left shift of cpu_khz The left shift of unsigned int cpu_khz will overflow for large values of cpu_khz, so cast it to a long long before shifting it to avoid overvlow. For example, this can happen when cpu_khz is 4194305, i.e. ~4.2 GHz. Addresses-Coverity: ("Unintentional integer overflow") Fixes: 8c3ba8d04924 ("x86, apic: ack all pending irqs when crashed/on kexec") Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: "H . 
Peter Anvin" Cc: kernel-janitors@vger.kernel.org Link: https://lkml.kernel.org/r/20190619181446.13635-1-colin.king@canonical.com --- arch/x86/kernel/apic/apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 177aa8ef2afa..85be316665b4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1464,7 +1464,8 @@ static void apic_pending_intr_clear(void) if (queued) { if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { ntsc = rdtsc(); - max_loops = (cpu_khz << 10) - (ntsc - tsc); + max_loops = (long long)cpu_khz << 10; + max_loops -= ntsc - tsc; } else { max_loops--; }