From 4c62360d7562a20c996836d163259c87d9378120 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Tue, 30 Jun 2015 15:57:51 -0700 Subject: [PATCH 01/13] efi: Handle memory error structures produced based on old versions of standard The memory error record structure includes as its first field a bitmask of which subsequent fields are valid. The allows new fields to be added to the structure while keeping compatibility with older software that parses these records. This mechanism was used between versions 2.2 and 2.3 to add four new fields, growing the size of the structure from 73 bytes to 80. But Linux just added all the new fields so this test: if (gdata->error_data_length >= sizeof(*mem_err)) cper_print_mem(newpfx, mem_err); else goto err_section_too_small; now make Linux complain about old format records being too short. Add a definition for the old format of the structure and use that for the minimum size check. Pass the actual size to cper_print_mem() so it can sanity check the validation_bits field to ensure that if a BIOS using the old format sets bits as if it were new, we won't access fields beyond the end of the structure. Signed-off-by: Tony Luck Cc: Signed-off-by: Matt Fleming --- drivers/firmware/efi/cper.c | 15 ++++++++++++--- include/linux/cper.h | 22 +++++++++++++++++++++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 4fd9961d552e..d42537425438 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -305,10 +305,17 @@ const char *cper_mem_err_unpack(struct trace_seq *p, return ret; } -static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) +static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem, + int len) { struct cper_mem_err_compact cmem; + /* Don't trust UEFI 2.1/2.2 structure with bad validation bits */ + if (len == sizeof(struct cper_sec_mem_err_old) && + (mem->validation_bits & ~(CPER_MEM_VALID_RANK_NUMBER - 1))) { + pr_err(FW_WARN "valid bits set for fields beyond structure\n"); + return; + } if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status); if (mem->validation_bits & CPER_MEM_VALID_PA) @@ -405,8 +412,10 @@ static void cper_estatus_print_section( } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = (void *)(gdata + 1); printk("%s""section_type: memory error\n", newpfx); - if (gdata->error_data_length >= sizeof(*mem_err)) - cper_print_mem(newpfx, mem_err); + if (gdata->error_data_length >= + sizeof(struct cper_sec_mem_err_old)) + cper_print_mem(newpfx, mem_err, + gdata->error_data_length); else goto err_section_too_small; } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) { diff --git a/include/linux/cper.h b/include/linux/cper.h index 76abba4b238e..dcacb1a72e26 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -340,7 +340,27 @@ struct cper_ia_proc_ctx { __u64 mm_reg_addr; }; -/* Memory Error Section */ +/* Old Memory Error Section UEFI 2.1, 2.2 */ +struct cper_sec_mem_err_old { + __u64 validation_bits; + __u64 error_status; + __u64 physical_addr; + __u64 physical_addr_mask; + __u16 node; + __u16 card; + __u16 module; + __u16 bank; + __u16 device; + __u16 row; + __u16 column; + __u16 bit_pos; + __u64 requestor_id; + __u64 responder_id; + __u64 target_id; + __u8 error_type; +}; + +/* Memory Error Section UEFI >= 2.3 */ struct cper_sec_mem_err { __u64 validation_bits; __u64 error_status; From bbc03778b9954a2ec93baed63718e4df0192f130 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 20 Jul 2015 16:01:53 -0700 Subject: [PATCH 02/13] x86/mm: Add parenthesis for TLB tracepoint size calculation flush_tlb_info->flush_start/end are both normal virtual addresses. When calculating 'nr_pages' (only used for the tracepoint), I neglected to put parenthesis in. Thanks to David Koufaty for pointing this out. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Cc: Link: http://lkml.kernel.org/r/20150720230153.9E834081@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3250f2371aea..90b924acd982 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -117,7 +117,7 @@ static void flush_tlb_func(void *info) } else { unsigned long addr; unsigned long nr_pages = - f->flush_end - f->flush_start / PAGE_SIZE; + (f->flush_end - f->flush_start) / PAGE_SIZE; addr = f->flush_start; while (addr < f->flush_end) { __flush_tlb_single(addr); From a89652769470d12cd484ee3d3f7bde0742be8d96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 20 Jul 2015 14:29:58 -0700 Subject: [PATCH 03/13] x86/mpx: Do not set ->vm_ops on MPX VMAs MPX setups private anonymous mapping, but uses vma->vm_ops too. This can confuse core VM, as it relies on vm->vm_ops to distinguish file VMAs from anonymous. As result we will get SIGBUS, because handle_pte_fault() thinks it's file VMA without vm_ops->fault and it doesn't know how to handle the situation properly. Let's fix that by not setting ->vm_ops. We don't really need ->vm_ops here: MPX VMA can be detected with VM_MPX flag. And vma_merge() will not merge MPX VMA with non-MPX VMA, because ->vm_flags won't match. The only thing left is name of VMA. I'm not sure if it's part of ABI, or we can just drop it. The patch keep it by providing arch_vma_name() on x86. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Cc: # Fixes: 6b7339f4 (mm: avoid setting up anonymous pages into file mapping) Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Link: http://lkml.kernel.org/r/20150720212958.305CC3E9@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 7 +++++++ arch/x86/mm/mpx.c | 24 +++--------------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 9d518d693b4b..844b06d67df4 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -126,3 +126,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_MPX) + return "[mpx]"; + return NULL; +} diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 7a657f58bbea..db1b0bc5017c 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -20,20 +20,6 @@ #define CREATE_TRACE_POINTS #include -static const char *mpx_mapping_name(struct vm_area_struct *vma) -{ - return "[mpx]"; -} - -static struct vm_operations_struct mpx_vma_ops = { - .name = mpx_mapping_name, -}; - -static int is_mpx_vma(struct vm_area_struct *vma) -{ - return (vma->vm_ops == &mpx_vma_ops); -} - static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm) { if (is_64bit_mm(mm)) @@ -53,9 +39,6 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) /* * This is really a simplified "vm_mmap". it only handles MPX * bounds tables (the bounds directory is user-allocated). - * - * Later on, we use the vma->vm_ops to uniquely identify these - * VMAs. */ static unsigned long mpx_mmap(unsigned long len) { @@ -101,7 +84,6 @@ static unsigned long mpx_mmap(unsigned long len) ret = -ENOMEM; goto out; } - vma->vm_ops = &mpx_vma_ops; if (vm_flags & VM_LOCKED) { up_write(&mm->mmap_sem); @@ -812,7 +794,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, * so stop immediately and return an error. This * probably results in a SIGSEGV. */ - if (!is_mpx_vma(vma)) + if (!(vma->vm_flags & VM_MPX)) return -EINVAL; len = min(vma->vm_end, end) - addr; @@ -945,9 +927,9 @@ static int try_unmap_single_bt(struct mm_struct *mm, * lots of tables even though we have no actual table * entries in use. */ - while (next && is_mpx_vma(next)) + while (next && (next->vm_flags & VM_MPX)) next = next->vm_next; - while (prev && is_mpx_vma(prev)) + while (prev && (prev->vm_flags & VM_MPX)) prev = prev->vm_prev; /* * We know 'start' and 'end' lie within an area controlled From 5bc016f1abaa1c5ac0e3af23aa79faec4634a074 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 20 Jul 2015 08:49:01 +0100 Subject: [PATCH 04/13] x86/fpu: Disable dependent CPU features on "noxsave" Complete the set of dependent features that need disabling at once: XSAVEC, AVX-512 and all currently known to the kernel extensions to it, as well as MPX need to be disabled too. Signed-off-by: Jan Beulich Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55ACC40D0200007800092E6C@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 0b39173dd971..1e173f6285c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -351,9 +351,15 @@ static int __init x86_noxsave_setup(char *s) setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVEC); setup_clear_cpu_cap(X86_FEATURE_XSAVES); setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512PF); + setup_clear_cpu_cap(X86_FEATURE_AVX512ER); + setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_MPX); return 1; } From ca1fec58bc6a90be96a59b4769e951156846c6ca Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 20 Jul 2015 08:46:14 +0100 Subject: [PATCH 05/13] x86/mm/pat: Adjust default caching mode translation tables Make WT really mean WT (rather than UC). I can't see why commit 9cd25aac1f ("x86/mm/pat: Emulate PAT when it is disabled") didn't make this to match its changes to pat_init(). Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/55ACC3660200007800092E62@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8533b46e6bee..7a4532229f51 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -43,18 +43,18 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, - [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_WT ] = _PAGE_PWT | 0, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WT, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WT, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; From fd0a1b8607ef311a2c800dd54c9a4a3583756ea6 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 17 Jul 2015 14:07:24 -0700 Subject: [PATCH 06/13] x86/mm/pat, drivers/infiniband/ipath: Replace WARN() with pr_warn() WARN() may confuse users, fix that. ipath_init_one() is part the device's probe so this would only be triggered if a corresponding device was found. Signed-off-by: Luis R. Rodriguez Acked-by: Doug Ledford Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andy@silverblocksystems.net Cc: benh@kernel.crashing.org Cc: bp@suse.de Cc: dan.j.williams@intel.com Cc: jkosina@suse.cz Cc: julia.lawall@lip6.fr Cc: luto@amacapital.net Cc: mchehab@osg.samsung.com Link: http://lkml.kernel.org/r/1437167245-28273-2-git-send-email-mcgrof@do-not-panic.com Signed-off-by: Ingo Molnar --- drivers/infiniband/hw/ipath/ipath_driver.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 2d7e503d13cb..871dbe56216a 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -31,6 +31,8 @@ * SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -399,8 +401,8 @@ static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) u32 bar0 = 0, bar1 = 0; #ifdef CONFIG_X86_64 - if (WARN(pat_enabled(), - "ipath needs PAT disabled, boot with nopat kernel parameter\n")) { + if (pat_enabled()) { + pr_warn("ipath needs PAT disabled, boot with nopat kernel parameter\n"); ret = -ENODEV; goto bail; } From f5530d5af835ffa82a0607f5f1977d63ac02551f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 17 Jul 2015 14:07:25 -0700 Subject: [PATCH 07/13] x86/mm/pat, drivers/media/ivtv: Move the PAT warning and replace WARN() with pr_warn() On built-in kernels this warning will always splat, even if no ivtvfb hardware is present, as this is part of the module init: if (WARN(pat_enabled(), "ivtvfb needs PAT disabled, boot with nopat kernel parameter\n")) { Fix that by shifting the PAT requirement check out under the code that does the "quasi-probe" for the device. This device driver relies on an existing driver to find its own devices, it looks for that device driver and its own found devices, then uses driver_for_each_device() to try to see if it can probe each of those devices as a frambuffer device with ivtvfb_init_card(). We tuck the PAT requiremenet check then on the ivtvfb_init_card() call making the check at least require an ivtv device present before complaining. Reported-by: Fengguang Wu [0-day test robot] Signed-off-by: Luis R. Rodriguez Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andy@silverblocksystems.net Cc: benh@kernel.crashing.org Cc: bp@suse.de Cc: dan.j.williams@intel.com Cc: dledford@redhat.com Cc: jkosina@suse.cz Cc: julia.lawall@lip6.fr Cc: luto@amacapital.net Cc: mchehab@osg.samsung.com Link: http://lkml.kernel.org/r/1437167245-28273-3-git-send-email-mcgrof@do-not-panic.com Signed-off-by: Ingo Molnar --- drivers/media/pci/ivtv/ivtvfb.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c index 4cb365d4ffdc..8b95eefb610b 100644 --- a/drivers/media/pci/ivtv/ivtvfb.c +++ b/drivers/media/pci/ivtv/ivtvfb.c @@ -38,6 +38,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1171,6 +1173,13 @@ static int ivtvfb_init_card(struct ivtv *itv) { int rc; +#ifdef CONFIG_X86_64 + if (pat_enabled()) { + pr_warn("ivtvfb needs PAT disabled, boot with nopat kernel parameter\n"); + return -ENODEV; + } +#endif + if (itv->osd_info) { IVTVFB_ERR("Card %d already initialised\n", ivtvfb_card_id); return -EBUSY; @@ -1265,12 +1274,6 @@ static int __init ivtvfb_init(void) int registered = 0; int err; -#ifdef CONFIG_X86_64 - if (WARN(pat_enabled(), - "ivtvfb needs PAT disabled, boot with nopat kernel parameter\n")) { - return -ENODEV; - } -#endif if (ivtvfb_card_id < -1 || ivtvfb_card_id >= IVTV_MAX_CARDS) { printk(KERN_ERR "ivtvfb: ivtvfb_card_id parameter is out of range (valid range: -1 - %d)\n", From 1c9cf9b211030a454a84cbc1cb15b82d9aa49011 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 16 Jul 2015 17:23:14 -0600 Subject: [PATCH 08/13] x86/mm: Move warning from __ioremap_check_ram() to the call site __ioremap_check_ram() has a WARN_ONCE() which is emitted when the given pfn range is not RAM. The warning is bogus in two aspects: - it never triggers since walk_system_ram_range() only calls __ioremap_check_ram() for RAM ranges. - the warning message is wrong as it says: "ioremap on RAM' after it established that the pfn range is not RAM. Move the WARN_ONCE() to __ioremap_caller(), and update the message to include the address range so we get an actual warning when something tries to ioremap system RAM. [ tglx: Massaged changelog ] Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Roland Dreier Cc: Luis R. Rodriguez Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1437088996-28511-2-git-send-email-toshi.kani@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index cc5ccc415cc0..fd3df0ddb4d4 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -63,8 +63,6 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, !PageReserved(pfn_to_page(start_pfn + i))) return 1; - WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); - return 0; } @@ -131,8 +129,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pfn = phys_addr >> PAGE_SHIFT; last_pfn = last_addr >> PAGE_SHIFT; if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) + __ioremap_check_ram) == 1) { + WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", + phys_addr, last_addr); return NULL; + } } /* * Mappings have to be page-aligned From 9a58eebe1ace609bedf8c5a65e70a097459f5696 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 16 Jul 2015 17:23:15 -0600 Subject: [PATCH 09/13] x86/mm: Remove region_is_ram() call from ioremap __ioremap_caller() calls region_is_ram() to walk through the iomem_resource table to check if a target range is in RAM, which was added to improve the lookup performance over page_is_ram() (commit 906e36c5c717 "x86: use optimized ioresource lookup in ioremap function"). page_is_ram() was no longer used when this change was added, though. __ioremap_caller() then calls walk_system_ram_range(), which had replaced page_is_ram() to improve the lookup performance (commit c81c8a1eeede "x86, ioremap: Speed up check for RAM pages"). Since both checks walk through the same iomem_resource table for the same purpose, there is no need to call both functions. Aside of that walk_system_ram_range() is the only useful check at the moment because region_is_ram() always returns -1 due to an implementation bug. That bug in region_is_ram() cannot be fixed without breaking existing ioremap callers, which rely on the subtle difference of walk_system_ram_range() versus non page aligned ranges. Once these offending callers are fixed we can use region_is_ram() and remove walk_system_ram_range(). [ tglx: Massaged changelog ] Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Roland Dreier Cc: Mike Travis Cc: Luis R. Rodriguez Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1437088996-28511-3-git-send-email-toshi.kani@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fd3df0ddb4d4..b9d4a33017fd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -92,7 +92,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pgprot_t prot; int retval; void __iomem *ret_addr; - int ram_region; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -115,26 +114,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ - /* First check if whole region can be identified as RAM or not */ - ram_region = region_is_ram(phys_addr, size); - if (ram_region > 0) { - WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n", - (unsigned long int)phys_addr, - (unsigned long int)last_addr); + pfn = phys_addr >> PAGE_SHIFT; + last_pfn = last_addr >> PAGE_SHIFT; + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + __ioremap_check_ram) == 1) { + WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", + phys_addr, last_addr); return NULL; } - /* If could not be identified(-1), check page by page */ - if (ram_region < 0) { - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) { - WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", - phys_addr, last_addr); - return NULL; - } - } /* * Mappings have to be page-aligned */ From 8c38de992be9aed0b34c4fab8f972c83d3b00dc4 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 16 Jul 2015 17:23:16 -0600 Subject: [PATCH 10/13] mm: Fix bugs in region_is_ram() region_is_ram() looks up the iomem_resource table to check if a target range is in RAM. However, it always returns with -1 due to invalid range checks. It always breaks the loop at the first entry of the table. Another issue is that it compares p->flags and flags, but it always fails. flags is declared as int, which makes it as a negative value with IORESOURCE_BUSY (0x80000000) set while p->flags is unsigned long. Fix the range check and flags so that region_is_ram() works as advertised. Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Mike Travis Cc: Luis R. Rodriguez Cc: Andrew Morton Cc: Roland Dreier Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1437088996-28511-4-git-send-email-toshi.kani@hp.com Signed-off-by: Thomas Gleixner --- kernel/resource.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 90552aab5f2d..fed052a1bc9f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -504,13 +504,13 @@ int region_is_ram(resource_size_t start, unsigned long size) { struct resource *p; resource_size_t end = start + size - 1; - int flags = IORESOURCE_MEM | IORESOURCE_BUSY; + unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; const char *name = "System RAM"; int ret = -1; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - if (end < p->start) + if (p->end < start) continue; if (p->start <= start && end <= p->end) { @@ -521,7 +521,7 @@ int region_is_ram(resource_size_t start, unsigned long size) ret = 1; break; } - if (p->end < start) + if (end < p->start) break; /* not found */ } read_unlock(&resource_lock); From 8a0a5da6d9cbf1b142115ff6e6b253a82633c3d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2015 16:13:43 +0200 Subject: [PATCH 11/13] x86/mm: Fix newly introduced printk format warnings Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9d4a33017fd..b9c78f3bcd67 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -118,8 +118,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, last_pfn = last_addr >> PAGE_SHIFT; if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, __ioremap_check_ram) == 1) { - WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", - phys_addr, last_addr); + WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", + &phys_addr, &last_addr); return NULL; } From c0c3322e98029e752526d906d9e3a680ed213c03 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Jul 2015 14:16:43 +0200 Subject: [PATCH 12/13] x86/asm/entry/32: Revert 'Do not use R9 in SYSCALL32' commit This change reverts most of commit 53e9accf0f 'Do not use R9 in SYSCALL32'. I don't yet understand how, but code in that commit sometimes fails to preserve EBP. See https://bugzilla.kernel.org/show_bug.cgi?id=101061 "Problems while executing 32-bit code on AMD64" Reported-and-tested-by: Krzysztof A. Sobiecki Signed-off-by: Denys Vlasenko Cc: Linus Torvalds Cc: Steven Rostedt Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Alexei Starovoitov Cc: Will Drewry Cc: Kees Cook CC: x86@kernel.org Link: http://lkml.kernel.org/r/1437740203-11552-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/entry_64_compat.S | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bb187a6a877c..5a1844765a7a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -205,7 +205,6 @@ sysexit_from_sys_call: movl RDX(%rsp), %edx /* arg3 */ movl RSI(%rsp), %ecx /* arg4 */ movl RDI(%rsp), %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ .endm .macro auditsys_exit exit @@ -236,6 +235,7 @@ sysexit_from_sys_call: sysenter_auditsys: auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ jmp sysenter_dispatch sysexit_audit: @@ -336,7 +336,7 @@ ENTRY(entry_SYSCALL_compat) * 32-bit zero extended: */ ASM_STAC -1: movl (%r8), %ebp +1: movl (%r8), %r9d _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) @@ -346,7 +346,7 @@ ENTRY(entry_SYSCALL_compat) cstar_do_call: /* 32-bit syscall -> 64-bit C ABI argument conversion */ movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ + /* r9 already loaded */ /* arg6 */ xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ movl %ebx, %edi /* arg1 */ movl %edx, %edx /* arg3 (zero extension) */ @@ -358,7 +358,6 @@ cstar_dispatch: call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: - movl RCX(%rsp), %ebp DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -392,7 +391,9 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: + movl %r9d, R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common + movl R9(%rsp), %r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -404,14 +405,16 @@ cstar_tracesys: testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif + xchgl %r9d, %ebp SAVE_EXTRA_REGS xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) - movq %rax, R9(%rsp) + movq %r9, R9(%rsp) movq %rax, R8(%rsp) movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter + movl R9(%rsp), %r9d /* Reload arg registers from stack. (see sysenter_tracesys) */ movl RCX(%rsp), %ecx @@ -421,6 +424,7 @@ cstar_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS + xchgl %ebp, %r9d jmp cstar_do_call END(entry_SYSCALL_compat) From 1a4e8795711f474b31ff6eac37f3efd304ed8a93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2015 10:27:37 +0200 Subject: [PATCH 13/13] x86/mm/pat: Revert 'Adjust default caching mode translation tables' Toshi explains: "No, the default values need to be set to the fallback types, i.e. minimal supported mode. For WC and WT, UC is the fallback type. When PAT is disabled, pat_init() does update the tables below to enable WT per the default BIOS setup. However, when PAT is enabled, but CPU has PAT -errata, WT falls back to UC per the default values." Revert: ca1fec58bc6a 'x86/mm/pat: Adjust default caching mode translation tables' Requested-by: Toshi Kani Cc: Jan Beulich Link: http://lkml.kernel.org/r/1437577776.3214.252.camel@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a4532229f51..8533b46e6bee 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -43,18 +43,18 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, - [_PAGE_CACHE_MODE_WT ] = _PAGE_PWT | 0, + [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WT, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WT, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, };