From fb535ccb30845fe0b7bd09caa37a838985b72ff9 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:14 -0600 Subject: [PATCH 01/12] x86/vdso32: Define PGTABLE_LEVELS to 32bit VDSO In case of CONFIG_X86_64, vdso32/vclock_gettime.c fakes a 32-bit non-PAE kernel configuration by re-defining it to CONFIG_X86_32. However, it does not re-define CONFIG_PGTABLE_LEVELS leaving it as 4 levels. This mismatch leads to NOT include and , which will cause compile errors when a later patch enhances to use PUD_SHIFT and PMD_SHIFT. These -nopud & -nopmd headers define these SHIFTs for the 32-bit non-PAE kernel. Fix it by re-defining CONFIG_PGTABLE_LEVELS to 2 levels. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/vdso/vdso32/vclock_gettime.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 175cc72c0f68..87a86e017f0e 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -14,11 +14,13 @@ */ #undef CONFIG_64BIT #undef CONFIG_X86_64 +#undef CONFIG_PGTABLE_LEVELS #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP #undef CONFIG_NR_CPUS #define CONFIG_X86_32 1 +#define CONFIG_PGTABLE_LEVELS 2 #define CONFIG_PAGE_OFFSET 0 #define CONFIG_ILLEGAL_POINTER_VALUE 0 #define CONFIG_NR_CPUS 1 From 832102671855f73962e7a04fdafd48b9385ea5c6 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:15 -0600 Subject: [PATCH 02/12] x86/asm: Move PUD_PAGE macros to page_types.h PUD_SHIFT is defined according to a given kernel configuration, which allows it be commonly used by any x86 kernels. However, PUD_PAGE_SIZE and PUD_PAGE_MASK, which are set from PUD_SHIFT, are defined in page_64_types.h, which can be used by 64-bit kernel only. Move PUD_PAGE_SIZE and PUD_PAGE_MASK to page_types.h so that they can be used by any x86 kernels as well. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/page_64_types.h | 3 --- arch/x86/include/asm/page_types.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4edd53b79a81..4928cf0d5af0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -26,9 +26,6 @@ #define MCE_STACK 4 #define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) - /* * Set __PAGE_OFFSET to the most negative possible address + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index c7c712f2648b..c5b7fb2774d0 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -20,6 +20,9 @@ #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) From 4be4c1fb9a754b100466ebaec50f825be0b2050b Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:16 -0600 Subject: [PATCH 03/12] x86/asm: Add pud/pmd mask interfaces to handle large PAT bit The PAT bit gets relocated to bit 12 when PUD and PMD mappings are used. This bit 12, however, is not covered by PTE_FLAGS_MASK, which is used for masking pfn and flags for all levels. Add pud/pmd mask interfaces to handle pfn and flags properly by using P?D_PAGE_MASK when PUD/PMD mappings are used, i.e. PSE bit is set. Suggested-by: Juergen Gross Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-4-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 36 ++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 13f310bfc09a..7c8f79780038 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -209,10 +209,10 @@ enum page_cache_mode { #include -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; @@ -276,11 +276,43 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline pudval_t pud_pfn_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pudval_t pud_flags_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK); + else + return ~PTE_PFN_MASK; +} + static inline pudval_t pud_flags(pud_t pud) { return native_pud_val(pud) & PTE_FLAGS_MASK; } +static inline pmdval_t pmd_pfn_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pmdval_t pmd_flags_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK); + else + return ~PTE_PFN_MASK; +} + static inline pmdval_t pmd_flags(pmd_t pmd) { return native_pmd_val(pmd) & PTE_FLAGS_MASK; From f70abb0fc3da1b2945c92751ccda2744081bf2b7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:17 -0600 Subject: [PATCH 04/12] x86/asm: Fix pud/pmd interfaces to handle large PAT bit Now that we have pud/pmd mask interfaces, which handle pfn & flags mask properly for the large PAT bit. Fix pud/pmd pfn & flags interfaces by replacing PTE_PFN_MASK and PTE_FLAGS_MASK with the pud/pmd mask interfaces. Suggested-by: Juergen Gross Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-5-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable.h | 14 ++++++++------ arch/x86/include/asm/pgtable_types.h | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 867da5bbb4a3..0733ec7c8036 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -142,12 +142,12 @@ static inline unsigned long pte_pfn(pte_t pte) static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -502,14 +502,15 @@ static inline int pmd_none(pmd_t pmd) static inline unsigned long pmd_page_vaddr(pmd_t pmd) { - return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); + return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pmd_page(pmd) \ + pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -570,14 +571,15 @@ static inline int pud_present(pud_t pud) static inline unsigned long pud_page_vaddr(pud_t pud) { - return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); + return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) +#define pud_page(pud) \ + pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7c8f79780038..dd5b0aa9dd2f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -294,7 +294,7 @@ static inline pudval_t pud_flags_mask(pud_t pud) static inline pudval_t pud_flags(pud_t pud) { - return native_pud_val(pud) & PTE_FLAGS_MASK; + return native_pud_val(pud) & pud_flags_mask(pud); } static inline pmdval_t pmd_pfn_mask(pmd_t pmd) @@ -315,7 +315,7 @@ static inline pmdval_t pmd_flags_mask(pmd_t pmd) static inline pmdval_t pmd_flags(pmd_t pmd) { - return native_pmd_val(pmd) & PTE_FLAGS_MASK; + return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) From bbac8c6deadab921f4b7d00ce675ffa4f358ec7f Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:18 -0600 Subject: [PATCH 05/12] x86/asm: Add pud_pgprot() and pmd_pgprot() pte_pgprot() returns a pgprot_t value by calling pte_flags(). Now that pud_flags() and pmd_flags() work specifically for the pud/pmd levels, define pud_pgprot() and pmd_pgprot() for PUD/PMD. Also update pte_pgprot() to remove the unnecessary mask with PTE_FLAGS_MASK as pte_flags() takes care of it. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-6-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0733ec7c8036..59fc3414c68b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -379,7 +379,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) return __pgprot(preservebits | addbits); } -#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) +#define pte_pgprot(x) __pgprot(pte_flags(x)) +#define pmd_pgprot(x) __pgprot(pmd_flags(x)) +#define pud_pgprot(x) __pgprot(pud_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) From da25e628c4c231a281b1c1de3168a36ab9bfe473 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:19 -0600 Subject: [PATCH 06/12] x86/mm: Fix page table dump to show PAT bit /sys/kernel/debug/kernel_page_tables does not show the PAT bit for PUD/PMD mappings. This is because walk_pud_level(), walk_pmd_level() and note_page() mask the flags with PTE_FLAGS_MASK, which does not cover their PAT bit, _PAGE_PAT_LARGE. Fix it by replacing the use of PTE_FLAGS_MASK with p?d_flags(), which masks the flags properly. Also change to show the PAT bit as "PAT" to be consistent with other bits. Reported-by: Robert Elliott Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-7-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 39 +++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index f0cedf3395af..71ab2d741024 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -155,7 +155,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); if ((level == 4 && pr & _PAGE_PAT) || ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) - pt_dump_cont_printf(m, dmsg, "pat "); + pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_GLOBAL) @@ -198,8 +198,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; - cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; + prot = pgprot_val(new_prot); + cur = pgprot_val(st->current_prot); if (!st->level) { /* First entry */ @@ -269,13 +269,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, { int i; pte_t *start; + pgprotval_t prot; start = (pte_t *) pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { - pgprot_t prot = pte_pgprot(*start); - + prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, prot, 4); + note_page(m, st, __pgprot(prot), 4); start++; } } @@ -287,18 +287,19 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, { int i; pmd_t *start; + pgprotval_t prot; start = (pmd_t *) pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK; - - if (pmd_large(*start) || !pmd_present(*start)) + if (pmd_large(*start) || !pmd_present(*start)) { + prot = pmd_flags(*start); note_page(m, st, __pgprot(prot), 3); - else + } else { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 3); start++; @@ -318,19 +319,20 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, { int i; pud_t *start; + pgprotval_t prot; start = (pud_t *) pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK; - - if (pud_large(*start) || !pud_present(*start)) + if (pud_large(*start) || !pud_present(*start)) { + prot = pud_flags(*start); note_page(m, st, __pgprot(prot), 2); - else + } else { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 2); @@ -351,6 +353,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) #else pgd_t *start = swapper_pg_dir; #endif + pgprotval_t prot; int i; struct pg_state st = {}; @@ -362,13 +365,13 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start)) { - pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK; - - if (pgd_large(*start) || !pgd_present(*start)) + if (pgd_large(*start) || !pgd_present(*start)) { + prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); - else + } else { walk_pud_level(m, &st, *start, i * PGD_LEVEL_MULT); + } } else note_page(m, &st, __pgprot(0), 1); From 34437e67a6727885bdf6cbfd8441b1ac43a1ee65 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:20 -0600 Subject: [PATCH 07/12] x86/mm: Fix slow_virt_to_phys() to handle large PAT bit slow_virt_to_phys() calls lookup_address() to obtain *pte and its level. It then calls pte_pfn() to obtain a physical address for any level. However, this physical address is not correct when the large PAT bit is set because pte_pfn() does not mask the large PAT bit properly for PUD/PMD. Fix slow_virt_to_phys() to use pud_pfn() and pmd_pfn() for 1GB and 2MB mapping levels. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-8-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2c44c0792301..1441368a7931 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -414,18 +414,28 @@ pmd_t *lookup_pmd_address(unsigned long address) phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; - phys_addr_t phys_addr; - unsigned long offset; + unsigned long phys_addr, offset; enum pg_level level; - unsigned long pmask; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - pmask = page_level_mask(level); - offset = virt_addr & ~pmask; - phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - return (phys_addr | offset); + + switch (level) { + case PG_LEVEL_1G: + phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PUD_PAGE_MASK; + break; + case PG_LEVEL_2M: + phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PMD_PAGE_MASK; + break; + default: + phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + offset = virt_addr & ~PAGE_MASK; + } + + return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); From daf3e35c5888e8bd6a2f5ed15ed392b2df362ecf Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:21 -0600 Subject: [PATCH 08/12] x86/mm: Fix gup_huge_p?d() to handle large PAT bit gup_huge_pud() and gup_huge_pmd() cast *pud and *pmd to *pte, and use pte_xxx() interfaces to obtain the flags and PFN. However, the pte_xxx() interface does not handle the large PAT bit properly for PUD/PMD. Fix gup_huge_pud() and gup_huge_pmd() to use pud_xxx() and pmd_xxx() interfaces according to their type. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-9-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/gup.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 81bf3d2af3eb..ae9a37bf1371 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -118,21 +118,20 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pmd; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pmd_flags(pmd) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); refs = 0; - head = pte_page(pte); + head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); @@ -195,21 +194,20 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pud; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pud_flags(pud) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); refs = 0; - head = pte_page(pte); + head = pud_page(pud); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); From 3a19109efbfa7d887996a74257556a46e00525c2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:22 -0600 Subject: [PATCH 09/12] x86/mm: Fix try_preserve_large_page() to handle large PAT bit try_preserve_large_page() is called from __change_page_attr() to change the mapping attribute of a given large page. This function uses pte_pfn() and pte_pgprot() for PUD/PMD, which do not handle the large PAT bit properly. Fix try_preserve_large_page() by using the corresponding pud/pmd prot/pfn interfaces. Also remove '#ifdef CONFIG_X86_64', which is not necessary. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-10-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1441368a7931..3f612713159a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -468,7 +468,7 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; @@ -488,17 +488,21 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: -#ifdef CONFIG_X86_64 + old_prot = pmd_pgprot(*(pmd_t *)kpte); + old_pfn = pmd_pfn(*(pmd_t *)kpte); + break; case PG_LEVEL_1G: -#endif - psize = page_level_size(level); - pmask = page_level_mask(level); + old_prot = pud_pgprot(*(pud_t *)kpte); + old_pfn = pud_pfn(*(pud_t *)kpte); break; default: do_split = -EINVAL; goto out_unlock; } + psize = page_level_size(level); + pmask = page_level_mask(level); + /* * Calculate the number of pages, which fit into this large * page starting at address: @@ -514,7 +518,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); + old_prot = req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); @@ -540,10 +544,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, req_prot = canon_pgprot(req_prot); /* - * old_pte points to the large page base address. So we need + * old_pfn points to the large page base pfn. So we need * to add the offset of the virtual address: */ - pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; new_prot = static_protections(req_prot, address, pfn); @@ -554,7 +558,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * the pages in the range we try to preserve: */ addr = address & pmask; - pfn = pte_pfn(old_pte); + pfn = old_pfn; for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(req_prot, addr, pfn); @@ -584,7 +588,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * The address is aligned and the number of pages * covers the full page. */ - new_pte = pfn_pte(pte_pfn(old_pte), new_prot); + new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; do_split = 0; From d551aaa2f7e1387fa66093ce9914c2e91f283a50 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:23 -0600 Subject: [PATCH 10/12] x86/mm: Fix __split_large_page() to handle large PAT bit __split_large_page() is called from __change_page_attr() to change the mapping attribute by splitting a given large page into smaller pages. This function uses pte_pfn() and pte_pgprot() for PUD/PMD, which do not handle the large PAT bit properly. Fix __split_large_page() by using the corresponding pud/pmd pfn/ pgprot interfaces. Also remove '#ifdef CONFIG_X86_64', which is not necessary. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-11-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3f612713159a..ab9465b9160f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -605,7 +605,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { pte_t *pbase = (pte_t *)page_address(base); - unsigned long pfn, pfninc = 1; + unsigned long ref_pfn, pfn, pfninc = 1; unsigned int i, level; pte_t *tmp; pgprot_t ref_prot; @@ -622,26 +622,33 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); - ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* promote PAT bit to correct position */ - if (level == PG_LEVEL_2M) + switch (level) { + case PG_LEVEL_2M: + ref_prot = pmd_pgprot(*(pmd_t *)kpte); + /* clear PSE and promote PAT bit to correct position */ ref_prot = pgprot_large_2_4k(ref_prot); + ref_pfn = pmd_pfn(*(pmd_t *)kpte); + break; -#ifdef CONFIG_X86_64 - if (level == PG_LEVEL_1G) { + case PG_LEVEL_1G: + ref_prot = pud_pgprot(*(pud_t *)kpte); + ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + /* - * Set the PSE flags only if the PRESENT flag is set + * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true * even on a non present pmd. */ - if (pgprot_val(ref_prot) & _PAGE_PRESENT) - pgprot_val(ref_prot) |= _PAGE_PSE; - else + if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; + break; + + default: + spin_unlock(&pgd_lock); + return 1; } -#endif /* * Set the GLOBAL flags only if the PRESENT flag is set @@ -657,7 +664,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, /* * Get the target pfn from the original entry: */ - pfn = pte_pfn(*kpte); + pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); From 55696b1f664e52b3036f21631f9c2247b667f587 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:24 -0600 Subject: [PATCH 11/12] x86/mm: Fix no-change case in try_preserve_large_page() try_preserve_large_page() checks if new_prot is the same as old_prot. If so, it simply sets do_split to 0, and returns with no-operation. However, old_prot is set as a 4KB pgprot value while new_prot is a large page pgprot value. Now that old_prot is initially set from p?d_pgprot() as a large page pgprot value, fix it by not overwriting old_prot with a 4KB pgprot value. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-12-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index ab9465b9160f..e2621a8e8213 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -518,7 +518,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pgprot_large_2_4k(old_prot); + req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); From e1a58320a38dfa72be48a0f1a3a92273663ba6db Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 5 Oct 2015 12:55:20 -0400 Subject: [PATCH 12/12] x86/mm: Warn on W^X mappings Warn on any residual W+X mappings after setting NX if DEBUG_WX is enabled. Introduce a separate X86_PTDUMP_CORE config that enables the code for dumping the page tables without enabling the debugfs interface, so that DEBUG_WX can be enabled without exposing the debugfs interface. Switch EFI_PGT_DUMP to using X86_PTDUMP_CORE so that it also does not require enabling the debugfs interface. On success it prints this to the kernel log: x86/mm: Checked W+X mappings: passed, no W+X pages found. On failure it prints a warning and a count of the failed pages: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 1 at arch/x86/mm/dump_pagetables.c:226 note_page+0x610/0x7b0() x86/mm: Found insecure W+X mapping at address ffffffff81755000/__stop___ex_table+0xfa8/0xabfa8 [...] Call Trace: [] dump_stack+0x44/0x55 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_fmt+0x5c/0x80 [] ? note_page+0x5c9/0x7b0 [] note_page+0x610/0x7b0 [] ptdump_walk_pgd_level_core+0x259/0x3c0 [] ptdump_walk_pgd_level_checkwx+0x17/0x20 [] mark_rodata_ro+0xf5/0x100 [] ? rest_init+0x80/0x80 [] kernel_init+0x1d/0xe0 [] ret_from_fork+0x3f/0x70 [] ? rest_init+0x80/0x80 ---[ end trace a1f23a1e42a2ac76 ]--- x86/mm: Checked W+X mappings: FAILED, 171 W+X pages found. Signed-off-by: Stephen Smalley Acked-by: Kees Cook Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1444064120-11450-1-git-send-email-sds@tycho.nsa.gov [ Improved the Kconfig help text and made the new option default-y if CONFIG_DEBUG_RODATA=y, because it already found buggy mappings, so we really want people to have this on by default. ] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 36 ++++++++++++++++++++++++++++- arch/x86/include/asm/pgtable.h | 7 ++++++ arch/x86/mm/Makefile | 2 +- arch/x86/mm/dump_pagetables.c | 42 +++++++++++++++++++++++++++++++++- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 2 ++ 6 files changed, 88 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d8c0d3266173..3e0baf726eef 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -65,10 +65,14 @@ config EARLY_PRINTK_EFI This is useful for kernel debugging when your machine crashes very early before the console code is initialized. +config X86_PTDUMP_CORE + def_bool n + config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL select DEBUG_FS + select X86_PTDUMP_CORE ---help--- Say Y here if you want to show the kernel pagetable layout in a debugfs file. This information is only useful for kernel developers @@ -79,7 +83,8 @@ config X86_PTDUMP config EFI_PGT_DUMP bool "Dump the EFI pagetable" - depends on EFI && X86_PTDUMP + depends on EFI + select X86_PTDUMP_CORE ---help--- Enable this if you want to dump the EFI page table before enabling virtual mode. This can be used to debug miscellaneous @@ -105,6 +110,35 @@ config DEBUG_RODATA_TEST feature as well as for the change_page_attr() infrastructure. If in doubt, say "N" +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on DEBUG_RODATA + default y + select X86_PTDUMP_CORE + ---help--- + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + x86/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + x86/mm: Checked W+X mappings: FAILED, W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config DEBUG_SET_MODULE_RONX bool "Set loadable kernel module data as NX and text as RO" depends on MODULES diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 59fc3414c68b..c0b41f111a9a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -19,6 +19,13 @@ #include void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_checkwx(void); + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#else +#define debug_checkwx() do { } while (0) +#endif /* * ZERO_PAGE is a global shared page that is always zero: used diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index a482d105172b..65c47fda26fc 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 71ab2d741024..1bf417e9cc13 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -32,6 +32,8 @@ struct pg_state { const struct addr_marker *marker; unsigned long lines; bool to_dmesg; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -214,6 +216,16 @@ static void note_page(struct seq_file *m, struct pg_state *st, const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; + pgprotval_t pr = pgprot_val(st->current_prot); + + if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { + WARN_ONCE(1, + "x86/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, + (void *)st->start_address); + st->wx_pages += (st->current_address - + st->start_address) / PAGE_SIZE; + } /* * Now print the actual finished series @@ -346,7 +358,8 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, + bool checkwx) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_level4_pgt; @@ -362,6 +375,10 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) st.to_dmesg = true; } + st.check_wx = checkwx; + if (checkwx) + st.wx_pages = 0; + for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start)) { @@ -381,8 +398,26 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) /* Flush out the last page */ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); note_page(m, &st, __pgprot(0), 0); + if (!checkwx) + return; + if (st.wx_pages) + pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", + st.wx_pages); + else + pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); } +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +{ + ptdump_walk_pgd_level_core(m, pgd, false); +} + +void ptdump_walk_pgd_level_checkwx(void) +{ + ptdump_walk_pgd_level_core(NULL, NULL, true); +} + +#ifdef CONFIG_X86_PTDUMP static int ptdump_show(struct seq_file *m, void *v) { ptdump_walk_pgd_level(m, NULL); @@ -400,10 +435,13 @@ static const struct file_operations ptdump_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif static int pt_dump_init(void) { +#ifdef CONFIG_X86_PTDUMP struct dentry *pe; +#endif #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ @@ -415,10 +453,12 @@ static int pt_dump_init(void) address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif +#ifdef CONFIG_X86_PTDUMP pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, &ptdump_fops); if (!pe) return -ENOMEM; +#endif return 0; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7562f42914b4..cb4ef3de61f9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -957,6 +957,8 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); + if (__supported_pte_mask & _PAGE_NX) + debug_checkwx(); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 30564e2752d3..f8b157366700 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void) free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(rodata_end)), (unsigned long) __va(__pa_symbol(_sdata))); + + debug_checkwx(); } #endif