x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()

The kernel has several code paths that read CR3.  Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/883f8fb121f4616c1c1427ad87350bb2f5ffeca1.1497288170.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Andy Lutomirski 2017-06-12 10:26:14 -07:00 committed by Ingo Molnar
parent 3f365cf304
commit 6c690ee103
20 changed files with 79 additions and 29 deletions

View File

@ -92,7 +92,7 @@ void initialize_identity_maps(void)
* and we must append to the existing area instead of entirely * and we must append to the existing area instead of entirely
* overwriting it. * overwriting it.
*/ */
level4p = read_cr3(); level4p = read_cr3_pa();
if (level4p == (unsigned long)_pgtable) { if (level4p == (unsigned long)_pgtable) {
debug_putstr("booted via startup_32()\n"); debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;

View File

@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin(); \ __kernel_fpu_begin(); \
\ \
if (efi_scratch.use_pgd) { \ if (efi_scratch.use_pgd) { \
efi_scratch.prev_cr3 = read_cr3(); \ efi_scratch.prev_cr3 = __read_cr3(); \
write_cr3((unsigned long)efi_scratch.efi_pgt); \ write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \ __flush_tlb_all(); \
} \ } \

View File

@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* /*
* This can be used from process context to figure out what the value of * This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) read_cr3(). * CR3 is without needing to do a (slow) __read_cr3().
* *
* It's intended to be used for code like KVM that sneakily changes CR3 * It's intended to be used for code like KVM that sneakily changes CR3
* and needs to restore it. It needs to be used very carefully. * and needs to restore it. It needs to be used very carefully.
@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void)
/* For now, be very restrictive about when this can be called. */ /* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || !in_atomic()); VM_WARN_ON(in_nmi() || !in_atomic());
VM_BUG_ON(cr3 != read_cr3()); VM_BUG_ON(cr3 != __read_cr3());
return cr3; return cr3;
} }

View File

@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x); PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
} }
static inline unsigned long read_cr3(void) static inline unsigned long __read_cr3(void)
{ {
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
} }

View File

@ -8,4 +8,40 @@
#else #else
#define X86_VM_MASK 0 /* No VM86 support */ #define X86_VM_MASK 0 /* No VM86 support */
#endif #endif
/*
* CR3's layout varies depending on several things.
*
* If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
* If PAE is enabled, then CR3[11:5] is part of the PDPT address
* (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
* Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
* CR3[2:0] and CR3[11:5] are ignored.
*
* In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
*
* CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
* written as 1 to prevent the write to CR3 from flushing the TLB.
*
* On systems with SME, one bit (in a variable position!) is stolen to indicate
* that the top-level paging structure is encrypted.
*
* All of the remaining bits indicate the physical address of the top-level
* paging structure.
*
* CR3_ADDR_MASK is the mask used by read_cr3_pa().
*/
#ifdef CONFIG_X86_64
/* Mask off the address space ID bits. */
#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
#define CR3_PCID_MASK 0xFFFull
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
* a tiny bit of code size by setting all the bits.
*/
#define CR3_ADDR_MASK 0xFFFFFFFFull
#define CR3_PCID_MASK 0ull
#endif
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */ #endif /* _ASM_X86_PROCESSOR_FLAGS_H */

View File

@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
native_cpuid_reg(ecx) native_cpuid_reg(ecx)
native_cpuid_reg(edx) native_cpuid_reg(edx)
/*
* Friendlier CR3 helpers.
*/
static inline unsigned long read_cr3_pa(void)
{
return __read_cr3() & CR3_ADDR_MASK;
}
static inline void load_cr3(pgd_t *pgdir) static inline void load_cr3(pgd_t *pgdir)
{ {
write_cr3(__pa(pgdir)); write_cr3(__pa(pgdir));

View File

@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
} }
static inline unsigned long native_read_cr3(void) static inline unsigned long __native_read_cr3(void)
{ {
unsigned long val; unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
native_write_cr2(x); native_write_cr2(x);
} }
static inline unsigned long read_cr3(void) /*
* Careful! CR3 contains more than just an address. You probably want
* read_cr3_pa() instead.
*/
static inline unsigned long __read_cr3(void)
{ {
return native_read_cr3(); return __native_read_cr3();
} }
static inline void write_cr3(unsigned long x) static inline void write_cr3(unsigned long x)

View File

@ -156,7 +156,7 @@ static inline void __native_flush_tlb(void)
* back: * back:
*/ */
preempt_disable(); preempt_disable();
native_write_cr3(native_read_cr3()); native_write_cr3(__native_read_cr3());
preempt_enable(); preempt_enable();
} }
@ -264,7 +264,7 @@ static inline void reset_lazy_tlbstate(void)
this_cpu_write(cpu_tlbstate.state, 0); this_cpu_write(cpu_tlbstate.state, 0);
this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm); this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir)); WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir));
} }
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,

View File

@ -55,7 +55,8 @@ int __init early_make_pgtable(unsigned long address)
pmdval_t pmd, *pmd_p; pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */ /* Invalid address or early pgt is done ? */
if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) if (physaddr >= MAXMEM ||
read_cr3_pa() != __pa_nodebug(early_level4_pgt))
return -1; return -1;
again: again:

View File

@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2, .read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2, .write_cr2 = native_write_cr2,
.read_cr3 = native_read_cr3, .read_cr3 = __native_read_cr3,
.write_cr3 = native_write_cr3, .write_cr3 = native_write_cr3,
.flush_tlb_user = native_flush_tlb, .flush_tlb_user = native_flush_tlb,

View File

@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0(); cr0 = read_cr0();
cr2 = read_cr2(); cr2 = read_cr2();
cr3 = read_cr3(); cr3 = __read_cr3();
cr4 = __read_cr4(); cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4); cr0, cr2, cr3, cr4);

View File

@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0(); cr0 = read_cr0();
cr2 = read_cr2(); cr2 = read_cr2();
cr3 = read_cr3(); cr3 = __read_cr3();
cr4 = __read_cr4(); cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",

View File

@ -5024,7 +5024,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
* Save the most likely value for this task's CR3 in the VMCS. * Save the most likely value for this task's CR3 in the VMCS.
* We can't use __get_current_cr3_fast() because we're not atomic. * We can't use __get_current_cr3_fast() because we're not atomic.
*/ */
cr3 = read_cr3(); cr3 = __read_cr3();
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
vmx->host_state.vmcs_host_cr3 = cr3; vmx->host_state.vmcs_host_cr3 = cr3;

View File

@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
* Do _not_ use "current" here. We might be inside * Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch.. * an interrupt in the middle of a task switch..
*/ */
pgd_paddr = read_cr3(); pgd_paddr = read_cr3_pa();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k) if (!pmd_k)
return -1; return -1;
@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
static void dump_pagetable(unsigned long address) static void dump_pagetable(unsigned long address)
{ {
pgd_t *base = __va(read_cr3()); pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(address)]; pgd_t *pgd = &base[pgd_index(address)];
p4d_t *p4d; p4d_t *p4d;
pud_t *pud; pud_t *pud;
@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
* happen within a race in page table update. In the latter * happen within a race in page table update. In the latter
* case just flush: * case just flush:
*/ */
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
pgd_ref = pgd_offset_k(address); pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref)) if (pgd_none(*pgd_ref))
return -1; return -1;
@ -555,7 +555,7 @@ static int bad_address(void *p)
static void dump_pagetable(unsigned long address) static void dump_pagetable(unsigned long address)
{ {
pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = base + pgd_index(address); pgd_t *pgd = base + pgd_index(address);
p4d_t *p4d; p4d_t *p4d;
pud_t *pud; pud_t *pud;
@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
pgd_t *pgd; pgd_t *pgd;
pte_t *pte; pte_t *pte;
pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd = __va(read_cr3_pa());
pgd += pgd_index(address); pgd += pgd_index(address);
pte = lookup_address_in_pgd(pgd, address, &level); pte = lookup_address_in_pgd(pgd, address, &level);

View File

@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{ {
/* Don't assume we're using swapper_pg_dir at this point */ /* Don't assume we're using swapper_pg_dir at this point */
pgd_t *base = __va(read_cr3()); pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(addr)]; pgd_t *pgd = &base[pgd_index(addr)];
p4d_t *p4d = p4d_offset(pgd, addr); p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr); pud_t *pud = pud_offset(p4d, addr);

View File

@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
int n_pgds, i, j; int n_pgds, i, j;
if (!efi_enabled(EFI_OLD_MEMMAP)) { if (!efi_enabled(EFI_OLD_MEMMAP)) {
save_pgd = (pgd_t *)read_cr3(); save_pgd = (pgd_t *)__read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt); write_cr3((unsigned long)efi_scratch.efi_pgt);
goto out; goto out;
} }
@ -646,7 +646,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
efi_sync_low_kernel_mappings(); efi_sync_low_kernel_mappings();
local_irq_save(flags); local_irq_save(flags);
efi_scratch.prev_cr3 = read_cr3(); efi_scratch.prev_cr3 = __read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt); write_cr3((unsigned long)efi_scratch.efi_pgt);
__flush_tlb_all(); __flush_tlb_all();

View File

@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
asmlinkage __visible int xo1_do_sleep(u8 sleep_state) asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
{ {
void *pgd_addr = __va(read_cr3()); void *pgd_addr = __va(read_cr3_pa());
/* Program wakeup mask (using dword access to CS5536_PM1_EN) */ /* Program wakeup mask (using dword access to CS5536_PM1_EN) */
outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);

View File

@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
*/ */
ctxt->cr0 = read_cr0(); ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2(); ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3(); ctxt->cr3 = __read_cr3();
ctxt->cr4 = __read_cr4(); ctxt->cr4 = __read_cr4();
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8(); ctxt->cr8 = read_cr8();

View File

@ -150,7 +150,8 @@ static int relocate_restore_code(void)
memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
/* Make the page containing the relocated code executable */ /* Make the page containing the relocated code executable */
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); pgd = (pgd_t *)__va(read_cr3_pa()) +
pgd_index(relocated_restore_code);
p4d = p4d_offset(pgd, relocated_restore_code); p4d = p4d_offset(pgd, relocated_restore_code);
if (p4d_large(*p4d)) { if (p4d_large(*p4d)) {
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));

View File

@ -2017,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
pmd_t pmd; pmd_t pmd;
pte_t pte; pte_t pte;
pa = read_cr3(); pa = read_cr3_pa();
pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
sizeof(pgd))); sizeof(pgd)));
if (!pgd_present(pgd)) if (!pgd_present(pgd))
@ -2097,7 +2097,7 @@ void __init xen_relocate_p2m(void)
pt_phys = pmd_phys + PFN_PHYS(n_pmd); pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt; p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3()); pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE); new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
idx_p4d = 0; idx_p4d = 0;
save_pud = n_pud; save_pud = n_pud;
@ -2204,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
{ {
unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
BUG_ON(read_cr3() != __pa(initial_page_table)); BUG_ON(read_cr3_pa() != __pa(initial_page_table));
BUG_ON(cr3 != __pa(swapper_pg_dir)); BUG_ON(cr3 != __pa(swapper_pg_dir));
/* /*