linux/arch/x86/boot/compressed/ident_map_64.c
Arvind Sankar b17a45b6e5 x86/boot/64: Explicitly map boot_params and command line
Commits

  ca0e22d4f0 ("x86/boot/compressed/64: Always switch to own page table")
  8570978ea0 ("x86/boot/compressed/64: Don't pre-map memory in KASLR code")

set up a new page table in the decompressor stub, but without explicit
mappings for boot_params and the kernel command line, relying on the #PF
handler instead.

This is fragile, as boot_params and the command line mappings are
required for the main kernel. If EARLY_PRINTK and RANDOMIZE_BASE are
disabled, a QEMU/OVMF boot never accesses the command line in the
decompressor stub, and so it never gets mapped. The main kernel accesses
it from the identity mapping if AMD_MEM_ENCRYPT is enabled, and will
crash.

Fix this by adding back the explicit mapping of boot_params and the
command line.

Note: the changes also removed the explicit mapping of the main kernel,
with the result that .bss and .brk may not be in the identity mapping,
but those don't get accessed by the main kernel before it switches to
its own page tables.

 [ bp: Pass boot_params with a MOV %rsp... instead of PUSH/POP. Use
   block formatting for the comment. ]

Signed-off-by: Arvind Sankar <nivedita@alum.mit.edu>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Joerg Roedel <jroedel@suse.de>
Link: https://lkml.kernel.org/r/20201016200404.1615994-1-nivedita@alum.mit.edu
2020-10-19 19:39:50 +02:00

361 lines
10 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* This code is used on x86_64 to create page table identity mappings on
* demand by building up a new set of page tables (or appending to the
* existing ones), and then switching over to them when ready.
*
* Copyright (C) 2015-2016 Yinghai Lu
* Copyright (C) 2016 Kees Cook
*/
/*
 * Since we're dealing with identity mappings, physical and virtual
 * addresses are the same, so override these defines which are ultimately
 * used by the headers in misc.h.
 */
/* Physical address of x: simply the numeric value of x. */
#define __pa(x) ((unsigned long)(x))
/* Virtual address of x: the same numeric value, viewed as a pointer. */
#define __va(x) ((void *)((unsigned long)(x)))
/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION
#include "error.h"
#include "misc.h"
/* These actually do the work of building the kernel identity maps. */
#include <linux/pgtable.h>
#include <asm/cmpxchg.h>
#include <asm/trap_pf.h>
#include <asm/trapnr.h>
#include <asm/init.h>
/* Use the static base for this part of the boot process */
#undef __PAGE_OFFSET
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c"
#define _SETUP
#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
#undef _SETUP
/*
 * Defined outside this file. The value it returns is used by
 * initialize_identity_maps() as the start address of the kernel command
 * line.
 */
extern unsigned long get_cmd_line_ptr(void);
/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;
/*
 * Used to track our page table allocation area: a buffer from which
 * alloc_pgt_page() hands out pages by advancing an offset.
 */
struct alloc_pgt_data {
/* Start of the buffer that pages are handed out from. */
unsigned char *pgt_buf;
/* Size of the buffer; allocation stops once the offset reaches it. */
unsigned long pgt_buf_size;
/* Offset into pgt_buf of the next page to hand out. */
unsigned long pgt_buf_offset;
};
/*
 * Hands out the next page of the allocation area described by the
 * struct alloc_pgt_data that @context points to, and advances the area's
 * offset by PAGE_SIZE.
 *
 * Returns the address of the page, or NULL (after emitting debug output)
 * when the offset has already reached the size of the area.
 *
 * Besides the local callers, this is installed as the allocation
 * callback in mapping_info.
 */
static void *alloc_pgt_page(void *context)
{
	struct alloc_pgt_data *pages = context;
	unsigned long used = pages->pgt_buf_offset;

	/* No room left for another page: report and give up. */
	if (used >= pages->pgt_buf_size) {
		debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
		return NULL;
	}

	pages->pgt_buf_offset = used + PAGE_SIZE;

	return pages->pgt_buf + used;
}
/* Used to track our allocated page tables. */
static struct alloc_pgt_data pgt_data;
/*
 * Address of the top level page table: either the value read from CR3 or
 * a page taken from pgt_data, as decided in initialize_identity_maps().
 * This is the value written back to CR3.
 */
static unsigned long top_level_pgt;
/*
 * Starts out with all bits below __PHYSICAL_MASK_SHIFT set;
 * initialize_identity_maps() removes the sme_me_mask bits from it.
 */
phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
/*
* Mapping information structure passed to kernel_ident_mapping_init().
* Due to relocation, pointers must be assigned at run time not build time.
*/
static struct x86_mapping_info mapping_info;
/*
 * Identity-maps the range [start, end) in the page table rooted at
 * top_level_pgt.
 *
 * The range is first widened to 2M (PMD_SIZE) boundaries. A range that is
 * empty after alignment is ignored. A failure of
 * kernel_ident_mapping_init() is reported through error().
 */
static void add_identity_map(unsigned long start, unsigned long end)
{
	unsigned long first = round_down(start, PMD_SIZE);
	unsigned long limit = round_up(end, PMD_SIZE);

	/* Nothing to map. */
	if (first >= limit)
		return;

	if (kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
				      first, limit))
		error("Error: kernel_ident_mapping_init() failed\n");
}
/* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void *rmode)
{
unsigned long cmdline;
/* Exclude the encryption mask from __PHYSICAL_MASK */
physical_mask &= ~sme_me_mask;
/* Init mapping_info with run-time function/buffer pointers. */
mapping_info.alloc_pgt_page = alloc_pgt_page;
mapping_info.context = &pgt_data;
mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
mapping_info.kernpg_flag = _KERNPG_TABLE;
/*
* It should be impossible for this not to already be true,
* but since calling this a second time would rewind the other
* counters, let's just make sure this is reset too.
*/
pgt_data.pgt_buf_offset = 0;
/*
* If we came here via startup_32(), cr3 will be _pgtable already
* and we must append to the existing area instead of entirely
* overwriting it.
*
* With 5-level paging, we use '_pgtable' to allocate the p4d page table,
* the top-level page table is allocated separately.
*
* p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
* cases. On 4-level paging it's equal to 'top_level_pgt'.
*/
top_level_pgt = read_cr3_pa();
if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
} else {
pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
}
/*
* New page-table is set up - map the kernel image, boot_params and the
* command line. The uncompressed kernel requires boot_params and the
* command line to be mapped in the identity mapping. Map them
* explicitly here in case the compressed kernel does not touch them,
* or does not touch all the pages covering them.
*/
add_identity_map((unsigned long)_head, (unsigned long)_end);
boot_params = rmode;
add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1));
cmdline = get_cmd_line_ptr();
add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
/* Load the new page-table. */
write_cr3(top_level_pgt);
}
/*
 * This switches the page tables to the new level4 that has been built
 * via calls to add_identity_map() above. If booted via startup_32(),
 * this is effectively a no-op.
 *
 * It writes top_level_pgt to CR3. initialize_identity_maps() ends with
 * the same CR3 write, and top_level_pgt is only assigned there.
 */
void finalize_identity_maps(void)
{
write_cr3(top_level_pgt);
}
/*
 * Replaces the large mapping at @pmdp with a newly allocated page of
 * PTEs. The PTEs cover the PMD-aligned range around @__address in
 * PAGE_SIZE steps and use info->page_flag with _PAGE_PSE removed.
 *
 * Returns a pointer to the PTE for @__address, or NULL if
 * info->alloc_pgt_page() could not provide a page.
 */
static pte_t *split_large_pmd(struct x86_mapping_info *info,
			      pmd_t *pmdp, unsigned long __address)
{
	/* No large page - clear PSE flag */
	unsigned long small_flags = info->page_flag & ~_PAGE_PSE;
	unsigned long pa = __address & PMD_MASK;
	pte_t *table;
	int idx;

	table = (pte_t *)info->alloc_pgt_page(info->context);
	if (!table)
		return NULL;

	/*
	 * Populate the PTEs.
	 *
	 * NOTE(review): the bound is PTRS_PER_PMD although a PTE page is
	 * being filled; this presumably relies on PTRS_PER_PTE having the
	 * same value - confirm.
	 */
	for (idx = 0; idx < PTRS_PER_PMD; idx++, pa += PAGE_SIZE)
		set_pte(&table[idx], __pte(pa | small_flags));

	/*
	 * Ideally the large PMD would be cleared and the TLB flushed before
	 * the new PMD is written. That is not possible here because the 2M
	 * range may contain the code being executed and/or the current
	 * stack. It should still be safe: the change goes from a large to
	 * small mappings and nobody else uses this page-table, so there is
	 * no chance of a TLB multihit.
	 */
	set_pmd(pmdp, __pmd((unsigned long)table | info->kernpg_flag));

	/* Flush TLB to establish the new PMD */
	write_cr3(top_level_pgt);

	return &table[pte_index(__address)];
}
/*
 * Flushes the page containing @address from the caches, one 64-byte
 * cache line at a time, after issuing an mfence.
 */
static void clflush_page(unsigned long address)
{
	/*
	 * The cache-line size is hardcoded to 64: CPUID can't be used here
	 * because it might raise another #VC exception while the GHCB is not
	 * ready to use yet.
	 */
	const unsigned int line_size = 64;
	char *line = (char *)(address & PAGE_MASK);
	char *stop = line + PAGE_SIZE;

	/* Make sure no writes are pending on the cache lines to flush. */
	asm volatile("mfence" : : : "memory");

	while (line != stop) {
		clflush(line);
		line += line_size;
	}
}
/*
 * Sets the flags in @set and clears the flags in @clr in the PTE that
 * maps @address in the page table rooted at top_level_pgt. A large PMD
 * mapping is split into PTEs first.
 *
 * Returns 0 on success, or -ENOMEM when no PTE pointer could be obtained
 * (e.g. split_large_pmd() failed to allocate a page).
 */
static int set_clr_page_flags(struct x86_mapping_info *info,
			      unsigned long address,
			      pteval_t set, pteval_t clr)
{
	pgd_t *root = (pgd_t *)top_level_pgt;
	pmd_t *pmd_entry;
	pte_t *pte_entry;

	/*
	 * Make sure a PMD mapping for 'address' exists. It should already,
	 * but keep things generic.
	 *
	 * Reading from the page faults it in when it is not mapped yet.
	 * add_identity_map() can't be used for this: it would
	 * unconditionally map the address at PMD level and destroy any
	 * PTE-level mappings that may already exist. The read is done in
	 * assembly so that it can't be optimized away.
	 */
	asm volatile("mov %[address], %%r9"
		     :: [address] "g" (*(unsigned long *)address)
		     : "r9", "memory");

	/*
	 * The page is now mapped at least with PMD size, so walk straight
	 * down to the PMD without further checks.
	 */
	pmd_entry = pmd_offset(pud_offset(p4d_offset(root, address), address),
			       address);

	pte_entry = pmd_large(*pmd_entry) ?
			split_large_pmd(info, pmd_entry, address) :
			pte_offset_kernel(pmd_entry, address);
	if (!pte_entry)
		return -ENOMEM;

	/*
	 * A page whose encryption attribute changes has to be flushed from
	 * the caches.
	 */
	if ((set | clr) & _PAGE_ENC)
		clflush_page(address);

	/* Update PTE */
	set_pte(pte_entry,
		pte_clear_flags(pte_set_flags(*pte_entry, set), clr));

	/* Flush TLB after changing encryption attribute */
	write_cr3(top_level_pgt);

	return 0;
}
/*
 * Clears _PAGE_ENC in the PTE mapping @address.
 * Returns the result of set_clr_page_flags(): 0 on success, -ENOMEM on
 * failure.
 */
int set_page_decrypted(unsigned long address)
{
return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
}
/*
 * Sets _PAGE_ENC in the PTE mapping @address.
 * Returns the result of set_clr_page_flags(): 0 on success, -ENOMEM on
 * failure.
 */
int set_page_encrypted(unsigned long address)
{
return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
}
/*
 * Clears _PAGE_PRESENT in the PTE mapping @address.
 * Returns the result of set_clr_page_flags(): 0 on success, -ENOMEM on
 * failure.
 */
int set_page_non_present(unsigned long address)
{
return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
}
/*
 * Prints a page-fault report and then calls error().
 *
 * @msg:        headline printed first
 * @error_code: printed after "Error Code: "
 * @address:    printed after "CR2: 0x"
 * @ip:         instruction pointer; printed as an offset from _head
 */
static void do_pf_error(const char *msg, unsigned long error_code,
			unsigned long address, unsigned long ip)
{
	unsigned long ip_offset = ip - (unsigned long)_head;

	error_putstr(msg);

	error_putstr("\nError Code: ");
	error_puthex(error_code);

	error_putstr("\nCR2: 0x");
	error_puthex(address);

	error_putstr("\nRIP relative to _head: 0x");
	error_puthex(ip_offset);
	error_putstr("\n");

	error("Stopping.\n");
}
/*
 * Page-fault handler: identity-maps the 2M region around the faulting
 * address read from CR2.
 *
 * @regs:       register state at the time of the fault; only regs->ip is
 *              used, for the error report
 * @error_code: page-fault error code
 *
 * Faults with an unexpected error code, and faults for which
 * sev_es_check_ghcb_fault() returns true, are reported through
 * do_pf_error(). The report carries the exact address read from CR2, not
 * the 2M-aligned address used for the mapping, so that the low bits of
 * the faulting address are not lost from the diagnostic.
 */
void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long fault_addr = native_read_cr2();
	bool ghcb_fault = sev_es_check_ghcb_fault(fault_addr);
	/* 2M-aligned region around the fault; used for the mapping only. */
	unsigned long start = fault_addr & PMD_MASK;
	unsigned long end = start + PMD_SIZE;

	/*
	 * Check for unexpected error codes. Unexpected are:
	 *	- Faults on present pages
	 *	- User faults
	 *	- Reserved bits set
	 */
	if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
		do_pf_error("Unexpected page-fault:", error_code, fault_addr, regs->ip);
	else if (ghcb_fault)
		do_pf_error("Page-fault on GHCB page:", error_code, fault_addr, regs->ip);

	/*
	 * Error code is sane - now identity map the 2M region around
	 * the faulting address.
	 */
	add_identity_map(start, end);
}