linux/arch/x86/entry/vdso/vdso2c.h
Sean Christopherson 8382c668ce x86/vdso: Add support for exception fixup in vDSO functions
Signals are a horrid little mechanism.  They are especially nasty in
multi-threaded environments because signal state like handlers is global
across the entire process.  But, signals are basically the only way that
userspace can “gracefully” handle and recover from exceptions.

The kernel generally does not like exceptions to occur during execution.
But, exceptions are a fact of life and must be handled in some
circumstances.  The kernel handles them by keeping a list of individual
instructions which may cause exceptions.  Instead of truly handling the
exception and returning to the instruction that caused it, the kernel
instead restarts execution at a *different* instruction.  This makes it
obvious to that thread of execution that the exception occurred and lets
*that* code handle the exception instead of the handler.

This is not dissimilar to the try/catch exceptions mechanisms that some
programming languages have, but applied *very* surgically to single
instructions.  It effectively changes the visible architecture of the
instruction.

Problem
=======

SGX generates a lot of signals, and the code to enter and exit enclaves and
muck with signal handling is truly horrid.  At the same time, an approach
like kernel exception fixup can not be easily applied to userspace
instructions because it changes the visible instruction architecture.

Solution
========

The vDSO is a special page of kernel-provided instructions that run in
userspace.  Any userspace calling into the vDSO knows that it is special.
This allows the kernel a place to legitimately rewrite the user/kernel
contract and change instruction behavior.

Add support for fixing up exceptions that occur while executing in the
vDSO.  This replaces what could traditionally only be done with signal
handling.

This new mechanism will be used to replace previously direct use of SGX
instructions by userspace.

Just introduce the vDSO infrastructure.  Later patches will actually
replace signal generation with vDSO exception fixup.

Suggested-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Link: https://lkml.kernel.org/r/20201112220135.165028-17-jarkko@kernel.org
2020-11-18 18:02:50 +01:00

223 lines
6.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is included twice from vdso2c.c. It generates code for 32-bit
* and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs
* are built for 32-bit userspace.
*/
static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len)
{
size_t i;
for (i = 0; i < len; i++) {
if (i % 10 == 0)
fprintf(outfile, "\n\t");
fprintf(outfile, "0x%02X, ", (int)(data)[i]);
}
}
/*
* Extract a section from the input data into a standalone blob. Used to
* capture kernel-only data that needs to persist indefinitely, e.g. the
* exception fixup tables, but only in the kernel, i.e. the section can
* be stripped from the final vDSO image.
*/
static void BITSFUNC(extract)(const unsigned char *data, size_t data_len,
FILE *outfile, ELF(Shdr) *sec, const char *name)
{
unsigned long offset;
size_t len;
offset = (unsigned long)GET_LE(&sec->sh_offset);
len = (size_t)GET_LE(&sec->sh_size);
if (offset + len > data_len)
fail("section to extract overruns input data");
fprintf(outfile, "static const unsigned char %s[%lu] = {", name, len);
BITSFUNC(copy)(outfile, data + offset, len);
fprintf(outfile, "\n};\n\n");
}
static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
void *stripped_addr, size_t stripped_len,
FILE *outfile, const char *image_name)
{
int found_load = 0;
unsigned long load_size = -1; /* Work around bogus warning */
unsigned long mapping_size;
ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
unsigned long i, syms_nr;
ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
*alt_sec = NULL, *extable_sec = NULL;
ELF(Dyn) *dyn = 0, *dyn_end = 0;
const char *secstrings;
INT_BITS syms[NSYMS] = {};
ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
if (GET_LE(&hdr->e_type) != ET_DYN)
fail("input is not a shared object\n");
/* Walk the segment table. */
for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
if (GET_LE(&pt[i].p_type) == PT_LOAD) {
if (found_load)
fail("multiple PT_LOAD segs\n");
if (GET_LE(&pt[i].p_offset) != 0 ||
GET_LE(&pt[i].p_vaddr) != 0)
fail("PT_LOAD in wrong place\n");
if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz))
fail("cannot handle memsz != filesz\n");
load_size = GET_LE(&pt[i].p_memsz);
found_load = 1;
} else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
dyn = raw_addr + GET_LE(&pt[i].p_offset);
dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
GET_LE(&pt[i].p_memsz);
}
}
if (!found_load)
fail("no PT_LOAD seg\n");
if (stripped_len < load_size)
fail("stripped input is too short\n");
if (!dyn)
fail("input has no PT_DYNAMIC section -- your toolchain is buggy\n");
/* Walk the dynamic table */
for (i = 0; dyn + i < dyn_end &&
GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
tag == DT_RELENT || tag == DT_TEXTREL)
fail("vdso image contains dynamic relocations\n");
}
/* Walk the section table */
secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * i;
if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
symtab_hdr = sh;
if (!strcmp(secstrings + GET_LE(&sh->sh_name),
".altinstructions"))
alt_sec = sh;
if (!strcmp(secstrings + GET_LE(&sh->sh_name), "__ex_table"))
extable_sec = sh;
}
if (!symtab_hdr)
fail("no symbol table\n");
strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
/* Walk the symbol table */
for (i = 0; i < syms_nr; i++) {
unsigned int k;
ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
const char *sym_name = raw_addr +
GET_LE(&strtab_hdr->sh_offset) +
GET_LE(&sym->st_name);
for (k = 0; k < NSYMS; k++) {
if (!strcmp(sym_name, required_syms[k].name)) {
if (syms[k]) {
fail("duplicate symbol %s\n",
required_syms[k].name);
}
/*
* Careful: we use negative addresses, but
* st_value is unsigned, so we rely
* on syms[k] being a signed type of the
* correct width.
*/
syms[k] = GET_LE(&sym->st_value);
}
}
}
/* Validate mapping addresses. */
for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
INT_BITS symval = syms[special_pages[i]];
if (!symval)
continue; /* The mapping isn't used; ignore it. */
if (symval % 4096)
fail("%s must be a multiple of 4096\n",
required_syms[i].name);
if (symval + 4096 < syms[sym_vvar_start])
fail("%s underruns vvar_start\n",
required_syms[i].name);
if (symval + 4096 > 0)
fail("%s is on the wrong side of the vdso text\n",
required_syms[i].name);
}
if (syms[sym_vvar_start] % 4096)
fail("vvar_begin must be a multiple of 4096\n");
if (!image_name) {
fwrite(stripped_addr, stripped_len, 1, outfile);
return;
}
mapping_size = (stripped_len + 4095) / 4096 * 4096;
fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
fprintf(outfile, "#include <linux/linkage.h>\n");
fprintf(outfile, "#include <asm/page_types.h>\n");
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
fprintf(outfile,
"static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
mapping_size);
for (i = 0; i < stripped_len; i++) {
if (i % 10 == 0)
fprintf(outfile, "\n\t");
fprintf(outfile, "0x%02X, ",
(int)((unsigned char *)stripped_addr)[i]);
}
fprintf(outfile, "\n};\n\n");
if (extable_sec)
BITSFUNC(extract)(raw_addr, raw_len, outfile,
extable_sec, "extable");
fprintf(outfile, "const struct vdso_image %s = {\n", image_name);
fprintf(outfile, "\t.data = raw_data,\n");
fprintf(outfile, "\t.size = %lu,\n", mapping_size);
if (alt_sec) {
fprintf(outfile, "\t.alt = %lu,\n",
(unsigned long)GET_LE(&alt_sec->sh_offset));
fprintf(outfile, "\t.alt_len = %lu,\n",
(unsigned long)GET_LE(&alt_sec->sh_size));
}
if (extable_sec) {
fprintf(outfile, "\t.extable_base = %lu,\n",
(unsigned long)GET_LE(&extable_sec->sh_offset));
fprintf(outfile, "\t.extable_len = %lu,\n",
(unsigned long)GET_LE(&extable_sec->sh_size));
fprintf(outfile, "\t.extable = extable,\n");
}
for (i = 0; i < NSYMS; i++) {
if (required_syms[i].export && syms[i])
fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
required_syms[i].name, (int64_t)syms[i]);
}
fprintf(outfile, "};\n");
}