Merge branch 'for-next/pkey-signal' into for-next/core

* for-next/pkey-signal:
  : Bring arm64 pkey signal delivery in line with the x86 behaviour
  selftests/mm: Fix unused function warning for aarch64_write_signal_pkey()
  selftests/mm: Define PKEY_UNRESTRICTED for pkey_sighandler_tests
  selftests/mm: Enable pkey_sighandler_tests on arm64
  selftests/mm: Use generic pkey register manipulation
  arm64: signal: Remove unused macro
  arm64: signal: Remove unnecessary check when saving POE state
  arm64: signal: Improve POR_EL0 handling to avoid uaccess failures
  firmware: arm_sdei: Fix the input parameter of cpuhp_remove_state()
  Revert "kasan: Disable Software Tag-Based KASAN with GCC"
  kasan: Fix Software Tag-Based KASAN with GCC
  kasan: Disable Software Tag-Based KASAN with GCC
  Documentation/protection-keys: add AArch64 to documentation
  arm64: set POR_EL0 for kernel threads

# Conflicts:
#	arch/arm64/kernel/signal.c
This commit is contained in:
Catalin Marinas 2024-11-14 12:07:30 +00:00
commit 83ef4a378e
10 changed files with 222 additions and 55 deletions

View File

@ -12,7 +12,10 @@ Pkeys Userspace (PKU) is a feature which can be found on:
* Intel server CPUs, Skylake and later
* Intel client CPUs, Tiger Lake (11th Gen Core) and later
* Future AMD CPUs
* arm64 CPUs implementing the Permission Overlay Extension (FEAT_S1POE)
x86_64
======
Pkeys work by dedicating 4 previously Reserved bits in each page table entry to
a "protection key", giving 16 possible keys.
@ -28,6 +31,22 @@ register. The feature is only available in 64-bit mode, even though there is
theoretically space in the PAE PTEs. These permissions are enforced on data
access only and have no effect on instruction fetches.
arm64
=====
Pkeys use 3 bits in each page table entry, to encode a "protection key index",
giving 8 possible keys.
Protections for each key are defined with a per-CPU user-writable system
register (POR_EL0). This is a 64-bit register encoding read, write and execute
overlay permissions for each protection key index.
Being a CPU register, POR_EL0 is inherently thread-local, potentially giving
each thread a different set of protections from every other thread.
Unlike x86_64, the protection key permissions also apply to instruction
fetches.
Syscalls
========
@ -38,11 +57,10 @@ There are 3 system calls which directly interact with pkeys::
int pkey_mprotect(unsigned long start, size_t len,
unsigned long prot, int pkey);
Before a pkey can be used, it must first be allocated with
pkey_alloc(). An application calls the WRPKRU instruction
directly in order to change access permissions to memory covered
with a key. In this example WRPKRU is wrapped by a C function
called pkey_set().
Before a pkey can be used, it must first be allocated with pkey_alloc(). An
application writes to the architecture specific CPU register directly in order
to change access permissions to memory covered with a key. In this example
this is wrapped by a C function called pkey_set().
::
int real_prot = PROT_READ|PROT_WRITE;
@ -64,9 +82,9 @@ is no longer in use::
munmap(ptr, PAGE_SIZE);
pkey_free(pkey);
.. note:: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
An example implementation can be found in
tools/testing/selftests/x86/protection_keys.c.
.. note:: pkey_set() is a wrapper around writing to the CPU register.
Example implementations can be found in
tools/testing/selftests/mm/pkey-{arm64,powerpc,x86}.h
Behavior
========
@ -96,3 +114,7 @@ with a read()::
The kernel will send a SIGSEGV in both cases, but si_code will be set
to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when
the plain mprotect() permissions are violated.
Note that kernel accesses from a kthread (such as io_uring) will use a default
value for the protection key register and so will not be consistent with
userspace's value of the register or mprotect().

View File

@ -466,6 +466,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
p->thread.cpu_context.x19 = (unsigned long)args->fn;
p->thread.cpu_context.x20 = (unsigned long)args->fn_arg;
if (system_supports_poe())
p->thread.por_el0 = POR_EL0_INIT;
}
p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
p->thread.cpu_context.sp = (unsigned long)childregs;

View File

@ -19,6 +19,7 @@
#include <linux/ratelimit.h>
#include <linux/rseq.h>
#include <linux/syscalls.h>
#include <linux/pkeys.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
@ -72,10 +73,62 @@ struct rt_sigframe_user_layout {
unsigned long end_offset;
};
#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16)
/*
* Holds any EL0-controlled state that influences unprivileged memory accesses.
* This includes both accesses done in userspace and uaccess done in the kernel.
*
* This state needs to be carefully managed to ensure that it doesn't cause
* uaccess to fail when setting up the signal frame, and the signal handler
* itself also expects a well-defined state when entered.
*/
struct user_access_state {
u64 por_el0;
};
#define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16)
#define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16)
/*
* Save the user access state into ua_state and reset it to disable any
* restrictions.
*/
static void save_reset_user_access_state(struct user_access_state *ua_state)
{
if (system_supports_poe()) {
u64 por_enable_all = 0;
for (int pkey = 0; pkey < arch_max_pkey(); pkey++)
por_enable_all |= POE_RXW << (pkey * POR_BITS_PER_PKEY);
ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0);
write_sysreg_s(por_enable_all, SYS_POR_EL0);
/* Ensure that any subsequent uaccess observes the updated value */
isb();
}
}
/*
* Set the user access state for invoking the signal handler.
*
* No uaccess should be done after that function is called.
*/
static void set_handler_user_access_state(void)
{
if (system_supports_poe())
write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
}
/*
* Restore the user access state to the values saved in ua_state.
*
* No uaccess should be done after that function is called.
*/
static void restore_user_access_state(const struct user_access_state *ua_state)
{
if (system_supports_poe())
write_sysreg_s(ua_state->por_el0, SYS_POR_EL0);
}
static void init_user_layout(struct rt_sigframe_user_layout *user)
{
const size_t reserved_size =
@ -269,18 +322,20 @@ static int restore_fpmr_context(struct user_ctxs *user)
return err;
}
static int preserve_poe_context(struct poe_context __user *ctx)
static int preserve_poe_context(struct poe_context __user *ctx,
const struct user_access_state *ua_state)
{
int err = 0;
__put_user_error(POE_MAGIC, &ctx->head.magic, err);
__put_user_error(sizeof(*ctx), &ctx->head.size, err);
__put_user_error(read_sysreg_s(SYS_POR_EL0), &ctx->por_el0, err);
__put_user_error(ua_state->por_el0, &ctx->por_el0, err);
return err;
}
static int restore_poe_context(struct user_ctxs *user)
static int restore_poe_context(struct user_ctxs *user,
struct user_access_state *ua_state)
{
u64 por_el0;
int err = 0;
@ -290,7 +345,7 @@ static int restore_poe_context(struct user_ctxs *user)
__get_user_error(por_el0, &(user->poe->por_el0), err);
if (!err)
write_sysreg_s(por_el0, SYS_POR_EL0);
ua_state->por_el0 = por_el0;
return err;
}
@ -946,7 +1001,8 @@ invalid:
}
static int restore_sigframe(struct pt_regs *regs,
struct rt_sigframe __user *sf)
struct rt_sigframe __user *sf,
struct user_access_state *ua_state)
{
sigset_t set;
int i, err;
@ -998,7 +1054,7 @@ static int restore_sigframe(struct pt_regs *regs,
err = restore_zt_context(&user);
if (err == 0 && system_supports_poe() && user.poe)
err = restore_poe_context(&user);
err = restore_poe_context(&user, ua_state);
return err;
}
@ -1059,6 +1115,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
{
struct pt_regs *regs = current_pt_regs();
struct rt_sigframe __user *frame;
struct user_access_state ua_state;
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
@ -1075,7 +1132,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
if (!access_ok(frame, sizeof (*frame)))
goto badframe;
if (restore_sigframe(regs, frame))
if (restore_sigframe(regs, frame, &ua_state))
goto badframe;
if (gcs_restore_signal())
@ -1084,6 +1141,8 @@ SYSCALL_DEFINE0(rt_sigreturn)
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
restore_user_access_state(&ua_state);
return regs->regs[0];
badframe:
@ -1198,7 +1257,8 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
}
static int setup_sigframe(struct rt_sigframe_user_layout *user,
struct pt_regs *regs, sigset_t *set)
struct pt_regs *regs, sigset_t *set,
const struct user_access_state *ua_state)
{
int i, err = 0;
struct rt_sigframe __user *sf = user->sigframe;
@ -1262,14 +1322,13 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
err |= preserve_fpmr_context(fpmr_ctx);
}
if (system_supports_poe() && err == 0 && user->poe_offset) {
if (system_supports_poe() && err == 0) {
struct poe_context __user *poe_ctx =
apply_user_offset(user, user->poe_offset);
err |= preserve_poe_context(poe_ctx);
err |= preserve_poe_context(poe_ctx, ua_state);
}
/* ZA state if present */
if (system_supports_sme() && err == 0 && user->za_offset) {
struct za_context __user *za_ctx =
@ -1447,9 +1506,6 @@ static int setup_return(struct pt_regs *regs, struct ksignal *ksig,
sme_smstop();
}
if (system_supports_poe())
write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
if (ksig->ka.sa.sa_flags & SA_RESTORER)
sigtramp = ksig->ka.sa.sa_restorer;
else
@ -1465,6 +1521,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
{
struct rt_sigframe_user_layout user;
struct rt_sigframe __user *frame;
struct user_access_state ua_state;
int err = 0;
fpsimd_signal_preserve_current_state();
@ -1472,13 +1529,14 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
if (get_sigframe(&user, ksig, regs))
return 1;
save_reset_user_access_state(&ua_state);
frame = user.sigframe;
__put_user_error(0, &frame->uc.uc_flags, err);
__put_user_error(NULL, &frame->uc.uc_link, err);
err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
err |= setup_sigframe(&user, regs, set);
err |= setup_sigframe(&user, regs, set, &ua_state);
if (err == 0) {
err = setup_return(regs, ksig, &user, usig);
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
@ -1488,6 +1546,11 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
}
}
if (err == 0)
set_handler_user_access_state();
else
restore_user_access_state(&ua_state);
return err;
}

View File

@ -763,7 +763,7 @@ static int sdei_device_freeze(struct device *dev)
int err;
/* unregister private events */
cpuhp_remove_state(sdei_entry_point);
cpuhp_remove_state(sdei_hp_state);
err = sdei_unregister_shared();
if (err)

View File

@ -80,7 +80,11 @@
#define __noscs __attribute__((__no_sanitize__("shadow-call-stack")))
#endif
#ifdef __SANITIZE_HWADDRESS__
#define __no_sanitize_address __attribute__((__no_sanitize__("hwaddress")))
#else
#define __no_sanitize_address __attribute__((__no_sanitize_address__))
#endif
#if defined(__SANITIZE_THREAD__)
#define __no_sanitize_thread __attribute__((__no_sanitize_thread__))

View File

@ -105,12 +105,12 @@ endif
ifeq ($(CAN_BUILD_X86_64),1)
TEST_GEN_FILES += $(BINARIES_64)
endif
else
ifneq (,$(filter $(ARCH),arm64 powerpc))
else ifeq ($(ARCH),arm64)
TEST_GEN_FILES += protection_keys
TEST_GEN_FILES += pkey_sighandler_tests
else ifeq ($(ARCH),powerpc)
TEST_GEN_FILES += protection_keys
endif
endif
ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390))

View File

@ -31,6 +31,7 @@
#define NR_RESERVED_PKEYS 1 /* pkey-0 */
#define PKEY_ALLOW_ALL 0x77777777
#define PKEY_REG_ALLOW_NONE 0x0
#define PKEY_BITS_PER_PKEY 4
#define PAGE_SIZE sysconf(_SC_PAGESIZE)
@ -126,7 +127,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey)
return 0;
}
static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey)
static inline void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey)
{
struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt);
struct poe_context *poe_ctx =

View File

@ -112,6 +112,13 @@ void record_pkey_malloc(void *ptr, long size, int prot);
#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
#endif
/*
* FIXME: Remove once the generic PKEY_UNRESTRICTED definition is merged.
*/
#ifndef PKEY_UNRESTRICTED
#define PKEY_UNRESTRICTED 0x0
#endif
#ifndef set_pkey_bits
static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
{

View File

@ -34,6 +34,8 @@
#define PAGE_SIZE 4096
#define MB (1<<20)
#define PKEY_REG_ALLOW_NONE 0x55555555
static inline void __page_o_noops(void)
{
/* 8-bytes of instruction * 512 bytes = 1 page */

View File

@ -11,6 +11,7 @@
*/
#define _GNU_SOURCE
#define __SANE_USERSPACE_TYPES__
#include <linux/mman.h>
#include <errno.h>
#include <sys/syscall.h>
#include <string.h>
@ -59,12 +60,58 @@ long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6)
: "=a"(ret)
: "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5)
: "memory");
#elif defined __aarch64__
register long x0 asm("x0") = a1;
register long x1 asm("x1") = a2;
register long x2 asm("x2") = a3;
register long x3 asm("x3") = a4;
register long x4 asm("x4") = a5;
register long x5 asm("x5") = a6;
register long x8 asm("x8") = n;
asm volatile ("svc #0"
: "=r"(x0)
: "r"(x0), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(x8)
: "memory");
ret = x0;
#else
# error syscall_raw() not implemented
#endif
return ret;
}
static inline long clone_raw(unsigned long flags, void *stack,
int *parent_tid, int *child_tid)
{
long a1 = flags;
long a2 = (long)stack;
long a3 = (long)parent_tid;
#if defined(__x86_64__) || defined(__i386)
long a4 = (long)child_tid;
long a5 = 0;
#elif defined(__aarch64__)
long a4 = 0;
long a5 = (long)child_tid;
#else
# error clone_raw() not implemented
#endif
return syscall_raw(SYS_clone, a1, a2, a3, a4, a5, 0);
}
/*
* Returns the most restrictive pkey register value that can be used by the
* tests.
*/
static inline u64 pkey_reg_restrictive_default(void)
{
/*
* Disallow everything except execution on pkey 0, so that each caller
* doesn't need to enable it explicitly (the selftest code runs with
* its code mapped with pkey 0).
*/
return set_pkey_bits(PKEY_REG_ALLOW_NONE, 0, PKEY_DISABLE_ACCESS);
}
static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext)
{
pthread_mutex_lock(&mutex);
@ -113,7 +160,7 @@ static void raise_sigusr2(void)
static void *thread_segv_with_pkey0_disabled(void *ptr)
{
/* Disable MPK 0 (and all others too) */
__write_pkey_reg(0x55555555);
__write_pkey_reg(pkey_reg_restrictive_default());
/* Segfault (with SEGV_MAPERR) */
*(int *) (0x1) = 1;
@ -123,7 +170,7 @@ static void *thread_segv_with_pkey0_disabled(void *ptr)
static void *thread_segv_pkuerr_stack(void *ptr)
{
/* Disable MPK 0 (and all others too) */
__write_pkey_reg(0x55555555);
__write_pkey_reg(pkey_reg_restrictive_default());
/* After we disable MPK 0, we can't access the stack to return */
return NULL;
@ -133,6 +180,7 @@ static void *thread_segv_maperr_ptr(void *ptr)
{
stack_t *stack = ptr;
int *bad = (int *)1;
u64 pkey_reg;
/*
* Setup alternate signal stack, which should be pkey_mprotect()ed by
@ -142,7 +190,9 @@ static void *thread_segv_maperr_ptr(void *ptr)
syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
/* Disable MPK 0. Only MPK 1 is enabled. */
__write_pkey_reg(0x55555551);
pkey_reg = pkey_reg_restrictive_default();
pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED);
__write_pkey_reg(pkey_reg);
/* Segfault */
*bad = 1;
@ -240,6 +290,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
int pkey;
int parent_pid = 0;
int child_pid = 0;
u64 pkey_reg;
sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
@ -257,7 +308,10 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
assert(stack != MAP_FAILED);
/* Allow access to MPK 0 and MPK 1 */
__write_pkey_reg(0x55555550);
pkey_reg = pkey_reg_restrictive_default();
pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED);
__write_pkey_reg(pkey_reg);
/* Protect the new stack with MPK 1 */
pkey = pkey_alloc(0, 0);
@ -272,14 +326,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
memset(&siginfo, 0, sizeof(siginfo));
/* Use clone to avoid newer glibcs using rseq on new threads */
long ret = syscall_raw(SYS_clone,
CLONE_VM | CLONE_FS | CLONE_FILES |
CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
CLONE_DETACHED,
(long) ((char *)(stack) + STACK_SIZE),
(long) &parent_pid,
(long) &child_pid, 0, 0);
long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES |
CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
CLONE_DETACHED,
stack + STACK_SIZE,
&parent_pid,
&child_pid);
if (ret < 0) {
errno = -ret;
@ -307,7 +360,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
static void test_pkru_preserved_after_sigusr1(void)
{
struct sigaction sa;
unsigned long pkru = 0x45454544;
u64 pkey_reg;
/* Allow access to MPK 0 and an arbitrary set of keys */
pkey_reg = pkey_reg_restrictive_default();
pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
pkey_reg = set_pkey_bits(pkey_reg, 3, PKEY_UNRESTRICTED);
pkey_reg = set_pkey_bits(pkey_reg, 7, PKEY_UNRESTRICTED);
sa.sa_flags = SA_SIGINFO;
@ -320,7 +379,7 @@ static void test_pkru_preserved_after_sigusr1(void)
memset(&siginfo, 0, sizeof(siginfo));
__write_pkey_reg(pkru);
__write_pkey_reg(pkey_reg);
raise(SIGUSR1);
@ -330,7 +389,7 @@ static void test_pkru_preserved_after_sigusr1(void)
pthread_mutex_unlock(&mutex);
/* Ensure the pkru value is the same after returning from signal. */
ksft_test_result(pkru == __read_pkey_reg() &&
ksft_test_result(pkey_reg == __read_pkey_reg() &&
siginfo.si_signo == SIGUSR1,
"%s\n", __func__);
}
@ -347,6 +406,7 @@ static noinline void *thread_sigusr2_self(void *ptr)
'S', 'I', 'G', 'U', 'S', 'R', '2',
'.', '.', '.', '\n', '\0'};
stack_t *stack = ptr;
u64 pkey_reg;
/*
* Setup alternate signal stack, which should be pkey_mprotect()ed by
@ -356,7 +416,9 @@ static noinline void *thread_sigusr2_self(void *ptr)
syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
/* Disable MPK 0. Only MPK 2 is enabled. */
__write_pkey_reg(0x55555545);
pkey_reg = pkey_reg_restrictive_default();
pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED);
__write_pkey_reg(pkey_reg);
raise_sigusr2();
@ -384,6 +446,7 @@ static void test_pkru_sigreturn(void)
int pkey;
int parent_pid = 0;
int child_pid = 0;
u64 pkey_reg;
sa.sa_handler = SIG_DFL;
sa.sa_flags = 0;
@ -418,7 +481,10 @@ static void test_pkru_sigreturn(void)
* the current thread's stack is protected by the default MPK 0. Hence
* both need to be enabled.
*/
__write_pkey_reg(0x55555544);
pkey_reg = pkey_reg_restrictive_default();
pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED);
__write_pkey_reg(pkey_reg);
/* Protect the stack with MPK 2 */
pkey = pkey_alloc(0, 0);
@ -431,14 +497,13 @@ static void test_pkru_sigreturn(void)
sigstack.ss_size = STACK_SIZE;
/* Use clone to avoid newer glibcs using rseq on new threads */
long ret = syscall_raw(SYS_clone,
CLONE_VM | CLONE_FS | CLONE_FILES |
CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
CLONE_DETACHED,
(long) ((char *)(stack) + STACK_SIZE),
(long) &parent_pid,
(long) &child_pid, 0, 0);
long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES |
CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
CLONE_DETACHED,
stack + STACK_SIZE,
&parent_pid,
&child_pid);
if (ret < 0) {
errno = -ret;