linux/arch/arm64/kernel/entry.S

1098 lines
26 KiB
ArmAsm
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Low-level exception handling code
*
* Copyright (C) 2012 ARM Ltd.
* Authors: Catalin Marinas <catalin.marinas@arm.com>
* Will Deacon <will.deacon@arm.com>
*/
#include <linux/arm-smccc.h>
#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/assembler.h>
#include <asm/asm-offsets.h>
#include <asm/asm_pointer_auth.h>
#include <asm/cpufeature.h>
#include <asm/errno.h>
#include <asm/esr.h>
#include <asm/irq.h>
#include <asm/memory.h>
#include <asm/mmu.h>
#include <asm/processor.h>
#include <asm/ptrace.h>
#include <asm/thread_info.h>
#include <asm/asm-uaccess.h>
#include <asm/unistd.h>
/*
* Context tracking subsystem. Used to instrument transitions
* between user and kernel mode.
*/
arm64: entry: Move ct_user_exit before any other exception When taking an SError or Debug exception from EL0, we run the C handler for these exceptions before updating the context tracking code and unmasking lower priority interrupts. When booting with nohz_full lockdep tells us we got this wrong: | ============================= | WARNING: suspicious RCU usage | 5.3.0-rc2-00010-gb4b5e9dcb11b-dirty #11271 Not tainted | ----------------------------- | include/linux/rcupdate.h:643 rcu_read_unlock() used illegally wh! | | other info that might help us debug this: | | | RCU used illegally from idle CPU! | rcu_scheduler_active = 2, debug_locks = 1 | RCU used illegally from extended quiescent state! | 1 lock held by a.out/432: | #0: 00000000c7a79515 (rcu_read_lock){....}, at: brk_handler+0x00 | | stack backtrace: | CPU: 1 PID: 432 Comm: a.out Not tainted 5.3.0-rc2-00010-gb4b5e9d1 | Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno De8 | Call trace: | dump_backtrace+0x0/0x140 | show_stack+0x14/0x20 | dump_stack+0xbc/0x104 | lockdep_rcu_suspicious+0xf8/0x108 | brk_handler+0x164/0x1b0 | do_debug_exception+0x11c/0x278 | el0_dbg+0x14/0x20 Moving the ct_user_exit calls to be before do_debug_exception() means they are also before trace_hardirqs_off() has been updated. Add a new ct_user_exit_irqoff macro to avoid the context-tracking code using irqsave/restore before we've updated trace_hardirqs_off(). To be consistent, do this everywhere. The C helper is called enter_from_user_mode() to match x86 in the hope we can merge them into kernel/context_tracking.c later. Cc: Masami Hiramatsu <mhiramat@kernel.org> Fixes: 6c81fe7925cc4c42 ("arm64: enable context tracking") Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will@kernel.org>
2019-08-20 17:45:57 +00:00
.macro ct_user_exit_irqoff
#ifdef CONFIG_CONTEXT_TRACKING
arm64: entry: Move ct_user_exit before any other exception When taking an SError or Debug exception from EL0, we run the C handler for these exceptions before updating the context tracking code and unmasking lower priority interrupts. When booting with nohz_full lockdep tells us we got this wrong: | ============================= | WARNING: suspicious RCU usage | 5.3.0-rc2-00010-gb4b5e9dcb11b-dirty #11271 Not tainted | ----------------------------- | include/linux/rcupdate.h:643 rcu_read_unlock() used illegally wh! | | other info that might help us debug this: | | | RCU used illegally from idle CPU! | rcu_scheduler_active = 2, debug_locks = 1 | RCU used illegally from extended quiescent state! | 1 lock held by a.out/432: | #0: 00000000c7a79515 (rcu_read_lock){....}, at: brk_handler+0x00 | | stack backtrace: | CPU: 1 PID: 432 Comm: a.out Not tainted 5.3.0-rc2-00010-gb4b5e9d1 | Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno De8 | Call trace: | dump_backtrace+0x0/0x140 | show_stack+0x14/0x20 | dump_stack+0xbc/0x104 | lockdep_rcu_suspicious+0xf8/0x108 | brk_handler+0x164/0x1b0 | do_debug_exception+0x11c/0x278 | el0_dbg+0x14/0x20 Moving the ct_user_exit calls to be before do_debug_exception() means they are also before trace_hardirqs_off() has been updated. Add a new ct_user_exit_irqoff macro to avoid the context-tracking code using irqsave/restore before we've updated trace_hardirqs_off(). To be consistent, do this everywhere. The C helper is called enter_from_user_mode() to match x86 in the hope we can merge them into kernel/context_tracking.c later. Cc: Masami Hiramatsu <mhiramat@kernel.org> Fixes: 6c81fe7925cc4c42 ("arm64: enable context tracking") Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will@kernel.org>
2019-08-20 17:45:57 +00:00
bl enter_from_user_mode
#endif
.endm
.macro ct_user_enter
#ifdef CONFIG_CONTEXT_TRACKING
bl context_tracking_user_enter
#endif
.endm
.macro clear_gp_regs
.irp n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
mov x\n, xzr
.endr
.endm
/*
* Bad Abort numbers
*-----------------
*/
#define BAD_SYNC 0
#define BAD_IRQ 1
#define BAD_FIQ 2
#define BAD_ERROR 3
.macro kernel_ventry, el, label, regsize = 64
.align 7
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
.if \el == 0
alternative_if ARM64_UNMAP_KERNEL_AT_EL0
.if \regsize == 64
mrs x30, tpidrro_el0
msr tpidrro_el0, xzr
.else
mov x30, xzr
.endif
alternative_else_nop_endif
.endif
#endif
sub sp, sp, #S_FRAME_SIZE
arm64: add VMAP_STACK overflow detection This patch adds stack overflow detection to arm64, usable when vmap'd stacks are in use. Overflow is detected in a small preamble executed for each exception entry, which checks whether there is enough space on the current stack for the general purpose registers to be saved. If there is not enough space, the overflow handler is invoked on a per-cpu overflow stack. This approach preserves the original exception information in ESR_EL1 (and where appropriate, FAR_EL1). Task and IRQ stacks are aligned to double their size, enabling overflow to be detected with a single bit test. For example, a 16K stack is aligned to 32K, ensuring that bit 14 of the SP must be zero. On an overflow (or underflow), this bit is flipped. Thus, overflow (of less than the size of the stack) can be detected by testing whether this bit is set. The overflow check is performed before any attempt is made to access the stack, avoiding recursive faults (and the loss of exception information these would entail). As logical operations cannot be performed on the SP directly, the SP is temporarily swapped with a general purpose register using arithmetic operations to enable the test to be performed. This gives us a useful error message on stack overflow, as can be trigger with the LKDTM overflow test: [ 305.388749] lkdtm: Performing direct entry OVERFLOW [ 305.395444] Insufficient stack space to handle exception! [ 305.395482] ESR: 0x96000047 -- DABT (current EL) [ 305.399890] FAR: 0xffff00000a5e7f30 [ 305.401315] Task stack: [0xffff00000a5e8000..0xffff00000a5ec000] [ 305.403815] IRQ stack: [0xffff000008000000..0xffff000008004000] [ 305.407035] Overflow stack: [0xffff80003efce4e0..0xffff80003efcf4e0] [ 305.409622] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.412785] Hardware name: linux,dummy-virt (DT) [ 305.415756] task: ffff80003d051c00 task.stack: ffff00000a5e8000 [ 305.419221] PC is at recursive_loop+0x10/0x48 [ 305.421637] LR is at recursive_loop+0x38/0x48 [ 305.423768] pc : [<ffff00000859f330>] lr : [<ffff00000859f358>] pstate: 40000145 [ 305.428020] sp : ffff00000a5e7f50 [ 305.430469] x29: ffff00000a5e8350 x28: ffff80003d051c00 [ 305.433191] x27: ffff000008981000 x26: ffff000008f80400 [ 305.439012] x25: ffff00000a5ebeb8 x24: ffff00000a5ebeb8 [ 305.440369] x23: ffff000008f80138 x22: 0000000000000009 [ 305.442241] x21: ffff80003ce65000 x20: ffff000008f80188 [ 305.444552] x19: 0000000000000013 x18: 0000000000000006 [ 305.446032] x17: 0000ffffa2601280 x16: ffff0000081fe0b8 [ 305.448252] x15: ffff000008ff546d x14: 000000000047a4c8 [ 305.450246] x13: ffff000008ff7872 x12: 0000000005f5e0ff [ 305.452953] x11: ffff000008ed2548 x10: 000000000005ee8d [ 305.454824] x9 : ffff000008545380 x8 : ffff00000a5e8770 [ 305.457105] x7 : 1313131313131313 x6 : 00000000000000e1 [ 305.459285] x5 : 0000000000000000 x4 : 0000000000000000 [ 305.461781] x3 : 0000000000000000 x2 : 0000000000000400 [ 305.465119] x1 : 0000000000000013 x0 : 0000000000000012 [ 305.467724] Kernel panic - not syncing: kernel stack overflow [ 305.470561] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.473325] Hardware name: linux,dummy-virt (DT) [ 305.475070] Call trace: [ 305.476116] [<ffff000008088ad8>] dump_backtrace+0x0/0x378 [ 305.478991] [<ffff000008088e64>] show_stack+0x14/0x20 [ 305.481237] [<ffff00000895a178>] dump_stack+0x98/0xb8 [ 305.483294] [<ffff0000080c3288>] panic+0x118/0x280 [ 305.485673] [<ffff0000080c2e9c>] nmi_panic+0x6c/0x70 [ 305.486216] [<ffff000008089710>] handle_bad_stack+0x118/0x128 [ 305.486612] Exception stack(0xffff80003efcf3a0 to 0xffff80003efcf4e0) [ 305.487334] f3a0: 0000000000000012 0000000000000013 0000000000000400 0000000000000000 [ 305.488025] f3c0: 0000000000000000 0000000000000000 00000000000000e1 1313131313131313 [ 305.488908] f3e0: ffff00000a5e8770 ffff000008545380 000000000005ee8d ffff000008ed2548 [ 305.489403] f400: 0000000005f5e0ff ffff000008ff7872 000000000047a4c8 ffff000008ff546d [ 305.489759] f420: ffff0000081fe0b8 0000ffffa2601280 0000000000000006 0000000000000013 [ 305.490256] f440: ffff000008f80188 ffff80003ce65000 0000000000000009 ffff000008f80138 [ 305.490683] f460: ffff00000a5ebeb8 ffff00000a5ebeb8 ffff000008f80400 ffff000008981000 [ 305.491051] f480: ffff80003d051c00 ffff00000a5e8350 ffff00000859f358 ffff00000a5e7f50 [ 305.491444] f4a0: ffff00000859f330 0000000040000145 0000000000000000 0000000000000000 [ 305.492008] f4c0: 0001000000000000 0000000000000000 ffff00000a5e8350 ffff00000859f330 [ 305.493063] [<ffff00000808205c>] __bad_stack+0x88/0x8c [ 305.493396] [<ffff00000859f330>] recursive_loop+0x10/0x48 [ 305.493731] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494088] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494425] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494649] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494898] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495205] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495453] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495708] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496000] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496302] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496644] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496894] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.497138] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.497325] [<ffff00000859f3dc>] lkdtm_OVERFLOW+0x14/0x20 [ 305.497506] [<ffff00000859f314>] lkdtm_do_action+0x1c/0x28 [ 305.497786] [<ffff00000859f178>] direct_entry+0xe0/0x170 [ 305.498095] [<ffff000008345568>] full_proxy_write+0x60/0xa8 [ 305.498387] [<ffff0000081fb7f4>] __vfs_write+0x1c/0x128 [ 305.498679] [<ffff0000081fcc68>] vfs_write+0xa0/0x1b0 [ 305.498926] [<ffff0000081fe0fc>] SyS_write+0x44/0xa0 [ 305.499182] Exception stack(0xffff00000a5ebec0 to 0xffff00000a5ec000) [ 305.499429] bec0: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.499674] bee0: 574f4c465245564f 0000000000000000 0000000000000000 8000000080808080 [ 305.499904] bf00: 0000000000000040 0000000000000038 fefefeff1b4bc2ff 7f7f7f7f7f7fff7f [ 305.500189] bf20: 0101010101010101 0000000000000000 000000000047a4c8 0000000000000038 [ 305.500712] bf40: 0000000000000000 0000ffffa2601280 0000ffffc63f6068 00000000004b5000 [ 305.501241] bf60: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.501791] bf80: 0000000000000020 0000000000000000 00000000004b5000 000000001c4cc458 [ 305.502314] bfa0: 0000000000000000 0000ffffc63f7950 000000000040a3c4 0000ffffc63f70e0 [ 305.502762] bfc0: 0000ffffa2601268 0000000080000000 0000000000000001 0000000000000040 [ 305.503207] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 305.503680] [<ffff000008082fb0>] el0_svc_naked+0x24/0x28 [ 305.504720] Kernel Offset: disabled [ 305.505189] CPU features: 0x002082 [ 305.505473] Memory Limit: none [ 305.506181] ---[ end Kernel panic - not syncing: kernel stack overflow This patch was co-authored by Ard Biesheuvel and Mark Rutland. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Mark Rutland <mark.rutland@arm.com> Reviewed-by: Will Deacon <will.deacon@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com>
2017-07-14 19:30:35 +00:00
#ifdef CONFIG_VMAP_STACK
/*
* Test whether the SP has overflowed, without corrupting a GPR.
* Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT)
* should always be zero.
arm64: add VMAP_STACK overflow detection This patch adds stack overflow detection to arm64, usable when vmap'd stacks are in use. Overflow is detected in a small preamble executed for each exception entry, which checks whether there is enough space on the current stack for the general purpose registers to be saved. If there is not enough space, the overflow handler is invoked on a per-cpu overflow stack. This approach preserves the original exception information in ESR_EL1 (and where appropriate, FAR_EL1). Task and IRQ stacks are aligned to double their size, enabling overflow to be detected with a single bit test. For example, a 16K stack is aligned to 32K, ensuring that bit 14 of the SP must be zero. On an overflow (or underflow), this bit is flipped. Thus, overflow (of less than the size of the stack) can be detected by testing whether this bit is set. The overflow check is performed before any attempt is made to access the stack, avoiding recursive faults (and the loss of exception information these would entail). As logical operations cannot be performed on the SP directly, the SP is temporarily swapped with a general purpose register using arithmetic operations to enable the test to be performed. This gives us a useful error message on stack overflow, as can be trigger with the LKDTM overflow test: [ 305.388749] lkdtm: Performing direct entry OVERFLOW [ 305.395444] Insufficient stack space to handle exception! [ 305.395482] ESR: 0x96000047 -- DABT (current EL) [ 305.399890] FAR: 0xffff00000a5e7f30 [ 305.401315] Task stack: [0xffff00000a5e8000..0xffff00000a5ec000] [ 305.403815] IRQ stack: [0xffff000008000000..0xffff000008004000] [ 305.407035] Overflow stack: [0xffff80003efce4e0..0xffff80003efcf4e0] [ 305.409622] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.412785] Hardware name: linux,dummy-virt (DT) [ 305.415756] task: ffff80003d051c00 task.stack: ffff00000a5e8000 [ 305.419221] PC is at recursive_loop+0x10/0x48 [ 305.421637] LR is at recursive_loop+0x38/0x48 [ 305.423768] pc : [<ffff00000859f330>] lr : [<ffff00000859f358>] pstate: 40000145 [ 305.428020] sp : ffff00000a5e7f50 [ 305.430469] x29: ffff00000a5e8350 x28: ffff80003d051c00 [ 305.433191] x27: ffff000008981000 x26: ffff000008f80400 [ 305.439012] x25: ffff00000a5ebeb8 x24: ffff00000a5ebeb8 [ 305.440369] x23: ffff000008f80138 x22: 0000000000000009 [ 305.442241] x21: ffff80003ce65000 x20: ffff000008f80188 [ 305.444552] x19: 0000000000000013 x18: 0000000000000006 [ 305.446032] x17: 0000ffffa2601280 x16: ffff0000081fe0b8 [ 305.448252] x15: ffff000008ff546d x14: 000000000047a4c8 [ 305.450246] x13: ffff000008ff7872 x12: 0000000005f5e0ff [ 305.452953] x11: ffff000008ed2548 x10: 000000000005ee8d [ 305.454824] x9 : ffff000008545380 x8 : ffff00000a5e8770 [ 305.457105] x7 : 1313131313131313 x6 : 00000000000000e1 [ 305.459285] x5 : 0000000000000000 x4 : 0000000000000000 [ 305.461781] x3 : 0000000000000000 x2 : 0000000000000400 [ 305.465119] x1 : 0000000000000013 x0 : 0000000000000012 [ 305.467724] Kernel panic - not syncing: kernel stack overflow [ 305.470561] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.473325] Hardware name: linux,dummy-virt (DT) [ 305.475070] Call trace: [ 305.476116] [<ffff000008088ad8>] dump_backtrace+0x0/0x378 [ 305.478991] [<ffff000008088e64>] show_stack+0x14/0x20 [ 305.481237] [<ffff00000895a178>] dump_stack+0x98/0xb8 [ 305.483294] [<ffff0000080c3288>] panic+0x118/0x280 [ 305.485673] [<ffff0000080c2e9c>] nmi_panic+0x6c/0x70 [ 305.486216] [<ffff000008089710>] handle_bad_stack+0x118/0x128 [ 305.486612] Exception stack(0xffff80003efcf3a0 to 0xffff80003efcf4e0) [ 305.487334] f3a0: 0000000000000012 0000000000000013 0000000000000400 0000000000000000 [ 305.488025] f3c0: 0000000000000000 0000000000000000 00000000000000e1 1313131313131313 [ 305.488908] f3e0: ffff00000a5e8770 ffff000008545380 000000000005ee8d ffff000008ed2548 [ 305.489403] f400: 0000000005f5e0ff ffff000008ff7872 000000000047a4c8 ffff000008ff546d [ 305.489759] f420: ffff0000081fe0b8 0000ffffa2601280 0000000000000006 0000000000000013 [ 305.490256] f440: ffff000008f80188 ffff80003ce65000 0000000000000009 ffff000008f80138 [ 305.490683] f460: ffff00000a5ebeb8 ffff00000a5ebeb8 ffff000008f80400 ffff000008981000 [ 305.491051] f480: ffff80003d051c00 ffff00000a5e8350 ffff00000859f358 ffff00000a5e7f50 [ 305.491444] f4a0: ffff00000859f330 0000000040000145 0000000000000000 0000000000000000 [ 305.492008] f4c0: 0001000000000000 0000000000000000 ffff00000a5e8350 ffff00000859f330 [ 305.493063] [<ffff00000808205c>] __bad_stack+0x88/0x8c [ 305.493396] [<ffff00000859f330>] recursive_loop+0x10/0x48 [ 305.493731] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494088] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494425] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494649] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494898] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495205] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495453] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495708] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496000] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496302] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496644] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496894] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.497138] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.497325] [<ffff00000859f3dc>] lkdtm_OVERFLOW+0x14/0x20 [ 305.497506] [<ffff00000859f314>] lkdtm_do_action+0x1c/0x28 [ 305.497786] [<ffff00000859f178>] direct_entry+0xe0/0x170 [ 305.498095] [<ffff000008345568>] full_proxy_write+0x60/0xa8 [ 305.498387] [<ffff0000081fb7f4>] __vfs_write+0x1c/0x128 [ 305.498679] [<ffff0000081fcc68>] vfs_write+0xa0/0x1b0 [ 305.498926] [<ffff0000081fe0fc>] SyS_write+0x44/0xa0 [ 305.499182] Exception stack(0xffff00000a5ebec0 to 0xffff00000a5ec000) [ 305.499429] bec0: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.499674] bee0: 574f4c465245564f 0000000000000000 0000000000000000 8000000080808080 [ 305.499904] bf00: 0000000000000040 0000000000000038 fefefeff1b4bc2ff 7f7f7f7f7f7fff7f [ 305.500189] bf20: 0101010101010101 0000000000000000 000000000047a4c8 0000000000000038 [ 305.500712] bf40: 0000000000000000 0000ffffa2601280 0000ffffc63f6068 00000000004b5000 [ 305.501241] bf60: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.501791] bf80: 0000000000000020 0000000000000000 00000000004b5000 000000001c4cc458 [ 305.502314] bfa0: 0000000000000000 0000ffffc63f7950 000000000040a3c4 0000ffffc63f70e0 [ 305.502762] bfc0: 0000ffffa2601268 0000000080000000 0000000000000001 0000000000000040 [ 305.503207] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 305.503680] [<ffff000008082fb0>] el0_svc_naked+0x24/0x28 [ 305.504720] Kernel Offset: disabled [ 305.505189] CPU features: 0x002082 [ 305.505473] Memory Limit: none [ 305.506181] ---[ end Kernel panic - not syncing: kernel stack overflow This patch was co-authored by Ard Biesheuvel and Mark Rutland. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Mark Rutland <mark.rutland@arm.com> Reviewed-by: Will Deacon <will.deacon@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com>
2017-07-14 19:30:35 +00:00
*/
add sp, sp, x0 // sp' = sp + x0
sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
tbnz x0, #THREAD_SHIFT, 0f
sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
b el\()\el\()_\label
arm64: add VMAP_STACK overflow detection This patch adds stack overflow detection to arm64, usable when vmap'd stacks are in use. Overflow is detected in a small preamble executed for each exception entry, which checks whether there is enough space on the current stack for the general purpose registers to be saved. If there is not enough space, the overflow handler is invoked on a per-cpu overflow stack. This approach preserves the original exception information in ESR_EL1 (and where appropriate, FAR_EL1). Task and IRQ stacks are aligned to double their size, enabling overflow to be detected with a single bit test. For example, a 16K stack is aligned to 32K, ensuring that bit 14 of the SP must be zero. On an overflow (or underflow), this bit is flipped. Thus, overflow (of less than the size of the stack) can be detected by testing whether this bit is set. The overflow check is performed before any attempt is made to access the stack, avoiding recursive faults (and the loss of exception information these would entail). As logical operations cannot be performed on the SP directly, the SP is temporarily swapped with a general purpose register using arithmetic operations to enable the test to be performed. This gives us a useful error message on stack overflow, as can be trigger with the LKDTM overflow test: [ 305.388749] lkdtm: Performing direct entry OVERFLOW [ 305.395444] Insufficient stack space to handle exception! [ 305.395482] ESR: 0x96000047 -- DABT (current EL) [ 305.399890] FAR: 0xffff00000a5e7f30 [ 305.401315] Task stack: [0xffff00000a5e8000..0xffff00000a5ec000] [ 305.403815] IRQ stack: [0xffff000008000000..0xffff000008004000] [ 305.407035] Overflow stack: [0xffff80003efce4e0..0xffff80003efcf4e0] [ 305.409622] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.412785] Hardware name: linux,dummy-virt (DT) [ 305.415756] task: ffff80003d051c00 task.stack: ffff00000a5e8000 [ 305.419221] PC is at recursive_loop+0x10/0x48 [ 305.421637] LR is at recursive_loop+0x38/0x48 [ 305.423768] pc : [<ffff00000859f330>] lr : [<ffff00000859f358>] pstate: 40000145 [ 305.428020] sp : ffff00000a5e7f50 [ 305.430469] x29: ffff00000a5e8350 x28: ffff80003d051c00 [ 305.433191] x27: ffff000008981000 x26: ffff000008f80400 [ 305.439012] x25: ffff00000a5ebeb8 x24: ffff00000a5ebeb8 [ 305.440369] x23: ffff000008f80138 x22: 0000000000000009 [ 305.442241] x21: ffff80003ce65000 x20: ffff000008f80188 [ 305.444552] x19: 0000000000000013 x18: 0000000000000006 [ 305.446032] x17: 0000ffffa2601280 x16: ffff0000081fe0b8 [ 305.448252] x15: ffff000008ff546d x14: 000000000047a4c8 [ 305.450246] x13: ffff000008ff7872 x12: 0000000005f5e0ff [ 305.452953] x11: ffff000008ed2548 x10: 000000000005ee8d [ 305.454824] x9 : ffff000008545380 x8 : ffff00000a5e8770 [ 305.457105] x7 : 1313131313131313 x6 : 00000000000000e1 [ 305.459285] x5 : 0000000000000000 x4 : 0000000000000000 [ 305.461781] x3 : 0000000000000000 x2 : 0000000000000400 [ 305.465119] x1 : 0000000000000013 x0 : 0000000000000012 [ 305.467724] Kernel panic - not syncing: kernel stack overflow [ 305.470561] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.473325] Hardware name: linux,dummy-virt (DT) [ 305.475070] Call trace: [ 305.476116] [<ffff000008088ad8>] dump_backtrace+0x0/0x378 [ 305.478991] [<ffff000008088e64>] show_stack+0x14/0x20 [ 305.481237] [<ffff00000895a178>] dump_stack+0x98/0xb8 [ 305.483294] [<ffff0000080c3288>] panic+0x118/0x280 [ 305.485673] [<ffff0000080c2e9c>] nmi_panic+0x6c/0x70 [ 305.486216] [<ffff000008089710>] handle_bad_stack+0x118/0x128 [ 305.486612] Exception stack(0xffff80003efcf3a0 to 0xffff80003efcf4e0) [ 305.487334] f3a0: 0000000000000012 0000000000000013 0000000000000400 0000000000000000 [ 305.488025] f3c0: 0000000000000000 0000000000000000 00000000000000e1 1313131313131313 [ 305.488908] f3e0: ffff00000a5e8770 ffff000008545380 000000000005ee8d ffff000008ed2548 [ 305.489403] f400: 0000000005f5e0ff ffff000008ff7872 000000000047a4c8 ffff000008ff546d [ 305.489759] f420: ffff0000081fe0b8 0000ffffa2601280 0000000000000006 0000000000000013 [ 305.490256] f440: ffff000008f80188 ffff80003ce65000 0000000000000009 ffff000008f80138 [ 305.490683] f460: ffff00000a5ebeb8 ffff00000a5ebeb8 ffff000008f80400 ffff000008981000 [ 305.491051] f480: ffff80003d051c00 ffff00000a5e8350 ffff00000859f358 ffff00000a5e7f50 [ 305.491444] f4a0: ffff00000859f330 0000000040000145 0000000000000000 0000000000000000 [ 305.492008] f4c0: 0001000000000000 0000000000000000 ffff00000a5e8350 ffff00000859f330 [ 305.493063] [<ffff00000808205c>] __bad_stack+0x88/0x8c [ 305.493396] [<ffff00000859f330>] recursive_loop+0x10/0x48 [ 305.493731] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494088] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494425] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494649] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494898] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495205] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495453] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495708] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496000] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496302] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496644] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496894] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.497138] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.497325] [<ffff00000859f3dc>] lkdtm_OVERFLOW+0x14/0x20 [ 305.497506] [<ffff00000859f314>] lkdtm_do_action+0x1c/0x28 [ 305.497786] [<ffff00000859f178>] direct_entry+0xe0/0x170 [ 305.498095] [<ffff000008345568>] full_proxy_write+0x60/0xa8 [ 305.498387] [<ffff0000081fb7f4>] __vfs_write+0x1c/0x128 [ 305.498679] [<ffff0000081fcc68>] vfs_write+0xa0/0x1b0 [ 305.498926] [<ffff0000081fe0fc>] SyS_write+0x44/0xa0 [ 305.499182] Exception stack(0xffff00000a5ebec0 to 0xffff00000a5ec000) [ 305.499429] bec0: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.499674] bee0: 574f4c465245564f 0000000000000000 0000000000000000 8000000080808080 [ 305.499904] bf00: 0000000000000040 0000000000000038 fefefeff1b4bc2ff 7f7f7f7f7f7fff7f [ 305.500189] bf20: 0101010101010101 0000000000000000 000000000047a4c8 0000000000000038 [ 305.500712] bf40: 0000000000000000 0000ffffa2601280 0000ffffc63f6068 00000000004b5000 [ 305.501241] bf60: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.501791] bf80: 0000000000000020 0000000000000000 00000000004b5000 000000001c4cc458 [ 305.502314] bfa0: 0000000000000000 0000ffffc63f7950 000000000040a3c4 0000ffffc63f70e0 [ 305.502762] bfc0: 0000ffffa2601268 0000000080000000 0000000000000001 0000000000000040 [ 305.503207] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 305.503680] [<ffff000008082fb0>] el0_svc_naked+0x24/0x28 [ 305.504720] Kernel Offset: disabled [ 305.505189] CPU features: 0x002082 [ 305.505473] Memory Limit: none [ 305.506181] ---[ end Kernel panic - not syncing: kernel stack overflow This patch was co-authored by Ard Biesheuvel and Mark Rutland. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Mark Rutland <mark.rutland@arm.com> Reviewed-by: Will Deacon <will.deacon@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com>
2017-07-14 19:30:35 +00:00
0:
/*
* Either we've just detected an overflow, or we've taken an exception
* while on the overflow stack. Either way, we won't return to
* userspace, and can clobber EL0 registers to free up GPRs.
*/
/* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */
msr tpidr_el0, x0
/* Recover the original x0 value and stash it in tpidrro_el0 */
sub x0, sp, x0
msr tpidrro_el0, x0
/* Switch to the overflow stack */
adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0
/*
* Check whether we were already on the overflow stack. This may happen
* after panic() re-enables interrupts.
*/
mrs x0, tpidr_el0 // sp of interrupted context
sub x0, sp, x0 // delta with top of overflow stack
tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range?
b.ne __bad_stack // no? -> bad stack pointer
/* We were already on the overflow stack. Restore sp/x0 and carry on. */
sub sp, sp, x0
mrs x0, tpidrro_el0
#endif
b el\()\el\()_\label
.endm
.macro tramp_alias, dst, sym
mov_q \dst, TRAMP_VALIAS
add \dst, \dst, #(\sym - .entry.tramp.text)
.endm
// This macro corrupts x0-x3. It is the caller's duty
// to save/restore them if required.
.macro apply_ssbd, state, tmp1, tmp2
#ifdef CONFIG_ARM64_SSBD
alternative_cb arm64_enable_wa2_handling
b .L__asm_ssbd_skip\@
alternative_cb_end
ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1
cbz \tmp2, .L__asm_ssbd_skip\@
ldr \tmp2, [tsk, #TSK_TI_FLAGS]
tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@
mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
mov w1, #\state
alternative_cb arm64_update_smccc_conduit
nop // Patched to SMC/HVC #0
alternative_cb_end
.L__asm_ssbd_skip\@:
#endif
.endm
.macro kernel_entry, el, regsize = 64
.if \regsize == 32
mov w0, w0 // zero upper 32 bits of x0
.endif
stp x0, x1, [sp, #16 * 0]
stp x2, x3, [sp, #16 * 1]
stp x4, x5, [sp, #16 * 2]
stp x6, x7, [sp, #16 * 3]
stp x8, x9, [sp, #16 * 4]
stp x10, x11, [sp, #16 * 5]
stp x12, x13, [sp, #16 * 6]
stp x14, x15, [sp, #16 * 7]
stp x16, x17, [sp, #16 * 8]
stp x18, x19, [sp, #16 * 9]
stp x20, x21, [sp, #16 * 10]
stp x22, x23, [sp, #16 * 11]
stp x24, x25, [sp, #16 * 12]
stp x26, x27, [sp, #16 * 13]
stp x28, x29, [sp, #16 * 14]
.if \el == 0
clear_gp_regs
mrs x21, sp_el0
ldr_this_cpu tsk, __entry_task, x20
msr sp_el0, tsk
// Ensure MDSCR_EL1.SS is clear, since we can unmask debug exceptions
// when scheduling.
ldr x19, [tsk, #TSK_TI_FLAGS]
disable_step_tsk x19, x20
apply_ssbd 1, x22, x23
ptrauth_keys_install_kernel tsk, 1, x20, x22, x23
.else
add x21, sp, #S_FRAME_SIZE
get_current_task tsk
/* Save the task's original addr_limit and set USER_DS */
arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in <linux/sched.h> this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: AKASHI Takahiro <takahiro.akashi@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: James Morse <james.morse@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-03 20:23:13 +00:00
ldr x20, [tsk, #TSK_TI_ADDR_LIMIT]
str x20, [sp, #S_ORIG_ADDR_LIMIT]
mov x20, #USER_DS
arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in <linux/sched.h> this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: AKASHI Takahiro <takahiro.akashi@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: James Morse <james.morse@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-03 20:23:13 +00:00
str x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
.endif /* \el == 0 */
mrs x22, elr_el1
mrs x23, spsr_el1
stp lr, x21, [sp, #S_LR]
arm64: unwind: reference pt_regs via embedded stack frame As it turns out, the unwind code is slightly broken, and probably has been for a while. The problem is in the dumping of the exception stack, which is intended to dump the contents of the pt_regs struct at each level in the call stack where an exception was taken and routed to a routine marked as __exception (which means its stack frame is right below the pt_regs struct on the stack). 'Right below the pt_regs struct' is ill defined, though: the unwind code assigns 'frame pointer + 0x10' to the .sp member of the stackframe struct at each level, and dump_backtrace() happily dereferences that as the pt_regs pointer when encountering an __exception routine. However, the actual size of the stack frame created by this routine (which could be one of many __exception routines we have in the kernel) is not known, and so frame.sp is pretty useless to figure out where struct pt_regs really is. So it seems the only way to ensure that we can find our struct pt_regs when walking the stack frames is to put it at a known fixed offset of the stack frame pointer that is passed to such __exception routines. The simplest way to do that is to put it inside pt_regs itself, which is the main change implemented by this patch. As a bonus, doing this allows us to get rid of a fair amount of cruft related to walking from one stack to the other, which is especially nice since we intend to introduce yet another stack for overflow handling once we add support for vmapped stacks. It also fixes an inconsistency where we only add a stack frame pointing to ELR_EL1 if we are executing from the IRQ stack but not when we are executing from the task stack. To consistly identify exceptions regs even in the presence of exceptions taken from entry code, we must check whether the next frame was created by entry text, rather than whether the current frame was crated by exception text. To avoid backtracing using PCs that fall in the idmap, or are controlled by userspace, we must explcitly zero the FP and LR in startup paths, and must ensure that the frame embedded in pt_regs is zeroed upon entry from EL0. To avoid these NULL entries showin in the backtrace, unwind_frame() is updated to avoid them. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> [Mark: compare current frame against .entry.text, avoid bogus PCs] Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Will Deacon <will.deacon@arm.com>
2017-07-22 17:45:33 +00:00
/*
* In order to be able to dump the contents of struct pt_regs at the
* time the exception was taken (in case we attempt to walk the call
* stack later), chain it together with the stack frames.
*/
.if \el == 0
stp xzr, xzr, [sp, #S_STACKFRAME]
.else
stp x29, x22, [sp, #S_STACKFRAME]
.endif
add x29, sp, #S_STACKFRAME
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
* Set the TTBR0 PAN bit in SPSR. When the exception is taken from
* EL0, there is no need to check the state of TTBR0_EL1 since
* accesses are always enabled.
* Note that the meaning of this bit differs from the ARMv8.1 PAN
* feature as all TTBR0_EL1 accesses are disabled, not just those to
* user mappings.
*/
alternative_if ARM64_HAS_PAN
b 1f // skip TTBR0 PAN
alternative_else_nop_endif
.if \el != 0
mrs x21, ttbr0_el1
tst x21, #TTBR_ASID_MASK // Check for the reserved ASID
orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
b.eq 1f // TTBR0 access already disabled
and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR
.endif
__uaccess_ttbr0_disable x21
1:
#endif
stp x22, x23, [sp, #S_PC]
/* Not in a syscall by default (el0_svc overwrites for real syscall) */
.if \el == 0
mov w21, #NO_SYSCALL
arm64: syscallno is secretly an int, make it official The upper 32 bits of the syscallno field in thread_struct are handled inconsistently, being sometimes zero extended and sometimes sign-extended. In fact, only the lower 32 bits seem to have any real significance for the behaviour of the code: it's been OK to handle the upper bits inconsistently because they don't matter. Currently, the only place I can find where those bits are significant is in calling trace_sys_enter(), which may be unintentional: for example, if a compat tracer attempts to cancel a syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the syscall-enter-stop, it will be traced as syscall 4294967295 rather than -1 as might be expected (and as occurs for a native tracer doing the same thing). Elsewhere, reads of syscallno cast it to an int or truncate it. There's also a conspicuous amount of code and casting to bodge around the fact that although semantically an int, syscallno is stored as a u64. Let's not pretend any more. In order to preserve the stp x instruction that stores the syscall number in entry.S, this patch special-cases the layout of struct pt_regs for big endian so that the newly 32-bit syscallno field maps onto the low bits of the stored value. This is not beautiful, but benchmarking of the getpid syscall on Juno suggests indicates a minor slowdown if the stp is split into an stp x and stp w. Signed-off-by: Dave Martin <Dave.Martin@arm.com> Acked-by: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 14:35:53 +00:00
str w21, [sp, #S_SYSCALLNO]
.endif
/* Save pmr */
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
mrs_s x20, SYS_ICC_PMR_EL1
str x20, [sp, #S_PMR_SAVE]
alternative_else_nop_endif
/*
* Registers that may be useful after this macro is invoked:
*
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
* x20 - ICC_PMR_EL1
* x21 - aborted SP
* x22 - aborted PC
* x23 - aborted PSTATE
*/
.endm
.macro kernel_exit, el
.if \el != 0
disable_daif
/* Restore the task's original addr_limit. */
ldr x20, [sp, #S_ORIG_ADDR_LIMIT]
arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in <linux/sched.h> this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: AKASHI Takahiro <takahiro.akashi@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: James Morse <james.morse@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-03 20:23:13 +00:00
str x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to restore UAO, it will be restored from SPSR_EL1 */
.endif
/* Restore pmr */
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
ldr x20, [sp, #S_PMR_SAVE]
msr_s SYS_ICC_PMR_EL1, x20
mrs_s x21, SYS_ICC_CTLR_EL1
tbz x21, #6, .L__skip_pmr_sync\@ // Check for ICC_CTLR_EL1.PMHE
dsb sy // Ensure priority change is seen by redistributor
.L__skip_pmr_sync\@:
alternative_else_nop_endif
ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
.if \el == 0
ct_user_enter
.endif
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
* Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
* PAN bit checking.
*/
alternative_if ARM64_HAS_PAN
b 2f // skip TTBR0 PAN
alternative_else_nop_endif
.if \el != 0
tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
.endif
__uaccess_ttbr0_enable x0, x1
.if \el == 0
/*
* Enable errata workarounds only if returning to user. The only
* workaround currently required for TTBR0_EL1 changes are for the
* Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
* corruption).
*/
bl post_ttbr_update_workaround
.endif
1:
.if \el != 0
and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit
.endif
2:
#endif
.if \el == 0
ldr x23, [sp, #S_SP] // load return stack pointer
msr sp_el0, x23
tst x22, #PSR_MODE32_BIT // native task?
b.eq 3f
#ifdef CONFIG_ARM64_ERRATUM_845719
alternative_if ARM64_WORKAROUND_845719
#ifdef CONFIG_PID_IN_CONTEXTIDR
mrs x29, contextidr_el1
msr contextidr_el1, x29
#else
msr contextidr_el1, xzr
#endif
alternative_else_nop_endif
#endif
3:
#ifdef CONFIG_ARM64_ERRATUM_1418040
alternative_if_not ARM64_WORKAROUND_1418040
b 4f
alternative_else_nop_endif
/*
* if (x22.mode32 == cntkctl_el1.el0vcten)
* cntkctl_el1.el0vcten = ~cntkctl_el1.el0vcten
*/
mrs x1, cntkctl_el1
eon x0, x1, x22, lsr #3
tbz x0, #1, 4f
eor x1, x1, #2 // ARCH_TIMER_USR_VCT_ACCESS_EN
msr cntkctl_el1, x1
4:
#endif
/* No kernel C function calls after this as user keys are set. */
ptrauth_keys_install_user tsk, x0, x1, x2
apply_ssbd 0, x0, x1
.endif
msr elr_el1, x21 // set up the return data
msr spsr_el1, x22
ldp x0, x1, [sp, #16 * 0]
ldp x2, x3, [sp, #16 * 1]
ldp x4, x5, [sp, #16 * 2]
ldp x6, x7, [sp, #16 * 3]
ldp x8, x9, [sp, #16 * 4]
ldp x10, x11, [sp, #16 * 5]
ldp x12, x13, [sp, #16 * 6]
ldp x14, x15, [sp, #16 * 7]
ldp x16, x17, [sp, #16 * 8]
ldp x18, x19, [sp, #16 * 9]
ldp x20, x21, [sp, #16 * 10]
ldp x22, x23, [sp, #16 * 11]
ldp x24, x25, [sp, #16 * 12]
ldp x26, x27, [sp, #16 * 13]
ldp x28, x29, [sp, #16 * 14]
ldr lr, [sp, #S_LR]
add sp, sp, #S_FRAME_SIZE // restore sp
.if \el == 0
alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
bne 5f
msr far_el1, x30
tramp_alias x30, tramp_exit_native
br x30
5:
tramp_alias x30, tramp_exit_compat
br x30
#endif
.else
eret
.endif
sb
.endm
.macro irq_stack_entry
mov x19, sp // preserve the original sp
/*
arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in <linux/sched.h> this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: AKASHI Takahiro <takahiro.akashi@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: James Morse <james.morse@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-03 20:23:13 +00:00
* Compare sp with the base of the task stack.
* If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
* and should switch to the irq stack.
*/
arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in <linux/sched.h> this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: AKASHI Takahiro <takahiro.akashi@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: James Morse <james.morse@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-03 20:23:13 +00:00
ldr x25, [tsk, TSK_STACK]
eor x25, x25, x19
and x25, x25, #~(THREAD_SIZE - 1)
cbnz x25, 9998f
ldr_this_cpu x25, irq_stack_ptr, x26
arm64: kernel: remove {THREAD,IRQ_STACK}_START_SP For historical reasons, we leave the top 16 bytes of our task and IRQ stacks unused, a practice used to ensure that the SP can always be masked to find the base of the current stack (historically, where thread_info could be found). However, this is not necessary, as: * When an exception is taken from a task stack, we decrement the SP by S_FRAME_SIZE and stash the exception registers before we compare the SP against the task stack. In such cases, the SP must be at least S_FRAME_SIZE below the limit, and can be safely masked to determine whether the task stack is in use. * When transitioning to an IRQ stack, we'll place a dummy frame onto the IRQ stack before enabling asynchronous exceptions, or executing code we expect to trigger faults. Thus, if an exception is taken from the IRQ stack, the SP must be at least 16 bytes below the limit. * We no longer mask the SP to find the thread_info, which is now found via sp_el0. Note that historically, the offset was critical to ensure that cpu_switch_to() found the correct stack for new threads that hadn't yet executed ret_from_fork(). Given that, this initial offset serves no purpose, and can be removed. This brings us in-line with other architectures (e.g. x86) which do not rely on this masking. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> [Mark: rebase, kill THREAD_START_SP, commit msg additions] Signed-off-by: Mark Rutland <mark.rutland@arm.com> Reviewed-by: Will Deacon <will.deacon@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com>
2017-07-20 16:15:45 +00:00
mov x26, #IRQ_STACK_SIZE
add x26, x25, x26
/* switch to the irq stack */
mov sp, x26
9998:
.endm
/*
* x19 should be preserved between irq_stack_entry and
* irq_stack_exit.
*/
.macro irq_stack_exit
mov sp, x19
.endm
/* GPRs used by entry code */
tsk .req x28 // current thread_info
/*
* Interrupt handling.
*/
.macro irq_handler
ldr_l x1, handle_arch_irq
mov x0, sp
irq_stack_entry
blr x1
irq_stack_exit
.endm
#ifdef CONFIG_ARM64_PSEUDO_NMI
/*
* Set res to 0 if irqs were unmasked in interrupted context.
* Otherwise set res to non-0 value.
*/
.macro test_irqs_unmasked res:req, pmr:req
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
sub \res, \pmr, #GIC_PRIO_IRQON
alternative_else
mov \res, xzr
alternative_endif
.endm
#endif
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
.macro gic_prio_kentry_setup, tmp:req
#ifdef CONFIG_ARM64_PSEUDO_NMI
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
mov \tmp, #(GIC_PRIO_PSR_I_SET | GIC_PRIO_IRQON)
msr_s SYS_ICC_PMR_EL1, \tmp
alternative_else_nop_endif
#endif
.endm
.macro gic_prio_irq_setup, pmr:req, tmp:req
#ifdef CONFIG_ARM64_PSEUDO_NMI
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
orr \tmp, \pmr, #GIC_PRIO_PSR_I_SET
msr_s SYS_ICC_PMR_EL1, \tmp
alternative_else_nop_endif
#endif
.endm
.text
/*
* Exception vectors.
*/
.pushsection ".entry.text", "ax"
.align 11
SYM_CODE_START(vectors)
kernel_ventry 1, sync_invalid // Synchronous EL1t
kernel_ventry 1, irq_invalid // IRQ EL1t
kernel_ventry 1, fiq_invalid // FIQ EL1t
kernel_ventry 1, error_invalid // Error EL1t
kernel_ventry 1, sync // Synchronous EL1h
kernel_ventry 1, irq // IRQ EL1h
kernel_ventry 1, fiq_invalid // FIQ EL1h
kernel_ventry 1, error // Error EL1h
kernel_ventry 0, sync // Synchronous 64-bit EL0
kernel_ventry 0, irq // IRQ 64-bit EL0
kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0
kernel_ventry 0, error // Error 64-bit EL0
#ifdef CONFIG_COMPAT
kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0
kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0
kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0
kernel_ventry 0, error_compat, 32 // Error 32-bit EL0
#else
kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0
kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0
kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0
kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0
#endif
SYM_CODE_END(vectors)
arm64: add VMAP_STACK overflow detection This patch adds stack overflow detection to arm64, usable when vmap'd stacks are in use. Overflow is detected in a small preamble executed for each exception entry, which checks whether there is enough space on the current stack for the general purpose registers to be saved. If there is not enough space, the overflow handler is invoked on a per-cpu overflow stack. This approach preserves the original exception information in ESR_EL1 (and where appropriate, FAR_EL1). Task and IRQ stacks are aligned to double their size, enabling overflow to be detected with a single bit test. For example, a 16K stack is aligned to 32K, ensuring that bit 14 of the SP must be zero. On an overflow (or underflow), this bit is flipped. Thus, overflow (of less than the size of the stack) can be detected by testing whether this bit is set. The overflow check is performed before any attempt is made to access the stack, avoiding recursive faults (and the loss of exception information these would entail). As logical operations cannot be performed on the SP directly, the SP is temporarily swapped with a general purpose register using arithmetic operations to enable the test to be performed. This gives us a useful error message on stack overflow, as can be trigger with the LKDTM overflow test: [ 305.388749] lkdtm: Performing direct entry OVERFLOW [ 305.395444] Insufficient stack space to handle exception! [ 305.395482] ESR: 0x96000047 -- DABT (current EL) [ 305.399890] FAR: 0xffff00000a5e7f30 [ 305.401315] Task stack: [0xffff00000a5e8000..0xffff00000a5ec000] [ 305.403815] IRQ stack: [0xffff000008000000..0xffff000008004000] [ 305.407035] Overflow stack: [0xffff80003efce4e0..0xffff80003efcf4e0] [ 305.409622] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.412785] Hardware name: linux,dummy-virt (DT) [ 305.415756] task: ffff80003d051c00 task.stack: ffff00000a5e8000 [ 305.419221] PC is at recursive_loop+0x10/0x48 [ 305.421637] LR is at recursive_loop+0x38/0x48 [ 305.423768] pc : [<ffff00000859f330>] lr : [<ffff00000859f358>] pstate: 40000145 [ 305.428020] sp : ffff00000a5e7f50 [ 305.430469] x29: ffff00000a5e8350 x28: ffff80003d051c00 [ 305.433191] x27: ffff000008981000 x26: ffff000008f80400 [ 305.439012] x25: ffff00000a5ebeb8 x24: ffff00000a5ebeb8 [ 305.440369] x23: ffff000008f80138 x22: 0000000000000009 [ 305.442241] x21: ffff80003ce65000 x20: ffff000008f80188 [ 305.444552] x19: 0000000000000013 x18: 0000000000000006 [ 305.446032] x17: 0000ffffa2601280 x16: ffff0000081fe0b8 [ 305.448252] x15: ffff000008ff546d x14: 000000000047a4c8 [ 305.450246] x13: ffff000008ff7872 x12: 0000000005f5e0ff [ 305.452953] x11: ffff000008ed2548 x10: 000000000005ee8d [ 305.454824] x9 : ffff000008545380 x8 : ffff00000a5e8770 [ 305.457105] x7 : 1313131313131313 x6 : 00000000000000e1 [ 305.459285] x5 : 0000000000000000 x4 : 0000000000000000 [ 305.461781] x3 : 0000000000000000 x2 : 0000000000000400 [ 305.465119] x1 : 0000000000000013 x0 : 0000000000000012 [ 305.467724] Kernel panic - not syncing: kernel stack overflow [ 305.470561] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.473325] Hardware name: linux,dummy-virt (DT) [ 305.475070] Call trace: [ 305.476116] [<ffff000008088ad8>] dump_backtrace+0x0/0x378 [ 305.478991] [<ffff000008088e64>] show_stack+0x14/0x20 [ 305.481237] [<ffff00000895a178>] dump_stack+0x98/0xb8 [ 305.483294] [<ffff0000080c3288>] panic+0x118/0x280 [ 305.485673] [<ffff0000080c2e9c>] nmi_panic+0x6c/0x70 [ 305.486216] [<ffff000008089710>] handle_bad_stack+0x118/0x128 [ 305.486612] Exception stack(0xffff80003efcf3a0 to 0xffff80003efcf4e0) [ 305.487334] f3a0: 0000000000000012 0000000000000013 0000000000000400 0000000000000000 [ 305.488025] f3c0: 0000000000000000 0000000000000000 00000000000000e1 1313131313131313 [ 305.488908] f3e0: ffff00000a5e8770 ffff000008545380 000000000005ee8d ffff000008ed2548 [ 305.489403] f400: 0000000005f5e0ff ffff000008ff7872 000000000047a4c8 ffff000008ff546d [ 305.489759] f420: ffff0000081fe0b8 0000ffffa2601280 0000000000000006 0000000000000013 [ 305.490256] f440: ffff000008f80188 ffff80003ce65000 0000000000000009 ffff000008f80138 [ 305.490683] f460: ffff00000a5ebeb8 ffff00000a5ebeb8 ffff000008f80400 ffff000008981000 [ 305.491051] f480: ffff80003d051c00 ffff00000a5e8350 ffff00000859f358 ffff00000a5e7f50 [ 305.491444] f4a0: ffff00000859f330 0000000040000145 0000000000000000 0000000000000000 [ 305.492008] f4c0: 0001000000000000 0000000000000000 ffff00000a5e8350 ffff00000859f330 [ 305.493063] [<ffff00000808205c>] __bad_stack+0x88/0x8c [ 305.493396] [<ffff00000859f330>] recursive_loop+0x10/0x48 [ 305.493731] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494088] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494425] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494649] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.494898] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495205] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495453] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.495708] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496000] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496302] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496644] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.496894] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.497138] [<ffff00000859f358>] recursive_loop+0x38/0x48 [ 305.497325] [<ffff00000859f3dc>] lkdtm_OVERFLOW+0x14/0x20 [ 305.497506] [<ffff00000859f314>] lkdtm_do_action+0x1c/0x28 [ 305.497786] [<ffff00000859f178>] direct_entry+0xe0/0x170 [ 305.498095] [<ffff000008345568>] full_proxy_write+0x60/0xa8 [ 305.498387] [<ffff0000081fb7f4>] __vfs_write+0x1c/0x128 [ 305.498679] [<ffff0000081fcc68>] vfs_write+0xa0/0x1b0 [ 305.498926] [<ffff0000081fe0fc>] SyS_write+0x44/0xa0 [ 305.499182] Exception stack(0xffff00000a5ebec0 to 0xffff00000a5ec000) [ 305.499429] bec0: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.499674] bee0: 574f4c465245564f 0000000000000000 0000000000000000 8000000080808080 [ 305.499904] bf00: 0000000000000040 0000000000000038 fefefeff1b4bc2ff 7f7f7f7f7f7fff7f [ 305.500189] bf20: 0101010101010101 0000000000000000 000000000047a4c8 0000000000000038 [ 305.500712] bf40: 0000000000000000 0000ffffa2601280 0000ffffc63f6068 00000000004b5000 [ 305.501241] bf60: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.501791] bf80: 0000000000000020 0000000000000000 00000000004b5000 000000001c4cc458 [ 305.502314] bfa0: 0000000000000000 0000ffffc63f7950 000000000040a3c4 0000ffffc63f70e0 [ 305.502762] bfc0: 0000ffffa2601268 0000000080000000 0000000000000001 0000000000000040 [ 305.503207] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 305.503680] [<ffff000008082fb0>] el0_svc_naked+0x24/0x28 [ 305.504720] Kernel Offset: disabled [ 305.505189] CPU features: 0x002082 [ 305.505473] Memory Limit: none [ 305.506181] ---[ end Kernel panic - not syncing: kernel stack overflow This patch was co-authored by Ard Biesheuvel and Mark Rutland. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Mark Rutland <mark.rutland@arm.com> Reviewed-by: Will Deacon <will.deacon@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com>
2017-07-14 19:30:35 +00:00
#ifdef CONFIG_VMAP_STACK
/*
* We detected an overflow in kernel_ventry, which switched to the
* overflow stack. Stash the exception regs, and head to our overflow
* handler.
*/
__bad_stack:
/* Restore the original x0 value */
mrs x0, tpidrro_el0
/*
* Store the original GPRs to the new stack. The orginal SP (minus
* S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry.
*/
sub sp, sp, #S_FRAME_SIZE
kernel_entry 1
mrs x0, tpidr_el0
add x0, x0, #S_FRAME_SIZE
str x0, [sp, #S_SP]
/* Stash the regs for handle_bad_stack */
mov x0, sp
/* Time to die */
bl handle_bad_stack
ASM_BUG()
#endif /* CONFIG_VMAP_STACK */
/*
* Invalid mode handlers
*/
.macro inv_entry, el, reason, regsize = 64
kernel_entry \el, \regsize
mov x0, sp
mov x1, #\reason
mrs x2, esr_el1
bl bad_mode
ASM_BUG()
.endm
SYM_CODE_START_LOCAL(el0_sync_invalid)
inv_entry 0, BAD_SYNC
SYM_CODE_END(el0_sync_invalid)
SYM_CODE_START_LOCAL(el0_irq_invalid)
inv_entry 0, BAD_IRQ
SYM_CODE_END(el0_irq_invalid)
SYM_CODE_START_LOCAL(el0_fiq_invalid)
inv_entry 0, BAD_FIQ
SYM_CODE_END(el0_fiq_invalid)
SYM_CODE_START_LOCAL(el0_error_invalid)
inv_entry 0, BAD_ERROR
SYM_CODE_END(el0_error_invalid)
#ifdef CONFIG_COMPAT
SYM_CODE_START_LOCAL(el0_fiq_invalid_compat)
inv_entry 0, BAD_FIQ, 32
SYM_CODE_END(el0_fiq_invalid_compat)
#endif
SYM_CODE_START_LOCAL(el1_sync_invalid)
inv_entry 1, BAD_SYNC
SYM_CODE_END(el1_sync_invalid)
SYM_CODE_START_LOCAL(el1_irq_invalid)
inv_entry 1, BAD_IRQ
SYM_CODE_END(el1_irq_invalid)
SYM_CODE_START_LOCAL(el1_fiq_invalid)
inv_entry 1, BAD_FIQ
SYM_CODE_END(el1_fiq_invalid)
SYM_CODE_START_LOCAL(el1_error_invalid)
inv_entry 1, BAD_ERROR
SYM_CODE_END(el1_error_invalid)
/*
* EL1 mode handlers.
*/
.align 6
SYM_CODE_START_LOCAL_NOALIGN(el1_sync)
kernel_entry 1
mov x0, sp
bl el1_sync_handler
kernel_exit 1
SYM_CODE_END(el1_sync)
.align 6
SYM_CODE_START_LOCAL_NOALIGN(el1_irq)
kernel_entry 1
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
gic_prio_irq_setup pmr=x20, tmp=x1
enable_da_f
#ifdef CONFIG_ARM64_PSEUDO_NMI
test_irqs_unmasked res=x0, pmr=x20
cbz x0, 1f
bl asm_nmi_enter
1:
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off
#endif
irq_handler
#ifdef CONFIG_PREEMPTION
ldr x24, [tsk, #TSK_TI_PREEMPT] // get preempt count
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
/*
* DA_F were cleared at start of handling. If anything is set in DAIF,
* we come back from an NMI, so skip preemption
*/
mrs x0, daif
orr x24, x24, x0
alternative_else_nop_endif
cbnz x24, 1f // preempt count != 0 || NMI return path
arm64: entry.S: Do not preempt from IRQ before all cpufeatures are enabled Preempting from IRQ-return means that the task has its PSTATE saved on the stack, which will get restored when the task is resumed and does the actual IRQ return. However, enabling some CPU features requires modifying the PSTATE. This means that, if a task was scheduled out during an IRQ-return before all CPU features are enabled, the task might restore a PSTATE that does not include the feature enablement changes once scheduled back in. * Task 1: PAN == 0 ---| |--------------- | |<- return from IRQ, PSTATE.PAN = 0 | <- IRQ | +--------+ <- preempt() +-- ^ | reschedule Task 1, PSTATE.PAN == 1 * Init: --------------------+------------------------ ^ | enable_cpu_features set PSTATE.PAN on all CPUs Worse than this, since PSTATE is untouched when task switching is done, a task missing the new bits in PSTATE might affect another task, if both do direct calls to schedule() (outside of IRQ/exception contexts). Fix this by preventing preemption on IRQ-return until features are enabled on all CPUs. This way the only PSTATE values that are saved on the stack are from synchronous exceptions. These are expected to be fatal this early, the exception is BRK for WARN_ON(), but as this uses do_debug_exception() which keeps IRQs masked, it shouldn't call schedule(). Signed-off-by: Julien Thierry <julien.thierry@arm.com> [james: Replaced a really cool hack, with an even simpler static key in C. expanded commit message with Julien's cover-letter ascii art] Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will@kernel.org>
2019-10-15 17:25:44 +00:00
bl arm64_preempt_schedule_irq // irq en/disable is done inside
1:
#endif
#ifdef CONFIG_ARM64_PSEUDO_NMI
/*
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
* When using IRQ priority masking, we can get spurious interrupts while
* PMR is set to GIC_PRIO_IRQOFF. An NMI might also have occurred in a
* section with interrupts disabled. Skip tracing in those cases.
*/
test_irqs_unmasked res=x0, pmr=x20
cbz x0, 1f
bl asm_nmi_exit
1:
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
#ifdef CONFIG_ARM64_PSEUDO_NMI
test_irqs_unmasked res=x0, pmr=x20
cbnz x0, 1f
#endif
bl trace_hardirqs_on
1:
#endif
kernel_exit 1
SYM_CODE_END(el1_irq)
/*
* EL0 mode handlers.
*/
.align 6
SYM_CODE_START_LOCAL_NOALIGN(el0_sync)
kernel_entry 0
mov x0, sp
bl el0_sync_handler
b ret_to_user
SYM_CODE_END(el0_sync)
#ifdef CONFIG_COMPAT
.align 6
SYM_CODE_START_LOCAL_NOALIGN(el0_sync_compat)
kernel_entry 0, 32
mov x0, sp
bl el0_sync_compat_handler
b ret_to_user
SYM_CODE_END(el0_sync_compat)
.align 6
SYM_CODE_START_LOCAL_NOALIGN(el0_irq_compat)
kernel_entry 0, 32
b el0_irq_naked
SYM_CODE_END(el0_irq_compat)
SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat)
kernel_entry 0, 32
b el0_error_naked
SYM_CODE_END(el0_error_compat)
#endif
.align 6
SYM_CODE_START_LOCAL_NOALIGN(el0_irq)
kernel_entry 0
el0_irq_naked:
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
gic_prio_irq_setup pmr=x20, tmp=x0
arm64: entry: Move ct_user_exit before any other exception When taking an SError or Debug exception from EL0, we run the C handler for these exceptions before updating the context tracking code and unmasking lower priority interrupts. When booting with nohz_full lockdep tells us we got this wrong: | ============================= | WARNING: suspicious RCU usage | 5.3.0-rc2-00010-gb4b5e9dcb11b-dirty #11271 Not tainted | ----------------------------- | include/linux/rcupdate.h:643 rcu_read_unlock() used illegally wh! | | other info that might help us debug this: | | | RCU used illegally from idle CPU! | rcu_scheduler_active = 2, debug_locks = 1 | RCU used illegally from extended quiescent state! | 1 lock held by a.out/432: | #0: 00000000c7a79515 (rcu_read_lock){....}, at: brk_handler+0x00 | | stack backtrace: | CPU: 1 PID: 432 Comm: a.out Not tainted 5.3.0-rc2-00010-gb4b5e9d1 | Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno De8 | Call trace: | dump_backtrace+0x0/0x140 | show_stack+0x14/0x20 | dump_stack+0xbc/0x104 | lockdep_rcu_suspicious+0xf8/0x108 | brk_handler+0x164/0x1b0 | do_debug_exception+0x11c/0x278 | el0_dbg+0x14/0x20 Moving the ct_user_exit calls to be before do_debug_exception() means they are also before trace_hardirqs_off() has been updated. Add a new ct_user_exit_irqoff macro to avoid the context-tracking code using irqsave/restore before we've updated trace_hardirqs_off(). To be consistent, do this everywhere. The C helper is called enter_from_user_mode() to match x86 in the hope we can merge them into kernel/context_tracking.c later. Cc: Masami Hiramatsu <mhiramat@kernel.org> Fixes: 6c81fe7925cc4c42 ("arm64: enable context tracking") Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will@kernel.org>
2019-08-20 17:45:57 +00:00
ct_user_exit_irqoff
enable_da_f
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off
#endif
#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
tbz x22, #55, 1f
bl do_el0_irq_bp_hardening
1:
#endif
irq_handler
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_on
#endif
b ret_to_user
SYM_CODE_END(el0_irq)
SYM_CODE_START_LOCAL(el1_error)
kernel_entry 1
mrs x1, esr_el1
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
gic_prio_kentry_setup tmp=x2
enable_dbg
mov x0, sp
bl do_serror
kernel_exit 1
SYM_CODE_END(el1_error)
SYM_CODE_START_LOCAL(el0_error)
kernel_entry 0
el0_error_naked:
arm64: entry: Move ct_user_exit before any other exception When taking an SError or Debug exception from EL0, we run the C handler for these exceptions before updating the context tracking code and unmasking lower priority interrupts. When booting with nohz_full lockdep tells us we got this wrong: | ============================= | WARNING: suspicious RCU usage | 5.3.0-rc2-00010-gb4b5e9dcb11b-dirty #11271 Not tainted | ----------------------------- | include/linux/rcupdate.h:643 rcu_read_unlock() used illegally wh! | | other info that might help us debug this: | | | RCU used illegally from idle CPU! | rcu_scheduler_active = 2, debug_locks = 1 | RCU used illegally from extended quiescent state! | 1 lock held by a.out/432: | #0: 00000000c7a79515 (rcu_read_lock){....}, at: brk_handler+0x00 | | stack backtrace: | CPU: 1 PID: 432 Comm: a.out Not tainted 5.3.0-rc2-00010-gb4b5e9d1 | Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno De8 | Call trace: | dump_backtrace+0x0/0x140 | show_stack+0x14/0x20 | dump_stack+0xbc/0x104 | lockdep_rcu_suspicious+0xf8/0x108 | brk_handler+0x164/0x1b0 | do_debug_exception+0x11c/0x278 | el0_dbg+0x14/0x20 Moving the ct_user_exit calls to be before do_debug_exception() means they are also before trace_hardirqs_off() has been updated. Add a new ct_user_exit_irqoff macro to avoid the context-tracking code using irqsave/restore before we've updated trace_hardirqs_off(). To be consistent, do this everywhere. The C helper is called enter_from_user_mode() to match x86 in the hope we can merge them into kernel/context_tracking.c later. Cc: Masami Hiramatsu <mhiramat@kernel.org> Fixes: 6c81fe7925cc4c42 ("arm64: enable context tracking") Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will@kernel.org>
2019-08-20 17:45:57 +00:00
mrs x25, esr_el1
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
gic_prio_kentry_setup tmp=x2
arm64: entry: Move ct_user_exit before any other exception When taking an SError or Debug exception from EL0, we run the C handler for these exceptions before updating the context tracking code and unmasking lower priority interrupts. When booting with nohz_full lockdep tells us we got this wrong: | ============================= | WARNING: suspicious RCU usage | 5.3.0-rc2-00010-gb4b5e9dcb11b-dirty #11271 Not tainted | ----------------------------- | include/linux/rcupdate.h:643 rcu_read_unlock() used illegally wh! | | other info that might help us debug this: | | | RCU used illegally from idle CPU! | rcu_scheduler_active = 2, debug_locks = 1 | RCU used illegally from extended quiescent state! | 1 lock held by a.out/432: | #0: 00000000c7a79515 (rcu_read_lock){....}, at: brk_handler+0x00 | | stack backtrace: | CPU: 1 PID: 432 Comm: a.out Not tainted 5.3.0-rc2-00010-gb4b5e9d1 | Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno De8 | Call trace: | dump_backtrace+0x0/0x140 | show_stack+0x14/0x20 | dump_stack+0xbc/0x104 | lockdep_rcu_suspicious+0xf8/0x108 | brk_handler+0x164/0x1b0 | do_debug_exception+0x11c/0x278 | el0_dbg+0x14/0x20 Moving the ct_user_exit calls to be before do_debug_exception() means they are also before trace_hardirqs_off() has been updated. Add a new ct_user_exit_irqoff macro to avoid the context-tracking code using irqsave/restore before we've updated trace_hardirqs_off(). To be consistent, do this everywhere. The C helper is called enter_from_user_mode() to match x86 in the hope we can merge them into kernel/context_tracking.c later. Cc: Masami Hiramatsu <mhiramat@kernel.org> Fixes: 6c81fe7925cc4c42 ("arm64: enable context tracking") Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will@kernel.org>
2019-08-20 17:45:57 +00:00
ct_user_exit_irqoff
enable_dbg
mov x0, sp
arm64: entry: Move ct_user_exit before any other exception When taking an SError or Debug exception from EL0, we run the C handler for these exceptions before updating the context tracking code and unmasking lower priority interrupts. When booting with nohz_full lockdep tells us we got this wrong: | ============================= | WARNING: suspicious RCU usage | 5.3.0-rc2-00010-gb4b5e9dcb11b-dirty #11271 Not tainted | ----------------------------- | include/linux/rcupdate.h:643 rcu_read_unlock() used illegally wh! | | other info that might help us debug this: | | | RCU used illegally from idle CPU! | rcu_scheduler_active = 2, debug_locks = 1 | RCU used illegally from extended quiescent state! | 1 lock held by a.out/432: | #0: 00000000c7a79515 (rcu_read_lock){....}, at: brk_handler+0x00 | | stack backtrace: | CPU: 1 PID: 432 Comm: a.out Not tainted 5.3.0-rc2-00010-gb4b5e9d1 | Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno De8 | Call trace: | dump_backtrace+0x0/0x140 | show_stack+0x14/0x20 | dump_stack+0xbc/0x104 | lockdep_rcu_suspicious+0xf8/0x108 | brk_handler+0x164/0x1b0 | do_debug_exception+0x11c/0x278 | el0_dbg+0x14/0x20 Moving the ct_user_exit calls to be before do_debug_exception() means they are also before trace_hardirqs_off() has been updated. Add a new ct_user_exit_irqoff macro to avoid the context-tracking code using irqsave/restore before we've updated trace_hardirqs_off(). To be consistent, do this everywhere. The C helper is called enter_from_user_mode() to match x86 in the hope we can merge them into kernel/context_tracking.c later. Cc: Masami Hiramatsu <mhiramat@kernel.org> Fixes: 6c81fe7925cc4c42 ("arm64: enable context tracking") Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will@kernel.org>
2019-08-20 17:45:57 +00:00
mov x1, x25
bl do_serror
enable_da_f
b ret_to_user
SYM_CODE_END(el0_error)
/*
* Ok, we need to do extra processing, enter the slow path.
*/
work_pending:
mov x0, sp // 'regs'
bl do_notify_resume
arm64: Add trace_hardirqs_off annotation in ret_to_user When a kernel is built with CONFIG_TRACE_IRQFLAGS the following warning is produced when entering userspace for the first time: WARNING: at /work/Linux/linux-2.6-aarch64/kernel/locking/lockdep.c:3519 Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc3+ #639 Hardware name: Juno (DT) task: ffffffc9768a0000 ti: ffffffc9768a8000 task.ti: ffffffc9768a8000 PC is at check_flags.part.22+0x19c/0x1a8 LR is at check_flags.part.22+0x19c/0x1a8 pc : [<ffffffc0000fba6c>] lr : [<ffffffc0000fba6c>] pstate: 600001c5 sp : ffffffc9768abe10 x29: ffffffc9768abe10 x28: ffffffc9768a8000 x27: 0000000000000000 x26: 0000000000000001 x25: 00000000000000a6 x24: ffffffc00064be6c x23: ffffffc0009f249e x22: ffffffc9768a0000 x21: ffffffc97fea5480 x20: 00000000000001c0 x19: ffffffc00169a000 x18: 0000005558cc7b58 x17: 0000007fb78e3180 x16: 0000005558d2e238 x15: ffffffffffffffff x14: 0ffffffffffffffd x13: 0000000000000008 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: fefefefefefeff63 x9 : 7f7f7f7f7f7f7f7f x8 : 6e655f7371726964 x7 : 0000000000000001 x6 : ffffffc0001079c4 x5 : 0000000000000000 x4 : 0000000000000001 x3 : ffffffc001698438 x2 : 0000000000000000 x1 : ffffffc9768a0000 x0 : 000000000000002e Call trace: [<ffffffc0000fba6c>] check_flags.part.22+0x19c/0x1a8 [<ffffffc0000fc440>] lock_is_held+0x80/0x98 [<ffffffc00064bafc>] __schedule+0x404/0x730 [<ffffffc00064be6c>] schedule+0x44/0xb8 [<ffffffc000085bb0>] ret_to_user+0x0/0x24 possible reason: unannotated irqs-off. irq event stamp: 502169 hardirqs last enabled at (502169): [<ffffffc000085a98>] el0_irq_naked+0x1c/0x24 hardirqs last disabled at (502167): [<ffffffc0000bb3bc>] __do_softirq+0x17c/0x298 softirqs last enabled at (502168): [<ffffffc0000bb43c>] __do_softirq+0x1fc/0x298 softirqs last disabled at (502143): [<ffffffc0000bb830>] irq_exit+0xa0/0xf0 This happens because we disable interrupts in ret_to_user before calling schedule() in work_resched. This patch adds the necessary trace_hardirqs_off annotation. Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Reported-by: Mark Rutland <mark.rutland@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2015-12-04 12:42:29 +00:00
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_on // enabled while in userspace
arm64: Add trace_hardirqs_off annotation in ret_to_user When a kernel is built with CONFIG_TRACE_IRQFLAGS the following warning is produced when entering userspace for the first time: WARNING: at /work/Linux/linux-2.6-aarch64/kernel/locking/lockdep.c:3519 Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc3+ #639 Hardware name: Juno (DT) task: ffffffc9768a0000 ti: ffffffc9768a8000 task.ti: ffffffc9768a8000 PC is at check_flags.part.22+0x19c/0x1a8 LR is at check_flags.part.22+0x19c/0x1a8 pc : [<ffffffc0000fba6c>] lr : [<ffffffc0000fba6c>] pstate: 600001c5 sp : ffffffc9768abe10 x29: ffffffc9768abe10 x28: ffffffc9768a8000 x27: 0000000000000000 x26: 0000000000000001 x25: 00000000000000a6 x24: ffffffc00064be6c x23: ffffffc0009f249e x22: ffffffc9768a0000 x21: ffffffc97fea5480 x20: 00000000000001c0 x19: ffffffc00169a000 x18: 0000005558cc7b58 x17: 0000007fb78e3180 x16: 0000005558d2e238 x15: ffffffffffffffff x14: 0ffffffffffffffd x13: 0000000000000008 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: fefefefefefeff63 x9 : 7f7f7f7f7f7f7f7f x8 : 6e655f7371726964 x7 : 0000000000000001 x6 : ffffffc0001079c4 x5 : 0000000000000000 x4 : 0000000000000001 x3 : ffffffc001698438 x2 : 0000000000000000 x1 : ffffffc9768a0000 x0 : 000000000000002e Call trace: [<ffffffc0000fba6c>] check_flags.part.22+0x19c/0x1a8 [<ffffffc0000fc440>] lock_is_held+0x80/0x98 [<ffffffc00064bafc>] __schedule+0x404/0x730 [<ffffffc00064be6c>] schedule+0x44/0xb8 [<ffffffc000085bb0>] ret_to_user+0x0/0x24 possible reason: unannotated irqs-off. irq event stamp: 502169 hardirqs last enabled at (502169): [<ffffffc000085a98>] el0_irq_naked+0x1c/0x24 hardirqs last disabled at (502167): [<ffffffc0000bb3bc>] __do_softirq+0x17c/0x298 softirqs last enabled at (502168): [<ffffffc0000bb43c>] __do_softirq+0x1fc/0x298 softirqs last disabled at (502143): [<ffffffc0000bb830>] irq_exit+0xa0/0xf0 This happens because we disable interrupts in ret_to_user before calling schedule() in work_resched. This patch adds the necessary trace_hardirqs_off annotation. Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Reported-by: Mark Rutland <mark.rutland@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2015-12-04 12:42:29 +00:00
#endif
arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in <linux/sched.h> this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: AKASHI Takahiro <takahiro.akashi@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: James Morse <james.morse@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-03 20:23:13 +00:00
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
b finish_ret_to_user
/*
* "slow" syscall return path.
*/
ret_to_user:
disable_daif
arm64: Fix incorrect irqflag restore for priority masking When using IRQ priority masking to disable interrupts, in order to deal with the PSR.I state, local_irq_save() would convert the I bit into a PMR value (GIC_PRIO_IRQOFF). This resulted in local_irq_restore() potentially modifying the value of PMR in undesired location due to the state of PSR.I upon flag saving [1]. In an attempt to solve this issue in a less hackish manner, introduce a bit (GIC_PRIO_IGNORE_PMR) for the PMR values that can represent whether PSR.I is being used to disable interrupts, in which case it takes precedence of the status of interrupt masking via PMR. GIC_PRIO_PSR_I_SET is chosen such that (<pmr_value> | GIC_PRIO_PSR_I_SET) does not mask more interrupts than <pmr_value> as some sections (e.g. arch_cpu_idle(), interrupt acknowledge path) requires PMR not to mask interrupts that could be signaled to the CPU when using only PSR.I. [1] https://www.spinics.net/lists/arm-kernel/msg716956.html Fixes: 4a503217ce37 ("arm64: irqflags: Use ICC_PMR_EL1 for interrupt masking") Cc: <stable@vger.kernel.org> # 5.1.x- Reported-by: Zenghui Yu <yuzenghui@huawei.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Wei Li <liwei391@huawei.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Christoffer Dall <christoffer.dall@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Pouloze <suzuki.poulose@arm.com> Cc: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Julien Thierry <julien.thierry@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-06-11 09:38:10 +00:00
gic_prio_kentry_setup tmp=x3
arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in <linux/sched.h> this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Tested-by: Laura Abbott <labbott@redhat.com> Cc: AKASHI Takahiro <takahiro.akashi@linaro.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> Cc: James Morse <james.morse@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-03 20:23:13 +00:00
ldr x1, [tsk, #TSK_TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
finish_ret_to_user:
arm64: debug: avoid accessing mdscr_el1 on fault paths where possible Since mdscr_el1 is part of the debug register group, it is highly likely to be trapped by a hypervisor to prevent virtual machines from debugging (buggering?) each other. Unfortunately, this absolutely destroys our performance, since we access the register on many of our low-level fault handling paths to keep track of the various debug state machines. This patch removes our dependency on mdscr_el1 in the case that debugging is not being used. More specifically we: - Use TIF_SINGLESTEP to indicate that a task is stepping at EL0 and avoid disabling step in the MDSCR when we don't need to. MDSCR_EL1.SS handling is moved to kernel_entry, when trapping from userspace. - Ensure debug exceptions are re-enabled on *all* exception entry paths, even the debug exception handling path (where we re-enable exceptions after invoking the handler). Since we can now rely on MDSCR_EL1.SS being cleared by the entry code, exception handlers can usually enable debug immediately before enabling interrupts. - Remove all debug exception unmasking from ret_to_user and el1_preempt, since we will never get here with debug exceptions masked. This results in a slight change to kernel debug behaviour, where we now step into interrupt handlers and data aborts from EL1 when debugging the kernel, which is actually a useful thing to do. A side-effect of this is that it *does* potentially prevent stepping off {break,watch}points when there is a high-frequency interrupt source (e.g. a timer), so a debugger would need to use either breakpoints or manually disable interrupts to get around this issue. With this patch applied, guest performance is restored under KVM when debug register accesses are trapped (and we get a measurable performance increase on the host on Cortex-A57 too). Cc: Ian Campbell <ian.campbell@citrix.com> Tested-by: Marc Zyngier <marc.zyngier@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2014-04-29 18:04:06 +00:00
enable_step_tsk x1, x2
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
bl stackleak_erase
#endif
kernel_exit 0
ENDPROC(ret_to_user)
.popsection // .entry.text
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/*
* Exception vectors trampoline.
*/
.pushsection ".entry.tramp.text", "ax"
.macro tramp_map_kernel, tmp
mrs \tmp, ttbr1_el1
add \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE)
bic \tmp, \tmp, #USER_ASID_FLAG
msr ttbr1_el1, \tmp
#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003
alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003
/* ASID already in \tmp[63:48] */
movk \tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12)
movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12)
/* 2MB boundary containing the vectors, so we nobble the walk cache */
movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12)
isb
tlbi vae1, \tmp
dsb nsh
alternative_else_nop_endif
#endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */
.endm
.macro tramp_unmap_kernel, tmp
mrs \tmp, ttbr1_el1
sub \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE)
orr \tmp, \tmp, #USER_ASID_FLAG
msr ttbr1_el1, \tmp
/*
* We avoid running the post_ttbr_update_workaround here because
* it's only needed by Cavium ThunderX, which requires KPTI to be
* disabled.
*/
.endm
.macro tramp_ventry, regsize = 64
.align 7
1:
.if \regsize == 64
msr tpidrro_el0, x30 // Restored in kernel_ventry
.endif
/*
* Defend against branch aliasing attacks by pushing a dummy
* entry onto the return stack and using a RET instruction to
* enter the full-fat kernel vectors.
*/
bl 2f
b .
2:
tramp_map_kernel x30
#ifdef CONFIG_RANDOMIZE_BASE
adr x30, tramp_vectors + PAGE_SIZE
alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003
ldr x30, [x30]
#else
ldr x30, =vectors
#endif
alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM
prfm plil1strm, [x30, #(1b - tramp_vectors)]
alternative_else_nop_endif
msr vbar_el1, x30
add x30, x30, #(1b - tramp_vectors)
isb
ret
.endm
.macro tramp_exit, regsize = 64
adr x30, tramp_vectors
msr vbar_el1, x30
tramp_unmap_kernel x30
.if \regsize == 64
mrs x30, far_el1
.endif
eret
sb
.endm
.align 11
SYM_CODE_START_NOALIGN(tramp_vectors)
.space 0x400
tramp_ventry
tramp_ventry
tramp_ventry
tramp_ventry
tramp_ventry 32
tramp_ventry 32
tramp_ventry 32
tramp_ventry 32
SYM_CODE_END(tramp_vectors)
SYM_CODE_START(tramp_exit_native)
tramp_exit
SYM_CODE_END(tramp_exit_native)
SYM_CODE_START(tramp_exit_compat)
tramp_exit 32
SYM_CODE_END(tramp_exit_compat)
.ltorg
.popsection // .entry.tramp.text
#ifdef CONFIG_RANDOMIZE_BASE
.pushsection ".rodata", "a"
.align PAGE_SHIFT
SYM_DATA_START(__entry_tramp_data_start)
.quad vectors
SYM_DATA_END(__entry_tramp_data_start)
.popsection // .rodata
#endif /* CONFIG_RANDOMIZE_BASE */
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
/*
* Register switch for AArch64. The callee-saved registers need to be saved
* and restored. On entry:
* x0 = previous task_struct (must be preserved across the switch)
* x1 = next task_struct
* Previous and next are guaranteed not to be the same.
*
*/
SYM_FUNC_START(cpu_switch_to)
mov x10, #THREAD_CPU_CONTEXT
add x8, x0, x10
mov x9, sp
stp x19, x20, [x8], #16 // store callee-saved registers
stp x21, x22, [x8], #16
stp x23, x24, [x8], #16
stp x25, x26, [x8], #16
stp x27, x28, [x8], #16
stp x29, x9, [x8], #16
str lr, [x8]
add x8, x1, x10
ldp x19, x20, [x8], #16 // restore callee-saved registers
ldp x21, x22, [x8], #16
ldp x23, x24, [x8], #16
ldp x25, x26, [x8], #16
ldp x27, x28, [x8], #16
ldp x29, x9, [x8], #16
ldr lr, [x8]
mov sp, x9
msr sp_el0, x1
ptrauth_keys_install_kernel x1, 1, x8, x9, x10
ret
SYM_FUNC_END(cpu_switch_to)
NOKPROBE(cpu_switch_to)
/*
* This is how we return from a fork.
*/
SYM_CODE_START(ret_from_fork)
bl schedule_tail
cbz x19, 1f // not a kernel thread
mov x0, x20
blr x19
1: get_current_task tsk
b ret_to_user
SYM_CODE_END(ret_from_fork)
NOKPROBE(ret_from_fork)
arm64: kernel: Add arch-specific SDEI entry code and CPU masking The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement RAS notifications. Such notifications enter the kernel at the registered entry-point with the register values of the interrupted CPU context. Because this is not a CPU exception, it cannot reuse the existing entry code. (crucially we don't implicitly know which exception level we interrupted), Add the entry point to entry.S to set us up for calling into C code. If the event interrupted code that had interrupts masked, we always return to that location. Otherwise we pretend this was an IRQ, and use SDEI's complete_and_resume call to return to vbar_el1 + offset. This allows the kernel to deliver signals to user space processes. For KVM this triggers the world switch, a quick spin round vcpu_run, then back into the guest, unless there are pending signals. Add sdei_mask_local_cpu() calls to the smp_send_stop() code, this covers the panic() code-path, which doesn't invoke cpuhotplug notifiers. Because we can interrupt entry-from/exit-to another EL, we can't trust the value in sp_el0 or x29, even if we interrupted the kernel, in this case the code in entry.S will save/restore sp_el0 and use the value in __entry_task. When we have VMAP stacks we can interrupt the stack-overflow test, which stirs x0 into sp, meaning we have to have our own VMAP stacks. For now these are allocated when we probe the interface. Future patches will add refcounting hooks to allow the arch code to allocate them lazily. Signed-off-by: James Morse <james.morse@arm.com> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-01-08 15:38:12 +00:00
#ifdef CONFIG_ARM_SDE_INTERFACE
#include <asm/sdei.h>
#include <uapi/linux/arm_sdei.h>
.macro sdei_handler_exit exit_mode
/* On success, this call never returns... */
cmp \exit_mode, #SDEI_EXIT_SMC
b.ne 99f
smc #0
b .
99: hvc #0
b .
.endm
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/*
* The regular SDEI entry point may have been unmapped along with the rest of
* the kernel. This trampoline restores the kernel mapping to make the x1 memory
* argument accessible.
*
* This clobbers x4, __sdei_handler() will restore this from firmware's
* copy.
*/
.ltorg
.pushsection ".entry.tramp.text", "ax"
SYM_CODE_START(__sdei_asm_entry_trampoline)
mrs x4, ttbr1_el1
tbz x4, #USER_ASID_BIT, 1f
tramp_map_kernel tmp=x4
isb
mov x4, xzr
/*
* Use reg->interrupted_regs.addr_limit to remember whether to unmap
* the kernel on exit.
*/
1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)]
#ifdef CONFIG_RANDOMIZE_BASE
adr x4, tramp_vectors + PAGE_SIZE
add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler
ldr x4, [x4]
#else
ldr x4, =__sdei_asm_handler
#endif
br x4
SYM_CODE_END(__sdei_asm_entry_trampoline)
NOKPROBE(__sdei_asm_entry_trampoline)
/*
* Make the exit call and restore the original ttbr1_el1
*
* x0 & x1: setup for the exit API call
* x2: exit_mode
* x4: struct sdei_registered_event argument from registration time.
*/
SYM_CODE_START(__sdei_asm_exit_trampoline)
ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)]
cbnz x4, 1f
tramp_unmap_kernel tmp=x4
1: sdei_handler_exit exit_mode=x2
SYM_CODE_END(__sdei_asm_exit_trampoline)
NOKPROBE(__sdei_asm_exit_trampoline)
.ltorg
.popsection // .entry.tramp.text
#ifdef CONFIG_RANDOMIZE_BASE
.pushsection ".rodata", "a"
SYM_DATA_START(__sdei_asm_trampoline_next_handler)
.quad __sdei_asm_handler
SYM_DATA_END(__sdei_asm_trampoline_next_handler)
.popsection // .rodata
#endif /* CONFIG_RANDOMIZE_BASE */
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
arm64: kernel: Add arch-specific SDEI entry code and CPU masking The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement RAS notifications. Such notifications enter the kernel at the registered entry-point with the register values of the interrupted CPU context. Because this is not a CPU exception, it cannot reuse the existing entry code. (crucially we don't implicitly know which exception level we interrupted), Add the entry point to entry.S to set us up for calling into C code. If the event interrupted code that had interrupts masked, we always return to that location. Otherwise we pretend this was an IRQ, and use SDEI's complete_and_resume call to return to vbar_el1 + offset. This allows the kernel to deliver signals to user space processes. For KVM this triggers the world switch, a quick spin round vcpu_run, then back into the guest, unless there are pending signals. Add sdei_mask_local_cpu() calls to the smp_send_stop() code, this covers the panic() code-path, which doesn't invoke cpuhotplug notifiers. Because we can interrupt entry-from/exit-to another EL, we can't trust the value in sp_el0 or x29, even if we interrupted the kernel, in this case the code in entry.S will save/restore sp_el0 and use the value in __entry_task. When we have VMAP stacks we can interrupt the stack-overflow test, which stirs x0 into sp, meaning we have to have our own VMAP stacks. For now these are allocated when we probe the interface. Future patches will add refcounting hooks to allow the arch code to allocate them lazily. Signed-off-by: James Morse <james.morse@arm.com> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-01-08 15:38:12 +00:00
/*
* Software Delegated Exception entry point.
*
* x0: Event number
* x1: struct sdei_registered_event argument from registration time.
* x2: interrupted PC
* x3: interrupted PSTATE
* x4: maybe clobbered by the trampoline
arm64: kernel: Add arch-specific SDEI entry code and CPU masking The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement RAS notifications. Such notifications enter the kernel at the registered entry-point with the register values of the interrupted CPU context. Because this is not a CPU exception, it cannot reuse the existing entry code. (crucially we don't implicitly know which exception level we interrupted), Add the entry point to entry.S to set us up for calling into C code. If the event interrupted code that had interrupts masked, we always return to that location. Otherwise we pretend this was an IRQ, and use SDEI's complete_and_resume call to return to vbar_el1 + offset. This allows the kernel to deliver signals to user space processes. For KVM this triggers the world switch, a quick spin round vcpu_run, then back into the guest, unless there are pending signals. Add sdei_mask_local_cpu() calls to the smp_send_stop() code, this covers the panic() code-path, which doesn't invoke cpuhotplug notifiers. Because we can interrupt entry-from/exit-to another EL, we can't trust the value in sp_el0 or x29, even if we interrupted the kernel, in this case the code in entry.S will save/restore sp_el0 and use the value in __entry_task. When we have VMAP stacks we can interrupt the stack-overflow test, which stirs x0 into sp, meaning we have to have our own VMAP stacks. For now these are allocated when we probe the interface. Future patches will add refcounting hooks to allow the arch code to allocate them lazily. Signed-off-by: James Morse <james.morse@arm.com> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-01-08 15:38:12 +00:00
*
* Firmware has preserved x0->x17 for us, we must save/restore the rest to
* follow SMC-CC. We save (or retrieve) all the registers as the handler may
* want them.
*/
SYM_CODE_START(__sdei_asm_handler)
arm64: kernel: Add arch-specific SDEI entry code and CPU masking The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement RAS notifications. Such notifications enter the kernel at the registered entry-point with the register values of the interrupted CPU context. Because this is not a CPU exception, it cannot reuse the existing entry code. (crucially we don't implicitly know which exception level we interrupted), Add the entry point to entry.S to set us up for calling into C code. If the event interrupted code that had interrupts masked, we always return to that location. Otherwise we pretend this was an IRQ, and use SDEI's complete_and_resume call to return to vbar_el1 + offset. This allows the kernel to deliver signals to user space processes. For KVM this triggers the world switch, a quick spin round vcpu_run, then back into the guest, unless there are pending signals. Add sdei_mask_local_cpu() calls to the smp_send_stop() code, this covers the panic() code-path, which doesn't invoke cpuhotplug notifiers. Because we can interrupt entry-from/exit-to another EL, we can't trust the value in sp_el0 or x29, even if we interrupted the kernel, in this case the code in entry.S will save/restore sp_el0 and use the value in __entry_task. When we have VMAP stacks we can interrupt the stack-overflow test, which stirs x0 into sp, meaning we have to have our own VMAP stacks. For now these are allocated when we probe the interface. Future patches will add refcounting hooks to allow the arch code to allocate them lazily. Signed-off-by: James Morse <james.morse@arm.com> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-01-08 15:38:12 +00:00
stp x2, x3, [x1, #SDEI_EVENT_INTREGS + S_PC]
stp x4, x5, [x1, #SDEI_EVENT_INTREGS + 16 * 2]
stp x6, x7, [x1, #SDEI_EVENT_INTREGS + 16 * 3]
stp x8, x9, [x1, #SDEI_EVENT_INTREGS + 16 * 4]
stp x10, x11, [x1, #SDEI_EVENT_INTREGS + 16 * 5]
stp x12, x13, [x1, #SDEI_EVENT_INTREGS + 16 * 6]
stp x14, x15, [x1, #SDEI_EVENT_INTREGS + 16 * 7]
stp x16, x17, [x1, #SDEI_EVENT_INTREGS + 16 * 8]
stp x18, x19, [x1, #SDEI_EVENT_INTREGS + 16 * 9]
stp x20, x21, [x1, #SDEI_EVENT_INTREGS + 16 * 10]
stp x22, x23, [x1, #SDEI_EVENT_INTREGS + 16 * 11]
stp x24, x25, [x1, #SDEI_EVENT_INTREGS + 16 * 12]
stp x26, x27, [x1, #SDEI_EVENT_INTREGS + 16 * 13]
stp x28, x29, [x1, #SDEI_EVENT_INTREGS + 16 * 14]
mov x4, sp
stp lr, x4, [x1, #SDEI_EVENT_INTREGS + S_LR]
mov x19, x1
#ifdef CONFIG_VMAP_STACK
/*
* entry.S may have been using sp as a scratch register, find whether
* this is a normal or critical event and switch to the appropriate
* stack for this CPU.
*/
ldrb w4, [x19, #SDEI_EVENT_PRIORITY]
cbnz w4, 1f
ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6
b 2f
1: ldr_this_cpu dst=x5, sym=sdei_stack_critical_ptr, tmp=x6
2: mov x6, #SDEI_STACK_SIZE
add x5, x5, x6
mov sp, x5
#endif
/*
* We may have interrupted userspace, or a guest, or exit-from or
* return-to either of these. We can't trust sp_el0, restore it.
*/
mrs x28, sp_el0
ldr_this_cpu dst=x0, sym=__entry_task, tmp=x1
msr sp_el0, x0
/* If we interrupted the kernel point to the previous stack/frame. */
and x0, x3, #0xc
mrs x1, CurrentEL
cmp x0, x1
csel x29, x29, xzr, eq // fp, or zero
csel x4, x2, xzr, eq // elr, or zero
stp x29, x4, [sp, #-16]!
mov x29, sp
add x0, x19, #SDEI_EVENT_INTREGS
mov x1, x19
bl __sdei_handler
msr sp_el0, x28
/* restore regs >x17 that we clobbered */
mov x4, x19 // keep x4 for __sdei_asm_exit_trampoline
ldp x28, x29, [x4, #SDEI_EVENT_INTREGS + 16 * 14]
ldp x18, x19, [x4, #SDEI_EVENT_INTREGS + 16 * 9]
ldp lr, x1, [x4, #SDEI_EVENT_INTREGS + S_LR]
mov sp, x1
arm64: kernel: Add arch-specific SDEI entry code and CPU masking The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement RAS notifications. Such notifications enter the kernel at the registered entry-point with the register values of the interrupted CPU context. Because this is not a CPU exception, it cannot reuse the existing entry code. (crucially we don't implicitly know which exception level we interrupted), Add the entry point to entry.S to set us up for calling into C code. If the event interrupted code that had interrupts masked, we always return to that location. Otherwise we pretend this was an IRQ, and use SDEI's complete_and_resume call to return to vbar_el1 + offset. This allows the kernel to deliver signals to user space processes. For KVM this triggers the world switch, a quick spin round vcpu_run, then back into the guest, unless there are pending signals. Add sdei_mask_local_cpu() calls to the smp_send_stop() code, this covers the panic() code-path, which doesn't invoke cpuhotplug notifiers. Because we can interrupt entry-from/exit-to another EL, we can't trust the value in sp_el0 or x29, even if we interrupted the kernel, in this case the code in entry.S will save/restore sp_el0 and use the value in __entry_task. When we have VMAP stacks we can interrupt the stack-overflow test, which stirs x0 into sp, meaning we have to have our own VMAP stacks. For now these are allocated when we probe the interface. Future patches will add refcounting hooks to allow the arch code to allocate them lazily. Signed-off-by: James Morse <james.morse@arm.com> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-01-08 15:38:12 +00:00
mov x1, x0 // address to complete_and_resume
/* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */
cmp x0, #1
mov_q x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE
mov_q x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME
csel x0, x2, x3, ls
ldr_l x2, sdei_exit_mode
alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0
sdei_handler_exit exit_mode=x2
alternative_else_nop_endif
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline
br x5
#endif
SYM_CODE_END(__sdei_asm_handler)
arm64: kernel: Add arch-specific SDEI entry code and CPU masking The Software Delegated Exception Interface (SDEI) is an ARM standard for registering callbacks from the platform firmware into the OS. This is typically used to implement RAS notifications. Such notifications enter the kernel at the registered entry-point with the register values of the interrupted CPU context. Because this is not a CPU exception, it cannot reuse the existing entry code. (crucially we don't implicitly know which exception level we interrupted), Add the entry point to entry.S to set us up for calling into C code. If the event interrupted code that had interrupts masked, we always return to that location. Otherwise we pretend this was an IRQ, and use SDEI's complete_and_resume call to return to vbar_el1 + offset. This allows the kernel to deliver signals to user space processes. For KVM this triggers the world switch, a quick spin round vcpu_run, then back into the guest, unless there are pending signals. Add sdei_mask_local_cpu() calls to the smp_send_stop() code, this covers the panic() code-path, which doesn't invoke cpuhotplug notifiers. Because we can interrupt entry-from/exit-to another EL, we can't trust the value in sp_el0 or x29, even if we interrupted the kernel, in this case the code in entry.S will save/restore sp_el0 and use the value in __entry_task. When we have VMAP stacks we can interrupt the stack-overflow test, which stirs x0 into sp, meaning we have to have our own VMAP stacks. For now these are allocated when we probe the interface. Future patches will add refcounting hooks to allow the arch code to allocate them lazily. Signed-off-by: James Morse <james.morse@arm.com> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-01-08 15:38:12 +00:00
NOKPROBE(__sdei_asm_handler)
#endif /* CONFIG_ARM_SDE_INTERFACE */