335aca5f65
From Nick's cover letter: Linux powerpc new system call instruction and ABI System Call Vectored (scv) ABI ============================== The scv instruction is introduced with POWER9 / ISA3, it comes with an rfscv counterpart. The benefit of these instructions is performance (trading slower SRR0/1 for faster LR/CTR registers, and entering the kernel with MSR[EE] and MSR[RI] left enabled, which can reduce MSR updates). The scv instruction has 128 levels (not enough to cover the Linux system call space). Assignment and advertisement ---------------------------- The proposal is to assign scv levels conservatively, and advertise them with HWCAP feature bits as we add support for more. Linux has not enabled FSCR[SCV] yet, so executing the scv instruction will cause the kernel to log a "SCV facility unavailable" message, and deliver a SIGILL with ILL_ILLOPC to the process. Linux has defined a HWCAP2 bit PPC_FEATURE2_SCV for SCV support, but does not set it. This change allocates the zero level ('scv 0'), advertised with PPC_FEATURE2_SCV, which will be used to provide normal Linux system calls (equivalent to 'sc'). Attempting to execute scv with other levels will cause a SIGILL to be delivered the same as before, but will not log a "SCV facility unavailable" message (because the processor facility is enabled). Calling convention ------------------ The proposal is for scv 0 to provide the standard Linux system call ABI with the following differences from sc convention[1]: - LR is to be volatile across scv calls. This is necessary because the scv instruction clobbers LR. From previous discussion, this should be possible to deal with in GCC clobbers and CFI. - cr1 and cr5-cr7 are volatile. This matches the C ABI and would allow the kernel system call exit to avoid restoring the volatile cr registers (although we probably still would anyway to avoid information leaks). 
- Error handling: The consensus among kernel, glibc, and musl is to move to using negative return values in r3 rather than CR0[SO]=1 to indicate error, which matches most other architectures, and is closer to a function call. Notes ----- - r0,r4-r8 are documented as volatile in the ABI, but the kernel patch as submitted currently preserves them. This is to leave room for deciding which way to go with these. Some small benefit was found by preserving them[2] but I'm not convinced it's worth deviating from the C function call ABI just for this. Release code should follow the ABI. Previous discussions: https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-April/208691.html https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-April/209268.html [1] https://github.com/torvalds/linux/blob/master/Documentation/powerpc/syscall64-abi.rst [2] https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-April/209263.html
408 lines
10 KiB
C
408 lines
10 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
#include <linux/err.h>
|
|
#include <asm/asm-prototypes.h>
|
|
#include <asm/book3s/64/kup-radix.h>
|
|
#include <asm/cputime.h>
|
|
#include <asm/hw_irq.h>
|
|
#include <asm/kprobes.h>
|
|
#include <asm/paca.h>
|
|
#include <asm/ptrace.h>
|
|
#include <asm/reg.h>
|
|
#include <asm/signal.h>
|
|
#include <asm/switch_to.h>
|
|
#include <asm/syscall.h>
|
|
#include <asm/time.h>
|
|
#include <asm/unistd.h>
|
|
|
|
/*
 * Pointer type through which a handler taken from sys_call_table or
 * compat_sys_call_table is invoked: six long arguments, long result.
 */
typedef long (*syscall_fn)(long arg0, long arg1, long arg2,
			   long arg3, long arg4, long arg5);
|
|
|
|
/* Has to run notrace because it is entered not completely "reconciled" */
|
|
notrace long system_call_exception(long r3, long r4, long r5,
|
|
long r6, long r7, long r8,
|
|
unsigned long r0, struct pt_regs *regs)
|
|
{
|
|
syscall_fn f;
|
|
|
|
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
|
|
BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
|
|
|
|
trace_hardirqs_off(); /* finish reconciling */
|
|
|
|
if (IS_ENABLED(CONFIG_PPC_BOOK3S))
|
|
BUG_ON(!(regs->msr & MSR_RI));
|
|
BUG_ON(!(regs->msr & MSR_PR));
|
|
BUG_ON(!FULL_REGS(regs));
|
|
BUG_ON(regs->softe != IRQS_ENABLED);
|
|
|
|
kuap_check_amr();
|
|
|
|
account_cpu_user_entry();
|
|
|
|
#ifdef CONFIG_PPC_SPLPAR
|
|
if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) &&
|
|
firmware_has_feature(FW_FEATURE_SPLPAR)) {
|
|
struct lppaca *lp = local_paca->lppaca_ptr;
|
|
|
|
if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
|
|
accumulate_stolen_time();
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* This is not required for the syscall exit path, but makes the
|
|
* stack frame look nicer. If this was initialised in the first stack
|
|
* frame, or if the unwinder was taught the first stack frame always
|
|
* returns to user with IRQS_ENABLED, this store could be avoided!
|
|
*/
|
|
regs->softe = IRQS_ENABLED;
|
|
|
|
local_irq_enable();
|
|
|
|
if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
|
|
if (unlikely(regs->trap == 0x7ff0)) {
|
|
/* Unsupported scv vector */
|
|
_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
|
|
return regs->gpr[3];
|
|
}
|
|
/*
|
|
* We use the return value of do_syscall_trace_enter() as the
|
|
* syscall number. If the syscall was rejected for any reason
|
|
* do_syscall_trace_enter() returns an invalid syscall number
|
|
* and the test against NR_syscalls will fail and the return
|
|
* value to be used is in regs->gpr[3].
|
|
*/
|
|
r0 = do_syscall_trace_enter(regs);
|
|
if (unlikely(r0 >= NR_syscalls))
|
|
return regs->gpr[3];
|
|
r3 = regs->gpr[3];
|
|
r4 = regs->gpr[4];
|
|
r5 = regs->gpr[5];
|
|
r6 = regs->gpr[6];
|
|
r7 = regs->gpr[7];
|
|
r8 = regs->gpr[8];
|
|
|
|
} else if (unlikely(r0 >= NR_syscalls)) {
|
|
if (unlikely(regs->trap == 0x7ff0)) {
|
|
/* Unsupported scv vector */
|
|
_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
|
|
return regs->gpr[3];
|
|
}
|
|
return -ENOSYS;
|
|
}
|
|
|
|
/* May be faster to do array_index_nospec? */
|
|
barrier_nospec();
|
|
|
|
if (unlikely(is_32bit_task())) {
|
|
f = (void *)compat_sys_call_table[r0];
|
|
|
|
r3 &= 0x00000000ffffffffULL;
|
|
r4 &= 0x00000000ffffffffULL;
|
|
r5 &= 0x00000000ffffffffULL;
|
|
r6 &= 0x00000000ffffffffULL;
|
|
r7 &= 0x00000000ffffffffULL;
|
|
r8 &= 0x00000000ffffffffULL;
|
|
|
|
} else {
|
|
f = (void *)sys_call_table[r0];
|
|
}
|
|
|
|
return f(r3, r4, r5, r6, r7, r8);
|
|
}
|
|
|
|
/*
|
|
* local irqs must be disabled. Returns false if the caller must re-enable
|
|
* them, check for new work, and try again.
|
|
*/
|
|
static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri)
|
|
{
|
|
/* This must be done with RI=1 because tracing may touch vmaps */
|
|
trace_hardirqs_on();
|
|
|
|
/* This pattern matches prep_irq_for_idle */
|
|
if (clear_ri)
|
|
__hard_EE_RI_disable();
|
|
else
|
|
__hard_irq_disable();
|
|
if (unlikely(lazy_irq_pending_nocheck())) {
|
|
/* Took an interrupt, may have more exit work to do. */
|
|
if (clear_ri)
|
|
__hard_RI_enable();
|
|
trace_hardirqs_off();
|
|
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
|
|
|
|
return false;
|
|
}
|
|
local_paca->irq_happened = 0;
|
|
irq_soft_mask_set(IRQS_ENABLED);
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* This should be called after a syscall returns, with r3 the return value
|
|
* from the syscall. If this function returns non-zero, the system call
|
|
* exit assembly should additionally load all GPR registers and CTR and XER
|
|
* from the interrupt frame.
|
|
*
|
|
* The function graph tracer can not trace the return side of this function,
|
|
* because RI=0 and soft mask state is "unreconciled", so it is marked notrace.
|
|
*/
|
|
notrace unsigned long syscall_exit_prepare(unsigned long r3,
|
|
struct pt_regs *regs,
|
|
long scv)
|
|
{
|
|
unsigned long *ti_flagsp = ¤t_thread_info()->flags;
|
|
unsigned long ti_flags;
|
|
unsigned long ret = 0;
|
|
|
|
kuap_check_amr();
|
|
|
|
regs->result = r3;
|
|
|
|
/* Check whether the syscall is issued inside a restartable sequence */
|
|
rseq_syscall(regs);
|
|
|
|
ti_flags = *ti_flagsp;
|
|
|
|
if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && !scv) {
|
|
if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
|
|
r3 = -r3;
|
|
regs->ccr |= 0x10000000; /* Set SO bit in CR */
|
|
}
|
|
}
|
|
|
|
if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
|
|
if (ti_flags & _TIF_RESTOREALL)
|
|
ret = _TIF_RESTOREALL;
|
|
else
|
|
regs->gpr[3] = r3;
|
|
clear_bits(_TIF_PERSYSCALL_MASK, ti_flagsp);
|
|
} else {
|
|
regs->gpr[3] = r3;
|
|
}
|
|
|
|
if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
|
|
do_syscall_trace_leave(regs);
|
|
ret |= _TIF_RESTOREALL;
|
|
}
|
|
|
|
again:
|
|
local_irq_disable();
|
|
ti_flags = READ_ONCE(*ti_flagsp);
|
|
while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
|
|
local_irq_enable();
|
|
if (ti_flags & _TIF_NEED_RESCHED) {
|
|
schedule();
|
|
} else {
|
|
/*
|
|
* SIGPENDING must restore signal handler function
|
|
* argument GPRs, and some non-volatiles (e.g., r1).
|
|
* Restore all for now. This could be made lighter.
|
|
*/
|
|
if (ti_flags & _TIF_SIGPENDING)
|
|
ret |= _TIF_RESTOREALL;
|
|
do_notify_resume(regs, ti_flags);
|
|
}
|
|
local_irq_disable();
|
|
ti_flags = READ_ONCE(*ti_flagsp);
|
|
}
|
|
|
|
if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
|
|
if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
|
|
unlikely((ti_flags & _TIF_RESTORE_TM))) {
|
|
restore_tm_state(regs);
|
|
} else {
|
|
unsigned long mathflags = MSR_FP;
|
|
|
|
if (cpu_has_feature(CPU_FTR_VSX))
|
|
mathflags |= MSR_VEC | MSR_VSX;
|
|
else if (cpu_has_feature(CPU_FTR_ALTIVEC))
|
|
mathflags |= MSR_VEC;
|
|
|
|
/*
|
|
* If userspace MSR has all available FP bits set,
|
|
* then they are live and no need to restore. If not,
|
|
* it means the regs were given up and restore_math
|
|
* may decide to restore them (to avoid taking an FP
|
|
* fault).
|
|
*/
|
|
if ((regs->msr & mathflags) != mathflags)
|
|
restore_math(regs);
|
|
}
|
|
}
|
|
|
|
/* scv need not set RI=0 because SRRs are not used */
|
|
if (unlikely(!prep_irq_for_enabled_exit(!scv))) {
|
|
local_irq_enable();
|
|
goto again;
|
|
}
|
|
|
|
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
|
|
local_paca->tm_scratch = regs->msr;
|
|
#endif
|
|
|
|
account_cpu_user_exit();
|
|
|
|
return ret;
|
|
}
|
|
|
|
#ifdef CONFIG_PPC_BOOK3S /* BOOK3E not yet using this */
|
|
notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr)
|
|
{
|
|
#ifdef CONFIG_PPC_BOOK3E
|
|
struct thread_struct *ts = ¤t->thread;
|
|
#endif
|
|
unsigned long *ti_flagsp = ¤t_thread_info()->flags;
|
|
unsigned long ti_flags;
|
|
unsigned long flags;
|
|
unsigned long ret = 0;
|
|
|
|
if (IS_ENABLED(CONFIG_PPC_BOOK3S))
|
|
BUG_ON(!(regs->msr & MSR_RI));
|
|
BUG_ON(!(regs->msr & MSR_PR));
|
|
BUG_ON(!FULL_REGS(regs));
|
|
BUG_ON(regs->softe != IRQS_ENABLED);
|
|
|
|
/*
|
|
* We don't need to restore AMR on the way back to userspace for KUAP.
|
|
* AMR can only have been unlocked if we interrupted the kernel.
|
|
*/
|
|
kuap_check_amr();
|
|
|
|
local_irq_save(flags);
|
|
|
|
again:
|
|
ti_flags = READ_ONCE(*ti_flagsp);
|
|
while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
|
|
local_irq_enable(); /* returning to user: may enable */
|
|
if (ti_flags & _TIF_NEED_RESCHED) {
|
|
schedule();
|
|
} else {
|
|
if (ti_flags & _TIF_SIGPENDING)
|
|
ret |= _TIF_RESTOREALL;
|
|
do_notify_resume(regs, ti_flags);
|
|
}
|
|
local_irq_disable();
|
|
ti_flags = READ_ONCE(*ti_flagsp);
|
|
}
|
|
|
|
if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
|
|
if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
|
|
unlikely((ti_flags & _TIF_RESTORE_TM))) {
|
|
restore_tm_state(regs);
|
|
} else {
|
|
unsigned long mathflags = MSR_FP;
|
|
|
|
if (cpu_has_feature(CPU_FTR_VSX))
|
|
mathflags |= MSR_VEC | MSR_VSX;
|
|
else if (cpu_has_feature(CPU_FTR_ALTIVEC))
|
|
mathflags |= MSR_VEC;
|
|
|
|
/* See above restore_math comment */
|
|
if ((regs->msr & mathflags) != mathflags)
|
|
restore_math(regs);
|
|
}
|
|
}
|
|
|
|
if (unlikely(!prep_irq_for_enabled_exit(true))) {
|
|
local_irq_enable();
|
|
local_irq_disable();
|
|
goto again;
|
|
}
|
|
|
|
#ifdef CONFIG_PPC_BOOK3E
|
|
if (unlikely(ts->debug.dbcr0 & DBCR0_IDM)) {
|
|
/*
|
|
* Check to see if the dbcr0 register is set up to debug.
|
|
* Use the internal debug mode bit to do this.
|
|
*/
|
|
mtmsr(mfmsr() & ~MSR_DE);
|
|
mtspr(SPRN_DBCR0, ts->debug.dbcr0);
|
|
mtspr(SPRN_DBSR, -1);
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
|
|
local_paca->tm_scratch = regs->msr;
|
|
#endif
|
|
|
|
account_cpu_user_exit();
|
|
|
|
return ret;
|
|
}
|
|
|
|
void unrecoverable_exception(struct pt_regs *regs);
|
|
void preempt_schedule_irq(void);
|
|
|
|
notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr)
|
|
{
|
|
unsigned long *ti_flagsp = ¤t_thread_info()->flags;
|
|
unsigned long flags;
|
|
unsigned long ret = 0;
|
|
unsigned long amr;
|
|
|
|
if (IS_ENABLED(CONFIG_PPC_BOOK3S) && unlikely(!(regs->msr & MSR_RI)))
|
|
unrecoverable_exception(regs);
|
|
BUG_ON(regs->msr & MSR_PR);
|
|
BUG_ON(!FULL_REGS(regs));
|
|
|
|
amr = kuap_get_and_check_amr();
|
|
|
|
if (unlikely(*ti_flagsp & _TIF_EMULATE_STACK_STORE)) {
|
|
clear_bits(_TIF_EMULATE_STACK_STORE, ti_flagsp);
|
|
ret = 1;
|
|
}
|
|
|
|
local_irq_save(flags);
|
|
|
|
if (regs->softe == IRQS_ENABLED) {
|
|
/* Returning to a kernel context with local irqs enabled. */
|
|
WARN_ON_ONCE(!(regs->msr & MSR_EE));
|
|
again:
|
|
if (IS_ENABLED(CONFIG_PREEMPT)) {
|
|
/* Return to preemptible kernel context */
|
|
if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED)) {
|
|
if (preempt_count() == 0)
|
|
preempt_schedule_irq();
|
|
}
|
|
}
|
|
|
|
if (unlikely(!prep_irq_for_enabled_exit(true))) {
|
|
/*
|
|
* Can't local_irq_restore to replay if we were in
|
|
* interrupt context. Must replay directly.
|
|
*/
|
|
if (irqs_disabled_flags(flags)) {
|
|
replay_soft_interrupts();
|
|
} else {
|
|
local_irq_restore(flags);
|
|
local_irq_save(flags);
|
|
}
|
|
/* Took an interrupt, may have more exit work to do. */
|
|
goto again;
|
|
}
|
|
} else {
|
|
/* Returning to a kernel context with local irqs disabled. */
|
|
__hard_EE_RI_disable();
|
|
if (regs->msr & MSR_EE)
|
|
local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
|
|
local_paca->tm_scratch = regs->msr;
|
|
#endif
|
|
|
|
/*
|
|
* Don't want to mfspr(SPRN_AMR) here, because this comes after mtmsr,
|
|
* which would cause Read-After-Write stalls. Hence, we take the AMR
|
|
* value from the check above.
|
|
*/
|
|
kuap_restore_amr(regs, amr);
|
|
|
|
return ret;
|
|
}
|
|
#endif
|