linux/arch/powerpc/kernel/signal_64.c

924 lines
28 KiB
C
Raw Normal View History

/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Derived from "arch/i386/kernel/signal.c"
* Copyright (C) 1991, 1992 Linus Torvalds
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/elf.h>
#include <linux/ptrace.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <asm/sigcontext.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/unistd.h>
#include <asm/cacheflush.h>
#include <asm/syscalls.h>
#include <asm/vdso.h>
#include <asm/switch_to.h>
#include <asm/tm.h>
#include <asm/asm-prototypes.h>
#include "signal.h"
#define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs))
#define FP_REGS_SIZE sizeof(elf_fpregset_t)
#define TRAMP_TRACEBACK 3
#define TRAMP_SIZE 6
/*
* When we have signals to deliver, we set up on the user stack,
* going down from the original stack pointer:
* 1) a rt_sigframe struct which contains the ucontext
* 2) a gap of __SIGNAL_FRAMESIZE bytes which acts as a dummy caller
* frame for the signal handler.
*/
struct rt_sigframe {
/* sys_rt_sigreturn requires the ucontext be the first field */
struct ucontext uc;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
struct ucontext uc_transact;
#endif
unsigned long _unused[2];
unsigned int tramp[TRAMP_SIZE];
struct siginfo __user *pinfo;
void __user *puc;
struct siginfo info;
/* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */
char abigap[USER_REDZONE_SIZE];
} __attribute__ ((aligned (16)));
static const char fmt32[] = KERN_INFO \
"%s[%d]: bad frame in %s: %08lx nip %08lx lr %08lx\n";
static const char fmt64[] = KERN_INFO \
"%s[%d]: bad frame in %s: %016lx nip %016lx lr %016lx\n";
/*
* This computes a quad word aligned pointer inside the vmx_reserve array
* element. For historical reasons sigcontext might not be quad word aligned,
* but the location we write the VMX regs to must be. See the comment in
* sigcontext for more detail.
*/
#ifdef CONFIG_ALTIVEC
static elf_vrreg_t __user *sigcontext_vmx_regs(struct sigcontext __user *sc)
{
return (elf_vrreg_t __user *) (((unsigned long)sc->vmx_reserve + 15) & ~0xful);
}
#endif
/*
* Set up the sigcontext for the signal frame.
*/
static long setup_sigcontext(struct sigcontext __user *sc,
struct task_struct *tsk, int signr, sigset_t *set,
unsigned long handler, int ctx_has_vsx_region)
{
/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
* process never used altivec yet (MSR_VEC is zero in pt_regs of
* the context). This is very important because we must ensure we
* don't lose the VRSAVE content that may have been set prior to
* the process doing its first vector operation
* Userland shall check AT_HWCAP to know whether it can rely on the
* v_regs pointer or not
*/
#ifdef CONFIG_ALTIVEC
elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc);
unsigned long vrsave;
#endif
struct pt_regs *regs = tsk->thread.regs;
unsigned long msr = regs->msr;
long err = 0;
/* Force usr to alway see softe as 1 (interrupts enabled) */
unsigned long softe = 0x1;
BUG_ON(tsk != current);
#ifdef CONFIG_ALTIVEC
err |= __put_user(v_regs, &sc->v_regs);
/* save altivec registers */
if (tsk->thread.used_vr) {
flush_altivec_to_thread(tsk);
/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
err |= __copy_to_user(v_regs, &tsk->thread.vr_state,
33 * sizeof(vector128));
/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
* contains valid data.
*/
msr |= MSR_VEC;
}
/* We always copy to/from vrsave, it's 0 if we don't have or don't
* use altivec.
*/
vrsave = 0;
if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
vrsave = mfspr(SPRN_VRSAVE);
tsk->thread.vrsave = vrsave;
}
err |= __put_user(vrsave, (u32 __user *)&v_regs[33]);
#else /* CONFIG_ALTIVEC */
err |= __put_user(0, &sc->v_regs);
#endif /* CONFIG_ALTIVEC */
flush_fp_to_thread(tsk);
/* copy fpr regs and fpscr */
err |= copy_fpr_to_user(&sc->fp_regs, tsk);
/*
* Clear the MSR VSX bit to indicate there is no valid state attached
* to this context, except in the specific case below where we set it.
*/
msr &= ~MSR_VSX;
powerpc: Introduce VSX thread_struct and CONFIG_VSX The layout of the new VSR registers and how they overlap on top of the legacy FPR and VR registers is: VSR doubleword 0 VSR doubleword 1 ---------------------------------------------------------------- VSR[0] | FPR[0] | | ---------------------------------------------------------------- VSR[1] | FPR[1] | | ---------------------------------------------------------------- | ... | | | ... | | ---------------------------------------------------------------- VSR[30] | FPR[30] | | ---------------------------------------------------------------- VSR[31] | FPR[31] | | ---------------------------------------------------------------- VSR[32] | VR[0] | ---------------------------------------------------------------- VSR[33] | VR[1] | ---------------------------------------------------------------- | ... | | ... | ---------------------------------------------------------------- VSR[62] | VR[30] | ---------------------------------------------------------------- VSR[63] | VR[31] | ---------------------------------------------------------------- VSX has 64 128bit registers. The first 32 regs overlap with the FP registers and hence extend them with and additional 64 bits. The second 32 regs overlap with the VMX registers. This commit introduces the thread_struct changes required to reflect this register layout. Ptrace and signals code is updated so that the floating point registers are correctly accessed from the thread_struct when CONFIG_VSX is enabled. Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
2008-06-25 04:07:18 +00:00
#ifdef CONFIG_VSX
/*
* Copy VSX low doubleword to local buffer for formatting,
* then out to userspace. Update v_regs to point after the
* VMX data.
*/
if (tsk->thread.used_vsr && ctx_has_vsx_region) {
flush_vsx_to_thread(tsk);
v_regs += ELF_NVRREG;
err |= copy_vsx_to_user(v_regs, tsk);
/* set MSR_VSX in the MSR value in the frame to
* indicate that sc->vs_reg) contains valid data.
*/
msr |= MSR_VSX;
}
powerpc: Introduce VSX thread_struct and CONFIG_VSX The layout of the new VSR registers and how they overlap on top of the legacy FPR and VR registers is: VSR doubleword 0 VSR doubleword 1 ---------------------------------------------------------------- VSR[0] | FPR[0] | | ---------------------------------------------------------------- VSR[1] | FPR[1] | | ---------------------------------------------------------------- | ... | | | ... | | ---------------------------------------------------------------- VSR[30] | FPR[30] | | ---------------------------------------------------------------- VSR[31] | FPR[31] | | ---------------------------------------------------------------- VSR[32] | VR[0] | ---------------------------------------------------------------- VSR[33] | VR[1] | ---------------------------------------------------------------- | ... | | ... | ---------------------------------------------------------------- VSR[62] | VR[30] | ---------------------------------------------------------------- VSR[63] | VR[31] | ---------------------------------------------------------------- VSX has 64 128bit registers. The first 32 regs overlap with the FP registers and hence extend them with and additional 64 bits. The second 32 regs overlap with the VMX registers. This commit introduces the thread_struct changes required to reflect this register layout. Ptrace and signals code is updated so that the floating point registers are correctly accessed from the thread_struct when CONFIG_VSX is enabled. Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
2008-06-25 04:07:18 +00:00
#endif /* CONFIG_VSX */
err |= __put_user(&sc->gp_regs, &sc->regs);
powerpc: Fix various syscall/signal/swapcontext bugs A careful reading of the recent changes to the system call entry/exit paths revealed several problems, plus some things that could be simplified and improved: * 32-bit wasn't testing the _TIF_NOERROR bit in the syscall fast exit path, so it was only doing anything with it once it saw some other bit being set. In other words, the noerror behaviour would apply to the next system call where we had to reschedule or deliver a signal, which is not necessarily the current system call. * 32-bit wasn't doing the call to ptrace_notify in the syscall exit path when the _TIF_SINGLESTEP bit was set. * _TIF_RESTOREALL was in both _TIF_USER_WORK_MASK and _TIF_PERSYSCALL_MASK, which is odd since _TIF_RESTOREALL is only set by system calls. I took it out of _TIF_USER_WORK_MASK. * On 64-bit, _TIF_RESTOREALL wasn't causing the non-volatile registers to be restored (unless perhaps a signal was delivered or the syscall was traced or single-stepped). Thus the non-volatile registers weren't restored on exit from a signal handler. We probably got away with it mostly because signal handlers written in C wouldn't alter the non-volatile registers. * On 32-bit I simplified the code and made it more like 64-bit by making the syscall exit path jump to ret_from_except to handle preemption and signal delivery. * 32-bit was calling do_signal unnecessarily when _TIF_RESTOREALL was set - but I think because of that 32-bit was actually restoring the non-volatile registers on exit from a signal handler. * I changed the order of enabling interrupts and saving the non-volatile registers before calling do_syscall_trace_leave; now we enable interrupts first. Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-03-08 02:24:22 +00:00
WARN_ON(!FULL_REGS(regs));
err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE);
err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
err |= __put_user(softe, &sc->gp_regs[PT_SOFTE]);
err |= __put_user(signr, &sc->signal);
err |= __put_user(handler, &sc->handler);
if (set != NULL)
err |= __put_user(set->sig[0], &sc->oldmask);
return err;
}
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* As above, but Transactional Memory is in use, so deliver sigcontexts
* containing checkpointed and transactional register states.
*
* To do this, we treclaim (done before entering here) to gather both sets of
* registers and set up the 'normal' sigcontext registers with rolled-back
* register values such that a simple signal handler sees a correct
* checkpointed register state. If interested, a TM-aware sighandler can
* examine the transactional registers in the 2nd sigcontext to determine the
* real origin of the signal.
*/
static long setup_tm_sigcontexts(struct sigcontext __user *sc,
struct sigcontext __user *tm_sc,
struct task_struct *tsk,
int signr, sigset_t *set, unsigned long handler)
{
/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
* process never used altivec yet (MSR_VEC is zero in pt_regs of
* the context). This is very important because we must ensure we
* don't lose the VRSAVE content that may have been set prior to
* the process doing its first vector operation
* Userland shall check AT_HWCAP to know wether it can rely on the
* v_regs pointer or not.
*/
#ifdef CONFIG_ALTIVEC
elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc);
elf_vrreg_t __user *tm_v_regs = sigcontext_vmx_regs(tm_sc);
#endif
struct pt_regs *regs = tsk->thread.regs;
powerpc/tm: Fix endianness flip on trap Currently it's possible that a thread on PPC64 LE has its endianness flipped inadvertently to Big-Endian resulting in a crash once the process is back from the signal handler. If giveup_all() is called when regs->msr has the bits MSR.FP and MSR.VEC disabled (and hence MSR.VSX disabled too) it returns without calling check_if_tm_restore_required() which copies regs->msr to ckpt_regs->msr if the process caught a signal whilst in transactional mode. Then once in setup_tm_sigcontexts() MSR from ckpt_regs.msr is used, but since check_if_tm_restore_required() was not called previuosly, gp_regs[PT_MSR] gets a copy of invalid MSR bits as MSR in ckpt_regs was not updated from regs->msr and so is zeroed. Later when leaving the signal handler once in sys_rt_sigreturn() the TS bits of gp_regs[PT_MSR] are checked to determine if restore_tm_sigcontexts() must be called to pull in the correct MSR state into the user context. Because TS bits are zeroed restore_tm_sigcontexts() is never called and MSR restored from the user context on returning from the signal handler has the MSR.LE (the endianness bit) forced to zero (Big-Endian). That leads, for instance, to 'nop' being treated as an illegal instruction in the following sequence: tbegin. beq 1f trap tend. 1: nop on PPC64 LE machines and the process dies just after returning from the signal handler. PPC64 BE is also affected but in a subtle way since forcing Big-Endian on a BE machine does not change the endianness. This commit fixes the issue described above by ensuring that once in setup_tm_sigcontexts() the MSR used is from regs->msr instead of from ckpt_regs->msr and by ensuring that we pull in only the MSR.FP, MSR.VEC, and MSR.VSX bits from ckpt_regs->msr. The fix was tested both on LE and BE machines and no regression regarding the powerpc/tm selftests was observed. Signed-off-by: Gustavo Romero <gromero@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-12-31 23:20:45 +00:00
unsigned long msr = tsk->thread.regs->msr;
long err = 0;
BUG_ON(tsk != current);
BUG_ON(!MSR_TM_ACTIVE(regs->msr));
WARN_ON(tm_suspend_disabled);
powerpc/tm: Fix endianness flip on trap Currently it's possible that a thread on PPC64 LE has its endianness flipped inadvertently to Big-Endian resulting in a crash once the process is back from the signal handler. If giveup_all() is called when regs->msr has the bits MSR.FP and MSR.VEC disabled (and hence MSR.VSX disabled too) it returns without calling check_if_tm_restore_required() which copies regs->msr to ckpt_regs->msr if the process caught a signal whilst in transactional mode. Then once in setup_tm_sigcontexts() MSR from ckpt_regs.msr is used, but since check_if_tm_restore_required() was not called previuosly, gp_regs[PT_MSR] gets a copy of invalid MSR bits as MSR in ckpt_regs was not updated from regs->msr and so is zeroed. Later when leaving the signal handler once in sys_rt_sigreturn() the TS bits of gp_regs[PT_MSR] are checked to determine if restore_tm_sigcontexts() must be called to pull in the correct MSR state into the user context. Because TS bits are zeroed restore_tm_sigcontexts() is never called and MSR restored from the user context on returning from the signal handler has the MSR.LE (the endianness bit) forced to zero (Big-Endian). That leads, for instance, to 'nop' being treated as an illegal instruction in the following sequence: tbegin. beq 1f trap tend. 1: nop on PPC64 LE machines and the process dies just after returning from the signal handler. PPC64 BE is also affected but in a subtle way since forcing Big-Endian on a BE machine does not change the endianness. This commit fixes the issue described above by ensuring that once in setup_tm_sigcontexts() the MSR used is from regs->msr instead of from ckpt_regs->msr and by ensuring that we pull in only the MSR.FP, MSR.VEC, and MSR.VSX bits from ckpt_regs->msr. The fix was tested both on LE and BE machines and no regression regarding the powerpc/tm selftests was observed. Signed-off-by: Gustavo Romero <gromero@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-12-31 23:20:45 +00:00
/* Restore checkpointed FP, VEC, and VSX bits from ckpt_regs as
* it contains the correct FP, VEC, VSX state after we treclaimed
* the transaction and giveup_all() was called on reclaiming.
*/
msr |= tsk->thread.ckpt_regs.msr & (MSR_FP | MSR_VEC | MSR_VSX);
powerpc: Don't corrupt transactional state when using FP/VMX in kernel Currently, when we have a process using the transactional memory facilities on POWER8 (that is, the processor is in transactional or suspended state), and the process enters the kernel and the kernel then uses the floating-point or vector (VMX/Altivec) facility, we end up corrupting the user-visible FP/VMX/VSX state. This happens, for example, if a page fault causes a copy-on-write operation, because the copy_page function will use VMX to do the copy on POWER8. The test program below demonstrates the bug. The bug happens because when FP/VMX state for a transactional process is stored in the thread_struct, we store the checkpointed state in .fp_state/.vr_state and the transactional (current) state in .transact_fp/.transact_vr. However, when the kernel wants to use FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(), which saves the current state in .fp_state/.vr_state. Furthermore, when we return to the user process we return with FP/VMX/VSX disabled. The next time the process uses FP/VMX/VSX, we don't know which set of state (the current register values, .fp_state/.vr_state, or .transact_fp/.transact_vr) we should be using, since we have no way to tell if we are still in the same transaction, and if not, whether the previous transaction succeeded or failed. Thus it is necessary to strictly adhere to the rule that if FP has been enabled at any point in a transaction, we must keep FP enabled for the user process with the current transactional state in the FP registers, until we detect that it is no longer in a transaction. Similarly for VMX; once enabled it must stay enabled until the process is no longer transactional. In order to keep this rule, we add a new thread_info flag which we test when returning from the kernel to userspace, called TIF_RESTORE_TM. This flag indicates that there is FP/VMX/VSX state to be restored before entering userspace, and when it is set the .tm_orig_msr field in the thread_struct indicates what state needs to be restored. The restoration is done by restore_tm_state(). The TIF_RESTORE_TM bit is set by new giveup_fpu/altivec_maybe_transactional helpers, which are called from enable_kernel_fp/altivec, giveup_vsx, and flush_fp/altivec_to_thread instead of giveup_fpu/altivec. The other thing to be done is to get the transactional FP/VMX/VSX state from .fp_state/.vr_state when doing reclaim, if that state has been saved there by giveup_fpu/altivec_maybe_transactional. Having done this, we set the FP/VMX bit in the thread's MSR after reclaim to indicate that that part of the state is now valid (having been reclaimed from the processor's checkpointed state). Finally, in the signal handling code, we move the clearing of the transactional state bits in the thread's MSR a bit earlier, before calling flush_fp_to_thread(), so that we don't unnecessarily set the TIF_RESTORE_TM bit. This is the test program: /* Michael Neuling 4/12/2013 * * See if the altivec state is leaked out of an aborted transaction due to * kernel vmx copy loops. * * gcc -m64 htm_vmxcopy.c -o htm_vmxcopy * */ /* We don't use all of these, but for reference: */ int main(int argc, char *argv[]) { long double vecin = 1.3; long double vecout; unsigned long pgsize = getpagesize(); int i; int fd; int size = pgsize*16; char tmpfile[] = "/tmp/page_faultXXXXXX"; char buf[pgsize]; char *a; uint64_t aborted = 0; fd = mkstemp(tmpfile); assert(fd >= 0); memset(buf, 0, pgsize); for (i = 0; i < size; i += pgsize) assert(write(fd, buf, pgsize) == pgsize); unlink(tmpfile); a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); assert(a != MAP_FAILED); asm __volatile__( "lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value TBEGIN "beq 3f ;" TSUSPEND "xxlxor 40,40,40 ; " // set 40 to 0 "std 5, 0(%[map]) ;" // cause kernel vmx copy page TABORT TRESUME TEND "li %[res], 0 ;" "b 5f ;" "3: ;" // Abort handler "li %[res], 1 ;" "5: ;" "stxvd2x 40,0,%[vecoutptr] ; " : [res]"=r"(aborted) : [vecinptr]"r"(&vecin), [vecoutptr]"r"(&vecout), [map]"r"(a) : "memory", "r0", "r3", "r4", "r5", "r6", "r7"); if (aborted && (vecin != vecout)){ printf("FAILED: vector state leaked on abort %f != %f\n", (double)vecin, (double)vecout); exit(1); } munmap(a, size); close(fd); printf("PASSED!\n"); return 0; } Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 04:56:29 +00:00
/* Remove TM bits from thread's MSR. The MSR in the sigcontext
* just indicates to userland that we were doing a transaction, but we
* don't want to return in transactional state. This also ensures
* that flush_fp_to_thread won't set TIF_RESTORE_TM again.
*/
regs->msr &= ~MSR_TS_MASK;
#ifdef CONFIG_ALTIVEC
err |= __put_user(v_regs, &sc->v_regs);
err |= __put_user(tm_v_regs, &tm_sc->v_regs);
/* save altivec registers */
if (tsk->thread.used_vr) {
/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
err |= __copy_to_user(v_regs, &tsk->thread.ckvr_state,
33 * sizeof(vector128));
/* If VEC was enabled there are transactional VRs valid too,
* else they're a copy of the checkpointed VRs.
*/
if (msr & MSR_VEC)
err |= __copy_to_user(tm_v_regs,
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
&tsk->thread.vr_state,
33 * sizeof(vector128));
else
err |= __copy_to_user(tm_v_regs,
&tsk->thread.ckvr_state,
33 * sizeof(vector128));
/* set MSR_VEC in the MSR value in the frame to indicate
* that sc->v_reg contains valid data.
*/
msr |= MSR_VEC;
}
/* We always copy to/from vrsave, it's 0 if we don't have or don't
* use altivec.
*/
if (cpu_has_feature(CPU_FTR_ALTIVEC))
tsk->thread.ckvrsave = mfspr(SPRN_VRSAVE);
err |= __put_user(tsk->thread.ckvrsave, (u32 __user *)&v_regs[33]);
if (msr & MSR_VEC)
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
err |= __put_user(tsk->thread.vrsave,
(u32 __user *)&tm_v_regs[33]);
else
err |= __put_user(tsk->thread.ckvrsave,
(u32 __user *)&tm_v_regs[33]);
#else /* CONFIG_ALTIVEC */
err |= __put_user(0, &sc->v_regs);
err |= __put_user(0, &tm_sc->v_regs);
#endif /* CONFIG_ALTIVEC */
/* copy fpr regs and fpscr */
err |= copy_ckfpr_to_user(&sc->fp_regs, tsk);
if (msr & MSR_FP)
err |= copy_fpr_to_user(&tm_sc->fp_regs, tsk);
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
else
err |= copy_ckfpr_to_user(&tm_sc->fp_regs, tsk);
#ifdef CONFIG_VSX
/*
* Copy VSX low doubleword to local buffer for formatting,
* then out to userspace. Update v_regs to point after the
* VMX data.
*/
if (tsk->thread.used_vsr) {
v_regs += ELF_NVRREG;
tm_v_regs += ELF_NVRREG;
err |= copy_ckvsx_to_user(v_regs, tsk);
if (msr & MSR_VSX)
err |= copy_vsx_to_user(tm_v_regs, tsk);
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
else
err |= copy_ckvsx_to_user(tm_v_regs, tsk);
/* set MSR_VSX in the MSR value in the frame to
* indicate that sc->vs_reg) contains valid data.
*/
msr |= MSR_VSX;
}
#endif /* CONFIG_VSX */
err |= __put_user(&sc->gp_regs, &sc->regs);
err |= __put_user(&tm_sc->gp_regs, &tm_sc->regs);
WARN_ON(!FULL_REGS(regs));
err |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE);
err |= __copy_to_user(&sc->gp_regs,
&tsk->thread.ckpt_regs, GP_REGS_SIZE);
err |= __put_user(msr, &tm_sc->gp_regs[PT_MSR]);
err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
err |= __put_user(signr, &sc->signal);
err |= __put_user(handler, &sc->handler);
if (set != NULL)
err |= __put_user(set->sig[0], &sc->oldmask);
return err;
}
#endif
/*
* Restore the sigcontext from the signal frame.
*/
static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
struct sigcontext __user *sc)
{
#ifdef CONFIG_ALTIVEC
elf_vrreg_t __user *v_regs;
#endif
unsigned long err = 0;
unsigned long save_r13 = 0;
unsigned long msr;
struct pt_regs *regs = tsk->thread.regs;
#ifdef CONFIG_VSX
int i;
#endif
BUG_ON(tsk != current);
/* If this is not a signal return, we preserve the TLS in r13 */
if (!sig)
save_r13 = regs->gpr[13];
/* copy the GPRs */
err |= __copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr));
err |= __get_user(regs->nip, &sc->gp_regs[PT_NIP]);
/* get MSR separately, transfer the LE bit if doing signal return */
err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
if (sig)
regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
err |= __get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3]);
err |= __get_user(regs->ctr, &sc->gp_regs[PT_CTR]);
err |= __get_user(regs->link, &sc->gp_regs[PT_LNK]);
err |= __get_user(regs->xer, &sc->gp_regs[PT_XER]);
err |= __get_user(regs->ccr, &sc->gp_regs[PT_CCR]);
/* skip SOFTE */
regs->trap = 0;
err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]);
err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]);
err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]);
if (!sig)
regs->gpr[13] = save_r13;
if (set != NULL)
err |= __get_user(set->sig[0], &sc->oldmask);
/*
* Force reload of FP/VEC.
* This has to be done before copying stuff into tsk->thread.fpr/vr
* for the reasons explained in the previous comment.
*/
regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
#ifdef CONFIG_ALTIVEC
err |= __get_user(v_regs, &sc->v_regs);
if (err)
return err;
Remove 'type' argument from access_ok() function Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument of the user address range verification function since we got rid of the old racy i386-only code to walk page tables by hand. It existed because the original 80386 would not honor the write protect bit when in kernel mode, so you had to do COW by hand before doing any user access. But we haven't supported that in a long time, and these days the 'type' argument is a purely historical artifact. A discussion about extending 'user_access_begin()' to do the range checking resulted this patch, because there is no way we're going to move the old VERIFY_xyz interface to that model. And it's best done at the end of the merge window when I've done most of my merges, so let's just get this done once and for all. This patch was mostly done with a sed-script, with manual fix-ups for the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form. There were a couple of notable cases: - csky still had the old "verify_area()" name as an alias. - the iter_iov code had magical hardcoded knowledge of the actual values of VERIFY_{READ,WRITE} (not that they mattered, since nothing really used it) - microblaze used the type argument for a debug printout but other than those oddities this should be a total no-op patch. I tried to fix up all architectures, did fairly extensive grepping for access_ok() uses, and the changes are trivial, but I may have missed something. Any missed conversion should be trivially fixable, though. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 02:57:57 +00:00
if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128)))
return -EFAULT;
/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
if (v_regs != NULL && (msr & MSR_VEC) != 0) {
err |= __copy_from_user(&tsk->thread.vr_state, v_regs,
33 * sizeof(vector128));
tsk->thread.used_vr = true;
} else if (tsk->thread.used_vr) {
memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128));
}
/* Always get VRSAVE back */
if (v_regs != NULL)
err |= __get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
else
tsk->thread.vrsave = 0;
if (cpu_has_feature(CPU_FTR_ALTIVEC))
mtspr(SPRN_VRSAVE, tsk->thread.vrsave);
#endif /* CONFIG_ALTIVEC */
powerpc: Introduce VSX thread_struct and CONFIG_VSX The layout of the new VSR registers and how they overlap on top of the legacy FPR and VR registers is: VSR doubleword 0 VSR doubleword 1 ---------------------------------------------------------------- VSR[0] | FPR[0] | | ---------------------------------------------------------------- VSR[1] | FPR[1] | | ---------------------------------------------------------------- | ... | | | ... | | ---------------------------------------------------------------- VSR[30] | FPR[30] | | ---------------------------------------------------------------- VSR[31] | FPR[31] | | ---------------------------------------------------------------- VSR[32] | VR[0] | ---------------------------------------------------------------- VSR[33] | VR[1] | ---------------------------------------------------------------- | ... | | ... | ---------------------------------------------------------------- VSR[62] | VR[30] | ---------------------------------------------------------------- VSR[63] | VR[31] | ---------------------------------------------------------------- VSX has 64 128bit registers. The first 32 regs overlap with the FP registers and hence extend them with and additional 64 bits. The second 32 regs overlap with the VMX registers. This commit introduces the thread_struct changes required to reflect this register layout. Ptrace and signals code is updated so that the floating point registers are correctly accessed from the thread_struct when CONFIG_VSX is enabled. Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
2008-06-25 04:07:18 +00:00
/* restore floating point */
err |= copy_fpr_from_user(tsk, &sc->fp_regs);
#ifdef CONFIG_VSX
/*
* Get additional VSX data. Update v_regs to point after the
* VMX data. Copy VSX low doubleword from userspace to local
* buffer for formatting, then into the taskstruct.
*/
v_regs += ELF_NVRREG;
if ((msr & MSR_VSX) != 0) {
err |= copy_vsx_from_user(tsk, v_regs);
tsk->thread.used_vsr = true;
} else {
for (i = 0; i < 32 ; i++)
tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
}
powerpc: Introduce VSX thread_struct and CONFIG_VSX The layout of the new VSR registers and how they overlap on top of the legacy FPR and VR registers is: VSR doubleword 0 VSR doubleword 1 ---------------------------------------------------------------- VSR[0] | FPR[0] | | ---------------------------------------------------------------- VSR[1] | FPR[1] | | ---------------------------------------------------------------- | ... | | | ... | | ---------------------------------------------------------------- VSR[30] | FPR[30] | | ---------------------------------------------------------------- VSR[31] | FPR[31] | | ---------------------------------------------------------------- VSR[32] | VR[0] | ---------------------------------------------------------------- VSR[33] | VR[1] | ---------------------------------------------------------------- | ... | | ... | ---------------------------------------------------------------- VSR[62] | VR[30] | ---------------------------------------------------------------- VSR[63] | VR[31] | ---------------------------------------------------------------- VSX has 64 128bit registers. The first 32 regs overlap with the FP registers and hence extend them with and additional 64 bits. The second 32 regs overlap with the VMX registers. This commit introduces the thread_struct changes required to reflect this register layout. Ptrace and signals code is updated so that the floating point registers are correctly accessed from the thread_struct when CONFIG_VSX is enabled. Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
2008-06-25 04:07:18 +00:00
#endif
return err;
}
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* Restore the two sigcontexts from the frame of a transactional processes.
*/
static long restore_tm_sigcontexts(struct task_struct *tsk,
struct sigcontext __user *sc,
struct sigcontext __user *tm_sc)
{
#ifdef CONFIG_ALTIVEC
elf_vrreg_t __user *v_regs, *tm_v_regs;
#endif
unsigned long err = 0;
unsigned long msr;
struct pt_regs *regs = tsk->thread.regs;
#ifdef CONFIG_VSX
int i;
#endif
BUG_ON(tsk != current);
if (tm_suspend_disabled)
return -EINVAL;
/* copy the GPRs */
err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr));
err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs,
sizeof(regs->gpr));
/*
* TFHAR is restored from the checkpointed 'wound-back' ucontext's NIP.
* TEXASR was set by the signal delivery reclaim, as was TFIAR.
* Users doing anything abhorrent like thread-switching w/ signals for
* TM-Suspended code will have to back TEXASR/TFIAR up themselves.
* For the case of getting a signal and simply returning from it,
* we don't need to re-copy them here.
*/
err |= __get_user(regs->nip, &tm_sc->gp_regs[PT_NIP]);
err |= __get_user(tsk->thread.tm_tfhar, &sc->gp_regs[PT_NIP]);
/* get MSR separately, transfer the LE bit if doing signal return */
err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
/* Don't allow reserved mode. */
if (MSR_TM_RESV(msr))
return -EINVAL;
/* pull in MSR LE from user context */
regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
/* The following non-GPR non-FPR non-VR state is also checkpointed: */
err |= __get_user(regs->ctr, &tm_sc->gp_regs[PT_CTR]);
err |= __get_user(regs->link, &tm_sc->gp_regs[PT_LNK]);
err |= __get_user(regs->xer, &tm_sc->gp_regs[PT_XER]);
err |= __get_user(regs->ccr, &tm_sc->gp_regs[PT_CCR]);
err |= __get_user(tsk->thread.ckpt_regs.ctr,
&sc->gp_regs[PT_CTR]);
err |= __get_user(tsk->thread.ckpt_regs.link,
&sc->gp_regs[PT_LNK]);
err |= __get_user(tsk->thread.ckpt_regs.xer,
&sc->gp_regs[PT_XER]);
err |= __get_user(tsk->thread.ckpt_regs.ccr,
&sc->gp_regs[PT_CCR]);
/* These regs are not checkpointed; they can go in 'regs'. */
err |= __get_user(regs->trap, &sc->gp_regs[PT_TRAP]);
err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]);
err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]);
err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]);
/*
* Force reload of FP/VEC.
* This has to be done before copying stuff into tsk->thread.fpr/vr
* for the reasons explained in the previous comment.
*/
regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
#ifdef CONFIG_ALTIVEC
err |= __get_user(v_regs, &sc->v_regs);
err |= __get_user(tm_v_regs, &tm_sc->v_regs);
if (err)
return err;
Remove 'type' argument from access_ok() function Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument of the user address range verification function since we got rid of the old racy i386-only code to walk page tables by hand. It existed because the original 80386 would not honor the write protect bit when in kernel mode, so you had to do COW by hand before doing any user access. But we haven't supported that in a long time, and these days the 'type' argument is a purely historical artifact. A discussion about extending 'user_access_begin()' to do the range checking resulted this patch, because there is no way we're going to move the old VERIFY_xyz interface to that model. And it's best done at the end of the merge window when I've done most of my merges, so let's just get this done once and for all. This patch was mostly done with a sed-script, with manual fix-ups for the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form. There were a couple of notable cases: - csky still had the old "verify_area()" name as an alias. - the iter_iov code had magical hardcoded knowledge of the actual values of VERIFY_{READ,WRITE} (not that they mattered, since nothing really used it) - microblaze used the type argument for a debug printout but other than those oddities this should be a total no-op patch. I tried to fix up all architectures, did fairly extensive grepping for access_ok() uses, and the changes are trivial, but I may have missed something. Any missed conversion should be trivially fixable, though. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 02:57:57 +00:00
if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128)))
return -EFAULT;
Remove 'type' argument from access_ok() function Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument of the user address range verification function since we got rid of the old racy i386-only code to walk page tables by hand. It existed because the original 80386 would not honor the write protect bit when in kernel mode, so you had to do COW by hand before doing any user access. But we haven't supported that in a long time, and these days the 'type' argument is a purely historical artifact. A discussion about extending 'user_access_begin()' to do the range checking resulted this patch, because there is no way we're going to move the old VERIFY_xyz interface to that model. And it's best done at the end of the merge window when I've done most of my merges, so let's just get this done once and for all. This patch was mostly done with a sed-script, with manual fix-ups for the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form. There were a couple of notable cases: - csky still had the old "verify_area()" name as an alias. - the iter_iov code had magical hardcoded knowledge of the actual values of VERIFY_{READ,WRITE} (not that they mattered, since nothing really used it) - microblaze used the type argument for a debug printout but other than those oddities this should be a total no-op patch. I tried to fix up all architectures, did fairly extensive grepping for access_ok() uses, and the changes are trivial, but I may have missed something. Any missed conversion should be trivially fixable, though. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 02:57:57 +00:00
if (tm_v_regs && !access_ok(tm_v_regs, 34 * sizeof(vector128)))
return -EFAULT;
/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
if (v_regs != NULL && tm_v_regs != NULL && (msr & MSR_VEC) != 0) {
err |= __copy_from_user(&tsk->thread.ckvr_state, v_regs,
33 * sizeof(vector128));
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
err |= __copy_from_user(&tsk->thread.vr_state, tm_v_regs,
33 * sizeof(vector128));
current->thread.used_vr = true;
}
else if (tsk->thread.used_vr) {
memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128));
memset(&tsk->thread.ckvr_state, 0, 33 * sizeof(vector128));
}
/* Always get VRSAVE back */
if (v_regs != NULL && tm_v_regs != NULL) {
err |= __get_user(tsk->thread.ckvrsave,
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
(u32 __user *)&v_regs[33]);
err |= __get_user(tsk->thread.vrsave,
(u32 __user *)&tm_v_regs[33]);
}
else {
tsk->thread.vrsave = 0;
tsk->thread.ckvrsave = 0;
}
if (cpu_has_feature(CPU_FTR_ALTIVEC))
mtspr(SPRN_VRSAVE, tsk->thread.vrsave);
#endif /* CONFIG_ALTIVEC */
/* restore floating point */
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
err |= copy_fpr_from_user(tsk, &tm_sc->fp_regs);
err |= copy_ckfpr_from_user(tsk, &sc->fp_regs);
#ifdef CONFIG_VSX
/*
* Get additional VSX data. Update v_regs to point after the
* VMX data. Copy VSX low doubleword from userspace to local
* buffer for formatting, then into the taskstruct.
*/
if (v_regs && ((msr & MSR_VSX) != 0)) {
v_regs += ELF_NVRREG;
tm_v_regs += ELF_NVRREG;
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
err |= copy_vsx_from_user(tsk, tm_v_regs);
err |= copy_ckvsx_from_user(tsk, v_regs);
tsk->thread.used_vsr = true;
} else {
for (i = 0; i < 32 ; i++) {
tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
tsk->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
}
}
#endif
tm_enable();
powerpc/tm: Disable IRQ in tm_recheckpoint We can't take an IRQ when we're about to do a trechkpt as our GPR state is set to user GPR values. We've hit this when running some IBM Java stress tests in the lab resulting in the following dump: cpu 0x3f: Vector: 700 (Program Check) at [c000000007eb3d40] pc: c000000000050074: restore_gprs+0xc0/0x148 lr: 00000000b52a8184 sp: ac57d360 msr: 8000000100201030 current = 0xc00000002c500000 paca = 0xc000000007dbfc00 softe: 0 irq_happened: 0x00 pid = 34535, comm = Pooled Thread # R00 = 00000000b52a8184 R16 = 00000000b3e48fda R01 = 00000000ac57d360 R17 = 00000000ade79bd8 R02 = 00000000ac586930 R18 = 000000000fac9bcc R03 = 00000000ade60000 R19 = 00000000ac57f930 R04 = 00000000f6624918 R20 = 00000000ade79be8 R05 = 00000000f663f238 R21 = 00000000ac218a54 R06 = 0000000000000002 R22 = 000000000f956280 R07 = 0000000000000008 R23 = 000000000000007e R08 = 000000000000000a R24 = 000000000000000c R09 = 00000000b6e69160 R25 = 00000000b424cf00 R10 = 0000000000000181 R26 = 00000000f66256d4 R11 = 000000000f365ec0 R27 = 00000000b6fdcdd0 R12 = 00000000f66400f0 R28 = 0000000000000001 R13 = 00000000ada71900 R29 = 00000000ade5a300 R14 = 00000000ac2185a8 R30 = 00000000f663f238 R15 = 0000000000000004 R31 = 00000000f6624918 pc = c000000000050074 restore_gprs+0xc0/0x148 cfar= c00000000004fe28 dont_restore_vec+0x1c/0x1a4 lr = 00000000b52a8184 msr = 8000000100201030 cr = 24804888 ctr = 0000000000000000 xer = 0000000000000000 trap = 700 This moves tm_recheckpoint to a C function and moves the tm_restore_sprs into that function. It then adds IRQ disabling over the trechkpt critical section. It also sets the TEXASR FS in the signals code to ensure this is never set now that we explictly write the TM sprs in tm_recheckpoint. Signed-off-by: Michael Neuling <mikey@neuling.org> cc: stable@vger.kernel.org Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-04-04 09:19:48 +00:00
/* Make sure the transaction is marked as failed */
tsk->thread.tm_texasr |= TEXASR_FS;
powerpc/tm: Set MSR[TS] just prior to recheckpoint On a signal handler return, the user could set a context with MSR[TS] bits set, and these bits would be copied to task regs->msr. At restore_tm_sigcontexts(), after current task regs->msr[TS] bits are set, several __get_user() are called and then a recheckpoint is executed. This is a problem since a page fault (in kernel space) could happen when calling __get_user(). If it happens, the process MSR[TS] bits were already set, but recheckpoint was not executed, and SPRs are still invalid. The page fault can cause the current process to be de-scheduled, with MSR[TS] active and without tm_recheckpoint() being called. More importantly, without TEXASR[FS] bit set also. Since TEXASR might not have the FS bit set, and when the process is scheduled back, it will try to reclaim, which will be aborted because of the CPU is not in the suspended state, and, then, recheckpoint. This recheckpoint will restore thread->texasr into TEXASR SPR, which might be zero, hitting a BUG_ON(). kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434! cpu 0xb: Vector: 700 (Program Check) at [c00000041f1576d0] pc: c000000000054550: restore_gprs+0xb0/0x180 lr: 0000000000000000 sp: c00000041f157950 msr: 8000000100021033 current = 0xc00000041f143000 paca = 0xc00000000fb86300 softe: 0 irq_happened: 0x01 pid = 1021, comm = kworker/11:1 kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434! Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26) enter ? for help [c00000041f157b30] c00000000001bc3c tm_recheckpoint.part.11+0x6c/0xa0 [c00000041f157b70] c00000000001d184 __switch_to+0x1e4/0x4c0 [c00000041f157bd0] c00000000082eeb8 __schedule+0x2f8/0x990 [c00000041f157cb0] c00000000082f598 schedule+0x48/0xc0 [c00000041f157ce0] c0000000000f0d28 worker_thread+0x148/0x610 [c00000041f157d80] c0000000000f96b0 kthread+0x120/0x140 [c00000041f157e30] c00000000000c0e0 ret_from_kernel_thread+0x5c/0x7c This patch simply delays the MSR[TS] set, so, if there is any page fault in the __get_user() section, it does not have regs->msr[TS] set, since the TM structures are still invalid, thus avoiding doing TM operations for in-kernel exceptions and possible process reschedule. With this patch, the MSR[TS] will only be set just before recheckpointing and setting TEXASR[FS] = 1, thus avoiding an interrupt with TM registers in invalid state. Other than that, if CONFIG_PREEMPT is set, there might be a preemption just after setting MSR[TS] and before tm_recheckpoint(), thus, this block must be atomic from a preemption perspective, thus, calling preempt_disable/enable() on this code. It is not possible to move tm_recheckpoint to happen earlier, because it is required to get the checkpointed registers from userspace, with __get_user(), thus, the only way to avoid this undesired behavior is delaying the MSR[TS] set. The 32-bits signal handler seems to be safe this current issue, but, it might be exposed to the preemption issue, thus, disabling preemption in this chunk of code. Changes from v2: * Run the critical section with preempt_disable. Fixes: 87b4e5393af7 ("powerpc/tm: Fix return of active 64bit signals") Cc: stable@vger.kernel.org (v3.9+) Signed-off-by: Breno Leitao <leitao@debian.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-21 19:21:09 +00:00
/*
* Disabling preemption, since it is unsafe to be preempted
* with MSR[TS] set without recheckpointing.
*/
preempt_disable();
/* pull in MSR TS bits from user context */
powerpc/tm: Avoid machine crash on rt_sigreturn() There is a kernel crash that happens if rt_sigreturn() is called inside a transactional block. This crash happens if the kernel hits an in-kernel page fault when accessing userspace memory, usually through copy_ckvsx_to_user(). A major page fault calls might_sleep() function, which can cause a task reschedule. A task reschedule (switch_to()) reclaim and recheckpoint the TM states, but, in the signal return path, the checkpointed memory was already reclaimed, thus the exception stack has MSR that points to MSR[TS]=0. When the code returns from might_sleep() and a task reschedule happened, then this task is returned with the memory recheckpointed, and CPU MSR[TS] = suspended. This means that there is a side effect at might_sleep() if it is called with CPU MSR[TS] = 0 and the task has regs->msr[TS] != 0. This side effect can cause a TM bad thing, since at the exception entrance, the stack saves MSR[TS]=0, and this is what will be used at RFID, but, the processor has MSR[TS] = Suspended, and this transition will be invalid and a TM Bad thing will be raised, causing the following crash: Unexpected TM Bad Thing exception at c00000000000e9ec (msr 0x8000000302a03031) tm_scratch=800000010280b033 cpu 0xc: Vector: 700 (Program Check) at [c00000003ff1fd70] pc: c00000000000e9ec: fast_exception_return+0x100/0x1bc lr: c000000000032948: handle_rt_signal64+0xb8/0xaf0 sp: c0000004263ebc40 msr: 8000000302a03031 current = 0xc000000415050300 paca = 0xc00000003ffc4080 irqmask: 0x03 irq_happened: 0x01 pid = 25006, comm = sigfuz Linux version 5.0.0-rc1-00001-g3bd6e94bec12 (breno@debian) (gcc version 8.2.0 (Debian 8.2.0-3)) #899 SMP Mon Jan 7 11:30:07 EST 2019 WARNING: exception is not recoverable, can't continue enter ? for help [c0000004263ebc40] c000000000032948 handle_rt_signal64+0xb8/0xaf0 (unreliable) [c0000004263ebd30] c000000000022780 do_notify_resume+0x2f0/0x430 [c0000004263ebe20] c00000000000e844 ret_from_except_lite+0x70/0x74 --- Exception: c00 (System Call) at 00007fffbaac400c SP (7fffeca90f40) is in userspace The solution for this problem is running the sigreturn code with regs->msr[TS] disabled, thus, avoiding hitting the side effect above. This does not seem to be a problem since regs->msr will be replaced by the ucontext value, so, it is being flushed already. In this case, it is flushed earlier. Signed-off-by: Breno Leitao <leitao@debian.org> Acked-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2019-01-16 16:47:44 +00:00
regs->msr |= msr & MSR_TS_MASK;
powerpc/tm: Set MSR[TS] just prior to recheckpoint On a signal handler return, the user could set a context with MSR[TS] bits set, and these bits would be copied to task regs->msr. At restore_tm_sigcontexts(), after current task regs->msr[TS] bits are set, several __get_user() are called and then a recheckpoint is executed. This is a problem since a page fault (in kernel space) could happen when calling __get_user(). If it happens, the process MSR[TS] bits were already set, but recheckpoint was not executed, and SPRs are still invalid. The page fault can cause the current process to be de-scheduled, with MSR[TS] active and without tm_recheckpoint() being called. More importantly, without TEXASR[FS] bit set also. Since TEXASR might not have the FS bit set, and when the process is scheduled back, it will try to reclaim, which will be aborted because of the CPU is not in the suspended state, and, then, recheckpoint. This recheckpoint will restore thread->texasr into TEXASR SPR, which might be zero, hitting a BUG_ON(). kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434! cpu 0xb: Vector: 700 (Program Check) at [c00000041f1576d0] pc: c000000000054550: restore_gprs+0xb0/0x180 lr: 0000000000000000 sp: c00000041f157950 msr: 8000000100021033 current = 0xc00000041f143000 paca = 0xc00000000fb86300 softe: 0 irq_happened: 0x01 pid = 1021, comm = kworker/11:1 kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434! Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26) enter ? for help [c00000041f157b30] c00000000001bc3c tm_recheckpoint.part.11+0x6c/0xa0 [c00000041f157b70] c00000000001d184 __switch_to+0x1e4/0x4c0 [c00000041f157bd0] c00000000082eeb8 __schedule+0x2f8/0x990 [c00000041f157cb0] c00000000082f598 schedule+0x48/0xc0 [c00000041f157ce0] c0000000000f0d28 worker_thread+0x148/0x610 [c00000041f157d80] c0000000000f96b0 kthread+0x120/0x140 [c00000041f157e30] c00000000000c0e0 ret_from_kernel_thread+0x5c/0x7c This patch simply delays the MSR[TS] set, so, if there is any page fault in the __get_user() section, it does not have regs->msr[TS] set, since the TM structures are still invalid, thus avoiding doing TM operations for in-kernel exceptions and possible process reschedule. With this patch, the MSR[TS] will only be set just before recheckpointing and setting TEXASR[FS] = 1, thus avoiding an interrupt with TM registers in invalid state. Other than that, if CONFIG_PREEMPT is set, there might be a preemption just after setting MSR[TS] and before tm_recheckpoint(), thus, this block must be atomic from a preemption perspective, thus, calling preempt_disable/enable() on this code. It is not possible to move tm_recheckpoint to happen earlier, because it is required to get the checkpointed registers from userspace, with __get_user(), thus, the only way to avoid this undesired behavior is delaying the MSR[TS] set. The 32-bits signal handler seems to be safe this current issue, but, it might be exposed to the preemption issue, thus, disabling preemption in this chunk of code. Changes from v2: * Run the critical section with preempt_disable. Fixes: 87b4e5393af7 ("powerpc/tm: Fix return of active 64bit signals") Cc: stable@vger.kernel.org (v3.9+) Signed-off-by: Breno Leitao <leitao@debian.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-21 19:21:09 +00:00
/*
* Ensure that TM is enabled in regs->msr before we leave the signal
* handler. It could be the case that (a) user disabled the TM bit
* through the manipulation of the MSR bits in uc_mcontext or (b) the
* TM bit was disabled because a sufficient number of context switches
* happened whilst in the signal handler and load_tm overflowed,
* disabling the TM bit. In either case we can end up with an illegal
* TM state leading to a TM Bad Thing when we return to userspace.
*
* CAUTION:
* After regs->MSR[TS] being updated, make sure that get_user(),
* put_user() or similar functions are *not* called. These
* functions can generate page faults which will cause the process
* to be de-scheduled with MSR[TS] set but without calling
* tm_recheckpoint(). This can cause a bug.
*/
regs->msr |= MSR_TM;
/* This loads the checkpointed FP/VEC state, if used */
powerpc: Always save/restore checkpointed regs during treclaim/trecheckpoint Lazy save and restore of FP/Altivec means that a userspace process can be sent to userspace with FP or Altivec disabled and loaded only as required (by way of an FP/Altivec unavailable exception). Transactional Memory complicates this situation as a transaction could be started without FP/Altivec being loaded up. This causes the hardware to checkpoint incorrect registers. Handling FP/Altivec unavailable exceptions while a thread is transactional requires a reclaim and recheckpoint to ensure the CPU has correct state for both sets of registers. tm_reclaim() has optimisations to not always save the FP/Altivec registers to the checkpointed save area. This was originally done because the caller might have information that the checkpointed registers aren't valid due to lazy save and restore. We've also been a little vague as to how tm_reclaim() leaves the FP/Altivec state since it doesn't necessarily always save it to the thread struct. This has lead to an (incorrect) assumption that it leaves the checkpointed state on the CPU. tm_recheckpoint() has similar optimisations in reverse. It may not always reload the checkpointed FP/Altivec registers from the thread struct before the trecheckpoint. It is therefore quite unclear where it expects to get the state from. This didn't help with the assumption made about tm_reclaim(). These optimisations sit in what is by definition a slow path. If a process has to go through a reclaim/recheckpoint then its transaction will be doomed on returning to userspace. This mean that the process will be unable to complete its transaction and be forced to its failure handler. This is already an out if line case for userspace. Furthermore, the cost of copying 64 times 128 bits from registers isn't very long[0] (at all) on modern processors. As such it appears these optimisations have only served to increase code complexity and are unlikely to have had a measurable performance impact. Our transactional memory handling has been riddled with bugs. A cause of this has been difficulty in following the code flow, code complexity has not been our friend here. It makes sense to remove these optimisations in favour of a (hopefully) more stable implementation. This patch does mean that some times the assembly will needlessly save 'junk' registers which will subsequently get overwritten with the correct value by the C code which calls the assembly function. This small inefficiency is far outweighed by the reduction in complexity for general TM code, context switching paths, and transactional facility unavailable exception handler. 0: I tried to measure it once for other work and found that it was hiding in the noise of everything else I was working with. I find it exceedingly likely this will be the case here. Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-11-02 03:09:05 +00:00
tm_recheckpoint(&tsk->thread);
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
msr_check_and_set(msr & (MSR_FP | MSR_VEC));
if (msr & MSR_FP) {
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
load_fp_state(&tsk->thread.fp_state);
regs->msr |= (MSR_FP | tsk->thread.fpexc_mode);
}
if (msr & MSR_VEC) {
powerpc: tm: Always use fp_state and vr_state to store live registers There is currently an inconsistency as to how the entire CPU register state is saved and restored when a thread uses transactional memory (TM). Using transactional memory results in the CPU having duplicated (almost) all of its register state. This duplication results in a set of registers which can be considered 'live', those being currently modified by the instructions being executed and another set that is frozen at a point in time. On context switch, both sets of state have to be saved and (later) restored. These two states are often called a variety of different things. Common terms for the state which only exists after the CPU has entered a transaction (performed a TBEGIN instruction) in hardware are 'transactional' or 'speculative'. Between a TBEGIN and a TEND or TABORT (or an event that causes the hardware to abort), regardless of the use of TSUSPEND the transactional state can be referred to as the live state. The second state is often to referred to as the 'checkpointed' state and is a duplication of the live state when the TBEGIN instruction is executed. This state is kept in the hardware and will be rolled back to on transaction failure. Currently all the registers stored in pt_regs are ALWAYS the live registers, that is, when a thread has transactional registers their values are stored in pt_regs and the checkpointed state is in ckpt_regs. A strange opposite is true for fp_state/vr_state. When a thread is non transactional fp_state/vr_state holds the live registers. When a thread has initiated a transaction fp_state/vr_state holds the checkpointed state and transact_fp/transact_vr become the structure which holds the live state (at this point it is a transactional state). This method creates confusion as to where the live state is, in some circumstances it requires extra work to determine where to put the live state and prevents the use of common functions designed (probably before TM) to save the live state. With this patch pt_regs, fp_state and vr_state all represent the same thing and the other structures [pending rename] are for checkpointed state. Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-09-23 06:18:24 +00:00
load_vr_state(&tsk->thread.vr_state);
regs->msr |= MSR_VEC;
}
powerpc/tm: Set MSR[TS] just prior to recheckpoint On a signal handler return, the user could set a context with MSR[TS] bits set, and these bits would be copied to task regs->msr. At restore_tm_sigcontexts(), after current task regs->msr[TS] bits are set, several __get_user() are called and then a recheckpoint is executed. This is a problem since a page fault (in kernel space) could happen when calling __get_user(). If it happens, the process MSR[TS] bits were already set, but recheckpoint was not executed, and SPRs are still invalid. The page fault can cause the current process to be de-scheduled, with MSR[TS] active and without tm_recheckpoint() being called. More importantly, without TEXASR[FS] bit set also. Since TEXASR might not have the FS bit set, and when the process is scheduled back, it will try to reclaim, which will be aborted because of the CPU is not in the suspended state, and, then, recheckpoint. This recheckpoint will restore thread->texasr into TEXASR SPR, which might be zero, hitting a BUG_ON(). kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434! cpu 0xb: Vector: 700 (Program Check) at [c00000041f1576d0] pc: c000000000054550: restore_gprs+0xb0/0x180 lr: 0000000000000000 sp: c00000041f157950 msr: 8000000100021033 current = 0xc00000041f143000 paca = 0xc00000000fb86300 softe: 0 irq_happened: 0x01 pid = 1021, comm = kworker/11:1 kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434! Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26) enter ? for help [c00000041f157b30] c00000000001bc3c tm_recheckpoint.part.11+0x6c/0xa0 [c00000041f157b70] c00000000001d184 __switch_to+0x1e4/0x4c0 [c00000041f157bd0] c00000000082eeb8 __schedule+0x2f8/0x990 [c00000041f157cb0] c00000000082f598 schedule+0x48/0xc0 [c00000041f157ce0] c0000000000f0d28 worker_thread+0x148/0x610 [c00000041f157d80] c0000000000f96b0 kthread+0x120/0x140 [c00000041f157e30] c00000000000c0e0 ret_from_kernel_thread+0x5c/0x7c This patch simply delays the MSR[TS] set, so, if there is any page fault in the __get_user() section, it does not have regs->msr[TS] set, since the TM structures are still invalid, thus avoiding doing TM operations for in-kernel exceptions and possible process reschedule. With this patch, the MSR[TS] will only be set just before recheckpointing and setting TEXASR[FS] = 1, thus avoiding an interrupt with TM registers in invalid state. Other than that, if CONFIG_PREEMPT is set, there might be a preemption just after setting MSR[TS] and before tm_recheckpoint(), thus, this block must be atomic from a preemption perspective, thus, calling preempt_disable/enable() on this code. It is not possible to move tm_recheckpoint to happen earlier, because it is required to get the checkpointed registers from userspace, with __get_user(), thus, the only way to avoid this undesired behavior is delaying the MSR[TS] set. The 32-bits signal handler seems to be safe this current issue, but, it might be exposed to the preemption issue, thus, disabling preemption in this chunk of code. Changes from v2: * Run the critical section with preempt_disable. Fixes: 87b4e5393af7 ("powerpc/tm: Fix return of active 64bit signals") Cc: stable@vger.kernel.org (v3.9+) Signed-off-by: Breno Leitao <leitao@debian.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-21 19:21:09 +00:00
preempt_enable();
return err;
}
#endif
/*
* Setup the trampoline code on the stack
*/
static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp)
{
int i;
long err = 0;
/* addi r1, r1, __SIGNAL_FRAMESIZE # Pop the dummy stackframe */
err |= __put_user(PPC_INST_ADDI | __PPC_RT(R1) | __PPC_RA(R1) |
(__SIGNAL_FRAMESIZE & 0xffff), &tramp[0]);
/* li r0, __NR_[rt_]sigreturn| */
err |= __put_user(PPC_INST_ADDI | (syscall & 0xffff), &tramp[1]);
/* sc */
err |= __put_user(PPC_INST_SC, &tramp[2]);
/* Minimal traceback info */
for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++)
err |= __put_user(0, &tramp[i]);
if (!err)
flush_icache_range((unsigned long) &tramp[0],
(unsigned long) &tramp[TRAMP_SIZE]);
return err;
}
/*
* Userspace code may pass a ucontext which doesn't include VSX added
* at the end. We need to check for this case.
*/
#define UCONTEXTSIZEWITHOUTVSX \
(sizeof(struct ucontext) - 32*sizeof(long))
/*
* Handle {get,set,swap}_context operations
*/
SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
struct ucontext __user *, new_ctx, long, ctx_size)
{
unsigned char tmp;
sigset_t set;
unsigned long new_msr = 0;
int ctx_has_vsx_region = 0;
if (new_ctx &&
get_user(new_msr, &new_ctx->uc_mcontext.gp_regs[PT_MSR]))
return -EFAULT;
/*
* Check that the context is not smaller than the original
* size (with VMX but without VSX)
*/
if (ctx_size < UCONTEXTSIZEWITHOUTVSX)
return -EINVAL;
/*
* If the new context state sets the MSR VSX bits but
* it doesn't provide VSX state.
*/
if ((ctx_size < sizeof(struct ucontext)) &&
(new_msr & MSR_VSX))
return -EINVAL;
/* Does the context have enough room to store VSX data? */
if (ctx_size >= sizeof(struct ucontext))
ctx_has_vsx_region = 1;
if (old_ctx != NULL) {
Remove 'type' argument from access_ok() function Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument of the user address range verification function since we got rid of the old racy i386-only code to walk page tables by hand. It existed because the original 80386 would not honor the write protect bit when in kernel mode, so you had to do COW by hand before doing any user access. But we haven't supported that in a long time, and these days the 'type' argument is a purely historical artifact. A discussion about extending 'user_access_begin()' to do the range checking resulted this patch, because there is no way we're going to move the old VERIFY_xyz interface to that model. And it's best done at the end of the merge window when I've done most of my merges, so let's just get this done once and for all. This patch was mostly done with a sed-script, with manual fix-ups for the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form. There were a couple of notable cases: - csky still had the old "verify_area()" name as an alias. - the iter_iov code had magical hardcoded knowledge of the actual values of VERIFY_{READ,WRITE} (not that they mattered, since nothing really used it) - microblaze used the type argument for a debug printout but other than those oddities this should be a total no-op patch. I tried to fix up all architectures, did fairly extensive grepping for access_ok() uses, and the changes are trivial, but I may have missed something. Any missed conversion should be trivially fixable, though. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 02:57:57 +00:00
if (!access_ok(old_ctx, ctx_size)
|| setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0,
ctx_has_vsx_region)
|| __copy_to_user(&old_ctx->uc_sigmask,
&current->blocked, sizeof(sigset_t)))
return -EFAULT;
}
if (new_ctx == NULL)
return 0;
Remove 'type' argument from access_ok() function Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument of the user address range verification function since we got rid of the old racy i386-only code to walk page tables by hand. It existed because the original 80386 would not honor the write protect bit when in kernel mode, so you had to do COW by hand before doing any user access. But we haven't supported that in a long time, and these days the 'type' argument is a purely historical artifact. A discussion about extending 'user_access_begin()' to do the range checking resulted this patch, because there is no way we're going to move the old VERIFY_xyz interface to that model. And it's best done at the end of the merge window when I've done most of my merges, so let's just get this done once and for all. This patch was mostly done with a sed-script, with manual fix-ups for the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form. There were a couple of notable cases: - csky still had the old "verify_area()" name as an alias. - the iter_iov code had magical hardcoded knowledge of the actual values of VERIFY_{READ,WRITE} (not that they mattered, since nothing really used it) - microblaze used the type argument for a debug printout but other than those oddities this should be a total no-op patch. I tried to fix up all architectures, did fairly extensive grepping for access_ok() uses, and the changes are trivial, but I may have missed something. Any missed conversion should be trivially fixable, though. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 02:57:57 +00:00
if (!access_ok(new_ctx, ctx_size)
|| __get_user(tmp, (u8 __user *) new_ctx)
|| __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1))
return -EFAULT;
/*
* If we get a fault copying the context into the kernel's
* image of the user's registers, we can't just return -EFAULT
* because the user's registers will be corrupted. For instance
* the NIP value may have been updated but not some of the
* other registers. Given that we have done the access_ok
* and successfully read the first and last bytes of the region
* above, this should only happen in an out-of-memory situation
* or if another thread unmaps the region containing the context.
* We kill the task with a SIGSEGV in this situation.
*/
if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set)))
do_exit(SIGSEGV);
set_current_blocked(&set);
if (restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext))
do_exit(SIGSEGV);
/* This returns like rt_sigreturn */
[PATCH] syscall entry/exit revamp This cleanup patch speeds up the null syscall path on ppc64 by about 3%, and brings the ppc32 and ppc64 code slightly closer together. The ppc64 code was checking current_thread_info()->flags twice in the syscall exit path; once for TIF_SYSCALL_T_OR_A before disabling interrupts, and then again for TIF_SIGPENDING|TIF_NEED_RESCHED etc after disabling interrupts. Now we do the same as ppc32 -- check the flags only once in the fast path, and re-enable interrupts if necessary in the ptrace case. The patch abolishes the 'syscall_noerror' member of struct thread_info and replaces it with a TIF_NOERROR bit in the flags, which is handled in the slow path. This shortens the syscall entry code, which no longer needs to clear syscall_noerror. The patch adds a TIF_SAVE_NVGPRS flag which causes the syscall exit slow path to save the non-volatile GPRs into a signal frame. This removes the need for the assembly wrappers around sys_sigsuspend(), sys_rt_sigsuspend(), et al which existed solely to save those registers in advance. It also means I don't have to add new wrappers for ppoll() and pselect(), which is what I was supposed to be doing when I got distracted into this... Finally, it unifies the ppc64 and ppc32 methods of handling syscall exit directly into a signal handler (as required by sigsuspend et al) by introducing a TIF_RESTOREALL flag which causes _all_ the registers to be reloaded from the pt_regs by taking the ret_from_exception path, instead of the normal syscall exit path which stomps on the callee-saved GPRs. It appears to pass an LTP test run on ppc64, and passes basic testing on ppc32 too. Brief tests of ptrace functionality with strace and gdb also appear OK. I wouldn't send it to Linus for 2.6.15 just yet though :) Signed-off-by: David Woodhouse <dwmw2@infradead.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
2005-11-15 18:52:18 +00:00
set_thread_flag(TIF_RESTOREALL);
return 0;
}
/*
* Do a signal return; undo the signal stack.
*/
SYSCALL_DEFINE0(rt_sigreturn)
{
struct pt_regs *regs = current_pt_regs();
struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1];
sigset_t set;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
unsigned long msr;
#endif
/* Always make any pending restarted system calls return -EINTR */
all arches, signal: move restart_block to struct task_struct If an attacker can cause a controlled kernel stack overflow, overwriting the restart block is a very juicy exploit target. This is because the restart_block is held in the same memory allocation as the kernel stack. Moving the restart block to struct task_struct prevents this exploit by making the restart_block harder to locate. Note that there are other fields in thread_info that are also easy targets, at least on some architectures. It's also a decent simplification, since the restart code is more or less identical on all architectures. [james.hogan@imgtec.com: metag: align thread_info::supervisor_stack] Signed-off-by: Andy Lutomirski <luto@amacapital.net> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kees Cook <keescook@chromium.org> Cc: David Miller <davem@davemloft.net> Acked-by: Richard Weinberger <richard@nod.at> Cc: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Matt Turner <mattst88@gmail.com> Cc: Vineet Gupta <vgupta@synopsys.com> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Haavard Skinnemoen <hskinnemoen@gmail.com> Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no> Cc: Steven Miao <realmz6@gmail.com> Cc: Mark Salter <msalter@redhat.com> Cc: Aurelien Jacquiot <a-jacquiot@ti.com> Cc: Mikael Starvik <starvik@axis.com> Cc: Jesper Nilsson <jesper.nilsson@axis.com> Cc: David Howells <dhowells@redhat.com> Cc: Richard Kuo <rkuo@codeaurora.org> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Michal Simek <monstr@monstr.eu> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Jonas Bonn <jonas@southpole.se> Cc: "James E.J. Bottomley" <jejb@parisc-linux.org> Cc: Helge Deller <deller@gmx.de> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc) Tested-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc) Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Chen Liqin <liqin.linux@gmail.com> Cc: Lennox Wu <lennox.wu@gmail.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: Chris Zankel <chris@zankel.net> Cc: Max Filippov <jcmvbkbc@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Guenter Roeck <linux@roeck-us.net> Signed-off-by: James Hogan <james.hogan@imgtec.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 23:01:14 +00:00
current->restart_block.fn = do_no_restart_syscall;
Remove 'type' argument from access_ok() function Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument of the user address range verification function since we got rid of the old racy i386-only code to walk page tables by hand. It existed because the original 80386 would not honor the write protect bit when in kernel mode, so you had to do COW by hand before doing any user access. But we haven't supported that in a long time, and these days the 'type' argument is a purely historical artifact. A discussion about extending 'user_access_begin()' to do the range checking resulted this patch, because there is no way we're going to move the old VERIFY_xyz interface to that model. And it's best done at the end of the merge window when I've done most of my merges, so let's just get this done once and for all. This patch was mostly done with a sed-script, with manual fix-ups for the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form. There were a couple of notable cases: - csky still had the old "verify_area()" name as an alias. - the iter_iov code had magical hardcoded knowledge of the actual values of VERIFY_{READ,WRITE} (not that they mattered, since nothing really used it) - microblaze used the type argument for a debug printout but other than those oddities this should be a total no-op patch. I tried to fix up all architectures, did fairly extensive grepping for access_ok() uses, and the changes are trivial, but I may have missed something. Any missed conversion should be trivially fixable, though. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 02:57:57 +00:00
if (!access_ok(uc, sizeof(*uc)))
goto badframe;
if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
goto badframe;
set_current_blocked(&set);
powerpc: signals: Discard transaction state from signal frames Userspace can begin and suspend a transaction within the signal handler which means they might enter sys_rt_sigreturn() with the processor in suspended state. sys_rt_sigreturn() wants to restore process context (which may have been in a transaction before signal delivery). To do this it must restore TM SPRS. To achieve this, any transaction initiated within the signal frame must be discarded in order to be able to restore TM SPRs as TM SPRs can only be manipulated non-transactionally.. >From the PowerPC ISA: TM Bad Thing Exception [Category: Transactional Memory] An attempt is made to execute a mtspr targeting a TM register in other than Non-transactional state. Not doing so results in a TM Bad Thing: [12045.221359] Kernel BUG at c000000000050a40 [verbose debug info unavailable] [12045.221470] Unexpected TM Bad Thing exception at c000000000050a40 (msr 0x201033) [12045.221540] Oops: Unrecoverable exception, sig: 6 [#1] [12045.221586] SMP NR_CPUS=2048 NUMA PowerNV [12045.221634] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables kvm_hv kvm uio_pdrv_genirq ipmi_powernv uio powernv_rng ipmi_msghandler autofs4 ses enclosure scsi_transport_sas bnx2x ipr mdio libcrc32c [12045.222167] CPU: 68 PID: 6178 Comm: sigreturnpanic Not tainted 4.7.0 #34 [12045.222224] task: c0000000fce38600 ti: c0000000fceb4000 task.ti: c0000000fceb4000 [12045.222293] NIP: c000000000050a40 LR: c0000000000163bc CTR: 0000000000000000 [12045.222361] REGS: c0000000fceb7ac0 TRAP: 0700 Not tainted (4.7.0) [12045.222418] MSR: 9000000300201033 <SF,HV,ME,IR,DR,RI,LE,TM[SE]> CR: 28444280 XER: 20000000 [12045.222625] CFAR: c0000000000163b8 SOFTE: 0 PACATMSCRATCH: 900000014280f033 GPR00: 01100000b8000001 c0000000fceb7d40 c00000000139c100 c0000000fce390d0 GPR04: 900000034280f033 0000000000000000 0000000000000000 0000000000000000 GPR08: 0000000000000000 b000000000001033 0000000000000001 0000000000000000 GPR12: 0000000000000000 c000000002926400 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 00003ffff98cadd0 00003ffff98cb470 0000000000000000 GPR28: 900000034280f033 c0000000fceb7ea0 0000000000000001 c0000000fce390d0 [12045.223535] NIP [c000000000050a40] tm_restore_sprs+0xc/0x1c [12045.223584] LR [c0000000000163bc] tm_recheckpoint+0x5c/0xa0 [12045.223630] Call Trace: [12045.223655] [c0000000fceb7d80] [c000000000026e74] sys_rt_sigreturn+0x494/0x6c0 [12045.223738] [c0000000fceb7e30] [c0000000000092e0] system_call+0x38/0x108 [12045.223806] Instruction dump: [12045.223841] 7c800164 4e800020 7c0022a6 f80304a8 7c0222a6 f80304b0 7c0122a6 f80304b8 [12045.223955] 4e800020 e80304a8 7c0023a6 e80304b0 <7c0223a6> e80304b8 7c0123a6 4e800020 [12045.224074] ---[ end trace cb8002ee240bae76 ]--- It isn't clear exactly if there is really a use case for userspace returning with a suspended transaction, however, doing so doesn't (on its own) constitute a bad frame. As such, this patch simply discards the transactional state of the context calling the sigreturn and continues. Reported-by: Laurent Dufour <ldufour@linux.vnet.ibm.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Tested-by: Laurent Dufour <ldufour@linux.vnet.ibm.com> Reviewed-by: Laurent Dufour <ldufour@linux.vnet.ibm.com> Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2016-08-23 00:46:17 +00:00
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
powerpc: signals: Discard transaction state from signal frames Userspace can begin and suspend a transaction within the signal handler which means they might enter sys_rt_sigreturn() with the processor in suspended state. sys_rt_sigreturn() wants to restore process context (which may have been in a transaction before signal delivery). To do this it must restore TM SPRS. To achieve this, any transaction initiated within the signal frame must be discarded in order to be able to restore TM SPRs as TM SPRs can only be manipulated non-transactionally.. >From the PowerPC ISA: TM Bad Thing Exception [Category: Transactional Memory] An attempt is made to execute a mtspr targeting a TM register in other than Non-transactional state. Not doing so results in a TM Bad Thing: [12045.221359] Kernel BUG at c000000000050a40 [verbose debug info unavailable] [12045.221470] Unexpected TM Bad Thing exception at c000000000050a40 (msr 0x201033) [12045.221540] Oops: Unrecoverable exception, sig: 6 [#1] [12045.221586] SMP NR_CPUS=2048 NUMA PowerNV [12045.221634] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables kvm_hv kvm uio_pdrv_genirq ipmi_powernv uio powernv_rng ipmi_msghandler autofs4 ses enclosure scsi_transport_sas bnx2x ipr mdio libcrc32c [12045.222167] CPU: 68 PID: 6178 Comm: sigreturnpanic Not tainted 4.7.0 #34 [12045.222224] task: c0000000fce38600 ti: c0000000fceb4000 task.ti: c0000000fceb4000 [12045.222293] NIP: c000000000050a40 LR: c0000000000163bc CTR: 0000000000000000 [12045.222361] REGS: c0000000fceb7ac0 TRAP: 0700 Not tainted (4.7.0) [12045.222418] MSR: 9000000300201033 <SF,HV,ME,IR,DR,RI,LE,TM[SE]> CR: 28444280 XER: 20000000 [12045.222625] CFAR: c0000000000163b8 SOFTE: 0 PACATMSCRATCH: 900000014280f033 GPR00: 01100000b8000001 c0000000fceb7d40 c00000000139c100 c0000000fce390d0 GPR04: 900000034280f033 0000000000000000 0000000000000000 0000000000000000 GPR08: 0000000000000000 b000000000001033 0000000000000001 0000000000000000 GPR12: 0000000000000000 c000000002926400 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 00003ffff98cadd0 00003ffff98cb470 0000000000000000 GPR28: 900000034280f033 c0000000fceb7ea0 0000000000000001 c0000000fce390d0 [12045.223535] NIP [c000000000050a40] tm_restore_sprs+0xc/0x1c [12045.223584] LR [c0000000000163bc] tm_recheckpoint+0x5c/0xa0 [12045.223630] Call Trace: [12045.223655] [c0000000fceb7d80] [c000000000026e74] sys_rt_sigreturn+0x494/0x6c0 [12045.223738] [c0000000fceb7e30] [c0000000000092e0] system_call+0x38/0x108 [12045.223806] Instruction dump: [12045.223841] 7c800164 4e800020 7c0022a6 f80304a8 7c0222a6 f80304b0 7c0122a6 f80304b8 [12045.223955] 4e800020 e80304a8 7c0023a6 e80304b0 <7c0223a6> e80304b8 7c0123a6 4e800020 [12045.224074] ---[ end trace cb8002ee240bae76 ]--- It isn't clear exactly if there is really a use case for userspace returning with a suspended transaction, however, doing so doesn't (on its own) constitute a bad frame. As such, this patch simply discards the transactional state of the context calling the sigreturn and continues. Reported-by: Laurent Dufour <ldufour@linux.vnet.ibm.com> Signed-off-by: Cyril Bur <cyrilbur@gmail.com> Tested-by: Laurent Dufour <ldufour@linux.vnet.ibm.com> Reviewed-by: Laurent Dufour <ldufour@linux.vnet.ibm.com> Acked-by: Simon Guo <wei.guo.simon@gmail.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2016-08-23 00:46:17 +00:00
/*
* If there is a transactional state then throw it away.
* The purpose of a sigreturn is to destroy all traces of the
* signal frame, this includes any transactional state created
* within in. We only check for suspended as we can never be
* active in the kernel, we are active, there is nothing better to
* do than go ahead and Bad Thing later.
* The cause is not important as there will never be a
* recheckpoint so it's not user visible.
*/
if (MSR_TM_SUSPENDED(mfmsr()))
tm_reclaim_current(0);
powerpc/tm: Avoid machine crash on rt_sigreturn() There is a kernel crash that happens if rt_sigreturn() is called inside a transactional block. This crash happens if the kernel hits an in-kernel page fault when accessing userspace memory, usually through copy_ckvsx_to_user(). A major page fault calls might_sleep() function, which can cause a task reschedule. A task reschedule (switch_to()) reclaim and recheckpoint the TM states, but, in the signal return path, the checkpointed memory was already reclaimed, thus the exception stack has MSR that points to MSR[TS]=0. When the code returns from might_sleep() and a task reschedule happened, then this task is returned with the memory recheckpointed, and CPU MSR[TS] = suspended. This means that there is a side effect at might_sleep() if it is called with CPU MSR[TS] = 0 and the task has regs->msr[TS] != 0. This side effect can cause a TM bad thing, since at the exception entrance, the stack saves MSR[TS]=0, and this is what will be used at RFID, but, the processor has MSR[TS] = Suspended, and this transition will be invalid and a TM Bad thing will be raised, causing the following crash: Unexpected TM Bad Thing exception at c00000000000e9ec (msr 0x8000000302a03031) tm_scratch=800000010280b033 cpu 0xc: Vector: 700 (Program Check) at [c00000003ff1fd70] pc: c00000000000e9ec: fast_exception_return+0x100/0x1bc lr: c000000000032948: handle_rt_signal64+0xb8/0xaf0 sp: c0000004263ebc40 msr: 8000000302a03031 current = 0xc000000415050300 paca = 0xc00000003ffc4080 irqmask: 0x03 irq_happened: 0x01 pid = 25006, comm = sigfuz Linux version 5.0.0-rc1-00001-g3bd6e94bec12 (breno@debian) (gcc version 8.2.0 (Debian 8.2.0-3)) #899 SMP Mon Jan 7 11:30:07 EST 2019 WARNING: exception is not recoverable, can't continue enter ? for help [c0000004263ebc40] c000000000032948 handle_rt_signal64+0xb8/0xaf0 (unreliable) [c0000004263ebd30] c000000000022780 do_notify_resume+0x2f0/0x430 [c0000004263ebe20] c00000000000e844 ret_from_except_lite+0x70/0x74 --- Exception: c00 (System Call) at 00007fffbaac400c SP (7fffeca90f40) is in userspace The solution for this problem is running the sigreturn code with regs->msr[TS] disabled, thus, avoiding hitting the side effect above. This does not seem to be a problem since regs->msr will be replaced by the ucontext value, so, it is being flushed already. In this case, it is flushed earlier. Signed-off-by: Breno Leitao <leitao@debian.org> Acked-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2019-01-16 16:47:44 +00:00
/*
* Disable MSR[TS] bit also, so, if there is an exception in the
* code below (as a page fault in copy_ckvsx_to_user()), it does
* not recheckpoint this task if there was a context switch inside
* the exception.
*
* A major page fault can indirectly call schedule(). A reschedule
* process in the middle of an exception can have a side effect
* (Changing the CPU MSR[TS] state), since schedule() is called
* with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended
* (switch_to() calls tm_recheckpoint() for the 'new' process). In
* this case, the process continues to be the same in the CPU, but
* the CPU state just changed.
*
* This can cause a TM Bad Thing, since the MSR in the stack will
* have the MSR[TS]=0, and this is what will be used to RFID.
*
* Clearing MSR[TS] state here will avoid a recheckpoint if there
* is any process reschedule in kernel space. The MSR[TS] state
* does not need to be saved also, since it will be replaced with
* the MSR[TS] that came from user context later, at
* restore_tm_sigcontexts.
*/
regs->msr &= ~MSR_TS_MASK;
if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
goto badframe;
if (MSR_TM_ACTIVE(msr)) {
/* We recheckpoint on return. */
struct ucontext __user *uc_transact;
if (__get_user(uc_transact, &uc->uc_link))
goto badframe;
if (restore_tm_sigcontexts(current, &uc->uc_mcontext,
&uc_transact->uc_mcontext))
goto badframe;
} else
#endif
{
powerpc/tm: Unset MSR[TS] if not recheckpointing There is a TM Bad Thing bug that can be caused when you return from a signal context in a suspended transaction but with ucontext MSR[TS] unset. This forces regs->msr[TS] to be set at syscall entrance (since the CPU state is transactional). It also calls treclaim() to flush the transaction state, which is done based on the live (mfmsr) MSR state. Since user context MSR[TS] is not set, then restore_tm_sigcontexts() is not called, thus, not executing recheckpoint, keeping the CPU state as not transactional. When calling rfid, SRR1 will have MSR[TS] set, but the CPU state is non transactional, causing the TM Bad Thing with the following stack: [ 33.862316] Bad kernel stack pointer 3fffd9dce3e0 at c00000000000c47c cpu 0x8: Vector: 700 (Program Check) at [c00000003ff7fd40] pc: c00000000000c47c: fast_exception_return+0xac/0xb4 lr: 00003fff865f442c sp: 3fffd9dce3e0 msr: 8000000102a03031 current = 0xc00000041f68b700 paca = 0xc00000000fb84800 softe: 0 irq_happened: 0x01 pid = 1721, comm = tm-signal-sigre Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26) WARNING: exception is not recoverable, can't continue The same problem happens on 32-bits signal handler, and the fix is very similar, if tm_recheckpoint() is not executed, then regs->msr[TS] should be zeroed. This patch also fixes a sparse warning related to lack of indentation when CONFIG_PPC_TRANSACTIONAL_MEM is set. Fixes: 2b0a576d15e0e ("powerpc: Add new transactional memory state to the signal context") CC: Stable <stable@vger.kernel.org> # 3.10+ Signed-off-by: Breno Leitao <leitao@debian.org> Tested-by: Michal Suchánek <msuchanek@suse.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-26 20:12:00 +00:00
/*
* Fall through, for non-TM restore
*
powerpc/tm: Unset MSR[TS] if not recheckpointing There is a TM Bad Thing bug that can be caused when you return from a signal context in a suspended transaction but with ucontext MSR[TS] unset. This forces regs->msr[TS] to be set at syscall entrance (since the CPU state is transactional). It also calls treclaim() to flush the transaction state, which is done based on the live (mfmsr) MSR state. Since user context MSR[TS] is not set, then restore_tm_sigcontexts() is not called, thus, not executing recheckpoint, keeping the CPU state as not transactional. When calling rfid, SRR1 will have MSR[TS] set, but the CPU state is non transactional, causing the TM Bad Thing with the following stack: [ 33.862316] Bad kernel stack pointer 3fffd9dce3e0 at c00000000000c47c cpu 0x8: Vector: 700 (Program Check) at [c00000003ff7fd40] pc: c00000000000c47c: fast_exception_return+0xac/0xb4 lr: 00003fff865f442c sp: 3fffd9dce3e0 msr: 8000000102a03031 current = 0xc00000041f68b700 paca = 0xc00000000fb84800 softe: 0 irq_happened: 0x01 pid = 1721, comm = tm-signal-sigre Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26) WARNING: exception is not recoverable, can't continue The same problem happens on 32-bits signal handler, and the fix is very similar, if tm_recheckpoint() is not executed, then regs->msr[TS] should be zeroed. This patch also fixes a sparse warning related to lack of indentation when CONFIG_PPC_TRANSACTIONAL_MEM is set. Fixes: 2b0a576d15e0e ("powerpc: Add new transactional memory state to the signal context") CC: Stable <stable@vger.kernel.org> # 3.10+ Signed-off-by: Breno Leitao <leitao@debian.org> Tested-by: Michal Suchánek <msuchanek@suse.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-26 20:12:00 +00:00
* Unset MSR[TS] on the thread regs since MSR from user
* context does not have MSR active, and recheckpoint was
* not called since restore_tm_sigcontexts() was not called
* also.
*
* If not unsetting it, the code can RFID to userspace with
* MSR[TS] set, but without CPU in the proper state,
* causing a TM bad thing.
*/
current->thread.regs->msr &= ~MSR_TS_MASK;
if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
goto badframe;
}
if (restore_altstack(&uc->uc_stack))
goto badframe;
[PATCH] syscall entry/exit revamp This cleanup patch speeds up the null syscall path on ppc64 by about 3%, and brings the ppc32 and ppc64 code slightly closer together. The ppc64 code was checking current_thread_info()->flags twice in the syscall exit path; once for TIF_SYSCALL_T_OR_A before disabling interrupts, and then again for TIF_SIGPENDING|TIF_NEED_RESCHED etc after disabling interrupts. Now we do the same as ppc32 -- check the flags only once in the fast path, and re-enable interrupts if necessary in the ptrace case. The patch abolishes the 'syscall_noerror' member of struct thread_info and replaces it with a TIF_NOERROR bit in the flags, which is handled in the slow path. This shortens the syscall entry code, which no longer needs to clear syscall_noerror. The patch adds a TIF_SAVE_NVGPRS flag which causes the syscall exit slow path to save the non-volatile GPRs into a signal frame. This removes the need for the assembly wrappers around sys_sigsuspend(), sys_rt_sigsuspend(), et al which existed solely to save those registers in advance. It also means I don't have to add new wrappers for ppoll() and pselect(), which is what I was supposed to be doing when I got distracted into this... Finally, it unifies the ppc64 and ppc32 methods of handling syscall exit directly into a signal handler (as required by sigsuspend et al) by introducing a TIF_RESTOREALL flag which causes _all_ the registers to be reloaded from the pt_regs by taking the ret_from_exception path, instead of the normal syscall exit path which stomps on the callee-saved GPRs. It appears to pass an LTP test run on ppc64, and passes basic testing on ppc32 too. Brief tests of ptrace functionality with strace and gdb also appear OK. I wouldn't send it to Linus for 2.6.15 just yet though :) Signed-off-by: David Woodhouse <dwmw2@infradead.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
2005-11-15 18:52:18 +00:00
set_thread_flag(TIF_RESTOREALL);
return 0;
badframe:
if (show_unhandled_signals)
printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
current->comm, current->pid, "rt_sigreturn",
(long)uc, regs->nip, regs->link);
force_sig(SIGSEGV, current);
return 0;
}
int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
struct task_struct *tsk)
{
struct rt_sigframe __user *frame;
unsigned long newsp = 0;
long err = 0;
struct pt_regs *regs = tsk->thread.regs;
BUG_ON(tsk != current);
frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
if (unlikely(frame == NULL))
goto badframe;
err |= __put_user(&frame->info, &frame->pinfo);
err |= __put_user(&frame->uc, &frame->puc);
err |= copy_siginfo_to_user(&frame->info, &ksig->info);
if (err)
goto badframe;
/* Create the ucontext. */
err |= __put_user(0, &frame->uc.uc_flags);
err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (MSR_TM_ACTIVE(regs->msr)) {
/* The ucontext_t passed to userland points to the second
* ucontext_t (for transactional state) with its uc_link ptr.
*/
err |= __put_user(&frame->uc_transact, &frame->uc.uc_link);
err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
&frame->uc_transact.uc_mcontext,
tsk, ksig->sig, NULL,
(unsigned long)ksig->ka.sa.sa_handler);
} else
#endif
{
err |= __put_user(0, &frame->uc.uc_link);
err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
NULL, (unsigned long)ksig->ka.sa.sa_handler,
1);
}
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
if (err)
goto badframe;
/* Make sure signal handler doesn't get spurious FP exceptions */
tsk->thread.fp_state.fpscr = 0;
/* Set up to return from userspace. */
if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) {
regs->link = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
} else {
err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
if (err)
goto badframe;
regs->link = (unsigned long) &frame->tramp[0];
}
/* Allocate a dummy caller frame for the signal handler. */
newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
/* Set up "regs" so we "return" to the signal handler. */
if (is_elf2_task()) {
regs->nip = (unsigned long) ksig->ka.sa.sa_handler;
regs->gpr[12] = regs->nip;
} else {
/* Handler is *really* a pointer to the function descriptor for
* the signal routine. The first entry in the function
* descriptor is the entry address of signal and the second
* entry is the TOC value we need to use.
*/
func_descr_t __user *funct_desc_ptr =
(func_descr_t __user *) ksig->ka.sa.sa_handler;
err |= get_user(regs->nip, &funct_desc_ptr->entry);
err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
}
/* enter the signal handler in native-endian mode */
regs->msr &= ~MSR_LE;
regs->msr |= (MSR_KERNEL & MSR_LE);
regs->gpr[1] = newsp;
regs->gpr[3] = ksig->sig;
regs->result = 0;
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo);
err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc);
regs->gpr[6] = (unsigned long) frame;
} else {
regs->gpr[4] = (unsigned long)&frame->uc.uc_mcontext;
}
if (err)
goto badframe;
return 0;
badframe:
if (show_unhandled_signals)
printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
tsk->comm, tsk->pid, "setup_rt_frame",
(long)frame, regs->nip, regs->link);
return 1;
}