linux/arch/powerpc/kernel/vdso64/sigtramp.S
Nicholas Piggin 0138ba5783 powerpc/64/signal: Balance return predictor stack in signal trampoline
Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:

    # interrupt at user address xyz
    # kernel stuff... signal is raised
    rfid
    # void handler(int sig)
    addis 2,12,.TOC.-.LCF0@ha
    addi 2,2,.TOC.-.LCF0@l
    mflr 0
    std 0,16(1)
    stdu 1,-96(1)
    # handler stuff
    ld 0,16(1)
    mtlr 0
    blr
    # __kernel_sigtramp_rt64
    addi    r1,r1,__SIGNAL_FRAMESIZE
    li      r0,__NR_rt_sigreturn
    sc
    # kernel executes rt_sigreturn
    rfid
    # back to user address xyz

Note the blr with no matching bl. This can corrupt the return
predictor.

Solve this by instead resuming execution at the signal trampoline
which then calls the signal handler. qtrace-tools link_stack checker
confirms the entire user/kernel/vdso cycle is balanced after this
patch, whereas it's not upstream.

Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it
break inside a signal handler.

Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a
microbenchmark:

 Performance counter stats for './signal':

       13,085.72 msec task-clock                #    1.000 CPUs utilized
  45,024,760,101      cycles                    #    3.441 GHz
  65,102,895,542      instructions              #    1.45  insn per cycle
  11,271,673,787      branches                  #  861.372 M/sec
      59,468,979      branch-misses             #    0.53% of all branches

       12,989.09 msec task-clock                #    1.000 CPUs utilized
  44,692,719,559      cycles                    #    3.441 GHz
  65,109,984,964      instructions              #    1.46  insn per cycle
  11,282,136,057      branches                  #  868.585 M/sec
      39,786,942      branch-misses             #    0.35% of all branches

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200511101952.1463138-1-npiggin@gmail.com
2020-07-15 11:08:27 +10:00

305 lines
9.8 KiB
ArmAsm

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Signal trampoline for 64 bits processes in a ppc64 kernel for
* use in the vDSO
*
* Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
* Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
*/
#include <asm/cache.h> /* IFETCH_ALIGN_BYTES */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/unistd.h>
#include <asm/vdso.h>
#include <asm/ptrace.h> /* XXX for __SIGNAL_FRAMESIZE */
.text
.balign 8
.balign IFETCH_ALIGN_BYTES
V_FUNCTION_BEGIN(__kernel_sigtramp_rt64)
.Lsigrt_start:
bctrl /* call the handler */
addi r1, r1, __SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
.Lsigrt_end:
V_FUNCTION_END(__kernel_sigtramp_rt64)
/* The .balign 8 above and the following zeros mimic the old stack
trampoline layout. The last magic value is the ucontext pointer,
chosen in such a way that older libgcc unwind code returns a zero
for a sigcontext pointer. */
.long 0,0,0
.quad 0,-21*8
/* Register r1 can be found at offset 8 of a pt_regs structure.
A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */
#define cfa_save \
.byte 0x0f; /* DW_CFA_def_cfa_expression */ \
.uleb128 9f - 1f; /* length */ \
1: \
.byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \
.byte 0x06; /* DW_OP_deref */ \
.byte 0x23; .uleb128 RSIZE; /* DW_OP_plus_uconst */ \
.byte 0x06; /* DW_OP_deref */ \
9:
/* Register REGNO can be found at offset OFS of a pt_regs structure.
A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */
#define rsave(regno, ofs) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno; /* regno */ \
.uleb128 9f - 1f; /* length */ \
1: \
.byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \
.byte 0x06; /* DW_OP_deref */ \
.ifne ofs; \
.byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \
.endif; \
9:
/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
of the VMX reg struct. A pointer to the VMX reg struct is at VREGS in
the pt_regs struct. This macro is for REGNO == 0, and contains
'subroutines' that the other macros jump to. */
#define vsave_msr0(regno) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno + 77; /* regno */ \
.uleb128 9f - 1f; /* length */ \
1: \
.byte 0x30 + regno; /* DW_OP_lit0 */ \
2: \
.byte 0x40; /* DW_OP_lit16 */ \
.byte 0x1e; /* DW_OP_mul */ \
3: \
.byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \
.byte 0x06; /* DW_OP_deref */ \
.byte 0x12; /* DW_OP_dup */ \
.byte 0x23; /* DW_OP_plus_uconst */ \
.uleb128 33*RSIZE; /* msr offset */ \
.byte 0x06; /* DW_OP_deref */ \
.byte 0x0c; .long 1 << 25; /* DW_OP_const4u */ \
.byte 0x1a; /* DW_OP_and */ \
.byte 0x12; /* DW_OP_dup, ret 0 if bra taken */ \
.byte 0x30; /* DW_OP_lit0 */ \
.byte 0x29; /* DW_OP_eq */ \
.byte 0x28; .short 0x7fff; /* DW_OP_bra to end */ \
.byte 0x13; /* DW_OP_drop, pop the 0 */ \
.byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \
.byte 0x06; /* DW_OP_deref */ \
.byte 0x22; /* DW_OP_plus */ \
.byte 0x2f; .short 0x7fff; /* DW_OP_skip to end */ \
9:
/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
of the VMX reg struct. REGNO is 1 thru 31. */
#define vsave_msr1(regno) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno + 77; /* regno */ \
.uleb128 9f - 1f; /* length */ \
1: \
.byte 0x30 + regno; /* DW_OP_lit n */ \
.byte 0x2f; .short 2b - 9f; /* DW_OP_skip */ \
9:
/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of
the VMX save block. */
#define vsave_msr2(regno, ofs) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno + 77; /* regno */ \
.uleb128 9f - 1f; /* length */ \
1: \
.byte 0x0a; .short ofs; /* DW_OP_const2u */ \
.byte 0x2f; .short 3b - 9f; /* DW_OP_skip */ \
9:
/* VMX register REGNO is at offset OFS of the VMX save area. */
#define vsave(regno, ofs) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno + 77; /* regno */ \
.uleb128 9f - 1f; /* length */ \
1: \
.byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \
.byte 0x06; /* DW_OP_deref */ \
.byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \
.byte 0x06; /* DW_OP_deref */ \
.byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \
9:
/* This is where the pt_regs pointer can be found on the stack. */
#define PTREGS 128+168+56
/* Size of regs. */
#define RSIZE 8
/* Size of CR reg in DWARF unwind info. */
#define CRSIZE 4
/* Offset of CR reg within a full word. */
#ifdef __LITTLE_ENDIAN__
#define CROFF 0
#else
#define CROFF (RSIZE - CRSIZE)
#endif
/* This is the offset of the VMX reg pointer. */
#define VREGS 48*RSIZE+33*8
/* Describe where general purpose regs are saved. */
#define EH_FRAME_GEN \
cfa_save; \
rsave ( 0, 0*RSIZE); \
rsave ( 2, 2*RSIZE); \
rsave ( 3, 3*RSIZE); \
rsave ( 4, 4*RSIZE); \
rsave ( 5, 5*RSIZE); \
rsave ( 6, 6*RSIZE); \
rsave ( 7, 7*RSIZE); \
rsave ( 8, 8*RSIZE); \
rsave ( 9, 9*RSIZE); \
rsave (10, 10*RSIZE); \
rsave (11, 11*RSIZE); \
rsave (12, 12*RSIZE); \
rsave (13, 13*RSIZE); \
rsave (14, 14*RSIZE); \
rsave (15, 15*RSIZE); \
rsave (16, 16*RSIZE); \
rsave (17, 17*RSIZE); \
rsave (18, 18*RSIZE); \
rsave (19, 19*RSIZE); \
rsave (20, 20*RSIZE); \
rsave (21, 21*RSIZE); \
rsave (22, 22*RSIZE); \
rsave (23, 23*RSIZE); \
rsave (24, 24*RSIZE); \
rsave (25, 25*RSIZE); \
rsave (26, 26*RSIZE); \
rsave (27, 27*RSIZE); \
rsave (28, 28*RSIZE); \
rsave (29, 29*RSIZE); \
rsave (30, 30*RSIZE); \
rsave (31, 31*RSIZE); \
rsave (67, 32*RSIZE); /* ap, used as temp for nip */ \
rsave (65, 36*RSIZE); /* lr */ \
rsave (68, 38*RSIZE + CROFF); /* cr fields */ \
rsave (69, 38*RSIZE + CROFF); \
rsave (70, 38*RSIZE + CROFF); \
rsave (71, 38*RSIZE + CROFF); \
rsave (72, 38*RSIZE + CROFF); \
rsave (73, 38*RSIZE + CROFF); \
rsave (74, 38*RSIZE + CROFF); \
rsave (75, 38*RSIZE + CROFF)
/* Describe where the FP regs are saved. */
#define EH_FRAME_FP \
rsave (32, 48*RSIZE + 0*8); \
rsave (33, 48*RSIZE + 1*8); \
rsave (34, 48*RSIZE + 2*8); \
rsave (35, 48*RSIZE + 3*8); \
rsave (36, 48*RSIZE + 4*8); \
rsave (37, 48*RSIZE + 5*8); \
rsave (38, 48*RSIZE + 6*8); \
rsave (39, 48*RSIZE + 7*8); \
rsave (40, 48*RSIZE + 8*8); \
rsave (41, 48*RSIZE + 9*8); \
rsave (42, 48*RSIZE + 10*8); \
rsave (43, 48*RSIZE + 11*8); \
rsave (44, 48*RSIZE + 12*8); \
rsave (45, 48*RSIZE + 13*8); \
rsave (46, 48*RSIZE + 14*8); \
rsave (47, 48*RSIZE + 15*8); \
rsave (48, 48*RSIZE + 16*8); \
rsave (49, 48*RSIZE + 17*8); \
rsave (50, 48*RSIZE + 18*8); \
rsave (51, 48*RSIZE + 19*8); \
rsave (52, 48*RSIZE + 20*8); \
rsave (53, 48*RSIZE + 21*8); \
rsave (54, 48*RSIZE + 22*8); \
rsave (55, 48*RSIZE + 23*8); \
rsave (56, 48*RSIZE + 24*8); \
rsave (57, 48*RSIZE + 25*8); \
rsave (58, 48*RSIZE + 26*8); \
rsave (59, 48*RSIZE + 27*8); \
rsave (60, 48*RSIZE + 28*8); \
rsave (61, 48*RSIZE + 29*8); \
rsave (62, 48*RSIZE + 30*8); \
rsave (63, 48*RSIZE + 31*8)
/* Describe where the VMX regs are saved. */
#ifdef CONFIG_ALTIVEC
#define EH_FRAME_VMX \
vsave_msr0 ( 0); \
vsave_msr1 ( 1); \
vsave_msr1 ( 2); \
vsave_msr1 ( 3); \
vsave_msr1 ( 4); \
vsave_msr1 ( 5); \
vsave_msr1 ( 6); \
vsave_msr1 ( 7); \
vsave_msr1 ( 8); \
vsave_msr1 ( 9); \
vsave_msr1 (10); \
vsave_msr1 (11); \
vsave_msr1 (12); \
vsave_msr1 (13); \
vsave_msr1 (14); \
vsave_msr1 (15); \
vsave_msr1 (16); \
vsave_msr1 (17); \
vsave_msr1 (18); \
vsave_msr1 (19); \
vsave_msr1 (20); \
vsave_msr1 (21); \
vsave_msr1 (22); \
vsave_msr1 (23); \
vsave_msr1 (24); \
vsave_msr1 (25); \
vsave_msr1 (26); \
vsave_msr1 (27); \
vsave_msr1 (28); \
vsave_msr1 (29); \
vsave_msr1 (30); \
vsave_msr1 (31); \
vsave_msr2 (33, 32*16+12); \
vsave (32, 33*16)
#else
#define EH_FRAME_VMX
#endif
.section .eh_frame,"a",@progbits
.Lcie:
.long .Lcie_end - .Lcie_start
.Lcie_start:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zRS" /* NUL-terminated augmentation string */
.uleb128 4 /* Code alignment factor */
.sleb128 -8 /* Data alignment factor */
.byte 67 /* Return address register column, ap */
.uleb128 1 /* Augmentation value length */
.byte 0x14 /* DW_EH_PE_pcrel | DW_EH_PE_udata8. */
.byte 0x0c,1,0 /* DW_CFA_def_cfa: r1 ofs 0 */
.balign 8
.Lcie_end:
.long .Lfde0_end - .Lfde0_start
.Lfde0_start:
.long .Lfde0_start - .Lcie /* CIE pointer. */
.quad .Lsigrt_start - . /* PC start, length */
.quad .Lsigrt_end - .Lsigrt_start
.uleb128 0 /* Augmentation */
EH_FRAME_GEN
EH_FRAME_FP
EH_FRAME_VMX
# Do we really need to describe the frame at this point? ie. will
# we ever have some call chain that returns somewhere past the addi?
# I don't think so, since gcc doesn't support async signals.
# .byte 0x41 /* DW_CFA_advance_loc 1*4 */
#undef PTREGS
#define PTREGS 168+56
# EH_FRAME_GEN
# EH_FRAME_FP
# EH_FRAME_VMX
.balign 8
.Lfde0_end: