x86: Entry cleanups and a bugfix for 3.20
This fixes a bug in the RCU code I added in ist_enter. It also includes
the sysret stuff discussed here:

  http://lkml.kernel.org/g/cover.1421453410.git.luto%40amacapital.net

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1

iQEcBAABAgAGBQJUzhZ0AAoJEK9N98ZeDfrksUEH/j7wkUlMGan5h1AQIZQW6gKk
OjlE1a4rfcgKocgkc0ix6UMc8Ks/NAUWKpeHR08eqR+Xi6Yk29cqLkboTEmAdYJ3
jQvKjGu51kiprNjAGqF5wdqxvCT3oBSdm7CWdtY4zHkEr+2W93Ht9PM7xZhj4r+P
ekUC8mIKQrhyhlC7g7VpXLAi3Bk4mO+f499T7XBVsVoywWpgVpOMYMhtUobV1reW
V7/zul/dMerzNLB0t3amvdgCLphHBQTQ0fHBAN62RY78UvSDt36EZFyS65isirsR
LhO4FpWzF5YNMRk8Dep/fB8jYlhsCi40ZIlOtGSE6kNJyLhPt+oLnkpgOwWAMQc=
=uiRw
-----END PGP SIGNATURE-----

Merge tag 'pr-20150201-x86-entry' of git://git.kernel.org/pub/scm/linux/kernel/git/luto/linux into x86/asm

Pull "x86: Entry cleanups and a bugfix for 3.20" from Andy Lutomirski:

 "This fixes a bug in the RCU code I added in ist_enter. It also
  includes the sysret stuff discussed here:

  http://lkml.kernel.org/g/cover.1421453410.git.luto%40amacapital.net"

Signed-off-by: Ingo Molnar <mingo@kernel.org>
@@ -361,15 +361,12 @@ system_call_fastpath:
  * Has incomplete stack frame and undefined top of stack.
  */
 ret_from_sys_call:
-	movl $_TIF_ALLWORK_MASK,%edi
-	/* edi: flagmask */
-sysret_check:
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	jnz int_ret_from_sys_call_fixup	/* Go to the slow path */
+
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
-	andl %edi,%edx
-	jnz sysret_careful
 	CFI_REMEMBER_STATE
 	/*
 	 * sysretq will re-enable interrupts:
@@ -383,49 +380,10 @@ sysret_check:
 	USERGS_SYSRET64
 
 	CFI_RESTORE_STATE
-	/* Handle reschedules */
-	/* edx: work, edi: workmask */
-sysret_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc sysret_signal
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
-	SCHEDULE_USER
-	popq_cfi %rdi
-	jmp sysret_check
 
-	/* Handle a signal */
-sysret_signal:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
-	bt $TIF_SYSCALL_AUDIT,%edx
-	jc sysret_audit
-#endif
-	/*
-	 * We have a signal, or exit tracing or single-step.
-	 * These all wind up with the iret return path anyway,
-	 * so just join that path right now.
-	 */
+int_ret_from_sys_call_fixup:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-	jmp int_check_syscall_exit_work
+	jmp int_ret_from_sys_call
 
-#ifdef CONFIG_AUDITSYSCALL
-	/*
-	 * Return fast path for syscall audit. Call __audit_syscall_exit()
-	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
-	 * masked off.
-	 */
-sysret_audit:
-	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
-	cmpq $-MAX_ERRNO,%rsi		/* is it < -MAX_ERRNO? */
-	setbe %al			/* 1 if so, 0 if not */
-	movzbl %al,%edi			/* zero-extend that into %edi */
-	call __audit_syscall_exit
-	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-	jmp sysret_check
-#endif	/* CONFIG_AUDITSYSCALL */
-
 	/* Do syscall tracing */
 tracesys:
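
In C terms, the rewritten fast path above boils down to a single test of the thread-info work flags: if any work bit is set, control jumps to int_ret_from_sys_call_fixup and from there to the generic int_ret_from_sys_call slow path, instead of the removed sysret_careful/sysret_signal/sysret_audit special cases. A minimal sketch of that decision follows; the mask value and struct layout are illustrative, not the kernel's real definitions.

/*
 * Sketch of the syscall-exit decision made by the testl/jnz pair above.
 * The mask value and thread_info layout are placeholders; in the kernel
 * the flags live in the thread_info block reachable from %rsp.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TIF_ALLWORK_MASK 0x0000ffffu	/* placeholder, not the real mask */

struct thread_info {
	uint32_t flags;			/* TIF_* work bits */
};

/* true: take int_ret_from_sys_call_fixup (the generic slow path) */
static bool need_slow_syscall_exit(const struct thread_info *ti)
{
	return (ti->flags & TIF_ALLWORK_MASK) != 0;
}

int main(void)
{
	struct thread_info ti = { .flags = 0 };

	printf("no work pending -> slow path? %d\n", need_slow_syscall_exit(&ti));
	ti.flags |= 1;			/* pretend e.g. a signal is pending */
	printf("work bit set    -> slow path? %d\n", need_slow_syscall_exit(&ti));
	return 0;
}

The point of the change is simplification: pending signals, reschedules, audit and tracing no longer each get a hand-rolled assembly exit; anything with a work bit set funnels through the one generic slow path.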
@@ -794,6 +752,60 @@ retint_swapgs: /* return to user-space */
 	 */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
+
+	/*
+	 * Try to use SYSRET instead of IRET if we're returning to
+	 * a completely clean 64-bit userspace context.
+	 */
+	movq (RCX-R11)(%rsp), %rcx
+	cmpq %rcx,(RIP-R11)(%rsp)		/* RCX == RIP */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+	 * in kernel space. This essentially lets the user take over
+	 * the kernel, since userspace controls RSP. It's not worth
+	 * testing for canonicalness exactly -- this check detects any
+	 * of the 17 high bits set, which is true for non-canonical
+	 * or kernel addresses. (This will pessimize vsyscall=native.
+	 * Big deal.)
+	 *
+	 * If virtual addresses ever become wider, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 */
+	.ifne __VIRTUAL_MASK_SHIFT - 47
+	.error "virtual address width changed -- sysret checks need update"
+	.endif
+	shr $__VIRTUAL_MASK_SHIFT, %rcx
+	jnz opportunistic_sysret_failed
+
+	cmpq $__USER_CS,(CS-R11)(%rsp)		/* CS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	movq (R11-ARGOFFSET)(%rsp), %r11
+	cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)	/* R11 == RFLAGS */
+	jne opportunistic_sysret_failed
+
+	testq $X86_EFLAGS_RF,%r11		/* sysret can't restore RF */
+	jnz opportunistic_sysret_failed
+
+	/* nothing to check for RSP */
+
+	cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)	/* SS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * We win! This label is here just for ease of understanding
+	 * perf profiles. Nothing jumps here.
+	 */
+irq_return_via_sysret:
+	CFI_REMEMBER_STATE
+	RESTORE_ARGS 1,8,1
+	movq (RSP-RIP)(%rsp),%rsp
+	USERGS_SYSRET64
+	CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
 	SWAPGS
 	jmp restore_args
 
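
Restated as a C predicate, the checks the new opportunistic-SYSRET path performs before trusting SYSRET look roughly like the sketch below. The struct and constant names are simplified stand-ins for the saved pt_regs frame and the kernel's selector/flag definitions; the real code operates directly on the frame at %rsp.

/*
 * Hedged C restatement of the "can we return with SYSRET?" checks added
 * in the hunk above. saved_frame and the macros are illustrative, not
 * kernel definitions.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VIRTUAL_MASK_SHIFT 47u		/* 48-bit user virtual addresses */
#define USER_CS 0x33u			/* x86-64 Linux user code selector */
#define USER_DS 0x2bu			/* x86-64 Linux user data selector */
#define EFLAGS_RF (1u << 16)		/* resume flag */

struct saved_frame {
	uint64_t rip, rcx, r11, rflags, cs, ss;
};

static bool can_return_via_sysret(const struct saved_frame *f)
{
	/* SYSRET reloads RIP from RCX and RFLAGS from R11; both must match. */
	if (f->rcx != f->rip || f->r11 != f->rflags)
		return false;

	/*
	 * Any of the 17 high bits set means RCX is non-canonical or a kernel
	 * address; SYSRET with such an RCX would #GP in kernel mode on Intel
	 * CPUs, so fall back to IRET.
	 */
	if (f->rcx >> VIRTUAL_MASK_SHIFT)
		return false;

	/* SYSRET hard-codes the flat user segments. */
	if (f->cs != USER_CS || f->ss != USER_DS)
		return false;

	/* SYSRET cannot restore the resume flag. */
	if (f->rflags & EFLAGS_RF)
		return false;

	return true;			/* nothing to check for RSP */
}

int main(void)
{
	struct saved_frame clean = {
		.rip = 0x400080, .rcx = 0x400080,
		.r11 = 0x202,    .rflags = 0x202,
		.cs  = USER_CS,  .ss = USER_DS,
	};
	struct saved_frame kernel_rip = clean;

	kernel_rip.rip = kernel_rip.rcx = 0xffffffff81000000ull;

	printf("clean frame      -> sysret ok? %d\n", can_return_via_sysret(&clean));
	printf("kernel RIP frame -> sysret ok? %d\n", can_return_via_sysret(&kernel_rip));
	return 0;
}

RSP deliberately gets no check: it is reloaded straight from the saved frame (movq (RSP-RIP)(%rsp),%rsp) immediately before USERGS_SYSRET64, so no validation is needed.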
@@ -110,15 +110,11 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 
 enum ctx_state ist_enter(struct pt_regs *regs)
 {
-	/*
-	 * We are atomic because we're on the IST stack (or we're on x86_32,
-	 * in which case we still shouldn't schedule.
-	 */
-	preempt_count_add(HARDIRQ_OFFSET);
+	enum ctx_state prev_state;
 
 	if (user_mode_vm(regs)) {
 		/* Other than that, we're just an exception. */
-		return exception_enter();
+		prev_state = exception_enter();
 	} else {
 		/*
 		 * We might have interrupted pretty much anything. In
@@ -127,12 +123,27 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		return IN_KERNEL; /* the value is irrelevant. */
+		prev_state = IN_KERNEL; /* the value is irrelevant. */
 	}
+
+	/*
+	 * We are atomic because we're on the IST stack (or we're on x86_32,
+	 * in which case we still shouldn't schedule).
+	 *
+	 * This must be after exception_enter(), because exception_enter()
+	 * won't do anything if in_interrupt() returns true.
+	 */
+	preempt_count_add(HARDIRQ_OFFSET);
+
+	/* This code is a bit fragile. Test it. */
+	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+
+	return prev_state;
 }
 
 void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
 {
+	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
 	if (user_mode_vm(regs))
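
The ordering comment added in the last hunk ("This must be after exception_enter(), because exception_enter() won't do anything if in_interrupt() returns true") is the core of the ist_enter bugfix named in the pull request text. Below is a stand-alone toy model of that interaction; every name and value is a simplified stand-in, not kernel code.

/*
 * Toy model of why preempt_count_add(HARDIRQ_OFFSET) must come after
 * exception_enter(): once the hardirq count is non-zero, in_interrupt()
 * is true and exception_enter() refuses to do anything, so RCU is never
 * told that the CPU left user mode.
 */
#include <stdbool.h>
#include <stdio.h>

#define HARDIRQ_OFFSET 0x10000

static int preempt_count;
static bool rcu_sees_kernel;		/* has RCU been told we entered the kernel? */

static bool in_interrupt(void)
{
	return preempt_count != 0;
}

static void exception_enter(void)
{
	if (in_interrupt())
		return;			/* the "won't do anything" case */
	rcu_sees_kernel = true;
}

static void ist_enter_model(bool fixed_order)
{
	preempt_count = 0;
	rcu_sees_kernel = false;

	if (fixed_order) {
		exception_enter();			/* new code: notify RCU first */
		preempt_count += HARDIRQ_OFFSET;
	} else {
		preempt_count += HARDIRQ_OFFSET;	/* old code: go atomic first... */
		exception_enter();			/* ...so this bails out */
	}

	printf("%s order: RCU %s the entry\n",
	       fixed_order ? "fixed" : "old",
	       rcu_sees_kernel ? "sees" : "misses");
}

int main(void)
{
	ist_enter_model(false);
	ist_enter_model(true);
	return 0;
}

The rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work") added at the end of ist_enter() guards exactly this ordering: if it ever regresses, the assertion fires instead of RCU silently losing track of the CPU.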