76f5df43ca
The 64-bit entry code was using six stack slots less by not
saving/restoring registers which are callee-preserved according
to the C ABI, and was not allocating space for them.

Only when syscalls needed a complete "struct pt_regs" was
the complete area allocated and filled in.

As an additional twist, on interrupt entry a "slightly less
truncated pt_regs" trick is used, to make nested interrupt
stacks easier to unwind.

This proved to be a source of significant obfuscation and subtle
bugs. For example, 'stub_fork' had to pop the return address,
extend the struct, save registers, and push return address back.
Ugly. 'ia32_ptregs_common' pops return address and "returns" via
jmp insn, throwing a wrench into CPU return stack cache.

This patch changes the code to always allocate a complete
"struct pt_regs" on the kernel stack. The saving of registers
is still done lazily.

"Partial pt_regs" trick on interrupt stack is retained.

Macros which manipulate "struct pt_regs" on stack are reworked:

 - ALLOC_PT_GPREGS_ON_STACK allocates the structure.

 - SAVE_C_REGS saves to it those registers which are clobbered
   by C code.

 - SAVE_EXTRA_REGS saves to it all other registers.

 - Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros
   reverse it.

'ia32_ptregs_common', 'stub_fork' and friends lost their ugly dance
with the return pointer.

LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
instead of magic numbers.

'error_entry' and 'save_paranoid' now use SAVE_C_REGS +
SAVE_EXTRA_REGS instead of having it open-coded yet again.

Patch was run-tested: 64-bit executables, 32-bit executables,
strace works. Timing tests did not show measurable difference in
32-bit and 64-bit syscalls.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1423778052-21038-2-git-send-email-dvlasenk@redhat.com
Link: http://lkml.kernel.org/r/b89763d354aa23e670b9bdf3a40ae320320a7c2e.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
210 lines
4.0 KiB
C
210 lines
4.0 KiB
C
#ifndef _X86_IRQFLAGS_H_
|
|
#define _X86_IRQFLAGS_H_
|
|
|
|
#include <asm/processor-flags.h>
|
|
|
|
#ifndef __ASSEMBLY__
|
|
/*
|
|
* Interrupt control:
|
|
*/
|
|
|
|
static inline unsigned long native_save_fl(void)
|
|
{
|
|
unsigned long flags;
|
|
|
|
/*
|
|
* "=rm" is safe here, because "pop" adjusts the stack before
|
|
* it evaluates its effective address -- this is part of the
|
|
* documented behavior of the "pop" instruction.
|
|
*/
|
|
asm volatile("# __raw_save_flags\n\t"
|
|
"pushf ; pop %0"
|
|
: "=rm" (flags)
|
|
: /* no input */
|
|
: "memory");
|
|
|
|
return flags;
|
|
}
|
|
|
|
static inline void native_restore_fl(unsigned long flags)
|
|
{
|
|
asm volatile("push %0 ; popf"
|
|
: /* no output */
|
|
:"g" (flags)
|
|
:"memory", "cc");
|
|
}
|
|
|
|
static inline void native_irq_disable(void)
|
|
{
|
|
asm volatile("cli": : :"memory");
|
|
}
|
|
|
|
static inline void native_irq_enable(void)
|
|
{
|
|
asm volatile("sti": : :"memory");
|
|
}
|
|
|
|
static inline void native_safe_halt(void)
|
|
{
|
|
asm volatile("sti; hlt": : :"memory");
|
|
}
|
|
|
|
static inline void native_halt(void)
|
|
{
|
|
asm volatile("hlt": : :"memory");
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_PARAVIRT
|
|
#include <asm/paravirt.h>
|
|
#else
|
|
#ifndef __ASSEMBLY__
|
|
#include <linux/types.h>
|
|
|
|
/*
 * Non-paravirt implementation: return the flags value read by
 * native_save_fl().
 */
static inline notrace unsigned long arch_local_save_flags(void)
{
	return native_save_fl();
}
|
|
|
|
/*
 * Non-paravirt implementation: hand @flags to native_restore_fl().
 */
static inline notrace void arch_local_irq_restore(unsigned long flags)
{
	native_restore_fl(flags);
}
|
|
|
|
/*
 * Non-paravirt implementation: forwards to native_irq_disable().
 */
static inline notrace void arch_local_irq_disable(void)
{
	native_irq_disable();
}
|
|
|
|
/*
 * Non-paravirt implementation: forwards to native_irq_enable().
 */
static inline notrace void arch_local_irq_enable(void)
{
	native_irq_enable();
}
|
|
|
|
/*
 * Halt variant used in the idle loop.  Forwards to native_safe_halt();
 * note that sti takes one instruction cycle to complete.
 */
static inline void arch_safe_halt(void)
{
	native_safe_halt();
}
|
|
|
|
/*
 * Plain halt, forwarding to native_halt().  Intended for callers that
 * already have interrupts enabled, or that want to shut the processor
 * down.
 */
static inline void halt(void)
{
	native_halt();
}
|
|
|
|
/*
|
|
* For spinlocks, etc:
|
|
*/
|
|
static inline notrace unsigned long arch_local_irq_save(void)
|
|
{
|
|
unsigned long flags = arch_local_save_flags();
|
|
arch_local_irq_disable();
|
|
return flags;
|
|
}
|
|
#else
|
|
|
|
/*
 * Assembly-side interrupt control for the non-paravirt build.  The macro
 * argument is accepted but not used in the expansion.
 */
#define ENABLE_INTERRUPTS(x)	sti
#define DISABLE_INTERRUPTS(x)	cli
|
|
|
|
#ifdef CONFIG_X86_64
#define SWAPGS				swapgs
/*
 * Paravirt currently cannot handle swapgs nicely when there is no
 * stack we can rely on (for example a user space stack).  So we
 * either find a way around these cases, or just fault and emulate
 * if a guest tries to call swapgs directly.
 *
 * Either way, using this name documents that we do not have a
 * reliable stack at that point.  x86_64 only.
 */
#define SWAPGS_UNSAFE_STACK		swapgs

/* Expands to nothing in the native build. */
#define PARAVIRT_ADJUST_EXCEPTION_FRAME	/*  */

#define INTERRUPT_RETURN		jmp native_iret
#define USERGS_SYSRET64			swapgs; sysretq;
#define USERGS_SYSRET32			swapgs; sysretl
#define ENABLE_INTERRUPTS_SYSEXIT32	swapgs; sti; sysexit

#else /* !CONFIG_X86_64 */
#define INTERRUPT_RETURN		iret
#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
#define GET_CR0_INTO_EAX		movl %cr0, %eax
#endif /* CONFIG_X86_64 */
|
|
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
#endif /* CONFIG_PARAVIRT */
|
|
|
|
#ifndef __ASSEMBLY__
|
|
static inline int arch_irqs_disabled_flags(unsigned long flags)
|
|
{
|
|
return !(flags & X86_EFLAGS_IF);
|
|
}
|
|
|
|
/*
 * Sample the current flags with arch_local_save_flags() and report
 * whether they indicate that interrupts are disabled.
 */
static inline int arch_irqs_disabled(void)
{
	return arch_irqs_disabled_flags(arch_local_save_flags());
}
|
|
|
|
#else
|
|
|
|
#ifdef CONFIG_X86_64
#define ARCH_LOCKDEP_SYS_EXIT		call lockdep_sys_exit_thunk
/*
 * Run LOCKDEP_SYS_EXIT with interrupts enabled: trace and enable
 * interrupts, bracket the call with SAVE_EXTRA_REGS/RESTORE_EXTRA_REGS,
 * then disable interrupts again and trace that.
 */
#define ARCH_LOCKDEP_SYS_EXIT_IRQ		\
	TRACE_IRQS_ON;				\
	sti;					\
	SAVE_EXTRA_REGS;			\
	LOCKDEP_SYS_EXIT;			\
	RESTORE_EXTRA_REGS;			\
	cli;					\
	TRACE_IRQS_OFF;

#else /* !CONFIG_X86_64 */
/*
 * Call lockdep_sys_exit with %eax, %ecx and %edx saved on the stack
 * beforehand and popped back, in reverse order, afterwards.
 */
#define ARCH_LOCKDEP_SYS_EXIT			\
	pushl %eax;				\
	pushl %ecx;				\
	pushl %edx;				\
	call lockdep_sys_exit;			\
	popl %edx;				\
	popl %ecx;				\
	popl %eax;

#define ARCH_LOCKDEP_SYS_EXIT_IRQ
#endif /* CONFIG_X86_64 */
|
|
|
|
/*
 * Assembly hooks for irq-flags tracing: calls to the tracing thunks when
 * CONFIG_TRACE_IRQFLAGS is set, empty otherwise.
 */
#ifdef CONFIG_TRACE_IRQFLAGS
#define TRACE_IRQS_ON		call trace_hardirqs_on_thunk;
#define TRACE_IRQS_OFF		call trace_hardirqs_off_thunk;
#else
#define TRACE_IRQS_ON
#define TRACE_IRQS_OFF
#endif /* CONFIG_TRACE_IRQFLAGS */
|
|
/*
 * The lockdep sys-exit hooks map to the arch-specific definitions only
 * when CONFIG_DEBUG_LOCK_ALLOC is set; otherwise they expand to nothing.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define LOCKDEP_SYS_EXIT	ARCH_LOCKDEP_SYS_EXIT
#define LOCKDEP_SYS_EXIT_IRQ	ARCH_LOCKDEP_SYS_EXIT_IRQ
#else
#define LOCKDEP_SYS_EXIT
#define LOCKDEP_SYS_EXIT_IRQ
#endif /* CONFIG_DEBUG_LOCK_ALLOC */
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
#endif
|