linux/arch/x86/include/asm/switch_to.h
Lai Jiangshan 1591584e2e x86/process/64: Move cpu_current_top_of_stack out of TSS
cpu_current_top_of_stack is currently stored in TSS.sp1. TSS is exposed
through the cpu_entry_area which is visible with user CR3 when PTI is
enabled and active.

This makes it an attractive target for attackers.  An attacker can read the
kernel stack top from it and base further attacks against the kernel stack
on that knowledge.

But it actually does not need to be stored in the TSS.  It is only
accessed after the entry code has switched to the kernel CR3 and the kernel
GS_BASE, which means it can live in any regular percpu variable.
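
As a minimal sketch of what that amounts to (not the verbatim patch;
assuming the existing current_top_of_stack() helper in asm/processor.h
stays the reader of the variable):

	/* asm/processor.h -- sketch */
	DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);

	static __always_inline unsigned long current_top_of_stack(void)
	{
		/* Read the plain percpu variable instead of TSS.sp1: */
		return this_cpu_read_stable(cpu_current_top_of_stack);
	}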

The reason why it is in the TSS is historical (pre-PTI): the TSS is also
used as scratch space in SYSCALL_64 and is therefore cache hot.

A syscall also needs the per-CPU variable current_task and eventually
__preempt_count, so placing cpu_current_top_of_stack next to them makes it
likely that they end up in the same cache line, which should avoid
performance regressions. This is not enforced, as the compiler is free to
place these variables, so these entry-relevant variables should eventually
move into a dedicated data structure to make the placement enforceable.
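
A sketch of the intended placement, assuming the definition sits next to
current_task in arch/x86/kernel/cpu/common.c (as noted above, this
proximity is a likelihood, not a guarantee):

	/* arch/x86/kernel/cpu/common.c -- sketch, 64-bit section */
	DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
		&init_task;
	EXPORT_PER_CPU_SYMBOL(current_task);

	DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;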

The seccomp_benchmark does not show any performance loss in the "getpid
native" test.  In fact, the result improves from 93ns to 92ns with this
change when KPTI is disabled. The test is very stable, and while it does
not offer a higher degree of precision, it gives enough confidence that
moving cpu_current_top_of_stack does not cause a regression.

[ tglx: Removed unneeded export. Massaged changelog ]

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210125173444.22696-2-jiangshanlai@gmail.com
2021-03-28 22:40:10 +02:00

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SWITCH_TO_H
#define _ASM_X86_SWITCH_TO_H

#include <linux/sched/task_stack.h>

struct task_struct; /* one of the stranger aspects of C forward declarations */

struct task_struct *__switch_to_asm(struct task_struct *prev,
				    struct task_struct *next);

__visible struct task_struct *__switch_to(struct task_struct *prev,
					  struct task_struct *next);

asmlinkage void ret_from_fork(void);

/*
 * This is the structure pointed to by thread.sp for an inactive task.  The
 * order of the fields must match the code in __switch_to_asm().
 */
struct inactive_task_frame {
#ifdef CONFIG_X86_64
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
#else
	unsigned long flags;
	unsigned long si;
	unsigned long di;
#endif
	unsigned long bx;

	/*
	 * These two fields must be together.  They form a stack frame header,
	 * needed by get_frame_pointer().
	 */
	unsigned long bp;
	unsigned long ret_addr;
};
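
/*
 * Stack layout that copy_thread() sets up for a new task: the
 * inactive_task_frame is what __switch_to_asm() unwinds on the first
 * switch to the task; the pt_regs sit right above it.
 */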
struct fork_frame {
	struct inactive_task_frame frame;
	struct pt_regs regs;
};
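
/*
 * switch_to() is invoked from context_switch() with prev == current.  The
 * value returned by __switch_to_asm() reaches 'last' only once 'prev' is
 * scheduled back in, and then names the task that ran right before it.
 */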
#define switch_to(prev, next, last)					\
do {									\
	((last) = __switch_to_asm((prev), (next)));			\
} while (0)

#ifdef CONFIG_X86_32
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
	if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
		return;

	this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
#endif

/* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_task_stack(struct task_struct *task)
{
	/* sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
	if (static_cpu_has(X86_FEATURE_XENPV))
		load_sp0(task->thread.sp0);
	else
		this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
	/* Xen PV enters the kernel on the thread stack. */
	if (static_cpu_has(X86_FEATURE_XENPV))
		load_sp0(task_top_of_stack(task));
#endif
}
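
/*
 * Prime the switch frame of a kernel thread: when it first runs,
 * ret_from_fork() calls the function stashed in bx with the argument
 * passed in di (32-bit) or r12 (64-bit).
 */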
static inline void kthread_frame_init(struct inactive_task_frame *frame,
				      unsigned long fun, unsigned long arg)
{
	frame->bx = fun;
#ifdef CONFIG_X86_32
	frame->di = arg;
#else
	frame->r12 = arg;
#endif
}

#endif /* _ASM_X86_SWITCH_TO_H */