/* linux/arch/x86/entry/entry_64_compat.S */

/*
 * Compatibility mode system call entry point for x86-64.
 *
 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
 */
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <linux/linkage.h>
#include <linux/err.h>

	.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
/*
 * Bare-metal "swap GS, then 32-bit SYSRET" sequence.  Only built when
 * CONFIG_PARAVIRT is enabled.
 * NOTE(review): presumably the native backend that USERGS_SYSRET32
 * resolves to under paravirt -- confirm against the paravirt headers.
 */
ENTRY(native_usergs_sysret32)
	swapgs
	sysretl
ENDPROC(native_usergs_sysret32)
#endif

/*
 * 32-bit SYSENTER instruction entry.
 *
 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
 * IF and VM in rflags are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old rip (!!!) and rflags.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 *
 * This is purely a fast path. For anything complicated we use the int 0x80
 * path below. We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSENTER_compat)
	/* Interrupts are off on entry. */
	SWAPGS_UNSAFE_STACK
	/* Point RSP at this CPU's current top-of-stack (per-CPU variable). */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Zero-extending 32-bit regs, do not remove */
	movl	%ebp, %ebp
	movl	%eax, %eax

	/*
	 * SYSENTER does not save the user's return address, so take it from
	 * thread_info->sysenter_return; it is pushed as pt_regs->ip below.
	 */
	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%rbp			/* pt_regs->sp */

	/*
	 * Push flags.  This is nasty.  First, interrupts are currently
	 * off, but we need pt_regs->flags to have IF set.  Second, even
	 * if TF was set when SYSENTER started, it's clear by now.  We fix
	 * that later using TIF_SINGLESTEP.
	 */
	pushfq				/* pt_regs->flags (except IF = 0) */
	orl	$X86_EFLAGS_IF, (%rsp)	/* Fix saved flags */

	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%r10			/* pt_regs->ip = thread_info->sysenter_return */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	cld				/* SYSENTER doesn't filter flags (see below): clear DF by hand */
	sub	$(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */

	/*
	 * Sysenter doesn't filter flags, so we need to clear NT
	 * ourselves.  To save a few cycles, we can check whether
	 * NT was set instead of doing an unconditional popfq.
	 * This needs to happen before enabling interrupts so that
	 * we don't get preempted with NT set.
	 */
	testl	$X86_EFLAGS_NT, EFLAGS(%rsp)
	jnz	sysenter_fix_flags
sysenter_flags_fixed:

	/*
	 * Temporary: SYSENTER is disabled.
	 *
	 * Finish the transition from user mode, turn interrupts on and
	 * call do_exit() with 11 as its argument.
	 * NOTE(review): 11 is presumably the SIGSEGV signal number --
	 * confirm.  Assuming do_exit() does not return, nothing below
	 * this block is reached by fall-through while it is in place.
	 */
#ifdef CONFIG_CONTEXT_TRACKING
	call enter_from_user_mode
#endif
	ENABLE_INTERRUPTS(CLBR_NONE)
	movl $11, %edi
	call do_exit

	/*
	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
	 * on (since we treat user mode as having IRQs on), and the
	 * prologue above is too short for it to be worth adding a
	 * tracing round trip.
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)

	/*
	 * No need to do an access_ok() check here because RBP has been
	 * 32-bit zero extended:
	 */
	ASM_STAC
	/* Load arg6 from 0(%ebp) on the user stack; a fault goes to ia32_badarg. */
1:	movl	(%rbp), %ebp
	_ASM_EXTABLE(1b, ia32_badarg)
	ASM_CLAC

	/* Set TS_COMPAT in thread_info->status; the fast exit path clears it again. */
	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	/* Any syscall-entry work flag set?  Then take the slow path. */
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysenter_tracesys

sysenter_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */
sysenter_dispatch:
	/*
	 * Unsigned bounds check on the syscall number.  Out-of-range
	 * numbers skip the call and leave pt_regs->ax untouched (the
	 * prologue stored -ENOSYS there).
	 */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f
	call	*ia32_sys_call_table(, %rax, 8)	/* table of 8-byte entries */
	movq	%rax, RAX(%rsp)		/* return value -> pt_regs->ax */
1:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	/* Any work flag in _TIF_ALLWORK_MASK set?  Then leave via the slow exit path. */
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_ret_from_sys_call_irqs_off
sysexit_from_sys_call:
	/*
	 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
	 * NMI between STI and SYSEXIT has poorly specified behavior,
	 * and an NMI followed by an IRQ with usergs is fatal.  So
	 * we just pretend we're using SYSEXIT but we really use
	 * SYSRETL instead.
	 *
	 * This code path is still called 'sysexit' because it pairs
	 * with 'sysenter' and it uses the SYSENTER calling convention.
	 */
	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	movl	RIP(%rsp), %ecx		/* User %eip */
	movq    RAX(%rsp), %rax
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	xorl	%edx, %edx		/* Do not leak kernel information */
	xorq	%r8, %r8
	xorq	%r9, %r9
	xorq	%r10, %r10
	movl	EFLAGS(%rsp), %r11d	/* User eflags */
	TRACE_IRQS_ON

	/*
	 * SYSRETL works even on Intel CPUs.  Use it in preference to SYSEXIT,
	 * since it avoids a dicey window with interrupts enabled.
	 */
	/* Back onto the user stack: pt_regs is no longer addressable via RSP. */
	movl	RSP(%rsp), %esp

	/*
	 * USERGS_SYSRET32 does:
	 *  gsbase = user's gs base
	 *  eip = ecx
	 *  rflags = r11
	 *  cs = __USER32_CS
	 *  ss = __USER_DS
	 *
	 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
	 *
	 *  pop %ebp
	 *  pop %edx
	 *  pop %ecx
	 *
	 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
	 * avoid info leaks.  ECX holds VDSO32_SYSENTER_RETURN's address
	 * (already known to user code), R11 holds the user's own eflags
	 * (both loaded from pt_regs above), and R12-R15 are callee-saved
	 * and therefore don't contain any interesting kernel data.
	 */
	USERGS_SYSRET32

	/*
	 * NT was set in the saved flags: load rflags with just
	 * X86_EFLAGS_FIXED via push/popfq, then resume the main path.
	 */
sysenter_fix_flags:
	pushq	$X86_EFLAGS_FIXED
	popfq
	jmp	sysenter_flags_fixed

	/*
	 * Slow path taken when _TIF_WORK_SYSCALL_ENTRY work is pending:
	 * complete pt_regs (extra regs saved, r8-r11 slots zeroed), call
	 * syscall_trace_enter(&pt_regs), then redo the dispatch using the
	 * value it returned as the syscall number.
	 */
sysenter_tracesys:
	SAVE_EXTRA_REGS
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter

	/* Reload arg registers from stack. (see ia32_tracesys) */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */

	RESTORE_EXTRA_REGS
	jmp	sysenter_do_call
ENDPROC(entry_SYSENTER_compat)

/*
 * 32-bit SYSCALL instruction entry.
 *
 * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Note: rflags saving+masking-with-MSR happens only in Long mode
 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
 * Don't get confused: rflags saving+masking depends on Long Mode Active bit
 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
 *
 * Arguments:
 * eax  system call number
 * ecx  return address
 * ebx  arg1
 * ebp  arg2	(note: not saved in the stack frame, should not be touched)
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * esp  user stack
 * 0(%esp) arg6
 *
 * This is purely a fast path. For anything complicated we use the int 0x80
 * path below. We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSCALL_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK

	/*
	 * Temporary: SYSCALL32 is disabled.
	 *
	 * Put -ENOSYS in eax and go straight back to user mode.
	 * NOTE(review): while this early return is in place, nothing
	 * below it is reached by fall-through.
	 */
	movl	$-ENOSYS, %eax
	USERGS_SYSRET32

	/* Keep the (zero-extended) user stack pointer in r8 across the stack switch. */
	movl	%esp, %r8d
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%r8			/* pt_regs->sp */
	pushq	%r11			/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%rcx			/* pt_regs->ip */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rbp			/* pt_regs->cx (arg2 arrives in ebp) */
	movl	%ebp, %ecx		/* from here on arg2 lives in ecx, as in the other entries */
	pushq	$-ENOSYS		/* pt_regs->ax */
	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */

	/*
	 * No need to do an access_ok check here because r8 has been
	 * 32-bit zero extended:
	 */
	ASM_STAC
	/* Load arg6 from 0(%esp) on the user stack; a fault goes to ia32_badarg. */
1:	movl	(%r8), %r9d
	_ASM_EXTABLE(1b, ia32_badarg)
	ASM_CLAC
	/* Set TS_COMPAT in thread_info->status; the fast exit path clears it again. */
	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	/* Any syscall-entry work flag set?  Then take the slow path. */
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	cstar_tracesys

cstar_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	/* r9 already loaded */		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */

cstar_dispatch:
	/*
	 * Unsigned bounds check on the syscall number.  Out-of-range
	 * numbers skip the call and leave pt_regs->ax untouched (the
	 * prologue stored -ENOSYS there).
	 */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f

	call	*ia32_sys_call_table(, %rax, 8)	/* table of 8-byte entries */
	movq	%rax, RAX(%rsp)		/* return value -> pt_regs->ax */
1:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	/* Any work flag in _TIF_ALLWORK_MASK set?  Then leave via the slow exit path. */
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_ret_from_sys_call_irqs_off

sysretl_from_sys_call:
	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	/* Reload the user-visible registers from pt_regs. */
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	RIP(%rsp), %ecx		/* SYSRET takes eip from ecx ... */
	movl	EFLAGS(%rsp), %r11d	/* ... and eflags from r11 (see below) */
	movq    RAX(%rsp), %rax
	/* r8-r10 were used as scratch/argument registers: zero them. */
	xorq	%r10, %r10
	xorq	%r9, %r9
	xorq	%r8, %r8
	TRACE_IRQS_ON
	/* Back onto the user stack: pt_regs is no longer addressable via RSP. */
	movl	RSP(%rsp), %esp
	/*
	 * 64-bit->32-bit SYSRET restores eip from ecx,
	 * eflags from r11 (but RF and VM bits are forced to 0),
	 * cs and ss are loaded from MSRs.
	 * (Note: 32-bit->32-bit SYSRET is different: since r11
	 * does not exist, it merely sets eflags.IF=1).
	 *
	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
	 * descriptor is not reinitialized.  This means that we must
	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
	 * exit the kernel, and re-enter using an interrupt vector.  (All
	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
	 * from happening by reloading SS in __switch_to.
	 */
	USERGS_SYSRET32

	/*
	 * Slow path taken when _TIF_WORK_SYSCALL_ENTRY work is pending.
	 *
	 * On entry r9d holds arg6 and ebp holds the user's ebp (arg2).
	 * They are swapped around the tracing call, so that ebp carries
	 * arg6 while the extra registers are saved and the user's ebp
	 * is parked in the pt_regs R9 slot; the second swap undoes this.
	 * NOTE(review): presumably this lets the tracer see arg6 in
	 * pt_regs->bp, as in the int 0x80 convention -- confirm against
	 * SAVE_EXTRA_REGS in calling.h.
	 */
cstar_tracesys:
	xchgl	%r9d, %ebp
	SAVE_EXTRA_REGS
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%r9, R9(%rsp)
	movq	%rax, R8(%rsp)
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter
	movl	R9(%rsp), %r9d

	/* Reload arg registers from stack. (see ia32_tracesys) */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */

	RESTORE_EXTRA_REGS
	xchgl	%ebp, %r9d
	jmp	cstar_do_call
END(entry_SYSCALL_compat)

ia32_badarg:
	/*
	 * Exception-table fixup target for the user-stack loads of arg6
	 * in the SYSENTER and SYSCALL32 entry paths.
	 *
	 * So far, we've entered kernel mode, set AC, turned on IRQs, and
	 * saved C regs except r8-r11.  We haven't done any of the other
	 * standard entry work, though.  We want to bail, but we shouldn't
	 * treat this as a syscall entry since we don't even know what the
	 * args are.  Instead, treat this as a non-syscall entry, finish
	 * the entry work, and immediately exit after setting AX = -EFAULT.
	 *
	 * We're really just being polite here.  Killing the task outright
	 * would be a reasonable action, too.  Given that the only valid
	 * way to have gotten here is through the vDSO, and we already know
	 * that the stack pointer is bad, the task isn't going to survive
	 * for long no matter what we do.
	 */

	ASM_CLAC			/* undo STAC */
	movq	$-EFAULT, RAX(%rsp)	/* return -EFAULT if possible */

	/* Fill in the rest of pt_regs */
	xorl	%eax, %eax		/* zero for the r8-r11 slots */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	SAVE_EXTRA_REGS

	/* Turn IRQs back off. */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF

	/* Now finish entering normal kernel mode. */
#ifdef CONFIG_CONTEXT_TRACKING
	call enter_from_user_mode
#endif

	/* And exit again. */
	jmp retint_user

ia32_ret_from_sys_call_irqs_off:
	/*
	 * Reached from the fast paths with interrupts disabled: switch
	 * them back on (updating IRQ tracing first), then fall through.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)

ia32_ret_from_sys_call:
	/*
	 * Store zero into the r8-r11 slots of pt_regs so that no kernel
	 * values leak out through them, then continue on the common
	 * int_ret_from_sys_call exit path.
	 */
	xorl	%eax, %eax
	movq	%rax, R8(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R11(%rsp)
	jmp	int_ret_from_sys_call

/*
 * Emulated IA32 system calls via int 0x80.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6	(note: not saved in the stack frame, should not be touched)
 *
 * Notes:
 * Uses the same stack frame as the x86-64 version.
 * All registers except eax must be saved (but ptrace may violate that).
 * Arguments are zero extended. For system calls that want sign extension and
 * take long arguments a wrapper is needed. Most calls can just be called
 * directly.
 * Assumes it is only called from user space and entered with interrupts off.
 */

ENTRY(entry_INT80_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	SWAPGS
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack (iret frame is already on stack) */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	pushq	$0			/* pt_regs->r8 */
	pushq	$0			/* pt_regs->r9 */
	pushq	$0			/* pt_regs->r10 */
	pushq	$0			/* pt_regs->r11 */
	cld
	sub	$(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */

	/* Set TS_COMPAT in thread_info->status. */
	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	/* Any syscall-entry work flag set?  Then take the slow path. */
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_tracesys

ia32_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */
	/*
	 * Unsigned bounds check on the syscall number.  Out-of-range
	 * numbers skip the call and leave pt_regs->ax untouched (the
	 * prologue stored -ENOSYS there).
	 */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f

	call	*ia32_sys_call_table(, %rax, 8)	/* table of 8-byte entries */
	movq	%rax, RAX(%rsp)		/* return value -> pt_regs->ax */
1:
	jmp	int_ret_from_sys_call

	/*
	 * Slow path taken when _TIF_WORK_SYSCALL_ENTRY work is pending:
	 * save the extra registers to complete pt_regs, call
	 * syscall_trace_enter(&pt_regs), then redo the dispatch.
	 */
ia32_tracesys:
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi			/* &pt_regs -> arg1 */
	call	syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * Don't reload %eax because syscall_trace_enter() returned
	 * the %rax value we should see.  But do truncate it to 32 bits.
	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
	 * an appropriately invalid value.
	 */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */
	RESTORE_EXTRA_REGS
	jmp	ia32_do_call
END(entry_INT80_compat)

	/*
	 * PTREGSCALL label, func
	 *
	 * Emit an aligned, global stub named \label that loads the address
	 * of \func into rax and jumps to ia32_ptregs_common, which saves
	 * the extra registers around an indirect call through rax.
	 */
	.macro PTREGSCALL label, func
	ALIGN
GLOBAL(\label)
	leaq	\func(%rip), %rax
	jmp	ia32_ptregs_common
	.endm

	/* Stubs generated with PTREGSCALL: stub name, handler called */
	PTREGSCALL stub32_rt_sigreturn,	sys32_rt_sigreturn
	PTREGSCALL stub32_sigreturn,	sys32_sigreturn
	PTREGSCALL stub32_fork,		sys_fork
	PTREGSCALL stub32_vfork,	sys_vfork

	ALIGN
GLOBAL(stub32_clone)
	/*
	 * Argument order differs between the two clone ABIs:
	 *
	 *   32-bit: clone(..., int tls_val, int *child_tidptr)
	 *   64-bit: clone(..., int *child_tidptr, int tls_val)
	 *
	 * sys_clone() is the native 64-bit implementation and expects the
	 * second form, so exchange the two registers carrying those
	 * arguments before handing over to it.
	 */
	xchgq	%rcx, %r8

	/* rax = handler that ia32_ptregs_common calls */
	leaq	sys_clone(%rip), %rax
	jmp	ia32_ptregs_common

	ALIGN
/*
 * Common tail of the stub32_* stubs above.
 *
 * In:  rax = address of the handler to call (set by the stub).
 *
 * Saves the extra registers, calls the handler through rax, restores
 * the extra registers and returns to the stub's caller.
 * NOTE(review): the "8" offset presumably skips the return address
 * that sits on the stack above pt_regs at this point -- confirm
 * against SAVE_EXTRA_REGS/RESTORE_EXTRA_REGS in calling.h.
 */
ia32_ptregs_common:
	SAVE_EXTRA_REGS 8
	call	*%rax
	RESTORE_EXTRA_REGS 8
	ret
END(ia32_ptregs_common)