[PATCH] i386: Use %gs as the PDA base-segment in the kernel

This patch is the meat of the PDA change.  This patch makes several related
changes:

1: Most significantly, %gs is now used in the kernel.  This means that on
   entry, the old value of %gs is saved away, and it is reloaded with
   __KERNEL_PDA.

2: entry.S constructs the stack in the shape of struct pt_regs, and this
   is passed around the kernel so that the process's saved register
   state can be accessed.

   Unfortunately struct pt_regs doesn't currently have space for %gs
   (or %fs). This patch extends pt_regs to add space for gs (no space
   is allocated for %fs, since it won't be used, and an unused slot
   would just complicate the code in entry.S, which would have to
   skip over it).

3: Because %gs is now saved on the stack like %ds, %es and the integer
   registers, there are a number of places where it no longer needs to
   be handled specially; namely context switch, and saving/restoring the
   register state in a signal context.

4: And since kernel threads run in kernel space and call normal kernel
   code, they need to be created with their %gs == __KERNEL_PDA.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
This commit is contained in:
Jeremy Fitzhardinge 2006-12-07 02:14:02 +01:00 committed by Andi Kleen
parent 6211119580
commit f95d47caae
10 changed files with 117 additions and 50 deletions

View File

@ -72,6 +72,7 @@ void foo(void)
OFFSET(PT_EAX, pt_regs, eax);
OFFSET(PT_DS, pt_regs, xds);
OFFSET(PT_ES, pt_regs, xes);
OFFSET(PT_GS, pt_regs, xgs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
OFFSET(PT_EIP, pt_regs, eip);
OFFSET(PT_CS, pt_regs, xcs);

View File

@ -593,6 +593,14 @@ void __init early_cpu_init(void)
#endif
}
/*
 * Build the initial register frame for an idle thread: every field is
 * zeroed except the saved %gs, which is set to the kernel PDA selector.
 * Returns the same pointer it was given.
 */
struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
{
	memset(regs, 0, sizeof(*regs));
	regs->xgs = __KERNEL_PDA;

	return regs;
}
__cpuinit int alloc_gdt(int cpu)
{
struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
@ -644,6 +652,14 @@ struct i386_pda boot_pda = {
._pda = &boot_pda,
};
/*
 * Load the __KERNEL_PDA selector into %gs on the executing CPU.
 * Takes no arguments and returns nothing; the selector is passed to
 * the asm in a general register and the statement also acts as a
 * compiler-level memory barrier.
 */
static inline void set_kernel_gs(void)
{
/* Set %gs for this CPU's PDA. Memory clobber is to create a
barrier with respect to any PDA operations, so the compiler
doesn't move any before here. */
asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
}
/* Initialize the CPU's GDT and PDA. The boot CPU does this for
itself, but secondaries find this done for them. */
__cpuinit int init_gdt(int cpu, struct task_struct *idle)
@ -693,6 +709,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
the boot CPU, this will transition from the boot gdt+pda to
the real ones). */
load_gdt(cpu_gdt_descr);
set_kernel_gs();
if (cpu_test_and_set(cpu, cpu_initialized)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@ -731,8 +748,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
/* Clear %fs and %gs. */
asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
/* Clear %fs. */
asm volatile ("mov %0, %%fs" : : "r" (0));
/* Clear all 6 debug registers: */
set_debugreg(0, 0);

View File

@ -30,12 +30,13 @@
* 18(%esp) - %eax
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - orig_eax
* 28(%esp) - %eip
* 2C(%esp) - %cs
* 30(%esp) - %eflags
* 34(%esp) - %oldesp
* 38(%esp) - %oldss
* 24(%esp) - %gs
* 28(%esp) - orig_eax
* 2C(%esp) - %eip
* 30(%esp) - %cs
* 34(%esp) - %eflags
* 38(%esp) - %oldesp
* 3C(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
@ -92,6 +93,9 @@ VM_MASK = 0x00020000
#define SAVE_ALL \
cld; \
pushl %gs; \
CFI_ADJUST_CFA_OFFSET 4;\
/*CFI_REL_OFFSET gs, 0;*/\
pushl %es; \
CFI_ADJUST_CFA_OFFSET 4;\
/*CFI_REL_OFFSET es, 0;*/\
@ -121,7 +125,9 @@ VM_MASK = 0x00020000
CFI_REL_OFFSET ebx, 0;\
movl $(__USER_DS), %edx; \
movl %edx, %ds; \
movl %edx, %es;
movl %edx, %es; \
movl $(__KERNEL_PDA), %edx; \
movl %edx, %gs
#define RESTORE_INT_REGS \
popl %ebx; \
@ -154,17 +160,22 @@ VM_MASK = 0x00020000
2: popl %es; \
CFI_ADJUST_CFA_OFFSET -4;\
/*CFI_RESTORE es;*/\
.section .fixup,"ax"; \
3: movl $0,(%esp); \
jmp 1b; \
3: popl %gs; \
CFI_ADJUST_CFA_OFFSET -4;\
/*CFI_RESTORE gs;*/\
.pushsection .fixup,"ax"; \
4: movl $0,(%esp); \
jmp 1b; \
5: movl $0,(%esp); \
jmp 2b; \
.previous; \
6: movl $0,(%esp); \
jmp 3b; \
.section __ex_table,"a";\
.align 4; \
.long 1b,3b; \
.long 2b,4b; \
.previous
.long 1b,4b; \
.long 2b,5b; \
.long 3b,6b; \
.popsection
#define RING0_INT_FRAME \
CFI_STARTPROC simple;\
@ -231,6 +242,7 @@ check_userspace:
andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
cmpl $USER_RPL, %eax
jb resume_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
@ -327,9 +339,16 @@ sysenter_past_esp:
movl PT_OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_GS(%esp), %gs
ENABLE_INTERRUPTS_SYSEXIT
CFI_ENDPROC
.pushsection .fixup,"ax"
2: movl $0,PT_GS(%esp)
jmp 1b
.section __ex_table,"a"
.align 4
.long 1b,2b
.popsection
# system call handler stub
ENTRY(system_call)
@ -375,7 +394,7 @@ restore_nocheck:
TRACE_IRQS_IRET
restore_nocheck_notrace:
RESTORE_REGS
addl $4, %esp
addl $4, %esp # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
1: INTERRUPT_RETURN
.section .fixup,"ax"
@ -588,6 +607,10 @@ KPROBE_ENTRY(page_fault)
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
/* the function address is in %gs's slot on the stack */
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
pushl %ds
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ds, 0*/
@ -613,18 +636,20 @@ error_code:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
pushl %es
pushl %gs
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
/*CFI_REL_OFFSET gs, 0*/
movl $(__KERNEL_PDA), %ecx
movl %ecx, %gs
UNWIND_ESPFIX_STACK
popl %ecx
CFI_ADJUST_CFA_OFFSET -4
/*CFI_REGISTER es, ecx*/
movl PT_ES(%esp), %edi # get the function address
movl PT_GS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp)
movl %ecx, PT_ES(%esp)
/*CFI_REL_OFFSET es, ES*/
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
mov %ecx, PT_GS(%esp)
/*CFI_REL_OFFSET gs, ES*/
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
@ -936,6 +961,7 @@ ENTRY(arch_unwind_init_running)
movl %ebx, PT_EAX(%edx)
movl $__USER_DS, PT_DS(%edx)
movl $__USER_DS, PT_ES(%edx)
movl $0, PT_GS(%edx)
movl %ebx, PT_ORIG_EAX(%edx)
movl %ecx, PT_EIP(%edx)
movl 12(%esp), %ecx

View File

@ -302,6 +302,7 @@ is386: movl $2,%ecx # set MP
movl %eax,%cr0
call check_x87
call setup_pda
lgdt cpu_gdt_descr
lidt idt_descr
ljmp $(__KERNEL_CS),$1f
@ -312,10 +313,13 @@ is386: movl $2,%ecx # set MP
movl %eax,%ds
movl %eax,%es
xorl %eax,%eax # Clear FS/GS and LDT
xorl %eax,%eax # Clear FS and LDT
movl %eax,%fs
movl %eax,%gs
lldt %ax
movl $(__KERNEL_PDA),%eax
mov %eax,%gs
cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
#ifdef CONFIG_SMP
@ -345,6 +349,23 @@ check_x87:
.byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
ret
/*
 * Point the GDT at this CPU's PDA. On boot this will be
 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
 * that CPU's GDT and PDA.
 *
 * Reads the PDA address from start_pda and the GDT address from
 * cpu_gdt_descr, then stores the PDA address into the three base
 * fields of the descriptor found at byte offset __KERNEL_PDA in
 * that GDT. Modifies %eax, %ecx and the flags (via shr).
 */
setup_pda:
/* get the PDA pointer */
movl start_pda, %eax
/* slot the PDA address into the GDT */
/* +2 skips the 16-bit limit word stored ahead of the table address */
mov cpu_gdt_descr+2, %ecx
/* NOTE(review): the selector value is used directly as a byte offset
   into the GDT - presumably its low three bits are zero; confirm. */
mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
/* move address bits 16-31 down so %al/%ah hold the upper base bytes */
shr $16, %eax
mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
ret
/*
* setup_idt
*
@ -484,6 +505,8 @@ ENTRY(empty_zero_page)
* This starts the data section.
*/
.data
ENTRY(start_pda)
.long boot_pda
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
@ -525,7 +548,7 @@ idt_descr:
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
cpu_gdt_descr:
ENTRY(cpu_gdt_descr)
.word GDT_ENTRIES*8-1
.long cpu_gdt_table
@ -585,7 +608,7 @@ ENTRY(cpu_gdt_table)
.quad 0x004092000000ffff /* 0xc8 APM DS data */
.quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
.quad 0x0000000000000000 /* 0xd8 - PDA */
.quad 0x00cf92000000ffff /* 0xd8 - PDA */
.quad 0x0000000000000000 /* 0xe0 - unused */
.quad 0x0000000000000000 /* 0xe8 - unused */
.quad 0x0000000000000000 /* 0xf0 - unused */

View File

@ -56,6 +56,7 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
#include <asm/pda.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@ -346,6 +347,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
regs.xds = __USER_DS;
regs.xes = __USER_DS;
regs.xgs = __KERNEL_PDA;
regs.orig_eax = -1;
regs.eip = (unsigned long) kernel_thread_helper;
regs.xcs = __KERNEL_CS | get_kernel_rpl();
@ -431,7 +433,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
p->thread.eip = (unsigned long) ret_from_fork;
savesegment(fs,p->thread.fs);
savesegment(gs,p->thread.gs);
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@ -659,16 +660,16 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
load_esp0(tss, next);
/*
* Save away %fs and %gs. No need to save %es and %ds, as
* those are always kernel segments while inside the kernel.
* Doing this before setting the new TLS descriptors avoids
* the situation where we temporarily have non-reloadable
* segments in %fs and %gs. This could be an issue if the
* NMI handler ever used %fs or %gs (it does not today), or
* if the kernel is running inside of a hypervisor layer.
* Save away %fs. No need to save %gs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
* before setting the new TLS descriptors avoids the situation
* where we temporarily have non-reloadable segments in %fs
* and %gs. This could be an issue if the NMI handler ever
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
savesegment(fs, prev->fs);
savesegment(gs, prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@ -676,16 +677,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
load_TLS(next, cpu);
/*
* Restore %fs and %gs if needed.
* Restore %fs if needed.
*
* Glibc normally makes %fs be zero, and %gs is one of
* the TLS segments.
* Glibc normally makes %fs be zero.
*/
if (unlikely(prev->fs | next->fs))
loadsegment(fs, next->fs);
if (prev->gs | next->gs)
loadsegment(gs, next->gs);
/*
* Restore IOPL if needed.

View File

@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
GET_SEG(gs);
COPY_SEG(gs);
GET_SEG(fs);
COPY_SEG(es);
COPY_SEG(ds);
@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
{
int tmp, err = 0;
tmp = 0;
savesegment(gs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
savesegment(fs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->fs);

View File

@ -62,8 +62,8 @@ static inline void switch_mm(struct mm_struct *prev,
#endif
}
#define deactivate_mm(tsk, mm) \
asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
/*
 * Clear %fs when an mm is being dropped. %gs is no longer touched
 * here. Neither argument is used by the expansion.
 *
 * The expansion deliberately has no trailing ';': the caller supplies
 * it, so the macro behaves as a single statement and can be used as
 * the body of an un-braced if/else.
 */
#define deactivate_mm(tsk, mm) \
asm("movl %0,%%fs": :"r" (0))
#define activate_mm(prev, next) \
switch_mm((prev),(next),NULL)

View File

@ -473,6 +473,7 @@ struct thread_struct {
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
.gs = __KERNEL_PDA, \
}
/*
@ -500,7 +501,8 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa
}
#define start_thread(regs, new_eip, new_esp) do { \
__asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
__asm__("movl %0,%%fs": :"r" (0)); \
regs->xgs = 0; \
set_fs(USER_DS); \
regs->xds = __USER_DS; \
regs->xes = __USER_DS; \

View File

@ -16,6 +16,8 @@ struct pt_regs {
long eax;
int xds;
int xes;
/* int xfs; */
int xgs;
long orig_eax;
long eip;
int xcs;

View File

@ -1303,7 +1303,7 @@ fork_out:
return ERR_PTR(retval);
}
struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
return regs;