linux/arch/powerpc/kernel/vdso32/datapage.S

/*
 * Access to the shared data page by the vDSO & syscall map
 *
 * Copyright (C) 2004 Benjamin Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/unistd.h>
#include <asm/vdso.h>

	.text
	/*
	 * 32-bit word that __get_datapage reads and adds to this word's own
	 * run-time address to form the data page pointer. It is assembled as
	 * zero here; presumably the real offset is stored by whoever sets up
	 * the vDSO image - TODO confirm against the vDSO setup code.
	 *
	 * It lives outside __get_datapage so that the bcl in that function
	 * can target the instruction immediately following it.
	 */
	.global	__kernel_datapage_offset;
__kernel_datapage_offset:
	.long	0

/*
 * __get_datapage - compute the run-time address of the shared data page
 *
 * In:   nothing
 * Out:  r3 = (run-time address of __kernel_datapage_offset)
 *            + (32-bit word stored at that address)
 * Clobbers: r0. LR is overwritten internally but holds the caller's
 *           return address again before the final blr.
 */
V_FUNCTION_BEGIN(__get_datapage)
  .cfi_startproc
	/* We don't want that exposed or overridable as we want other objects
	 * to be able to bl directly to here
	 */
	.protected __get_datapage
	.hidden __get_datapage

	/* Keep the return address in r0: the bcl below overwrites LR */
	mflr	r0
  .cfi_register lr,r0

	/* Branch-and-link to the very next instruction so that LR receives
	 * the run-time address of data_page_branch. The "bcl 20,31" form
	 * targeting the next instruction is used so that the processor does
	 * not push an entry onto its return-prediction link stack.
	 */
	bcl	20,31,data_page_branch
data_page_branch:
	/* r3 = run-time address of data_page_branch */
	mflr	r3
	/* Restore the return address now, because r0 is reused just below */
	mtlr	r0
	/* r3 = run-time address of __kernel_datapage_offset */
	addi	r3, r3, __kernel_datapage_offset-data_page_branch
	/* r0 = offset word stored there; r3 = that address + offset */
	lwz	r0,0(r3)
	add	r3,r0,r3
	blr
  .cfi_endproc
V_FUNCTION_END(__get_datapage)

/*
 * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
 *
 * returns a pointer to the syscall map. the map is agnostic to the
 * size of "long", unlike kernel bitops, it stores bits from top to
 * bottom so that memory actually contains a linear bitmap
 * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of
 * 32 bits int at N >> 5.
 *
 * In:   r3 = syscall_count, may be NULL
 * Out:  r3 = data page address + CFG_SYSCALL_MAP32
 *       *syscall_count = NR_syscalls, only when syscall_count is non-NULL
 *       cr0.so cleared on every return path (vDSO calls use the syscall
 *       convention where cr0.so flags an error; this call cannot fail)
 * Clobbers: r0, r4, r12, cr0
 */
V_FUNCTION_BEGIN(__kernel_get_syscall_map)
  .cfi_startproc
	/* The bl below overwrites LR, so keep our return address in r12 */
	mflr	r12
  .cfi_register lr,r12
	/* __get_datapage returns in r3; park the caller's pointer in r4 */
	mr	r4,r3
	bl	__get_datapage@local
	mtlr	r12
	addi	r3,r3,CFG_SYSCALL_MAP32
	cmpli	cr0,r4,0
	/* The compare copies XER[SO] into cr0.so. Clear it before the early
	 * return so the NULL-pointer path reports success as well; this only
	 * touches the so bit, so the eq bit tested by beqlr is unchanged.
	 */
	crclr	cr0*4+so
	/* No count requested: return just the map pointer */
	beqlr
	li	r0,NR_syscalls
	stw	r0,0(r4)
	blr
  .cfi_endproc
V_FUNCTION_END(__kernel_get_syscall_map)

/*
 * unsigned long long __kernel_get_tbfreq(void);
 *
 * returns the timebase frequency in HZ
 *
 * Out:  r3, r4 = the two 32-bit words found at CFG_TB_TICKS_PER_SEC in the
 *       data page (r3 from offset +0, r4 from offset +4)
 *       cr0.so cleared
 * Clobbers: r0 (inside __get_datapage), r12
 */
V_FUNCTION_BEGIN(__kernel_get_tbfreq)
  .cfi_startproc
	/* The bl below overwrites LR, so keep our return address in r12 */
	mflr	r12
  .cfi_register lr,r12
	bl	__get_datapage@local
	/* r3 is the base pointer for both loads: fetch r4 first, because the
	 * second load overwrites r3 itself.
	 */
	lwz	r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
	lwz	r3,CFG_TB_TICKS_PER_SEC(r3)
	mtlr	r12
	/* vDSO calls follow the syscall convention where cr0.so flags an
	 * error; this function always succeeds, so clear it.
	 */
	crclr	cr0*4+so
	blr
  .cfi_endproc
V_FUNCTION_END(__kernel_get_tbfreq)
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`/*`
			`* Access to the shared data page by the vDSO & syscall map`
			`*`
			`* Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU General Public License`
			`* as published by the Free Software Foundation; either version`
			`* 2 of the License, or (at your option) any later version.`
			`*/`

			`#include <asm/processor.h>`
			`#include <asm/ppc_asm.h>`
kbuild: m68k,parisc,ppc,ppc64,s390,xtensa use generic asm-offsets.h support Delete obsoleted parts form arch makefiles and rename to asm-offsets.h Signed-off-by: Sam Ravnborg <sam@ravnborg.org> 2005-09-09 18:57:26 +00:00			`#include <asm/asm-offsets.h>`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`#include <asm/unistd.h>`
			`#include <asm/vdso.h>`

			`.text`
powerpc/vdso: Avoid link stack corruption in __get_datapage() powerpc has a link register (lr) used for calling functions. We "bl <func>" to call a function, and "blr" to return back to the call site. The lr is only a single register, so if we call another function from inside this function (ie. nested calls), software must save away the lr on the software stack before calling the new function. Before returning (ie. before the "blr"), the lr is restored by software from the software stack. This makes branch prediction quite difficult for the processor as it will only know the branch target just before the "blr". To help with this, modern powerpc processors keep a (non-architected) hardware stack of lr called a "link stack". When a "bl <func>" is run, the lr is pushed onto this stack. When a "blr" is called, the branch predictor pops the lr value from the top of the link stack, and uses it to predict the branch target. Hence the processor pipeline knows a lot earlier the branch target. This works great but there are some cases where you call "bl" but without a matching "blr". Once such case is when trying to determine the program counter (which can't be read directly). Here you "bl+4; mflr" to get the program counter. If you do this, the link stack will get out of sync with reality, causing the branch predictor to mis-predict subsequent function returns. To avoid this, modern micro-architectures have a special case of bl. Using the form "bcl 20,31,+4", ensures the processor doesn't push to the link stack. The 32 and 64 bit variants of __get_datapage() use a "bl; mflr" to determine the loaded address of the VDSO. The current versions of these attempt to use this special bl variant. Unfortunately they use +8 rather than the required +4. Hence the current code results in the link stack getting out of sync with reality and hence the resulting performance degradation. This patch moves it to bcl+4 by moving __kernel_datapage_offset out of __get_datapage(). 
With this patch, running a gettimeofday() (which uses __get_datapage()) microbenchmark we get a decent bump in performance on POWER7/8. For the benchmark in tools/testing/selftests/powerpc/benchmarks/gettimeofday.c POWER8: 64bit gets ~4% improvement 32bit gets ~9% improvement POWER7: 64bit gets ~7% improvement Signed-off-by: Michael Neuling <mikey@neuling.org> Reported-by: Aaron Sawdey <sawdey@us.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2015-09-25 04:01:40 +00:00			`.global __kernel_datapage_offset;`
			`__kernel_datapage_offset:`
			`.long 0`

Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`V_FUNCTION_BEGIN(__get_datapage)`
			`.cfi_startproc`
			`/* We don't want that exposed or overridable as we want other objects`
			`* to be able to bl directly to here`
			`*/`
			`.protected __get_datapage`
			`.hidden __get_datapage`

			`mflr r0`
			`.cfi_register lr,r0`

powerpc/vdso: Avoid link stack corruption in __get_datapage() powerpc has a link register (lr) used for calling functions. We "bl <func>" to call a function, and "blr" to return back to the call site. The lr is only a single register, so if we call another function from inside this function (ie. nested calls), software must save away the lr on the software stack before calling the new function. Before returning (ie. before the "blr"), the lr is restored by software from the software stack. This makes branch prediction quite difficult for the processor as it will only know the branch target just before the "blr". To help with this, modern powerpc processors keep a (non-architected) hardware stack of lr called a "link stack". When a "bl <func>" is run, the lr is pushed onto this stack. When a "blr" is called, the branch predictor pops the lr value from the top of the link stack, and uses it to predict the branch target. Hence the processor pipeline knows a lot earlier the branch target. This works great but there are some cases where you call "bl" but without a matching "blr". Once such case is when trying to determine the program counter (which can't be read directly). Here you "bl+4; mflr" to get the program counter. If you do this, the link stack will get out of sync with reality, causing the branch predictor to mis-predict subsequent function returns. To avoid this, modern micro-architectures have a special case of bl. Using the form "bcl 20,31,+4", ensures the processor doesn't push to the link stack. The 32 and 64 bit variants of __get_datapage() use a "bl; mflr" to determine the loaded address of the VDSO. The current versions of these attempt to use this special bl variant. Unfortunately they use +8 rather than the required +4. Hence the current code results in the link stack getting out of sync with reality and hence the resulting performance degradation. This patch moves it to bcl+4 by moving __kernel_datapage_offset out of __get_datapage(). 
With this patch, running a gettimeofday() (which uses __get_datapage()) microbenchmark we get a decent bump in performance on POWER7/8. For the benchmark in tools/testing/selftests/powerpc/benchmarks/gettimeofday.c POWER8: 64bit gets ~4% improvement 32bit gets ~9% improvement POWER7: 64bit gets ~7% improvement Signed-off-by: Michael Neuling <mikey@neuling.org> Reported-by: Aaron Sawdey <sawdey@us.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2015-09-25 04:01:40 +00:00			`bcl 20,31,data_page_branch`
			`data_page_branch:`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`mflr r3`
			`mtlr r0`
powerpc/vdso: Avoid link stack corruption in __get_datapage() powerpc has a link register (lr) used for calling functions. We "bl <func>" to call a function, and "blr" to return back to the call site. The lr is only a single register, so if we call another function from inside this function (ie. nested calls), software must save away the lr on the software stack before calling the new function. Before returning (ie. before the "blr"), the lr is restored by software from the software stack. This makes branch prediction quite difficult for the processor as it will only know the branch target just before the "blr". To help with this, modern powerpc processors keep a (non-architected) hardware stack of lr called a "link stack". When a "bl <func>" is run, the lr is pushed onto this stack. When a "blr" is called, the branch predictor pops the lr value from the top of the link stack, and uses it to predict the branch target. Hence the processor pipeline knows a lot earlier the branch target. This works great but there are some cases where you call "bl" but without a matching "blr". Once such case is when trying to determine the program counter (which can't be read directly). Here you "bl+4; mflr" to get the program counter. If you do this, the link stack will get out of sync with reality, causing the branch predictor to mis-predict subsequent function returns. To avoid this, modern micro-architectures have a special case of bl. Using the form "bcl 20,31,+4", ensures the processor doesn't push to the link stack. The 32 and 64 bit variants of __get_datapage() use a "bl; mflr" to determine the loaded address of the VDSO. The current versions of these attempt to use this special bl variant. Unfortunately they use +8 rather than the required +4. Hence the current code results in the link stack getting out of sync with reality and hence the resulting performance degradation. This patch moves it to bcl+4 by moving __kernel_datapage_offset out of __get_datapage(). 
With this patch, running a gettimeofday() (which uses __get_datapage()) microbenchmark we get a decent bump in performance on POWER7/8. For the benchmark in tools/testing/selftests/powerpc/benchmarks/gettimeofday.c POWER8: 64bit gets ~4% improvement 32bit gets ~9% improvement POWER7: 64bit gets ~7% improvement Signed-off-by: Michael Neuling <mikey@neuling.org> Reported-by: Aaron Sawdey <sawdey@us.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2015-09-25 04:01:40 +00:00			`addi r3, r3, __kernel_datapage_offset-data_page_branch`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`lwz r0,0(r3)`
			`add r3,r0,r3`
			`blr`
			`.cfi_endproc`
			`V_FUNCTION_END(__get_datapage)`

			`/*`
			`* void __kernel_get_syscall_map(unsigned int syscall_count) ;`
			`*`
			`* returns a pointer to the syscall map. the map is agnostic to the`
			`* size of "long", unlike kernel bitops, it stores bits from top to`
			`* bottom so that memory actually contains a linear bitmap`
			`* check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of`
			`* 32 bits int at N >> 5.`
			`*/`
			`V_FUNCTION_BEGIN(__kernel_get_syscall_map)`
			`.cfi_startproc`
			`mflr r12`
			`.cfi_register lr,r12`
			`mr r4,r3`
			`bl __get_datapage@local`
			`mtlr r12`
			`addi r3,r3,CFG_SYSCALL_MAP32`
			`cmpli cr0,r4,0`
			`beqlr`
powerpc: Standardise on NR_syscalls rather than __NR_syscalls. Most architectures use NR_syscalls as the #define for the number of syscalls. We use __NR_syscalls, and then define NR_syscalls as __NR_syscalls. __NR_syscalls is not used outside arch code, whereas NR_syscalls is. So as NR_syscalls must be defined and __NR_syscalls does not, replace __NR_syscalls with NR_syscalls. Signed-off-by: Rashmica Gupta <rashmicy@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2015-11-19 06:04:53 +00:00			`li r0,NR_syscalls`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`stw r0,0(r4)`
[PATCH] powerpc: Make the vDSO functions set error code (#2) The vDSO functions should have the same calling convention as a syscall. Unfortunately, they currently don't set the cr0.so bit which is used to indicate an error. This patch makes them clear this bit unconditionally since all functions currently succeed. The syscall fallback done by some of them will eventually override this if the syscall fails. This also changes the symbol version of all vdso exports to make sure glibc can differenciate between old and fixed calls for existing ones like __kernel_gettimeofday. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org> 2005-11-16 02:54:32 +00:00			`crclr cr0*4+so`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`blr`
			`.cfi_endproc`
			`V_FUNCTION_END(__kernel_get_syscall_map)`
[PATCH] powerpc: Merge vdso's and add vdso support to 32 bits kernel This patch moves the vdso's to arch/powerpc, adds support for the 32 bits vdso to the 32 bits kernel, rename systemcfg (finally !), and adds some new (still untested) routines to both vdso's: clock_gettime() with support for CLOCK_REALTIME and CLOCK_MONOTONIC, clock_getres() (same clocks) and get_tbfreq() for glibc to retreive the timebase frequency. Tom,Steve: The implementation of get_tbfreq() I've done for 32 bits returns a long long (r3, r4) not a long. This is such that if we ever add support for >4Ghz timebases on ppc32, the userland interface won't have to change. I have tested gettimeofday() using some glibc patches in both ppc32 and ppc64 kernels using 32 bits userland (I haven't had a chance to test a 64 bits userland yet, but the implementation didn't change and was tested earlier). I haven't tested yet the new functions. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org> 2005-11-11 10:15:21 +00:00
			`/*`
			`* void unsigned long long __kernel_get_tbfreq(void);`
			`*`
			`* returns the timebase frequency in HZ`
			`*/`
			`V_FUNCTION_BEGIN(__kernel_get_tbfreq)`
			`.cfi_startproc`
			`mflr r12`
			`.cfi_register lr,r12`
			`bl __get_datapage@local`
			`lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)`
[PATCH] powerpc: vdso fixes (take #2) This fixes various errors in the new functions added in the vDSO's, I've now verified all functions on both 32 and 64 bits vDSOs. It also fix a sign extension bug getting the initial time of day at boot that could cause the monotonic clock value to be completely on bogus for 64 bits applications (with either the vDSO or the syscall) on powermacs. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org> 2005-11-14 03:55:58 +00:00			`lwz r3,CFG_TB_TICKS_PER_SEC(r3)`
[PATCH] powerpc: Merge vdso's and add vdso support to 32 bits kernel This patch moves the vdso's to arch/powerpc, adds support for the 32 bits vdso to the 32 bits kernel, rename systemcfg (finally !), and adds some new (still untested) routines to both vdso's: clock_gettime() with support for CLOCK_REALTIME and CLOCK_MONOTONIC, clock_getres() (same clocks) and get_tbfreq() for glibc to retreive the timebase frequency. Tom,Steve: The implementation of get_tbfreq() I've done for 32 bits returns a long long (r3, r4) not a long. This is such that if we ever add support for >4Ghz timebases on ppc32, the userland interface won't have to change. I have tested gettimeofday() using some glibc patches in both ppc32 and ppc64 kernels using 32 bits userland (I haven't had a chance to test a 64 bits userland yet, but the implementation didn't change and was tested earlier). I haven't tested yet the new functions. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org> 2005-11-11 10:15:21 +00:00			`mtlr r12`
[PATCH] powerpc: Make the vDSO functions set error code (#2) The vDSO functions should have the same calling convention as a syscall. Unfortunately, they currently don't set the cr0.so bit which is used to indicate an error. This patch makes them clear this bit unconditionally since all functions currently succeed. The syscall fallback done by some of them will eventually override this if the syscall fails. This also changes the symbol version of all vdso exports to make sure glibc can differenciate between old and fixed calls for existing ones like __kernel_gettimeofday. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org> 2005-11-16 02:54:32 +00:00			`crclr cr0*4+so`
[PATCH] powerpc: vdso fixes (take #2) This fixes various errors in the new functions added in the vDSO's, I've now verified all functions on both 32 and 64 bits vDSOs. It also fix a sign extension bug getting the initial time of day at boot that could cause the monotonic clock value to be completely on bogus for 64 bits applications (with either the vDSO or the syscall) on powermacs. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org> 2005-11-14 03:55:58 +00:00			`blr`
[PATCH] powerpc: Merge vdso's and add vdso support to 32 bits kernel This patch moves the vdso's to arch/powerpc, adds support for the 32 bits vdso to the 32 bits kernel, rename systemcfg (finally !), and adds some new (still untested) routines to both vdso's: clock_gettime() with support for CLOCK_REALTIME and CLOCK_MONOTONIC, clock_getres() (same clocks) and get_tbfreq() for glibc to retreive the timebase frequency. Tom,Steve: The implementation of get_tbfreq() I've done for 32 bits returns a long long (r3, r4) not a long. This is such that if we ever add support for >4Ghz timebases on ppc32, the userland interface won't have to change. I have tested gettimeofday() using some glibc patches in both ppc32 and ppc64 kernels using 32 bits userland (I haven't had a chance to test a 64 bits userland yet, but the implementation didn't change and was tested earlier). I haven't tested yet the new functions. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org> 2005-11-11 10:15:21 +00:00			`.cfi_endproc`
			`V_FUNCTION_END(__kernel_get_tbfreq)`