1ddf0b1b11
In Linux 3.18 and below, GCC hoists the lsl instructions in the pvclock code all the way to the beginning of __vdso_clock_gettime, slowing the non-paravirt case significantly. For unknown reasons, presumably related to the removal of a branch, the performance issue is gone as ofe76b027e64
x86,vdso: Use LSL unconditionally for vgetcpu but I don't trust GCC enough to expect the problem to stay fixed. There should be no correctness issue, because the __getcpu calls in __vdso_vlock_gettime were never necessary in the first place. Note to stable maintainers: In 3.18 and below, depending on configuration, gcc 4.9.2 generates code like this: 9c3: 44 0f 03 e8 lsl %ax,%r13d 9c7: 45 89 eb mov %r13d,%r11d 9ca: 0f 03 d8 lsl %ax,%ebx This patch won't apply as is to any released kernel, but I'll send a trivial backported version if needed. Fixes:51c19b4f59
x86: vdso: pvclock gettime support Cc: stable@vger.kernel.org # 3.8+ Cc: Marcelo Tosatti <mtosatti@redhat.com> Acked-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
95 lines
1.8 KiB
C
95 lines
1.8 KiB
C
#ifndef _ASM_X86_VGTOD_H
|
|
#define _ASM_X86_VGTOD_H
|
|
|
|
#include <linux/compiler.h>
|
|
#include <linux/clocksource.h>
|
|
|
|
#ifdef BUILD_VDSO32_64
|
|
typedef u64 gtod_long_t;
|
|
#else
|
|
typedef unsigned long gtod_long_t;
|
|
#endif
|
|
/*
|
|
* vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time
|
|
* so be carefull by modifying this structure.
|
|
*/
|
|
struct vsyscall_gtod_data {
|
|
unsigned seq;
|
|
|
|
int vclock_mode;
|
|
cycle_t cycle_last;
|
|
cycle_t mask;
|
|
u32 mult;
|
|
u32 shift;
|
|
|
|
/* open coded 'struct timespec' */
|
|
u64 wall_time_snsec;
|
|
gtod_long_t wall_time_sec;
|
|
gtod_long_t monotonic_time_sec;
|
|
u64 monotonic_time_snsec;
|
|
gtod_long_t wall_time_coarse_sec;
|
|
gtod_long_t wall_time_coarse_nsec;
|
|
gtod_long_t monotonic_time_coarse_sec;
|
|
gtod_long_t monotonic_time_coarse_nsec;
|
|
|
|
int tz_minuteswest;
|
|
int tz_dsttime;
|
|
};
|
|
extern struct vsyscall_gtod_data vsyscall_gtod_data;
|
|
|
|
static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
|
|
{
|
|
unsigned ret;
|
|
|
|
repeat:
|
|
ret = ACCESS_ONCE(s->seq);
|
|
if (unlikely(ret & 1)) {
|
|
cpu_relax();
|
|
goto repeat;
|
|
}
|
|
smp_rmb();
|
|
return ret;
|
|
}
|
|
|
|
static inline int gtod_read_retry(const struct vsyscall_gtod_data *s,
|
|
unsigned start)
|
|
{
|
|
smp_rmb();
|
|
return unlikely(s->seq != start);
|
|
}
|
|
|
|
static inline void gtod_write_begin(struct vsyscall_gtod_data *s)
|
|
{
|
|
++s->seq;
|
|
smp_wmb();
|
|
}
|
|
|
|
static inline void gtod_write_end(struct vsyscall_gtod_data *s)
|
|
{
|
|
smp_wmb();
|
|
++s->seq;
|
|
}
|
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
#define VGETCPU_CPU_MASK 0xfff
|
|
|
|
static inline unsigned int __getcpu(void)
|
|
{
|
|
unsigned int p;
|
|
|
|
/*
|
|
* Load per CPU data from GDT. LSL is faster than RDTSCP and
|
|
* works on all CPUs. This is volatile so that it orders
|
|
* correctly wrt barrier() and to keep gcc from cleverly
|
|
* hoisting it out of the calling function.
|
|
*/
|
|
asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
|
|
|
|
return p;
|
|
}
|
|
|
|
#endif /* CONFIG_X86_64 */
|
|
|
|
#endif /* _ASM_X86_VGTOD_H */
|