mirror of
https://github.com/torvalds/linux.git
synced 2024-12-27 05:11:48 +00:00
5dd12c2152
Use mul_u64_u32_shr() so that x86_64 can use a single 64x64->128 mul. Before: 0000000000000560 <native_sched_clock>: 560: 44 8b 1d 00 00 00 00 mov 0x0(%rip),%r11d # 567 <native_sched_clock+0x7> 567: 55 push %rbp 568: 48 89 e5 mov %rsp,%rbp 56b: 45 85 db test %r11d,%r11d 56e: 75 4f jne 5bf <native_sched_clock+0x5f> 570: 0f 31 rdtsc 572: 89 c0 mov %eax,%eax 574: 48 c1 e2 20 shl $0x20,%rdx 578: 48 c7 c1 00 00 00 00 mov $0x0,%rcx 57f: 48 09 c2 or %rax,%rdx 582: 48 c7 c7 00 00 00 00 mov $0x0,%rdi 589: 65 8b 04 25 00 00 00 mov %gs:0x0,%eax 590: 00 591: 48 98 cltq 593: 48 8b 34 c5 00 00 00 mov 0x0(,%rax,8),%rsi 59a: 00 59b: 48 89 d0 mov %rdx,%rax 59e: 81 e2 ff 03 00 00 and $0x3ff,%edx 5a4: 48 c1 e8 0a shr $0xa,%rax 5a8: 48 0f af 14 0e imul (%rsi,%rcx,1),%rdx 5ad: 48 0f af 04 0e imul (%rsi,%rcx,1),%rax 5b2: 5d pop %rbp 5b3: 48 03 04 3e add (%rsi,%rdi,1),%rax 5b7: 48 c1 ea 0a shr $0xa,%rdx 5bb: 48 01 d0 add %rdx,%rax 5be: c3 retq After: 0000000000000550 <native_sched_clock>: 550: 8b 3d 00 00 00 00 mov 0x0(%rip),%edi # 556 <native_sched_clock+0x6> 556: 55 push %rbp 557: 48 89 e5 mov %rsp,%rbp 55a: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp 55e: 85 ff test %edi,%edi 560: 75 2c jne 58e <native_sched_clock+0x3e> 562: 0f 31 rdtsc 564: 89 c0 mov %eax,%eax 566: 48 c1 e2 20 shl $0x20,%rdx 56a: 48 09 c2 or %rax,%rdx 56d: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax 574: 00 00 576: 89 c0 mov %eax,%eax 578: 48 f7 e2 mul %rdx 57b: 65 48 8b 0c 25 00 00 mov %gs:0x0,%rcx 582: 00 00 584: c9 leaveq 585: 48 0f ac d0 0a shrd $0xa,%rdx,%rax 58a: 48 01 c8 add %rcx,%rax 58d: c3 retq MAINLINE POST sched_clock_stable: 1 1 (cold) sched_clock: 329841 331312 (cold) local_clock: 301773 310296 (warm) sched_clock: 38375 38247 (warm) local_clock: 100371 102713 (warm) rdtsc: 27340 27289 sched_clock_stable: 0 0 (cold) sched_clock: 382634 372706 (cold) local_clock: 396890 399275 (warm) sched_clock: 38194 38124 (warm) local_clock: 143452 148698 (warm) rdtsc: 27345 27365 Signed-off-by: Peter Zijlstra <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Link: http://lkml.kernel.org/n/tip-piu203ses5y1g36bnyw2n16x@git.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
79 lines
2.1 KiB
C
79 lines
2.1 KiB
C
#ifndef _ASM_X86_TIMER_H
|
|
#define _ASM_X86_TIMER_H
|
|
#include <linux/init.h>
|
|
#include <linux/pm.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/math64.h>
|
|
|
|
#define TICK_SIZE (tick_nsec / 1000)
|
|
|
|
unsigned long long native_sched_clock(void);
|
|
extern int recalibrate_cpu_khz(void);
|
|
|
|
extern int no_timer_check;
|
|
|
|
/* Accelerators for sched_clock()
|
|
* convert from cycles(64bits) => nanoseconds (64bits)
|
|
* basic equation:
|
|
* ns = cycles / (freq / ns_per_sec)
|
|
* ns = cycles * (ns_per_sec / freq)
|
|
* ns = cycles * (10^9 / (cpu_khz * 10^3))
|
|
* ns = cycles * (10^6 / cpu_khz)
|
|
*
|
|
* Then we use scaling math (suggested by george@mvista.com) to get:
|
|
* ns = cycles * (10^6 * SC / cpu_khz) / SC
|
|
* ns = cycles * cyc2ns_scale / SC
|
|
*
|
|
* And since SC is a constant power of two, we can convert the div
|
|
* into a shift.
|
|
*
|
|
* We can use khz divisor instead of mhz to keep a better precision, since
|
|
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
|
|
* (mathieu.desnoyers@polymtl.ca)
|
|
*
|
|
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
|
|
*
|
|
* In:
|
|
*
|
|
* ns = cycles * cyc2ns_scale / SC
|
|
*
|
|
* Although we may still have enough bits to store the value of ns,
|
|
* in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
|
|
* leading to an incorrect result.
|
|
*
|
|
* To avoid this, we can decompose 'cycles' into quotient and remainder
|
|
* of division by SC. Then,
|
|
*
|
|
* ns = (quot * SC + rem) * cyc2ns_scale / SC
|
|
* = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
|
|
*
|
|
* - sqazi@google.com
|
|
*/
|
|
|
|
DECLARE_PER_CPU(unsigned long, cyc2ns);
|
|
DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
|
|
|
|
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
|
|
|
|
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
|
|
{
|
|
unsigned long long ns = this_cpu_read(cyc2ns_offset);
|
|
ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR);
|
|
return ns;
|
|
}
|
|
|
|
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
|
|
{
|
|
unsigned long long ns;
|
|
unsigned long flags;
|
|
|
|
local_irq_save(flags);
|
|
ns = __cycles_2_ns(cyc);
|
|
local_irq_restore(flags);
|
|
|
|
return ns;
|
|
}
|
|
|
|
#endif /* _ASM_X86_TIMER_H */
|