forked from Minki/linux
dd0fd8bca1
It was found that when running a fio sequential write test with an XFS ramdisk on a KVM guest running on a 2-socket x86-64 system, the %CPU times as reported by perf were as follows: 69.75% 0.59% fio [k] down_write 69.15% 0.01% fio [k] call_rwsem_down_write_failed 67.12% 1.12% fio [k] rwsem_down_write_failed 63.48% 52.77% fio [k] osq_lock 9.46% 7.88% fio [k] __raw_callee_save___kvm_vcpu_is_preempt 3.93% 3.93% fio [k] __kvm_vcpu_is_preempted Making vcpu_is_preempted() a callee-save function has a relatively high cost on x86-64 primarily due to at least one more cacheline of data access from the saving and restoring of registers (8 of them) to and from stack as well as one more level of function call. To reduce this performance overhead, an optimized assembly version of the __raw_callee_save___kvm_vcpu_is_preempt() function is provided for x86-64. With this patch applied on a KVM guest on a 2-socket 16-core 32-thread system with 16 parallel jobs (8 on each socket), the aggregate bandwidth of the fio test on an XFS ramdisk was as follows: I/O Type w/o patch with patch -------- --------- ---------- random read 8141.2 MB/s 8497.1 MB/s seq read 8229.4 MB/s 8304.2 MB/s random write 1675.5 MB/s 1701.5 MB/s seq write 1681.3 MB/s 1699.9 MB/s There are some increases in the aggregated bandwidth because of the patch. The perf data now became: 70.78% 0.58% fio [k] down_write 70.20% 0.01% fio [k] call_rwsem_down_write_failed 69.70% 1.17% fio [k] rwsem_down_write_failed 59.91% 55.42% fio [k] osq_lock 10.14% 10.14% fio [k] __kvm_vcpu_is_preempted The assembly code was verified by using a test kernel module to compare the output of C __kvm_vcpu_is_preempted() and that of assembly __raw_callee_save___kvm_vcpu_is_preempt() to verify that they matched. Suggested-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Waiman Long <longman@redhat.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
81 lines
1.7 KiB
C
81 lines
1.7 KiB
C
#ifndef __LINUX_KBUILD_H
|
|
# error "Please do not build this file directly, build asm-offsets.c instead"
|
|
#endif
|
|
|
|
#include <asm/ia32.h>
|
|
|
|
/*
 * Presence map for the 64-bit syscall numbers.
 *
 * Every __SYSCALL_64() invocation pulled in below (presumably one per
 * syscall, supplied by <asm/syscalls_64.h> -- confirm against that header)
 * turns into a designated initializer that stores 1 in the slot indexed by
 * its first argument.  The second and third arguments are ignored here.
 * Because the array is unsized, its length becomes the largest index
 * initialized plus one; main() reads that length back via sizeof().
 */
#define __SYSCALL_64(num, handler, abi) [num] = 1,
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
|
|
/*
 * Presence map for the ia32 (compat) syscall numbers.
 *
 * Every __SYSCALL_I386() invocation pulled in below (presumably one per
 * syscall, supplied by <asm/syscalls_32.h> -- confirm against that header)
 * turns into a designated initializer that stores 1 in the slot indexed by
 * its first argument.  The second and third arguments are ignored here.
 * Because the array is unsized, its length becomes the largest index
 * initialized plus one; main() reads that length back via sizeof().
 */
#define __SYSCALL_I386(num, handler, abi) [num] = 1,
static char syscalls_ia32[] = {
#include <asm/syscalls_32.h>
};
|
|
|
|
#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
|
|
#include <asm/kvm_para.h>
|
|
#endif
|
|
|
|
int main(void)
|
|
{
|
|
#ifdef CONFIG_PARAVIRT
|
|
OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
|
|
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
|
|
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
|
|
BLANK();
|
|
#endif
|
|
|
|
#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
|
|
OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
|
|
BLANK();
|
|
#endif
|
|
|
|
#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
|
|
ENTRY(bx);
|
|
ENTRY(cx);
|
|
ENTRY(dx);
|
|
ENTRY(sp);
|
|
ENTRY(bp);
|
|
ENTRY(si);
|
|
ENTRY(di);
|
|
ENTRY(r8);
|
|
ENTRY(r9);
|
|
ENTRY(r10);
|
|
ENTRY(r11);
|
|
ENTRY(r12);
|
|
ENTRY(r13);
|
|
ENTRY(r14);
|
|
ENTRY(r15);
|
|
ENTRY(flags);
|
|
BLANK();
|
|
#undef ENTRY
|
|
|
|
#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
|
|
ENTRY(cr0);
|
|
ENTRY(cr2);
|
|
ENTRY(cr3);
|
|
ENTRY(cr4);
|
|
ENTRY(cr8);
|
|
ENTRY(gdt_desc);
|
|
BLANK();
|
|
#undef ENTRY
|
|
|
|
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
|
|
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
|
|
BLANK();
|
|
|
|
#ifdef CONFIG_CC_STACKPROTECTOR
|
|
DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
|
|
BLANK();
|
|
#endif
|
|
|
|
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
|
|
DEFINE(NR_syscalls, sizeof(syscalls_64));
|
|
|
|
DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
|
|
DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
|
|
|
|
return 0;
|
|
}
|