From 3f0c3d0bb2bcc4b88b22452a7cf0073ee9a0f1e6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 26 Jan 2013 23:56:04 +0200 Subject: [PATCH 01/22] KVM: x86 emulator: fix test_cc() build failure on i386 'pushq' doesn't exist on i386. Replace with 'push', which should work since the operand is a register. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e99fb72cd4c5..2b11318151a4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1013,7 +1013,7 @@ static u8 test_cc(unsigned int condition, unsigned long flags) void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; - asm("pushq %[flags]; popf; call *%[fastop]" + asm("push %[flags]; popf; call *%[fastop]" : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); return rc; } From 261874b0d5ebe2a5ccc544df7170d6559635e79a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 24 Jan 2013 15:04:03 -0700 Subject: [PATCH 02/22] kvm: Force IOMMU remapping on memory slot read-only flag changes Memory slot flags can be altered without changing other parameters of the slot. The read-only attribute is the only one the IOMMU cares about, so generate an un-map, re-map when this occurs. This also avoid unnecessarily re-mapping the slot when no IOMMU visible changes are made. Reviewed-by: Xiao Guangrong Signed-off-by: Alex Williamson Signed-off-by: Gleb Natapov --- virt/kvm/kvm_main.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5e709ebb7c40..3fec2cdd951b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -731,6 +731,7 @@ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_memory_slot *slot; struct kvm_memory_slot old, new; struct kvm_memslots *slots = NULL, *old_memslots; + bool old_iommu_mapped; r = check_memory_region_flags(mem); if (r) @@ -772,6 +773,8 @@ int __kvm_set_memory_region(struct kvm *kvm, new.npages = npages; new.flags = mem->flags; + old_iommu_mapped = old.npages; + /* * Disallow changing a memory slot's size or changing anything about * zero sized slots that doesn't involve making them non-zero. @@ -835,6 +838,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* slot was deleted or moved, clear iommu mapping */ kvm_iommu_unmap_pages(kvm, &old); + old_iommu_mapped = false; /* From this point no new shadow pages pointing to a deleted, * or moved, memslot will be created. * @@ -863,11 +867,27 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - /* map new memory slot into the iommu */ + /* + * IOMMU mapping: New slots need to be mapped. Old slots need to be + * un-mapped and re-mapped if their base changes or if flags that the + * iommu cares about change (read-only). Base change unmapping is + * handled above with slot deletion, so we only unmap incompatible + * flags here. Anything else the iommu might care about for existing + * slots (size changes, userspace addr changes) is disallowed above, + * so any other attribute changes getting here can be skipped. 
+ */ if (npages) { - r = kvm_iommu_map_pages(kvm, &new); - if (r) - goto out_slots; + if (old_iommu_mapped && + ((new.flags ^ old.flags) & KVM_MEM_READONLY)) { + kvm_iommu_unmap_pages(kvm, &old); + old_iommu_mapped = false; + } + + if (!old_iommu_mapped) { + r = kvm_iommu_map_pages(kvm, &new); + if (r) + goto out_slots; + } } /* actual memory is freed via old in kvm_free_physmem_slot below */ From d47510e295c0f82699192a61d715351cf00f65de Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 24 Jan 2013 15:04:09 -0700 Subject: [PATCH 03/22] kvm: Obey read-only mappings in iommu We've been ignoring read-only mappings and programming everything into the iommu as read-write. Fix this to only include the write access flag when read-only is not set. Signed-off-by: Alex Williamson Signed-off-by: Gleb Natapov --- virt/kvm/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 4a340cb23013..72a130bc448a 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -76,7 +76,9 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) gfn = slot->base_gfn; end_gfn = gfn + slot->npages; - flags = IOMMU_READ | IOMMU_WRITE; + flags = IOMMU_READ; + if (!(slot->flags & KVM_MEM_READONLY)) + flags |= IOMMU_WRITE; if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) flags |= IOMMU_CACHE; From 83d4c286931c9d28c5be21bac3c73a2332cab681 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Fri, 25 Jan 2013 10:18:49 +0800 Subject: [PATCH 04/22] x86, apicv: add APICv register virtualization support - APIC read doesn't cause VM-Exit - APIC write becomes trap-like Reviewed-by: Marcelo Tosatti Signed-off-by: Kevin Tian Signed-off-by: Yang Zhang Signed-off-by: Gleb Natapov --- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/lapic.c | 15 +++++++++++++++ arch/x86/kvm/lapic.h | 2 ++ arch/x86/kvm/vmx.c | 33 ++++++++++++++++++++++++++++++++- 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index e385df97bfdc..44c3f7eb4532 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -66,6 +66,7 @@ #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 #define VMX_EXIT_REASONS \ @@ -141,6 +142,7 @@ #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9392f527f107..0664c138e860 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1212,6 +1212,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +/* emulate APIC access in a trap manner */ +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) +{ + u32 val = 0; + + /* hw has done the conditional check and inst decode */ + offset &= 0xff0; + + apic_reg_read(vcpu->arch.apic, offset, 4, &val); + + /* TODO: optimize to just emulate side effect w/o one more write */ + apic_reg_write(vcpu->arch.apic, offset, val); +} +EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); + void kvm_free_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e5ebf9f3571f..9a8ee22bc7a3 100644 --- 
a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -64,6 +64,8 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); + void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02eeba86328d..5ad7c8531083 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,6 +84,9 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); +static bool __read_mostly enable_apicv_reg = 1; +module_param(enable_apicv_reg, bool, S_IRUGO); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -764,6 +767,12 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_apic_register_virt(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_APIC_REGISTER_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -2540,7 +2549,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_RDTSCP | - SECONDARY_EXEC_ENABLE_INVPCID; + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_APIC_REGISTER_VIRT; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2551,6 +2561,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif + + if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_2nd_exec_control &= ~( + SECONDARY_EXEC_APIC_REGISTER_VIRT); + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ @@ -2748,6 +2763,9 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; + if (!cpu_has_vmx_apic_register_virt()) + enable_apicv_reg = 0; + if (nested) nested_vmx_setup_ctls_msrs(); @@ -3829,6 +3847,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + if (!enable_apicv_reg || !irqchip_in_kernel(vmx->vcpu.kvm)) + exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT; return exec_control; } @@ -4787,6 +4807,16 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return emulate_instruction(vcpu, 0) == EMULATE_DONE; } +static int handle_apic_write(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 offset = exit_qualification & 0xfff; + + /* APIC-write VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_write_nodecode(vcpu, offset); + return 1; +} + static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5721,6 +5751,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMON] = handle_vmon, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + 
[EXIT_REASON_APIC_WRITE] = handle_apic_write, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, From 8d14695f9542e9e0195d6e41ddaa52c32322adf5 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Fri, 25 Jan 2013 10:18:50 +0800 Subject: [PATCH 05/22] x86, apicv: add virtual x2apic support basically to benefit from apicv, we need to enable virtualized x2apic mode. Currently, we only enable it when guest is really using x2apic. Also, clear MSR bitmap for corresponding x2apic MSRs when guest enabled x2apic: 0x800 - 0x8ff: no read intercept for apicv register virtualization, except APIC ID and TMCCT which need software's assistance to get right value. Reviewed-by: Marcelo Tosatti Signed-off-by: Kevin Tian Signed-off-by: Yang Zhang Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/lapic.c | 19 +-- arch/x86/kvm/lapic.h | 5 + arch/x86/kvm/svm.c | 6 + arch/x86/kvm/vmx.c | 198 ++++++++++++++++++++++++++++---- 6 files changed, 201 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 77d56a4ba89c..d42c2839be98 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -699,6 +699,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 44c3f7eb4532..0a54df0b36fc 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -139,6 +139,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_RDTSCP 0x00000008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0664c138e860..f69fc5077a89 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -140,11 +140,6 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline int apic_x2apic_mode(struct kvm_lapic *apic) -{ - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; -} - static inline int kvm_apic_id(struct kvm_lapic *apic) { return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; @@ -1303,6 +1298,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { + u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; if (!apic) { @@ -1324,11 +1320,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) value &= ~MSR_IA32_APICBASE_BSP; vcpu->arch.apic_base = value; - if (apic_x2apic_mode(apic)) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_apic_set_ldr(apic, ldr); + if ((old_value ^ value) & X2APIC_ENABLE) { + if (value & X2APIC_ENABLE) { + u32 id = kvm_apic_id(apic); + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + kvm_apic_set_ldr(apic, ldr); + 
kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); + } else + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); } + apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 9a8ee22bc7a3..22a5397b638c 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -126,4 +126,9 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); } +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d29d3cd1c156..38407e9fd1bd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5ad7c8531083..3ce8a1629330 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -643,6 +643,8 @@ static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; +static unsigned long *vmx_msr_bitmap_legacy_x2apic; +static unsigned long *vmx_msr_bitmap_longmode_x2apic; static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -767,6 +769,12 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + static inline bool cpu_has_vmx_apic_register_virt(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -1830,6 +1838,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) vmx->guest_msrs[from] = tmp; } +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) +{ + unsigned long *msr_bitmap; + + if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + } else { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + } + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); +} + /* * Set up the vmcs to automatically save and restore system * msrs. 
Don't touch the 64-bit msrs if the guest is in legacy @@ -1838,7 +1865,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs, index; - unsigned long *msr_bitmap; save_nmsrs = 0; #ifdef CONFIG_X86_64 @@ -1870,14 +1896,8 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx->save_nmsrs = save_nmsrs; - if (cpu_has_vmx_msr_bitmap()) { - if (is_long_mode(&vmx->vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode; - else - msr_bitmap = vmx_msr_bitmap_legacy; - - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); - } + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(&vmx->vcpu); } /* @@ -2543,6 +2563,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min2 = 0; opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | @@ -2564,7 +2585,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) _cpu_based_2nd_exec_control &= ~( - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT @@ -3725,7 +3747,10 @@ static void free_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) { int f = sizeof(unsigned long); @@ -3738,20 +3763,93 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ if (msr <= 0x1fff) { - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ + if (type & MSR_TYPE_R) + /* read-low */ + __clear_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ + if (type & MSR_TYPE_R) + /* read-high */ + __clear_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); + + } +} + +static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
+ */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R) + /* read-low */ + __set_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __set_bit(msr, msr_bitmap + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R) + /* read-high */ + __set_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __set_bit(msr, msr_bitmap + 0xc00 / f); + } } static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, + msr, MSR_TYPE_R | MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, + msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void vmx_enable_intercept_msr_read_x2apic(u32 msr) +{ + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); +} + +static void vmx_disable_intercept_msr_read_x2apic(u32 msr) +{ + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); +} + +static void vmx_disable_intercept_msr_write_x2apic(u32 msr) +{ + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_W); } /* @@ -3849,6 +3947,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (!enable_apicv_reg || !irqchip_in_kernel(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT; + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; return exec_control; } @@ -6101,6 +6200,34 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ + u32 sec_exec_control; + + /* + * There is not point to enable virtualize x2apic without enable + * apicv + */ + if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg) + return; + + if (!vm_need_tpr_shadow(vcpu->kvm)) + return; + + sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + if (set) { + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + } else { + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + + vmx_set_msr_bitmap(vcpu); +} + static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7364,6 +7491,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -7396,7 +7524,7 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - int r, i; + int r, i, msr; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7417,11 +7545,19 @@ static int __init vmx_init(void) if (!vmx_msr_bitmap_legacy) goto out1; + vmx_msr_bitmap_legacy_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if 
(!vmx_msr_bitmap_legacy_x2apic) + goto out2; vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode) - goto out2; + goto out3; + vmx_msr_bitmap_longmode_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; /* * Allow direct access to the PC debug port (it is often used for I/O @@ -7453,6 +7589,24 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv_reg) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + } if (enable_ept) { kvm_mmu_set_mask_ptes(0ull, @@ -7466,8 +7620,10 @@ static int __init vmx_init(void) return 0; -out3: +out4: free_page((unsigned long)vmx_msr_bitmap_longmode); +out3: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); out2: free_page((unsigned long)vmx_msr_bitmap_legacy); out1: @@ -7479,6 +7635,8 @@ out: static void __exit vmx_exit(void) { + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); free_page((unsigned long)vmx_msr_bitmap_legacy); free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); From c7c9c56ca26f7b9458711b2d78b60b60e0d38ba7 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Fri, 25 Jan 2013 10:18:51 +0800 Subject: [PATCH 06/22] x86, apicv: add virtual interrupt delivery support Virtual interrupt delivery avoids KVM to inject vAPIC interrupts manually, which is fully taken care of by the hardware. This needs some special awareness into existing interrupr injection path: - for pending interrupt, instead of direct injection, we may need update architecture specific indicators before resuming to guest. - A pending interrupt, which is masked by ISR, should be also considered in above update action, since hardware will decide when to inject it at right time. Current has_interrupt and get_interrupt only returns a valid vector from injection p.o.v. 
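As a rough illustration of the "update architecture specific indicators" step: instead of injecting the vector, KVM mirrors the highest pending IRR vector into the RVI byte of GUEST_INTR_STATUS and lets the CPU deliver it once the priority allows. The sketch below just restates the vmx_set_rvi()/vmx_hwapic_irr_update() helpers added later in this patch; the wrapper name is illustrative, not part of the patch.

    static void example_update_rvi(struct kvm_vcpu *vcpu)
    {
            int max_irr = kvm_lapic_find_highest_irr(vcpu); /* -1 if no IRR bit set */
            u16 status;

            if (max_irr == -1)
                    return;

            /* low byte of GUEST_INTR_STATUS holds RVI (requesting virtual interrupt) */
            status = vmcs_read16(GUEST_INTR_STATUS);
            if ((u8)status != (u8)max_irr) {
                    status = (status & ~0xff) | (u8)max_irr;
                    vmcs_write16(GUEST_INTR_STATUS, status);
            }
    }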
Reviewed-by: Marcelo Tosatti Signed-off-by: Kevin Tian Signed-off-by: Yang Zhang Signed-off-by: Gleb Natapov --- arch/ia64/kvm/lapic.h | 6 ++ arch/x86/include/asm/kvm_host.h | 5 ++ arch/x86/include/asm/vmx.h | 11 +++ arch/x86/kvm/irq.c | 56 +++++++++++++-- arch/x86/kvm/lapic.c | 106 ++++++++++++++++++++++------ arch/x86/kvm/lapic.h | 27 ++++++++ arch/x86/kvm/svm.c | 18 +++++ arch/x86/kvm/vmx.c | 119 +++++++++++++++++++++++++++++--- arch/x86/kvm/x86.c | 23 +++++- include/linux/kvm_host.h | 3 + virt/kvm/ioapic.c | 39 +++++++++++ virt/kvm/ioapic.h | 4 ++ virt/kvm/irq_comm.c | 25 +++++++ virt/kvm/kvm_main.c | 5 ++ 14 files changed, 407 insertions(+), 40 deletions(-) diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h index c5f92a926a9a..c3e2935b6db4 100644 --- a/arch/ia64/kvm/lapic.h +++ b/arch/ia64/kvm/lapic.h @@ -27,4 +27,10 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); #define kvm_apic_present(x) (true) #define kvm_lapic_enabled(x) (true) +static inline bool kvm_apic_vid_enabled(void) +{ + /* IA64 has no apicv supporting, do nothing here */ + return false; +} + #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d42c2839be98..635a74d22409 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -699,6 +699,10 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + int (*vm_has_apicv)(struct kvm *kvm); + void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); + void (*hwapic_isr_update)(struct kvm *kvm, int isr); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); @@ -994,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0a54df0b36fc..694586ca6456 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -62,6 +62,7 @@ #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 @@ -144,6 +145,7 @@ #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 @@ -181,6 +183,7 @@ enum vmcs_field { GUEST_GS_SELECTOR = 0x0000080a, GUEST_LDTR_SELECTOR = 0x0000080c, GUEST_TR_SELECTOR = 0x0000080e, + GUEST_INTR_STATUS = 0x00000810, HOST_ES_SELECTOR = 0x00000c00, HOST_CS_SELECTOR = 0x00000c02, HOST_SS_SELECTOR = 0x00000c04, @@ -208,6 +211,14 @@ enum vmcs_field { APIC_ACCESS_ADDR_HIGH = 0x00002015, EPT_POINTER = 0x0000201a, EPT_POINTER_HIGH = 0x0000201b, + 
EOI_EXIT_BITMAP0 = 0x0000201c, + EOI_EXIT_BITMAP0_HIGH = 0x0000201d, + EOI_EXIT_BITMAP1 = 0x0000201e, + EOI_EXIT_BITMAP1_HIGH = 0x0000201f, + EOI_EXIT_BITMAP2 = 0x00002020, + EOI_EXIT_BITMAP2_HIGH = 0x00002021, + EOI_EXIT_BITMAP3 = 0x00002022, + EOI_EXIT_BITMAP3_HIGH = 0x00002023, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b111aee815f8..484bc874688b 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -37,6 +37,38 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); +/* + * check if there is pending interrupt from + * non-APIC source without intack. + */ +static int kvm_cpu_has_extint(struct kvm_vcpu *v) +{ + if (kvm_apic_accept_pic_intr(v)) + return pic_irqchip(v->kvm)->output; /* PIC */ + else + return 0; +} + +/* + * check if there is injectable interrupt: + * when virtual interrupt delivery enabled, + * interrupt from apic will handled by hardware, + * we don't need to check it here. + */ +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) +{ + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.pending; + + if (kvm_cpu_has_extint(v)) + return 1; + + if (kvm_apic_vid_enabled(v->kvm)) + return 0; + + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ +} + /* * check if there is pending interrupt without * intack. @@ -46,27 +78,41 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.pending; - if (kvm_apic_accept_pic_intr(v) && pic_irqchip(v->kvm)->output) - return pic_irqchip(v->kvm)->output; /* PIC */ + if (kvm_cpu_has_extint(v)) + return 1; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); +/* + * Read pending interrupt(from non-APIC source) + * vector and intack. + */ +static int kvm_cpu_get_extint(struct kvm_vcpu *v) +{ + if (kvm_cpu_has_extint(v)) + return kvm_pic_read_irq(v->kvm); /* PIC */ + return -1; +} + /* * Read pending interrupt vector and intack. 
*/ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { + int vector; + if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.nr; - if (kvm_apic_accept_pic_intr(v) && pic_irqchip(v->kvm)->output) - return kvm_pic_read_irq(v->kvm); /* PIC */ + vector = kvm_cpu_get_extint(v); + + if (kvm_apic_vid_enabled(v->kvm) || vector != -1) + return vector; /* PIC */ return kvm_get_apic_interrupt(v); /* APIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f69fc5077a89..02b51dd4e4ad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -145,21 +145,51 @@ static inline int kvm_apic_id(struct kvm_lapic *apic) return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } -static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, + struct kvm_lapic_irq *irq, + u64 *eoi_exit_bitmap) { - u16 cid; - ldr >>= 32 - map->ldr_bits; - cid = (ldr >> map->cid_shift) & map->cid_mask; + struct kvm_lapic **dst; + struct kvm_apic_map *map; + unsigned long bitmap = 1; + int i; - BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + rcu_read_lock(); + map = rcu_dereference(vcpu->kvm->arch.apic_map); - return cid; -} + if (unlikely(!map)) { + __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap); + goto out; + } -static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) -{ - ldr >>= (32 - map->ldr_bits); - return ldr & map->lid_mask; + if (irq->dest_mode == 0) { /* physical mode */ + if (irq->delivery_mode == APIC_DM_LOWEST || + irq->dest_id == 0xff) { + __set_bit(irq->vector, + (unsigned long *)eoi_exit_bitmap); + goto out; + } + dst = &map->phys_map[irq->dest_id & 0xff]; + } else { + u32 mda = irq->dest_id << (32 - map->ldr_bits); + + dst = map->logical_map[apic_cluster_id(map, mda)]; + + bitmap = apic_logical_id(map, mda); + } + + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (dst[i]->vcpu == vcpu) { + __set_bit(irq->vector, + (unsigned long *)eoi_exit_bitmap); + break; + } + } + +out: + rcu_read_unlock(); } static void recalculate_apic_map(struct kvm *kvm) @@ -225,6 +255,8 @@ out: if (old) kfree_rcu(old, rcu); + + kvm_ioapic_make_eoibitmap_request(kvm); } static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) @@ -340,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; + /* + * Note that irr_pending is just a hint. It will be always + * true with virtual interrupt delivery enabled. 
+ */ if (!apic->irr_pending) return -1; @@ -456,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; + + /* Note that isr_count is always 1 with vid enabled */ if (!apic->isr_count) return -1; if (likely(apic->highest_isr_cache != -1)) @@ -735,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) +{ + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + int trigger_mode; + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + } +} + static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); @@ -751,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && - kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - } + kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); return vector; } +/* + * this interface assumes a trap-like exit, which has already finished + * desired side effect including vISR and vPPR update. + */ +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + trace_kvm_eoi(apic, vector); + + kvm_ioapic_send_eoi(apic, vector); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} +EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); + static void apic_send_ipi(struct kvm_lapic *apic) { u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); @@ -1375,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = false; - apic->isr_count = 0; + apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm); apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -1591,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; - apic->isr_count = count_vectors(apic->regs + APIC_ISR); + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? 
+ 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 22a5397b638c..1676d34ddb4e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -65,6 +65,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); @@ -131,4 +132,30 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } +static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +{ + return kvm_x86_ops->vm_has_apicv(kvm); +} + +static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +{ + u16 cid; + ldr >>= 32 - map->ldr_bits; + cid = (ldr >> map->cid_shift) & map->cid_mask; + + BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + + return cid; +} + +static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) +{ + ldr >>= (32 - map->ldr_bits); + return ldr & map->lid_mask; +} + +void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, + struct kvm_lapic_irq *irq, + u64 *eoi_bitmap); + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 38407e9fd1bd..e1b1ce21bc00 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3576,6 +3576,21 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } +static int svm_vm_has_apicv(struct kvm *kvm) +{ + return 0; +} + +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + return; +} + +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4296,6 +4311,9 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, + .vm_has_apicv = svm_vm_has_apicv, + .load_eoi_exitmap = svm_load_eoi_exitmap, + .hwapic_isr_update = svm_hwapic_isr_update, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3ce8a1629330..0cf74a641dec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,8 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -static bool __read_mostly enable_apicv_reg = 1; -module_param(enable_apicv_reg, bool, S_IRUGO); +static bool __read_mostly enable_apicv_reg_vid = 1; +module_param(enable_apicv_reg_vid, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use @@ -781,6 +781,12 @@ static inline bool cpu_has_vmx_apic_register_virt(void) SECONDARY_EXEC_APIC_REGISTER_VIRT; } +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -2571,7 +2577,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | - 
SECONDARY_EXEC_APIC_REGISTER_VIRT; + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2586,7 +2593,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) _cpu_based_2nd_exec_control &= ~( SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT @@ -2785,8 +2793,14 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; - if (!cpu_has_vmx_apic_register_virt()) - enable_apicv_reg = 0; + if (!cpu_has_vmx_apic_register_virt() || + !cpu_has_vmx_virtual_intr_delivery()) + enable_apicv_reg_vid = 0; + + if (enable_apicv_reg_vid) + kvm_x86_ops->update_cr8_intercept = NULL; + else + kvm_x86_ops->hwapic_irr_update = NULL; if (nested) nested_vmx_setup_ctls_msrs(); @@ -3928,6 +3942,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static int vmx_vm_has_apicv(struct kvm *kvm) +{ + return enable_apicv_reg_vid && irqchip_in_kernel(kvm); +} + static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -3945,8 +3964,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!enable_apicv_reg || !irqchip_in_kernel(vmx->vcpu.kvm)) - exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT; + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; return exec_control; } @@ -3992,6 +4012,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } + if (enable_apicv_reg_vid) { + vmcs_write64(EOI_EXIT_BITMAP0, 0); + vmcs_write64(EOI_EXIT_BITMAP1, 0); + vmcs_write64(EOI_EXIT_BITMAP2, 0); + vmcs_write64(EOI_EXIT_BITMAP3, 0); + + vmcs_write16(GUEST_INTR_STATUS, 0); + } + if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); vmcs_write32(PLE_WINDOW, ple_window); @@ -4906,6 +4935,16 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return emulate_instruction(vcpu, 0) == EMULATE_DONE; } +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int vector = exit_qualification & 0xff; + + /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_set_eoi_accelerated(vcpu, vector); + return 1; +} + static int handle_apic_write(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -5851,6 +5890,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_APIC_WRITE] = handle_apic_write, + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, @@ -6208,7 +6248,8 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * There is not point to enable virtualize x2apic without enable * apicv */ 
- if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg) + if (!cpu_has_vmx_virtualize_x2apic_mode() || + !vmx_vm_has_apicv(vcpu->kvm)) return; if (!vm_need_tpr_shadow(vcpu->kvm)) @@ -6228,6 +6269,56 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) vmx_set_msr_bitmap(vcpu); } +static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) +{ + u16 status; + u8 old; + + if (!vmx_vm_has_apicv(kvm)) + return; + + if (isr == -1) + isr = 0; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = status >> 8; + if (isr != old) { + status &= 0xff; + status |= isr << 8; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_set_rvi(int vector) +{ + u16 status; + u8 old; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = (u8)status & 0xff; + if ((u8)vector != old) { + status &= ~0xff; + status |= (u8)vector; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ + if (max_irr == -1) + return; + + vmx_set_rvi(max_irr); +} + +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); +} + static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7492,6 +7583,10 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .vm_has_apicv = vmx_vm_has_apicv, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -7594,7 +7689,7 @@ static int __init vmx_init(void) memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); - if (enable_apicv_reg) { + if (enable_apicv_reg_vid) { for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); @@ -7606,6 +7701,10 @@ static int __init vmx_init(void) vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR */ vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); } if (enable_ept) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b9f55299ed7e..cf512e70c797 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5565,7 +5565,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); } - } else if (kvm_cpu_has_interrupt(vcpu)) { + } else if (kvm_cpu_has_injectable_intr(vcpu)) { if (kvm_x86_ops->interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); @@ -5633,6 +5633,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) #endif } +static void update_eoi_exitmap(struct kvm_vcpu *vcpu) +{ + u64 eoi_exit_bitmap[4]; + + memset(eoi_exit_bitmap, 0, 32); + + kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -5686,6 +5696,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_handle_pmu_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) + 
update_eoi_exitmap(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -5694,10 +5706,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* enable NMI/IRQ window open exits if needed */ if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4dd7d7531e69..0350e0d5e031 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -123,6 +123,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MASTERCLOCK_UPDATE 19 #define KVM_REQ_MCLOCK_INPROGRESS 20 #define KVM_REQ_EPR_EXIT 21 +#define KVM_REQ_EOIBITMAP 22 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -538,6 +539,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); void kvm_make_mclock_inprogress_request(struct kvm *kvm); +void kvm_make_update_eoibitmap_request(struct kvm *kvm); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -691,6 +693,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level); +bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index f3abbef46c42..ce82b9401958 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -115,6 +116,42 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic) smp_wmb(); } +void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, + u64 *eoi_exit_bitmap) +{ + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + union kvm_ioapic_redirect_entry *e; + struct kvm_lapic_irq irqe; + int index; + + spin_lock(&ioapic->lock); + /* traverse ioapic entry to set eoi exit bitmap*/ + for (index = 0; index < IOAPIC_NUM_PINS; index++) { + e = &ioapic->redirtbl[index]; + if (!e->fields.mask && + (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || + kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, + index))) { + irqe.dest_id = e->fields.dest_id; + irqe.vector = e->fields.vector; + irqe.dest_mode = e->fields.dest_mode; + irqe.delivery_mode = e->fields.delivery_mode << 8; + kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap); + } + } + spin_unlock(&ioapic->lock); +} +EXPORT_SYMBOL_GPL(kvm_ioapic_calculate_eoi_exitmap); + +void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + if (!kvm_apic_vid_enabled(kvm) || !ioapic) + return; + kvm_make_update_eoibitmap_request(kvm); +} + static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; @@ -156,6 +193,7 
@@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index); + kvm_ioapic_make_eoibitmap_request(ioapic->kvm); break; } } @@ -455,6 +493,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); update_handled_vectors(ioapic); + kvm_ioapic_make_eoibitmap_request(kvm); spin_unlock(&ioapic->lock); return 0; } diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index a30abfe6ed16..0400a466c50c 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -82,5 +82,9 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm); +void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, + u64 *eoi_exit_bitmap); + #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 656fa455e154..ff6d40e2c06d 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -237,6 +238,28 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) return ret; } +bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + struct kvm_irq_ack_notifier *kian; + struct hlist_node *n; + int gsi; + + rcu_read_lock(); + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) { + rcu_read_unlock(); + return true; + } + + rcu_read_unlock(); + + return false; +} +EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); + void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_ack_notifier *kian; @@ -261,6 +284,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); mutex_unlock(&kvm->irq_lock); + kvm_ioapic_make_eoibitmap_request(kvm); } void kvm_unregister_irq_ack_notifier(struct kvm *kvm, @@ -270,6 +294,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); synchronize_rcu(); + kvm_ioapic_make_eoibitmap_request(kvm); } int kvm_request_irq_source_id(struct kvm *kvm) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3fec2cdd951b..abc23e27173d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -217,6 +217,11 @@ void kvm_make_mclock_inprogress_request(struct kvm *kvm) make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } +void kvm_make_update_eoibitmap_request(struct kvm *kvm) +{ + make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP); +} + int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { struct page *page; From 7b270f609982f68f2433442bf167f735e7364b06 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Jan 2013 13:09:13 +0530 Subject: [PATCH 07/22] sched: Bail out of yield_to when source and target runqueue has one task In case of undercomitted scenarios, especially in large guests yield_to overhead is significantly high. when run queue length of source and target is one, take an opportunity to bail out and return -ESRCH. This return condition can be further exploited to quickly come out of PLE handler. 
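The resulting contract is three-valued: a positive return means the target task was boosted, zero means the yield attempt failed, and -ESRCH means neither runqueue had anything else to run. A caller that probes several candidates, such as the KVM PLE handler reworked in the next patch, can treat repeated -ESRCH results as a sign of undercommit and stop spinning early. A self-contained sketch of that caller-side pattern (loop and names are illustrative, not the kernel code):

    #include <stdbool.h>

    extern int try_yield_to(int candidate); /* stand-in for kvm_vcpu_yield_to()/yield_to() */

    static bool boost_one_of(int nr_candidates)
    {
            int tries = 3;  /* give up after a few -ESRCH results */
            int i, ret;

            for (i = 0; i < nr_candidates && tries; i++) {
                    ret = try_yield_to(i);
                    if (ret > 0)
                            return true;    /* boosted somebody, done */
                    if (ret < 0)
                            tries--;        /* both rqs had one task; likely undercommitted */
                    /* ret == 0: candidate not suitable, try the next one */
            }
            return false;
    }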
(History: Raghavendra initially worked on break out of kvm ple handler upon seeing source runqueue length = 1, but it had to export rq length). Peter came up with the elegant idea of return -ESRCH in scheduler core. Signed-off-by: Peter Zijlstra Raghavendra, Checking the rq length of target vcpu condition added.(thanks Avi) Reviewed-by: Srikar Dronamraju Signed-off-by: Raghavendra K T Acked-by: Andrew Jones Tested-by: Chegu Vinod Signed-off-by: Gleb Natapov --- kernel/sched/core.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0533496b6228..01edad9b5d71 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4316,7 +4316,10 @@ EXPORT_SYMBOL(yield); * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. * - * Returns true if we indeed boosted the target task. + * Returns: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. */ bool __sched yield_to(struct task_struct *p, bool preempt) { @@ -4330,6 +4333,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt) again: p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) { + yielded = -ESRCH; + goto out_irq; + } + double_rq_lock(rq, p_rq); while (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); @@ -4337,13 +4349,13 @@ again: } if (!curr->sched_class->yield_to_task) - goto out; + goto out_unlock; if (curr->sched_class != p->sched_class) - goto out; + goto out_unlock; if (task_running(p_rq, p) || p->state) - goto out; + goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { @@ -4356,11 +4368,12 @@ again: resched_task(p_rq->curr); } -out: +out_unlock: double_rq_unlock(rq, p_rq); +out_irq: local_irq_restore(flags); - if (yielded) + if (yielded > 0) schedule(); return yielded; From c45c528e899094b9049b3c900e2cf1f00aa0490c Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Tue, 22 Jan 2013 13:09:24 +0530 Subject: [PATCH 08/22] kvm: Handle yield_to failure return code for potential undercommit case yield_to returns -ESRCH, When source and target of yield_to run queue length is one. When we see three successive failures of yield_to we assume we are in potential undercommit case and abort from PLE handler. The assumption is backed by low probability of wrong decision for even worst case scenarios such as average runqueue length between 1 and 2. More detail on rationale behind using three tries: if p is the probability of finding rq length one on a particular cpu, and if we do n tries, then probability of exiting ple handler is: p^(n+1) [ because we would have come across one source with rq length 1 and n target cpu rqs with length 1 ] so num tries: probability of aborting ple handler (1.5x overcommit) 1 1/4 2 1/8 3 1/16 We can increase this probability with more tries, but the problem is the overhead. Also, If we have tried three times that means we would have iterated over 3 good eligible vcpus along with many non-eligible candidates. In worst case if we iterate all the vcpus, we reduce 1x performance and overcommit performance get hit. note that we do not update last boosted vcpu in failure cases. Thank Avi for raising question on aborting after first fail from yield_to. 
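The table above corresponds to p = 1/2, i.e. even odds that a given runqueue holds exactly one task at 1.5x overcommit. A throwaway check of the p^(n+1) claim (purely illustrative):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            double p = 0.5; /* assumed P(rq length == 1) at 1.5x overcommit */
            int n;

            for (n = 1; n <= 3; n++)
                    printf("tries=%d  P(abort PLE handler)=%g\n", n, pow(p, n + 1));
            return 0;       /* prints 0.25, 0.125, 0.0625 -- the 1/4, 1/8, 1/16 above */
    }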
Reviewed-by: Srikar Dronamraju Signed-off-by: Raghavendra K T Tested-by: Chegu Vinod Signed-off-by: Gleb Natapov --- virt/kvm/kvm_main.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index abc23e27173d..a83ca63d26fc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1694,6 +1694,7 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target) { struct pid *pid; struct task_struct *task = NULL; + bool ret = false; rcu_read_lock(); pid = rcu_dereference(target->pid); @@ -1701,17 +1702,15 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target) task = get_pid_task(target->pid, PIDTYPE_PID); rcu_read_unlock(); if (!task) - return false; + return ret; if (task->flags & PF_VCPU) { put_task_struct(task); - return false; - } - if (yield_to(task, 1)) { - put_task_struct(task); - return true; + return ret; } + ret = yield_to(task, 1); put_task_struct(task); - return false; + + return ret; } EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); @@ -1752,12 +1751,14 @@ bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) return eligible; } #endif + void kvm_vcpu_on_spin(struct kvm_vcpu *me) { struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; int last_boosted_vcpu = me->kvm->last_boosted_vcpu; int yielded = 0; + int try = 3; int pass; int i; @@ -1769,7 +1770,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) * VCPU is holding the lock that we need and will release it. * We approximate round-robin by starting at the last boosted VCPU. */ - for (pass = 0; pass < 2 && !yielded; pass++) { + for (pass = 0; pass < 2 && !yielded && try; pass++) { kvm_for_each_vcpu(i, vcpu, kvm) { if (!pass && i <= last_boosted_vcpu) { i = last_boosted_vcpu; @@ -1782,10 +1783,15 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; - if (kvm_vcpu_yield_to(vcpu)) { + + yielded = kvm_vcpu_yield_to(vcpu); + if (yielded > 0) { kvm->last_boosted_vcpu = i; - yielded = 1; break; + } else if (yielded < 0) { + try--; + if (!try) + break; } } } From 15bc8d8457875f495c59d933b05770ba88d1eacb Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 25 Jan 2013 15:34:15 +0100 Subject: [PATCH 09/22] s390/kvm: Fix store status for ACRS/FPRS On store status we need to copy the current state of registers into a save area. Currently we might save stale versions: the SIE state descriptor doesn't have fields for the guest ACRS/FPRS; those registers are simply stored in the host registers. The host program must copy these away if needed. We do that in vcpu_put/load. If we now do a store status in KVM code between vcpu_put/load, the saved values are not up-to-date. Let's collect the ACRS/FPRS before saving them. This also fixes some strange problems with hotplug and virtio-ccw, since the low level machine check handler (on hotplug a machine check will happen) will revalidate all registers with the content of the save area. Signed-off-by: Christian Borntraeger CC: stable@vger.kernel.org Signed-off-by: Gleb Natapov --- arch/s390/kvm/kvm-s390.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5b01f0953900..4377d1886631 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -770,6 +770,14 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) } else prefix = 0; + /* + * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy + * copying in vcpu load/put.
Lets update our copies before we save + * it into the save area + */ + save_fp_regs(&vcpu->arch.guest_fpregs); + save_access_regs(vcpu->run->s.regs.acrs); + if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs), vcpu->arch.guest_fpregs.fprs, 128, prefix)) return -EFAULT; From c98d3683ce675d689121147ed00e7c4af4737518 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 25 Jan 2013 15:34:16 +0100 Subject: [PATCH 10/22] s390/virtio-ccw: Fix setup_vq error handling. virtio_ccw_setup_vq() failed to unwind correctly on errors. In particular, it failed to delete the virtqueue on errors, leading to list corruption when virtio_ccw_del_vqs() iterated over a virtqueue that had not been added to the vcdev's list. Fix this with redoing the error unwinding in virtio_ccw_setup_vq(), using a single path for all errors. Signed-off-by: Cornelia Huck Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger Signed-off-by: Gleb Natapov --- drivers/s390/kvm/virtio_ccw.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 2edd94af131c..3217dfe5cb8b 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -244,9 +244,9 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, { struct virtio_ccw_device *vcdev = to_vc_device(vdev); int err; - struct virtqueue *vq; + struct virtqueue *vq = NULL; struct virtio_ccw_vq_info *info; - unsigned long size; + unsigned long size = 0; /* silence the compiler */ unsigned long flags; /* Allocate queue. */ @@ -279,11 +279,8 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, /* For now, we fail if we can't get the requested size. */ dev_warn(&vcdev->cdev->dev, "no vq\n"); err = -ENOMEM; - free_pages_exact(info->queue, size); goto out_err; } - info->vq = vq; - vq->priv = info; /* Register it with the host. */ info->info_block->queue = (__u64)info->queue; @@ -297,12 +294,12 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, err = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_VQ | i); if (err) { dev_warn(&vcdev->cdev->dev, "SET_VQ failed\n"); - free_pages_exact(info->queue, size); - info->vq = NULL; - vq->priv = NULL; goto out_err; } + info->vq = vq; + vq->priv = info; + /* Save it to our list. */ spin_lock_irqsave(&vcdev->lock, flags); list_add(&info->node, &vcdev->virtqueues); @@ -311,8 +308,13 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, return vq; out_err: - if (info) + if (vq) + vring_del_virtqueue(vq); + if (info) { + if (info->queue) + free_pages_exact(info->queue, size); kfree(info->info_block); + } kfree(info); return ERR_PTR(err); } From 0c29b2293b25527a1759cbd22b106f19258636b6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 25 Jan 2013 15:34:17 +0100 Subject: [PATCH 11/22] s390/kvm: Fix instruction decoding Instructions with long displacement have a signed displacement. Currently the sign bit is interpreted as 2^20: Lets fix it by doing the sign extension from 20bit to 32bit and then use it as a signed variable in the addition (see kvm_s390_get_base_disp_rsy). Furthermore, there are lots of "int" in that code. This is problematic, because shifting on a signed integer is undefined/implementation defined if the bit value happens to be negative. Fortunately the promotion rules will make the right hand side unsigned anyway, so there is no real problem right now. 
Let's convert them anyway to unsigned where appropriate to avoid problems if the code is changed or copy/pasted later on. Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/kvm-s390.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3e05deff21b6..4d89d64a8161 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -67,8 +67,8 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu) { - int base2 = vcpu->arch.sie_block->ipb >> 28; - int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); + u32 base2 = vcpu->arch.sie_block->ipb >> 28; + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; } @@ -76,10 +76,10 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu) static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, u64 *address1, u64 *address2) { - int base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; - int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; - int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12; - int disp2 = vcpu->arch.sie_block->ipb & 0x0fff; + u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; + u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; + u32 base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12; + u32 disp2 = vcpu->arch.sie_block->ipb & 0x0fff; *address1 = (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1; *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; @@ -87,17 +87,20 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu) { - int base2 = vcpu->arch.sie_block->ipb >> 28; - int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + + u32 base2 = vcpu->arch.sie_block->ipb >> 28; + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + ((vcpu->arch.sie_block->ipb & 0xff00) << 4); + /* The displacement is a 20bit _SIGNED_ value */ + if (disp2 & 0x80000) + disp2+=0xfff00000; - return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; + return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2; } static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu) { - int base2 = vcpu->arch.sie_block->ipb >> 28; - int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); + u32 base2 = vcpu->arch.sie_block->ipb >> 28; + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; } From f64c0398939483eb1d8951f24fbc21e94ed54457 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 29 Jan 2013 11:00:07 +0900 Subject: [PATCH 12/22] KVM: set_memory_region: Identify the requested change explicitly KVM_SET_USER_MEMORY_REGION forces __kvm_set_memory_region() to identify what kind of change is being requested by checking the arguments. The current code does this checking at various points in code and each condition being used there is not easy to understand at first glance. This patch consolidates these checks and introduces an enum to name the possible changes to clean up the code. Although this does not introduce any functional changes, there is one change which optimizes the code a bit: if we have nothing to change, the new code returns 0 immediately. 
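The four cases map one-to-one onto how userspace drives KVM_SET_USER_MEMORY_REGION; a rough sketch of the caller's side (illustrative only: the vm_fd, slot numbers, addresses and sizes are placeholders, not taken from this series):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative helper: issue one KVM_SET_USER_MEMORY_REGION call. */
static int set_slot(int vm_fd, __u32 slot, __u64 gpa, __u64 size,
		    __u64 hva, __u32 flags)
{
	struct kvm_userspace_memory_region region = {
		.slot = slot,
		.flags = flags,
		.guest_phys_addr = gpa,
		.memory_size = size,
		.userspace_addr = hva,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}

/*
 * KVM_MR_CREATE:     slot did not exist before, memory_size > 0
 * KVM_MR_MOVE:       same slot, size and userspace_addr, new guest_phys_addr
 * KVM_MR_FLAGS_ONLY: same slot, only flags change (e.g. KVM_MEM_LOG_DIRTY_PAGES)
 * KVM_MR_DELETE:     same slot, memory_size == 0
 */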
Note that the return value for this case cannot be changed since QEMU relies on it: we noticed this when we changed it to -EINVAL and got a section mismatch error at the final stage of live migration. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 64 +++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a83ca63d26fc..64c5dc37c6a1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -718,6 +718,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, return old_memslots; } +/* + * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: + * - create a new memory slot + * - delete an existing memory slot + * - modify an existing memory slot + * -- move it in the guest physical memory space + * -- just change its flags + * + * Since flags can be changed by some of these operations, the following + * differentiation is the best we can do for __kvm_set_memory_region(): + */ +enum kvm_mr_change { + KVM_MR_CREATE, + KVM_MR_DELETE, + KVM_MR_MOVE, + KVM_MR_FLAGS_ONLY, +}; + /* * Allocate some memory and give it an address in the guest physical address * space. @@ -737,6 +755,7 @@ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_memory_slot old, new; struct kvm_memslots *slots = NULL, *old_memslots; bool old_iommu_mapped; + enum kvm_mr_change change; r = check_memory_region_flags(mem); if (r) @@ -780,17 +799,30 @@ int __kvm_set_memory_region(struct kvm *kvm, old_iommu_mapped = old.npages; - /* - * Disallow changing a memory slot's size or changing anything about - * zero sized slots that doesn't involve making them non-zero. - */ r = -EINVAL; - if (npages && old.npages && npages != old.npages) - goto out; - if (!npages && !old.npages) + if (npages) { + if (!old.npages) + change = KVM_MR_CREATE; + else { /* Modify an existing slot. */ + if ((mem->userspace_addr != old.userspace_addr) || + (npages != old.npages)) + goto out; + + if (base_gfn != old.base_gfn) + change = KVM_MR_MOVE; + else if (new.flags != old.flags) + change = KVM_MR_FLAGS_ONLY; + else { /* Nothing to change. */ + r = 0; + goto out; + } + } + } else if (old.npages) { + change = KVM_MR_DELETE; + } else /* Modify a non-existent slot: disallowed. */ goto out; - if ((npages && !old.npages) || (base_gfn != old.base_gfn)) { + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { /* Check for overlaps */ r = -EEXIST; kvm_for_each_memslot(slot, kvm->memslots) { @@ -808,20 +840,12 @@ int __kvm_set_memory_region(struct kvm *kvm, new.dirty_bitmap = NULL; r = -ENOMEM; - - /* - * Allocate if a slot is being created. If modifying a slot, - * the userspace_addr cannot change. 
- */ - if (!old.npages) { + if (change == KVM_MR_CREATE) { new.user_alloc = user_alloc; new.userspace_addr = mem->userspace_addr; if (kvm_arch_create_memslot(&new, npages)) goto out_free; - } else if (npages && mem->userspace_addr != old.userspace_addr) { - r = -EINVAL; - goto out_free; } /* Allocate page dirty bitmap if needed */ @@ -830,7 +854,7 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - if (!npages || base_gfn != old.base_gfn) { + if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { r = -ENOMEM; slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), GFP_KERNEL); @@ -881,7 +905,7 @@ int __kvm_set_memory_region(struct kvm *kvm, * slots (size changes, userspace addr changes) is disallowed above, * so any other attribute changes getting here can be skipped. */ - if (npages) { + if (change != KVM_MR_DELETE) { if (old_iommu_mapped && ((new.flags ^ old.flags) & KVM_MEM_READONLY)) { kvm_iommu_unmap_pages(kvm, &old); @@ -896,7 +920,7 @@ int __kvm_set_memory_region(struct kvm *kvm, } /* actual memory is freed via old in kvm_free_physmem_slot below */ - if (!npages) { + if (change == KVM_MR_DELETE) { new.dirty_bitmap = NULL; memset(&new.arch, 0, sizeof(new.arch)); } From 75d61fbcf563373696578570e914f555e12c8d97 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 30 Jan 2013 19:40:41 +0900 Subject: [PATCH 13/22] KVM: set_memory_region: Disallow changing read-only attribute later As Xiao pointed out, there are a few problems with it: - kvm_arch_commit_memory_region() write protects the memory slot only for GET_DIRTY_LOG when modifying the flags. - FNAME(sync_page) uses the old spte value to set a new one without checking KVM_MEM_READONLY flag. Since we flush all shadow pages when creating a new slot, the simplest fix is to disallow such problematic flag changes: this is safe because no one is doing such things. Reviewed-by: Gleb Natapov Signed-off-by: Takuya Yoshikawa Cc: Xiao Guangrong Cc: Alex Williamson Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 12 +++++------ virt/kvm/kvm_main.c | 35 +++++++++++-------------------- 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09905cbcbb0b..0e03b1968272 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -874,12 +874,12 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr be identical. This allows large pages in the guest to be backed by large pages in the host. -The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs -kvm to keep track of writes to memory within the slot. See KVM_GET_DIRTY_LOG -ioctl. The KVM_CAP_READONLY_MEM capability indicates the availability of the -KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM only -allows read accesses. Writes will be posted to userspace as KVM_EXIT_MMIO -exits. +The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and +KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of +writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to +use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, +to make a new slot read-only. In this case, writes to this memory will be +posted to userspace as KVM_EXIT_MMIO exits. When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of the memory region are automatically reflected into the guest. 
For example, an diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64c5dc37c6a1..2e93630b4add 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -754,7 +754,6 @@ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_memory_slot *slot; struct kvm_memory_slot old, new; struct kvm_memslots *slots = NULL, *old_memslots; - bool old_iommu_mapped; enum kvm_mr_change change; r = check_memory_region_flags(mem); @@ -797,15 +796,14 @@ int __kvm_set_memory_region(struct kvm *kvm, new.npages = npages; new.flags = mem->flags; - old_iommu_mapped = old.npages; - r = -EINVAL; if (npages) { if (!old.npages) change = KVM_MR_CREATE; else { /* Modify an existing slot. */ if ((mem->userspace_addr != old.userspace_addr) || - (npages != old.npages)) + (npages != old.npages) || + ((new.flags ^ old.flags) & KVM_MEM_READONLY)) goto out; if (base_gfn != old.base_gfn) @@ -867,7 +865,6 @@ int __kvm_set_memory_region(struct kvm *kvm, /* slot was deleted or moved, clear iommu mapping */ kvm_iommu_unmap_pages(kvm, &old); - old_iommu_mapped = false; /* From this point no new shadow pages pointing to a deleted, * or moved, memslot will be created. * @@ -898,25 +895,17 @@ int __kvm_set_memory_region(struct kvm *kvm, /* * IOMMU mapping: New slots need to be mapped. Old slots need to be - * un-mapped and re-mapped if their base changes or if flags that the - * iommu cares about change (read-only). Base change unmapping is - * handled above with slot deletion, so we only unmap incompatible - * flags here. Anything else the iommu might care about for existing - * slots (size changes, userspace addr changes) is disallowed above, - * so any other attribute changes getting here can be skipped. + * un-mapped and re-mapped if their base changes. Since base change + * unmapping is handled above with slot deletion, mapping alone is + * needed here. Anything else the iommu might care about for existing + * slots (size changes, userspace addr changes and read-only flag + * changes) is disallowed above, so any other attribute changes getting + * here can be skipped. */ - if (change != KVM_MR_DELETE) { - if (old_iommu_mapped && - ((new.flags ^ old.flags) & KVM_MEM_READONLY)) { - kvm_iommu_unmap_pages(kvm, &old); - old_iommu_mapped = false; - } - - if (!old_iommu_mapped) { - r = kvm_iommu_map_pages(kvm, &new); - if (r) - goto out_slots; - } + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { + r = kvm_iommu_map_pages(kvm, &new); + if (r) + goto out_slots; } /* actual memory is freed via old in kvm_free_physmem_slot below */ From feb3eb704a86d97edb296502e95da42d622dac61 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 30 Jan 2013 16:45:00 +0200 Subject: [PATCH 14/22] KVM: MMU: make spte_is_locklessly_modifiable() more clear spte_is_locklessly_modifiable() checks that both SPTE_HOST_WRITEABLE and SPTE_MMU_WRITEABLE are present on spte. Make it more explicit. 
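Both forms of the test accept exactly the same sptes; a small self-contained check (the bit positions below are placeholders, the real SPTE_HOST_WRITEABLE/SPTE_MMU_WRITEABLE definitions live in arch/x86/kvm/mmu.c) makes the identity easy to see:

#include <assert.h>
#include <stdint.h>

/* Placeholder bit positions; any two distinct spte bits work for the demo. */
#define SPTE_HOST_WRITEABLE	(1ULL << 52)
#define SPTE_MMU_WRITEABLE	(1ULL << 53)
#define WRITEABLE_MASK		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)

int main(void)
{
	uint64_t bits;

	/* Walk all four combinations of the two flags. */
	for (bits = 0; bits < 4; bits++) {
		uint64_t spte = bits << 52;
		int old_form = !(~spte & WRITEABLE_MASK);	/* original test */
		int new_form = (spte & WRITEABLE_MASK) == WRITEABLE_MASK;

		/* Both are true only when both flags are present. */
		assert(old_form == new_form);
	}

	return 0;
}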
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9f628f7a40b2..2fa82b04f14e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte) static bool spte_is_locklessly_modifiable(u64 spte) { - return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); } static bool spte_has_volatile_bits(u64 spte) From 9bb4f6b15ec038ab9afcf346aa6a590406ad6c17 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 30 Jan 2013 16:45:01 +0200 Subject: [PATCH 15/22] KVM: MMU: drop unneeded checks. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2fa82b04f14e..40737b38da19 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2328,9 +2328,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; - if (!need_unsync && !s->unsync) { + if (!s->unsync) need_unsync = true; - } } if (need_unsync) kvm_unsync_pages(vcpu, gfn); @@ -4008,7 +4007,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - if (!remote_flush && need_remote_flush(entry, *spte)) + if (need_remote_flush(entry, *spte)) remote_flush = true; ++spte; } From 2c9afa52ef081334925905d6370d36b6602c328c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 30 Jan 2013 16:45:02 +0200 Subject: [PATCH 16/22] KVM: MMU: set base_role.nxe during mmu initialization. Move base_role.nxe initialisation to where all other roles are initialized. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/x86.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 40737b38da19..8028ac65db18 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3687,6 +3687,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else r = paging32_init_context(vcpu, context); + vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.base_role.smep_andnot_wp diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf512e70c797..373e17a0d398 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -870,8 +870,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) kvm_mmu_reset_context(vcpu); From 116eb3d30e7e121bfc6117ac037833865dad4971 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 30 Jan 2013 16:45:03 +0200 Subject: [PATCH 17/22] KVM: MMU: drop superfluous min() call. 
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8028ac65db18..42ba85c62fcb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3854,7 +3854,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); + r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; From eb3fce87ccc5d38b1ad340f32e34abc09911fb83 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 30 Jan 2013 16:45:04 +0200 Subject: [PATCH 18/22] KVM: MMU: drop superfluous is_present_gpte() check. The guest page walker puts only present ptes into the ptes[] array. No need to check it again. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ca69dcccbe31..34c5c99323f4 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -409,9 +409,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, unsigned direct_access, access = gw->pt_access; int top_level, emulate = 0; - if (!is_present_gpte(gw->ptes[gw->level - 1])) - return 0; - direct_access = gw->pte_access; top_level = vcpu->arch.mmu.root_level; From 834be0d83f9451573e6fadb381fe0714211c7e90 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 30 Jan 2013 16:45:05 +0200 Subject: [PATCH 19/22] Revert "KVM: MMU: split kvm_mmu_free_page" This reverts commit bd4c86eaa6ff10abc4e00d0f45d2a28b10b09df4. There is no user for kvm_mmu_isolate_page() any more. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 42ba85c62fcb..0242a8a1b2e2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1461,28 +1461,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -/* - * Remove the sp from shadow page cache, after call it, - * we can not find this sp from the cache, and the shadow - * page table is still valid. - * It should be under the protection of mmu lock. - */ -static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); - if (!sp->role.direct) - free_page((unsigned long)sp->gfns); -} - -/* - * Free the shadow page table and the sp, we can do it - * out of the protection of mmu lock.
- */ -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) -{ list_del(&sp->link); free_page((unsigned long)sp->spt); + if (!sp->role.direct) + free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); } @@ -2126,7 +2112,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_isolate_page(sp); kvm_mmu_free_page(sp); } while (!list_empty(invalid_list)); } From 4293b5e5a68074431cafa74d549c1327ba1d0deb Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Thu, 31 Jan 2013 12:06:08 -0800 Subject: [PATCH 20/22] KVM: Remove duplicate text in api.txt Signed-off-by: Geoff Levand Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0e03b1968272..c2534c300a45 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -219,19 +219,6 @@ allocation of vcpu ids. For example, if userspace wants single-threaded guest vcpus, it should make all vcpu ids be a multiple of the number of vcpus per vcore. -On powerpc using book3s_hv mode, the vcpus are mapped onto virtual -threads in one or more virtual CPU cores. (This is because the -hardware requires all the hardware threads in a CPU core to be in the -same partition.) The KVM_CAP_PPC_SMT capability indicates the number -of vcpus per virtual core (vcore). The vcore id is obtained by -dividing the vcpu id by the number of vcpus per vcore. The vcpus in a -given vcore will always be in the same physical core as each other -(though that might be a different physical core from time to time). -Userspace can control the threading (SMT) mode of the guest by its -allocation of vcpu ids. For example, if userspace wants -single-threaded guest vcpus, it should make all vcpu ids be a multiple -of the number of vcpus per vcore. - For virtual cpus that have been created with S390 user controlled virtual machines, the resulting vcpu fd can be memory mapped at page offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual From c08800a56cb8622bb61577abb4a120c6fdc4b9be Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Mon, 4 Feb 2013 11:50:43 +0800 Subject: [PATCH 21/22] KVM: VMX: disable SMEP feature when guest is in non-paging mode SMEP is disabled if CPU is in non-paging mode in hardware. However KVM always uses paging mode to emulate guest non-paging mode with TDP. To emulate this behavior, SMEP needs to be manually disabled when guest switches to non-paging mode. We met an issue that, SMP Linux guest with recent kernel (enable SMEP support, for example, 3.5.3) would crash with triple fault if setting unrestricted_guest=0. This is because KVM uses an identity mapping page table to emulate the non-paging mode, where the page table is set with USER flag. If SMEP is still enabled in this case, guest will meet unhandlable page fault and then crash. 
Reviewed-by: Gleb Natapov Reviewed-by: Paolo Bonzini Signed-off-by: Dongxiao Xu Signed-off-by: Xiantao Zhang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0cf74a641dec..fe9a9cfadbd6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3227,6 +3227,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; + /* + * SMEP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode to + * emulate guest non-paging mode with TDP. + * To emulate this behavior, SMEP needs to be manually + * disabled when guest switches to non-paging mode. + */ + hw_cr4 &= ~X86_CR4_SMEP; } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } From b0da5bec30eca7ffbb2c89afa6fe503fd418d3a6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 3 Feb 2013 18:17:17 +0200 Subject: [PATCH 22/22] KVM: VMX: add missing exit names to VMX_EXIT_REASONS array Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 694586ca6456..5c9dbadd364a 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -105,7 +105,12 @@ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ - { EXIT_REASON_WBINVD, "WBINVD" } + { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ + { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_INVD, "INVD" }, \ + { EXIT_REASON_INVPCID, "INVPCID" } #ifdef __KERNEL__