From 9e9c3fe40bcd28e3f98f0ad8408435f4503f2781 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 12 Apr 2015 21:47:15 +0300 Subject: [PATCH 01/29] KVM: x86: Fix MSR_IA32_BNDCFGS in msrs_to_save kvm_init_msr_list is currently called before hardware_setup. As a result, vmx_mpx_supported always returns false when kvm_init_msr_list checks whether to save MSR_IA32_BNDCFGS. Move kvm_init_msr_list after vmx_hardware_setup is called to fix this issue. Signed-off-by: Nadav Amit Message-Id: <1428864435-4732-1-git-send-email-namit@cs.technion.ac.il> Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1a81267f3f6..ed31c31b2485 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5799,7 +5799,6 @@ int kvm_arch_init(void *opaque) kvm_set_mmio_spte_mask(); kvm_x86_ops = ops; - kvm_init_msr_list(); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -7253,7 +7252,14 @@ void kvm_arch_hardware_disable(void) int kvm_arch_hardware_setup(void) { - return kvm_x86_ops->hardware_setup(); + int r; + + r = kvm_x86_ops->hardware_setup(); + if (r != 0) + return r; + + kvm_init_msr_list(); + return 0; } void kvm_arch_hardware_unsetup(void) From bea15428b9d6bc36e87288f168ab314619a66757 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Apr 2015 15:40:02 +0200 Subject: [PATCH 02/29] KVM: x86: cleanup kvm_irq_delivery_to_apic_fast Sparse is reporting a "we previously assumed 'src' could be null" error. This is true as far as the static analyzer can see, but in practice only IPIs can set shorthand to self and they also set 'src', so it's ok. Still, move the initialization of x2apic_ipi (and thus the NULL check for src right before the first use. While at it, initializing ret to "false" is somewhat confusing because of the almost immediate assigned of "true" to the same variable. Thus, initialize it to "true" and modify it in the only path that used to use the value from "bool ret = false". There is no change in generated code from this change. Reported-by: Dan Carpenter Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d67206a7b99a..629af0f1c5c4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, unsigned long bitmap = 1; struct kvm_lapic **dst; int i; - bool ret = false; - bool x2apic_ipi = src && apic_x2apic_mode(src); + bool ret, x2apic_ipi; *r = -1; @@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, if (irq->shorthand) return false; + x2apic_ipi = src && apic_x2apic_mode(src); if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) return false; + ret = true; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) + if (!map) { + ret = false; goto out; - - ret = true; + } if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) From 95fce4fa0850da8f85ecf8d069ab3fc6c8bc1478 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 15 Apr 2015 12:27:04 +0200 Subject: [PATCH 03/29] KVM: s390: disable RRBM again commit b273921356df ("KVM: s390: enable more features that need no hypervisor changes") also enabled RRBM. Turns out that this instruction does need some KVM code, so lets disable that bit again. Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Fixes: b273921356df ("KVM: s390: enable more features that need no hypervisor changes") Message-Id: <1429093624-49611-2-git-send-email-borntraeger@de.ibm.com> Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index afa2bd750ffc..8cd8e7b288c5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -110,7 +110,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { /* upper facilities limit for kvm */ unsigned long kvm_s390_fac_list_mask[] = { 0xffe6fffbfcfdfc40UL, - 0x205c800000000000UL, + 0x005c800000000000UL, }; unsigned long kvm_s390_fac_list_mask_size(void) From 130005231c9f2090b1b177e2cca9841b562c1784 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 15 Apr 2015 10:24:54 +0800 Subject: [PATCH 04/29] kvm: mmu: don't do memslot overflow check As Andres pointed out: | I don't understand the value of this check here. Are we looking for a | broken memslot? Shouldn't this be a BUG_ON? Is this the place to care | about these things? npages is capped to KVM_MEM_MAX_NR_PAGES, i.e. | 2^31. A 64 bit overflow would be caused by a gigantic gfn_start which | would be trouble in many other ways. This patch drops the memslot overflow check to make the codes more simple. Reviewed-by: Andres Lagar-Cavilla Signed-off-by: Wanpeng Li Message-Id: <1429064694-3072-1-git-send-email-wanpeng.li@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 146f295ee322..07bb22157338 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4504,19 +4504,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, bool flush = false; unsigned long *rmapp; unsigned long last_index, index; - gfn_t gfn_start, gfn_end; spin_lock(&kvm->mmu_lock); - gfn_start = memslot->base_gfn; - gfn_end = memslot->base_gfn + memslot->npages - 1; - - if (gfn_start >= gfn_end) - goto out; - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(gfn_end, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); + last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, + memslot->base_gfn, PT_PAGE_TABLE_LEVEL); for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) @@ -4534,7 +4527,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (flush) kvm_flush_remote_tlbs(kvm); -out: spin_unlock(&kvm->mmu_lock); } From decf63336e356423300b935afbebeca1fcb15184 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 14 Apr 2015 12:04:10 +0800 Subject: [PATCH 05/29] KVM: MMU: fix comment in kvm_mmu_zap_collapsible_spte Soft mmu uses direct shadow page to fill guest large mapping with small pages if huge mapping is disallowed on host. So zapping direct shadow page works well both for soft mmu and hard mmu, it's just less widely applicable. Fix the comment to reflect this. Signed-off-by: Xiao Guangrong Message-Id: <552C91BA.1010703@linux.intel.com> [Fix comment wording further. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07bb22157338..d43867c33bc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4481,9 +4481,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn = spte_to_pfn(*sptep); /* - * Only EPT supported for now; otherwise, one would need to - * find out efficiently whether the guest page tables are - * also using huge pages. + * We cannot do huge page mapping for indirect shadow pages, + * which are found on the last rmap (level = 1) when not using + * tdp; such shadow pages are synced with the page table in + * the guest, and the guest page table is using 4K page size + * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && From ae75116efdc29bb42f1d99f8c51b5c52965b2413 Mon Sep 17 00:00:00 2001 From: "Suresh E. Warrier" Date: Wed, 25 Feb 2015 17:23:53 -0600 Subject: [PATCH 06/29] powerpc: Export __spin_yield Export __spin_yield so that the arch_spin_unlock() function can be invoked from a module. This will be required for modules where we want to take a lock that is also is acquired in hypervisor real mode. Because we want to avoid running any lockdep code (which may not be safe in real mode), this lock needs to be an arch_spinlock_t instead of a normal spinlock. Signed-off-by: Suresh Warrier Acked-by: Paul Mackerras Acked-by: Michael Ellerman Signed-off-by: Alexander Graf --- arch/powerpc/lib/locks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 170a0346f756..f7deebdf3365 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -41,6 +41,7 @@ void __spin_yield(arch_spinlock_t *lock) plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(holder_cpu), yield_count); } +EXPORT_SYMBOL_GPL(__spin_yield); /* * Waiting for a read lock or a write lock on a rwlock... From 99342cf8044420eebdf9297ca03a14cb6a7085a1 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 5 Feb 2015 11:53:25 +1100 Subject: [PATCH 07/29] kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM On POWER, storage caching is usually configured via the MMU - attributes such as cache-inhibited are stored in the TLB and the hashed page table. This makes correctly performing cache inhibited IO accesses awkward when the MMU is turned off (real mode). Some CPU models provide special registers to control the cache attributes of real mode load and stores but this is not at all consistent. This is a problem in particular for SLOF, the firmware used on KVM guests, which runs entirely in real mode, but which needs to do IO to load the kernel. To simplify this qemu implements two special hypercalls, H_LOGICAL_CI_LOAD and H_LOGICAL_CI_STORE which simulate a cache-inhibited load or store to a logical address (aka guest physical address). SLOF uses these for IO. However, because these are implemented within qemu, not the host kernel, these bypass any IO devices emulated within KVM itself. The simplest way to see this problem is to attempt to boot a KVM guest from a virtio-blk device with iothread / dataplane enabled. The iothread code relies on an in kernel implementation of the virtio queue notification, which is not triggered by the IO hcalls, and so the guest will stall in SLOF unable to load the guest OS. This patch addresses this by providing in-kernel implementations of the 2 hypercalls, which correctly scan the KVM IO bus. Any access to an address not handled by the KVM IO bus will cause a VM exit, hitting the qemu implementation as before. Note that a userspace change is also required, in order to enable these new hcall implementations with KVM_CAP_PPC_ENABLE_HCALL. Signed-off-by: David Gibson [agraf: fix compilation] Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s.h | 3 ++ arch/powerpc/kvm/book3s.c | 76 +++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 12 +++++ arch/powerpc/kvm/book3s_pr_papr.c | 28 ++++++++++ 4 files changed, 119 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 942c7b1678e3..578e550f937b 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -292,6 +292,9 @@ static inline bool kvmppc_supports_magic_page(struct kvm_vcpu *vcpu) return !is_kvmppc_hv_enabled(vcpu->kvm); } +extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu); +extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); + /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index cfbcdc654201..453a8a47a467 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -821,6 +821,82 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) #endif } +int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) +{ + unsigned long size = kvmppc_get_gpr(vcpu, 4); + unsigned long addr = kvmppc_get_gpr(vcpu, 5); + u64 buf; + int ret; + + if (!is_power_of_2(size) || (size > sizeof(buf))) + return H_TOO_HARD; + + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf); + if (ret != 0) + return H_TOO_HARD; + + switch (size) { + case 1: + kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf); + break; + + case 2: + kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf)); + break; + + case 4: + kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf)); + break; + + case 8: + kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf)); + break; + + default: + BUG(); + } + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load); + +int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) +{ + unsigned long size = kvmppc_get_gpr(vcpu, 4); + unsigned long addr = kvmppc_get_gpr(vcpu, 5); + unsigned long val = kvmppc_get_gpr(vcpu, 6); + u64 buf; + int ret; + + switch (size) { + case 1: + *(u8 *)&buf = val; + break; + + case 2: + *(__be16 *)&buf = cpu_to_be16(val); + break; + + case 4: + *(__be32 *)&buf = cpu_to_be32(val); + break; + + case 8: + *(__be64 *)&buf = cpu_to_be64(val); + break; + + default: + return H_TOO_HARD; + } + + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf); + if (ret != 0) + return H_TOO_HARD; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store); + int kvmppc_core_check_processor_compat(void) { /* diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index de747563d29d..b9c11a3abcb2 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -706,6 +706,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) /* Send the error out to userspace via KVM_RUN */ return rc; + case H_LOGICAL_CI_LOAD: + ret = kvmppc_h_logical_ci_load(vcpu); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_LOGICAL_CI_STORE: + ret = kvmppc_h_logical_ci_store(vcpu); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; case H_SET_MODE: ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), @@ -740,6 +750,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd) case H_CONFER: case H_REGISTER_VPA: case H_SET_MODE: + case H_LOGICAL_CI_LOAD: + case H_LOGICAL_CI_STORE: #ifdef CONFIG_KVM_XICS case H_XIRR: case H_CPPR: diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index ce3c893d509b..f2c75a1e0536 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -258,6 +258,28 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu) +{ + long rc; + + rc = kvmppc_h_logical_ci_load(vcpu); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) +{ + long rc; + + rc = kvmppc_h_logical_ci_store(vcpu); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { long rc = kvmppc_xics_hcall(vcpu, cmd); @@ -290,6 +312,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; return EMULATE_DONE; + case H_LOGICAL_CI_LOAD: + return kvmppc_h_pr_logical_ci_load(vcpu); + case H_LOGICAL_CI_STORE: + return kvmppc_h_pr_logical_ci_store(vcpu); case H_XIRR: case H_CPPR: case H_EOI: @@ -323,6 +349,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd) case H_BULK_REMOVE: case H_PUT_TCE: case H_CEDE: + case H_LOGICAL_CI_LOAD: + case H_LOGICAL_CI_STORE: #ifdef CONFIG_KVM_XICS case H_XIRR: case H_CPPR: From e928e9cb3601ce240189bfea05b67ebd391c85ae Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 20 Mar 2015 20:39:41 +1100 Subject: [PATCH 08/29] KVM: PPC: Book3S HV: Add fast real-mode H_RANDOM implementation. Some PowerNV systems include a hardware random-number generator. This HWRNG is present on POWER7+ and POWER8 chips and is capable of generating one 64-bit random number every microsecond. The random numbers are produced by sampling a set of 64 unstable high-frequency oscillators and are almost completely entropic. PAPR defines an H_RANDOM hypercall which guests can use to obtain one 64-bit random sample from the HWRNG. This adds a real-mode implementation of the H_RANDOM hypercall. This hypercall was implemented in real mode because the latency of reading the HWRNG is generally small compared to the latency of a guest exit and entry for all the threads in the same virtual core. Userspace can detect the presence of the HWRNG and the H_RANDOM implementation by querying the KVM_CAP_PPC_HWRNG capability. The H_RANDOM hypercall implementation will only be invoked when the guest does an H_RANDOM hypercall if userspace first enables the in-kernel H_RANDOM implementation using the KVM_CAP_PPC_ENABLE_HCALL capability. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 17 ++++ arch/powerpc/include/asm/archrandom.h | 11 ++- arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/kvm/book3s_hv_builtin.c | 15 ++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 115 ++++++++++++++++++++++++ arch/powerpc/kvm/powerpc.c | 3 + arch/powerpc/platforms/powernv/rng.c | 29 ++++++ include/uapi/linux/kvm.h | 1 + 8 files changed, 191 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index bc9f6fe44e27..9fa2bf8c3f6f 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3573,3 +3573,20 @@ struct { @ar - access register number KVM handlers should exit to userspace with rc = -EREMOTE. + + +8. Other capabilities. +---------------------- + +This section lists capabilities that give information about other +features of the KVM implementation. + +8.1 KVM_CAP_PPC_HWRNG + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel has an implementation of the +H_RANDOM hypercall backed by a hardware random-number generator. +If present, the kernel H_RANDOM handler can be enabled for guest use +with the KVM_CAP_PPC_ENABLE_HCALL capability. diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index bde531103638..0cc6eedc4780 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -30,8 +30,6 @@ static inline int arch_has_random(void) return !!ppc_md.get_random_long; } -int powernv_get_random_long(unsigned long *v); - static inline int arch_get_random_seed_long(unsigned long *v) { return 0; @@ -47,4 +45,13 @@ static inline int arch_has_random_seed(void) #endif /* CONFIG_ARCH_RANDOM */ +#ifdef CONFIG_PPC_POWERNV +int powernv_hwrng_present(void); +int powernv_get_random_long(unsigned long *v); +int powernv_get_random_real_mode(unsigned long *v); +#else +static inline int powernv_hwrng_present(void) { return 0; } +static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; } +#endif + #endif /* _ASM_POWERPC_ARCHRANDOM_H */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 46bf652c9169..b8475daad884 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -302,6 +302,8 @@ static inline bool is_kvmppc_hv_enabled(struct kvm *kvm) return kvm->arch.kvm_ops == kvmppc_hv_ops; } +extern int kvmppc_hwrng_present(void); + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 1f083ff8a61a..1954a1c4b1f9 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -21,6 +21,7 @@ #include #include #include +#include #define KVM_CMA_CHUNK_ORDER 18 @@ -169,3 +170,17 @@ int kvmppc_hcall_impl_hv_realmode(unsigned long cmd) return 0; } EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); + +int kvmppc_hwrng_present(void) +{ + return powernv_hwrng_present(); +} +EXPORT_SYMBOL_GPL(kvmppc_hwrng_present); + +long kvmppc_h_random(struct kvm_vcpu *vcpu) +{ + if (powernv_get_random_real_mode(&vcpu->arch.gpr[4])) + return H_SUCCESS; + + return H_HARDWARE; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6cbf1630cb70..0814ca122fcd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1839,6 +1839,121 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table + .long 0 /* 0x138 */ + .long 0 /* 0x13c */ + .long 0 /* 0x140 */ + .long 0 /* 0x144 */ + .long 0 /* 0x148 */ + .long 0 /* 0x14c */ + .long 0 /* 0x150 */ + .long 0 /* 0x154 */ + .long 0 /* 0x158 */ + .long 0 /* 0x15c */ + .long 0 /* 0x160 */ + .long 0 /* 0x164 */ + .long 0 /* 0x168 */ + .long 0 /* 0x16c */ + .long 0 /* 0x170 */ + .long 0 /* 0x174 */ + .long 0 /* 0x178 */ + .long 0 /* 0x17c */ + .long 0 /* 0x180 */ + .long 0 /* 0x184 */ + .long 0 /* 0x188 */ + .long 0 /* 0x18c */ + .long 0 /* 0x190 */ + .long 0 /* 0x194 */ + .long 0 /* 0x198 */ + .long 0 /* 0x19c */ + .long 0 /* 0x1a0 */ + .long 0 /* 0x1a4 */ + .long 0 /* 0x1a8 */ + .long 0 /* 0x1ac */ + .long 0 /* 0x1b0 */ + .long 0 /* 0x1b4 */ + .long 0 /* 0x1b8 */ + .long 0 /* 0x1bc */ + .long 0 /* 0x1c0 */ + .long 0 /* 0x1c4 */ + .long 0 /* 0x1c8 */ + .long 0 /* 0x1cc */ + .long 0 /* 0x1d0 */ + .long 0 /* 0x1d4 */ + .long 0 /* 0x1d8 */ + .long 0 /* 0x1dc */ + .long 0 /* 0x1e0 */ + .long 0 /* 0x1e4 */ + .long 0 /* 0x1e8 */ + .long 0 /* 0x1ec */ + .long 0 /* 0x1f0 */ + .long 0 /* 0x1f4 */ + .long 0 /* 0x1f8 */ + .long 0 /* 0x1fc */ + .long 0 /* 0x200 */ + .long 0 /* 0x204 */ + .long 0 /* 0x208 */ + .long 0 /* 0x20c */ + .long 0 /* 0x210 */ + .long 0 /* 0x214 */ + .long 0 /* 0x218 */ + .long 0 /* 0x21c */ + .long 0 /* 0x220 */ + .long 0 /* 0x224 */ + .long 0 /* 0x228 */ + .long 0 /* 0x22c */ + .long 0 /* 0x230 */ + .long 0 /* 0x234 */ + .long 0 /* 0x238 */ + .long 0 /* 0x23c */ + .long 0 /* 0x240 */ + .long 0 /* 0x244 */ + .long 0 /* 0x248 */ + .long 0 /* 0x24c */ + .long 0 /* 0x250 */ + .long 0 /* 0x254 */ + .long 0 /* 0x258 */ + .long 0 /* 0x25c */ + .long 0 /* 0x260 */ + .long 0 /* 0x264 */ + .long 0 /* 0x268 */ + .long 0 /* 0x26c */ + .long 0 /* 0x270 */ + .long 0 /* 0x274 */ + .long 0 /* 0x278 */ + .long 0 /* 0x27c */ + .long 0 /* 0x280 */ + .long 0 /* 0x284 */ + .long 0 /* 0x288 */ + .long 0 /* 0x28c */ + .long 0 /* 0x290 */ + .long 0 /* 0x294 */ + .long 0 /* 0x298 */ + .long 0 /* 0x29c */ + .long 0 /* 0x2a0 */ + .long 0 /* 0x2a4 */ + .long 0 /* 0x2a8 */ + .long 0 /* 0x2ac */ + .long 0 /* 0x2b0 */ + .long 0 /* 0x2b4 */ + .long 0 /* 0x2b8 */ + .long 0 /* 0x2bc */ + .long 0 /* 0x2c0 */ + .long 0 /* 0x2c4 */ + .long 0 /* 0x2c8 */ + .long 0 /* 0x2cc */ + .long 0 /* 0x2d0 */ + .long 0 /* 0x2d4 */ + .long 0 /* 0x2d8 */ + .long 0 /* 0x2dc */ + .long 0 /* 0x2e0 */ + .long 0 /* 0x2e4 */ + .long 0 /* 0x2e8 */ + .long 0 /* 0x2ec */ + .long 0 /* 0x2f0 */ + .long 0 /* 0x2f4 */ + .long 0 /* 0x2f8 */ + .long 0 /* 0x2fc */ + .long DOTSYM(kvmppc_h_random) - hcall_real_table .globl hcall_real_table_end hcall_real_table_end: diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 24bfe401373e..55a4763d6d11 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -529,6 +529,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_RMA: r = 0; break; + case KVM_CAP_PPC_HWRNG: + r = kvmppc_hwrng_present(); + break; #endif case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 80db43944afe..6eb808ff637e 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -24,12 +24,22 @@ struct powernv_rng { void __iomem *regs; + void __iomem *regs_real; unsigned long mask; }; static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); +int powernv_hwrng_present(void) +{ + struct powernv_rng *rng; + + rng = get_cpu_var(powernv_rng); + put_cpu_var(rng); + return rng != NULL; +} + static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) { unsigned long parity; @@ -46,6 +56,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) return val; } +int powernv_get_random_real_mode(unsigned long *v) +{ + struct powernv_rng *rng; + + rng = raw_cpu_read(powernv_rng); + + *v = rng_whiten(rng, in_rm64(rng->regs_real)); + + return 1; +} + int powernv_get_random_long(unsigned long *v) { struct powernv_rng *rng; @@ -80,12 +101,20 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng, static __init int rng_create(struct device_node *dn) { struct powernv_rng *rng; + struct resource res; unsigned long val; rng = kzalloc(sizeof(*rng), GFP_KERNEL); if (!rng) return -ENOMEM; + if (of_address_to_resource(dn, 0, &res)) { + kfree(rng); + return -ENXIO; + } + + rng->regs_real = (void __iomem *)res.start; + rng->regs = of_iomap(dn, 0); if (!rng->regs) { kfree(rng); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f574d7be7631..4b60056776d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -813,6 +813,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MIPS_MSA 112 #define KVM_CAP_S390_INJECT_IRQ 113 #define KVM_CAP_S390_IRQ_STATE 114 +#define KVM_CAP_PPC_HWRNG 115 #ifdef KVM_CAP_IRQ_ROUTING From 31037ecad275e9ad9bc671c34f72b495cf708ca3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 20 Mar 2015 20:39:42 +1100 Subject: [PATCH 09/29] KVM: PPC: Book3S HV: Remove RMA-related variables from code We don't support real-mode areas now that 970 support is removed. Remove the remaining details of rma from the code. Also rename rma_setup_done to hpte_setup_done to better reflect the changes. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 3 +-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 28 ++++++++++++++-------------- arch/powerpc/kvm/book3s_hv.c | 10 +++++----- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8ef05121d3cd..015773f5bb33 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -228,9 +228,8 @@ struct kvm_arch { int tlbie_lock; unsigned long lpcr; unsigned long rmor; - struct kvm_rma_info *rma; unsigned long vrma_slb_v; - int rma_setup_done; + int hpte_setup_done; u32 hpt_order; atomic_t vcpus_running; u32 online_vcores; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 534acb3c6c3d..dbf127168ca4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -116,12 +116,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) long order; mutex_lock(&kvm->lock); - if (kvm->arch.rma_setup_done) { - kvm->arch.rma_setup_done = 0; - /* order rma_setup_done vs. vcpus_running */ + if (kvm->arch.hpte_setup_done) { + kvm->arch.hpte_setup_done = 0; + /* order hpte_setup_done vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.rma_setup_done = 1; + kvm->arch.hpte_setup_done = 1; goto out; } } @@ -1339,20 +1339,20 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, unsigned long tmp[2]; ssize_t nb; long int err, ret; - int rma_setup; + int hpte_setup; if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; /* lock out vcpus from running while we're doing this */ mutex_lock(&kvm->lock); - rma_setup = kvm->arch.rma_setup_done; - if (rma_setup) { - kvm->arch.rma_setup_done = 0; /* temporarily */ - /* order rma_setup_done vs. vcpus_running */ + hpte_setup = kvm->arch.hpte_setup_done; + if (hpte_setup) { + kvm->arch.hpte_setup_done = 0; /* temporarily */ + /* order hpte_setup_done vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.rma_setup_done = 1; + kvm->arch.hpte_setup_done = 1; mutex_unlock(&kvm->lock); return -EBUSY; } @@ -1405,7 +1405,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, "r=%lx\n", ret, i, v, r); goto out; } - if (!rma_setup && is_vrma_hpte(v)) { + if (!hpte_setup && is_vrma_hpte(v)) { unsigned long psize = hpte_base_page_size(v, r); unsigned long senc = slb_pgsize_encoding(psize); unsigned long lpcr; @@ -1414,7 +1414,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, (VRMA_VSID << SLB_VSID_SHIFT_1T); lpcr = senc << (LPCR_VRMASD_SH - 4); kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - rma_setup = 1; + hpte_setup = 1; } ++i; hptp += 2; @@ -1430,9 +1430,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, } out: - /* Order HPTE updates vs. rma_setup_done */ + /* Order HPTE updates vs. hpte_setup_done */ smp_wmb(); - kvm->arch.rma_setup_done = rma_setup; + kvm->arch.hpte_setup_done = hpte_setup; mutex_unlock(&kvm->lock); if (err) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b9c11a3abcb2..dde14fd64d8e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2044,11 +2044,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) } atomic_inc(&vcpu->kvm->arch.vcpus_running); - /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ + /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ smp_mb(); /* On the first time here, set up HTAB and VRMA */ - if (!vcpu->kvm->arch.rma_setup_done) { + if (!vcpu->kvm->arch.hpte_setup_done) { r = kvmppc_hv_setup_htab_rma(vcpu); if (r) goto out; @@ -2250,7 +2250,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) int srcu_idx; mutex_lock(&kvm->lock); - if (kvm->arch.rma_setup_done) + if (kvm->arch.hpte_setup_done) goto out; /* another vcpu beat us to it */ /* Allocate hashed page table (if not done already) and reset it */ @@ -2301,9 +2301,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ + /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */ smp_wmb(); - kvm->arch.rma_setup_done = 1; + kvm->arch.hpte_setup_done = 1; err = 0; out_srcu: srcu_read_unlock(&kvm->srcu, srcu_idx); From a4bd6eb07ca72d21a7a34499ad34cfef6f527d4e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 20 Mar 2015 20:39:43 +1100 Subject: [PATCH 10/29] KVM: PPC: Book3S HV: Add helpers for lock/unlock hpte This adds helper routines for locking and unlocking HPTEs, and uses them in the rest of the code. We don't change any locking rules in this patch. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s_64.h | 14 +++++++++++++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 25 ++++++++++-------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 25 +++++++++--------------- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 2d81e202bdcc..0789a0f50969 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -85,6 +85,20 @@ static inline long try_lock_hpte(__be64 *hpte, unsigned long bits) return old == 0; } +static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) +{ + hpte_v &= ~HPTE_V_HVLOCK; + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hpte[0] = cpu_to_be64(hpte_v); +} + +/* Without barrier */ +static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) +{ + hpte_v &= ~HPTE_V_HVLOCK; + hpte[0] = cpu_to_be64(hpte_v); +} + static inline int __hpte_actual_psize(unsigned int lp, int psize) { int i, shift; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index dbf127168ca4..6c6825a7ae49 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -338,9 +338,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte; - /* Unlock the HPTE */ - asm volatile("lwsync" : : : "memory"); - hptep[0] = cpu_to_be64(v); + unlock_hpte(hptep, v); preempt_enable(); gpte->eaddr = eaddr; @@ -469,8 +467,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; hpte[1] = be64_to_cpu(hptep[1]); hpte[2] = r = rev->guest_rpte; - asm volatile("lwsync" : : : "memory"); - hptep[0] = cpu_to_be64(hpte[0]); + unlock_hpte(hptep, hpte[0]); preempt_enable(); if (hpte[0] != vcpu->arch.pgfault_hpte[0] || @@ -621,7 +618,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, hptep[1] = cpu_to_be64(r); eieio(); - hptep[0] = cpu_to_be64(hpte[0]); + __unlock_hpte(hptep, hpte[0]); asm volatile("ptesync" : : : "memory"); preempt_enable(); if (page && hpte_is_writable(r)) @@ -642,7 +639,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; out_unlock: - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); preempt_enable(); goto out_put; } @@ -771,7 +768,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, } } unlock_rmap(rmapp); - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } return 0; } @@ -857,7 +854,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, } ret = 1; } - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } while ((i = j) != head); unlock_rmap(rmapp); @@ -974,8 +971,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) /* Now check and modify the HPTE */ if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { - /* unlock and continue */ - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); continue; } @@ -996,9 +992,9 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) npages_dirty = n; eieio(); } - v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); + v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; - hptep[0] = cpu_to_be64(v); + __unlock_hpte(hptep, v); } while ((i = j) != head); unlock_rmap(rmapp); @@ -1218,8 +1214,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp, r &= ~HPTE_GR_MODIFIED; revp->guest_rpte = r; } - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + unlock_hpte(hptp, be64_to_cpu(hptp[0])); preempt_enable(); if (!(valid == want_valid && (first_pass || dirty))) ok = 0; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 625407e4d3b0..f6bf0b1de6d7 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -150,12 +150,6 @@ static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); } -static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) -{ - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hpte[0] = cpu_to_be64(hpte_v); -} - long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) @@ -271,10 +265,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, u64 pte; while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - pte = be64_to_cpu(*hpte); + pte = be64_to_cpu(hpte[0]); if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT))) break; - *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hpte, pte); hpte += 2; } if (i == 8) @@ -290,9 +284,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - pte = be64_to_cpu(*hpte); + pte = be64_to_cpu(hpte[0]); if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) { - *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hpte, pte); return H_PTEG_FULL; } } @@ -331,7 +325,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Write the first HPTE dword, unlocking the HPTE and making it valid */ eieio(); - hpte[0] = cpu_to_be64(pteh); + __unlock_hpte(hpte, pteh); asm volatile("ptesync" : : : "memory"); *pte_idx_ret = pte_index; @@ -412,7 +406,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) || ((flags & H_ANDCOND) && (pte & avpn) != 0)) { - hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hpte, pte); return H_NOT_FOUND; } @@ -548,7 +542,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) be64_to_cpu(hp[0]), be64_to_cpu(hp[1])); rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); args[j] |= rcbits << (56 - 5); - hp[0] = 0; + __unlock_hpte(hp, 0); } } @@ -574,7 +568,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, pte = be64_to_cpu(hpte[0]); if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) { - hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hpte, pte); return H_NOT_FOUND; } @@ -755,8 +749,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, /* Return with the HPTE still locked */ return (hash << 3) + (i >> 1); - /* Unlock and move on */ - hpte[i] = cpu_to_be64(v); + __unlock_hpte(&hpte[i], v); } if (val & HPTE_V_SECONDARY) From 878610fe9884a34a282cd4431237343864324d23 Mon Sep 17 00:00:00 2001 From: "Suresh E. Warrier" Date: Fri, 20 Mar 2015 20:39:45 +1100 Subject: [PATCH 11/29] KVM: PPC: Book3S HV: Add guest->host real mode completion counters Add counters to track number of times we switch from guest real mode to host virtual mode during an interrupt-related hyper call because the hypercall requires actions that cannot be completed in real mode. This will help when making optimizations that reduce guest-host transitions. It is safe to use an ordinary increment rather than an atomic operation because there is one ICP per virtual CPU and kvmppc_xics_rm_complete() only works on the ICP for the current VCPU. The counters are displayed as part of IPC and ICP state provided by /sys/debug/kernel/powerpc/kvm* for each VM. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_xics.c | 31 +++++++++++++++++++++++++++---- arch/powerpc/kvm/book3s_xics.h | 6 ++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index a4a8d9f0dcb7..60bdbac8beba 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -802,14 +802,22 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); - if (icp->rm_action & XICS_RM_KICK_VCPU) + if (icp->rm_action & XICS_RM_KICK_VCPU) { + icp->n_rm_kick_vcpu++; kvmppc_fast_vcpu_kick(icp->rm_kick_target); - if (icp->rm_action & XICS_RM_CHECK_RESEND) + } + if (icp->rm_action & XICS_RM_CHECK_RESEND) { + icp->n_rm_check_resend++; icp_check_resend(xics, icp->rm_resend_icp); - if (icp->rm_action & XICS_RM_REJECT) + } + if (icp->rm_action & XICS_RM_REJECT) { + icp->n_rm_reject++; icp_deliver_irq(xics, icp, icp->rm_reject); - if (icp->rm_action & XICS_RM_NOTIFY_EOI) + } + if (icp->rm_action & XICS_RM_NOTIFY_EOI) { + icp->n_rm_notify_eoi++; kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); + } icp->rm_action = 0; @@ -872,10 +880,17 @@ static int xics_debug_show(struct seq_file *m, void *private) struct kvm *kvm = xics->kvm; struct kvm_vcpu *vcpu; int icsid, i; + unsigned long t_rm_kick_vcpu, t_rm_check_resend; + unsigned long t_rm_reject, t_rm_notify_eoi; if (!kvm) return 0; + t_rm_kick_vcpu = 0; + t_rm_notify_eoi = 0; + t_rm_check_resend = 0; + t_rm_reject = 0; + seq_printf(m, "=========\nICP state\n=========\n"); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -890,8 +905,16 @@ static int xics_debug_show(struct seq_file *m, void *private) icp->server_num, state.xisr, state.pending_pri, state.cppr, state.mfrr, state.out_ee, state.need_resend); + t_rm_kick_vcpu += icp->n_rm_kick_vcpu; + t_rm_notify_eoi += icp->n_rm_notify_eoi; + t_rm_check_resend += icp->n_rm_check_resend; + t_rm_reject += icp->n_rm_reject; } + seq_puts(m, "ICP Guest Real Mode exit totals: "); + seq_printf(m, "\tkick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + t_rm_kick_vcpu, t_rm_check_resend, + t_rm_reject, t_rm_notify_eoi); for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { struct kvmppc_ics *ics = xics->ics[icsid]; diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 73f0f2723c07..de970eca2167 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -78,6 +78,12 @@ struct kvmppc_icp { u32 rm_reject; u32 rm_eoied_irq; + /* Counters for each reason we exited real mode */ + unsigned long n_rm_kick_vcpu; + unsigned long n_rm_check_resend; + unsigned long n_rm_reject; + unsigned long n_rm_notify_eoi; + /* Debug stuff for real mode */ union kvmppc_icp_state rm_dbgstate; struct kvm_vcpu *rm_dbgtgt; From 34cb7954c0aa7c8ad1591cb6cceae36432f55bb5 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 20 Mar 2015 20:39:46 +1100 Subject: [PATCH 12/29] KVM: PPC: Book3S HV: Convert ICS mutex lock to spin lock Replaces the ICS mutex lock with a spin lock since we will be porting these routines to real mode. Note that we need to disable interrupts before we take the lock in anticipation of the fact that on the guest side, we are running in the context of a hard irq and interrupts are disabled (EE bit off) when the lock is acquired. Again, because we will be acquiring the lock in hypervisor real mode, we need to use an arch_spinlock_t instead of a normal spinlock here as we want to avoid running any lockdep code (which may not be safe to execute in real mode). Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_xics.c | 68 +++++++++++++++++++++++----------- arch/powerpc/kvm/book3s_xics.h | 2 +- 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 60bdbac8beba..5f7beebd36a6 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,7 @@ * LOCKING * ======= * - * Each ICS has a mutex protecting the information about the IRQ + * Each ICS has a spin lock protecting the information about the IRQ * sources and avoiding simultaneous deliveries if the same interrupt. * * ICP operations are done via a single compare & swap transaction @@ -109,7 +110,10 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, { int i; - mutex_lock(&ics->lock); + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&ics->lock); for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; @@ -120,12 +124,15 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, XICS_DBG("resend %#x prio %#x\n", state->number, state->priority); - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); icp_deliver_irq(xics, icp, state->number); - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); } - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); } static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, @@ -133,8 +140,10 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, u32 server, u32 priority, u32 saved_priority) { bool deliver; + unsigned long flags; - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); state->server = server; state->priority = priority; @@ -145,7 +154,8 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, deliver = true; } - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); return deliver; } @@ -186,6 +196,7 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) struct kvmppc_ics *ics; struct ics_irq_state *state; u16 src; + unsigned long flags; if (!xics) return -ENODEV; @@ -195,10 +206,12 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) return -EINVAL; state = &ics->irq_state[src]; - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); *server = state->server; *priority = state->priority; - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); return 0; } @@ -365,6 +378,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, struct kvmppc_ics *ics; u32 reject; u16 src; + unsigned long flags; /* * This is used both for initial delivery of an interrupt and @@ -391,7 +405,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, state = &ics->irq_state[src]; /* Get a lock on the ICS */ - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); /* Get our server */ if (!icp || state->server != icp->server_num) { @@ -434,7 +449,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * * Note that if successful, the new delivery might have itself * rejected an interrupt that was "delivered" before we took the - * icp mutex. + * ics spin lock. * * In this case we do the whole sequence all over again for the * new guy. We cannot assume that the rejected interrupt is less @@ -448,7 +463,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * Delivery was successful, did we reject somebody else ? */ if (reject && reject != XICS_IPI) { - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); new_irq = reject; goto again; } @@ -468,12 +484,14 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); goto again; } } out: - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); } static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, @@ -880,6 +898,7 @@ static int xics_debug_show(struct seq_file *m, void *private) struct kvm *kvm = xics->kvm; struct kvm_vcpu *vcpu; int icsid, i; + unsigned long flags; unsigned long t_rm_kick_vcpu, t_rm_check_resend; unsigned long t_rm_reject, t_rm_notify_eoi; @@ -924,7 +943,8 @@ static int xics_debug_show(struct seq_file *m, void *private) seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", icsid); - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *irq = &ics->irq_state[i]; @@ -935,7 +955,8 @@ static int xics_debug_show(struct seq_file *m, void *private) irq->resend, irq->masked_pending); } - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); } return 0; } @@ -988,7 +1009,6 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, if (!ics) goto out; - mutex_init(&ics->lock); ics->icsid = icsid; for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { @@ -1130,13 +1150,15 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) u64 __user *ubufp = (u64 __user *) addr; u16 idx; u64 val, prio; + unsigned long flags; ics = kvmppc_xics_find_ics(xics, irq, &idx); if (!ics) return -ENOENT; irqp = &ics->irq_state[idx]; - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); ret = -ENOENT; if (irqp->exists) { val = irqp->server; @@ -1152,7 +1174,8 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) val |= KVM_XICS_PENDING; ret = 0; } - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); if (!ret && put_user(val, ubufp)) ret = -EFAULT; @@ -1169,6 +1192,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) u64 val; u8 prio; u32 server; + unsigned long flags; if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) return -ENOENT; @@ -1189,7 +1213,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) kvmppc_xics_find_server(xics->kvm, server) == NULL) return -EINVAL; - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); irqp->server = server; irqp->saved_priority = prio; if (val & KVM_XICS_MASKED) @@ -1201,7 +1226,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) irqp->asserted = 1; irqp->exists = 1; - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); if (val & KVM_XICS_PENDING) icp_deliver_irq(xics, NULL, irqp->number); diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index de970eca2167..055424c43249 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -90,7 +90,7 @@ struct kvmppc_icp { }; struct kvmppc_ics { - struct mutex lock; + arch_spinlock_t lock; u16 icsid; struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; }; From b0221556dbd3c31c47f37703f856aeeffc78abd3 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 20 Mar 2015 20:39:47 +1100 Subject: [PATCH 13/29] KVM: PPC: Book3S HV: Move virtual mode ICP functions to real-mode Interrupt-based hypercalls return H_TOO_HARD to inform KVM that it needs to switch to the host to complete the rest of hypercall function in virtual mode. This patch ports the virtual mode ICS/ICP reject and resend functions to be runnable in hypervisor real mode, thus avoiding the need to switch to the host to execute these functions in virtual mode. However, the hypercalls continue to return H_TOO_HARD for vcpu_wakeup and notify events - these events cannot be done in real mode and they will still need a switch to host virtual mode. There are sufficient differences between the real mode code and the virtual mode code for the ICS/ICP resend and reject functions that for now the code has been duplicated instead of sharing common code. In the future, we can look at creating common functions. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 225 +++++++++++++++++++++++++-- 1 file changed, 211 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 7c22997de906..73bbe9246512 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -23,12 +23,39 @@ #define DEBUG_PASSUP +static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq); + static inline void rm_writeb(unsigned long paddr, u8 val) { __asm__ __volatile__("sync; stbcix %0,0,%1" : : "r" (val), "r" (paddr) : "memory"); } +/* -- ICS routines -- */ +static void ics_rm_check_resend(struct kvmppc_xics *xics, + struct kvmppc_ics *ics, struct kvmppc_icp *icp) +{ + int i; + + arch_spin_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *state = &ics->irq_state[i]; + + if (!state->resend) + continue; + + arch_spin_unlock(&ics->lock); + icp_rm_deliver_irq(xics, icp, state->number); + arch_spin_lock(&ics->lock); + } + + arch_spin_unlock(&ics->lock); +} + +/* -- ICP routines -- */ + static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, struct kvm_vcpu *this_vcpu) { @@ -116,6 +143,178 @@ static inline int check_too_hard(struct kvmppc_xics *xics, return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS; } +static void icp_rm_check_resend(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + u32 icsid; + + /* Order this load with the test for need_resend in the caller */ + smp_rmb(); + for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!test_and_clear_bit(icsid, icp->resend_map)) + continue; + if (!ics) + continue; + ics_rm_check_resend(xics, ics, icp); + } +} + +static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, + u32 *reject) +{ + union kvmppc_icp_state old_state, new_state; + bool success; + + do { + old_state = new_state = READ_ONCE(icp->state); + + *reject = 0; + + /* See if we can deliver */ + success = new_state.cppr > priority && + new_state.mfrr > priority && + new_state.pending_pri > priority; + + /* + * If we can, check for a rejection and perform the + * delivery + */ + if (success) { + *reject = new_state.xisr; + new_state.xisr = irq; + new_state.pending_pri = priority; + } else { + /* + * If we failed to deliver we set need_resend + * so a subsequent CPPR state change causes us + * to try a new delivery. + */ + new_state.need_resend = true; + } + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + return success; +} + +static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u32 reject; + u16 src; + + /* + * This is used both for initial delivery of an interrupt and + * for subsequent rejection. + * + * Rejection can be racy vs. resends. We have evaluated the + * rejection in an atomic ICP transaction which is now complete, + * so potentially the ICP can already accept the interrupt again. + * + * So we need to retry the delivery. Essentially the reject path + * boils down to a failed delivery. Always. + * + * Now the interrupt could also have moved to a different target, + * thus we may need to re-do the ICP lookup as well + */ + + again: + /* Get the ICS state and lock it */ + ics = kvmppc_xics_find_ics(xics, new_irq, &src); + if (!ics) { + /* Unsafe increment, but this does not need to be accurate */ + return; + } + state = &ics->irq_state[src]; + + /* Get a lock on the ICS */ + arch_spin_lock(&ics->lock); + + /* Get our server */ + if (!icp || state->server != icp->server_num) { + icp = kvmppc_xics_find_server(xics->kvm, state->server); + if (!icp) { + /* Unsafe increment again*/ + goto out; + } + } + + /* Clear the resend bit of that interrupt */ + state->resend = 0; + + /* + * If masked, bail out + * + * Note: PAPR doesn't mention anything about masked pending + * when doing a resend, only when doing a delivery. + * + * However that would have the effect of losing a masked + * interrupt that was rejected and isn't consistent with + * the whole masked_pending business which is about not + * losing interrupts that occur while masked. + * + * I don't differentiate normal deliveries and resends, this + * implementation will differ from PAPR and not lose such + * interrupts. + */ + if (state->priority == MASKED) { + state->masked_pending = 1; + goto out; + } + + /* + * Try the delivery, this will set the need_resend flag + * in the ICP as part of the atomic transaction if the + * delivery is not possible. + * + * Note that if successful, the new delivery might have itself + * rejected an interrupt that was "delivered" before we took the + * ics spin lock. + * + * In this case we do the whole sequence all over again for the + * new guy. We cannot assume that the rejected interrupt is less + * favored than the new one, and thus doesn't need to be delivered, + * because by the time we exit icp_rm_try_to_deliver() the target + * processor may well have already consumed & completed it, and thus + * the rejected interrupt might actually be already acceptable. + */ + if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) { + /* + * Delivery was successful, did we reject somebody else ? + */ + if (reject && reject != XICS_IPI) { + arch_spin_unlock(&ics->lock); + new_irq = reject; + goto again; + } + } else { + /* + * We failed to deliver the interrupt we need to set the + * resend map bit and mark the ICS state as needing a resend + */ + set_bit(ics->icsid, icp->resend_map); + state->resend = 1; + + /* + * If the need_resend flag got cleared in the ICP some time + * between icp_rm_try_to_deliver() atomic update and now, then + * we know it might have missed the resend_map bit. So we + * retry + */ + smp_mb(); + if (!icp->state.need_resend) { + arch_spin_unlock(&ics->lock); + goto again; + } + } + out: + arch_spin_unlock(&ics->lock); +} + static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u8 new_cppr) { @@ -184,8 +383,7 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * separately here as well. */ if (resend) { - icp->rm_action |= XICS_RM_CHECK_RESEND; - icp->rm_resend_icp = icp; + icp_rm_check_resend(xics, icp); } } @@ -300,16 +498,14 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, } } while (!icp_rm_try_update(icp, old_state, new_state)); - /* Pass rejects to virtual mode */ + /* Handle reject in real mode */ if (reject && reject != XICS_IPI) { - this_icp->rm_action |= XICS_RM_REJECT; - this_icp->rm_reject = reject; + icp_rm_deliver_irq(xics, icp, reject); } - /* Pass resends to virtual mode */ + /* Handle resends in real mode */ if (resend) { - this_icp->rm_action |= XICS_RM_CHECK_RESEND; - this_icp->rm_resend_icp = icp; + icp_rm_check_resend(xics, icp); } return check_too_hard(xics, this_icp); @@ -365,10 +561,12 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) } while (!icp_rm_try_update(icp, old_state, new_state)); - /* Pass rejects to virtual mode */ + /* + * Check for rejects. They are handled by doing a new delivery + * attempt (see comments in icp_rm_deliver_irq). + */ if (reject && reject != XICS_IPI) { - icp->rm_action |= XICS_RM_REJECT; - icp->rm_reject = reject; + icp_rm_deliver_irq(xics, icp, reject); } bail: return check_too_hard(xics, icp); @@ -416,10 +614,9 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) goto bail; state = &ics->irq_state[src]; - /* Still asserted, resend it, we make it look like a reject */ + /* Still asserted, resend it */ if (state->asserted) { - icp->rm_action |= XICS_RM_REJECT; - icp->rm_reject = irq; + icp_rm_deliver_irq(xics, icp, irq); } if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { From 6e0365b782739eb41b03bcfd23abeefacbf0817a Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 20 Mar 2015 20:39:48 +1100 Subject: [PATCH 14/29] KVM: PPC: Book3S HV: Add ICP real mode counters Add two counters to count how often we generate real-mode ICS resend and reject events. The counters provide some performance statistics that could be used in the future to consider if the real mode functions need further optimizing. The counters are displayed as part of IPC and ICP state provided by /sys/debug/kernel/powerpc/kvm* for each VM. Also added two counters that count (approximately) how many times we don't find an ICP or ICS we're looking for. These are not currently exposed through sysfs, but can be useful when debugging crashes. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 7 +++++++ arch/powerpc/kvm/book3s_xics.c | 10 ++++++++-- arch/powerpc/kvm/book3s_xics.h | 5 +++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 73bbe9246512..6dded8c75234 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -227,6 +227,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, ics = kvmppc_xics_find_ics(xics, new_irq, &src); if (!ics) { /* Unsafe increment, but this does not need to be accurate */ + xics->err_noics++; return; } state = &ics->irq_state[src]; @@ -239,6 +240,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, icp = kvmppc_xics_find_server(xics->kvm, state->server); if (!icp) { /* Unsafe increment again*/ + xics->err_noicp++; goto out; } } @@ -383,6 +385,7 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * separately here as well. */ if (resend) { + icp->n_check_resend++; icp_rm_check_resend(xics, icp); } } @@ -500,11 +503,13 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject in real mode */ if (reject && reject != XICS_IPI) { + this_icp->n_reject++; icp_rm_deliver_irq(xics, icp, reject); } /* Handle resends in real mode */ if (resend) { + this_icp->n_check_resend++; icp_rm_check_resend(xics, icp); } @@ -566,6 +571,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * attempt (see comments in icp_rm_deliver_irq). */ if (reject && reject != XICS_IPI) { + icp->n_reject++; icp_rm_deliver_irq(xics, icp, reject); } bail: @@ -616,6 +622,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) /* Still asserted, resend it */ if (state->asserted) { + icp->n_reject++; icp_rm_deliver_irq(xics, icp, irq); } diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 5f7beebd36a6..8f3e6cc54d95 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -901,6 +901,7 @@ static int xics_debug_show(struct seq_file *m, void *private) unsigned long flags; unsigned long t_rm_kick_vcpu, t_rm_check_resend; unsigned long t_rm_reject, t_rm_notify_eoi; + unsigned long t_reject, t_check_resend; if (!kvm) return 0; @@ -909,6 +910,8 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_notify_eoi = 0; t_rm_check_resend = 0; t_rm_reject = 0; + t_check_resend = 0; + t_reject = 0; seq_printf(m, "=========\nICP state\n=========\n"); @@ -928,12 +931,15 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_notify_eoi += icp->n_rm_notify_eoi; t_rm_check_resend += icp->n_rm_check_resend; t_rm_reject += icp->n_rm_reject; + t_check_resend += icp->n_check_resend; + t_reject += icp->n_reject; } - seq_puts(m, "ICP Guest Real Mode exit totals: "); - seq_printf(m, "\tkick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", t_rm_kick_vcpu, t_rm_check_resend, t_rm_reject, t_rm_notify_eoi); + seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", + t_check_resend, t_reject); for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { struct kvmppc_ics *ics = xics->ics[icsid]; diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 055424c43249..56ea44f9867f 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -83,6 +83,9 @@ struct kvmppc_icp { unsigned long n_rm_check_resend; unsigned long n_rm_reject; unsigned long n_rm_notify_eoi; + /* Counters for handling ICP processing in real mode */ + unsigned long n_check_resend; + unsigned long n_reject; /* Debug stuff for real mode */ union kvmppc_icp_state rm_dbgstate; @@ -102,6 +105,8 @@ struct kvmppc_xics { u32 max_icsid; bool real_mode; bool real_mode_dbg; + u32 err_noics; + u32 err_noicp; struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; }; From e23a808b1681d398a983ebc51179efc51c4a1eaf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:01 +1100 Subject: [PATCH 15/29] KVM: PPC: Book3S HV: Create debugfs file for each guest's HPT This creates a debugfs directory for each HV guest (assuming debugfs is enabled in the kernel config), and within that directory, a file by which the contents of the guest's HPT (hashed page table) can be read. The directory is named vmnnnn, where nnnn is the PID of the process that created the guest. The file is named "htab". This is intended to help in debugging problems in the host's management of guest memory. The contents of the file consist of a series of lines like this: 3f48 4000d032bf003505 0000000bd7ff1196 00000003b5c71196 The first field is the index of the entry in the HPT, the second and third are the HPT entry, so the third entry contains the real page number that is mapped by the entry if the entry's valid bit is set. The fourth field is the guest's view of the second doubleword of the entry, so it contains the guest physical address. (The format of the second through fourth fields are described in the Power ISA and also in arch/powerpc/include/asm/mmu-hash64.h.) Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s_64.h | 2 + arch/powerpc/include/asm/kvm_host.h | 2 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 136 +++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 12 ++ virt/kvm/kvm_main.c | 1 + 5 files changed, 153 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 0789a0f50969..869c53fe02cd 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -436,6 +436,8 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) return rcu_dereference_raw_notrace(kvm->memslots); } +extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 015773f5bb33..f1d0bbc0f079 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -238,6 +238,8 @@ struct kvm_arch { atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; int hpt_cma_alloc; + struct dentry *debugfs_dir; + struct dentry *htab_dentry; #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE struct mutex hpt_mutex; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6c6825a7ae49..d6fe30835c58 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1490,6 +1491,141 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) return ret; } +struct debugfs_htab_state { + struct kvm *kvm; + struct mutex mutex; + unsigned long hpt_index; + int chars_left; + int buf_index; + char buf[64]; +}; + +static int debugfs_htab_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + struct debugfs_htab_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(kvm); + p->kvm = kvm; + mutex_init(&p->mutex); + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_htab_release(struct inode *inode, struct file *file) +{ + struct debugfs_htab_state *p = file->private_data; + + kvm_put_kvm(p->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_htab_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_htab_state *p = file->private_data; + ssize_t ret, r; + unsigned long i, n; + unsigned long v, hr, gr; + struct kvm *kvm; + __be64 *hptp; + + ret = mutex_lock_interruptible(&p->mutex); + if (ret) + return ret; + + if (p->chars_left) { + n = p->chars_left; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf + p->buf_index, n); + n -= r; + p->chars_left -= n; + p->buf_index += n; + buf += n; + len -= n; + ret = n; + if (r) { + if (!n) + ret = -EFAULT; + goto out; + } + } + + kvm = p->kvm; + i = p->hpt_index; + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { + if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) + continue; + + /* lock the HPTE so it's stable and read it */ + preempt_disable(); + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) + cpu_relax(); + v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; + hr = be64_to_cpu(hptp[1]); + gr = kvm->arch.revmap[i].guest_rpte; + unlock_hpte(hptp, v); + preempt_enable(); + + if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) + continue; + + n = scnprintf(p->buf, sizeof(p->buf), + "%6lx %.16lx %.16lx %.16lx\n", + i, v, hr, gr); + p->chars_left = n; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf, n); + n -= r; + p->chars_left -= n; + p->buf_index = n; + buf += n; + len -= n; + ret += n; + if (r) { + if (!ret) + ret = -EFAULT; + goto out; + } + } + p->hpt_index = i; + + out: + mutex_unlock(&p->mutex); + return ret; +} + +ssize_t debugfs_htab_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_htab_fops = { + .owner = THIS_MODULE, + .open = debugfs_htab_open, + .release = debugfs_htab_release, + .read = debugfs_htab_read, + .write = debugfs_htab_write, + .llseek = generic_file_llseek, +}; + +void kvmppc_mmu_debugfs_init(struct kvm *kvm) +{ + kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, + kvm->arch.debugfs_dir, kvm, + &debugfs_htab_fops); +} + void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) { struct kvmppc_mmu *mmu = &vcpu->arch.mmu; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index dde14fd64d8e..08f8617f4046 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -2319,6 +2320,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; + char buf[32]; /* Allocate the guest's logical partition ID */ @@ -2359,6 +2361,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) */ kvm_hv_vm_activated(); + /* + * Create a debugfs directory for the VM + */ + snprintf(buf, sizeof(buf), "vm%d", current->pid); + kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); + if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) + kvmppc_mmu_debugfs_init(kvm); + return 0; } @@ -2379,6 +2389,8 @@ static void kvmppc_free_vcores(struct kvm *kvm) static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { + debugfs_remove_recursive(kvm->arch.debugfs_dir); + kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d3fc9399062a..90977418aeb6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -89,6 +89,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_cache); static __read_mostly struct preempt_ops kvm_preempt_ops; struct dentry *kvm_debugfs_dir; +EXPORT_SYMBOL_GPL(kvm_debugfs_dir); static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); From b6c295df3131c6fa25f8f29625ee0609506150ad Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:02 +1100 Subject: [PATCH 16/29] KVM: PPC: Book3S HV: Accumulate timing information for real-mode code This reads the timebase at various points in the real-mode guest entry/exit code and uses that to accumulate total, minimum and maximum time spent in those parts of the code. Currently these times are accumulated per vcpu in 5 parts of the code: * rm_entry - time taken from the start of kvmppc_hv_entry() until just before entering the guest. * rm_intr - time from when we take a hypervisor interrupt in the guest until we either re-enter the guest or decide to exit to the host. This includes time spent handling hcalls in real mode. * rm_exit - time from when we decide to exit the guest until the return from kvmppc_hv_entry(). * guest - time spend in the guest * cede - time spent napping in real mode due to an H_CEDE hcall while other threads in the same vcore are active. These times are exposed in debugfs in a directory per vcpu that contains a file called "timings". This file contains one line for each of the 5 timings above, with the name followed by a colon and 4 numbers, which are the count (number of times the code has been executed), the total time, the minimum time, and the maximum time, all in nanoseconds. The overhead of the extra code amounts to about 30ns for an hcall that is handled in real mode (e.g. H_SET_DABR), which is about 25%. Since production environments may not wish to incur this overhead, the new code is conditional on a new config symbol, CONFIG_KVM_BOOK3S_HV_EXIT_TIMING. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 21 ++++ arch/powerpc/include/asm/time.h | 3 + arch/powerpc/kernel/asm-offsets.c | 13 ++ arch/powerpc/kernel/time.c | 6 + arch/powerpc/kvm/Kconfig | 14 +++ arch/powerpc/kvm/book3s_hv.c | 150 ++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 141 +++++++++++++++++++++- 7 files changed, 346 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index f1d0bbc0f079..d2068bba9059 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -369,6 +369,14 @@ struct kvmppc_slb { u8 base_page_size; /* MMU_PAGE_xxx */ }; +/* Struct used to accumulate timing information in HV real mode code */ +struct kvmhv_tb_accumulator { + u64 seqcount; /* used to synchronize access, also count * 2 */ + u64 tb_total; /* total time in timebase ticks */ + u64 tb_min; /* min time */ + u64 tb_max; /* max time */ +}; + # ifdef CONFIG_PPC_FSL_BOOK3E #define KVMPPC_BOOKE_IAC_NUM 2 #define KVMPPC_BOOKE_DAC_NUM 2 @@ -657,6 +665,19 @@ struct kvm_vcpu_arch { u32 emul_inst; #endif + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + struct kvmhv_tb_accumulator *cur_activity; /* What we're timing */ + u64 cur_tb_start; /* when it started */ + struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */ + struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */ + struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */ + struct kvmhv_tb_accumulator guest_time; /* guest execution */ + struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ + + struct dentry *debugfs_dir; + struct dentry *debugfs_timings; +#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ }; #define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET] diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 03cbada59d3a..10fc784a2ad4 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -211,5 +211,8 @@ extern void secondary_cpu_time_init(void); DECLARE_PER_CPU(u64, decrementers_next_tb); +/* Convert timebase ticks to nanoseconds */ +unsigned long long tb_to_ns(unsigned long long tb_ticks); + #endif /* __KERNEL__ */ #endif /* __POWERPC_TIME_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4717859fdd04..3fea721f0da5 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -458,6 +458,19 @@ int main(void) DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1)); DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); +#endif +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry)); + DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr)); + DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit)); + DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time)); + DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time)); + DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity)); + DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start)); + DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount)); + DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total)); + DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min)); + DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max)); #endif DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3)); DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2d7b33fab953..56f44848b044 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -608,6 +608,12 @@ void arch_suspend_enable_irqs(void) } #endif +unsigned long long tb_to_ns(unsigned long long ticks) +{ + return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift; +} +EXPORT_SYMBOL_GPL(tb_to_ns); + /* * Scheduler clock - returns current time in nanosec units. * diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 11850f310fb4..2963e4dd0b80 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -110,6 +110,20 @@ config KVM_BOOK3S_64_PR processor, including emulating 32-bit processors on a 64-bit host. +config KVM_BOOK3S_HV_EXIT_TIMING + bool "Detailed timing for hypervisor real-mode code" + depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS + ---help--- + Calculate time taken for each vcpu in the real-mode guest entry, + exit, and interrupt handling code, plus time spent in the guest + and in nap mode due to idle (cede) while other threads are still + in the guest. The total, minimum and maximum times in nanoseconds + together with the number of executions are reported in debugfs in + kvm/vm#/vcpu#/timings. The overhead is of the order of 30 - 40 + ns per exit on POWER8. + + If unsure, say N. + config KVM_BOOKE_HV bool diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 08f8617f4046..64a02d4c737c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1423,6 +1423,154 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) return vcore; } +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +static struct debugfs_timings_element { + const char *name; + size_t offset; +} timings[] = { + {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, + {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, + {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, + {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, + {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, +}; + +#define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) + +struct debugfs_timings_state { + struct kvm_vcpu *vcpu; + unsigned int buflen; + char buf[N_TIMINGS * 100]; +}; + +static int debugfs_timings_open(struct inode *inode, struct file *file) +{ + struct kvm_vcpu *vcpu = inode->i_private; + struct debugfs_timings_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(vcpu->kvm); + p->vcpu = vcpu; + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_timings_release(struct inode *inode, struct file *file) +{ + struct debugfs_timings_state *p = file->private_data; + + kvm_put_kvm(p->vcpu->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_timings_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_timings_state *p = file->private_data; + struct kvm_vcpu *vcpu = p->vcpu; + char *s, *buf_end; + struct kvmhv_tb_accumulator tb; + u64 count; + loff_t pos; + ssize_t n; + int i, loops; + bool ok; + + if (!p->buflen) { + s = p->buf; + buf_end = s + sizeof(p->buf); + for (i = 0; i < N_TIMINGS; ++i) { + struct kvmhv_tb_accumulator *acc; + + acc = (struct kvmhv_tb_accumulator *) + ((unsigned long)vcpu + timings[i].offset); + ok = false; + for (loops = 0; loops < 1000; ++loops) { + count = acc->seqcount; + if (!(count & 1)) { + smp_rmb(); + tb = *acc; + smp_rmb(); + if (count == acc->seqcount) { + ok = true; + break; + } + } + udelay(1); + } + if (!ok) + snprintf(s, buf_end - s, "%s: stuck\n", + timings[i].name); + else + snprintf(s, buf_end - s, + "%s: %llu %llu %llu %llu\n", + timings[i].name, count / 2, + tb_to_ns(tb.tb_total), + tb_to_ns(tb.tb_min), + tb_to_ns(tb.tb_max)); + s += strlen(s); + } + p->buflen = s - p->buf; + } + + pos = *ppos; + if (pos >= p->buflen) + return 0; + if (len > p->buflen - pos) + len = p->buflen - pos; + n = copy_to_user(buf, p->buf + pos, len); + if (n) { + if (n == len) + return -EFAULT; + len -= n; + } + *ppos = pos + len; + return len; +} + +static ssize_t debugfs_timings_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_timings_ops = { + .owner = THIS_MODULE, + .open = debugfs_timings_open, + .release = debugfs_timings_release, + .read = debugfs_timings_read, + .write = debugfs_timings_write, + .llseek = generic_file_llseek, +}; + +/* Create a debugfs directory for the vcpu */ +static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) +{ + char buf[16]; + struct kvm *kvm = vcpu->kvm; + + snprintf(buf, sizeof(buf), "vcpu%u", id); + if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) + return; + vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir); + if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir)) + return; + vcpu->arch.debugfs_timings = + debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, + vcpu, &debugfs_timings_ops); +} + +#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ +static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) +{ +} +#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ + static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, unsigned int id) { @@ -1492,6 +1640,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); + debugfs_vcpu_init(vcpu, id); + return vcpu; free_vcpu: diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 0814ca122fcd..b06fe53fd509 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -225,7 +225,13 @@ kvm_novcpu_wakeup: /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ ld r4, HSTATE_KVM_VCPU(r13) cmpdi r4, 0 - bne kvmppc_got_guest + beq kvmppc_primary_no_guest + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMENTRY + bl kvmhv_start_timing +#endif + b kvmppc_got_guest kvm_novcpu_exit: b hdec_soon @@ -376,6 +382,14 @@ kvmppc_hv_entry: li r6, KVM_GUEST_MODE_HOST_HV stb r6, HSTATE_IN_GUEST(r13) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Store initial timestamp */ + cmpdi r4, 0 + beq 1f + addi r3, r4, VCPU_TB_RMENTRY + bl kvmhv_start_timing +1: +#endif /* Clear out SLB */ li r6,0 slbmte r6,r6 @@ -880,6 +894,12 @@ fast_guest_return: li r9, KVM_GUEST_MODE_GUEST_HV stb r9, HSTATE_IN_GUEST(r13) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Accumulate timing */ + addi r3, r4, VCPU_TB_GUEST + bl kvmhv_accumulate_time +#endif + /* Enter guest */ BEGIN_FTR_SECTION @@ -917,6 +937,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) hrfid b . +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +secondary_too_late: + cmpdi r4, 0 + beq 11f + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +11: b kvmhv_switch_to_host + +hdec_soon: + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + beq 12f + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +12: b kvmhv_do_exit +#endif + /****************************************************************************** * * * Exit code * @@ -1002,6 +1039,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) stw r12,VCPU_TRAP(r9) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMINTR + mr r4, r9 + bl kvmhv_accumulate_time + ld r5, VCPU_GPR(R5)(r9) + ld r6, VCPU_GPR(R6)(r9) + ld r7, VCPU_GPR(R7)(r9) + ld r8, VCPU_GPR(R8)(r9) +#endif + /* Save HEIR (HV emulation assist reg) in emul_inst if this is an HEI (HV emulation interrupt, e40) */ li r3,KVM_INST_FETCH_FAILED @@ -1073,6 +1120,11 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK beq machine_check_realmode mc_cont: +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMEXIT + mr r4, r9 + bl kvmhv_accumulate_time +#endif /* Save guest CTRL register, set runlatch to 1 */ 6: mfspr r6,SPRN_CTRLF @@ -1417,7 +1469,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbia ptesync -hdec_soon: /* r12 = trap, r13 = paca */ +#ifndef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +hdec_soon: +#endif +kvmhv_do_exit: /* r12 = trap, r13 = paca */ /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do @@ -1476,7 +1531,10 @@ hdec_soon: /* r12 = trap, r13 = paca */ addi r6,r6,PACA_SIZE bne 42b +#ifndef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING secondary_too_late: +#endif +kvmhv_switch_to_host: /* Secondary threads wait for primary to do partition switch */ 43: ld r5,HSTATE_KVM_VCORE(r13) ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ @@ -1562,6 +1620,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 1: addi r8,r8,16 .endr +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Finish timing, if we have a vcpu */ + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + li r3, 0 + beq 2f + bl kvmhv_accumulate_time +2: +#endif /* Unset guest mode */ li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) @@ -2069,6 +2136,12 @@ _GLOBAL(kvmppc_h_cede) /* save FP state */ bl kvmppc_save_fp +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + ld r4, HSTATE_KVM_VCPU(r13) + addi r3, r4, VCPU_TB_CEDE + bl kvmhv_accumulate_time +#endif + /* * Take a nap until a decrementer or external or doobell interrupt * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the @@ -2109,6 +2182,11 @@ kvm_end_cede: /* Woken by external or decrementer interrupt */ ld r1, HSTATE_HOST_R1(r13) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMINTR + bl kvmhv_accumulate_time +#endif + /* load up FP state */ bl kvmppc_load_fp @@ -2429,3 +2507,62 @@ kvmppc_fix_pmao: mtspr SPRN_PMC6, r3 isync blr + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +/* + * Start timing an activity + * r3 = pointer to time accumulation struct, r4 = vcpu + */ +kvmhv_start_timing: + ld r5, HSTATE_KVM_VCORE(r13) + lbz r6, VCORE_IN_GUEST(r5) + cmpwi r6, 0 + beq 5f /* if in guest, need to */ + ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ +5: mftb r5 + subf r5, r6, r5 + std r3, VCPU_CUR_ACTIVITY(r4) + std r5, VCPU_ACTIVITY_START(r4) + blr + +/* + * Accumulate time to one activity and start another. + * r3 = pointer to new time accumulation struct, r4 = vcpu + */ +kvmhv_accumulate_time: + ld r5, HSTATE_KVM_VCORE(r13) + lbz r8, VCORE_IN_GUEST(r5) + cmpwi r8, 0 + beq 4f /* if in guest, need to */ + ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ +4: ld r5, VCPU_CUR_ACTIVITY(r4) + ld r6, VCPU_ACTIVITY_START(r4) + std r3, VCPU_CUR_ACTIVITY(r4) + mftb r7 + subf r7, r8, r7 + std r7, VCPU_ACTIVITY_START(r4) + cmpdi r5, 0 + beqlr + subf r3, r6, r7 + ld r8, TAS_SEQCOUNT(r5) + cmpdi r8, 0 + addi r8, r8, 1 + std r8, TAS_SEQCOUNT(r5) + lwsync + ld r7, TAS_TOTAL(r5) + add r7, r7, r3 + std r7, TAS_TOTAL(r5) + ld r6, TAS_MIN(r5) + ld r7, TAS_MAX(r5) + beq 3f + cmpd r3, r6 + bge 1f +3: std r3, TAS_MIN(r5) +1: cmpd r3, r7 + ble 2f + std r3, TAS_MAX(r5) +2: lwsync + addi r8, r8, 1 + std r8, TAS_SEQCOUNT(r5) + blr +#endif From d911f0beddc2a9248dbf375fc50a4bbf30947822 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:03 +1100 Subject: [PATCH 17/29] KVM: PPC: Book3S HV: Simplify handling of VCPUs that need a VPA update Previously, if kvmppc_run_core() was running a VCPU that needed a VPA update (i.e. one of its 3 virtual processor areas needed to be pinned in memory so the host real mode code can update it on guest entry and exit), we would drop the vcore lock and do the update there and then. Future changes will make it inconvenient to drop the lock, so instead we now remove it from the list of runnable VCPUs and wake up its VCPU task. This will have the effect that the VCPU task will exit kvmppc_run_vcpu(), go around the do loop in kvmppc_vcpu_run_hv(), and re-enter kvmppc_run_vcpu(), whereupon it will do the necessary call to kvmppc_update_vpas() and then rejoin the vcore. The one complication is that the runner VCPU (whose VCPU task is the current task) might be one of the ones that gets removed from the runnable list. In that case we just return from kvmppc_run_core() and let the code in kvmppc_run_vcpu() wake up another VCPU task to be the runner if necessary. This all means that the VCORE_STARTING state is no longer used, so we remove it. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 5 +-- arch/powerpc/kvm/book3s_hv.c | 58 +++++++++++++++-------------- 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d2068bba9059..2f339ff9b851 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -306,9 +306,8 @@ struct kvmppc_vcore { /* Values for vcore_state */ #define VCORE_INACTIVE 0 #define VCORE_SLEEPING 1 -#define VCORE_STARTING 2 -#define VCORE_RUNNING 3 -#define VCORE_EXITING 4 +#define VCORE_RUNNING 2 +#define VCORE_EXITING 3 /* * Struct used to manage memory for a virtual processor area diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 64a02d4c737c..b38c10e00c16 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1863,6 +1863,25 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc) mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE); } +static void prepare_threads(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu, *vnext; + + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + if (signal_pending(vcpu->arch.run_task)) + vcpu->arch.ret = -EINTR; + else if (vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending) + vcpu->arch.ret = RESUME_GUEST; + else + continue; + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } +} + /* * Run a set of guest threads on a physical core. * Called with vc->lock held. @@ -1872,45 +1891,30 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) struct kvm_vcpu *vcpu, *vnext; long ret; u64 now; - int i, need_vpa_update; + int i; int srcu_idx; - struct kvm_vcpu *vcpus_to_update[threads_per_core]; - - /* don't start if any threads have a signal pending */ - need_vpa_update = 0; - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { - if (signal_pending(vcpu->arch.run_task)) - return; - if (vcpu->arch.vpa.update_pending || - vcpu->arch.slb_shadow.update_pending || - vcpu->arch.dtl.update_pending) - vcpus_to_update[need_vpa_update++] = vcpu; - } /* - * Initialize *vc, in particular vc->vcore_state, so we can - * drop the vcore lock if necessary. + * Remove from the list any threads that have a signal pending + * or need a VPA update done + */ + prepare_threads(vc); + + /* if the runner is no longer runnable, let the caller pick a new one */ + if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE) + return; + + /* + * Initialize *vc. */ vc->n_woken = 0; vc->nap_count = 0; vc->entry_exit_count = 0; vc->preempt_tb = TB_NIL; - vc->vcore_state = VCORE_STARTING; vc->in_guest = 0; vc->napping_threads = 0; vc->conferring_threads = 0; - /* - * Updating any of the vpas requires calling kvmppc_pin_guest_page, - * which can't be called with any spinlocks held. - */ - if (need_vpa_update) { - spin_unlock(&vc->lock); - for (i = 0; i < need_vpa_update; ++i) - kvmppc_update_vpas(vcpus_to_update[i]); - spin_lock(&vc->lock); - } - /* * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this From 1f09c3ed86287d40fef90611cbbee055313f52cf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:04 +1100 Subject: [PATCH 18/29] KVM: PPC: Book3S HV: Minor cleanups * Remove unused kvmppc_vcore::n_busy field. * Remove setting of RMOR, since it was only used on PPC970 and the PPC970 KVM support has been removed. * Don't use r1 or r2 in setting the runlatch since they are conventionally reserved for other things; use r0 instead. * Streamline the code a little and remove the ext_interrupt_to_host label. * Add some comments about register usage. * hcall_try_real_mode doesn't need to be global, and can't be called from C code anyway. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 2 -- arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kvm/book3s_hv_rmhandlers.S | 44 +++++++++++-------------- 3 files changed, 19 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2f339ff9b851..3eecd8868b01 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -227,7 +227,6 @@ struct kvm_arch { unsigned long host_sdr1; int tlbie_lock; unsigned long lpcr; - unsigned long rmor; unsigned long vrma_slb_v; int hpte_setup_done; u32 hpt_order; @@ -271,7 +270,6 @@ struct kvm_arch { */ struct kvmppc_vcore { int n_runnable; - int n_busy; int num_threads; int entry_exit_count; int n_woken; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 3fea721f0da5..92ec3fcb3415 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -505,7 +505,6 @@ int main(void) DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); - DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index b06fe53fd509..f8267e5b3a50 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -245,9 +245,9 @@ kvm_novcpu_exit: kvm_start_guest: /* Set runlatch bit the minute you wake up from nap */ - mfspr r1, SPRN_CTRLF - ori r1, r1, 1 - mtspr SPRN_CTRLT, r1 + mfspr r0, SPRN_CTRLF + ori r0, r0, 1 + mtspr SPRN_CTRLT, r0 ld r2,PACATOC(r13) @@ -493,11 +493,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) cmpwi r0,0 beq 20b - /* Set LPCR and RMOR. */ + /* Set LPCR. */ 10: ld r8,VCORE_LPCR(r5) mtspr SPRN_LPCR,r8 - ld r8,KVM_RMOR(r9) - mtspr SPRN_RMOR,r8 isync /* Check if HDEC expires soon */ @@ -1075,7 +1073,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) bne 2f mfspr r3,SPRN_HDEC cmpwi r3,0 - bge ignore_hdec + mr r4,r9 + bge fast_guest_return 2: /* See if this is an hcall we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL @@ -1083,26 +1082,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* External interrupt ? */ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - bne+ ext_interrupt_to_host + bne+ guest_exit_cont /* External interrupt, first check for host_ipi. If this is * set, we know the host wants us out so let's do it now */ bl kvmppc_read_intr cmpdi r3, 0 - bgt ext_interrupt_to_host + bgt guest_exit_cont /* Check if any CPU is heading out to the host, if so head out too */ ld r5, HSTATE_KVM_VCORE(r13) lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 - bge ext_interrupt_to_host - - /* Return to guest after delivering any pending interrupt */ mr r4, r9 - b deliver_guest_interrupt - -ext_interrupt_to_host: + blt deliver_guest_interrupt guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save more register state */ @@ -1763,8 +1757,10 @@ kvmppc_hisi: * Returns to the guest if we handle it, or continues on up to * the kernel if we can't (i.e. if we don't have a handler for * it, or if the handler returns H_TOO_HARD). + * + * r5 - r8 contain hcall args, + * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca */ - .globl hcall_try_real_mode hcall_try_real_mode: ld r3,VCPU_GPR(R3)(r9) andi. r0,r11,MSR_PR @@ -2024,10 +2020,6 @@ hcall_real_table: .globl hcall_real_table_end hcall_real_table_end: -ignore_hdec: - mr r4,r9 - b fast_guest_return - _GLOBAL(kvmppc_h_set_xdabr) andi. r0, r5, DABRX_USER | DABRX_KERNEL beq 6f @@ -2066,7 +2058,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, 0 blr -_GLOBAL(kvmppc_h_cede) +_GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ ori r11,r11,MSR_EE std r11,VCPU_MSR(r3) li r0,1 @@ -2148,9 +2140,9 @@ _GLOBAL(kvmppc_h_cede) * runlatch bit before napping. */ kvm_do_nap: - mfspr r2, SPRN_CTRLF - clrrdi r2, r2, 1 - mtspr SPRN_CTRLT, r2 + mfspr r0, SPRN_CTRLF + clrrdi r0, r0, 1 + mtspr SPRN_CTRLT, r0 li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) @@ -2282,13 +2274,14 @@ machine_check_realmode: /* * Check the reason we woke from nap, and take appropriate action. - * Returns: + * Returns (in r3): * 0 if nothing needs to be done * 1 if something happened that needs to be handled by the host * -1 if there was a guest wakeup (IPI) * * Also sets r12 to the interrupt vector for any interrupt that needs * to be handled now by the host (0x500 for external interrupt), or zero. + * Modifies r0, r6, r7, r8. */ kvmppc_check_wake_reason: mfspr r6, SPRN_SRR1 @@ -2324,6 +2317,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * 0 if no interrupt is pending * 1 if an interrupt is pending that needs to be handled by the host * -1 if there was a guest wakeup IPI (which has now been cleared) + * Modifies r0, r6, r7, r8, returns value in r3. */ kvmppc_read_intr: /* see if a host IPI is pending */ From 25fedfca94cfbf2461314c6c34ef58e74a31b025 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:05 +1100 Subject: [PATCH 19/29] KVM: PPC: Book3S HV: Move vcore preemption point up into kvmppc_run_vcpu Rather than calling cond_resched() in kvmppc_run_core() before doing the post-processing for the vcpus that we have just run (that is, calling kvmppc_handle_exit_hv(), kvmppc_set_timer(), etc.), we now do that post-processing before calling cond_resched(), and that post- processing is moved out into its own function, post_guest_process(). The reschedule point is now in kvmppc_run_vcpu() and we define a new vcore state, VCORE_PREEMPT, to indicate that that the vcore's runner task is runnable but not running. (Doing the reschedule with the vcore in VCORE_INACTIVE state would be bad because there are potentially other vcpus waiting for the runner in kvmppc_wait_for_exec() which then wouldn't get woken up.) Also, we make use of the handy cond_resched_lock() function, which unlocks and relocks vc->lock for us around the reschedule. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 5 +- arch/powerpc/kvm/book3s_hv.c | 92 ++++++++++++++++------------- 2 files changed, 55 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3eecd8868b01..83c44257b005 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -304,8 +304,9 @@ struct kvmppc_vcore { /* Values for vcore_state */ #define VCORE_INACTIVE 0 #define VCORE_SLEEPING 1 -#define VCORE_RUNNING 2 -#define VCORE_EXITING 3 +#define VCORE_PREEMPT 2 +#define VCORE_RUNNING 3 +#define VCORE_EXITING 4 /* * Struct used to manage memory for a virtual processor area diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b38c10e00c16..fb4f16628a51 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1882,15 +1882,50 @@ static void prepare_threads(struct kvmppc_vcore *vc) } } +static void post_guest_process(struct kvmppc_vcore *vc) +{ + u64 now; + long ret; + struct kvm_vcpu *vcpu, *vnext; + + now = get_tb(); + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + /* cancel pending dec exception if dec is positive */ + if (now < vcpu->arch.dec_expires && + kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + trace_kvm_guest_exit(vcpu); + + ret = RESUME_GUEST; + if (vcpu->arch.trap) + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); + + vcpu->arch.ret = ret; + vcpu->arch.trap = 0; + + if (vcpu->arch.ceded) { + if (!is_kvmppc_resume_guest(ret)) + kvmppc_end_cede(vcpu); + else + kvmppc_set_timer(vcpu); + } + if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } + } +} + /* * Run a set of guest threads on a physical core. * Called with vc->lock held. */ static void kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; - long ret; - u64 now; + struct kvm_vcpu *vcpu; int i; int srcu_idx; @@ -1922,8 +1957,11 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) */ if ((threads_per_core > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { vcpu->arch.ret = -EBUSY; + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } goto out; } @@ -1979,44 +2017,12 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) kvm_guest_exit(); preempt_enable(); - cond_resched(); spin_lock(&vc->lock); - now = get_tb(); - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { - /* cancel pending dec exception if dec is positive */ - if (now < vcpu->arch.dec_expires && - kvmppc_core_pending_dec(vcpu)) - kvmppc_core_dequeue_dec(vcpu); - - trace_kvm_guest_exit(vcpu); - - ret = RESUME_GUEST; - if (vcpu->arch.trap) - ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, - vcpu->arch.run_task); - - vcpu->arch.ret = ret; - vcpu->arch.trap = 0; - - if (vcpu->arch.ceded) { - if (!is_kvmppc_resume_guest(ret)) - kvmppc_end_cede(vcpu); - else - kvmppc_set_timer(vcpu); - } - } + post_guest_process(vc); out: vc->vcore_state = VCORE_INACTIVE; - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, - arch.run_list) { - if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { - kvmppc_remove_runnable(vc, vcpu); - wake_up(&vcpu->arch.cpu_run); - } - } - trace_kvmppc_run_core(vc, 1); } @@ -2138,7 +2144,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) break; - vc->runner = vcpu; n_ceded = 0; list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { if (!v->arch.pending_exceptions) @@ -2146,10 +2151,17 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) else v->arch.ceded = 0; } - if (n_ceded == vc->n_runnable) + vc->runner = vcpu; + if (n_ceded == vc->n_runnable) { kvmppc_vcore_blocked(vc); - else + } else if (should_resched()) { + vc->vcore_state = VCORE_PREEMPT; + /* Let something else run */ + cond_resched_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; + } else { kvmppc_run_core(vc); + } vc->runner = NULL; } From 5d5b99cd6818bdbea287d23ef055bba1a8a9e648 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:06 +1100 Subject: [PATCH 20/29] KVM: PPC: Book3S HV: Get rid of vcore nap_count and n_woken We can tell when a secondary thread has finished running a guest by the fact that it clears its kvm_hstate.kvm_vcpu pointer, so there is no real need for the nap_count field in the kvmppc_vcore struct. This changes kvmppc_wait_for_nap to poll the kvm_hstate.kvm_vcpu pointers of the secondary threads rather than polling vc->nap_count. Besides reducing the size of the kvmppc_vcore struct by 8 bytes, this also means that we can tell which secondary threads have got stuck and thus print a more informative error message. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 2 -- arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kvm/book3s_hv.c | 47 ++++++++++++++----------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 19 ++++------ 4 files changed, 34 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 83c44257b005..1517faac9b98 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -272,8 +272,6 @@ struct kvmppc_vcore { int n_runnable; int num_threads; int entry_exit_count; - int n_woken; - int nap_count; int napping_threads; int first_vcpuid; u16 pcpu; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 92ec3fcb3415..8aa824681d76 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -563,7 +563,6 @@ int main(void) DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); - DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm)); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fb4f16628a51..7c1335dca4a5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1729,8 +1729,10 @@ static int kvmppc_grab_hwthread(int cpu) tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ - tpaca->kvm_hstate.hwthread_req = 1; tpaca->kvm_hstate.kvm_vcpu = NULL; + tpaca->kvm_hstate.napping = 0; + smp_wmb(); + tpaca->kvm_hstate.hwthread_req = 1; /* * If the thread is already executing in the kernel (e.g. handling @@ -1773,35 +1775,43 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) } cpu = vc->pcpu + vcpu->arch.ptid; tpaca = &paca[cpu]; - tpaca->kvm_hstate.kvm_vcpu = vcpu; tpaca->kvm_hstate.kvm_vcore = vc; tpaca->kvm_hstate.ptid = vcpu->arch.ptid; vcpu->cpu = vc->pcpu; + /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */ smp_wmb(); + tpaca->kvm_hstate.kvm_vcpu = vcpu; #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) - if (cpu != smp_processor_id()) { + if (cpu != smp_processor_id()) xics_wake_cpu(cpu); - if (vcpu->arch.ptid) - ++vc->n_woken; - } #endif } -static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) +static void kvmppc_wait_for_nap(void) { - int i; + int cpu = smp_processor_id(); + int i, loops; - HMT_low(); - i = 0; - while (vc->nap_count < vc->n_woken) { - if (++i >= 1000000) { - pr_err("kvmppc_wait_for_nap timeout %d %d\n", - vc->nap_count, vc->n_woken); - break; + for (loops = 0; loops < 1000000; ++loops) { + /* + * Check if all threads are finished. + * We set the vcpu pointer when starting a thread + * and the thread clears it when finished, so we look + * for any threads that still have a non-NULL vcpu ptr. + */ + for (i = 1; i < threads_per_subcore; ++i) + if (paca[cpu + i].kvm_hstate.kvm_vcpu) + break; + if (i == threads_per_subcore) { + HMT_medium(); + return; } - cpu_relax(); + HMT_low(); } HMT_medium(); + for (i = 1; i < threads_per_subcore; ++i) + if (paca[cpu + i].kvm_hstate.kvm_vcpu) + pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); } /* @@ -1942,8 +1952,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) /* * Initialize *vc. */ - vc->n_woken = 0; - vc->nap_count = 0; vc->entry_exit_count = 0; vc->preempt_tb = TB_NIL; vc->in_guest = 0; @@ -2002,8 +2010,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) vcpu->cpu = -1; /* wait for secondary threads to finish writing their state to memory */ - if (vc->nap_count < vc->n_woken) - kvmppc_wait_for_nap(vc); + kvmppc_wait_for_nap(); for (i = 0; i < threads_per_subcore; ++i) kvmppc_release_hwthread(vc->pcpu + i); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index f8267e5b3a50..6716db3f1a2b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -292,26 +292,21 @@ kvm_secondary_got_guest: ld r6, PACA_DSCR(r13) std r6, HSTATE_DSCR(r13) + /* Order load of vcore, ptid etc. after load of vcpu */ + lwsync bl kvmppc_hv_entry /* Back from the guest, go back to nap */ /* Clear our vcpu pointer so we don't come back in early */ li r0, 0 - std r0, HSTATE_KVM_VCPU(r13) /* - * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing - * the nap_count, because once the increment to nap_count is - * visible we could be given another vcpu. + * Once we clear HSTATE_KVM_VCPU(r13), the code in + * kvmppc_run_core() is going to assume that all our vcpu + * state is visible in memory. This lwsync makes sure + * that that is true. */ lwsync - - /* increment the nap count and then go to nap mode */ - ld r4, HSTATE_KVM_VCORE(r13) - addi r4, r4, VCORE_NAP_COUNT -51: lwarx r3, 0, r4 - addi r3, r3, 1 - stwcx. r3, 0, r4 - bne 51b + std r0, HSTATE_KVM_VCPU(r13) /* * At this point we have finished executing in the guest. From ccc07772c99befeda7a7a4b1d05a6f3b762518c2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:07 +1100 Subject: [PATCH 21/29] KVM: PPC: Book3S HV: Don't wake thread with no vcpu on guest IPI When running a multi-threaded guest and vcpu 0 in a virtual core is not running in the guest (i.e. it is busy elsewhere in the host), thread 0 of the physical core will switch the MMU to the guest and then go to nap mode in the code at kvm_do_nap. If the guest sends an IPI to thread 0 using the msgsndp instruction, that will wake up thread 0 and cause all the threads in the guest to exit to the host unnecessarily. To avoid the unnecessary exit, this arranges for the PECEDP bit to be cleared in this situation. When napping due to a H_CEDE from the guest, we still set PECEDP so that the thread will wake up on an IPI sent using msgsndp. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6716db3f1a2b..12d7e4c8dc54 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -191,6 +191,7 @@ kvmppc_primary_no_guest: li r3, NAPPING_NOVCPU stb r3, HSTATE_NAPPING(r13) + li r3, 0 /* Don't wake on privileged (OS) doorbell */ b kvm_do_nap kvm_novcpu_wakeup: @@ -2129,10 +2130,13 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ bl kvmhv_accumulate_time #endif + lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */ + /* * Take a nap until a decrementer or external or doobell interrupt - * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the - * runlatch bit before napping. + * occurs, with PECE1 and PECE0 set in LPCR. + * On POWER8, if we are ceding, also set PECEDP. + * Also clear the runlatch bit before napping. */ kvm_do_nap: mfspr r0, SPRN_CTRLF @@ -2144,7 +2148,7 @@ kvm_do_nap: mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION - oris r5,r5,LPCR_PECEDP@h + rlwimi r5, r3, 0, LPCR_PECEDP END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_LPCR,r5 isync From fd6d53b12410b4b73e3996629350dee3f4a7994f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:08 +1100 Subject: [PATCH 22/29] KVM: PPC: Book3S HV: Use decrementer to wake napping threads This arranges for threads that are napping due to their vcpu having ceded or due to not having a vcpu to wake up at the end of the guest's timeslice without having to be poked with an IPI. We do that by arranging for the decrementer to contain a value no greater than the number of timebase ticks remaining until the end of the timeslice. In the case of a thread with no vcpu, this number is in the hypervisor decrementer already. In the case of a ceded vcpu, we use the smaller of the HDEC value and the DEC value. Using the DEC like this when ceded means we need to save and restore the guest decrementer value around the nap. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 43 +++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 12d7e4c8dc54..16719af88bcc 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -172,6 +172,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ + /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ + mfspr r3, SPRN_HDEC + mtspr SPRN_DEC, r3 /* set our bit in napping_threads */ ld r5, HSTATE_KVM_VCORE(r13) lbz r7, HSTATE_PTID(r13) @@ -223,6 +226,12 @@ kvm_novcpu_wakeup: cmpdi r3, 0 bge kvm_novcpu_exit + /* See if our timeslice has expired (HDEC is negative) */ + mfspr r0, SPRN_HDEC + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER + cmpwi r0, 0 + blt kvm_novcpu_exit + /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ ld r4, HSTATE_KVM_VCPU(r13) cmpdi r4, 0 @@ -1493,10 +1502,10 @@ kvmhv_do_exit: /* r12 = trap, r13 = paca */ cmpwi r3,0x100 /* Are we the first here? */ bge 43f cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER - beq 40f + beq 43f li r0,0 mtspr SPRN_HDEC,r0 -40: + /* * Send an IPI to any napping threads, since an HDEC interrupt * doesn't wake CPUs up from nap. @@ -2124,6 +2133,27 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ /* save FP state */ bl kvmppc_save_fp + /* + * Set DEC to the smaller of DEC and HDEC, so that we wake + * no later than the end of our timeslice (HDEC interrupts + * don't wake us from nap). + */ + mfspr r3, SPRN_DEC + mfspr r4, SPRN_HDEC + mftb r5 + cmpw r3, r4 + ble 67f + mtspr SPRN_DEC, r4 +67: + /* save expiry time of guest decrementer */ + extsw r3, r3 + add r3, r3, r5 + ld r4, HSTATE_KVM_VCPU(r13) + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_TB_OFFSET(r5) + subf r3, r6, r3 /* convert to host TB value */ + std r3, VCPU_DEC_EXPIRES(r4) + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING ld r4, HSTATE_KVM_VCPU(r13) addi r3, r4, VCPU_TB_CEDE @@ -2181,6 +2211,15 @@ kvm_end_cede: /* load up FP state */ bl kvmppc_load_fp + /* Restore guest decrementer */ + ld r3, VCPU_DEC_EXPIRES(r4) + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_TB_OFFSET(r5) + add r3, r3, r6 /* convert host TB to guest TB value */ + mftb r7 + subf r3, r7, r3 + mtspr SPRN_DEC, r3 + /* Load NV GPRS */ ld r14, VCPU_GPR(R14)(r4) ld r15, VCPU_GPR(R15)(r4) From 7d6c40da198ac18bd5dd2cd18628d5b4c615d842 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:09 +1100 Subject: [PATCH 23/29] KVM: PPC: Book3S HV: Use bitmap of active threads rather than count Currently, the entry_exit_count field in the kvmppc_vcore struct contains two 8-bit counts, one of the threads that have started entering the guest, and one of the threads that have started exiting the guest. This changes it to an entry_exit_map field which contains two bitmaps of 8 bits each. The advantage of doing this is that it gives us a bitmap of which threads need to be signalled when exiting the guest. That means that we no longer need to use the trick of setting the HDEC to 0 to pull the other threads out of the guest, which led in some cases to a spurious HDEC interrupt on the next guest entry. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 15 +++--- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 5 +- arch/powerpc/kvm/book3s_hv_builtin.c | 10 ++-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 61 ++++++++++++------------- 5 files changed, 44 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1517faac9b98..d67a83830bd1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -263,15 +263,15 @@ struct kvm_arch { /* * Struct for a virtual core. - * Note: entry_exit_count combines an entry count in the bottom 8 bits - * and an exit count in the next 8 bits. This is so that we can - * atomically increment the entry count iff the exit count is 0 - * without taking the lock. + * Note: entry_exit_map combines a bitmap of threads that have entered + * in the bottom 8 bits and a bitmap of threads that have exited in the + * next 8 bits. This is so that we can atomically set the entry bit + * iff the exit map is 0 without taking a lock. */ struct kvmppc_vcore { int n_runnable; int num_threads; - int entry_exit_count; + int entry_exit_map; int napping_threads; int first_vcpuid; u16 pcpu; @@ -296,8 +296,9 @@ struct kvmppc_vcore { ulong conferring_threads; }; -#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) -#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) +#define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff) +#define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8) +#define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0) /* Values for vcore_state */ #define VCORE_INACTIVE 0 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8aa824681d76..0d07efbe3fa7 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -562,7 +562,7 @@ int main(void) DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop)); DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); - DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); + DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm)); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7c1335dca4a5..ea1600ff52b2 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1952,7 +1952,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) /* * Initialize *vc. */ - vc->entry_exit_count = 0; + vc->entry_exit_map = 0; vc->preempt_tb = TB_NIL; vc->in_guest = 0; vc->napping_threads = 0; @@ -2119,8 +2119,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * this thread straight away and have it join in. */ if (!signal_pending(current)) { - if (vc->vcore_state == VCORE_RUNNING && - VCORE_EXIT_COUNT(vc) == 0) { + if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) { kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); trace_kvm_guest_enter(vcpu); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 1954a1c4b1f9..275425142bb7 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -115,11 +115,11 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, int rv = H_SUCCESS; /* => don't yield */ set_bit(vcpu->arch.ptid, &vc->conferring_threads); - while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) { - threads_running = VCORE_ENTRY_COUNT(vc); - threads_ceded = hweight32(vc->napping_threads); - threads_conferring = hweight32(vc->conferring_threads); - if (threads_ceded + threads_conferring >= threads_running) { + while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) { + threads_running = VCORE_ENTRY_MAP(vc); + threads_ceded = vc->napping_threads; + threads_conferring = vc->conferring_threads; + if ((threads_ceded | threads_conferring) == threads_running) { rv = H_TOO_HARD; /* => do yield */ break; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 16719af88bcc..245f5c972030 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -185,7 +185,7 @@ kvmppc_primary_no_guest: or r3, r3, r0 stwcx. r3, 0, r6 bne 1b - /* order napping_threads update vs testing entry_exit_count */ + /* order napping_threads update vs testing entry_exit_map */ isync li r12, 0 lwz r7, VCORE_ENTRY_EXIT(r5) @@ -406,19 +406,21 @@ kvmppc_hv_entry: * We don't have to lock against concurrent tlbies, * but we do have to coordinate across hardware threads. */ - /* Increment entry count iff exit count is zero. */ - ld r5,HSTATE_KVM_VCORE(r13) - addi r9,r5,VCORE_ENTRY_EXIT -21: lwarx r3,0,r9 - cmpwi r3,0x100 /* any threads starting to exit? */ + /* Set bit in entry map iff exit map is zero. */ + ld r5, HSTATE_KVM_VCORE(r13) + li r7, 1 + lbz r6, HSTATE_PTID(r13) + sld r7, r7, r6 + addi r9, r5, VCORE_ENTRY_EXIT +21: lwarx r3, 0, r9 + cmpwi r3, 0x100 /* any threads starting to exit? */ bge secondary_too_late /* if so we're too late to the party */ - addi r3,r3,1 - stwcx. r3,0,r9 + or r3, r3, r7 + stwcx. r3, 0, r9 bne 21b /* Primary thread switches to guest partition. */ ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ - lbz r6,HSTATE_PTID(r13) cmpwi r6,0 bne 20f ld r6,KVM_SDR1(r9) @@ -1477,13 +1479,16 @@ kvmhv_do_exit: /* r12 = trap, r13 = paca */ * We don't have to lock against tlbies but we do * have to coordinate the hardware threads. */ - /* Increment the threads-exiting-guest count in the 0xff00 - bits of vcore->entry_exit_count */ - ld r5,HSTATE_KVM_VCORE(r13) - addi r6,r5,VCORE_ENTRY_EXIT -41: lwarx r3,0,r6 - addi r0,r3,0x100 - stwcx. r0,0,r6 + /* Set our bit in the threads-exiting-guest map in the 0xff00 + bits of vcore->entry_exit_map */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r4, HSTATE_PTID(r13) + li r7, 0x100 + sld r7, r7, r4 + addi r6, r5, VCORE_ENTRY_EXIT +41: lwarx r3, 0, r6 + or r0, r3, r7 + stwcx. r0, 0, r6 bne 41b isync /* order stwcx. vs. reading napping_threads */ @@ -1492,9 +1497,9 @@ kvmhv_do_exit: /* r12 = trap, r13 = paca */ * up to the kernel or qemu; we can't handle it in real mode. * Thus we have to do a partition switch, so we have to * collect the other threads, if we are the first thread - * to take an interrupt. To do this, we set the HDEC to 0, - * which causes an HDEC interrupt in all threads within 2ns - * because the HDEC register is shared between all 4 threads. + * to take an interrupt. To do this, we send a message or + * IPI to all the threads that have their bit set in the entry + * map in vcore->entry_exit_map (other than ourselves). * However, we don't need to bother if this is an HDEC * interrupt, since the other threads will already be on their * way here in that case. @@ -1503,17 +1508,8 @@ kvmhv_do_exit: /* r12 = trap, r13 = paca */ bge 43f cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER beq 43f - li r0,0 - mtspr SPRN_HDEC,r0 - /* - * Send an IPI to any napping threads, since an HDEC interrupt - * doesn't wake CPUs up from nap. - */ - lwz r3,VCORE_NAPPING_THREADS(r5) - lbz r4,HSTATE_PTID(r13) - li r0,1 - sld r0,r0,r4 + srwi r0,r7,8 andc. r3,r3,r0 /* no sense IPI'ing ourselves */ beq 43f /* Order entry/exit update vs. IPIs */ @@ -2091,12 +2087,11 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ addi r6,r5,VCORE_NAPPING_THREADS 31: lwarx r4,0,r6 or r4,r4,r0 - PPC_POPCNTW(R7,R4) - cmpw r7,r8 - bge kvm_cede_exit + cmpw r4,r8 + beq kvm_cede_exit stwcx. r4,0,r6 bne 31b - /* order napping_threads update vs testing entry_exit_count */ + /* order napping_threads update vs testing entry_exit_map */ isync li r0,NAPPING_CEDE stb r0,HSTATE_NAPPING(r13) From 6af27c847ad1b889c29a641dfc41f2d78c46a048 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:10 +1100 Subject: [PATCH 24/29] KVM: PPC: Book3S HV: Streamline guest entry and exit On entry to the guest, secondary threads now wait for the primary to switch the MMU after loading up most of their state, rather than before. This means that the secondary threads get into the guest sooner, in the common case where the secondary threads get to kvmppc_hv_entry before the primary thread. On exit, the first thread out increments the exit count and interrupts the other threads (to get them out of the guest) before saving most of its state, rather than after. That means that the other threads exit sooner and means that the first thread doesn't spend so much time waiting for the other threads at the point where the MMU gets switched back to the host. This pulls out the code that increments the exit count and interrupts other threads into a separate function, kvmhv_commence_exit(). This also makes sure that r12 and vcpu->arch.trap are set correctly in some corner cases. Statistics from /sys/kernel/debug/kvm/vm*/vcpu*/timings show the improvement. Aggregating across vcpus for a guest with 32 vcpus, 8 threads/vcore, running on a POWER8, gives this before the change: rm_entry: avg 4537.3ns (222 - 48444, 1068878 samples) rm_exit: avg 4787.6ns (152 - 165490, 1010717 samples) rm_intr: avg 1673.6ns (12 - 341304, 3818691 samples) and this after the change: rm_entry: avg 3427.7ns (232 - 68150, 1118921 samples) rm_exit: avg 4716.0ns (12 - 150720, 1119477 samples) rm_intr: avg 1614.8ns (12 - 522436, 3850432 samples) showing a substantial reduction in the time spent per guest entry in the real-mode guest entry code, and smaller reductions in the real mode guest exit and interrupt handling times. (The test was to start the guest and boot Fedora 20 big-endian to the login prompt.) Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 212 ++++++++++++++---------- 1 file changed, 126 insertions(+), 86 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 245f5c972030..3f6fd78cccd2 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -175,6 +175,19 @@ kvmppc_primary_no_guest: /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ mfspr r3, SPRN_HDEC mtspr SPRN_DEC, r3 + /* + * Make sure the primary has finished the MMU switch. + * We should never get here on a secondary thread, but + * check it for robustness' sake. + */ + ld r5, HSTATE_KVM_VCORE(r13) +65: lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + beq 65b + /* Set LPCR. */ + ld r8,VCORE_LPCR(r5) + mtspr SPRN_LPCR,r8 + isync /* set our bit in napping_threads */ ld r5, HSTATE_KVM_VCORE(r13) lbz r7, HSTATE_PTID(r13) @@ -206,7 +219,7 @@ kvm_novcpu_wakeup: /* check the wake reason */ bl kvmppc_check_wake_reason - + /* see if any other thread is already exiting */ lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 @@ -244,7 +257,15 @@ kvm_novcpu_wakeup: b kvmppc_got_guest kvm_novcpu_exit: - b hdec_soon +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + beq 13f + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +#endif +13: bl kvmhv_commence_exit + b kvmhv_switch_to_host /* * We come in here when wakened from nap mode. @@ -422,7 +443,7 @@ kvmppc_hv_entry: /* Primary thread switches to guest partition. */ ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ cmpwi r6,0 - bne 20f + bne 10f ld r6,KVM_SDR1(r9) lwz r7,KVM_LPID(r9) li r0,LPID_RSVD /* switch to reserved LPID */ @@ -493,26 +514,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ - b 10f - - /* Secondary threads wait for primary to have done partition switch */ -20: lbz r0,VCORE_IN_GUEST(r5) - cmpwi r0,0 - beq 20b - - /* Set LPCR. */ -10: ld r8,VCORE_LPCR(r5) - mtspr SPRN_LPCR,r8 - isync - - /* Check if HDEC expires soon */ - mfspr r3,SPRN_HDEC - cmpwi r3,512 /* 1 microsecond */ - li r12,BOOK3S_INTERRUPT_HV_DECREMENTER - blt hdec_soon /* Do we have a guest vcpu to run? */ - cmpdi r4, 0 +10: cmpdi r4, 0 beq kvmppc_primary_no_guest kvmppc_got_guest: @@ -837,6 +841,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) clrrdi r6,r6,1 mtspr SPRN_CTRLT,r6 4: + /* Secondary threads wait for primary to have done partition switch */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r6, HSTATE_PTID(r13) + cmpwi r6, 0 + beq 21f + lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + bne 21f + HMT_LOW +20: lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + beq 20b + HMT_MEDIUM +21: + /* Set LPCR. */ + ld r8,VCORE_LPCR(r5) + mtspr SPRN_LPCR,r8 + isync + + /* Check if HDEC expires soon */ + mfspr r3, SPRN_HDEC + cmpwi r3, 512 /* 1 microsecond */ + blt hdec_soon + ld r6, VCPU_CTR(r4) lwz r7, VCPU_XER(r4) @@ -942,22 +970,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) hrfid b . -#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING secondary_too_late: + li r12, 0 cmpdi r4, 0 beq 11f + stw r12, VCPU_TRAP(r4) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r4, VCPU_TB_RMEXIT bl kvmhv_accumulate_time +#endif 11: b kvmhv_switch_to_host hdec_soon: - ld r4, HSTATE_KVM_VCPU(r13) - cmpdi r4, 0 - beq 12f + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER + stw r12, VCPU_TRAP(r4) + mr r9, r4 +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r4, VCPU_TB_RMEXIT bl kvmhv_accumulate_time -12: b kvmhv_do_exit #endif + b guest_exit_cont /****************************************************************************** * * @@ -1113,7 +1145,7 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ stw r7, VCPU_DSISR(r9) /* don't overwrite fault_dar/fault_dsisr if HDSI */ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq 6f + beq mc_cont std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) @@ -1127,8 +1159,11 @@ mc_cont: bl kvmhv_accumulate_time #endif + /* Increment exit count, poke other threads to exit */ + bl kvmhv_commence_exit + /* Save guest CTRL register, set runlatch to 1 */ -6: mfspr r6,SPRN_CTRLF + mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f @@ -1470,68 +1505,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbia ptesync -#ifndef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING -hdec_soon: -#endif -kvmhv_do_exit: /* r12 = trap, r13 = paca */ /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do * have to coordinate the hardware threads. */ - /* Set our bit in the threads-exiting-guest map in the 0xff00 - bits of vcore->entry_exit_map */ - ld r5, HSTATE_KVM_VCORE(r13) - lbz r4, HSTATE_PTID(r13) - li r7, 0x100 - sld r7, r7, r4 - addi r6, r5, VCORE_ENTRY_EXIT -41: lwarx r3, 0, r6 - or r0, r3, r7 - stwcx. r0, 0, r6 - bne 41b - isync /* order stwcx. vs. reading napping_threads */ - - /* - * At this point we have an interrupt that we have to pass - * up to the kernel or qemu; we can't handle it in real mode. - * Thus we have to do a partition switch, so we have to - * collect the other threads, if we are the first thread - * to take an interrupt. To do this, we send a message or - * IPI to all the threads that have their bit set in the entry - * map in vcore->entry_exit_map (other than ourselves). - * However, we don't need to bother if this is an HDEC - * interrupt, since the other threads will already be on their - * way here in that case. - */ - cmpwi r3,0x100 /* Are we the first here? */ - bge 43f - cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER - beq 43f - - srwi r0,r7,8 - andc. r3,r3,r0 /* no sense IPI'ing ourselves */ - beq 43f - /* Order entry/exit update vs. IPIs */ - sync - mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ - subf r6,r4,r13 -42: andi. r0,r3,1 - beq 44f - ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ - li r0,IPI_PRIORITY - li r7,XICS_MFRR - stbcix r0,r7,r8 /* trigger the IPI */ -44: srdi. r3,r3,1 - addi r6,r6,PACA_SIZE - bne 42b - -#ifndef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING -secondary_too_late: -#endif kvmhv_switch_to_host: /* Secondary threads wait for primary to do partition switch */ -43: ld r5,HSTATE_KVM_VCORE(r13) + ld r5,HSTATE_KVM_VCORE(r13) ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ lbz r3,HSTATE_PTID(r13) cmpwi r3,0 @@ -1633,6 +1614,63 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtlr r0 blr +kvmhv_commence_exit: /* r12 = trap, r13 = paca, doesn't trash r9 */ + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) + + /* Set our bit in the threads-exiting-guest map in the 0xff00 + bits of vcore->entry_exit_map */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r4, HSTATE_PTID(r13) + li r7, 0x100 + sld r7, r7, r4 + addi r6, r5, VCORE_ENTRY_EXIT +41: lwarx r3, 0, r6 + or r0, r3, r7 + stwcx. r0, 0, r6 + bne 41b + isync /* order stwcx. vs. reading napping_threads */ + + /* + * At this point we have an interrupt that we have to pass + * up to the kernel or qemu; we can't handle it in real mode. + * Thus we have to do a partition switch, so we have to + * collect the other threads, if we are the first thread + * to take an interrupt. To do this, we send a message or + * IPI to all the threads that have their bit set in the entry + * map in vcore->entry_exit_map (other than ourselves). + * However, we don't need to bother if this is an HDEC + * interrupt, since the other threads will already be on their + * way here in that case. + */ + cmpwi r3,0x100 /* Are we the first here? */ + bge 43f + cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER + beq 43f + + srwi r0,r7,8 + andc. r3,r3,r0 /* no sense IPI'ing ourselves */ + beq 43f + /* Order entry/exit update vs. IPIs */ + sync + mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ + subf r6,r4,r13 +42: andi. r0,r3,1 + beq 44f + ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ + li r0,IPI_PRIORITY + li r7,XICS_MFRR + stbcix r0,r7,r8 /* trigger the IPI */ +44: srdi. r3,r3,1 + addi r6,r6,PACA_SIZE + bne 42b + +43: ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1) + addi r1, r1, PPC_MIN_STKFRM + mtlr r0 + blr + /* * Check whether an HDSI is an HPTE not found fault or something else. * If it is an HPTE not found fault that is due to the guest accessing @@ -2068,8 +2106,8 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ lbz r5,VCPU_PRODDED(r3) cmpwi r5,0 bne kvm_cede_prodded - li r0,0 /* set trap to 0 to say hcall is handled */ - stw r0,VCPU_TRAP(r3) + li r12,0 /* set trap to 0 to say hcall is handled */ + stw r12,VCPU_TRAP(r3) li r0,H_SUCCESS std r0,VCPU_GPR(R3)(r3) @@ -2275,7 +2313,8 @@ kvm_cede_prodded: /* we've ceded but we want to give control to the host */ kvm_cede_exit: - b hcall_real_fallback + ld r9, HSTATE_KVM_VCPU(r13) + b guest_exit_cont /* Try to handle a machine check in real mode */ machine_check_realmode: @@ -2405,6 +2444,7 @@ kvmppc_read_intr: bne- 43f /* OK, it's an IPI for us */ + li r12, 0 li r3, -1 1: blr From eddb60fb1443f85c5728f1b1cd4be608c6832a79 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:11 +1100 Subject: [PATCH 25/29] KVM: PPC: Book3S HV: Translate kvmhv_commence_exit to C This replaces the assembler code for kvmhv_commence_exit() with C code in book3s_hv_builtin.c. It also moves the IPI sending code that was in book3s_hv_rm_xics.c into a new kvmhv_rm_send_ipi() function so it can be used by kvmhv_commence_exit() as well as icp_rm_set_vcpu_irq(). Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s_64.h | 2 + arch/powerpc/kvm/book3s_hv_builtin.c | 63 ++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 12 +---- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 66 +++--------------------- 4 files changed, 75 insertions(+), 68 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 869c53fe02cd..2b84e485a181 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -438,6 +438,8 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); +extern void kvmhv_rm_send_ipi(int cpu); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 275425142bb7..c42aa55b885f 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -22,6 +22,7 @@ #include #include #include +#include #define KVM_CMA_CHUNK_ORDER 18 @@ -184,3 +185,65 @@ long kvmppc_h_random(struct kvm_vcpu *vcpu) return H_HARDWARE; } + +static inline void rm_writeb(unsigned long paddr, u8 val) +{ + __asm__ __volatile__("stbcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +/* + * Send an interrupt to another CPU. + * This can only be called in real mode. + * The caller needs to include any barrier needed to order writes + * to memory vs. the IPI/message. + */ +void kvmhv_rm_send_ipi(int cpu) +{ + unsigned long xics_phys; + + /* Poke the target */ + xics_phys = paca[cpu].kvm_hstate.xics_phys; + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); +} + +/* + * The following functions are called from the assembly code + * in book3s_hv_rmhandlers.S. + */ +static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active) +{ + int cpu = vc->pcpu; + + /* Order setting of exit map vs. msgsnd/IPI */ + smp_mb(); + for (; active; active >>= 1, ++cpu) + if (active & 1) + kvmhv_rm_send_ipi(cpu); +} + +void kvmhv_commence_exit(int trap) +{ + struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; + int ptid = local_paca->kvm_hstate.ptid; + int me, ee; + + /* Set our bit in the threads-exiting-guest map in the 0xff00 + bits of vcore->entry_exit_map */ + me = 0x100 << ptid; + do { + ee = vc->entry_exit_map; + } while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee); + + /* Are we the first here? */ + if ((ee >> 8) != 0) + return; + + /* + * Trigger the other threads in this vcore to exit the guest. + * If this is a hypervisor decrementer interrupt then they + * will be already on their way out of the guest. + */ + if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER) + kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid)); +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 6dded8c75234..00e45b6d4f24 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -26,12 +26,6 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); -static inline void rm_writeb(unsigned long paddr, u8 val) -{ - __asm__ __volatile__("sync; stbcix %0,0,%1" - : : "r" (val), "r" (paddr) : "memory"); -} - /* -- ICS routines -- */ static void ics_rm_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, struct kvmppc_icp *icp) @@ -60,7 +54,6 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, struct kvm_vcpu *this_vcpu) { struct kvmppc_icp *this_icp = this_vcpu->arch.icp; - unsigned long xics_phys; int cpu; /* Mark the target VCPU as having an interrupt pending */ @@ -83,9 +76,8 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, /* In SMT cpu will always point to thread 0, we adjust it */ cpu += vcpu->arch.ptid; - /* Not too hard, then poke the target */ - xics_phys = paca[cpu].kvm_hstate.xics_phys; - rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); + smp_mb(); + kvmhv_rm_send_ipi(cpu); } static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 3f6fd78cccd2..fcf3a617cc8a 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -264,7 +264,11 @@ kvm_novcpu_exit: addi r3, r4, VCPU_TB_RMEXIT bl kvmhv_accumulate_time #endif -13: bl kvmhv_commence_exit +13: mr r3, r12 + stw r12, 112-4(r1) + bl kvmhv_commence_exit + nop + lwz r12, 112-4(r1) b kvmhv_switch_to_host /* @@ -1161,6 +1165,9 @@ mc_cont: /* Increment exit count, poke other threads to exit */ bl kvmhv_commence_exit + nop + ld r9, HSTATE_KVM_VCPU(r13) + lwz r12, VCPU_TRAP(r9) /* Save guest CTRL register, set runlatch to 1 */ mfspr r6,SPRN_CTRLF @@ -1614,63 +1621,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtlr r0 blr -kvmhv_commence_exit: /* r12 = trap, r13 = paca, doesn't trash r9 */ - mflr r0 - std r0, PPC_LR_STKOFF(r1) - stdu r1, -PPC_MIN_STKFRM(r1) - - /* Set our bit in the threads-exiting-guest map in the 0xff00 - bits of vcore->entry_exit_map */ - ld r5, HSTATE_KVM_VCORE(r13) - lbz r4, HSTATE_PTID(r13) - li r7, 0x100 - sld r7, r7, r4 - addi r6, r5, VCORE_ENTRY_EXIT -41: lwarx r3, 0, r6 - or r0, r3, r7 - stwcx. r0, 0, r6 - bne 41b - isync /* order stwcx. vs. reading napping_threads */ - - /* - * At this point we have an interrupt that we have to pass - * up to the kernel or qemu; we can't handle it in real mode. - * Thus we have to do a partition switch, so we have to - * collect the other threads, if we are the first thread - * to take an interrupt. To do this, we send a message or - * IPI to all the threads that have their bit set in the entry - * map in vcore->entry_exit_map (other than ourselves). - * However, we don't need to bother if this is an HDEC - * interrupt, since the other threads will already be on their - * way here in that case. - */ - cmpwi r3,0x100 /* Are we the first here? */ - bge 43f - cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER - beq 43f - - srwi r0,r7,8 - andc. r3,r3,r0 /* no sense IPI'ing ourselves */ - beq 43f - /* Order entry/exit update vs. IPIs */ - sync - mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ - subf r6,r4,r13 -42: andi. r0,r3,1 - beq 44f - ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ - li r0,IPI_PRIORITY - li r7,XICS_MFRR - stbcix r0,r7,r8 /* trigger the IPI */ -44: srdi. r3,r3,1 - addi r6,r6,PACA_SIZE - bne 42b - -43: ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1) - addi r1, r1, PPC_MIN_STKFRM - mtlr r0 - blr - /* * Check whether an HDSI is an HPTE not found fault or something else. * If it is an HPTE not found fault that is due to the guest accessing From 66feed61cdf6ee65fd551d3460b1efba6bee55b8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2015 14:21:12 +1100 Subject: [PATCH 26/29] KVM: PPC: Book3S HV: Use msgsnd for signalling threads on POWER8 This uses msgsnd where possible for signalling other threads within the same core on POWER8 systems, rather than IPIs through the XICS interrupt controller. This includes waking secondary threads to run the guest, the interrupts generated by the virtual XICS, and the interrupts to bring the other threads out of the guest when exiting. Aggregated statistics from debugfs across vcpus for a guest with 32 vcpus, 8 threads/vcore, running on a POWER8, show this before the change: rm_entry: 3387.6ns (228 - 86600, 1008969 samples) rm_exit: 4561.5ns (12 - 3477452, 1009402 samples) rm_intr: 1660.0ns (12 - 553050, 3600051 samples) and this after the change: rm_entry: 3060.1ns (212 - 65138, 953873 samples) rm_exit: 4244.1ns (12 - 9693408, 954331 samples) rm_intr: 1342.3ns (12 - 1104718, 3405326 samples) for a test of booting Fedora 20 big-endian to the login prompt. The time taken for a H_PROD hcall (which is handled in the host kernel) went down from about 35 microseconds to about 16 microseconds with this change. The noinline added to kvmppc_run_core turned out to be necessary for good performance, at least with gcc 4.9.2 as packaged with Fedora 21 and a little-endian POWER8 host. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kernel/asm-offsets.c | 3 ++ arch/powerpc/kvm/book3s_hv.c | 51 ++++++++++++++++--------- arch/powerpc/kvm/book3s_hv_builtin.c | 16 +++++++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 22 +++++++++-- 4 files changed, 70 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0d07efbe3fa7..0034b6b3556a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef CONFIG_PPC64 #include #include @@ -759,5 +760,7 @@ int main(void) offsetof(struct paca_struct, subcore_sibling_mask)); #endif + DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ea1600ff52b2..48d3c5d2ecc9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -84,9 +85,35 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +static bool kvmppc_ipi_thread(int cpu) +{ + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + preempt_disable(); + if (cpu_first_thread_sibling(cpu) == + cpu_first_thread_sibling(smp_processor_id())) { + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + msg |= cpu_thread_in_core(cpu); + smp_mb(); + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); + preempt_enable(); + return true; + } + preempt_enable(); + } + +#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) + if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) { + xics_wake_cpu(cpu); + return true; + } +#endif + + return false; +} + static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { - int me; int cpu = vcpu->cpu; wait_queue_head_t *wqp; @@ -96,20 +123,12 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_wakeup; } - me = get_cpu(); + if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid)) + return; /* CPU points to the first thread of the core */ - if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { -#ifdef CONFIG_PPC_ICP_NATIVE - int real_cpu = cpu + vcpu->arch.ptid; - if (paca[real_cpu].kvm_hstate.xics_phys) - xics_wake_cpu(real_cpu); - else -#endif - if (cpu_online(cpu)) - smp_send_reschedule(cpu); - } - put_cpu(); + if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu)) + smp_send_reschedule(cpu); } /* @@ -1781,10 +1800,8 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */ smp_wmb(); tpaca->kvm_hstate.kvm_vcpu = vcpu; -#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (cpu != smp_processor_id()) - xics_wake_cpu(cpu); -#endif + kvmppc_ipi_thread(cpu); } static void kvmppc_wait_for_nap(void) @@ -1933,7 +1950,7 @@ static void post_guest_process(struct kvmppc_vcore *vc) * Run a set of guest threads on a physical core. * Called with vc->lock held. */ -static void kvmppc_run_core(struct kvmppc_vcore *vc) +static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) { struct kvm_vcpu *vcpu; int i; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index c42aa55b885f..ed2589d4593f 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #define KVM_CMA_CHUNK_ORDER 18 @@ -193,7 +195,7 @@ static inline void rm_writeb(unsigned long paddr, u8 val) } /* - * Send an interrupt to another CPU. + * Send an interrupt or message to another CPU. * This can only be called in real mode. * The caller needs to include any barrier needed to order writes * to memory vs. the IPI/message. @@ -202,7 +204,17 @@ void kvmhv_rm_send_ipi(int cpu) { unsigned long xics_phys; - /* Poke the target */ + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + cpu_first_thread_sibling(cpu) == + cpu_first_thread_sibling(raw_smp_processor_id())) { + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + msg |= cpu_thread_in_core(cpu); + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); + return; + } + + /* Else poke the target with an IPI */ xics_phys = paca[cpu].kvm_hstate.xics_phys; rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index fcf3a617cc8a..4d70df26c402 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1123,6 +1123,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode + /* Hypervisor doorbell - exit only if host IPI flag set */ + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + bne 3f + lbz r0, HSTATE_HOST_IPI(r13) + beq 4f + b guest_exit_cont +3: /* External interrupt ? */ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL bne+ guest_exit_cont @@ -1135,7 +1142,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) bgt guest_exit_cont /* Check if any CPU is heading out to the host, if so head out too */ - ld r5, HSTATE_KVM_VCORE(r13) +4: ld r5, HSTATE_KVM_VCORE(r13) lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 mr r4, r9 @@ -2148,7 +2155,7 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ /* * Take a nap until a decrementer or external or doobell interrupt * occurs, with PECE1 and PECE0 set in LPCR. - * On POWER8, if we are ceding, also set PECEDP. + * On POWER8, set PECEDH, and if we are ceding, also set PECEDP. * Also clear the runlatch bit before napping. */ kvm_do_nap: @@ -2161,6 +2168,7 @@ kvm_do_nap: mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION + ori r5, r5, LPCR_PECEDH rlwimi r5, r3, 0, LPCR_PECEDP END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_LPCR,r5 @@ -2299,7 +2307,7 @@ machine_check_realmode: * Returns (in r3): * 0 if nothing needs to be done * 1 if something happened that needs to be handled by the host - * -1 if there was a guest wakeup (IPI) + * -1 if there was a guest wakeup (IPI or msgsnd) * * Also sets r12 to the interrupt vector for any interrupt that needs * to be handled now by the host (0x500 for external interrupt), or zero. @@ -2330,7 +2338,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* hypervisor doorbell */ 3: li r12, BOOK3S_INTERRUPT_H_DOORBELL + /* see if it's a host IPI */ li r3, 1 + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bnelr + /* if not, clear it and return -1 */ + lis r6, (PPC_DBELL_SERVER << (63-36))@h + PPC_MSGCLR(6) + li r3, -1 blr /* From 085e68eeafbf76e21848ad5bafaecec88a11dd64 Mon Sep 17 00:00:00 2001 From: Ben Serebrin Date: Thu, 16 Apr 2015 11:58:05 -0700 Subject: [PATCH 27/29] KVM: VMX: Preserve host CR4.MCE value while in guest mode. The host's decision to enable machine check exceptions should remain in force during non-root mode. KVM was writing 0 to cr4 on VCPU reset and passed a slightly-modified 0 to the vmcs.guest_cr4 value. Tested: Built. On earlier version, tested by injecting machine check while a guest is spinning. Before the change, if guest CR4.MCE==0, then the machine check is escalated to Catastrophic Error (CATERR) and the machine dies. If guest CR4.MCE==1, then the machine check causes VMEXIT and is handled normally by host Linux. After the change, injecting a machine check causes normal Linux machine check handling. Signed-off-by: Ben Serebrin Reviewed-by: Venkatesh Srinivas Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f5e8dce8046c..f7b61687bd79 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + /* + * Pass through host's Machine Check Enable value to hw_cr4, which + * is in force while we are in guest mode. Do not let guests control + * this bit, even if host CR4.MCE == 0. + */ + unsigned long hw_cr4 = + (cr4_read_shadow() & X86_CR4_MCE) | + (cr4 & ~X86_CR4_MCE) | + (to_vmx(vcpu)->rmode.vm86_active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); if (cr4 & X86_CR4_VMXE) { /* From 0b3289ebc2d50cf5ab778215ed0b4075bbae6629 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 13 Apr 2015 15:01:59 +0200 Subject: [PATCH 28/29] KVM: arm: irqfd: fix value returned by kvm_irq_map_gsi irqfd/arm curently does not support routing. kvm_irq_map_gsi is supposed to return all the routing entries associated with the provided gsi and return the number of those entries. We should return 0 at this point. Signed-off-by: Eric Auger Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 8d550ff14700..7ed78735f611 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -2141,7 +2141,7 @@ int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { - return gsi; + return 0; } int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) From fd1d0ddf2ae92fb3df42ed476939861806c5d785 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 10 Apr 2015 16:17:59 +0100 Subject: [PATCH 29/29] KVM: arm/arm64: check IRQ number on userland injection When userland injects a SPI via the KVM_IRQ_LINE ioctl we currently only check it against a fixed limit, which historically is set to 127. With the new dynamic IRQ allocation the effective limit may actually be smaller (64). So when now a malicious or buggy userland injects a SPI in that range, we spill over on our VGIC bitmaps and bytemaps memory. I could trigger a host kernel NULL pointer dereference with current mainline by injecting some bogus IRQ number from a hacked kvmtool: ----------------- .... DEBUG: kvm_vgic_inject_irq(kvm, cpu=0, irq=114, level=1) DEBUG: vgic_update_irq_pending(kvm, cpu=0, irq=114, level=1) DEBUG: IRQ #114 still in the game, writing to bytemap now... Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = ffffffc07652e000 [00000000] *pgd=00000000f658b003, *pud=00000000f658b003, *pmd=0000000000000000 Internal error: Oops: 96000006 [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 1053 Comm: lkvm-msi-irqinj Not tainted 4.0.0-rc7+ #3027 Hardware name: FVP Base (DT) task: ffffffc0774e9680 ti: ffffffc0765a8000 task.ti: ffffffc0765a8000 PC is at kvm_vgic_inject_irq+0x234/0x310 LR is at kvm_vgic_inject_irq+0x30c/0x310 pc : [] lr : [] pstate: 80000145 ..... So this patch fixes this by checking the SPI number against the actual limit. Also we remove the former legacy hard limit of 127 in the ioctl code. Signed-off-by: Andre Przywara Reviewed-by: Christoffer Dall CC: # 4.0, 3.19, 3.18 [maz: wrap KVM_ARM_IRQ_GIC_MAX with #ifndef __KERNEL__, as suggested by Christopher Covington] Signed-off-by: Marc Zyngier --- arch/arm/include/uapi/asm/kvm.h | 8 +++++++- arch/arm/kvm/arm.c | 3 +-- arch/arm64/include/uapi/asm/kvm.h | 8 +++++++- virt/kvm/arm/vgic.c | 3 +++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 2499867dd0d8..df3f60cb1168 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -195,8 +195,14 @@ struct kvm_arch_memory_slot { #define KVM_ARM_IRQ_CPU_IRQ 0 #define KVM_ARM_IRQ_CPU_FIQ 1 -/* Highest supported SPI, from VGIC_NR_IRQS */ +/* + * This used to hold the highest supported SPI, but it is now obsolete + * and only here to provide source code level compatibility with older + * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. + */ +#ifndef __KERNEL__ #define KVM_ARM_IRQ_GIC_MAX 127 +#endif /* One single KVM irqchip, ie. the VGIC */ #define KVM_NR_IRQCHIPS 1 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 6f536451ab78..d9631ecddd56 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -671,8 +671,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, if (!irqchip_in_kernel(kvm)) return -ENXIO; - if (irq_num < VGIC_NR_PRIVATE_IRQS || - irq_num > KVM_ARM_IRQ_GIC_MAX) + if (irq_num < VGIC_NR_PRIVATE_IRQS) return -EINVAL; return kvm_vgic_inject_irq(kvm, 0, irq_num, level); diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c154c0b7eb60..d26832022127 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -188,8 +188,14 @@ struct kvm_arch_memory_slot { #define KVM_ARM_IRQ_CPU_IRQ 0 #define KVM_ARM_IRQ_CPU_FIQ 1 -/* Highest supported SPI, from VGIC_NR_IRQS */ +/* + * This used to hold the highest supported SPI, but it is now obsolete + * and only here to provide source code level compatibility with older + * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. + */ +#ifndef __KERNEL__ #define KVM_ARM_IRQ_GIC_MAX 127 +#endif /* One single KVM irqchip, ie. the VGIC */ #define KVM_NR_IRQCHIPS 1 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 7ed78735f611..78fb8201014f 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1561,6 +1561,9 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, goto out; } + if (irq_num >= kvm->arch.vgic.nr_irqs) + return -EINVAL; + vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level); if (vcpu_id >= 0) { /* kick the specified vcpu */