linux/arch/x86/kvm/svm/avic.c

1164 lines
31 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
* AMD SVM support
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
*/
#define pr_fmt(fmt) "SVM: " fmt
#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
#include <asm/irq_remapping.h>
#include "trace.h"
#include "lapic.h"
#include "x86.h"
#include "irq.h"
#include "svm.h"
/*
 * AVIC GATAG is encoded using VM and VCPU IDs: the VM ID is placed in the
 * AVIC_VM_ID_BITS bits above the low AVIC_VCPU_ID_BITS bits, which hold the
 * vCPU ID.
 *
 * Every macro argument is parenthesized so that callers may pass arbitrary
 * expressions (e.g. "a | b") without operator-precedence surprises.
 */
#define AVIC_VCPU_ID_BITS		8
#define AVIC_VCPU_ID_MASK		((1 << AVIC_VCPU_ID_BITS) - 1)

#define AVIC_VM_ID_BITS			24
#define AVIC_VM_ID_NR			(1 << AVIC_VM_ID_BITS)
#define AVIC_VM_ID_MASK			((1 << AVIC_VM_ID_BITS) - 1)

/* Build a GATAG from a VM ID (x) and a vCPU ID (y). */
#define AVIC_GATAG(x, y)		((((x) & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
						((y) & AVIC_VCPU_ID_MASK))
/* Extract the VM ID / vCPU ID from a GATAG. */
#define AVIC_GATAG_TO_VMID(x)		(((x) >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)		((x) & AVIC_VCPU_ID_MASK)
/*
 * Read-only (0444) boolean module parameter.
 * NOTE(review): presumably forces AVIC on where it would otherwise be left
 * disabled; the consumer is outside this view -- confirm before relying on it.
 */
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
/*
 * Note:
 * This hash table is used to map VM_ID to a struct kvm_svm,
 * when handling AMD IOMMU GALOG notification to schedule in
 * a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
/* Last AVIC VM ID handed out; statics are implicitly zero-initialized. */
static u32 next_vm_id;
/* Set once next_vm_id has wrapped, i.e. once new IDs may collide with live VMs. */
static bool next_vm_id_wrapped;
/* Taken around every access to svm_vm_data_hash and the VM ID allocator state. */
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
/* Selects x2AVIC behavior (mode bit, ID limits) in this file; set outside this view. */
bool x2avic_enabled;
/*
 * List entry that wraps an opaque pointer to the IOMMU driver's
 * struct amd_ir_data so it can be linked on a per-vCPU list.
 */
struct amd_svm_iommu_ir {
struct list_head node; /* Used by SVM for per-vcpu ir_list */
void *data; /* Storing pointer to struct amd_ir_data */
};
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
KVM: x86: Inhibit APIC memslot if x2APIC and AVIC are enabled Free the APIC access page memslot if any vCPU enables x2APIC and SVM's AVIC is enabled to prevent accesses to the virtual APIC on vCPUs with x2APIC enabled. On AMD, if its "hybrid" mode is enabled (AVIC is enabled when x2APIC is enabled even without x2AVIC support), keeping the APIC access page memslot results in the guest being able to access the virtual APIC page as x2APIC is fully emulated by KVM. I.e. hardware isn't aware that the guest is operating in x2APIC mode. Exempt nested SVM's update of APICv state from the new logic as x2APIC can't be toggled on VM-Exit. In practice, invoking the x2APIC logic should be harmless precisely because it should be a glorified nop, but play it safe to avoid latent bugs, e.g. with dropping the vCPU's SRCU lock. Intel doesn't suffer from the same issue as APICv has fully independent VMCS controls for xAPIC vs. x2APIC virtualization. Technically, KVM should provide bus error semantics and not memory semantics for the APIC page when x2APIC is enabled, but KVM already provides memory semantics in other scenarios, e.g. if APICv/AVIC is enabled and the APIC is hardware disabled (via APIC_BASE MSR). Note, checking apic_access_memslot_enabled without taking locks relies it being set during vCPU creation (before kvm_vcpu_reset()). vCPUs can race to set the inhibit and delete the memslot, i.e. can get false positives, but can't get false negatives as apic_access_memslot_enabled can't be toggled "on" once any vCPU reaches KVM_RUN. Opportunistically drop the "can" while updating avic_activate_vmcb()'s comment, i.e. to state that KVM _does_ support the hybrid mode. Move the "Note:" down a line to conform to preferred kernel/KVM multi-line comment style. Opportunistically update the apicv_update_lock comment, as it isn't actually used to protect apic_access_memslot_enabled (which is protected by slots_lock). 
Fixes: 0e311d33bfbe ("KVM: SVM: Introduce hybrid-AVIC mode") Signed-off-by: Sean Christopherson <seanjc@google.com> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> Message-Id: <20230106011306.85230-11-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-01-06 01:12:43 +00:00
/*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
* achieved using AVIC doorbell. KVM disables the APIC access page
* (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
* AVIC in hybrid mode activates only the doorbell mechanism.
*/
if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
/*
* Flush the TLB, the guest may have inserted a non-APIC
* mapping into the TLB while AVIC was disabled.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
/* For xAVIC and hybrid-xAVIC modes */
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
/* Enabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, true);
}
}
static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
/*
* If running nested and the guest uses its own MSR bitmap, there
* is no need to update L0's msr bitmap
*/
if (is_guest_mode(&svm->vcpu) &&
vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
return;
/* Enabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, true);
}
/*
 * Note:
 * This function is called from IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 *
 * @ga_tag: tag encoding the VM ID and vCPU ID (see AVIC_GATAG()).
 *
 * Looks up the VM by ID in svm_vm_data_hash under svm_vm_data_hash_lock and,
 * if the vCPU is found, wakes it.  Always returns 0.
 */
int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);

	/* pr_fmt() already prepends "SVM: "; do not repeat the prefix here. */
	pr_debug("%s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
	trace_kvm_avic_ga_log(vm_id, vcpu_id);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		/* Different VMs may share a bucket, match on the exact ID. */
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/*
	 * Note:
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page. So, we just need to schedule
	 * in the vcpu.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}
/*
 * Release the per-VM AVIC resources: free the logical and physical APIC ID
 * table pages (when present) and unhash the VM from svm_vm_data_hash.
 * Does nothing when enable_apicv is false.
 */
void avic_vm_destroy(struct kvm *kvm)
{
	struct kvm_svm *svm_vm = to_kvm_svm(kvm);
	struct page *table;
	unsigned long irqflags;

	if (!enable_apicv)
		return;

	table = svm_vm->avic_logical_id_table_page;
	if (table)
		__free_page(table);

	table = svm_vm->avic_physical_id_table_page;
	if (table)
		__free_page(table);

	/* The hash is shared with the GA log notifier, hence the lock. */
	spin_lock_irqsave(&svm_vm_data_hash_lock, irqflags);
	hash_del(&svm_vm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, irqflags);
}
int avic_vm_init(struct kvm *kvm)
{
unsigned long flags;
int err = -ENOMEM;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_svm *k2;
struct page *p_page;
struct page *l_page;
u32 vm_id;
if (!enable_apicv)
return 0;
/* Allocating physical APIC ID table (4KB) */
p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!p_page)
goto free_avic;
kvm_svm->avic_physical_id_table_page = p_page;
/* Allocating logical APIC ID table (4KB) */
l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!l_page)
goto free_avic;
kvm_svm->avic_logical_id_table_page = l_page;
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
if (vm_id == 0) { /* id is 1-based, zero is not okay */
next_vm_id_wrapped = 1;
goto again;
}
/* Is it still in use? Only possible if wrapped at least once */
if (next_vm_id_wrapped) {
hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
if (k2->avic_vm_id == vm_id)
goto again;
}
}
kvm_svm->avic_vm_id = vm_id;
hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
return 0;
free_avic:
avic_vm_destroy(kvm);
return err;
}
/*
 * Program the AVIC fields of @vmcb: the vCPU's backing page, the VM's logical
 * and physical APIC ID tables, and the default APIC base.  AVIC is then
 * activated or deactivated according to kvm_apicv_activated().
 */
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm *kvm = svm->vcpu.kvm;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	phys_addr_t backing, logical_tbl, physical_tbl;

	/* All three addresses carry the SME encryption mask. */
	backing = __sme_set(page_to_phys(svm->avic_backing_page));
	logical_tbl = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
	physical_tbl = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));

	vmcb->control.avic_backing_page = backing & AVIC_HPA_MASK;
	vmcb->control.avic_logical_id = logical_tbl & AVIC_HPA_MASK;
	vmcb->control.avic_physical_id = physical_tbl & AVIC_HPA_MASK;
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;

	if (!kvm_apicv_activated(kvm)) {
		avic_deactivate_vmcb(svm);
		return;
	}

	avic_activate_vmcb(svm);
}
static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
unsigned int index)
{
u64 *avic_physical_id_table;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
(index > X2AVIC_MAX_PHYSICAL_ID))
return NULL;
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
return &avic_physical_id_table[index];
}
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
u64 *entry, new_entry;
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
(id > X2AVIC_MAX_PHYSICAL_ID))
return -EINVAL;
if (!vcpu->arch.apic->regs)
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
int ret;
/*
* Note, AVIC hardware walks the nested page table to check
* permissions, but does not use the SPA address specified in
* the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
* pointer field of the VMCB.
*/
ret = kvm_alloc_apic_access_page(vcpu->kvm);
if (ret)
return ret;
}
svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
/* Setting AVIC backing page address in the phy APIC ID table */
entry = avic_get_physical_id_entry(vcpu, id);
if (!entry)
return -EINVAL;
new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
WRITE_ONCE(*entry, new_entry);
svm->avic_physical_id_cache = entry;
return 0;
}
/*
 * Signal the AVIC doorbell on the pCPU recorded in vcpu->cpu, unless that is
 * the CPU this code is currently running on.
 */
void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU may be migrated to another pCPU at any time, so the
	 * doorbell can end up targeting the wrong/previous pCPU.  If that
	 * happens, the vCPU is guaranteed to do a VMRUN after the migration
	 * and will process pending interrupts then, i.e. the doorbell was not
	 * needed and the spurious one is harmless.
	 */
	int target_cpu = READ_ONCE(vcpu->cpu);
	int this_cpu = get_cpu();

	if (target_cpu != this_cpu) {
		int apicid = kvm_cpu_get_apicid(target_cpu);

		wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, apicid);
		trace_kvm_avic_doorbell(vcpu->vcpu_id, apicid);
	}

	put_cpu();
}
/*
 * Set irr_pending on the target vCPU's APIC and complete delivery of the
 * interrupt described by @icrl (delivery mode, trigger mode, vector).
 */
static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	u32 delivery_mode = icrl & APIC_MODE_MASK;
	u32 trig_mode = icrl & APIC_INT_LEVELTRIG;
	u32 vector = icrl & APIC_VECTOR_MASK;

	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu, delivery_mode, trig_mode, vector);
}
/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * destination APIC ID to vCPU without looping through all vCPUs.
 *
 * @kvm:    VM the IPI was sent in.
 * @source: local APIC of the sending vCPU.
 * @icrl:   low 32 bits of the ICR (vector, modes, shorthand).
 * @icrh:   high 32 bits of the ICR (destination).
 * @index:  table index supplied by the caller, used here only for sanity
 *          checks against the decoded destination.
 *
 * Returns 0 if the IPI was fully handled here (including the cases where
 * there is nothing to do), or -EINVAL if the caller must fall back to the
 * slow path (shorthand, broadcast or multi-target destinations).
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	u32 l1_physical_id, dest;
	struct kvm_vcpu *target_vcpu;
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		l1_physical_id = dest;

		if (WARN_ON_ONCE(l1_physical_id != index))
			return -EINVAL;

	} else {
		u32 bitmap, cluster;
		int logid_index;

		if (apic_x2apic_mode(source)) {
			/* 16 bit dest mask, 16 bit cluster id */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4 bit dest mask, 4 bit cluster id */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}

		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (!is_power_of_2(bitmap))
			/* multiple logical destinations, use slow path */
			return -EINVAL;

		logid_index = cluster + __ffs(bitmap);

		if (apic_x2apic_mode(source)) {
			/*
			 * For x2APIC, the logical APIC ID is a read-only value
			 * that is derived from the x2APIC ID, thus the x2APIC
			 * ID can be found by reversing the calculation (done
			 * above).  Note, bits 31:20 of the x2APIC ID are not
			 * propagated to the logical ID, but KVM limits the
			 * x2APIC ID to KVM_MAX_VCPU_IDS.
			 */
			l1_physical_id = logid_index;
		} else {
			u32 *avic_logical_id_table =
				page_address(kvm_svm->avic_logical_id_table_page);

			u32 logid_entry = avic_logical_id_table[logid_index];

			if (WARN_ON_ONCE(index != logid_index))
				return -EINVAL;

			/* Nothing to do if the logical destination is invalid. */
			if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
				return 0;

			l1_physical_id = logid_entry &
					 AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
		}
	}

	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID,
	 * i.e. APIC ID == vCPU ID.  Once again, nothing to do if the target
	 * vCPU doesn't exist.
	 */
	target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
	if (unlikely(!target_vcpu))
		return 0;

	avic_kick_vcpu(target_vcpu, icrl);
	return 0;
}
/*
 * Kick the vCPUs targeted by an IPI.  The fast path is attempted first;
 * when it returns non-zero, every vCPU in the VM is checked against the
 * IPI's destination and the matching ones are kicked.
 */
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	struct kvm_vcpu *target;
	unsigned long idx;
	u32 dest;

	/* In x2APIC mode all of ICRH is the destination, else only a field. */
	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	/* A zero return means the fast path took care of the request. */
	if (avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index) == 0)
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Only blocking targets (those waiting for a wake event) need to be
	 * woken.  Doorbells are unnecessary: hardware already handled vCPUs
	 * that were in the guest when the IPI was sent, and vCPUs that have
	 * entered the guest since then processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(idx, target, kvm) {
		if (!kvm_apic_match_dest(target, source,
					 icrl & APIC_SHORT_MASK,
					 dest, icrl & APIC_DEST_MASK))
			continue;

		avic_kick_vcpu(target, icrl);
	}
}
/*
 * Handle an AVIC "incomplete IPI" exit.
 *
 * ICR low/high are taken from exit_info_1; the failure cause (upper 32 bits)
 * and the index (low 9 bits) are taken from exit_info_2.  Always returns 1.
 */
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_lapic *apic = vcpu->arch.apic;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	if (id == AVIC_IPI_FAILURE_INVALID_TARGET ||
	    id == AVIC_IPI_FAILURE_INVALID_INT_TYPE) {
		/*
		 * AVIC hardware only virtualizes Fixed, Edge-Triggered INTRs
		 * and gives up if _any_ target is invalid (e.g. the logical
		 * mode mask is a superset of running vCPUs), so such IPIs are
		 * emulated here.
		 *
		 * This exit is a trap: ICR already holds the written value
		 * and RIP has been advanced, so normally only the IPI itself
		 * needs emulating.  Hardware may however leave the BUSY flag
		 * set; in that case the ICR write is emulated too, which
		 * clears BUSY.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
	} else if (id == AVIC_IPI_FAILURE_TARGET_NOT_RUNNING) {
		/*
		 * AVIC hardware is expected to have set the IRR bits on the
		 * valid targets already; all that is left is to kick them.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
	} else if (id == AVIC_IPI_FAILURE_INVALID_BACKING_PAGE) {
		WARN_ONCE(1, "Invalid backing page\n");
	} else {
		pr_err("Unknown IPI interception\n");
	}

	return 1;
}
/*
 * Per-vCPU APICv inhibit reasons: APICV_INHIBIT_REASON_NESTED while the vCPU
 * is in guest mode, 0 otherwise.
 *
 * NOTE(review): the value returned is the reason constant itself, not
 * BIT(reason); presumably callers only test for non-zero -- confirm.
 */
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	return is_guest_mode(vcpu) ? APICV_INHIBIT_REASON_NESTED : 0;
}
/*
 * Find the logical APIC ID table entry that corresponds to @ldr.
 *
 * @flat selects flat addressing (index is the position of the lowest set bit
 * of the logical ID); otherwise cluster addressing is used (upper nibble is
 * the cluster, lowest set bit of the lower nibble is the APIC within it, and
 * the index is cluster * 4 + apic).
 *
 * Returns a pointer into the VM's logical ID table, or NULL if the logical
 * ID is zero or fails the range checks.
 */
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	int dlid = GET_APIC_LOGICAL_ID(ldr);
	u32 *table;
	int slot;

	if (dlid == 0)
		return NULL;

	if (!flat) {
		/* cluster */
		int cluster = (dlid & 0xf0) >> 4;
		int apic = ffs(dlid & 0x0f) - 1;

		if (apic < 0 || apic > 7)
			return NULL;
		if (cluster >= 0xf)
			return NULL;

		slot = (cluster << 2) + apic;
	} else {
		/* flat */
		slot = ffs(dlid) - 1;
		if (slot > 7)
			return NULL;
	}

	table = (u32 *)page_address(kvm_svm->avic_logical_id_table_page);

	return table + slot;
}
/*
 * Point the logical ID table entry selected by @ldr (interpreted according
 * to the vCPU's current APIC_DFR) at @g_physical_id and mark it valid.
 *
 * Returns 0 on success, -EINVAL if @ldr does not select a table entry.
 */
static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool is_flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	u32 *slot = avic_get_logical_id_entry(vcpu, ldr, is_flat);
	u32 val;

	if (!slot)
		return -EINVAL;

	/* Replace only the guest physical ID field, then set the valid bit. */
	val = READ_ONCE(*slot);
	val &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	val |= g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	val |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*slot, val);

	return 0;
}
/*
 * Clear the valid bit of the logical ID table entry selected by the cached
 * LDR/DFR values of @vcpu, if such an entry exists.
 */
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 *slot;

	/* Note: x2AVIC does not use logical APIC ID table */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	slot = avic_get_logical_id_entry(vcpu, svm->ldr_reg,
					 svm->dfr_reg == APIC_DFR_FLAT);
	if (!slot)
		return;

	clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)slot);
}
/*
 * Bring the AVIC logical ID table in sync with the vCPU's current APIC_LDR.
 *
 * Nothing is done in x2APIC mode or when the LDR equals the cached value.
 * Otherwise the entry derived from the cached LDR is invalidated, the cache
 * is updated, and a new entry is written for a non-zero LDR.
 */
static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR update for x2APIC */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	/* Drop the table entry that was derived from the cached LDR. */
	avic_invalidate_logical_id_entry(vcpu);

	/*
	 * Update the cache even when the new LDR cannot be mapped to a table
	 * entry.  The old entry has just been invalidated, so keeping the old
	 * value cached would (a) make a later write of that same old value
	 * look like "no change" and leave its entry invalid, and (b) make the
	 * next invalidation target an entry this vCPU no longer owns.
	 */
	svm->ldr_reg = ldr;

	/* A failure here simply leaves no valid entry for this vCPU. */
	if (ldr)
		avic_ldr_write(vcpu, id, ldr);
}
/*
 * React to a change of APIC_DFR: invalidate the logical ID table entry that
 * was derived from the cached DFR/LDR, then cache the new DFR value.
 */
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
	struct vcpu_svm *svm = to_svm(vcpu);

	if (dfr != svm->dfr_reg) {
		avic_invalidate_logical_id_entry(vcpu);
		svm->dfr_reg = dfr;
	}
}
/*
 * Complete a trapped write to an APIC register.  LDR and DFR writes first
 * update the AVIC logical ID bookkeeping; every write is then forwarded to
 * kvm_apic_write_nodecode().  Always returns 1.
 */
static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 reg = to_svm(vcpu)->vmcb->control.exit_info_1 &
		  AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	if (reg == APIC_LDR)
		avic_handle_ldr_update(vcpu);
	else if (reg == APIC_DFR)
		avic_handle_dfr_update(vcpu);

	kvm_apic_write_nodecode(vcpu, reg);
	return 1;
}
/*
 * Return true if an unaccelerated access to the APIC register at @offset is
 * handled as a trap, false if it is handled as a fault.
 */
static bool is_avic_unaccelerated_access_trap(u32 offset)
{
	switch (offset) {
	case APIC_ID:
	case APIC_EOI:
	case APIC_RRR:
	case APIC_LDR:
	case APIC_DFR:
	case APIC_SPIV:
	case APIC_ESR:
	case APIC_ICR:
	case APIC_LVTT:
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT0:
	case APIC_LVT1:
	case APIC_LVTERR:
	case APIC_TMICT:
	case APIC_TDCR:
		return true;
	default:
		return false;
	}
}
/*
 * Handle an AVIC unaccelerated-access exit.
 *
 * The register offset and the write flag are decoded from exit_info_1 and
 * the vector from exit_info_2.  Trap-style registers are completed through
 * avic_unaccel_trap_write(); everything else is treated as a fault and the
 * instruction is emulated.  Returns the result of whichever handler ran.
 */
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);

	/* Handling Fault */
	if (!trap)
		return kvm_emulate_instruction(vcpu, 0);

	/* Handling Trap */
	WARN_ONCE(!write, "svm: Handling trap read.\n");
	return avic_unaccel_trap_write(vcpu);
}
/*
 * Per-vCPU AVIC initialization.  Does nothing (returns 0) when APICv is
 * disabled or the irqchip is not in the kernel.  Otherwise sets up the
 * backing page, the per-vCPU ir_list and its lock, and starts with a flat
 * DFR cached.  Returns 0 on success or the error from
 * avic_init_backing_page().
 */
int avic_init_vcpu(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	int err;

	if (!(enable_apicv && irqchip_in_kernel(vcpu->kvm)))
		return 0;

	err = avic_init_backing_page(vcpu);
	if (err)
		return err;

	INIT_LIST_HEAD(&svm->ir_list);
	spin_lock_init(&svm->ir_list_lock);
	svm->dfr_reg = APIC_DFR_FLAT;

	return 0;
}
/*
 * Re-derive the AVIC logical ID bookkeeping from the vCPU's current APIC
 * state.  The DFR is handled before the LDR; each handler is a no-op when
 * its register matches the cached value.
 */
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}
/*
 * Switch every interrupt remapping entry tracked on @vcpu's ir_list into
 * (@activate == true) or out of (@activate == false) guest mode.
 *
 * Returns 0 when the VM has no assigned device, when the list is empty or
 * when all entries were updated; otherwise the first error reported by the
 * IOMMU helper, at which point the walk stops.
 */
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct amd_svm_iommu_ir *entry;
	unsigned long irqflags;
	int rc = 0;

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Walk the per-vcpu ir_list and update each interrupt remapping
	 * table entry that targets this vcpu.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, irqflags);

	list_for_each_entry(entry, &svm->ir_list, node) {
		rc = activate ? amd_iommu_activate_guest_mode(entry->data) :
				amd_iommu_deactivate_guest_mode(entry->data);
		if (rc)
			break;
	}

	spin_unlock_irqrestore(&svm->ir_list_lock, irqflags);
	return rc;
}
/*
 * Remove and free the first node on @svm's ir_list whose data pointer equals
 * @pi->ir_data.  Does nothing if no such node exists.
 */
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
	struct amd_svm_iommu_ir *node;
	unsigned long irqflags;

	spin_lock_irqsave(&svm->ir_list_lock, irqflags);

	list_for_each_entry(node, &svm->ir_list, node) {
		if (node->data == pi->ir_data) {
			list_del(&node->node);
			kfree(node);
			break;
		}
	}

	spin_unlock_irqrestore(&svm->ir_list_lock, irqflags);
}
/*
 * Track @pi->ir_data on @svm's per-vCPU ir_list.
 *
 * Returns 0 on success, -EINVAL if the vCPU encoded in a non-zero previous
 * GA tag cannot be found, or -ENOMEM if the list node cannot be allocated.
 */
static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
	struct amd_svm_iommu_ir *node;
	unsigned long irqflags;

	/*
	 * An existing irte may be updated and set again, in which case it can
	 * already be on some vCPU's ir_list.  The previous GA tag identifies
	 * that vCPU; remove the stale node before adding a new one.
	 */
	if (pi->ir_data && pi->prev_ga_tag != 0) {
		struct kvm_vcpu *old_vcpu;

		old_vcpu = kvm_get_vcpu_by_id(svm->vcpu.kvm,
					      AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag));
		if (!old_vcpu)
			return -EINVAL;

		svm_ir_list_del(to_svm(old_vcpu), pi);
	}

	/* Allocate the wrapper node that goes on the per-vcpu ir_list. */
	node = kzalloc(sizeof(*node), GFP_KERNEL_ACCOUNT);
	if (!node)
		return -ENOMEM;

	node->data = pi->ir_data;

	spin_lock_irqsave(&svm->ir_list_lock, irqflags);
	list_add(&node->node, &svm->ir_list);
	spin_unlock_irqrestore(&svm->ir_list_lock, irqflags);

	return 0;
}
/*
* Note:
* The HW cannot support posting multicast/broadcast
* interrupts to a vCPU. So, we still use legacy interrupt
* remapping for these kind of interrupts.
*
* For lowest-priority interrupts, we only support
* those with single CPU as the destination, e.g. user
* configures the interrupts via /proc/irq or uses
* irqbalance to make the interrupts single-CPU.
*/
static int
get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
{
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu = NULL;
kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
!kvm_irq_is_postable(&irq)) {
pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
__func__, irq.vector);
return -1;
}
pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
irq.vector);
*svm = to_svm(vcpu);
vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
vcpu_info->vector = irq.vector;
return 0;
}
/*
 * avic_pi_update_irte - set IRTE for Posted-Interrupts
 *
 * @kvm: kvm
 * @host_irq: host irq of the interrupt
 * @guest_irq: gsi of the interrupt
 * @set: set or unset PI
 * returns 0 on success, < 0 on failure
 *
 * Returns 0 without doing anything when the VM has no assigned device, when
 * IRQ posting is not supported, or when @guest_irq has no routing entry.
 */
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
			uint32_t guest_irq, bool set)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_irq_routing_table *irq_rt;
	int idx, ret = 0;

	if (!kvm_arch_has_assigned_device(kvm) ||
	    !irq_remapping_cap(IRQ_POSTING_CAP))
		return 0;

	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
		 __func__, host_irq, guest_irq, set);

	idx = srcu_read_lock(&kvm->irq_srcu);
	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);

	/* Bounds-check guest_irq before it is used to index the map. */
	if (guest_irq >= irq_rt->nr_rt_entries ||
	    hlist_empty(&irq_rt->map[guest_irq])) {
		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
			     guest_irq, irq_rt->nr_rt_entries);
		goto out;
	}

	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
		struct vcpu_data vcpu_info;
		struct vcpu_svm *svm = NULL;

		if (e->type != KVM_IRQ_ROUTING_MSI)
			continue;

		/*
		 * Here, we setup with legacy mode in the following cases:
		 * 1. When cannot target interrupt to a specific vcpu.
		 * 2. Unsetting posted interrupt.
		 * 3. APIC virtualization is disabled for the vcpu.
		 * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
		 */
		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
		    kvm_vcpu_apicv_active(&svm->vcpu)) {
			/*
			 * Try to enable guest_mode in IRTE.  The designated
			 * initializer zeroes the fields not listed (e.g.
			 * prev_ga_tag, ir_data), so anything the callee does
			 * not fill in reads back as zero instead of stack
			 * garbage when it is consumed below.
			 */
			struct amd_iommu_pi_data pi = {
				.base = __sme_set(page_to_phys(svm->avic_backing_page) &
						  AVIC_HPA_MASK),
				.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
						     svm->vcpu.vcpu_id),
				.is_guest_mode = true,
				.vcpu_data = &vcpu_info,
			};

			ret = irq_set_vcpu_affinity(host_irq, &pi);

			/*
			 * Here, we successfully setting up vcpu affinity in
			 * IOMMU guest mode. Now, we need to store the posted
			 * interrupt information in a per-vcpu ir_list so that
			 * we can reference to them directly when we update vcpu
			 * scheduling information in IOMMU irte.
			 */
			if (!ret && pi.is_guest_mode)
				svm_ir_list_add(svm, &pi);
		} else {
			/*
			 * Use legacy mode in IRTE.
			 *
			 * Here, pi is used to:
			 * - Tell IOMMU to use legacy mode for this interrupt.
			 * - Retrieve ga_tag of prior interrupt remapping data.
			 *
			 * prev_ga_tag must start out as zero: it is tested
			 * below to decide whether a per-vcpu ir_list needs
			 * cleaning up.
			 */
			struct amd_iommu_pi_data pi = {
				.prev_ga_tag = 0,
				.is_guest_mode = false,
			};

			ret = irq_set_vcpu_affinity(host_irq, &pi);

			/*
			 * Check if the posted interrupt was previously
			 * setup with the guest_mode by checking if the ga_tag
			 * was cached. If so, we need to clean up the per-vcpu
			 * ir_list.
			 */
			if (!ret && pi.prev_ga_tag) {
				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
				struct kvm_vcpu *vcpu;

				vcpu = kvm_get_vcpu_by_id(kvm, id);
				if (vcpu)
					svm_ir_list_del(to_svm(vcpu), &pi);
			}
		}

		/* svm is only non-NULL when get_pi_vcpu_info() succeeded. */
		if (!ret && svm) {
			trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
						 e->gsi, vcpu_info.vector,
						 vcpu_info.pi_desc_addr, set);
		}

		if (ret < 0) {
			pr_err("%s: failed to update PI IRTE\n", __func__);
			goto out;
		}
	}

	ret = 0;
out:
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}
bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_NESTED) |
BIT(APICV_INHIBIT_REASON_IRQWIN) |
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
BIT(APICV_INHIBIT_REASON_SEV) |
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) |
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) |
BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
return supported & BIT(reason);
}
/*
 * Pass @cpu and the running state @r to amd_iommu_update_ga() for every
 * entry on @vcpu's ir_list.
 *
 * Returns 0 when the VM has no assigned device, when the list is empty or
 * when all updates succeeded; otherwise the first error, at which point the
 * walk stops.
 */
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct amd_svm_iommu_ir *entry;
	unsigned long irqflags;
	int rc = 0;

	if (!kvm_arch_has_assigned_device(vcpu->kvm))
		return 0;

	/*
	 * Walk the per-vcpu ir_list and update each interrupt remapping
	 * table entry that targets this vcpu.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, irqflags);

	list_for_each_entry(entry, &svm->ir_list, node) {
		rc = amd_iommu_update_ga(cpu, r, entry->data);
		if (rc)
			break;
	}

	spin_unlock_irqrestore(&svm->ir_list_lock, irqflags);
	return rc;
}
/*
 * Mark @vcpu as running on host @cpu: store the host APIC ID and the
 * IsRunning flag in the vCPU's physical ID entry, then propagate the new
 * affinity to the IOMMU.  Asserts (lockdep) that preemption is disabled.
 */
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	u64 entry;

	lockdep_assert_preemption_disabled();

	/* The host APIC ID must fit in the entry's host physical ID field. */
	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	/*
	 * Nothing to update for a blocking vCPU, i.e. one that is being
	 * scheduled back in after having been preempted.  The CPU fields of
	 * the Physical APIC table entry and of the IRTE are consumed iff
	 * IsRun{ning} is '1'; if the vCPU migrated, the new CPU is written
	 * when it unblocks.
	 */
	if (kvm_vcpu_is_blocking(vcpu))
		return;

	entry = READ_ONCE(*(svm->avic_physical_id_cache));
	entry = (entry & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK) |
		(h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK) |
		AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);

	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
/*
 * Mark @vcpu as no longer running: update the IOMMU affinity first, then
 * clear IsRunning in the vCPU's physical ID entry.  Asserts (lockdep) that
 * preemption is disabled.
 */
void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 entry;

	lockdep_assert_preemption_disabled();

	entry = READ_ONCE(*(svm->avic_physical_id_cache));

	/* IsRunning == '0' (vCPU blocking) means there is nothing to undo. */
	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
		avic_update_iommu_vcpu_affinity(vcpu, -1, 0);

		WRITE_ONCE(*(svm->avic_physical_id_cache),
			   entry & ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
	}
}
KVM: SVM: Don't put/load AVIC when setting virtual APIC mode Move the VMCB updates from avic_refresh_apicv_exec_ctrl() into avic_set_virtual_apic_mode() and invert the dependency being said functions to avoid calling avic_vcpu_{load,put}() and avic_set_pi_irte_mode() when "only" setting the virtual APIC mode. avic_set_virtual_apic_mode() is invoked from common x86 with preemption enabled, which makes avic_vcpu_{load,put}() unhappy. Luckily, calling those and updating IRTE stuff is unnecessary as the only reason avic_set_virtual_apic_mode() is called is to handle transitions between xAPIC and x2APIC that don't also toggle APICv activation. And if activation doesn't change, there's no need to fiddle with the physical APIC ID table or update IRTE. The "full" refresh is guaranteed to be called if activation changes in this case as the only call to the "set" path is: kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); and kvm_vcpu_update_apicv() invokes the refresh if activation changes: if (apic->apicv_active == activate) goto out; apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); Rename the helper to reflect that it is also called during "refresh". WARNING: CPU: 183 PID: 49186 at arch/x86/kvm/svm/avic.c:1081 avic_vcpu_put+0xde/0xf0 [kvm_amd] CPU: 183 PID: 49186 Comm: stable Tainted: G O 6.0.0-smp--fcddbca45f0a-sink #34 Hardware name: Google, Inc. 
Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 RIP: 0010:avic_vcpu_put+0xde/0xf0 [kvm_amd] avic_refresh_apicv_exec_ctrl+0x142/0x1c0 [kvm_amd] avic_set_virtual_apic_mode+0x5a/0x70 [kvm_amd] kvm_lapic_set_base+0x149/0x1a0 [kvm] kvm_set_apic_base+0x8f/0xd0 [kvm] kvm_set_msr_common+0xa3a/0xdc0 [kvm] svm_set_msr+0x364/0x6b0 [kvm_amd] __kvm_set_msr+0xb8/0x1c0 [kvm] kvm_emulate_wrmsr+0x58/0x1d0 [kvm] msr_interception+0x1c/0x30 [kvm_amd] svm_invoke_exit_handler+0x31/0x100 [kvm_amd] svm_handle_exit+0xfc/0x160 [kvm_amd] vcpu_enter_guest+0x21bb/0x23e0 [kvm] vcpu_run+0x92/0x450 [kvm] kvm_arch_vcpu_ioctl_run+0x43e/0x6e0 [kvm] kvm_vcpu_ioctl+0x559/0x620 [kvm] Fixes: 05c4fe8c1bd9 ("KVM: SVM: Refresh AVIC configuration when changing APIC mode") Cc: stable@vger.kernel.org Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20230106011306.85230-8-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-01-06 01:12:40 +00:00
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb01.ptr;
KVM: SVM: Don't put/load AVIC when setting virtual APIC mode Move the VMCB updates from avic_refresh_apicv_exec_ctrl() into avic_set_virtual_apic_mode() and invert the dependency being said functions to avoid calling avic_vcpu_{load,put}() and avic_set_pi_irte_mode() when "only" setting the virtual APIC mode. avic_set_virtual_apic_mode() is invoked from common x86 with preemption enabled, which makes avic_vcpu_{load,put}() unhappy. Luckily, calling those and updating IRTE stuff is unnecessary as the only reason avic_set_virtual_apic_mode() is called is to handle transitions between xAPIC and x2APIC that don't also toggle APICv activation. And if activation doesn't change, there's no need to fiddle with the physical APIC ID table or update IRTE. The "full" refresh is guaranteed to be called if activation changes in this case as the only call to the "set" path is: kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); and kvm_vcpu_update_apicv() invokes the refresh if activation changes: if (apic->apicv_active == activate) goto out; apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); Rename the helper to reflect that it is also called during "refresh". WARNING: CPU: 183 PID: 49186 at arch/x86/kvm/svm/avic.c:1081 avic_vcpu_put+0xde/0xf0 [kvm_amd] CPU: 183 PID: 49186 Comm: stable Tainted: G O 6.0.0-smp--fcddbca45f0a-sink #34 Hardware name: Google, Inc. 
Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 RIP: 0010:avic_vcpu_put+0xde/0xf0 [kvm_amd] avic_refresh_apicv_exec_ctrl+0x142/0x1c0 [kvm_amd] avic_set_virtual_apic_mode+0x5a/0x70 [kvm_amd] kvm_lapic_set_base+0x149/0x1a0 [kvm] kvm_set_apic_base+0x8f/0xd0 [kvm] kvm_set_msr_common+0xa3a/0xdc0 [kvm] svm_set_msr+0x364/0x6b0 [kvm_amd] __kvm_set_msr+0xb8/0x1c0 [kvm] kvm_emulate_wrmsr+0x58/0x1d0 [kvm] msr_interception+0x1c/0x30 [kvm_amd] svm_invoke_exit_handler+0x31/0x100 [kvm_amd] svm_handle_exit+0xfc/0x160 [kvm_amd] vcpu_enter_guest+0x21bb/0x23e0 [kvm] vcpu_run+0x92/0x450 [kvm] kvm_arch_vcpu_ioctl_run+0x43e/0x6e0 [kvm] kvm_vcpu_ioctl+0x559/0x620 [kvm] Fixes: 05c4fe8c1bd9 ("KVM: SVM: Refresh AVIC configuration when changing APIC mode") Cc: stable@vger.kernel.org Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20230106011306.85230-8-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-01-06 01:12:40 +00:00
if (!lapic_in_kernel(vcpu) || !enable_apicv)
return;
KVM: SVM: Don't put/load AVIC when setting virtual APIC mode Move the VMCB updates from avic_refresh_apicv_exec_ctrl() into avic_set_virtual_apic_mode() and invert the dependency being said functions to avoid calling avic_vcpu_{load,put}() and avic_set_pi_irte_mode() when "only" setting the virtual APIC mode. avic_set_virtual_apic_mode() is invoked from common x86 with preemption enabled, which makes avic_vcpu_{load,put}() unhappy. Luckily, calling those and updating IRTE stuff is unnecessary as the only reason avic_set_virtual_apic_mode() is called is to handle transitions between xAPIC and x2APIC that don't also toggle APICv activation. And if activation doesn't change, there's no need to fiddle with the physical APIC ID table or update IRTE. The "full" refresh is guaranteed to be called if activation changes in this case as the only call to the "set" path is: kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); and kvm_vcpu_update_apicv() invokes the refresh if activation changes: if (apic->apicv_active == activate) goto out; apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); Rename the helper to reflect that it is also called during "refresh". WARNING: CPU: 183 PID: 49186 at arch/x86/kvm/svm/avic.c:1081 avic_vcpu_put+0xde/0xf0 [kvm_amd] CPU: 183 PID: 49186 Comm: stable Tainted: G O 6.0.0-smp--fcddbca45f0a-sink #34 Hardware name: Google, Inc. 
Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 RIP: 0010:avic_vcpu_put+0xde/0xf0 [kvm_amd] avic_refresh_apicv_exec_ctrl+0x142/0x1c0 [kvm_amd] avic_set_virtual_apic_mode+0x5a/0x70 [kvm_amd] kvm_lapic_set_base+0x149/0x1a0 [kvm] kvm_set_apic_base+0x8f/0xd0 [kvm] kvm_set_msr_common+0xa3a/0xdc0 [kvm] svm_set_msr+0x364/0x6b0 [kvm_amd] __kvm_set_msr+0xb8/0x1c0 [kvm] kvm_emulate_wrmsr+0x58/0x1d0 [kvm] msr_interception+0x1c/0x30 [kvm_amd] svm_invoke_exit_handler+0x31/0x100 [kvm_amd] svm_handle_exit+0xfc/0x160 [kvm_amd] vcpu_enter_guest+0x21bb/0x23e0 [kvm] vcpu_run+0x92/0x450 [kvm] kvm_arch_vcpu_ioctl_run+0x43e/0x6e0 [kvm] kvm_vcpu_ioctl+0x559/0x620 [kvm] Fixes: 05c4fe8c1bd9 ("KVM: SVM: Refresh AVIC configuration when changing APIC mode") Cc: stable@vger.kernel.org Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20230106011306.85230-8-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-01-06 01:12:40 +00:00
if (kvm_vcpu_apicv_active(vcpu)) {
/**
* During AVIC temporary deactivation, guest could update
* APIC ID, DFR and LDR registers, which would not be trapped
* by avic_unaccelerated_access_interception(). In this case,
* we need to check and update the AVIC logical APIC ID table
* accordingly before re-activating.
*/
avic_apicv_post_state_restore(vcpu);
avic_activate_vmcb(svm);
} else {
avic_deactivate_vmcb(svm);
}
vmcb_mark_dirty(vmcb, VMCB_AVIC);
KVM: SVM: Don't put/load AVIC when setting virtual APIC mode Move the VMCB updates from avic_refresh_apicv_exec_ctrl() into avic_set_virtual_apic_mode() and invert the dependency being said functions to avoid calling avic_vcpu_{load,put}() and avic_set_pi_irte_mode() when "only" setting the virtual APIC mode. avic_set_virtual_apic_mode() is invoked from common x86 with preemption enabled, which makes avic_vcpu_{load,put}() unhappy. Luckily, calling those and updating IRTE stuff is unnecessary as the only reason avic_set_virtual_apic_mode() is called is to handle transitions between xAPIC and x2APIC that don't also toggle APICv activation. And if activation doesn't change, there's no need to fiddle with the physical APIC ID table or update IRTE. The "full" refresh is guaranteed to be called if activation changes in this case as the only call to the "set" path is: kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); and kvm_vcpu_update_apicv() invokes the refresh if activation changes: if (apic->apicv_active == activate) goto out; apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); Rename the helper to reflect that it is also called during "refresh". WARNING: CPU: 183 PID: 49186 at arch/x86/kvm/svm/avic.c:1081 avic_vcpu_put+0xde/0xf0 [kvm_amd] CPU: 183 PID: 49186 Comm: stable Tainted: G O 6.0.0-smp--fcddbca45f0a-sink #34 Hardware name: Google, Inc. 
Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 RIP: 0010:avic_vcpu_put+0xde/0xf0 [kvm_amd] avic_refresh_apicv_exec_ctrl+0x142/0x1c0 [kvm_amd] avic_set_virtual_apic_mode+0x5a/0x70 [kvm_amd] kvm_lapic_set_base+0x149/0x1a0 [kvm] kvm_set_apic_base+0x8f/0xd0 [kvm] kvm_set_msr_common+0xa3a/0xdc0 [kvm] svm_set_msr+0x364/0x6b0 [kvm_amd] __kvm_set_msr+0xb8/0x1c0 [kvm] kvm_emulate_wrmsr+0x58/0x1d0 [kvm] msr_interception+0x1c/0x30 [kvm_amd] svm_invoke_exit_handler+0x31/0x100 [kvm_amd] svm_handle_exit+0xfc/0x160 [kvm_amd] vcpu_enter_guest+0x21bb/0x23e0 [kvm] vcpu_run+0x92/0x450 [kvm] kvm_arch_vcpu_ioctl_run+0x43e/0x6e0 [kvm] kvm_vcpu_ioctl+0x559/0x620 [kvm] Fixes: 05c4fe8c1bd9 ("KVM: SVM: Refresh AVIC configuration when changing APIC mode") Cc: stable@vger.kernel.org Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20230106011306.85230-8-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-01-06 01:12:40 +00:00
}
/*
 * Bring this vCPU's AVIC state in line with its current APICv activation.
 *
 * Does nothing when enable_apicv is false.  Otherwise the virtual APIC mode
 * is refreshed first, then the AVIC is loaded (APICv active) or put (APICv
 * inactive), and finally the activation state is passed on to
 * avic_set_pi_irte_mode().
 */
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	const bool apicv_on = kvm_vcpu_apicv_active(vcpu);

	if (!enable_apicv)
		return;

	avic_refresh_virtual_apic_mode(vcpu);

	if (!apicv_on)
		avic_vcpu_put(vcpu);
	else
		avic_vcpu_load(vcpu, vcpu->cpu);

	avic_set_pi_irte_mode(vcpu, apicv_on);
}
/*
 * Put (unload) the AVIC for a vCPU that is about to block.  Nothing is done
 * unless APICv is active for the vCPU.
 */
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	/*
	 * The AVIC has to be unloaded _before_ the vCPU actually blocks.
	 *
	 * An IRQ that arrives before IsRunning=0 does not trigger an
	 * incomplete IPI vmexit on the source, which is why vIRR is also
	 * checked by kvm_vcpu_check_block() before blocking.  The write of
	 * IsRunning=0 is ordered before the vIRR read by the memory barrier
	 * implicit in set_current_state.  On interrupt delivery the
	 * processor needs the matching memory barrier between writing IRR
	 * and reading IsRunning; the lack of that barrier might be the
	 * cause of erratum #1235.
	 */
	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_put(vcpu);
}
/*
 * Counterpart of avic_vcpu_blocking(): load the AVIC again, targeting
 * vcpu->cpu.  Nothing is done unless APICv is active for the vCPU.
 */
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_load(vcpu, vcpu->cpu);
}
/*
 * Decide whether AVIC (and x2AVIC) can be used and, if so, register the
 * AMD IOMMU GA log notifier.
 *
 * Note:
 * - The avic module param enables both xAPIC and x2APIC mode.
 * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 *
 * @x86_ops is not used by this function.
 *
 * Returns false when NPT is disabled, or when AVIC is neither advertised by
 * the CPU nor forced via the force_avic module param; returns true otherwise.
 */
bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
{
	if (!npt_enabled)
		return false;

	/* AVIC is a prerequisite for x2AVIC. */
	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
		if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
			/* Terminate each message so the log record is closed. */
			pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled\n");
			pr_warn(FW_BUG "Try enable AVIC using force_avic option\n");
		}
		return false;
	}

	if (boot_cpu_has(X86_FEATURE_AVIC)) {
		pr_info("AVIC enabled\n");
	} else if (force_avic) {
		/*
		 * Some older systems do not advertise AVIC support.
		 * See the Revision Guide for the specific AMD processor for
		 * more detail.
		 */
		pr_warn("AVIC is not supported in CPUID but force enabled\n");
		pr_warn("Your system might crash and burn\n");
	}

	/*
	 * AVIC is either advertised or forced at this point, so x2AVIC only
	 * depends on its own CPUID feature bit.
	 */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled)
		pr_info("x2AVIC enabled\n");

	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}