Merge tag 'kvm-x86-vmx-6.12' of https://github.com/kvm-x86/linux into HEAD

KVM VMX changes for 6.12:

 - Set FINAL/PAGE in the page fault error code for EPT Violations if and only
   if the GVA is valid.  If the GVA is NOT valid, there is no guest-side page
   table walk and so stuffing paging-related metadata is nonsensical (see the
   sketch after this list).

 - Fix a bug where KVM would incorrectly synthesize a nested VM-Exit instead of
   emulating posted interrupt delivery to L2.

 - Add a lockdep assertion to detect unsafe accesses of vmcs12 structures.

 - Harden eVMCS loading against an impossible NULL pointer deref (really truly
   should be impossible).

 - Minor SGX fix and a cleanup.
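For reference, a sketch of the EPT-violation mapping the first bullet describes,
using the mask names from KVM's headers (illustrative, not the verbatim upstream
hunk):

	/* No guest page-table walk occurs unless the GVA is valid. */
	if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
		error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)
			      ? PFERR_GUEST_FINAL_MASK	/* fault on the final GPA */
			      : PFERR_GUEST_PAGE_MASK;	/* fault during the guest walk */
	/* GVA invalid: leave both masks clear, there is no walk to describe. */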
Commit 3f8df62852 by Paolo Bonzini, 2024-09-14 09:56:06 -04:00
13 changed files with 124 additions and 41 deletions

Documentation/virt/kvm/api.rst

@@ -4214,7 +4214,9 @@ whether or not KVM_CAP_X86_USER_SPACE_MSR's KVM_MSR_EXIT_REASON_FILTER is
 enabled. If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace
 on denied accesses, i.e. userspace effectively intercepts the MSR access. If
 KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest
-on denied accesses.
+on denied accesses. Note, if an MSR access is denied during emulation of MSR
+load/stores during VMX transitions, KVM ignores KVM_MSR_EXIT_REASON_FILTER.
+See the below warning for full details.

 If an MSR access is allowed by userspace, KVM will emulate and/or virtualize
 the access in accordance with the vCPU model. Note, KVM may still ultimately
@@ -4229,9 +4231,22 @@ filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
 an error.

 .. warning::
-   MSR accesses as part of nested VM-Enter/VM-Exit are not filtered.
-   This includes both writes to individual VMCS fields and reads/writes
-   through the MSR lists pointed to by the VMCS.
+   MSR accesses that are side effects of instruction execution (emulated or
+   native) are not filtered as hardware does not honor MSR bitmaps outside of
+   RDMSR and WRMSR, and KVM mimics that behavior when emulating instructions
+   to avoid pointless divergence from hardware. E.g. RDPID reads MSR_TSC_AUX,
+   SYSENTER reads the SYSENTER MSRs, etc.
+
+   MSRs that are loaded/stored via dedicated VMCS fields are not filtered as
+   part of VM-Enter/VM-Exit emulation.
+
+   MSRs that are loaded/stored via VMX's load/store lists _are_ filtered as
+   part of VM-Enter/VM-Exit emulation. If an MSR access is denied on VM-Enter,
+   KVM synthesizes a consistency check VM-Exit (EXIT_REASON_MSR_LOAD_FAIL). If
+   an MSR access is denied on VM-Exit, KVM synthesizes a VM-Abort. In short,
+   KVM extends Intel's architectural list of MSRs that cannot be loaded/saved
+   via the VM-Enter/VM-Exit MSR lists. It is the platform owner's
+   responsibility to communicate any such restrictions to their end users.

    x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that
    cover any x2APIC MSRs).
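To make the semantics above concrete, a minimal userspace sketch using
KVM_X86_SET_MSR_FILTER (a hedged illustration: the MSR index and error handling
are arbitrary, and KVM_CAP_X86_USER_SPACE_MSR is assumed to have been enabled
with KVM_MSR_EXIT_REASON_FILTER so denied accesses exit to userspace instead of
injecting #GP):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Deny guest writes to one MSR; everything else stays default-allow.
	 * A clear bit in the range bitmap denies the access, a set bit allows it. */
	static int deny_msr_writes(int vm_fd, __u32 msr_index)
	{
		__u8 bitmap = 0;			/* bit 0 clear => writes denied */
		struct kvm_msr_filter filter = {
			.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
			.ranges[0] = {
				.flags  = KVM_MSR_FILTER_WRITE,	/* reads stay unfiltered */
				.base   = msr_index,
				.nmsrs  = 1,
				.bitmap = &bitmap,
			},
		};

		/* Per the warning above: a write performed through a VMX MSR
		 * load/store list is still denied, but surfaces to the guest as
		 * EXIT_REASON_MSR_LOAD_FAIL or VM-Abort, not a userspace exit. */
		return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
	}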

arch/x86/include/asm/kvm_host.h

@@ -2062,6 +2062,8 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
 void kvm_enable_efer_bits(u64);
 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data);
 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
@@ -2264,6 +2266,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_extint(struct kvm_vcpu *v);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_extint(struct kvm_vcpu *v);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);

arch/x86/kvm/irq.c

@@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
  * Read pending interrupt(from non-APIC source)
  * vector and intack.
  */
-static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+int kvm_cpu_get_extint(struct kvm_vcpu *v)
 {
 	if (!kvm_cpu_has_extint(v)) {
 		WARN_ON(!lapic_in_kernel(v));
@@ -131,6 +131,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
 	} else
 		return kvm_pic_read_irq(v->kvm); /* PIC */
 }
+EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);

 /*
  * Read pending interrupt vector and intack.
@@ -141,9 +142,12 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	if (vector != -1)
 		return vector;			/* PIC */

-	return kvm_get_apic_interrupt(v);	/* APIC */
+	vector = kvm_apic_has_interrupt(v);	/* APIC */
+	if (vector != -1)
+		kvm_apic_ack_interrupt(v, vector);
+
+	return vector;
 }
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);

 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {

arch/x86/kvm/lapic.c

@@ -2959,14 +2959,13 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 	}
 }

-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
 {
-	int vector = kvm_apic_has_interrupt(vcpu);
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 ppr;

-	if (vector == -1)
-		return -1;
+	if (WARN_ON_ONCE(vector < 0 || !apic))
+		return;

 	/*
 	 * We get here even with APIC virtualization enabled, if doing
@@ -2994,8 +2993,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 		__apic_update_ppr(apic, &ppr);
 	}

-	return vector;
 }
+EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt);

 static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
 				struct kvm_lapic_state *s, bool set)

arch/x86/kvm/lapic.h

@@ -88,8 +88,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);

 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);

arch/x86/kvm/vmx/nested.c

@@ -981,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 				__func__, i, e.index, e.reserved);
 			goto fail;
 		}
-		if (kvm_set_msr(vcpu, e.index, e.value)) {
+		if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
 			pr_debug_ratelimited(
 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 				__func__, i, e.index, e.value);
@@ -1017,7 +1017,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
 		}
 	}

-	if (kvm_get_msr(vcpu, msr_index, data)) {
+	if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
 		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
 				     msr_index);
 		return false;
@@ -1112,9 +1112,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
 			/*
 			 * Emulated VMEntry does not fail here. Instead a less
 			 * accurate value will be returned by
-			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
-			 * instead of reading the value from the vmcs02 VMExit
-			 * MSR-store area.
+			 * nested_vmx_get_vmexit_msr_value() by reading KVM's
+			 * internal MSR state instead of reading the value from
+			 * the vmcs02 VMExit MSR-store area.
 			 */
 			pr_warn_ratelimited(
 				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
@@ -2341,10 +2341,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
 	/* Posted interrupts setting is only taken from vmcs12. */
 	vmx->nested.pi_pending = false;
-	if (nested_cpu_has_posted_intr(vmcs12))
+	if (nested_cpu_has_posted_intr(vmcs12)) {
 		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
-	else
+	} else {
 		vmx->nested.posted_intr_nv = -1;
+		exec_control &= ~PIN_BASED_POSTED_INTR;
+	}
 	pin_controls_set(vmx, exec_control);

 	/*
@@ -2494,6 +2496,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
 			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
 		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
 		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
@@ -2531,7 +2534,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
 		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

-		vmx->segment_cache.bitmask = 0;
+		vmx_segment_cache_clear(vmx);
 	}

 	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -4308,11 +4311,52 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 	}

 	if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+		int irq;
+
 		if (block_nested_events)
 			return -EBUSY;
 		if (!nested_exit_on_intr(vcpu))
 			goto no_vmexit;
-		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+
+		if (!nested_exit_intr_ack_set(vcpu)) {
+			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+			return 0;
+		}
+
+		irq = kvm_cpu_get_extint(vcpu);
+		if (irq != -1) {
+			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+					  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+			return 0;
+		}
+
+		irq = kvm_apic_has_interrupt(vcpu);
+		if (WARN_ON_ONCE(irq < 0))
+			goto no_vmexit;
+
+		/*
+		 * If the IRQ is L2's PI notification vector, process posted
+		 * interrupts for L2 instead of injecting VM-Exit, as the
+		 * detection/morphing architecturally occurs when the IRQ is
+		 * delivered to the CPU.  Note, only interrupts that are routed
+		 * through the local APIC trigger posted interrupt processing,
+		 * and enabling posted interrupts requires ACK-on-exit.
+		 */
+		if (irq == vmx->nested.posted_intr_nv) {
+			vmx->nested.pi_pending = true;
+			kvm_apic_clear_irr(vcpu, irq);
+			goto no_vmexit;
+		}
+
+		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+				  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+
+		/*
+		 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+		 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+		 * if APICv is active.
+		 */
+		kvm_apic_ack_interrupt(vcpu, irq);
 		return 0;
 	}
@@ -4830,7 +4874,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
 			goto vmabort;
 		}

-		if (kvm_set_msr(vcpu, h.index, h.value)) {
+		if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
 			pr_debug_ratelimited(
 				"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
 				__func__, j, h.index, h.value);
@@ -4993,14 +5037,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

 	if (likely(!vmx->fail)) {
-		if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
-		    nested_exit_intr_ack_set(vcpu)) {
-			int irq = kvm_cpu_get_interrupt(vcpu);
-			WARN_ON(irq < 0);
-			vmcs12->vm_exit_intr_info = irq |
-				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
-		}
-
 		if (vm_exit_reason != -1)
 			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
 						       vmcs12->exit_qualification,

arch/x86/kvm/vmx/nested.h

@@ -39,11 +39,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,

 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
+	lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+			    !refcount_read(&vcpu->kvm->users_count));
+
 	return to_vmx(vcpu)->nested.cached_vmcs12;
 }

 static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
 {
+	lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+			    !refcount_read(&vcpu->kvm->users_count));
+
 	return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
 }
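For context, a hedged sketch of the locking contract the new assertions
enforce: KVM's vCPU ioctls take vcpu->mutex before reaching arch code, and VM
destruction only runs once users_count has dropped to zero, so either condition
guarantees the cached vmcs12 cannot be freed underfoot. The dispatch helper
below is hypothetical:

	/* Sketch: any path that dereferences the cached vmcs12 must look like this. */
	static long vcpu_ioctl_sketch(struct kvm_vcpu *vcpu, unsigned int ioctl)
	{
		long r;

		if (mutex_lock_killable(&vcpu->mutex))
			return -EINTR;
		r = arch_vcpu_ioctl_locked(vcpu, ioctl);	/* hypothetical; may call get_vmcs12() */
		mutex_unlock(&vcpu->mutex);			/* vmcs12 may be freed once dropped */
		return r;
	}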

arch/x86/kvm/vmx/sgx.c

@@ -274,7 +274,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
 	 * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
 	 * enforce restriction of access to the PROVISIONKEY.
 	 */
-	contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+	contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
 	if (!contents)
 		return -ENOMEM;

arch/x86/kvm/vmx/vmx.c

@@ -525,10 +525,6 @@ static const struct kvm_vmx_segment_field {
 	VMX_SEGMENT_FIELD(LDTR),
 };

-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
-	vmx->segment_cache.bitmask = 0;
-}
-
 static unsigned long host_idt_base;
@@ -4219,6 +4215,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);

+	/*
+	 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+	 * and freed, and must not be accessed outside of vcpu->mutex.  The
+	 * vCPU's cached PI NV is valid if and only if posted interrupts are
+	 * enabled in its vmcs12, i.e. checking the vector also checks that
+	 * L1 has enabled posted interrupts for L2.
+	 */
 	if (is_guest_mode(vcpu) &&
 	    vector == vmx->nested.posted_intr_nv) {
 		/*
@@ -5804,8 +5807,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
 		      ? PFERR_PRESENT_MASK : 0;

-	error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
-		      PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+	if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+		error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+			      PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;

 	/*
 	 * Check that the GPA doesn't exceed physical memory limits, as that is
@@ -7969,6 +7973,7 @@ static __init void vmx_set_cpu_caps(void)
 		kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
 		kvm_cpu_cap_clear(X86_FEATURE_SGX1);
 		kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+		kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
 	}

 	if (vmx_umip_emulated())

arch/x86/kvm/vmx/vmx.h

@@ -752,4 +752,9 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
 	return lapic_in_kernel(vcpu) && enable_ipiv;
 }

+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+	vmx->segment_cache.bitmask = 0;
+}
+
 #endif /* __KVM_X86_VMX_H */

arch/x86/kvm/vmx/vmx_onhyperv.h

@@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr)
 	struct hv_vp_assist_page *vp_ap =
 		hv_get_vp_assist_page(smp_processor_id());

+	/*
+	 * When enabling eVMCS, KVM verifies that every CPU has a valid
+	 * hv_vp_assist_page() and aborts enabling the feature otherwise. CPU
+	 * onlining path is also checked in vmx_hardware_enable().
+	 */
+	if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm))
+		return;
+
 	if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
 		vp_ap->nested_control.features.directhypercall = 1;
 	vp_ap->current_nested_vmcs = phys_addr;
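For reference, a hedged sketch of the setup-time verification the new comment
describes; the helper name is illustrative, not the upstream function:

	/* Confirm the VP assist page is mapped on every online CPU before opting
	 * in to eVMCS, so the KVM_BUG_ON above should never fire in practice. */
	static bool __init evmcs_assist_pages_present(void)
	{
		int cpu;

		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu))
				return false;	/* leave eVMCS disabled */
		}
		return true;
	}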

arch/x86/kvm/vmx/vmx_ops.h

@@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field)
 	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
 			 "16-bit accessor invalid for 64-bit high field");
 	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
-			 "16-bit accessor invalid for 32-bit high field");
+			 "16-bit accessor invalid for 32-bit field");
 	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
 			 "16-bit accessor invalid for natural width field");
 }

arch/x86/kvm/x86.c

@@ -1958,19 +1958,21 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
 				     __kvm_get_msr);
 }

-static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 {
 	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
 		return KVM_MSR_RET_FILTERED;
 	return kvm_get_msr_ignored_check(vcpu, index, data, false);
 }
+EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter);

-static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
 {
 	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
 		return KVM_MSR_RET_FILTERED;
 	return kvm_set_msr_ignored_check(vcpu, index, data, false);
 }
+EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter);

 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 {