From 4f13d471e5d11034d56161af56d0f9396bc0b384 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 7 Jun 2021 06:15:32 +0000 Subject: [PATCH 01/10] KVM: SVM: Fix SEV SEND_START session length & SEND_UPDATE_DATA query length after commit 238eca821cee Commit 238eca821cee ("KVM: SVM: Allocate SEV command structures on local stack") uses the local stack to allocate the structures used to communicate with the PSP, which were earlier being kzalloced. This breaks SEV live migration for computing the SEND_START session length and SEND_UPDATE_DATA query length as session_len and trans_len and hdr_len fields are not zeroed respectively for the above commands before issuing the SEV Firmware API call, hence the firmware returns incorrect session length and update data header or trans length. Also the SEV Firmware API returns SEV_RET_INVALID_LEN firmware error for these length query API calls, and the return value and the firmware error needs to be passed to the userspace as it is, so need to remove the return check in the KVM code. Signed-off-by: Ashish Kalra Message-Id: <20210607061532.27459-1-Ashish.Kalra@amd.com> Fixes: 238eca821cee ("KVM: SVM: Allocate SEV command structures on local stack") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5bc887e9a986..e0ce5da97fc2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1103,10 +1103,9 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_start data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (ret < 0) - return ret; params->session_len = data.session_len; if (copy_to_user((void __user *)(uintptr_t)argp->data, params, @@ -1215,10 +1214,9 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_update_data data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); - if (ret < 0) - return ret; params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; From e898da784aed0ea65f7672d941c01dc9b79e6299 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 7 Jun 2021 00:19:43 -0700 Subject: [PATCH 02/10] KVM: LAPIC: Write 0 to TMICT should also cancel vmx-preemption timer According to the SDM 10.5.4.1: A write of 0 to the initial-count register effectively stops the local APIC timer, in both one-shot and periodic mode. However, the lapic timer oneshot/periodic mode which is emulated by vmx-preemption timer doesn't stop by writing 0 to TMICT since vmx->hv_deadline_tsc is still programmed and the guest will receive the spurious timer interrupt later. This patch fixes it by also cancelling the vmx-preemption timer when writing 0 to the initial-count register. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Message-Id: <1623050385-100988-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8120e8614b92..6d72d8f43310 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1494,6 +1494,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) static void cancel_hv_timer(struct kvm_lapic *apic); +static void cancel_apic_timer(struct kvm_lapic *apic) +{ + hrtimer_cancel(&apic->lapic_timer.timer); + preempt_disable(); + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + preempt_enable(); +} + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & @@ -1502,11 +1511,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { - hrtimer_cancel(&apic->lapic_timer.timer); - preempt_disable(); - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - preempt_enable(); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; @@ -2092,7 +2097,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) if (apic_lvtt_tscdeadline(apic)) break; - hrtimer_cancel(&apic->lapic_timer.timer); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; From b1bd5cba3306691c771d558e94baa73e8b0b96b7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 3 Jun 2021 13:24:55 +0800 Subject: [PATCH 03/10] KVM: X86: MMU: Use the correct inherited permissions to get shadow page When computing the access permissions of a shadow page, use the effective permissions of the walk up to that point, i.e. the logic AND of its parents' permissions. Two guest PxE entries that point at the same table gfn need to be shadowed with different shadow pages if their parents' permissions are different. KVM currently uses the effective permissions of the last non-leaf entry for all non-leaf entries. Because all non-leaf SPTEs have full ("uwx") permissions, and the effective permissions are recorded only in role.access and merged into the leaves, this can lead to incorrect reuse of a shadow page and eventually to a missing guest protection page fault. For example, here is a shared pagetable: pgd[] pud[] pmd[] virtual address pointers /->pmd1(u--)->pte1(uw-)->page1 <- ptr1 (u--) /->pud1(uw-)--->pmd2(uw-)->pte2(uw-)->page2 <- ptr2 (uw-) pgd-| (shared pmd[] as above) \->pud2(u--)--->pmd1(u--)->pte1(uw-)->page1 <- ptr3 (u--) \->pmd2(uw-)->pte2(uw-)->page2 <- ptr4 (u--) pud1 and pud2 point to the same pmd table, so: - ptr1 and ptr3 points to the same page. - ptr2 and ptr4 points to the same page. (pud1 and pud2 here are pud entries, while pmd1 and pmd2 here are pmd entries) - First, the guest reads from ptr1 first and KVM prepares a shadow page table with role.access=u--, from ptr1's pud1 and ptr1's pmd1. "u--" comes from the effective permissions of pgd, pud1 and pmd1, which are stored in pt->access. "u--" is used also to get the pagetable for pud1, instead of "uw-". - Then the guest writes to ptr2 and KVM reuses pud1 which is present. The hypervisor set up a shadow page for ptr2 with pt->access is "uw-" even though the pud1 pmd (because of the incorrect argument to kvm_mmu_get_page in the previous step) has role.access="u--". - Then the guest reads from ptr3. The hypervisor reuses pud1's shadow pmd for pud2, because both use "u--" for their permissions. Thus, the shadow pmd already includes entries for both pmd1 and pmd2. - At last, the guest writes to ptr4. This causes no vmexit or pagefault, because pud1's shadow page structures included an "uw-" page even though its role.access was "u--". Any kind of shared pagetable might have the similar problem when in virtual machine without TDP enabled if the permissions are different from different ancestors. In order to fix the problem, we change pt->access to be an array, and any access in it will not include permissions ANDed from child ptes. The test code is: https://lore.kernel.org/kvm/20210603050537.19605-1-jiangshanlai@gmail.com/ Remember to test it with TDP disabled. The problem had existed long before the commit 41074d07c78b ("KVM: MMU: Fix inherited permissions for emulated guest pte updates"), and it is hard to find which is the culprit. So there is no fixes tag here. Signed-off-by: Lai Jiangshan Message-Id: <20210603052455.21023-1-jiangshanlai@gmail.com> Cc: stable@vger.kernel.org Fixes: cea0f0e7ea54 ("[PATCH] KVM: MMU: Shadow page table caching") Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/mmu.rst | 4 ++-- arch/x86/kvm/mmu/paging_tmpl.h | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 5bfe28b0728e..20d85daed395 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -171,8 +171,8 @@ Shadow pages contain the following information: shadow pages) so role.quadrant takes values in the range 0..3. Each quadrant maps 1GB virtual address space. role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. + Inherited guest access permissions from the parent ptes in the form uwx. + Note execute permission is positive, not negative. role.invalid: The page is invalid and should not be used. It is a root page that is currently pinned (by a cpu hardware register pointing to it); once it is diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 70b7e44e3035..823a5919f9fa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -90,8 +90,8 @@ struct guest_walker { gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; bool pte_writable[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; + unsigned int pt_access[PT_MAX_FULL_LEVELS]; + unsigned int pte_access; gfn_t gfn; struct x86_exception fault; }; @@ -418,13 +418,15 @@ retry_walk: } walker->ptes[walker->level - 1] = pte; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); } while (!is_last_gpte(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) @@ -463,7 +465,8 @@ retry_walk: } pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, walker->pt_access); + __func__, (u64)pte, walker->pte_access, + walker->pt_access[walker->level - 1]); return 1; error: @@ -643,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; - unsigned direct_access, access = gw->pt_access; + unsigned int direct_access, access; int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; @@ -675,6 +678,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = NULL; if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, false, access); } From af3511ff7fa2107d6410831f3d71030f5e8d2b25 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 1 Jun 2021 01:46:28 +0800 Subject: [PATCH 04/10] KVM: x86: Ensure PV TLB flush tracepoint reflects KVM behavior In record_steal_time(), st->preempted is read twice, and trace_kvm_pv_tlb_flush() might output result inconsistent if kvm_vcpu_flush_tlb_guest() see a different st->preempted later. It is a very trivial problem and hardly has actual harm and can be avoided by reseting and reading st->preempted in atomic way via xchg(). Signed-off-by: Lai Jiangshan Message-Id: <20210531174628.10265-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1cd6d4685932..e2144eedaf79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3101,9 +3101,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + u8 st_preempted = xchg(&st->preempted, 0); + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + st_preempted & KVM_VCPU_FLUSH_TLB); + if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); } else { st->preempted = 0; From 1bc603af73dd8fb2934306e861009c54f973dcc2 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 8 Jun 2021 14:39:54 +0200 Subject: [PATCH 05/10] KVM: selftests: introduce P47V64 for s390x s390x can have up to 47bits of physical guest and 64bits of virtual address bits. Add a new address mode to avoid errors of testcases going beyond 47bits. Signed-off-by: Christian Borntraeger Message-Id: <20210608123954.10991-1-borntraeger@de.ibm.com> Fixes: ef4c9f4f6546 ("KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn()") Cc: stable@vger.kernel.org Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util.h | 3 ++- tools/testing/selftests/kvm/lib/kvm_util.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index fcd8e3855111..b602552b1ed0 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -43,6 +43,7 @@ enum vm_guest_mode { VM_MODE_P40V48_4K, VM_MODE_P40V48_64K, VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_P47V64_4K, NUM_VM_MODES, }; @@ -60,7 +61,7 @@ enum vm_guest_mode { #elif defined(__s390x__) -#define VM_MODE_DEFAULT VM_MODE_P52V48_4K +#define VM_MODE_DEFAULT VM_MODE_P47V64_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 16) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 28e528c19d28..b126fab6c4e1 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -175,6 +175,7 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", }; _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); @@ -192,6 +193,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { { 40, 48, 0x1000, 12 }, { 40, 48, 0x10000, 16 }, { 0, 0, 0x1000, 12 }, + { 47, 64, 0x1000, 12 }, }; _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); @@ -277,6 +279,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); #endif break; + case VM_MODE_P47V64_4K: + vm->pgtable_levels = 5; + break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); } From f53b16ad64408b5376836708f8cf42dbf1cf6098 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 9 Jun 2021 07:38:16 +0800 Subject: [PATCH 06/10] selftests: kvm: Add support for customized slot0 memory size Until commit 39fe2fc96694 ("selftests: kvm: make allocation of extra memory take effect", 2021-05-27), parameter extra_mem_pages was used only to calculate the page table size for all the memory chunks, because real memory allocation happened with calls of vm_userspace_mem_region_add() after vm_create_default(). Commit 39fe2fc96694 however changed the meaning of extra_mem_pages to the size of memory slot 0. This makes the memory allocation more flexible, but makes it harder to account for the number of pages needed for the page tables. For example, memslot_perf_test has a small amount of memory in slot 0 but a lot in other slots, and adding that memory twice (both in slot 0 and with later calls to vm_userspace_mem_region_add()) causes an error that was fixed in commit 000ac4295339 ("selftests: kvm: fix overlapping addresses in memslot_perf_test", 2021-05-29) Since both uses are sensible, add a new parameter slot0_mem_pages to vm_create_with_vcpus() and some comments to clarify the meaning of slot0_mem_pages and extra_mem_pages. With this change, memslot_perf_test can go back to passing the number of memory pages as extra_mem_pages. Signed-off-by: Zhenzhong Duan Message-Id: <20210608233816.423958-4-zhenzhong.duan@intel.com> [Squashed in a single patch and rewrote the commit message. - Paolo] Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util.h | 7 +-- .../selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 47 +++++++++++++++---- .../selftests/kvm/lib/perf_test_util.c | 2 +- .../testing/selftests/kvm/memslot_perf_test.c | 2 +- 5 files changed, 45 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index b602552b1ed0..35739567189e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -286,10 +286,11 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]); -/* Like vm_create_default_with_vcpus, but accepts mode as a parameter */ +/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, - void *guest_code, uint32_t vcpuids[]); + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[]); /* * Adds a vCPU with reasonable defaults (e.g. a stack) diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 1c4753fff19e..82171f17c1d7 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -268,7 +268,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = vm_create_with_vcpus(mode, nr_vcpus, + vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, guest_num_pages, 0, guest_code, NULL); /* Align down GPA of the testing memslot */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b126fab6c4e1..5c70596dd1b9 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -313,21 +313,50 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) return vm; } +/* + * VM Create with customized parameters + * + * Input Args: + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) + * nr_vcpus - VCPU count + * slot0_mem_pages - Slot0 physical memory size + * extra_mem_pages - Non-slot0 physical memory total size + * num_percpu_pages - Per-cpu physical memory pages + * guest_code - Guest entry point + * vcpuids - VCPU IDs + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K), + * with customized slot0 memory size, at least 512 pages currently. + * extra_mem_pages is only used to calculate the maximum page table size, + * no real memory allocation for non-slot0 memory in this function. + */ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, - void *guest_code, uint32_t vcpuids[]) + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[]) { + uint64_t vcpu_pages, extra_pg_pages, pages; + struct kvm_vm *vm; + int i; + + /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ + if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) + slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; + /* The maximum page table size for a memory region will be when the * smallest pages are used. Considering each page contains x page * table descriptors, the total extra size for page tables (for extra * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller * than N/x*2. */ - uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; - uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages; - struct kvm_vm *vm; - int i; + vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; + extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; + pages = slot0_mem_pages + vcpu_pages + extra_pg_pages; TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), "nr_vcpus = %d too large for host, max-vcpus = %d", @@ -359,8 +388,8 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]) { - return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, extra_mem_pages, - num_percpu_pages, guest_code, vcpuids); + return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, + extra_mem_pages, num_percpu_pages, guest_code, vcpuids); } struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index abf381800a59..7397ca299835 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -69,7 +69,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); - vm = vm_create_with_vcpus(mode, vcpus, + vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, 0, guest_code, NULL); diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 9307f25d8130..11239652d805 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -267,7 +267,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); TEST_ASSERT(data->hva_slots, "malloc() fail"); - data->vm = vm_create_default(VCPU_ID, 1024, guest_code); + data->vm = vm_create_default(VCPU_ID, mempages, guest_code); pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", max_mem_slots - 1, data->pages_per_slot, rempages); From f31500b0d437a2464ca5972d8f5439e156b74960 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 7 Jun 2021 10:57:48 -0700 Subject: [PATCH 07/10] KVM: x86: Ensure liveliness of nested VM-Enter fail tracepoint message Use the __string() machinery provided by the tracing subystem to make a copy of the string literals consumed by the "nested VM-Enter failed" tracepoint. A complete copy is necessary to ensure that the tracepoint can't outlive the data/memory it consumes and deference stale memory. Because the tracepoint itself is defined by kvm, if kvm-intel and/or kvm-amd are built as modules, the memory holding the string literals defined by the vendor modules will be freed when the module is unloaded, whereas the tracepoint and its data in the ring buffer will live until kvm is unloaded (or "indefinitely" if kvm is built-in). This bug has existed since the tracepoint was added, but was recently exposed by a new check in tracing to detect exactly this type of bug. fmt: '%s%s ' current_buffer: ' vmx_dirty_log_t-140127 [003] .... kvm_nested_vmenter_failed: ' WARNING: CPU: 3 PID: 140134 at kernel/trace/trace.c:3759 trace_check_vprintf+0x3be/0x3e0 CPU: 3 PID: 140134 Comm: less Not tainted 5.13.0-rc1-ce2e73ce600a-req #184 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:trace_check_vprintf+0x3be/0x3e0 Code: <0f> 0b 44 8b 4c 24 1c e9 a9 fe ff ff c6 44 02 ff 00 49 8b 97 b0 20 RSP: 0018:ffffa895cc37bcb0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffa895cc37bd08 RCX: 0000000000000027 RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff9766cfad74f8 RBP: ffffffffc0a041d4 R08: ffff9766cfad74f0 R09: ffffa895cc37bad8 R10: 0000000000000001 R11: 0000000000000001 R12: ffffffffc0a041d4 R13: ffffffffc0f4dba8 R14: 0000000000000000 R15: ffff976409f2c000 FS: 00007f92fa200740(0000) GS:ffff9766cfac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559bd11b0000 CR3: 000000019fbaa002 CR4: 00000000001726e0 Call Trace: trace_event_printf+0x5e/0x80 trace_raw_output_kvm_nested_vmenter_failed+0x3a/0x60 [kvm] print_trace_line+0x1dd/0x4e0 s_show+0x45/0x150 seq_read_iter+0x2d5/0x4c0 seq_read+0x106/0x150 vfs_read+0x98/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x40/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: Steven Rostedt Fixes: 380e0055bc7e ("KVM: nVMX: trace nested VM-Enter failures detected by H/W") Signed-off-by: Sean Christopherson Reviewed-by: Steven Rostedt (VMware) Message-Id: <20210607175748.674002-1-seanjc@google.com> --- arch/x86/kvm/trace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a61c015870e3..4f839148948b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1550,16 +1550,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed, TP_ARGS(msg, err), TP_STRUCT__entry( - __field(const char *, msg) + __string(msg, msg) __field(u32, err) ), TP_fast_assign( - __entry->msg = msg; + __assign_str(msg, msg); __entry->err = err; ), - TP_printk("%s%s", __entry->msg, !__entry->err ? "" : + TP_printk("%s%s", __get_str(msg), !__entry->err ? "" : __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); From b53e84eed08b88fd3ff59e5c2a7f1a69d4004e32 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 1 Jun 2021 01:22:56 +0800 Subject: [PATCH 08/10] KVM: x86: Unload MMU on guest TLB flush if TDP disabled to force MMU sync When using shadow paging, unload the guest MMU when emulating a guest TLB flush to ensure all roots are synchronized. From the guest's perspective, flushing the TLB ensures any and all modifications to its PTEs will be recognized by the CPU. Note, unloading the MMU is overkill, but is done to mirror KVM's existing handling of INVPCID(all) and ensure the bug is squashed. Future cleanup can be done to more precisely synchronize roots when servicing a guest TLB flush. If TDP is enabled, synchronizing the MMU is unnecessary even if nested TDP is in play, as a "legacy" TLB flush from L1 does not invalidate L1's TDP mappings. For EPT, an explicit INVEPT is required to invalidate guest-physical mappings; for NPT, guest mappings are always tagged with an ASID and thus can only be invalidated via the VMCB's ASID control. This bug has existed since the introduction of KVM_VCPU_FLUSH_TLB. It was only recently exposed after Linux guests stopped flushing the local CPU's TLB prior to flushing remote TLBs (see commit 4ce94eabac16, "x86/mm/tlb: Flush remote and local TLBs concurrently"), but is also visible in Windows 10 guests. Tested-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Fixes: f38a7b75267f ("KVM: X86: support paravirtualized help for TLB shootdowns") Signed-off-by: Lai Jiangshan [sean: massaged comment and changelog] Message-Id: <20210531172256.2908-1-jiangshanlai@gmail.com> Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e2144eedaf79..9dd23bdfc6cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3072,6 +3072,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; + + if (!tdp_enabled) { + /* + * A TLB flush on behalf of the guest is equivalent to + * INVPCID(all), toggling CR4.PGE, etc., which requires + * a forced sync of the shadow page tables. Unload the + * entire MMU here and the subsequent load will sync the + * shadow page tables, and also flush the TLB. + */ + kvm_mmu_unload(vcpu); + return; + } + static_call(kvm_x86_tlb_flush_guest)(vcpu); } From da27a83fd6cc7780fea190e1f5c19e87019da65c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Jun 2021 15:31:42 -0400 Subject: [PATCH 09/10] kvm: avoid speculation-based attacks from out-of-range memslot accesses KVM's mechanism for accessing guest memory translates a guest physical address (gpa) to a host virtual address using the right-shifted gpa (also known as gfn) and a struct kvm_memory_slot. The translation is performed in __gfn_to_hva_memslot using the following formula: hva = slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE It is expected that gfn falls within the boundaries of the guest's physical memory. However, a guest can access invalid physical addresses in such a way that the gfn is invalid. __gfn_to_hva_memslot is called from kvm_vcpu_gfn_to_hva_prot, which first retrieves a memslot through __gfn_to_memslot. While __gfn_to_memslot does check that the gfn falls within the boundaries of the guest's physical memory or not, a CPU can speculate the result of the check and continue execution speculatively using an illegal gfn. The speculation can result in calculating an out-of-bounds hva. If the resulting host virtual address is used to load another guest physical address, this is effectively a Spectre gadget consisting of two consecutive reads, the second of which is data dependent on the first. Right now it's not clear if there are any cases in which this is exploitable. One interesting case was reported by the original author of this patch, and involves visiting guest page tables on x86. Right now these are not vulnerable because the hva read goes through get_user(), which contains an LFENCE speculation barrier. However, there are patches in progress for x86 uaccess.h to mask kernel addresses instead of using LFENCE; once these land, a guest could use speculation to read from the VMM's ring 3 address space. Other architectures such as ARM already use the address masking method, and would be susceptible to this same kind of data-dependent access gadgets. Therefore, this patch proactively protects from these attacks by masking out-of-bounds gfns in __gfn_to_hva_memslot, which blocks speculation of invalid hvas. Sean Christopherson noted that this patch does not cover kvm_read_guest_offset_cached. This however is limited to a few bytes past the end of the cache, and therefore it is unlikely to be useful in the context of building a chain of data dependent accesses. Reported-by: Artemiy Margaritov Co-developed-by: Artemiy Margaritov Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 76102efbf079..74995f0a2a3c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1185,7 +1185,15 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) static inline unsigned long __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; + /* + * The index was checked originally in search_memslots. To avoid + * that a malicious guest builds a Spectre gadget out of e.g. page + * table walks, do not let the processor speculate loads outside + * the guest's registered memslots. + */ + unsigned long offset = array_index_nospec(gfn - slot->base_gfn, + slot->npages); + return slot->userspace_addr + offset * PAGE_SIZE; } static inline int memslot_id(struct kvm *kvm, gfn_t gfn) From 4422829e8053068e0225e4d0ef42dc41ea7c9ef5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 9 Jun 2021 01:49:13 -0400 Subject: [PATCH 10/10] kvm: fix previous commit for 32-bit builds array_index_nospec does not work for uint64_t on 32-bit builds. However, the size of a memory slot must be less than 20 bits wide on those system, since the memory slot must fit in the user address space. So just store it in an unsigned long. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 74995f0a2a3c..8583ed3ff344 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1191,8 +1191,8 @@ __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) * table walks, do not let the processor speculate loads outside * the guest's registered memslots. */ - unsigned long offset = array_index_nospec(gfn - slot->base_gfn, - slot->npages); + unsigned long offset = gfn - slot->base_gfn; + offset = array_index_nospec(offset, slot->npages); return slot->userspace_addr + offset * PAGE_SIZE; }