From 4f8973e65fcd362b85942c80362cc9a7231b09fa Mon Sep 17 00:00:00 2001
From: Liang Chen
Date: Thu, 9 May 2024 12:47:10 +0800
Subject: [PATCH 1/7] KVM: x86: invalid_list not used anymore in
 mmu_shrink_scan

'invalid_list' is now gathered inside KVM_MMU_ZAP_OLDEST_MMU_PAGES, so the
local list in mmu_shrink_scan() is unused; remove it.

Signed-off-by: Liang Chen
Link: https://lore.kernel.org/r/20240509044710.18788-1-liangchen.linux@gmail.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/mmu.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 662f62dfb2aa..afc7489513bc 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6893,7 +6893,6 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink,
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		int idx;
-		LIST_HEAD(invalid_list);
 
 		/*
 		 * Never scan more than sc->nr_to_scan VM instances.

From 9ecc1c119b28d28869135a3745541c03965d52be Mon Sep 17 00:00:00 2001
From: Hou Wenlong
Date: Sat, 11 May 2024 11:46:37 +0800
Subject: [PATCH 2/7] KVM: x86/mmu: Only allocate shadowed translation cache
 for sp->role.level <= KVM_MAX_HUGEPAGE_LEVEL

Only indirect SPs with sp->role.level <= KVM_MAX_HUGEPAGE_LEVEL might have
leaf gptes, so the shadowed translation cache needs to be allocated only
for them.  Readers can then check sp->shadowed_translation itself to
determine whether the information in the shadowed translation cache can be
used.

Also, extend the WARN in FNAME(sync_spte)() to ensure that this won't
break shadow_mmu_get_sp_for_split().

Suggested-by: Lai Jiangshan
Signed-off-by: Hou Wenlong
Link: https://lore.kernel.org/r/5b0cda8a7456cda476b14fca36414a56f921dd52.1715398655.git.houwenlong.hwl@antgroup.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/mmu.c         | 11 +++++------
 arch/x86/kvm/mmu/paging_tmpl.h |  3 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index afc7489513bc..fd5378b72896 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -719,7 +719,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 	if (sp->role.passthrough)
 		return sp->gfn;
 
-	if (!sp->role.direct)
+	if (sp->shadowed_translation)
 		return sp->shadowed_translation[index] >> PAGE_SHIFT;
 
 	return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
@@ -733,7 +733,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
  */
 static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
 {
-	if (sp_has_gptes(sp))
+	if (sp->shadowed_translation)
 		return sp->shadowed_translation[index] & ACC_ALL;
 
 	/*
@@ -754,7 +754,7 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
 static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
 					 gfn_t gfn, unsigned int access)
 {
-	if (sp_has_gptes(sp)) {
+	if (sp->shadowed_translation) {
 		sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
 		return;
 	}
@@ -1697,8 +1697,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
 	hlist_del(&sp->hash_link);
 	list_del(&sp->link);
 	free_page((unsigned long)sp->spt);
-	if (!sp->role.direct)
-		free_page((unsigned long)sp->shadowed_translation);
+	free_page((unsigned long)sp->shadowed_translation);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
@@ -2200,7 +2199,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
 
 	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
 	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
-	if (!role.direct)
+	if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
 		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
 
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index d3dbcf382ed2..69941cebb3a8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -911,7 +911,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
 	gpa_t pte_gpa;
 	gfn_t gfn;
 
-	if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE))
+	if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
+			 !sp->shadowed_translation))
 		return 0;
 
 	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
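The rule patch 2 establishes boils down to a one-line predicate. The helper
below is purely illustrative (the series open-codes the check in
kvm_mmu_alloc_shadow_page() rather than adding a helper), but the condition
is lifted directly from the diff:

	/*
	 * Illustrative only, not part of the series: an indirect SP can hold
	 * leaf gptes only at or below KVM_MAX_HUGEPAGE_LEVEL, so only such
	 * SPs need the shadowed translation cache.  Everywhere else, a
	 * non-NULL sp->shadowed_translation is the signal that the cache
	 * exists, instead of checking sp->role.direct or sp_has_gptes().
	 */
	static bool sp_needs_shadowed_translation(union kvm_mmu_page_role role)
	{
		return !role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL;
	}
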
From caa727882937f0d5260a2a026eb7c27d77404066 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 7 Jun 2024 17:11:08 -0700
Subject: [PATCH 3/7] KVM: x86/mmu: Rephrase comment about synthetic PFERR
 flags in #PF handler

Reword the BUILD_BUG_ON() comment in the legacy #PF handler to explicitly
describe how asserting that synthetic PFERR flags are limited to bits 63:32
protects KVM against inadvertently passing a synthetic flag to the common
page fault handler.

No functional change intended.

Suggested-by: Xiaoyao Li
Reviewed-by: Xiaoyao Li
Link: https://lore.kernel.org/r/20240608001108.3296879-1-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fd5378b72896..46103e3eda30 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4552,7 +4552,10 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 	if (WARN_ON_ONCE(error_code >> 32))
 		error_code = lower_32_bits(error_code);
 
-	/* Ensure the above sanity check also covers KVM-defined flags. */
+	/*
+	 * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
+	 * them to conflict with #PF error codes, which are limited to 32 bits.
+	 */
 	BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
 
 	vcpu->arch.l1tf_flush_l1d = true;

From cf3ff0ee24d6808d19dec6c9dedb5c7555bd8c55 Mon Sep 17 00:00:00 2001
From: David Matlack
Date: Tue, 11 Jun 2024 15:05:09 -0700
Subject: [PATCH 4/7] KVM: x86/mmu: Always drop mmu_lock to allocate TDP MMU
 SPs for eager splitting

Always drop mmu_lock to allocate shadow pages in the TDP MMU when doing
eager page splitting.  Dropping mmu_lock during eager page splitting is
cheap since KVM does not have to flush remote TLBs, and avoids stalling
vCPU threads that are taking page faults while KVM is eagerly splitting
under mmu_lock held for write.

This change reduces 20%+ dips in MySQL throughput during live migration
in a 160 vCPU VM while userspace is issuing CLEAR_DIRTY_LOG ioctls
(tested with 1GiB and 8GiB CLEARs).  Userspace could issue finer-grained
CLEARs, which would also reduce contention on mmu_lock, but doing so
will increase the rate of remote TLB flushing, since KVM must flush TLBs
before returning from CLEAR_DIRTY_LOG.

When there isn't contention on mmu_lock[1], this change does not regress
the time it takes to perform eager page splitting (the cost of releasing
and re-acquiring an uncontended lock is minimal on x86).

[1] Tested with dirty_log_perf_test, which does not run vCPUs during
    eager page splitting, and with a 16 vCPU VM live migration with
    manual-protect disabled (where mmu_lock is held for read).

Cc: Bibo Mao
Cc: Sean Christopherson
Signed-off-by: David Matlack
Link: https://lore.kernel.org/r/20240611220512.2426439-2-dmatlack@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/tdp_mmu.c | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1259dd63defc..e7a5d0147141 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1366,19 +1366,6 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
 
 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
-	/*
-	 * Since we are allocating while under the MMU lock we have to be
-	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
-	 * reclaim and to avoid making any filesystem callbacks (which can end
-	 * up invoking KVM MMU notifiers, resulting in a deadlock).
-	 *
-	 * If this allocation fails we drop the lock and retry with reclaim
-	 * allowed.
-	 */
-	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
-	if (sp)
-		return sp;
-
 	rcu_read_unlock();
 
 	if (shared)
@@ -1478,8 +1465,7 @@ retry:
 				break;
 			}
 
-		if (iter.yielded)
-			continue;
+		continue;
 	}
 
 	tdp_mmu_init_child_sp(sp, &iter);
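For readers who want to experiment with the CLEAR granularity trade-off
discussed above: finer-grained clears are simply smaller ranges passed to
KVM_CLEAR_DIRTY_LOG. A minimal userspace sketch, not taken from the series;
the slot id, page range, bitmap, and vm_fd are placeholders, and it assumes
KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 has been enabled on the VM:

	/* Clear (write-protect) a sub-range of a memslot's dirty log. */
	struct kvm_clear_dirty_log clear = {
		.slot = slot_id,          /* placeholder memslot id */
		.first_page = first_page, /* must be a multiple of 64 */
		.num_pages = num_pages,   /* e.g. 1GiB vs. 8GiB worth of pages */
		.dirty_bitmap = bitmap,   /* bits to clear, from KVM_GET_DIRTY_LOG */
	};

	if (ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear))
		err(1, "KVM_CLEAR_DIRTY_LOG");

Smaller num_pages values reduce how long each call holds mmu_lock, at the
cost of more frequent remote TLB flushes, which is exactly the trade-off
the commit message describes.
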
From e1c04f7a9f4213c56af5021b40bc00f527bffbae Mon Sep 17 00:00:00 2001
From: David Matlack
Date: Tue, 11 Jun 2024 15:05:10 -0700
Subject: [PATCH 5/7] KVM: x86/mmu: Hard code GFP flags for TDP MMU eager
 split allocations

Now that the GFP_NOWAIT case is gone, hard code GFP_KERNEL_ACCOUNT when
allocating shadow pages during eager page splitting in the TDP MMU.
Opportunistically replace use of __GFP_ZERO with allocations that zero
to improve readability.

No functional change intended.

Suggested-by: Sean Christopherson
Signed-off-by: David Matlack
Link: https://lore.kernel.org/r/20240611220512.2426439-3-dmatlack@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/tdp_mmu.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index e7a5d0147141..20ee1bc64f3c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1339,17 +1339,15 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
 	return spte_set;
 }
 
-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(void)
 {
 	struct kvm_mmu_page *sp;
 
-	gfp |= __GFP_ZERO;
-
-	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
 	if (!sp)
 		return NULL;
 
-	sp->spt = (void *)__get_free_page(gfp);
+	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 	if (!sp->spt) {
 		kmem_cache_free(mmu_page_header_cache, sp);
 		return NULL;
@@ -1374,7 +1372,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
 		write_unlock(&kvm->mmu_lock);
 
 	iter->yielded = true;
-	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+	sp = __tdp_mmu_alloc_sp_for_split();
 
 	if (shared)
 		read_lock(&kvm->mmu_lock);
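In case the "allocations that zero" phrasing is unfamiliar: the two calls
patch 5 switches to are straight equivalents of the old __GFP_ZERO forms.
The lines below are a reading aid, not additional code in the series:

	/* kmem_cache_zalloc(c, f) == kmem_cache_alloc(c, f | __GFP_ZERO) */
	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);

	/* get_zeroed_page(f) == __get_free_page(f | __GFP_ZERO) */
	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
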
From 3d4a5a45ca26f8de9e0a4f384a2fb0967b8566b5 Mon Sep 17 00:00:00 2001
From: David Matlack
Date: Tue, 11 Jun 2024 15:05:11 -0700
Subject: [PATCH 6/7] KVM: x86/mmu: Unnest TDP MMU helpers that allocate SPs
 for eager splitting

Move the implementation of tdp_mmu_alloc_sp_for_split() to its one and
only caller to reduce unnecessary nesting and make it more clear why the
eager split loop continues after allocating a new SP.

Opportunistically drop the double-underscores from
__tdp_mmu_alloc_sp_for_split() now that its parent is gone.

No functional change intended.

Suggested-by: Sean Christopherson
Signed-off-by: David Matlack
Link: https://lore.kernel.org/r/20240611220512.2426439-4-dmatlack@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/tdp_mmu.c | 48 ++++++++++++++------------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 20ee1bc64f3c..028f5a667482 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1339,7 +1339,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
 	return spte_set;
 }
 
-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(void)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
 {
 	struct kvm_mmu_page *sp;
 
@@ -1356,34 +1356,6 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(void)
 	return sp;
 }
 
-static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
-						       struct tdp_iter *iter,
-						       bool shared)
-{
-	struct kvm_mmu_page *sp;
-
-	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
-
-	rcu_read_unlock();
-
-	if (shared)
-		read_unlock(&kvm->mmu_lock);
-	else
-		write_unlock(&kvm->mmu_lock);
-
-	iter->yielded = true;
-	sp = __tdp_mmu_alloc_sp_for_split();
-
-	if (shared)
-		read_lock(&kvm->mmu_lock);
-	else
-		write_lock(&kvm->mmu_lock);
-
-	rcu_read_lock();
-
-	return sp;
-}
-
 /* Note, the caller is responsible for initializing @sp. */
 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
 				   struct kvm_mmu_page *sp, bool shared)
@@ -1454,7 +1426,22 @@ retry:
 			continue;
 
 		if (!sp) {
-			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+			rcu_read_unlock();
+
+			if (shared)
+				read_unlock(&kvm->mmu_lock);
+			else
+				write_unlock(&kvm->mmu_lock);
+
+			sp = tdp_mmu_alloc_sp_for_split();
+
+			if (shared)
+				read_lock(&kvm->mmu_lock);
+			else
+				write_lock(&kvm->mmu_lock);
+
+			rcu_read_lock();
+
 			if (!sp) {
 				ret = -ENOMEM;
 				trace_kvm_mmu_split_huge_page(iter.gfn,
@@ -1463,6 +1450,7 @@ retry:
 				break;
 			}
 
+			iter.yielded = true;
 			continue;
 		}
 

From 0089c055b56024edf90e85dc852440b713ce8cb5 Mon Sep 17 00:00:00 2001
From: David Matlack
Date: Tue, 11 Jun 2024 15:05:12 -0700
Subject: [PATCH 7/7] KVM: x86/mmu: Avoid reacquiring RCU if TDP MMU fails to
 allocate an SP

Avoid needlessly reacquiring the RCU read lock if the TDP MMU fails to
allocate a shadow page during eager page splitting. Opportunistically
drop the local variable ret as well now that it's no longer necessary.

Suggested-by: Sean Christopherson
Signed-off-by: David Matlack
Link: https://lore.kernel.org/r/20240611220512.2426439-5-dmatlack@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/tdp_mmu.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 028f5a667482..c512a02c44a1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1402,7 +1402,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
 {
 	struct kvm_mmu_page *sp = NULL;
 	struct tdp_iter iter;
-	int ret = 0;
 
 	rcu_read_lock();
 
@@ -1440,16 +1439,15 @@ retry:
 			else
 				write_lock(&kvm->mmu_lock);
 
-			rcu_read_lock();
-
 			if (!sp) {
-				ret = -ENOMEM;
 				trace_kvm_mmu_split_huge_page(iter.gfn,
 							      iter.old_spte,
-							      iter.level, ret);
-				break;
+							      iter.level, -ENOMEM);
+				return -ENOMEM;
 			}
 
+			rcu_read_lock();
+
 			iter.yielded = true;
 			continue;
 		}
@@ -1472,7 +1470,7 @@ retry:
 
 	if (sp)
 		tdp_mmu_free_sp(sp);
 
-	return ret;
+	return 0;
 }
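Taken together, patches 4-7 leave the allocation step of the eager split
loop in tdp_mmu_split_huge_pages_root() looking roughly like the following.
This is a condensed reading assembled from the diffs above, not a verbatim
quote of the resulting tree:

	if (!sp) {
		rcu_read_unlock();

		if (shared)
			read_unlock(&kvm->mmu_lock);
		else
			write_unlock(&kvm->mmu_lock);

		/* GFP_KERNEL_ACCOUNT allocation, with all locks dropped. */
		sp = tdp_mmu_alloc_sp_for_split();

		if (shared)
			read_lock(&kvm->mmu_lock);
		else
			write_lock(&kvm->mmu_lock);

		if (!sp) {
			trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte,
						      iter.level, -ENOMEM);
			/* RCU intentionally not reacquired before returning. */
			return -ENOMEM;
		}

		rcu_read_lock();

		/* mmu_lock was dropped; the iterator revalidates before advancing. */
		iter.yielded = true;
		continue;
	}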