Merge tag 'kvm-x86-mmu-6.4-2' of https://github.com/kvm-x86/linux into HEAD

Fix a long-standing flaw in x86's TDP MMU where unloading roots on a vCPU can
result in the root being freed even though the root is completely valid and
can be reused as-is (with a TLB flush).
This commit is contained in: commit 29b38e7650
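The diff below (to KVM's TDP MMU in arch/x86/kvm/mmu/tdp_mmu.c) switches root lifetime to a simple rule: a root is created with two references, one for the vCPU and one for the TDP MMU itself, and only explicit invalidation puts the TDP MMU's reference. The following user-space sketch is a hypothetical model of that rule, not KVM code; struct model_root and its fields merely stand in for the real root's tdp_mmu_root_count and role.invalid state.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct model_root {
        int refcount;   /* stands in for root->tdp_mmu_root_count */
        bool invalid;   /* stands in for root->role.invalid */
};

static void model_put_root(struct model_root *r)
{
        if (--r->refcount == 0) {
                /* With the fix, only an already-invalidated root can be freed. */
                assert(r->invalid);
                printf("root freed\n");
        }
}

int main(void)
{
        /* Created with two references: one for the vCPU, one for the TDP MMU. */
        struct model_root root = { .refcount = 2, .invalid = false };

        /* Unloading the vCPU's root drops only the vCPU's reference... */
        model_put_root(&root);

        /* ...so the root survives, stays valid, and can be reused (with a
         * TLB flush) instead of being rebuilt from scratch. */
        root.refcount++;
        model_put_root(&root);

        /* Only explicit invalidation (memslot update, VM destruction) puts
         * the TDP MMU's reference and actually frees the root. */
        root.invalid = true;
        model_put_root(&root);
        return 0;
}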
@@ -40,7 +40,17 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
 
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
-        /* Also waits for any queued work items. */
+        /*
+         * Invalidate all roots, which besides the obvious, schedules all roots
+         * for zapping and thus puts the TDP MMU's reference to each root, i.e.
+         * ultimately frees all roots.
+         */
+        kvm_tdp_mmu_invalidate_all_roots(kvm);
+
+        /*
+         * Destroying a workqueue also first flushes the workqueue, i.e. no
+         * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
+         */
         destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
 
         WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
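The ordering above relies on a documented workqueue property: destroy_workqueue() drains all pending work items before tearing the workqueue down, which is why kvm_mmu_uninit_tdp_mmu() does not need a separate call to kvm_tdp_mmu_zap_invalidated_roots(). A minimal, hypothetical module demonstrating that property (demo_wq and friends are made-up names; nothing here is KVM code):

#include <linux/delay.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static void demo_fn(struct work_struct *work)
{
        msleep(100);
        pr_info("queued work finished before destroy_workqueue() returned\n");
}

static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
        struct workqueue_struct *demo_wq = alloc_workqueue("demo_wq", 0, 0);

        if (!demo_wq)
                return -ENOMEM;

        queue_work(demo_wq, &demo_work);

        /* Drains demo_work first, then frees the workqueue. */
        destroy_workqueue(demo_wq);
        return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");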
@@ -116,16 +126,6 @@ static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root
         queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
 }
 
-static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
-{
-        union kvm_mmu_page_role role = page->role;
-        role.invalid = true;
-
-        /* No need to use cmpxchg, only the invalid bit can change. */
-        role.word = xchg(&page->role.word, role.word);
-        return role.invalid;
-}
-
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
                           bool shared)
 {
@@ -134,45 +134,12 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
         if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
                 return;
 
-        WARN_ON(!is_tdp_mmu_page(root));
-
         /*
-         * The root now has refcount=0. It is valid, but readers already
-         * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
-         * rejects it. This remains true for the rest of the execution
-         * of this function, because readers visit valid roots only
-         * (except for tdp_mmu_zap_root_work(), which however
-         * does not acquire any reference itself).
-         *
-         * Even though there are flows that need to visit all roots for
-         * correctness, they all take mmu_lock for write, so they cannot yet
-         * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
-         * since the root still has refcount=0.
-         *
-         * However, tdp_mmu_zap_root can yield, and writers do not expect to
-         * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
-         * So the root temporarily gets an extra reference, going to refcount=1
-         * while staying invalid. Readers still cannot acquire any reference;
-         * but writers are now allowed to run if tdp_mmu_zap_root yields and
-         * they might take an extra reference if they themselves yield.
-         * Therefore, when the reference is given back by the worker,
-         * there is no guarantee that the refcount is still 1. If not, whoever
-         * puts the last reference will free the page, but they will not have to
-         * zap the root because a root cannot go from invalid to valid.
+         * The TDP MMU itself holds a reference to each root until the root is
+         * explicitly invalidated, i.e. the final reference should be never be
+         * put for a valid root.
         */
-        if (!kvm_tdp_root_mark_invalid(root)) {
-                refcount_set(&root->tdp_mmu_root_count, 1);
-
-                /*
-                 * Zapping the root in a worker is not just "nice to have";
-                 * it is required because kvm_tdp_mmu_invalidate_all_roots()
-                 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
-                 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
-                 * might return with some roots not zapped yet.
-                 */
-                tdp_mmu_schedule_zap_root(kvm, root);
-                return;
-        }
+        KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
 
         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
         list_del_rcu(&root->link);
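The KVM_BUG_ON() above encodes the new invariant: because the TDP MMU keeps its own reference to every valid root, refcount_dec_and_test() can only reach zero after the root has been marked invalid. Below is a minimal sketch of the underlying refcount_t pattern using a hypothetical struct foo rather than KVM's real structures; the refcount_inc_not_zero() reader path is an assumption about the usual pattern, not a quote from this patch.

#include <linux/refcount.h>
#include <linux/slab.h>

struct foo {
        refcount_t refcount;
};

static struct foo *foo_create(void)
{
        struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

        if (f)
                refcount_set(&f->refcount, 2);  /* e.g. one ref per long-term holder */
        return f;
}

/* Readers: succeeds only while at least one reference is still held. */
static bool foo_tryget(struct foo *f)
{
        return refcount_inc_not_zero(&f->refcount);
}

static void foo_put(struct foo *f)
{
        if (refcount_dec_and_test(&f->refcount))
                kfree(f);
}

The point of refcount_inc_not_zero() is that a reader can never resurrect an object whose count has already hit zero, which is what lets the holder that drops the final reference free the object safely.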
@@ -320,7 +287,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
         root = tdp_mmu_alloc_sp(vcpu);
         tdp_mmu_init_sp(root, NULL, 0, role);
 
-        refcount_set(&root->tdp_mmu_root_count, 1);
+        /*
+         * TDP MMU roots are kept until they are explicitly invalidated, either
+         * by a memslot update or by the destruction of the VM. Initialize the
+         * refcount to two; one reference for the vCPU, and one reference for
+         * the TDP MMU itself, which is held until the root is invalidated and
+         * is ultimately put by tdp_mmu_zap_root_work().
+         */
+        refcount_set(&root->tdp_mmu_root_count, 2);
 
         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
         list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
@@ -946,32 +920,49 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
 /*
  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
  * is about to be zapped, e.g. in response to a memslots update. The actual
- * zapping is performed asynchronously, so a reference is taken on all roots.
- * Using a separate workqueue makes it easy to ensure that the destruction is
- * performed before the "fast zap" completes, without keeping a separate list
- * of invalidated roots; the list is effectively the list of work items in
- * the workqueue.
+ * zapping is performed asynchronously. Using a separate workqueue makes it
+ * easy to ensure that the destruction is performed before the "fast zap"
+ * completes, without keeping a separate list of invalidated roots; the list is
+ * effectively the list of work items in the workqueue.
  *
- * Get a reference even if the root is already invalid, the asynchronous worker
- * assumes it was gifted a reference to the root it processes. Because mmu_lock
- * is held for write, it should be impossible to observe a root with zero refcount,
- * i.e. the list of roots cannot be stale.
- *
- * This has essentially the same effect for the TDP MMU
- * as updating mmu_valid_gen does for the shadow MMU.
+ * Note, the asynchronous worker is gifted the TDP MMU's reference.
+ * See kvm_tdp_mmu_get_vcpu_root_hpa().
  */
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
 {
         struct kvm_mmu_page *root;
 
-        lockdep_assert_held_write(&kvm->mmu_lock);
-        list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
-                if (!root->role.invalid &&
-                    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
+        /*
+         * mmu_lock must be held for write to ensure that a root doesn't become
+         * invalid while there are active readers (invalidating a root while
+         * there are active readers may or may not be problematic in practice,
+         * but it's uncharted territory and not supported).
+         *
+         * Waive the assertion if there are no users of @kvm, i.e. the VM is
+         * being destroyed after all references have been put, or if no vCPUs
+         * have been created (which means there are no roots), i.e. the VM is
+         * being destroyed in an error path of KVM_CREATE_VM.
+         */
+        if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+            refcount_read(&kvm->users_count) && kvm->created_vcpus)
+                lockdep_assert_held_write(&kvm->mmu_lock);
+
+        /*
+         * As above, mmu_lock isn't held when destroying the VM! There can't
+         * be other references to @kvm, i.e. nothing else can invalidate roots
+         * or be consuming roots, but walking the list of roots does need to be
+         * guarded against roots being deleted by the asynchronous zap worker.
+         */
+        rcu_read_lock();
+
+        list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+                if (!root->role.invalid) {
                         root->role.invalid = true;
                         tdp_mmu_schedule_zap_root(kvm, root);
                 }
         }
+
+        rcu_read_unlock();
 }
 
 /*
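The rework above walks kvm->arch.tdp_mmu_roots under rcu_read_lock(), so the walk is safe even when mmu_lock is not held (VM destruction), while the asynchronous zap worker is free to unlink roots concurrently. Here is a self-contained, hypothetical sketch of that RCU list pattern (not KVM code; node and nodes_lock are made-up names):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct node {
        struct list_head link;
        struct rcu_head rcu;
        bool invalid;
};

static LIST_HEAD(nodes);
static DEFINE_SPINLOCK(nodes_lock);

/* Walker/marker side: safe against concurrent deletion of list entries. */
static void mark_all_invalid(void)
{
        struct node *n;

        rcu_read_lock();
        list_for_each_entry_rcu(n, &nodes, link)
                n->invalid = true;
        rcu_read_unlock();
}

/* Deleter side: unlink under the lock, free only after a grace period. */
static void delete_node(struct node *n)
{
        spin_lock(&nodes_lock);
        list_del_rcu(&n->link);
        spin_unlock(&nodes_lock);
        kfree_rcu(n, rcu);
}

The deleter unlinks under a spinlock and defers the free with kfree_rcu(), so a walker that raced with the unlink still reads valid memory until its RCU read-side critical section ends.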