Merge branch 'kvm-tdp-mmu-atomicity-fix' into HEAD
We are dropping A/D bits (and W bits) in the TDP MMU even when mmu_lock is held for write, because volatile SPTEs can be written by other tasks/vCPUs outside of mmu_lock.

Attempting to prove that bug exposed another notable goof, which has been lurking for a decade, give or take: KVM treats _all_ MMU-writable SPTEs as volatile, even though KVM never clears WRITABLE outside of mmu_lock. As a result, the legacy MMU (and the TDP MMU, if not fixed) uses XCHG to update writable SPTEs.

The fix does not seem to have an easily-measurable effect on performance; page faults are so slow that wasting even a few hundred cycles is dwarfed by the base cost.
commit 4f510c8bb1
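To make the "volatile bits" idea concrete outside the kernel tree, below is a minimal, self-contained C sketch of the pattern this series applies: an entry is only updated with an atomic exchange when bits in it can legitimately change outside the lock (hardware-set Accessed/Dirty bits, or a Writable bit that the lockless fast page fault path may set); an entry with nothing left to race on takes a plain store. The names (pte_has_volatile_bits(), the PTE_* masks) and the bit layout are illustrative stand-ins, not KVM's real SPTE encoding or API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative bit layout, not KVM's real SPTE encoding. */
#define PTE_PRESENT      (1ull << 0)
#define PTE_WRITABLE     (1ull << 1)   /* hardware-writable */
#define PTE_MMU_WRITABLE (1ull << 2)   /* may be made writable locklessly */
#define PTE_ACCESSED     (1ull << 5)   /* set by the CPU */
#define PTE_DIRTY        (1ull << 6)   /* set by the CPU */

/*
 * Roughly mirrors the shape of spte_has_volatile_bits(): true if bits may
 * be set outside the lock and would therefore be lost by a plain store.
 */
static bool pte_has_volatile_bits(uint64_t pte)
{
        /* Write-protected but MMU-writable: the fast path may set WRITABLE. */
        if (!(pte & PTE_WRITABLE) && (pte & PTE_MMU_WRITABLE))
                return true;

        /* The CPU may still set Accessed, or Dirty on a writable entry. */
        if (!(pte & PTE_ACCESSED))
                return true;
        if ((pte & PTE_WRITABLE) && !(pte & PTE_DIRTY))
                return true;

        return false;
}

/*
 * Conditionally-atomic update: returns the value that was actually replaced,
 * which may differ from old_pte if volatile bits raced in.
 */
static uint64_t pte_write(_Atomic uint64_t *ptep, uint64_t old_pte,
                          uint64_t new_pte)
{
        if ((old_pte & PTE_PRESENT) && pte_has_volatile_bits(old_pte))
                return atomic_exchange(ptep, new_pte);

        atomic_store(ptep, new_pte);
        return old_pte;
}

int main(void)
{
        _Atomic uint64_t pte = PTE_PRESENT | PTE_MMU_WRITABLE;
        uint64_t old = pte_write(&pte, PTE_PRESENT | PTE_MMU_WRITABLE, 0);

        printf("zapped, previous value 0x%llx\n", (unsigned long long)old);
        return 0;
}

The point the commit message makes is the converse case: an entry that is already Writable, with its A/D bits settled, has nothing left that can change under the lock holder, so the expensive exchange can be skipped.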
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
@@ -473,30 +473,6 @@ retry:
 }
 #endif
 
-static bool spte_has_volatile_bits(u64 spte)
-{
-        if (!is_shadow_present_pte(spte))
-                return false;
-
-        /*
-         * Always atomically update spte if it can be updated
-         * out of mmu-lock, it can ensure dirty bit is not lost,
-         * also, it can help us to get a stable is_writable_pte()
-         * to ensure tlb flush is not missed.
-         */
-        if (spte_can_locklessly_be_made_writable(spte) ||
-            is_access_track_spte(spte))
-                return true;
-
-        if (spte_ad_enabled(spte)) {
-                if ((spte & shadow_accessed_mask) == 0 ||
-                    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
-                        return true;
-        }
-
-        return false;
-}
-
 /* Rules for using mmu_spte_set:
  * Set the sptep from nonpresent to present.
  * Note: the sptep being assigned *must* be either not present
@@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
          * we always atomically update it, see the comments in
          * spte_has_volatile_bits().
          */
-        if (spte_can_locklessly_be_made_writable(old_spte) &&
+        if (is_mmu_writable_spte(old_spte) &&
             !is_writable_pte(new_spte))
                 flush = true;
 
@@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
         u64 old_spte = *sptep;
         int level = sptep_to_sp(sptep)->role.level;
 
-        if (!spte_has_volatile_bits(old_spte))
+        if (!is_shadow_present_pte(old_spte) ||
+            !spte_has_volatile_bits(old_spte))
                 __update_clear_spte_fast(sptep, 0ull);
         else
                 old_spte = __update_clear_spte_slow(sptep, 0ull);
@@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
         u64 spte = *sptep;
 
         if (!is_writable_pte(spte) &&
-            !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+            !(pt_protect && is_mmu_writable_spte(spte)))
                 return false;
 
         rmap_printk("spte %p %llx\n", sptep, *sptep);
@@ -3196,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                  * be removed in the fast path only if the SPTE was
                  * write-protected for dirty-logging or access tracking.
                  */
-                if (fault->write &&
-                    spte_can_locklessly_be_made_writable(spte)) {
+                if (fault->write && is_mmu_writable_spte(spte)) {
                         new_spte |= PT_WRITABLE_MASK;
 
                         /*
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
@@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
                              E820_TYPE_RAM);
 }
 
+/*
+ * Returns true if the SPTE has bits that may be set without holding mmu_lock.
+ * The caller is responsible for checking if the SPTE is shadow-present, and
+ * for determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_has_volatile_bits(u64 spte)
+{
+        /*
+         * Always atomically update spte if it can be updated
+         * out of mmu-lock, it can ensure dirty bit is not lost,
+         * also, it can help us to get a stable is_writable_pte()
+         * to ensure tlb flush is not missed.
+         */
+        if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+                return true;
+
+        if (is_access_track_spte(spte))
+                return true;
+
+        if (spte_ad_enabled(spte)) {
+                if (!(spte & shadow_accessed_mask) ||
+                    (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+                        return true;
+        }
+
+        return false;
+}
+
 bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                const struct kvm_memory_slot *slot,
                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
@@ -390,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte)
                   "kvm: Writable SPTE is not MMU-writable: %llx", spte);
 }
 
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+static inline bool is_mmu_writable_spte(u64 spte)
 {
         return spte & shadow_mmu_writable_mask;
 }
@@ -404,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
         return gen;
 }
 
+bool spte_has_volatile_bits(u64 spte);
+
 bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                const struct kvm_memory_slot *slot,
                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
@@ -6,6 +6,7 @@
 #include <linux/kvm_host.h>
 
 #include "mmu.h"
+#include "spte.h"
 
 /*
  * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
 {
         return READ_ONCE(*rcu_dereference(sptep));
 }
-static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
 {
-        WRITE_ONCE(*rcu_dereference(sptep), val);
+        return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+        WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+                                         u64 new_spte, int level)
+{
+        /*
+         * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+         * volatile bits, i.e. has bits that can be set outside of mmu_lock.
+         * The Writable bit can be set by KVM's fast page fault handler, and
+         * Accessed and Dirty bits can be set by the CPU.
+         *
+         * Note, non-leaf SPTEs do have Accessed bits and those bits are
+         * technically volatile, but KVM doesn't consume the Accessed bit of
+         * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit.  This
+         * logic needs to be reassessed if KVM were to use non-leaf Accessed
+         * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+         */
+        if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
+            spte_has_volatile_bits(old_spte))
+                return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+        __kvm_tdp_mmu_write_spte(sptep, new_spte);
+        return old_spte;
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
         tdp_mmu_unlink_sp(kvm, sp, shared);
 
         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-                u64 *sptep = rcu_dereference(pt) + i;
+                tdp_ptep_t sptep = pt + i;
                 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
-                u64 old_child_spte;
+                u64 old_spte;
 
                 if (shared) {
                         /*
@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
                          * value to the removed SPTE value.
                          */
                         for (;;) {
-                                old_child_spte = xchg(sptep, REMOVED_SPTE);
-                                if (!is_removed_spte(old_child_spte))
+                                old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+                                if (!is_removed_spte(old_spte))
                                         break;
                                 cpu_relax();
                         }
@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
                          * are guarded by the memslots generation, not by being
                          * unreachable.
                          */
-                        old_child_spte = READ_ONCE(*sptep);
-                        if (!is_shadow_present_pte(old_child_spte))
+                        old_spte = kvm_tdp_mmu_read_spte(sptep);
+                        if (!is_shadow_present_pte(old_spte))
                                 continue;
 
                         /*
-                         * Marking the SPTE as a removed SPTE is not
-                         * strictly necessary here as the MMU lock will
-                         * stop other threads from concurrently modifying
-                         * this SPTE. Using the removed SPTE value keeps
-                         * the two branches consistent and simplifies
-                         * the function.
+                         * Use the common helper instead of a raw WRITE_ONCE as
+                         * the SPTE needs to be updated atomically if it can be
+                         * modified by a different vCPU outside of mmu_lock.
+                         * Even though the parent SPTE is !PRESENT, the TLB
+                         * hasn't yet been flushed, and both Intel and AMD
+                         * document that A/D assists can use upper-level PxE
+                         * entries that are cached in the TLB, i.e. the CPU can
+                         * still access the page and mark it dirty.
+                         *
+                         * No retry is needed in the atomic update path as the
+                         * sole concern is dropping a Dirty bit, i.e. no other
+                         * task can zap/remove the SPTE as mmu_lock is held for
+                         * write.  Marking the SPTE as a removed SPTE is not
+                         * strictly necessary for the same reason, but using
+                         * the remove SPTE value keeps the shared/exclusive
+                         * paths consistent and allows the handle_changed_spte()
+                         * call below to hardcode the new value to REMOVED_SPTE.
+                         *
+                         * Note, even though dropping a Dirty bit is the only
+                         * scenario where a non-atomic update could result in a
+                         * functional bug, simply checking the Dirty bit isn't
+                         * sufficient as a fast page fault could read the upper
+                         * level SPTE before it is zapped, and then make this
+                         * target SPTE writable, resume the guest, and set the
+                         * Dirty bit between reading the SPTE above and writing
+                         * it here.
                          */
-                        WRITE_ONCE(*sptep, REMOVED_SPTE);
+                        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+                                                          REMOVED_SPTE, level);
                 }
                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
-                                    old_child_spte, REMOVED_SPTE, level,
-                                    shared);
+                                    old_spte, REMOVED_SPTE, level, shared);
         }
 
         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                            KVM_PAGES_PER_HPAGE(iter->level));
 
         /*
-         * No other thread can overwrite the removed SPTE as they
-         * must either wait on the MMU lock or use
-         * tdp_mmu_set_spte_atomic which will not overwrite the
-         * special removed SPTE value. No bookkeeping is needed
-         * here since the SPTE is going from non-present
-         * to non-present.
+         * No other thread can overwrite the removed SPTE as they must either
+         * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+         * overwrite the special removed SPTE value. No bookkeeping is needed
+         * here since the SPTE is going from non-present to non-present.  Use
+         * the raw write helper to avoid an unnecessary check on volatile bits.
          */
-        kvm_tdp_mmu_write_spte(iter->sptep, 0);
+        __kvm_tdp_mmu_write_spte(iter->sptep, 0);
 
         return 0;
 }
@@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
  * unless performing certain dirty logging operations.
  * Leaving record_dirty_log unset in that case prevents page
  * writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had voldatile bits.
  */
-static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
                               u64 old_spte, u64 new_spte, gfn_t gfn, int level,
                               bool record_acc_track, bool record_dirty_log)
 {
         lockdep_assert_held_write(&kvm->mmu_lock);
@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
          */
         WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
 
-        kvm_tdp_mmu_write_spte(sptep, new_spte);
+        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
 
         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
 
@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
         if (record_dirty_log)
                 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
                                               new_spte, level);
+        return old_spte;
 }
 
 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 {
         WARN_ON_ONCE(iter->yielded);
 
-        __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
-                           new_spte, iter->gfn, iter->level,
-                           record_acc_track, record_dirty_log);
+        iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+                                            iter->old_spte, new_spte,
+                                            iter->gfn, iter->level,
+                                            record_acc_track, record_dirty_log);
 }
 
 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
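A note on the tdp_iter.h helpers above: callers of the new kvm_tdp_mmu_write_spte() must treat its return value, not their earlier snapshot, as the authoritative old SPTE, because volatile bits may have been set between the read and the write (which is why __tdp_mmu_set_spte() and _tdp_mmu_set_spte() now propagate the returned value). Below is a standalone illustration of that contract using C11 atomics; the names are hypothetical stand-ins, not the kernel helpers.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRY_PRESENT (1ull << 0)
#define ENTRY_DIRTY   (1ull << 6)

/*
 * Hypothetical stand-in for kvm_tdp_mmu_write_spte_atomic(): the exchange
 * returns whatever was really in the entry, including bits set by the CPU
 * after the caller's earlier read.
 */
static uint64_t write_entry_atomic(_Atomic uint64_t *ptep, uint64_t new_val)
{
        return atomic_exchange(ptep, new_val);
}

int main(void)
{
        _Atomic uint64_t entry = ENTRY_PRESENT;
        uint64_t snapshot = atomic_load(&entry);

        /* Simulate the CPU setting the Dirty bit after the snapshot was taken. */
        atomic_fetch_or(&entry, ENTRY_DIRTY);

        /*
         * A plain store based on "snapshot" would silently drop the Dirty bit;
         * the exchange hands the real old value back so the caller can feed it
         * to dirty-logging / access-tracking bookkeeping.
         */
        uint64_t old = write_entry_atomic(&entry, 0);

        printf("snapshot 0x%llx, actual old value 0x%llx\n",
               (unsigned long long)snapshot, (unsigned long long)old);
        return 0;
}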