Commit 4286a3ec authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge tag 'kvm-x86-mmu-6.15' of https://github.com/kvm-x86/linux into HEAD

KVM x86/mmu changes for 6.15

Add support for "fast" aging of SPTEs in both the TDP MMU and Shadow MMU, where
"fast" means "without holding mmu_lock".  Not taking mmu_lock allows multiple
aging actions to run in parallel, and more importantly avoids stalling vCPUs,
e.g. due to holding mmu_lock for an extended duration while a vCPU is faulting
in memory.

For the TDP MMU, protect aging via RCU; the page tables are RCU-protected and
KVM doesn't need to access any metadata to age SPTEs.

For the Shadow MMU, use bit 1 of rmap pointers (bit 0 is used to terminate a
list of rmaps) to implement a per-rmap single-bit spinlock.  When aging a gfn,
acquire the rmap's spinlock with read-only permissions, which allows hardening
and optimizing the locking and aging, e.g. locking an rmap for write requires
mmu_lock to also be held.  The lock is NOT a true R/W spinlock, i.e. multiple
concurrent readers aren't supported.

To avoid forcing all SPTE updates to use atomic operations (clearing the
Accessed bit out of mmu_lock makes it inherently volatile), rework and rename
spte_has_volatile_bits() to spte_needs_atomic_update() and deliberately exclude
the Accessed bit.  KVM (and mm/) already tolerates false positives/negatives
for Accessed information, and all testing has shown that reducing the latency
of aging is far more beneficial to overall system performance than providing
"perfect" young/old information.
parents e3353000 0dab791f
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -196,7 +196,7 @@ writable between reading spte and updating spte. Like below case:
The Dirty bit is lost in this case.

In order to avoid this kind of issue, we always treat the spte as "volatile"
if it can be updated out of mmu-lock [see spte_has_volatile_bits()]; it means
if it can be updated out of mmu-lock [see spte_needs_atomic_update()]; it means
the spte is always atomically updated in this case.

3) flush tlbs due to spte updated
@@ -212,7 +212,7 @@ function to update spte (present -> present).

Since the spte is "volatile" if it can be updated out of mmu-lock, we always
atomically update the spte and the race caused by fast page fault can be avoided.
See the comments in spte_has_volatile_bits() and mmu_spte_update().
See the comments in spte_needs_atomic_update() and mmu_spte_update().

Lockless Access Tracking:

+3 −1
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@
#include <linux/kfifo.h>
#include <linux/sched/vhost_task.h>
#include <linux/call_once.h>
#include <linux/atomic.h>

#include <asm/apic.h>
#include <asm/pvclock-abi.h>
@@ -405,7 +406,7 @@ union kvm_cpu_role {
};

struct kvm_rmap_head {
	unsigned long val;
	atomic_long_t val;
};

struct kvm_pio_request {
@@ -1479,6 +1480,7 @@ struct kvm_arch {
	 * tdp_mmu_page set.
	 *
	 * For reads, this list is protected by:
	 *	RCU alone or
	 *	the MMU lock in read mode + RCU or
	 *	the MMU lock in write mode
	 *
+1 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@ config KVM_X86
	select KVM_COMMON
	select KVM_GENERIC_MMU_NOTIFIER
	select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
	select KVM_MMU_LOCKLESS_AGING
	select HAVE_KVM_IRQCHIP
	select HAVE_KVM_PFNCACHE
	select HAVE_KVM_DIRTY_RING_TSO
+269 −96
Original line number Diff line number Diff line
@@ -501,7 +501,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
		return false;
	}

	if (!spte_has_volatile_bits(old_spte))
	if (!spte_needs_atomic_update(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);
@@ -524,7 +524,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
	int level = sptep_to_sp(sptep)->role.level;

	if (!is_shadow_present_pte(old_spte) ||
	    !spte_has_volatile_bits(old_spte))
	    !spte_needs_atomic_update(old_spte))
		__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
	else
		old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
@@ -853,32 +853,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
 * About rmap_head encoding:
 *
 * If the bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
 * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
 * pte_list_desc containing more mappings.
 */
#define KVM_RMAP_MANY	BIT(0)

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
 * operates with mmu_lock held for write), but rmaps can be walked without
 * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
 * being zapped/dropped _while the rmap is locked_.
 *
 * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
 * done while holding mmu_lock for write.  This allows a task walking rmaps
 * without holding mmu_lock to concurrently walk the same entries as a task
 * that is holding mmu_lock but _not_ the rmap lock.  Neither task will modify
 * the rmaps, thus the walks are stable.
 *
 * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
 * only the rmap chains themselves are protected.  E.g. holding an rmap's lock
 * ensures all "struct pte_list_desc" fields are stable.
 */
#define KVM_RMAP_LOCKED	BIT(1)

static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
{
	unsigned long old_val, new_val;

	lockdep_assert_preemption_disabled();

	/*
	 * Elide the lock if the rmap is empty, as lockless walkers (read-only
	 * mode) don't need to (and can't) walk an empty rmap, nor can they add
	 * entries to the rmap.  I.e. the only paths that process empty rmaps
	 * do so while holding mmu_lock for write, and are mutually exclusive.
	 */
	old_val = atomic_long_read(&rmap_head->val);
	if (!old_val)
		return 0;

	do {
		/*
		 * If the rmap is locked, wait for it to be unlocked before
		 * trying to acquire the lock, e.g. to avoid bouncing the cache
		 * line.
		 */
		while (old_val & KVM_RMAP_LOCKED) {
			cpu_relax();
			old_val = atomic_long_read(&rmap_head->val);
		}

		/*
		 * Recheck for an empty rmap, it may have been purged by the
		 * task that held the lock.
		 */
static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
		if (!old_val)
			return 0;

		new_val = old_val | KVM_RMAP_LOCKED;
	/*
	 * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
	 * from being reordered outside of the critical section created by
	 * __kvm_rmap_lock().
	 *
	 * Pairs with the atomic_long_set_release() in __kvm_rmap_unlock().
	 *
	 * For the !old_val case, no ordering is needed, as there is no rmap
	 * to walk.
	 */
	} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));

	/*
	 * Return the old value, i.e. _without_ the LOCKED bit set.  It's
	 * impossible for the return value to be 0 (see above), i.e. the read-
	 * only unlock flow can't get a false positive and fail to unlock.
	 */
	return old_val;
}

/*
 * Lock an rmap on behalf of a caller that holds mmu_lock for write (checked
 * via lockdep).  The return value is whatever __kvm_rmap_lock() hands back.
 */
static unsigned long kvm_rmap_lock(struct kvm *kvm,
				   struct kvm_rmap_head *rmap_head)
{
	unsigned long rmap_val;

	lockdep_assert_held_write(&kvm->mmu_lock);

	rmap_val = __kvm_rmap_lock(rmap_head);
	return rmap_val;
}

/*
 * Publish @val as the new value of @rmap_head using a release store.  @val is
 * expected to NOT carry KVM_RMAP_LOCKED (a violation only triggers
 * KVM_MMU_WARN_ON()), i.e. storing it is what drops the lock bit.
 */
static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
			      unsigned long val)
{
	KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
	/*
	 * Ensure that all accesses to the rmap have completed before unlocking
	 * the rmap.
	 *
	 * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
	 */
	atomic_long_set_release(&rmap_head->val, val);
}

/*
 * Unlock flow for callers that hold mmu_lock for write (asserted via lockdep):
 * hands @new_val to __kvm_rmap_unlock() to be stored as the rmap's value.
 */
static void kvm_rmap_unlock(struct kvm *kvm,
			    struct kvm_rmap_head *rmap_head,
			    unsigned long new_val)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	__kvm_rmap_unlock(rmap_head, new_val);
}

/*
 * Read the current rmap value and mask off the KVM_RMAP_LOCKED bit.
 */
static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
{
	unsigned long raw_val = atomic_long_read(&rmap_head->val);

	return raw_val & ~KVM_RMAP_LOCKED;
}

/*
 * If mmu_lock isn't held, rmaps can only be locked in read-only mode.  The
 * actual locking is the same, but the caller is disallowed from modifying the
 * rmap, and so the unlock flow is a nop if the rmap is/was empty.
 */
static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
{
	unsigned long locked_val;

	/* __kvm_rmap_lock() asserts that preemption is disabled. */
	preempt_disable();

	locked_val = __kvm_rmap_lock(rmap_head);
	if (locked_val)
		return locked_val;

	/*
	 * A zero return means there is nothing for the caller to unlock, so
	 * re-enable preemption here instead of in the unlock path.
	 */
	preempt_enable();
	return 0;
}

static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
				     unsigned long old_val)
{
	/* A zero @old_val means there is nothing to undo; see the guard. */
	if (old_val) {
		/* A read-only holder must put back exactly what it found. */
		KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));

		__kvm_rmap_unlock(rmap_head, old_val);
		preempt_enable();
	}
}

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			u64 *spte, struct kvm_rmap_head *rmap_head)
{
	unsigned long old_val, new_val;
	struct pte_list_desc *desc;
	int count = 0;

	if (!rmap_head->val) {
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & KVM_RMAP_MANY)) {
	old_val = kvm_rmap_lock(kvm, rmap_head);

	if (!old_val) {
		new_val = (unsigned long)spte;
	} else if (!(old_val & KVM_RMAP_MANY)) {
		desc = kvm_mmu_memory_cache_alloc(cache);
		desc->sptes[0] = (u64 *)rmap_head->val;
		desc->sptes[0] = (u64 *)old_val;
		desc->sptes[1] = spte;
		desc->spte_count = 2;
		desc->tail_count = 0;
		rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
		new_val = (unsigned long)desc | KVM_RMAP_MANY;
		++count;
	} else {
		desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
		desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
		count = desc->tail_count + desc->spte_count;

		/*
@@ -887,21 +1028,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
		 */
		if (desc->spte_count == PTE_LIST_EXT) {
			desc = kvm_mmu_memory_cache_alloc(cache);
			desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
			desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
			desc->spte_count = 0;
			desc->tail_count = count;
			rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
			new_val = (unsigned long)desc | KVM_RMAP_MANY;
		} else {
			new_val = old_val;
		}
		desc->sptes[desc->spte_count++] = spte;
	}

	kvm_rmap_unlock(kvm, rmap_head, new_val);

	return count;
}

static void pte_list_desc_remove_entry(struct kvm *kvm,
				       struct kvm_rmap_head *rmap_head,
static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val,
				       struct pte_list_desc *desc, int i)
{
	struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
	struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY);
	int j = head_desc->spte_count - 1;

	/*
@@ -928,9 +1073,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
	 * head at the next descriptor, i.e. the new head.
	 */
	if (!head_desc->more)
		rmap_head->val = 0;
		*rmap_val = 0;
	else
		rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
		*rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
	mmu_free_pte_list_desc(head_desc);
}

@@ -938,24 +1083,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
			    struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	unsigned long rmap_val;
	int i;

	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
		return;
	rmap_val = kvm_rmap_lock(kvm, rmap_head);
	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
		goto out;

	if (!(rmap_head->val & KVM_RMAP_MANY)) {
		if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
			return;
	if (!(rmap_val & KVM_RMAP_MANY)) {
		if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm))
			goto out;

		rmap_head->val = 0;
		rmap_val = 0;
	} else {
		desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
		desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
		while (desc) {
			for (i = 0; i < desc->spte_count; ++i) {
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(kvm, rmap_head,
					pte_list_desc_remove_entry(kvm, &rmap_val,
								   desc, i);
					return;
					goto out;
				}
			}
			desc = desc->more;
@@ -963,6 +1110,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,

		KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
	}

out:
	kvm_rmap_unlock(kvm, rmap_head, rmap_val);
}

static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -977,17 +1127,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
				   struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc, *next;
	unsigned long rmap_val;
	int i;

	if (!rmap_head->val)
	rmap_val = kvm_rmap_lock(kvm, rmap_head);
	if (!rmap_val)
		return false;

	if (!(rmap_head->val & KVM_RMAP_MANY)) {
		mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
	if (!(rmap_val & KVM_RMAP_MANY)) {
		mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val);
		goto out;
	}

	desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
	desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);

	for (; desc; desc = next) {
		for (i = 0; i < desc->spte_count; i++)
@@ -997,20 +1149,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
	}
out:
	/* rmap_head is meaningless now, remember to reset it */
	rmap_head->val = 0;
	kvm_rmap_unlock(kvm, rmap_head, 0);
	return true;
}

unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
	unsigned long rmap_val = kvm_rmap_get(rmap_head);
	struct pte_list_desc *desc;

	if (!rmap_head->val)
	if (!rmap_val)
		return 0;
	else if (!(rmap_head->val & KVM_RMAP_MANY))
	else if (!(rmap_val & KVM_RMAP_MANY))
		return 1;

	desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
	desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
	return desc->tail_count + desc->spte_count;
}

@@ -1053,6 +1206,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 */
struct rmap_iterator {
	/* private fields */
	/*
	 * NOTE(review): not read or written by the rmap walkers visible in this
	 * chunk, and the tag is "struct rmap_head" rather than
	 * "struct kvm_rmap_head" -- confirm whether this field is intended.
	 */
	struct rmap_head *head;
	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
	int pos;			/* index of the sptep */
};
@@ -1067,23 +1221,19 @@ struct rmap_iterator {
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
{
	u64 *sptep;
	unsigned long rmap_val = kvm_rmap_get(rmap_head);

	if (!rmap_head->val)
	if (!rmap_val)
		return NULL;

	if (!(rmap_head->val & KVM_RMAP_MANY)) {
	if (!(rmap_val & KVM_RMAP_MANY)) {
		iter->desc = NULL;
		sptep = (u64 *)rmap_head->val;
		goto out;
		return (u64 *)rmap_val;
	}

	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
	iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
	iter->pos = 0;
	sptep = iter->desc->sptes[iter->pos];
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
	return iter->desc->sptes[iter->pos];
}

/*
@@ -1093,14 +1243,11 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
	u64 *sptep;

	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
				goto out;
			if (iter->desc->sptes[iter->pos])
				return iter->desc->sptes[iter->pos];
		}

		iter->desc = iter->desc->more;
@@ -1108,20 +1255,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
		if (iter->desc) {
			iter->pos = 0;
			/* desc->sptes[0] cannot be NULL */
			sptep = iter->desc->sptes[iter->pos];
			goto out;
			return iter->desc->sptes[iter->pos];
		}
	}

	return NULL;
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))
/*
 * Walk every entry of an rmap chain using rmap_get_first()/rmap_get_next().
 * The SPTE that each returned pointer refers to is not inspected here.
 */
#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)	\
	for (_sptep_ = rmap_get_first(_rmap_head_, _iter_);	\
	     _sptep_; _sptep_ = rmap_get_next(_iter_))

/*
 * Walk an rmap chain and run the loop body only for entries whose SPTE is
 * shadow-present; a non-present SPTE triggers a one-time WARN and is skipped.
 *
 * Note, the final line intentionally has no trailing backslash: a dangling
 * line continuation would splice whatever line follows into the macro.
 */
#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)			\
	__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)			\
		if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_))))

/*
 * Walk an rmap chain, reading each SPTE via mmu_spte_get_lockless() into
 * _spte_, and run the loop body only if that value is shadow-present.
 *
 * _sptep_ receives the pointer to the current SPTE and _spte_ the value read
 * from it.  The macro uses its own _sptep_ parameter for the read, so callers
 * are free to name their pointer variable anything.
 */
#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_)	\
	__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)			\
		if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(_sptep_)))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1207,12 +1358,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		if (spte_ad_need_write_protect(*sptep))
			flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
						    (unsigned long *)sptep);
		else
			flush |= spte_clear_dirty(sptep);
	}

	return flush;
}
@@ -1401,7 +1553,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
	while (++iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);

		if (iterator->rmap->val)
		if (atomic_long_read(&iterator->rmap->val))
			return;
	}

@@ -1533,7 +1685,7 @@ static void __rmap_add(struct kvm *kvm,
	kvm_update_page_stats(kvm, sp->role.level, 1);

	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
	rmap_count = pte_list_add(cache, spte, rmap_head);
	rmap_count = pte_list_add(kvm, cache, spte, rmap_head);

	if (rmap_count > kvm->stat.max_mmu_rmap_size)
		kvm->stat.max_mmu_rmap_size = rmap_count;
@@ -1552,51 +1704,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
}

static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
				   struct kvm_gfn_range *range, bool test_only)
				   struct kvm_gfn_range *range,
				   bool test_only)
{
	struct slot_rmap_walk_iterator iterator;
	struct kvm_rmap_head *rmap_head;
	struct rmap_iterator iter;
	unsigned long rmap_val;
	bool young = false;
	u64 *sptep;
	gfn_t gfn;
	int level;
	u64 spte;

	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
				 range->start, range->end - 1, &iterator) {
		for_each_rmap_spte(iterator.rmap, &iter, sptep) {
			u64 spte = *sptep;
	for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
		for (gfn = range->start; gfn < range->end;
		     gfn += KVM_PAGES_PER_HPAGE(level)) {
			rmap_head = gfn_to_rmap(gfn, level, range->slot);
			rmap_val = kvm_rmap_lock_readonly(rmap_head);

			for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
				if (!is_accessed_spte(spte))
					continue;

			if (test_only)
				if (test_only) {
					kvm_rmap_unlock_readonly(rmap_head, rmap_val);
					return true;
				}

			if (spte_ad_enabled(spte)) {
				if (spte_ad_enabled(spte))
					clear_bit((ffs(shadow_accessed_mask) - 1),
						  (unsigned long *)sptep);
			} else {
				else
					/*
				 * WARN if mmu_spte_update() signals the need
				 * for a TLB flush, as Access tracking a SPTE
				 * should never trigger an _immediate_ flush.
					 * If the following cmpxchg fails, the
					 * spte is being concurrently modified
					 * and should most likely stay young.
					 */
				spte = mark_spte_for_access_track(spte);
				WARN_ON_ONCE(mmu_spte_update(sptep, spte));
			}
					cmpxchg64(sptep, spte,
					      mark_spte_for_access_track(spte));
				young = true;
			}

			kvm_rmap_unlock_readonly(rmap_head, rmap_val);
		}
	}
	return young;
}

static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm)
{
	return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool young = false;

	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_rmap_age_gfn_range(kvm, range, false);

	if (tdp_mmu_enabled)
		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
		young = kvm_tdp_mmu_age_gfn_range(kvm, range);

	if (kvm_may_have_shadow_mmu_sptes(kvm))
		young |= kvm_rmap_age_gfn_range(kvm, range, false);

	return young;
}
@@ -1605,11 +1773,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool young = false;

	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_rmap_age_gfn_range(kvm, range, true);

	if (tdp_mmu_enabled)
		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
		young = kvm_tdp_mmu_test_age_gfn(kvm, range);

	if (young)
		return young;

	if (kvm_may_have_shadow_mmu_sptes(kvm))
		young |= kvm_rmap_age_gfn_range(kvm, range, true);

	return young;
}
@@ -1656,13 +1827,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}

static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
static void mmu_page_add_parent_pte(struct kvm *kvm,
				    struct kvm_mmu_memory_cache *cache,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (!parent_pte)
		return;

	pte_list_add(cache, parent_pte, &sp->parent_ptes);
	pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -2352,7 +2524,7 @@ static void __link_shadow_page(struct kvm *kvm,

	mmu_spte_set(sptep, spte);

	mmu_page_add_parent_pte(cache, sp, sptep);
	mmu_page_add_parent_pte(kvm, cache, sp, sptep);

	/*
	 * The non-direct sub-pagetable must be updated before linking.  For
@@ -2416,7 +2588,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
			 * avoids retaining a large number of stale nested SPs.
			 */
			if (tdp_enabled && invalid_list &&
			    child->role.guest_mode && !child->parent_ptes.val)
			    child->role.guest_mode &&
			    !atomic_long_read(&child->parent_ptes.val))
				return kvm_mmu_prepare_zap_page(kvm, child,
								invalid_list);
		}
+19 −12
Original line number Diff line number Diff line
@@ -129,25 +129,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
}

/*
 * Returns true if the SPTE has bits that may be set without holding mmu_lock.
 * The caller is responsible for checking if the SPTE is shadow-present, and
 * for determining whether or not the caller cares about non-leaf SPTEs.
 * Returns true if the SPTE needs to be updated atomically due to having bits
 * that may be changed without holding mmu_lock, and for which KVM must not
 * lose information.  E.g. KVM must not drop Dirty bit information.  The caller
 * is responsible for checking if the SPTE is shadow-present, and for
 * determining whether or not the caller cares about non-leaf SPTEs.
 */
bool spte_has_volatile_bits(u64 spte)
bool spte_needs_atomic_update(u64 spte)
{
	/* SPTEs can be made Writable by KVM's fast page fault handler. */
	if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
		return true;

	if (is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if (!(spte & shadow_accessed_mask) ||
		    (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
	/*
	 * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked
	 * SPTEs can be restored by KVM's fast page fault handler.
	 */
	if (!spte_ad_enabled(spte))
		return true;
	}

	return false;
	/*
	 * Dirty and Accessed bits can be set by the CPU.  Ignore the Accessed
	 * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't
	 * invalidate TLBs when aging SPTEs, and so it's safe to clobber the
	 * Accessed bit (and rare in practice).
	 */
	return is_writable_pte(spte) && !(spte & shadow_dirty_mask);
}

bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
Loading