Commit 5a1c72e0 authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge tag 'kvm-x86-mmu-6.10' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MMU changes for 6.10:

 - Process TDP MMU SPTEs that are are zapped while holding mmu_lock for read
   after replacing REMOVED_SPTE with '0' and flushing remote TLBs, which allows
   vCPU tasks to repopulate the zapped region while the zapper finishes tearing
   down the old, defunct page tables.

 - Fix a longstanding, likely benign-in-practice race where KVM could fail to
   detect a write from kvm_mmu_track_write() to a shadowed GPTE if the GPTE is
   first page table being shadowed.
parents dee7ea42 226d9b8f
Loading
Loading
Loading
Loading
+17 −3
Original line number Diff line number Diff line
@@ -831,6 +831,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	/*
	 * Ensure indirect_shadow_pages is elevated prior to re-reading guest
	 * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
	 * emulated writes are visible before re-reading guest PTEs, or that
	 * an emulated write will see the elevated count and acquire mmu_lock
	 * to update SPTEs.  Pairs with the smp_mb() in kvm_mmu_track_write().
	 */
	smp_mb();

	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
@@ -5787,10 +5796,15 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
	bool flush = false;

	/*
	 * If we don't have indirect shadow pages, it means no page is
	 * write-protected, so we can exit simply.
	 * When emulating guest writes, ensure the written value is visible to
	 * any task that is handling page faults before checking whether or not
	 * KVM is shadowing a guest PTE.  This ensures either KVM will create
	 * the correct SPTE in the page fault handler, or this task will see
	 * a non-zero indirect_shadow_pages.  Pairs with the smp_mb() in
	 * account_shadowed().
	 */
	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
	smp_mb();
	if (!vcpu->kvm->arch.indirect_shadow_pages)
		return;

	write_lock(&vcpu->kvm->mmu_lock);
+49 −26
Original line number Diff line number Diff line
@@ -530,6 +530,31 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.  On failure, i.e. if a different logical
	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
	 * the current value, so the caller operates on fresh data, e.g. if it
	 * retries tdp_mmu_set_spte_atomic()
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	return 0;
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
@@ -551,27 +576,13 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
	int ret;

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.  On failure, i.e. if a different logical
	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
	 * the current value, so the caller operates on fresh data, e.g. if it
	 * retries tdp_mmu_set_spte_atomic()
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;
	ret = __tdp_mmu_set_spte_atomic(iter, new_spte);
	if (ret)
		return ret;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);
@@ -584,13 +595,17 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
{
	int ret;

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 * Freeze the SPTE by setting it to a special, non-present value. This
	 * will stop other threads from immediately installing a present entry
	 * in its place before the TLBs are flushed.
	 *
	 * Delay processing of the zapped SPTE until after TLBs are flushed and
	 * the REMOVED_SPTE is replaced (see below).
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	ret = __tdp_mmu_set_spte_atomic(iter, REMOVED_SPTE);
	if (ret)
		return ret;

@@ -599,12 +614,20 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present.  Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 * overwrite the special removed SPTE value. Use the raw write helper to
	 * avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);

	/*
	 * Process the zapped SPTE after flushing TLBs, and after replacing
	 * REMOVED_SPTE with 0. This minimizes the amount of time vCPUs are
	 * blocked by the REMOVED_SPTE and reduces contention on the child
	 * SPTEs.
	 */
	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    0, iter->level, true);

	return 0;
}