Commit 5b0d0d85 authored by Paolo Bonzini
Browse files

Merge tag 'kvm-x86-mmu-6.18' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MMU changes for 6.18

 - Recover possible NX huge pages within the TDP MMU under read lock to
   reduce guest jitter when restoring NX huge pages.

 - Return -EAGAIN during prefault if userspace concurrently deletes/moves the
   relevant memslot to fix an issue where prefaulting could deadlock with the
   memslot update.

 - Don't retry in TDX's anti-zero-step mitigation if the target memslot is
   invalid, i.e. is being deleted or moved, to fix a deadlock scenario similar
   to the aforementioned prefaulting case.
parents 99cab802 2bc2694f
Loading
Loading
Loading
Loading
+26 −13
Original line number Diff line number Diff line
@@ -1348,6 +1348,30 @@ enum kvm_apicv_inhibit {
	__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED),	\
	__APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)

/*
 * Tracking state for shadow pages that might be replaceable by an NX huge
 * page.  struct kvm_arch holds one instance per MMU type (see
 * enum kvm_mmu_type).
 */
struct kvm_possible_nx_huge_pages {
	/*
	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
	 * replaced by an NX huge page.  A shadow page is on this list if its
	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
	 * and there are no other conditions that prevent a huge page, e.g.
	 * the backing host page is huge, dirty logging is not enabled for its
	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
	 * guarantee an NX huge page will be created in its stead, e.g. if the
	 * guest attempts to execute from the region then KVM obviously can't
	 * create an NX huge page (without hanging the guest).
	 */
	struct list_head pages;
	/*
	 * Number of shadow pages currently on @pages; incremented when a page
	 * is tracked and decremented when it is untracked.
	 */
	u64 nr_pages;
};

/*
 * Identifies the MMU a shadow page belongs to for the purpose of possible
 * NX huge page tracking; used as the index into
 * kvm_arch.possible_nx_huge_pages[].
 */
enum kvm_mmu_type {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	/* Only defined when CONFIG_X86_64 is set. */
	KVM_TDP_MMU,
#endif
	/* Count of MMU types above; sizes the per-type array and bounds loops. */
	KVM_NR_MMU_TYPES,
};

struct kvm_arch {
	unsigned long n_used_mmu_pages;
	unsigned long n_requested_mmu_pages;
@@ -1360,18 +1384,7 @@ struct kvm_arch {
	bool pre_fault_allowed;
	struct hlist_head *mmu_page_hash;
	struct list_head active_mmu_pages;
	/*
	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
	 * replaced by an NX huge page.  A shadow page is on this list if its
	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
	 * and there are no other conditions that prevent a huge page, e.g.
	 * the backing host page is huge, dirty logging is not enabled for its
	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
	 * guarantee an NX huge page will be created in its stead, e.g. if the
	 * guest attempts to execute from the region then KVM obviously can't
	 * create an NX huge page (without hanging the guest).
	 */
	struct list_head possible_nx_huge_pages;
	struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES];
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
	struct kvm_page_track_notifier_head track_notifier_head;
#endif
@@ -1526,7 +1539,7 @@ struct kvm_arch {
	 * is held in read mode:
	 *  - tdp_mmu_roots (above)
	 *  - the link field of kvm_mmu_page structs used by the TDP MMU
	 *  - possible_nx_huge_pages;
	 *  - possible_nx_huge_pages[KVM_TDP_MMU];
	 *  - the possible_nx_huge_page_link field of kvm_mmu_page structs used
	 *    by the TDP MMU
	 * Because the lock is only taken within the MMU lock, strictly
+107 −54
Original line number Diff line number Diff line
@@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}

void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				 enum kvm_mmu_type mmu_type)
{
	/*
	 * If it's possible to replace the shadow page with an NX huge page,
@@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
		return;

	++kvm->stat.nx_lpage_splits;
	++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
	list_add_tail(&sp->possible_nx_huge_page_link,
		      &kvm->arch.possible_nx_huge_pages);
		      &kvm->arch.possible_nx_huge_pages[mmu_type].pages);
}

static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
	sp->nx_huge_page_disallowed = true;

	if (nx_huge_page_possible)
		track_possible_nx_huge_page(kvm, sp);
		track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -819,12 +821,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				   enum kvm_mmu_type mmu_type)
{
	/*
	 * Only pages whose link is currently on a list are accounted; for any
	 * other page there is nothing to undo.
	 */
	if (!list_empty(&sp->possible_nx_huge_page_link)) {
		/* Drop the VM-wide stat and the per-MMU count, then unlink. */
		--kvm->stat.nx_lpage_splits;
		--kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
		list_del_init(&sp->possible_nx_huge_page_link);
	}
}

@@ -832,7 +836,7 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	sp->nx_huge_page_disallowed = false;

	untrack_possible_nx_huge_page(kvm, sp);
	untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
}

static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
@@ -4663,10 +4667,16 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
	/*
	 * Retry the page fault if the gfn hit a memslot that is being deleted
	 * or moved.  This ensures any existing SPTEs for the old memslot will
	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.  Punt the
	 * error to userspace if this is a prefault, as KVM's prefaulting ABI
	 * doesn't provide the same forward progress guarantees as KVM_RUN.
	 */
	if (slot->flags & KVM_MEMSLOT_INVALID)
	if (slot->flags & KVM_MEMSLOT_INVALID) {
		if (fault->prefetch)
			return -EAGAIN;

		return RET_PF_RETRY;
	}

	if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
		/*
@@ -6751,11 +6761,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)

int kvm_mmu_init_vm(struct kvm *kvm)
{
	int r;
	int r, i;

	kvm->arch.shadow_mmio_value = shadow_mmio_value;
	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
		INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

	if (tdp_mmu_enabled) {
@@ -7596,18 +7607,63 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
	return err;
}

static void kvm_recover_nx_huge_pages(struct kvm *kvm)
/*
 * Compute how many possible NX huge pages of the given MMU type should be
 * zapped: the tracked page count divided by the recovery ratio, rounded up.
 * A ratio of zero yields zero pages.
 */
static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
					  enum kvm_mmu_type mmu_type)
{
	unsigned long nr_tracked;
	unsigned int divisor;

	/*
	 * Read each value a single time so that the zero check and the
	 * division below operate on the same ratio.
	 */
	nr_tracked = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
	divisor = READ_ONCE(nx_huge_pages_recovery_ratio);

	if (!divisor)
		return 0;

	return DIV_ROUND_UP(nr_tracked, divisor);
}

/*
 * Return true if dirty tracking is enabled for the memslot that contains
 * @sp's gfn.  Returns false if no memslot has dirty logging enabled, or
 * (with a one-time warning) if no memslot is found for the gfn.
 */
static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
					     struct kvm_mmu_page *sp)
{
	struct kvm_memory_slot *slot;

	/*
	 * Skip the memslot lookup if dirty tracking can't possibly be enabled,
	 * as memslot lookups are relatively expensive.
	 *
	 * If a memslot update is in progress, reading an incorrect value of
	 * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
	 * zero, KVM will do an unnecessary memslot lookup; if it is becoming
	 * nonzero, the page will be zapped unnecessarily.  Either way, this
	 * only affects efficiency in racy situations, and not correctness.
	 */
	if (!atomic_read(&kvm->nr_memslots_dirty_logging))
		return false;

	/* Look the slot up in the memslot set that matches the page's role. */
	slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn);
	if (WARN_ON_ONCE(!slot))
		return false;

	return kvm_slot_dirty_track_enabled(slot);
}

static void kvm_recover_nx_huge_pages(struct kvm *kvm,
				      const enum kvm_mmu_type mmu_type)
{
#ifdef CONFIG_X86_64
	const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
	spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock;
#else
	const bool is_tdp_mmu = false;
	spinlock_t *tdp_mmu_pages_lock = NULL;
#endif
	unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
	struct list_head *nx_huge_pages;
	struct kvm_mmu_page *sp;
	unsigned int ratio;
	LIST_HEAD(invalid_list);
	bool flush = false;
	ulong to_zap;
	int rcu_idx;

	nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;

	rcu_idx = srcu_read_lock(&kvm->srcu);
	if (is_tdp_mmu)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	/*
@@ -7617,11 +7673,15 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
	 */
	rcu_read_lock();

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
	for ( ; to_zap; --to_zap) {
		if (list_empty(&kvm->arch.possible_nx_huge_pages))
		if (is_tdp_mmu)
			spin_lock(tdp_mmu_pages_lock);

		if (list_empty(nx_huge_pages)) {
			if (is_tdp_mmu)
				spin_unlock(tdp_mmu_pages_lock);
			break;
		}

		/*
		 * We use a separate list instead of just using active_mmu_pages
@@ -7630,56 +7690,44 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
		 * the total number of shadow pages.  And because the TDP MMU
		 * doesn't use active_mmu_pages.
		 */
		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
		sp = list_first_entry(nx_huge_pages,
				      struct kvm_mmu_page,
				      possible_nx_huge_page_link);
		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
		WARN_ON_ONCE(!sp->role.direct);

		/*
		 * Unaccount and do not attempt to recover any NX Huge Pages
		 * that are being dirty tracked, as they would just be faulted
		 * back in as 4KiB pages. The NX Huge Pages in this slot will be
		 * recovered, along with all the other huge pages in the slot,
		 * when dirty logging is disabled.
		 *
		 * Since gfn_to_memslot() is relatively expensive, it helps to
		 * skip it if the test cannot possibly return true.  On the
		 * other hand, if any memslot has logging enabled, chances are
		 * good that all of them do, in which case unaccount_nx_huge_page()
		 * is much cheaper than zapping the page.
		 *
		 * If a memslot update is in progress, reading an incorrect value
		 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
		 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
		 * it is becoming nonzero, the page will be zapped unnecessarily.
		 * Either way, this only affects efficiency in racy situations,
		 * and not correctness.
		 */
		slot = NULL;
		if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
			struct kvm_memslots *slots;
		unaccount_nx_huge_page(kvm, sp);

			slots = kvm_memslots_for_spte_role(kvm, sp->role);
			slot = __gfn_to_memslot(slots, sp->gfn);
			WARN_ON_ONCE(!slot);
		}
		if (is_tdp_mmu)
			spin_unlock(tdp_mmu_pages_lock);

		if (slot && kvm_slot_dirty_track_enabled(slot))
			unaccount_nx_huge_page(kvm, sp);
		else if (is_tdp_mmu_page(sp))
			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
		/*
		 * Do not attempt to recover any NX Huge Pages that are being
		 * dirty tracked, as they would just be faulted back in as 4KiB
		 * pages. The NX Huge Pages in this slot will be recovered,
		 * along with all the other huge pages in the slot, when dirty
		 * logging is disabled.
		 */
		if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
			if (is_tdp_mmu)
				flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
			else
				kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);

		}

		WARN_ON_ONCE(sp->nx_huge_page_disallowed);

		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
			rcu_read_unlock();

			if (is_tdp_mmu)
				cond_resched_rwlock_read(&kvm->mmu_lock);
			else
				cond_resched_rwlock_write(&kvm->mmu_lock);
			flush = false;

			flush = false;
			rcu_read_lock();
		}
	}
@@ -7687,6 +7735,9 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)

	rcu_read_unlock();

	if (is_tdp_mmu)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, rcu_idx);
}
@@ -7698,9 +7749,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data)
static bool kvm_nx_huge_page_recovery_worker(void *data)
{
	struct kvm *kvm = data;
	long remaining_time;
	bool enabled;
	uint period;
	long remaining_time;
	int i;

	enabled = calc_nx_huge_pages_recovery_period(&period);
	if (!enabled)
@@ -7715,7 +7767,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
	}

	__set_current_state(TASK_RUNNING);
	kvm_recover_nx_huge_pages(kvm);
	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
		kvm_recover_nx_huge_pages(kvm, i);
	kvm->arch.nx_huge_page_last = get_jiffies_64();
	return true;
}
+4 −2
Original line number Diff line number Diff line
@@ -416,7 +416,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);

void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				 enum kvm_mmu_type mmu_type);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				   enum kvm_mmu_type mmu_type);

#endif /* __KVM_X86_MMU_INTERNAL_H */
+39 −10
Original line number Diff line number Diff line
@@ -355,7 +355,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);
	untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

@@ -925,23 +925,52 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
					   struct kvm_mmu_page *sp)
{
	u64 old_spte;
	struct tdp_iter iter = {
		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
		.sptep = sp->ptep,
		.level = sp->role.level + 1,
		.gfn = sp->gfn,
		.as_id = kvm_mmu_page_as_id(sp),
	};

	lockdep_assert_held_read(&kvm->mmu_lock);

	if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
		return false;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 * Root shadow pages don't have a parent page table and thus no
	 * associated entry, but they can never be possible NX huge pages.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
	/*
	 * Since mmu_lock is held in read mode, it's possible another task has
	 * already modified the SPTE. Zap the SPTE if and only if the SPTE
	 * points at the SP's page table, as checking shadow-present isn't
	 * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
	 * another SP. Note, spte_to_child_pt() also checks that the SPTE is
	 * shadow-present, i.e. guards against zapping a frozen SPTE.
	 */
	if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
		return false;

	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
	/*
	 * If a different task modified the SPTE, then it should be impossible
	 * for the SPTE to still be used for the to-be-zapped SP. Non-leaf
	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
	 * creating non-leaf SPTEs, and all other bits are immutable for non-
	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
	 * zapping and replacement.
	 */
	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
		WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
		return false;
	}

	return true;
}
@@ -1303,7 +1332,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp);
				track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}
+2 −1
Original line number Diff line number Diff line
@@ -64,7 +64,8 @@ static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
}

bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
					   struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
				  enum kvm_tdp_mmu_root_types root_types);
Loading