Commit 67778856 authored by Vipin Sharma, committed by Sean Christopherson
Browse files

KVM: x86/mmu: Track possible NX huge pages separately for TDP vs. Shadow MMU



Track possible NX huge pages for the TDP MMU separately from Shadow MMUs
in anticipation of doing recovery for the TDP MMU while holding mmu_lock
for read instead of write.

Use a small structure to hold the list of pages along with the number of
pages/entries in the list, as relying on kvm->stat.nx_lpage_splits to
calculate the number of pages to recover would result in over-zapping
when both TDP and Shadow MMUs are active.

Suggested-by: Sean Christopherson <seanjc@google.com>
Suggested-by: David Matlack <dmatlack@google.com>
Signed-off-by: Vipin Sharma <vipinsh@google.com>
Co-developed-by: James Houghton <jthoughton@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Link: https://lore.kernel.org/r/20250707224720.4016504-2-jthoughton@google.com


[sean: rewrite changelog, use #ifdef instead of dummy KVM_TDP_MMU #define]
Signed-off-by: Sean Christopherson <seanjc@google.com>
parent c17b750b
Loading
Loading
Loading
Loading
+26 −13
Original line number Diff line number Diff line
@@ -1348,6 +1348,30 @@ enum kvm_apicv_inhibit {
	__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED),	\
	__APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)

/*
 * Tracks the shadow pages that are candidates for NX huge page recovery,
 * together with a count of how many are currently tracked.  struct kvm_arch
 * holds one instance per MMU type (see enum kvm_mmu_type).
 */
struct kvm_possible_nx_huge_pages {
	/*
	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
	 * replaced by an NX huge page.  A shadow page is on this list if its
	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
	 * and there are no other conditions that prevent a huge page, e.g.
	 * the backing host page is huge, dirty logging is not enabled for its
	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
	 * guarantee an NX huge page will be created in its stead, e.g. if the
	 * guest attempts to execute from the region then KVM obviously can't
	 * create an NX huge page (without hanging the guest).
	 */
	struct list_head pages;
	/*
	 * Number of entries on @pages.  Incremented when a page is added to
	 * the list and decremented when one is removed; the recovery path
	 * reads it to decide how many pages to zap for this MMU type.
	 */
	u64 nr_pages;
};

/*
 * Identifies which MMU a tracked page belongs to; used as the index into
 * kvm_arch.possible_nx_huge_pages[].  KVM_TDP_MMU only exists when
 * CONFIG_X86_64 is set, so code that names it must be guarded accordingly.
 * KVM_NR_MMU_TYPES must stay last: it is the array size and the loop bound
 * when iterating over all MMU types.
 */
enum kvm_mmu_type {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	KVM_TDP_MMU,
#endif
	KVM_NR_MMU_TYPES,
};

struct kvm_arch {
	unsigned long n_used_mmu_pages;
	unsigned long n_requested_mmu_pages;
@@ -1360,18 +1384,7 @@ struct kvm_arch {
	bool pre_fault_allowed;
	struct hlist_head *mmu_page_hash;
	struct list_head active_mmu_pages;
	/*
	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
	 * replaced by an NX huge page.  A shadow page is on this list if its
	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
	 * and there are no other conditions that prevent a huge page, e.g.
	 * the backing host page is huge, dirtly logging is not enabled for its
	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
	 * guarantee an NX huge page will be created in its stead, e.g. if the
	 * guest attempts to execute from the region then KVM obviously can't
	 * create an NX huge page (without hanging the guest).
	 */
	struct list_head possible_nx_huge_pages;
	struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES];
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
	struct kvm_page_track_notifier_head track_notifier_head;
#endif
@@ -1526,7 +1539,7 @@ struct kvm_arch {
	 * is held in read mode:
	 *  - tdp_mmu_roots (above)
	 *  - the link field of kvm_mmu_page structs used by the TDP MMU
	 *  - possible_nx_huge_pages;
	 *  - possible_nx_huge_pages[KVM_TDP_MMU];
	 *  - the possible_nx_huge_page_link field of kvm_mmu_page structs used
	 *    by the TDP MMU
	 * Because the lock is only taken within the MMU lock, strictly
+40 −19
Original line number Diff line number Diff line
@@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}

void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				 enum kvm_mmu_type mmu_type)
{
	/*
	 * If it's possible to replace the shadow page with an NX huge page,
@@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
		return;

	++kvm->stat.nx_lpage_splits;
	++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
	list_add_tail(&sp->possible_nx_huge_page_link,
		      &kvm->arch.possible_nx_huge_pages);
		      &kvm->arch.possible_nx_huge_pages[mmu_type].pages);
}

static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
	sp->nx_huge_page_disallowed = true;

	if (nx_huge_page_possible)
		track_possible_nx_huge_page(kvm, sp);
		track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -819,12 +821,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				   enum kvm_mmu_type mmu_type)
{
	if (list_empty(&sp->possible_nx_huge_page_link))
		return;

	--kvm->stat.nx_lpage_splits;
	--kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
	list_del_init(&sp->possible_nx_huge_page_link);
}

@@ -832,7 +836,7 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	sp->nx_huge_page_disallowed = false;

	untrack_possible_nx_huge_page(kvm, sp);
	untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
}

static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
@@ -6737,11 +6741,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)

int kvm_mmu_init_vm(struct kvm *kvm)
{
	int r;
	int r, i;

	kvm->arch.shadow_mmio_value = shadow_mmio_value;
	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
		INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

	if (tdp_mmu_enabled) {
@@ -7582,16 +7587,32 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
	return err;
}

static void kvm_recover_nx_huge_pages(struct kvm *kvm)
/*
 * Compute how many possible NX huge pages to try to recover for @mmu_type
 * in one pass: the number of pages tracked for that MMU type divided by
 * nx_huge_pages_recovery_ratio, rounded up.  Returns 0 when the ratio is 0,
 * which also avoids a division by zero.  Both the page count and the ratio
 * are sampled once with READ_ONCE().
 */
static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
					  enum kvm_mmu_type mmu_type)
{
	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
	unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
	unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio);

	return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
}

static void kvm_recover_nx_huge_pages(struct kvm *kvm,
				      enum kvm_mmu_type mmu_type)
{
#ifdef CONFIG_X86_64
	const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
#else
	const bool is_tdp_mmu = false;
#endif
	unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
	struct list_head *nx_huge_pages;
	struct kvm_memory_slot *slot;
	int rcu_idx;
	struct kvm_mmu_page *sp;
	unsigned int ratio;
	LIST_HEAD(invalid_list);
	bool flush = false;
	ulong to_zap;
	int rcu_idx;

	nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;

	rcu_idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);
@@ -7603,10 +7624,8 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
	 */
	rcu_read_lock();

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
	for ( ; to_zap; --to_zap) {
		if (list_empty(&kvm->arch.possible_nx_huge_pages))
		if (list_empty(nx_huge_pages))
			break;

		/*
@@ -7616,7 +7635,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
		 * the total number of shadow pages.  And because the TDP MMU
		 * doesn't use active_mmu_pages.
		 */
		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
		sp = list_first_entry(nx_huge_pages,
				      struct kvm_mmu_page,
				      possible_nx_huge_page_link);
		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
@@ -7653,7 +7672,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)

		if (slot && kvm_slot_dirty_track_enabled(slot))
			unaccount_nx_huge_page(kvm, sp);
		else if (is_tdp_mmu_page(sp))
		else if (is_tdp_mmu)
			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
		else
			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -7684,9 +7703,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data)
static bool kvm_nx_huge_page_recovery_worker(void *data)
{
	struct kvm *kvm = data;
	long remaining_time;
	bool enabled;
	uint period;
	long remaining_time;
	int i;

	enabled = calc_nx_huge_pages_recovery_period(&period);
	if (!enabled)
@@ -7701,7 +7721,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
	}

	__set_current_state(TASK_RUNNING);
	kvm_recover_nx_huge_pages(kvm);
	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
		kvm_recover_nx_huge_pages(kvm, i);
	kvm->arch.nx_huge_page_last = get_jiffies_64();
	return true;
}
+4 −2
Original line number Diff line number Diff line
@@ -416,7 +416,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);

void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				 enum kvm_mmu_type mmu_type);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				   enum kvm_mmu_type mmu_type);

#endif /* __KVM_X86_MMU_INTERNAL_H */
+2 −2
Original line number Diff line number Diff line
@@ -355,7 +355,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);
	untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

@@ -1303,7 +1303,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp);
				track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}