KVM: x86/mmu: Track possible NX huge pages separately for TDP vs. Shadow MMU (67778856) · Commits · git / linux-nf

arch/x86/include/asm/kvm_host.h

+26 −13

Original line number	Diff line number	Diff line
		@@ -1348,6 +1348,30 @@ enum kvm_apicv_inhibit {
		__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
		__APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)

		struct kvm_possible_nx_huge_pages {
		/*
		* A list of kvm_mmu_page structs that, if zapped, could possibly be
		* replaced by an NX huge page. A shadow page is on this list if its
		* existence disallows an NX huge page (nx_huge_page_disallowed is set)
		* and there are no other conditions that prevent a huge page, e.g.
		* the backing host page is huge, dirty logging is not enabled for its
		* memslot, etc... Note, zapping shadow pages on this list doesn't
		* guarantee an NX huge page will be created in its stead, e.g. if the
		* guest attempts to execute from the region then KVM obviously can't
		* create an NX huge page (without hanging the guest).
		*/
		struct list_head pages;
		u64 nr_pages;
		};

		enum kvm_mmu_type {
		KVM_SHADOW_MMU,
		#ifdef CONFIG_X86_64
		KVM_TDP_MMU,
		#endif
		KVM_NR_MMU_TYPES,
		};

		struct kvm_arch {
		unsigned long n_used_mmu_pages;
		unsigned long n_requested_mmu_pages;
		@@ -1360,18 +1384,7 @@ struct kvm_arch {
		bool pre_fault_allowed;
		struct hlist_head *mmu_page_hash;
		struct list_head active_mmu_pages;
		/*
		* A list of kvm_mmu_page structs that, if zapped, could possibly be
		* replaced by an NX huge page. A shadow page is on this list if its
		* existence disallows an NX huge page (nx_huge_page_disallowed is set)
		* and there are no other conditions that prevent a huge page, e.g.
		* the backing host page is huge, dirty logging is not enabled for its
		* memslot, etc... Note, zapping shadow pages on this list doesn't
		* guarantee an NX huge page will be created in its stead, e.g. if the
		* guest attempts to execute from the region then KVM obviously can't
		* create an NX huge page (without hanging the guest).
		*/
		struct list_head possible_nx_huge_pages;
		struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES];
		#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
		struct kvm_page_track_notifier_head track_notifier_head;
		#endif
		@@ -1526,7 +1539,7 @@ struct kvm_arch {
		* is held in read mode:
		* - tdp_mmu_roots (above)
		* - the link field of kvm_mmu_page structs used by the TDP MMU
		* - possible_nx_huge_pages;
		* - possible_nx_huge_pages[KVM_TDP_MMU];
		* - the possible_nx_huge_page_link field of kvm_mmu_page structs used
		* by the TDP MMU
		* Because the lock is only taken within the MMU lock, strictly

arch/x86/kvm/mmu/mmu.c

+40 −19

Original line number	Diff line number	Diff line
		@@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
		}

		void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
		void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
		enum kvm_mmu_type mmu_type)
		{
		/*
		* If it's possible to replace the shadow page with an NX huge page,
		@@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
		return;

		++kvm->stat.nx_lpage_splits;
		++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
		list_add_tail(&sp->possible_nx_huge_page_link,
		&kvm->arch.possible_nx_huge_pages);
		&kvm->arch.possible_nx_huge_pages[mmu_type].pages);
		}

		static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
		@@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
		sp->nx_huge_page_disallowed = true;

		if (nx_huge_page_possible)
		track_possible_nx_huge_page(kvm, sp);
		track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
		}

		static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
		@@ -819,12 +821,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
		kvm_mmu_gfn_allow_lpage(slot, gfn);
		}

		void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
		void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
		enum kvm_mmu_type mmu_type)
		{
		if (list_empty(&sp->possible_nx_huge_page_link))
		return;

		--kvm->stat.nx_lpage_splits;
		--kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
		list_del_init(&sp->possible_nx_huge_page_link);
		}

		@@ -832,7 +836,7 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
		{
		sp->nx_huge_page_disallowed = false;

		untrack_possible_nx_huge_page(kvm, sp);
		untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
		}

		static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
		@@ -6737,11 +6741,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)

		int kvm_mmu_init_vm(struct kvm *kvm)
		{
		int r;
		int r, i;

		kvm->arch.shadow_mmio_value = shadow_mmio_value;
		INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
		INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
		for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
		INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
		spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

		if (tdp_mmu_enabled) {
		@@ -7582,16 +7587,32 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
		return err;
		}

		static void kvm_recover_nx_huge_pages(struct kvm *kvm)
		static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
		enum kvm_mmu_type mmu_type)
		{
		unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
		unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
		unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio);

		return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
		}

		static void kvm_recover_nx_huge_pages(struct kvm *kvm,
		enum kvm_mmu_type mmu_type)
		{
		#ifdef CONFIG_X86_64
		const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
		#else
		const bool is_tdp_mmu = false;
		#endif
		unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
		struct list_head *nx_huge_pages;
		struct kvm_memory_slot *slot;
		int rcu_idx;
		struct kvm_mmu_page *sp;
		unsigned int ratio;
		LIST_HEAD(invalid_list);
		bool flush = false;
		ulong to_zap;
		int rcu_idx;

		nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;

		rcu_idx = srcu_read_lock(&kvm->srcu);
		write_lock(&kvm->mmu_lock);
		@@ -7603,10 +7624,8 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
		*/
		rcu_read_lock();

		ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
		to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
		for ( ; to_zap; --to_zap) {
		if (list_empty(&kvm->arch.possible_nx_huge_pages))
		if (list_empty(nx_huge_pages))
		break;

		/*
		@@ -7616,7 +7635,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
		* the total number of shadow pages. And because the TDP MMU
		* doesn't use active_mmu_pages.
		*/
		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
		sp = list_first_entry(nx_huge_pages,
		struct kvm_mmu_page,
		possible_nx_huge_page_link);
		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
		@@ -7653,7 +7672,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)

		if (slot && kvm_slot_dirty_track_enabled(slot))
		unaccount_nx_huge_page(kvm, sp);
		else if (is_tdp_mmu_page(sp))
		else if (is_tdp_mmu)
		flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
		else
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
		@@ -7684,9 +7703,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data)
		static bool kvm_nx_huge_page_recovery_worker(void *data)
		{
		struct kvm *kvm = data;
		long remaining_time;
		bool enabled;
		uint period;
		long remaining_time;
		int i;

		enabled = calc_nx_huge_pages_recovery_period(&period);
		if (!enabled)
		@@ -7701,7 +7721,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
		}

		__set_current_state(TASK_RUNNING);
		kvm_recover_nx_huge_pages(kvm);
		for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
		kvm_recover_nx_huge_pages(kvm, i);
		kvm->arch.nx_huge_page_last = get_jiffies_64();
		return true;
		}

arch/x86/kvm/mmu/mmu_internal.h

+4 −2

Original line number	Diff line number	Diff line
		@@ -416,7 +416,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
		void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
		void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);

		void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
		void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
		void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
		enum kvm_mmu_type mmu_type);
		void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
		enum kvm_mmu_type mmu_type);

		#endif /* __KVM_X86_MMU_INTERNAL_H */

arch/x86/kvm/mmu/tdp_mmu.c

+2 −2

Original line number	Diff line number	Diff line
		@@ -355,7 +355,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)

		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
		sp->nx_huge_page_disallowed = false;
		untrack_possible_nx_huge_page(kvm, sp);
		untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}

		@@ -1303,7 +1303,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
		fault->req_level >= iter.level) {
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
		if (sp->nx_huge_page_disallowed)
		track_possible_nx_huge_page(kvm, sp);
		track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
		}