Merge tag 'kvm-x86-mmu-6.9' of https://github.com/kvm-x86/linux into HEAD (41ebae2e) · Commits · git / linux-net

arch/x86/include/asm/kvm_host.h

+9 −0

Original line number	Diff line number	Diff line
		@@ -1468,6 +1468,15 @@ struct kvm_arch {
		*/
		bool shadow_root_allocated;

		#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
		/*
		* If set, the VM has (or had) an external write tracking user, and
		* thus all write tracking metadata has been allocated, even if KVM
		* itself isn't using write tracking.
		*/
		bool external_write_tracking_enabled;
		#endif

		#if IS_ENABLED(CONFIG_HYPERV)
		hpa_t hv_root_tdp;
		spinlock_t hv_root_tdp_lock;

arch/x86/kvm/mmu/mmu.c

+24 −13

Original line number	Diff line number	Diff line
		@@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
		if (WARN_ON_ONCE(!sp))
		return;

		if (is_tdp_mmu_page(sp))
		if (is_tdp_mmu_page(sp)) {
		lockdep_assert_held_read(&kvm->mmu_lock);
		kvm_tdp_mmu_put_root(kvm, sp);
		else if (!--sp->root_count && sp->role.invalid)
		} else {
		lockdep_assert_held_write(&kvm->mmu_lock);
		if (!--sp->root_count && sp->role.invalid)
		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
		}

		*root_hpa = INVALID_PAGE;
		}
		@@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
		void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
		ulong roots_to_free)
		{
		bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
		int i;
		LIST_HEAD(invalid_list);
		bool free_active_root;
		@@ -3609,6 +3614,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
		return;
		}

		if (is_tdp_mmu)
		read_lock(&kvm->mmu_lock);
		else
		write_lock(&kvm->mmu_lock);

		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		@@ -3635,9 +3643,14 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
		mmu->root.pgd = 0;
		}

		if (is_tdp_mmu) {
		read_unlock(&kvm->mmu_lock);
		WARN_ON_ONCE(!list_empty(&invalid_list));
		} else {
		kvm_mmu_commit_zap_page(kvm, &invalid_list);
		write_unlock(&kvm->mmu_lock);
		}
		}
		EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);

		void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
		@@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
		unsigned i;
		int r;

		if (tdp_mmu_enabled)
		return kvm_tdp_mmu_alloc_root(vcpu);

		write_lock(&vcpu->kvm->mmu_lock);
		r = make_mmu_pages_available(vcpu);
		if (r < 0)
		goto out_unlock;

		if (tdp_mmu_enabled) {
		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
		mmu->root.hpa = root;
		} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
		if (shadow_root_level >= PT64_ROOT_4LEVEL) {
		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
		mmu->root.hpa = root;
		} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
		@@ -6997,9 +7010,7 @@ int kvm_mmu_vendor_module_init(void)

		kvm_mmu_reset_all_pte_masks();

		pte_list_desc_cache = kmem_cache_create("pte_list_desc",
		sizeof(struct pte_list_desc),
		0, SLAB_ACCOUNT, NULL);
		pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
		if (!pte_list_desc_cache)
		goto out;

arch/x86/kvm/mmu/page_track.c

+66 −2

Original line number	Diff line number	Diff line
		@@ -20,10 +20,23 @@
		#include "mmu_internal.h"
		#include "page_track.h"

		/*
		* Return true if the VM's external_write_tracking_enabled flag is set.
		* When CONFIG_KVM_EXTERNAL_WRITE_TRACKING is not defined, this always
		* returns false.
		*/
		static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
		{
		#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
		/*
		* Read external_write_tracking_enabled before related pointers. Pairs
		* with the smp_store_release in kvm_enable_external_write_tracking().
		*/
		return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
		#else
		return false;
		#endif
		}

		bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
		{
		return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
		!tdp_enabled || kvm_shadow_root_allocated(kvm);
		return kvm_external_write_tracking_enabled(kvm) ||
		kvm_shadow_root_allocated(kvm) || !tdp_enabled;
		}

		void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
		@@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm)
		return init_srcu_struct(&head->track_srcu);
		}

		/*
		* Allocate write tracking metadata for every memslot in every address
		* space of @kvm, then publish external_write_tracking_enabled. All work
		* is done while holding kvm->slots_arch_lock.
		*
		* Returns 0 on success, including the case where write tracking was
		* already enabled and no allocation was needed. On allocation failure,
		* returns the non-zero result of kvm_page_track_write_tracking_alloc()
		* and leaves external_write_tracking_enabled unchanged.
		*/
		static int kvm_enable_external_write_tracking(struct kvm *kvm)
		{
		struct kvm_memslots *slots;
		struct kvm_memory_slot *slot;
		int r = 0, i, bkt;

		mutex_lock(&kvm->slots_arch_lock);

		/*
		* Check for any write tracking user (not just external users) under
		* lock. This avoids unnecessary work, e.g. if KVM itself is using
		* write tracking, or if two external users raced when registering.
		*/
		if (kvm_page_track_write_tracking_enabled(kvm))
		goto out_success;

		for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot(slot, bkt, slots) {
		/*
		* Intentionally do NOT free allocations on failure to
		* avoid having to track which allocations were made
		* now versus when the memslot was created. The
		* metadata is guaranteed to be freed when the slot is
		* freed, and will be kept/used if userspace retries
		* the failed ioctl() instead of killing the VM.
		*/
		r = kvm_page_track_write_tracking_alloc(slot);
		if (r)
		goto out_unlock;
		}
		}

		/* Reached with r == 0: either nothing to allocate or all slots done. */
		out_success:
		/*
		* Ensure that external_write_tracking_enabled becomes true strictly
		* after all the related pointers are set.
		*/
		smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
		/* Failure path jumps here directly, skipping the store above. */
		out_unlock:
		mutex_unlock(&kvm->slots_arch_lock);
		return r;
		}

		/*
		* register the notifier so that event interception for the tracked guest
		* pages can be received.
		@@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm,
		struct kvm_page_track_notifier_node *n)
		{
		struct kvm_page_track_notifier_head *head;
		int r;

		if (!kvm || kvm->mm != current->mm)
		return -ESRCH;

		if (!kvm_external_write_tracking_enabled(kvm)) {
		r = kvm_enable_external_write_tracking(kvm);
		if (r)
		return r;
		}

		kvm_get_kvm(kvm);

		head = &kvm->arch.track_notifier_head;

arch/x86/kvm/mmu/tdp_mmu.c

+87 −37

Original line number	Diff line number	Diff line
		@@ -153,7 +153,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
		for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
		({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
		_root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
		if (kvm_mmu_page_as_id(_root) != _as_id) { \
		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
		} else

		#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
		@@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
		* Holding mmu_lock for write obviates the need for RCU protection as the list
		* is guaranteed to be stable.
		*/
		#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
		#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \
		list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
		kvm_mmu_page_as_id(_root) != _as_id) { \
		((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
		((_only_valid) && (_root)->role.invalid))) { \
		} else

		#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
		__for_each_tdp_mmu_root(_kvm, _root, _as_id, false)

		#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
		__for_each_tdp_mmu_root(_kvm, _root, _as_id, true)

		static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
		{
		struct kvm_mmu_page *sp;
		@@ -216,22 +223,41 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
		tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
		}

		hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
		int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
		{
		union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
		struct kvm_mmu *mmu = vcpu->arch.mmu;
		union kvm_mmu_page_role role = mmu->root_role;
		int as_id = kvm_mmu_role_as_id(role);
		struct kvm *kvm = vcpu->kvm;
		struct kvm_mmu_page *root;

		lockdep_assert_held_write(&kvm->mmu_lock);
		/*
		* Check for an existing root before acquiring the pages lock to avoid
		* unnecessary serialization if multiple vCPUs are loading a new root.
		* E.g. when bringing up secondary vCPUs, KVM will already have created
		* a valid root on behalf of the primary vCPU.
		*/
		read_lock(&kvm->mmu_lock);

		for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
		if (root->role.word == role.word)
		goto out_read_unlock;
		}

		spin_lock(&kvm->arch.tdp_mmu_pages_lock);

		/*
		* Check for an existing root before allocating a new one. Note, the
		* role check prevents consuming an invalid root.
		* Recheck for an existing root after acquiring the pages lock, another
		* vCPU may have raced ahead and created a new usable root. Manually
		* walk the list of roots as the standard macros assume that the pages
		* lock is not held. WARN if grabbing a reference to a usable root
		* fails, as the last reference to a root can only be put after the
		* root has been invalidated, which requires holding mmu_lock for write.
		*/
		for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (root->role.word == role.word &&
		kvm_tdp_mmu_get_root(root))
		goto out;
		!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
		goto out_spin_unlock;
		}

		root = tdp_mmu_alloc_sp(vcpu);
		@@ -245,13 +271,20 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
		* is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
		*/
		refcount_set(&root->tdp_mmu_root_count, 2);

		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
		list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

		out:
		return __pa(root->spt);
		out_spin_unlock:
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		out_read_unlock:
		read_unlock(&kvm->mmu_lock);
		/*
		* Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
		* and actually consuming the root if it's invalidated after dropping
		* mmu_lock, and the root can't be freed as this vCPU holds a reference.
		*/
		mmu->root.hpa = __pa(root->spt);
		mmu->root.pgd = 0;
		return 0;
		}

		static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
		@@ -734,15 +767,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
		rcu_read_lock();

		/*
		* To avoid RCU stalls due to recursively removing huge swaths of SPs,
		* split the zap into two passes. On the first pass, zap at the 1gb
		* level, and then zap top-level SPs on the second pass. "1gb" is not
		* arbitrary, as KVM must be able to zap a 1gb shadow page without
		* inducing a stall to allow in-place replacement with a 1gb hugepage.
		* Zap roots in multiple passes of decreasing granularity, i.e. zap at
		* 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
		* preempt models) or mmu_lock contention (full or real-time models).
		* Zapping at finer granularity marginally increases the total time of
		* the zap, but in most cases the zap itself isn't latency sensitive.
		*
		* Because zapping a SP recurses on its children, stepping down to
		* PG_LEVEL_4K in the iterator itself is unnecessary.
		*/
		* If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
		* in order to mimic the page fault path, which can replace a 1GiB page
		* table with an equivalent 1GiB hugepage, i.e. can get saddled with
		* zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
		* allows verifying that KVM can safely zap 1GiB regions, e.g. without
		* inducing RCU stalls, without relying on a relatively rare event
		* (zapping roots is orders of magnitude more common). Note, because
		* zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
		* in the iterator itself is unnecessary.
		*/
		if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
		}
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
		__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

		@@ -800,6 +844,12 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
		continue;

		tdp_mmu_iter_set_spte(kvm, &iter, 0);

		/*
		* Zapping SPTEs in invalid roots doesn't require a TLB flush,
		* see kvm_tdp_mmu_zap_invalidated_roots() for details.
		*/
		if (!root->role.invalid)
		flush = true;
		}

		@@ -813,16 +863,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
		}

		/*
		* Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
		* true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
		* more SPTEs were zapped since the MMU lock was last acquired.
		* Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
		* Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
		* one or more SPTEs were zapped since the MMU lock was last acquired.
		*/
		bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
		{
		struct kvm_mmu_page *root;

		lockdep_assert_held_write(&kvm->mmu_lock);
		for_each_tdp_mmu_root_yield_safe(kvm, root)
		for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

		return flush;
		@@ -896,7 +946,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
		* the VM is being destroyed).
		*
		* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
		* See kvm_tdp_mmu_get_vcpu_root_hpa().
		* See kvm_tdp_mmu_alloc_root().
		*/
		void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
		{
		@@ -1622,7 +1672,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
		{
		struct kvm_mmu_page *root;

		for_each_tdp_mmu_root(kvm, root, slot->as_id)
		for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
		}

		@@ -1740,7 +1790,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
		bool spte_set = false;

		lockdep_assert_held_write(&kvm->mmu_lock);
		for_each_tdp_mmu_root(kvm, root, slot->as_id)
		for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

		return spte_set;

arch/x86/kvm/mmu/tdp_mmu.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -10,7 +10,7 @@
		void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
		void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);

		hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
		int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);

		__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
		{