KVM: x86/mmu: Add infrastructure to allow walking rmaps outside of mmu_lock (4834eade) · Commits · git / linux-nf

arch/x86/include/asm/kvm_host.h

+2 −1

Original line number	Diff line number	Diff line
		@@ -27,6 +27,7 @@
		#include <linux/kfifo.h>
		#include <linux/sched/vhost_task.h>
		#include <linux/call_once.h>
		#include <linux/atomic.h>

		#include <asm/apic.h>
		#include <asm/pvclock-abi.h>
		@@ -405,7 +406,7 @@ union kvm_cpu_role {
		};

		struct kvm_rmap_head {
		unsigned long val;
		atomic_long_t val;
		};

		struct kvm_pio_request {

arch/x86/kvm/mmu/mmu.c

+99 −11

Original line number	Diff line number	Diff line
		@@ -853,11 +853,98 @@ static struct kvm_memory_slot gfn_to_memslot_dirty_bitmap(struct kvm_vcpu vcpu
		* About rmap_head encoding:
		*
		* If the bit zero of rmap_head->val is clear, then it points to the only spte
		* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
		* in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
		* pte_list_desc containing more mappings.
		*/
		#define KVM_RMAP_MANY BIT(0)

		/*
		* rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
		* operates with mmu_lock held for write), but rmaps can be walked without
		* holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
		* being zapped/dropped _while the rmap is locked_.
		*
		* Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
		* done while holding mmu_lock for write. This allows a task walking rmaps
		* without holding mmu_lock to concurrently walk the same entries as a task
		* that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
		* the rmaps, thus the walks are stable.
		*
		* As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
		* only the rmap chains themselves are protected. E.g. holding an rmap's lock
		* ensures all "struct pte_list_desc" fields are stable.
		*/
		#define KVM_RMAP_LOCKED BIT(1)

		static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
		{
		unsigned long old_val, new_val;

		lockdep_assert_preemption_disabled();

		/*
		* Elide the lock if the rmap is empty, as lockless walkers (read-only
		* mode) don't need to (and can't) walk an empty rmap, nor can they add
		* entries to the rmap. I.e. the only paths that process empty rmaps
		* do so while holding mmu_lock for write, and are mutually exclusive.
		*/
		old_val = atomic_long_read(&rmap_head->val);
		if (!old_val)
		return 0;

		do {
		/*
		* If the rmap is locked, wait for it to be unlocked before
		* trying acquire the lock, e.g. to avoid bouncing the cache
		* line.
		*/
		while (old_val & KVM_RMAP_LOCKED) {
		cpu_relax();
		old_val = atomic_long_read(&rmap_head->val);
		}

		/*
		* Recheck for an empty rmap, it may have been purged by the
		* task that held the lock.
		*/
		if (!old_val)
		return 0;

		new_val = old_val \| KVM_RMAP_LOCKED;
		/*
		* Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
		* from being reordered outside of the critical section created by
		* kvm_rmap_lock().
		*
		* Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
		*
		* For the !old_val case, no ordering is needed, as there is no rmap
		* to walk.
		*/
		} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));

		/* Return the old value, i.e. _without_ the LOCKED bit set. */
		return old_val;
		}

		static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
		unsigned long new_val)
		{
		WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED);
		/*
		* Ensure that all accesses to the rmap have completed before unlocking
		* the rmap.
		*
		* Pairs with the atomic_long_try_cmpxchg_acquire() in kvm_rmap_lock.
		*/
		atomic_long_set_release(&rmap_head->val, new_val);
		}

		static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
		{
		return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
		}

		/*
		* Returns the number of pointers in the rmap chain, not counting the new one.
		*/
		@@ -868,7 +955,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache cache, u64 spte,
		struct pte_list_desc *desc;
		int count = 0;

		old_val = rmap_head->val;
		old_val = kvm_rmap_lock(rmap_head);

		if (!old_val) {
		new_val = (unsigned long)spte;
		@@ -900,7 +987,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache cache, u64 spte,
		desc->sptes[desc->spte_count++] = spte;
		}

		rmap_head->val = new_val;
		kvm_rmap_unlock(rmap_head, new_val);

		return count;
		}
		@@ -948,7 +1035,7 @@ static void pte_list_remove(struct kvm kvm, u64 spte,
		unsigned long rmap_val;
		int i;

		rmap_val = rmap_head->val;
		rmap_val = kvm_rmap_lock(rmap_head);
		if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
		goto out;

		@@ -974,7 +1061,7 @@ static void pte_list_remove(struct kvm kvm, u64 spte,
		}

		out:
		rmap_head->val = rmap_val;
		kvm_rmap_unlock(rmap_head, rmap_val);
		}

		static void kvm_zap_one_rmap_spte(struct kvm *kvm,
		@@ -992,7 +1079,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
		unsigned long rmap_val;
		int i;

		rmap_val = rmap_head->val;
		rmap_val = kvm_rmap_lock(rmap_head);
		if (!rmap_val)
		return false;

		@@ -1011,13 +1098,13 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
		}
		out:
		/* rmap_head is meaningless now, remember to reset it */
		rmap_head->val = 0;
		kvm_rmap_unlock(rmap_head, 0);
		return true;
		}

		unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
		{
		unsigned long rmap_val = rmap_head->val;
		unsigned long rmap_val = kvm_rmap_get(rmap_head);
		struct pte_list_desc *desc;

		if (!rmap_val)
		@@ -1083,7 +1170,7 @@ struct rmap_iterator {
		static u64 rmap_get_first(struct kvm_rmap_head rmap_head,
		struct rmap_iterator *iter)
		{
		unsigned long rmap_val = rmap_head->val;
		unsigned long rmap_val = kvm_rmap_get(rmap_head);
		u64 *sptep;

		if (!rmap_val)
		@@ -1418,7 +1505,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
		while (++iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);

		if (iterator->rmap->val)
		if (atomic_long_read(&iterator->rmap->val))
		return;
		}

		@@ -2444,7 +2531,8 @@ static int mmu_page_zap_pte(struct kvm kvm, struct kvm_mmu_page sp,
		* avoids retaining a large number of stale nested SPs.
		*/
		if (tdp_enabled && invalid_list &&
		child->role.guest_mode && !child->parent_ptes.val)
		child->role.guest_mode &&
		!atomic_long_read(&child->parent_ptes.val))
		return kvm_mmu_prepare_zap_page(kvm, child,
		invalid_list);
		}