Commit 5cb1659f authored by Paolo Bonzini
Browse files

Merge branch 'kvm-no-struct-page' into HEAD



TL;DR: Eliminate KVM's long-standing (and heinous) behavior of essentially
guessing which pfns are refcounted pages (see kvm_pfn_to_refcounted_page()).

Getting there requires "fixing" arch code that isn't obviously broken.
Specifically, to get rid of kvm_pfn_to_refcounted_page(), KVM needs to
stop marking pages/folios dirty/accessed based solely on the pfn that's
stored in KVM's stage-2 page tables.

Instead of tracking which SPTEs correspond to refcounted pages, simply
remove all of the code that operates on "struct page" based on the pfn
in stage-2 PTEs.  This is the back ~40-50% of the series.

For x86 in particular, which sets accessed/dirty status when that info
would be "lost", e.g. when SPTEs are zapped or KVM clears the dirty flag
in a SPTE, foregoing the updates provides very measurable performance
improvements for related operations.  E.g. when clearing dirty bits as
part of dirty logging, and zapping SPTEs to reconstitute huge pages when
disabling dirty logging.

The front ~40% of the series is cleanups and prep work, and most of it is
x86 focused (purely because x86 added the most special cases, *sigh*).
E.g. several of the inputs to hva_to_pfn() (and its myriad wrappers),
can be removed by cleaning up and deduplicating x86 code.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parents e9001a38 8b15c376
Loading
Loading
Loading
Loading
+41 −39
Original line number Diff line number Diff line
@@ -135,8 +135,8 @@ We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap.
For direct sp, we can easily avoid it since the spte of direct sp is fixed
to gfn.  For indirect sp, we disabled fast page fault for simplicity.

A solution for indirect sp could be to pin the gfn, for example via
gfn_to_pfn_memslot_atomic, before the cmpxchg.  After the pinning:
A solution for indirect sp could be to pin the gfn before the cmpxchg.  After
the pinning:

- We have held the refcount of pfn; that means the pfn can not be freed and
  be reused for another gfn.
@@ -147,22 +147,22 @@ Then, we can ensure the dirty bitmaps is correctly set for a gfn.

2) Dirty bit tracking

In the origin code, the spte can be fast updated (non-atomically) if the
In the original code, the spte can be fast updated (non-atomically) if the
spte is read-only and the Accessed bit has already been set since the
Accessed bit and Dirty bit can not be lost.

But it is not true after fast page fault since the spte can be marked
writable between reading spte and updating spte. Like below case:

+------------------------------------------------------------------------+
+-------------------------------------------------------------------------+
| At the beginning::                                                      |
|                                                                         |
|	spte.W = 0                                                              |
|	spte.Accessed = 1                                                       |
+------------------------------------+-----------------------------------+
+-------------------------------------+-----------------------------------+
| CPU 0:                              | CPU 1:                            |
+------------------------------------+-----------------------------------+
| In mmu_spte_clear_track_bits()::   |                                   |
+-------------------------------------+-----------------------------------+
| In mmu_spte_update()::              |                                   |
|                                     |                                   |
|  old_spte = *spte;                  |                                   |
|                                     |                                   |
@@ -170,8 +170,8 @@ writable between reading spte and updating spte. Like below case:
|  /* 'if' condition is satisfied. */ |                                   |
|  if (old_spte.Accessed == 1 &&      |                                   |
|       old_spte.W == 0)              |                                   |
|     spte = 0ull;                   |                                   |
+------------------------------------+-----------------------------------+
|     spte = new_spte;                |                                   |
+-------------------------------------+-----------------------------------+
|                                     | on fast page fault path::         |
|                                     |                                   |
|                                     |    spte.W = 1                     |
@@ -179,17 +179,19 @@ writable between reading spte and updating spte. Like below case:
|                                     | memory write on the spte::        |
|                                     |                                   |
|                                     |    spte.Dirty = 1                 |
+------------------------------------+-----------------------------------+
+-------------------------------------+-----------------------------------+
|  ::                                 |                                   |
|                                     |                                   |
|   else                              |                                   |
|     old_spte = xchg(spte, 0ull)    |                                   |
|   if (old_spte.Accessed == 1)      |                                   |
|     kvm_set_pfn_accessed(spte.pfn);|                                   |
|   if (old_spte.Dirty == 1)         |                                   |
|     kvm_set_pfn_dirty(spte.pfn);   |                                   |
|     old_spte = xchg(spte, new_spte);|                                   |
|   if (old_spte.Accessed &&          |                                   |
|       !new_spte.Accessed)           |                                   |
|     flush = true;                   |                                   |
|   if (old_spte.Dirty &&             |                                   |
|       !new_spte.Dirty)              |                                   |
|     flush = true;                   |                                   |
|     OOPS!!!                         |                                   |
+------------------------------------+-----------------------------------+
+-------------------------------------+-----------------------------------+

The Dirty bit is lost in this case.

+1 −3
Original line number Diff line number Diff line
@@ -674,10 +674,8 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
 *
 * If there is a valid, leaf page-table entry used to translate @addr, then
 * set the access flag in that entry.
 *
 * Return: The old page-table entry prior to setting the flag, 0 on failure.
 */
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);

/**
 * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access
+6 −9
Original line number Diff line number Diff line
@@ -1051,20 +1051,18 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
	}

	while (length > 0) {
		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
		struct page *page = __gfn_to_page(kvm, gfn, write);
		void *maddr;
		unsigned long num_tags;
		struct page *page;

		if (is_error_noslot_pfn(pfn)) {
		if (!page) {
			ret = -EFAULT;
			goto out;
		}

		page = pfn_to_online_page(pfn);
		if (!page) {
		if (!pfn_to_online_page(page_to_pfn(page))) {
			/* Reject ZONE_DEVICE memory */
			kvm_release_pfn_clean(pfn);
			kvm_release_page_unused(page);
			ret = -EFAULT;
			goto out;
		}
@@ -1078,7 +1076,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
				/* No tags in memory, so write zeros */
				num_tags = MTE_GRANULES_PER_PAGE -
					clear_user(tags, MTE_GRANULES_PER_PAGE);
			kvm_release_pfn_clean(pfn);
			kvm_release_page_clean(page);
		} else {
			/*
			 * Only locking to serialise with a concurrent
@@ -1093,8 +1091,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
			if (num_tags != MTE_GRANULES_PER_PAGE)
				mte_clear_page_tags(maddr);
			set_page_mte_tagged(page);

			kvm_release_pfn_dirty(pfn);
			kvm_release_page_dirty(page);
		}

		if (num_tags != MTE_GRANULES_PER_PAGE) {
+2 −5
Original line number Diff line number Diff line
@@ -1245,19 +1245,16 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
					NULL, NULL, 0);
}

kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
{
	kvm_pte_t pte = 0;
	int ret;

	ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
				       &pte, NULL,
				       NULL, NULL,
				       KVM_PGTABLE_WALK_HANDLE_FAULT |
				       KVM_PGTABLE_WALK_SHARED);
	if (!ret)
		dsb(ishst);

	return pte;
}

struct stage2_age_data {
+8 −13
Original line number Diff line number Diff line
@@ -1440,6 +1440,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;
	struct page *page;

	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1561,7 +1562,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
	 * acquiring kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
@@ -1570,8 +1571,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
				   write_fault, &writable, NULL);
	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
				&writable, &page);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
@@ -1584,7 +1585,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
		 * If the page was identified as device early by looking at
		 * the VMA flags, vma_pagesize is already representing the
		 * largest quantity we can map.  If instead it was mapped
		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
		 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
		 * and must not be upgraded.
		 *
		 * In both cases, we don't let transparent_hugepage_adjust()
@@ -1693,33 +1694,27 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	}

out_unlock:
	kvm_release_faultin_page(kvm, page, !!ret, writable);
	read_unlock(&kvm->mmu_lock);

	/* Mark the page dirty only if the fault is handled successfully */
	if (writable && !ret) {
		kvm_set_pfn_dirty(pfn);
	if (writable && !ret)
		mark_page_dirty_in_slot(kvm, memslot, gfn);
	}

	kvm_release_pfn_clean(pfn);
	return ret != -EAGAIN ? ret : 0;
}

/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	kvm_pte_t pte;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	read_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
	kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
	read_unlock(&vcpu->kvm->mmu_lock);

	if (kvm_pte_valid(pte))
		kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
}

/**
Loading