Commit 55f4db79 authored by Paolo Bonzini
Browse files

KVM: e500: perform hugepage check after looking up the PFN



e500 KVM tries to bypass __kvm_faultin_pfn() in order to map VM_PFNMAP
VMAs as huge pages.  This is a Bad Idea because VM_PFNMAP VMAs could
become noncontiguous as a result of calls to remap_pfn_range().

Instead, use the already existing host PTE lookup to retrieve a
valid host-side mapping level after __kvm_faultin_pfn() has
returned.  Then find the largest size that will satisfy the
guest's request while staying within a single host PTE.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent 03b755b2
Loading
Loading
Loading
Loading
+69 −109
Original line number Diff line number Diff line
@@ -326,15 +326,14 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
	struct tlbe_ref *ref)
{
	struct kvm_memory_slot *slot;
	unsigned long pfn = 0; /* silence GCC warning */
	unsigned int psize;
	unsigned long pfn;
	struct page *page = NULL;
	unsigned long hva;
	int pfnmap = 0;
	int tsize = BOOK3E_PAGESZ_4K;
	int ret = 0;
	unsigned long mmu_seq;
	struct kvm *kvm = vcpu_e500->vcpu.kvm;
	unsigned long tsize_pages = 0;
	pte_t *ptep;
	unsigned int wimg = 0;
	pgd_t *pgdir;
@@ -356,31 +355,54 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
	slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
	hva = gfn_to_hva_memslot(slot, gfn);

	if (tlbsel == 1) {
		struct vm_area_struct *vma;
		mmap_read_lock(kvm->mm);
	pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page);
	if (is_error_noslot_pfn(pfn)) {
		if (printk_ratelimit())
			pr_err("%s: real page not found for gfn %lx\n",
			       __func__, (long)gfn);
		return -EINVAL;
	}

	spin_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out;
	}

		vma = find_vma(kvm->mm, hva);
		if (vma && hva >= vma->vm_start &&
		    (vma->vm_flags & VM_PFNMAP)) {

	pgdir = vcpu_e500->vcpu.arch.pgdir;
	/*
			 * This VMA is a physically contiguous region (e.g.
			 * /dev/mem) that bypasses normal Linux page
			 * management.  Find the overlap between the
			 * vma and the memslot.
	 * We are just looking at the wimg bits, so we don't
	 * care much about the trans splitting bit.
	 * We are holding kvm->mmu_lock so a notifier invalidate
	 * can't run hence pfn won't change.
	 */
	local_irq_save(flags);
	ptep = find_linux_pte(pgdir, hva, NULL, &psize);
	if (ptep) {
		pte_t pte = READ_ONCE(*ptep);

		if (pte_present(pte)) {
			wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
				MAS2_WIMGE_MASK;
		} else {
			local_irq_restore(flags);
			pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
					   __func__, (long)gfn, pfn);
			ret = -EINVAL;
			goto out;
		}
	}
	local_irq_restore(flags);

	if (psize && tlbsel == 1) {
		unsigned long psize_pages, tsize_pages;
		unsigned long start, end;
		unsigned long slot_start, slot_end;

			pfnmap = 1;
			writable = vma->vm_flags & VM_WRITE;

			start = vma->vm_pgoff;
			end = start +
			      vma_pages(vma);

			pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
		psize_pages = 1UL << (psize - PAGE_SHIFT);
		start = pfn & ~(psize_pages - 1);
		end = start + psize_pages;

		slot_start = pfn - (gfn - slot->base_gfn);
		slot_end = slot_start + slot->npages;
@@ -393,6 +415,12 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
		tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
			MAS1_TSIZE_SHIFT;

		/*
		 * Any page size that doesn't satisfy the host mapping
		 * will fail the start and end tests.
		 */
		tsize = min(psize - PAGE_SHIFT + BOOK3E_PAGESZ_4K, tsize);

		/*
		 * e500 doesn't implement the lowest tsize bit,
		 * or 1K pages.
@@ -425,75 +453,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
			pfn &= ~(tsize_pages - 1);
			break;
		}
		} else if (vma && hva >= vma->vm_start &&
			   is_vm_hugetlb_page(vma)) {
			unsigned long psize = vma_kernel_pagesize(vma);

			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
				MAS1_TSIZE_SHIFT;

			/*
			 * Take the largest page size that satisfies both host
			 * and guest mapping
			 */
			tsize = min(__ilog2(psize) - 10, tsize);

			/*
			 * e500 doesn't implement the lowest tsize bit,
			 * or 1K pages.
			 */
			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
		}

		mmap_read_unlock(kvm->mm);
	}

	if (likely(!pfnmap)) {
		tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
		pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page);
		if (is_error_noslot_pfn(pfn)) {
			if (printk_ratelimit())
				pr_err("%s: real page not found for gfn %lx\n",
				       __func__, (long)gfn);
			return -EINVAL;
		}

		/* Align guest and physical address to page map boundaries */
		pfn &= ~(tsize_pages - 1);
		gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
	}

	spin_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out;
	}


	pgdir = vcpu_e500->vcpu.arch.pgdir;
	/*
	 * We are just looking at the wimg bits, so we don't
	 * care much about the trans splitting bit.
	 * We are holding kvm->mmu_lock so a notifier invalidate
	 * can't run hence pfn won't change.
	 */
	local_irq_save(flags);
	ptep = find_linux_pte(pgdir, hva, NULL, NULL);
	if (ptep) {
		pte_t pte = READ_ONCE(*ptep);

		if (pte_present(pte)) {
			wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
				MAS2_WIMGE_MASK;
		} else {
			local_irq_restore(flags);
			pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
					   __func__, (long)gfn, pfn);
			ret = -EINVAL;
			goto out;
		}
	}
	local_irq_restore(flags);

	kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable);
	kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,