Commit a6ad5413 authored by Paolo Bonzini
Browse files

Merge branch 'guest-memfd-mmap' into HEAD



Add support for host userspace mapping of guest_memfd-backed memory for VM
types that do NOT support KVM_MEMORY_ATTRIBUTE_PRIVATE (which isn't
precisely the same thing as CoCo VMs, since x86's SEV-MEM and SEV-ES have
no way to detect private vs. shared).

mmap() support paves the way for several evolving KVM use cases:

* Allows VMMs like Firecracker to run guests entirely backed by
  guest_memfd [1]. This provides a unified memory management model for
  both confidential and non-confidential guests, simplifying VMM design.

* Enhanced Security via direct map removal: When combined with Patrick's
  series for direct map removal [2], this provides additional hardening
  against Spectre-like transient execution attacks by eliminating the
  need for host kernel direct maps of guest memory.

* Lays the groundwork for *restricted* mmap() support for guest_memfd-backed
  memory on CoCo platforms [3] that permit in-place sharing of guest memory
  with the host.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parents 0dc4a751 42188667
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single
guest_memfd range is not allowed (any number of memory regions can be bound to
a single guest_memfd file, but the bound ranges must not overlap).

When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field
supports GUEST_MEMFD_FLAG_MMAP.  Setting this flag on guest_memfd creation
enables mmap() and faulting of guest_memfd memory to host userspace.

When the KVM MMU performs a PFN lookup to service a guest fault and the backing
guest_memfd has the GUEST_MEMFD_FLAG_MMAP flag set, then the fault will always
be consumed from guest_memfd, regardless of whether it is a shared or a private
fault.

See KVM_SET_USER_MEMORY_REGION2 for additional details.

4.143 KVM_PRE_FAULT_MEMORY
+1 −0
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ menuconfig KVM
	select HAVE_KVM_VCPU_RUN_PID_CHANGE
	select SCHED_INFO
	select GUEST_PERF_EVENTS if PERF_EVENTS
	select KVM_GUEST_MEMFD
	help
	  Support hosting virtualized guest machines.

+149 −54
Original line number Diff line number Diff line
@@ -1477,13 +1477,132 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
	}
}

/*
 * Select the stage-2 page-table memory cache for @vcpu and optionally refill
 * it.
 *
 * @vcpu:           vCPU whose cache is selected.
 * @topup_memcache: when false, only the cache pointer is set up and 0 is
 *                  returned; when true, the cache is topped up to the minimum
 *                  reported by kvm_mmu_cache_min_pages() for the vCPU's MMU.
 * @memcache:       out parameter; receives the pkvm cache when protected KVM
 *                  is enabled, and the regular MMU page cache otherwise.
 *
 * Returns 0 when no top-up was requested, otherwise whatever the selected
 * top-up helper returns.
 */
static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
				void **memcache)
{
	if (is_protected_kvm_enabled())
		*memcache = &vcpu->arch.pkvm_memcache;
	else
		*memcache = &vcpu->arch.mmu_page_cache;

	if (topup_memcache) {
		int nr_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);

		/* The top-up helper must match the cache chosen above. */
		if (is_protected_kvm_enabled())
			return topup_hyp_memcache(*memcache, nr_pages);

		return kvm_mmu_topup_memory_cache(*memcache, nr_pages);
	}

	return 0;
}

/*
 * Narrow the shadow stage-2 permissions so they never exceed what the guest's
 * own stage-2 translation (@nested) grants: write access is dropped from
 * @writable and read access from @prot when the guest translation lacks them.
 * Exec faults only get here if the guest actually allowed execution (see
 * kvm_s2_handle_perm_fault).
 *
 * The level of the original translation is also encoded in the SW bits of the
 * leaf entry, acting as a proxy for the span of that translation. It is
 * retrieved on TLB invalidation from the guest and used to limit the
 * invalidation scope when neither a TTL hint nor a range is provided.
 */
static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
				      enum kvm_pgtable_prot *prot,
				      bool *writable)
{
	enum kvm_pgtable_prot adjusted = *prot;

	*writable &= kvm_s2_trans_writable(nested);

	if (!kvm_s2_trans_readable(nested))
		adjusted &= ~KVM_PGTABLE_PROT_R;

	adjusted |= kvm_encode_nested_level(nested);

	*prot = adjusted;
}

#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)

/*
 * Handle a stage-2 abort on a memslot backed by guest_memfd.
 *
 * The pfn is obtained from guest_memfd via kvm_gmem_get_pfn() and a single
 * PAGE_SIZE stage-2 mapping is installed at @fault_ipa.
 *
 * @vcpu:      faulting vCPU.
 * @fault_ipa: IPA at which the mapping is installed.
 * @nested:    guest stage-2 translation for a nested fault, or NULL.
 * @memslot:   memslot covering the fault.
 * @is_perm:   not referenced by this function's body.
 *
 * Returns 0 on success or when the fault must simply be retried (-EAGAIN is
 * folded into 0), otherwise the error from the memcache top-up, the pfn
 * lookup or the stage-2 map call.
 */
static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
		      struct kvm_s2_trans *nested,
		      struct kvm_memory_slot *memslot, bool is_perm)
{
	bool write_fault, exec_fault, writable;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
	unsigned long mmu_seq;
	struct page *page;
	struct kvm *kvm = vcpu->kvm;
	void *memcache;
	kvm_pfn_t pfn;
	gfn_t gfn;
	int ret;

	/* The memcache is always topped up here, whatever the fault type. */
	ret = prepare_mmu_memcache(vcpu, true, &memcache);
	if (ret)
		return ret;

	/*
	 * For a nested fault the gfn comes from the output address of the
	 * guest's own stage-2 translation rather than from the faulting IPA.
	 */
	if (nested)
		gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
	else
		gfn = fault_ipa >> PAGE_SHIFT;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);

	VM_WARN_ON_ONCE(write_fault && exec_fault);

	/*
	 * Snapshot the invalidation sequence before looking up the pfn; it is
	 * re-checked under the fault lock via mmu_invalidate_retry() below.
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
	smp_rmb();

	ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
	if (ret) {
		/* Lookup failed: describe the fault for userspace and bail. */
		kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
					      write_fault, exec_fault, false);
		return ret;
	}

	/* Write permission depends on the memslot flags, not the fault type. */
	writable = !(memslot->flags & KVM_MEM_READONLY);

	if (nested)
		adjust_nested_fault_perms(nested, &prot, &writable);

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	/*
	 * Grant execute permission on an exec fault, or eagerly when the
	 * ARM64_HAS_CACHE_DIC capability is present and, for a nested fault,
	 * the guest's translation is executable.
	 */
	if (exec_fault ||
	    (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
	     (!nested || kvm_s2_trans_executable(nested))))
		prot |= KVM_PGTABLE_PROT_X;

	kvm_fault_lock(kvm);
	/* An invalidation raced with the pfn lookup: skip the map and retry. */
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
						 __pfn_to_phys(pfn), prot,
						 memcache, flags);

out_unlock:
	/*
	 * Release the page obtained from kvm_gmem_get_pfn() while still
	 * holding the fault lock; the third argument is true when the mapping
	 * was not installed (ret != 0).
	 */
	kvm_release_faultin_page(kvm, page, !!ret, writable);
	kvm_fault_unlock(kvm);

	/* A writable mapping was installed, so mark the gfn dirty in the slot. */
	if (writable && !ret)
		mark_page_dirty_in_slot(kvm, memslot, gfn);

	/* -EAGAIN is not reported to the caller; it is folded into 0. */
	return ret != -EAGAIN ? ret : 0;
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_s2_trans *nested,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  bool fault_is_perm)
{
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool topup_memcache;
	bool write_fault, writable;
	bool exec_fault, mte_allowed, is_vma_cacheable;
	bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
	unsigned long mmu_seq;
@@ -1495,28 +1614,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	bool force_pte = logging_active;
	long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;
	struct page *page;
	vm_flags_t vm_flags;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;

	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_is_perm && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	if (!is_protected_kvm_enabled())
		memcache = &vcpu->arch.mmu_page_cache;
	else
		memcache = &vcpu->arch.pkvm_memcache;
	VM_WARN_ON_ONCE(write_fault && exec_fault);

	/*
	 * Permission faults just need to update the existing leaf entry,
@@ -1524,17 +1634,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	if (!fault_is_perm || (logging_active && write_fault)) {
		int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);

		if (!is_protected_kvm_enabled())
			ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
		else
			ret = topup_hyp_memcache(memcache, min_pages);

	topup_memcache = !fault_is_perm || (logging_active && write_fault);
	ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
	if (ret)
		return ret;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
@@ -1548,16 +1651,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
		return -EFAULT;
	}

	/*
	 * logging_active is guaranteed to never be true for VM_PFNMAP
	 * memslots.
	 */
	if (logging_active) {
		force_pte = true;
	if (force_pte)
		vma_shift = PAGE_SHIFT;
	} else {
	else
		vma_shift = get_vma_page_shift(vma, hva);
	}

	switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
@@ -1609,7 +1706,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			max_map_size = PAGE_SIZE;

		force_pte = (max_map_size == PAGE_SIZE);
		vma_pagesize = min(vma_pagesize, (long)max_map_size);
		vma_pagesize = min_t(long, vma_pagesize, max_map_size);
	}

	/*
@@ -1642,7 +1739,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	mmu_seq = kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
@@ -1698,24 +1795,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	if (exec_fault && s2_force_noncacheable)
		return -ENOEXEC;

	/*
	 * Potentially reduce shadow S2 permissions to match the guest's own
	 * S2. For exec faults, we'd only reach this point if the guest
	 * actually allowed it (see kvm_s2_handle_perm_fault).
	 *
	 * Also encode the level of the original translation in the SW bits
	 * of the leaf entry as a proxy for the span of that translation.
	 * This will be retrieved on TLB invalidation from the guest and
	 * used to limit the invalidation scope if a TTL hint or a range
	 * isn't provided.
	 */
	if (nested) {
		writable &= kvm_s2_trans_writable(nested);
		if (!kvm_s2_trans_readable(nested))
			prot &= ~KVM_PGTABLE_PROT_R;

		prot |= kvm_encode_nested_level(nested);
	}
	if (nested)
		adjust_nested_fault_perms(nested, &prot, &writable);

	kvm_fault_lock(kvm);
	pgt = vcpu->arch.hw_mmu->pgt;
@@ -1981,6 +2062,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
		goto out_unlock;
	}

	VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
			!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));

	if (kvm_slot_has_gmem(memslot))
		ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
				 esr_fsc_is_permission_fault(esr));
	else
		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
				     esr_fsc_is_permission_fault(esr));
	if (ret == 0)
@@ -2214,6 +2302,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
		return -EFAULT;

	/*
	 * Only support guest_memfd backed memslots with mappable memory, since
	 * there aren't any CoCo VMs that support only private memory on arm64.
	 */
	if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
		return -EINVAL;

	hva = new->userspace_addr;
	reg_end = hva + (new->npages << PAGE_SHIFT);

+35 −6
Original line number Diff line number Diff line
@@ -1172,8 +1172,9 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
	return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
}

static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
{
	struct kvm_memory_slot *memslot;
	bool write_fault, writable;
	unsigned long mmu_seq;
	struct vncr_tlb *vt;
@@ -1216,9 +1217,24 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
	smp_rmb();

	gfn = vt->wr.pa >> PAGE_SHIFT;
	pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page);
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	if (!memslot)
		return -EFAULT;

	*is_gmem = kvm_slot_has_gmem(memslot);
	if (!*is_gmem) {
		pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
					&writable, &page);
		if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
			return -EFAULT;
	} else {
		ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
		if (ret) {
			kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
					      write_fault, false, false);
			return ret;
		}
	}

	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
		if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
@@ -1292,23 +1308,36 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
	if (esr_fsc_is_permission_fault(esr)) {
		inject_vncr_perm(vcpu);
	} else if (esr_fsc_is_translation_fault(esr)) {
		bool valid;
		bool valid, is_gmem = false;
		int ret;

		scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
			valid = kvm_vncr_tlb_lookup(vcpu);

		if (!valid)
			ret = kvm_translate_vncr(vcpu);
			ret = kvm_translate_vncr(vcpu, &is_gmem);
		else
			ret = -EPERM;

		switch (ret) {
		case -EAGAIN:
		case -ENOMEM:
			/* Let's try again... */
			break;
		case -ENOMEM:
			/*
			 * For guest_memfd, this indicates that it failed to
			 * create a folio to back the memory. Inform userspace.
			 */
			if (is_gmem)
				return 0;
			/* Otherwise, let's try again... */
			break;
		case -EFAULT:
		case -EIO:
		case -EHWPOISON:
			if (is_gmem)
				return 0;
			fallthrough;
		case -EINVAL:
		case -ENOENT:
		case -EACCES:
+1 −1
Original line number Diff line number Diff line
@@ -145,7 +145,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
KVM_X86_OP_OPTIONAL(get_untagged_addr)
KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level)
KVM_X86_OP_OPTIONAL(gmem_invalidate)

#undef KVM_X86_OP
Loading