Merge branch 'guest-memfd-mmap' into HEAD

Add support for host userspace mapping of guest_memfd-backed memory for VM
types that do NOT support KVM_MEMORY_ATTRIBUTE_PRIVATE (which isn't precisely
the same thing as CoCo VMs, since x86's SEV-MEM and SEV-ES have no way to
detect private vs. shared).

mmap() support paves the way for several evolving KVM use cases:

* Allows VMMs like Firecracker to run guests entirely backed by
  guest_memfd [1]. This provides a unified memory management model for
  both confidential and non-confidential guests, simplifying VMM design.

* Enhanced Security via direct map removal: When combined with Patrick's
  series for direct map removal [2], this provides additional hardening
  against Spectre-like transient execution attacks by eliminating the
  need for host kernel direct maps of guest memory.

* Lays the groundwork for *restricted* mmap() support for guest_memfd-backed
  memory on CoCo platforms [3] that permit in-place sharing of guest memory
  with the host.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Paolo Bonzini 2025-08-27 04:37:40 -04:00
commit a6ad54137a
26 changed files with 648 additions and 214 deletions

View File

@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single
guest_memfd range is not allowed (any number of memory regions can be bound to
a single guest_memfd file, but the bound ranges must not overlap).
When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field
supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation
enables mmap() and faulting of guest_memfd memory to host userspace.
When the KVM MMU performs a PFN lookup to service a guest fault and the backing
guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be
consumed from guest_memfd, regardless of whether it is a shared or a private
fault.
See KVM_SET_USER_MEMORY_REGION2 for additional details.
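As an illustration only (not part of this series' documentation), below is a
minimal userspace sketch of the flow, assuming a VM fd obtained from
KVM_CREATE_VM, a kernel with the uapi definitions added by this series, and a
prior KVM_CHECK_EXTENSION for KVM_CAP_GUEST_MEMFD_MMAP; the helper name
map_guest_memfd() is hypothetical:

#include <linux/kvm.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/*
 * Hypothetical helper: create a guest_memfd that allows host mmap() and map
 * it into the VMM. vm_fd is a VM file descriptor from KVM_CREATE_VM, and the
 * caller is assumed to have verified KVM_CAP_GUEST_MEMFD_MMAP.
 */
static void *map_guest_memfd(int vm_fd, size_t size)
{
	struct kvm_create_guest_memfd gmem = {
		.size  = size,
		.flags = GUEST_MEMFD_FLAG_MMAP,
	};
	int fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);

	if (fd < 0)
		return MAP_FAILED;

	/* guest_memfd only allows shared mappings; MAP_PRIVATE is rejected. */
	return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}

The returned fd can then be passed via the guest_memfd field of
KVM_SET_USER_MEMORY_REGION2 as usual.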
4.143 KVM_PRE_FAULT_MEMORY

View File

@ -37,6 +37,7 @@ menuconfig KVM
select HAVE_KVM_VCPU_RUN_PID_CHANGE
select SCHED_INFO
select GUEST_PERF_EVENTS if PERF_EVENTS
select KVM_GUEST_MEMFD
help
Support hosting virtualized guest machines.

View File

@ -1477,13 +1477,132 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
}
}
static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
				void **memcache)
{
	int min_pages;

	if (!is_protected_kvm_enabled())
		*memcache = &vcpu->arch.mmu_page_cache;
	else
		*memcache = &vcpu->arch.pkvm_memcache;

	if (!topup_memcache)
		return 0;

	min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);

	if (!is_protected_kvm_enabled())
		return kvm_mmu_topup_memory_cache(*memcache, min_pages);

	return topup_hyp_memcache(*memcache, min_pages);
}
/*
 * Potentially reduce shadow S2 permissions to match the guest's own S2. For
 * exec faults, we'd only reach this point if the guest actually allowed it (see
 * kvm_s2_handle_perm_fault).
 *
 * Also encode the level of the original translation in the SW bits of the leaf
 * entry as a proxy for the span of that translation. This will be retrieved on
 * TLB invalidation from the guest and used to limit the invalidation scope if a
 * TTL hint or a range isn't provided.
 */
static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
				      enum kvm_pgtable_prot *prot,
				      bool *writable)
{
	*writable &= kvm_s2_trans_writable(nested);
	if (!kvm_s2_trans_readable(nested))
		*prot &= ~KVM_PGTABLE_PROT_R;

	*prot |= kvm_encode_nested_level(nested);
}
#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
		      struct kvm_s2_trans *nested,
		      struct kvm_memory_slot *memslot, bool is_perm)
{
	bool write_fault, exec_fault, writable;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
	unsigned long mmu_seq;
	struct page *page;
	struct kvm *kvm = vcpu->kvm;
	void *memcache;
	kvm_pfn_t pfn;
	gfn_t gfn;
	int ret;

	ret = prepare_mmu_memcache(vcpu, true, &memcache);
	if (ret)
		return ret;

	if (nested)
		gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
	else
		gfn = fault_ipa >> PAGE_SHIFT;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);

	VM_WARN_ON_ONCE(write_fault && exec_fault);

	mmu_seq = kvm->mmu_invalidate_seq;
	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
	smp_rmb();

	ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
	if (ret) {
		kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
					      write_fault, exec_fault, false);
		return ret;
	}

	writable = !(memslot->flags & KVM_MEM_READONLY);

	if (nested)
		adjust_nested_fault_perms(nested, &prot, &writable);

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	if (exec_fault ||
	    (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
	     (!nested || kvm_s2_trans_executable(nested))))
		prot |= KVM_PGTABLE_PROT_X;

	kvm_fault_lock(kvm);
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
						 __pfn_to_phys(pfn), prot,
						 memcache, flags);

out_unlock:
	kvm_release_faultin_page(kvm, page, !!ret, writable);
	kvm_fault_unlock(kvm);

	if (writable && !ret)
		mark_page_dirty_in_slot(kvm, memslot, gfn);

	return ret != -EAGAIN ? ret : 0;
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
bool fault_is_perm)
{
int ret = 0;
bool write_fault, writable, force_pte = false;
bool topup_memcache;
bool write_fault, writable;
bool exec_fault, mte_allowed, is_vma_cacheable;
bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
@ -1495,28 +1614,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
bool force_pte = logging_active;
long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
struct page *page;
vm_flags_t vm_flags;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
if (fault_is_perm)
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
if (fault_is_perm && !write_fault && !exec_fault) {
kvm_err("Unexpected L2 read permission error\n");
return -EFAULT;
}
if (!is_protected_kvm_enabled())
memcache = &vcpu->arch.mmu_page_cache;
else
memcache = &vcpu->arch.pkvm_memcache;
VM_WARN_ON_ONCE(write_fault && exec_fault);
/*
* Permission faults just need to update the existing leaf entry,
@ -1524,17 +1634,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* only exception to this is when dirty logging is enabled at runtime
* and a write fault needs to collapse a block entry into a table.
*/
if (!fault_is_perm || (logging_active && write_fault)) {
int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
if (!is_protected_kvm_enabled())
ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
else
ret = topup_hyp_memcache(memcache, min_pages);
if (ret)
return ret;
}
topup_memcache = !fault_is_perm || (logging_active && write_fault);
ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
if (ret)
return ret;
/*
* Let's check if we will get back a huge page backed by hugetlbfs, or
@ -1548,16 +1651,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
/*
* logging_active is guaranteed to never be true for VM_PFNMAP
* memslots.
*/
if (logging_active) {
force_pte = true;
if (force_pte)
vma_shift = PAGE_SHIFT;
} else {
else
vma_shift = get_vma_page_shift(vma, hva);
}
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
@ -1609,7 +1706,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
max_map_size = PAGE_SIZE;
force_pte = (max_map_size == PAGE_SIZE);
vma_pagesize = min(vma_pagesize, (long)max_map_size);
vma_pagesize = min_t(long, vma_pagesize, max_map_size);
}
/*
@ -1642,7 +1739,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
* with the smp_wmb() in kvm_mmu_invalidate_end().
*/
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
mmu_seq = kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
@ -1698,24 +1795,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault && s2_force_noncacheable)
return -ENOEXEC;
/*
* Potentially reduce shadow S2 permissions to match the guest's own
* S2. For exec faults, we'd only reach this point if the guest
* actually allowed it (see kvm_s2_handle_perm_fault).
*
* Also encode the level of the original translation in the SW bits
* of the leaf entry as a proxy for the span of that translation.
* This will be retrieved on TLB invalidation from the guest and
* used to limit the invalidation scope if a TTL hint or a range
* isn't provided.
*/
if (nested) {
writable &= kvm_s2_trans_writable(nested);
if (!kvm_s2_trans_readable(nested))
prot &= ~KVM_PGTABLE_PROT_R;
prot |= kvm_encode_nested_level(nested);
}
if (nested)
adjust_nested_fault_perms(nested, &prot, &writable);
kvm_fault_lock(kvm);
pgt = vcpu->arch.hw_mmu->pgt;
@ -1981,8 +2062,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock;
}
ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
esr_fsc_is_permission_fault(esr));
VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
if (kvm_slot_has_gmem(memslot))
ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
esr_fsc_is_permission_fault(esr));
else
ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
out:
@ -2214,6 +2302,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
return -EFAULT;
/*
* Only support guest_memfd backed memslots with mappable memory, since
* there aren't any CoCo VMs that support only private memory on arm64.
*/
if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
return -EINVAL;
hva = new->userspace_addr;
reg_end = hva + (new->npages << PAGE_SHIFT);

View File

@ -1172,8 +1172,9 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
}
static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
{
struct kvm_memory_slot *memslot;
bool write_fault, writable;
unsigned long mmu_seq;
struct vncr_tlb *vt;
@ -1216,10 +1217,25 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
smp_rmb();
gfn = vt->wr.pa >> PAGE_SHIFT;
pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page);
if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
memslot = gfn_to_memslot(vcpu->kvm, gfn);
if (!memslot)
return -EFAULT;
*is_gmem = kvm_slot_has_gmem(memslot);
if (!*is_gmem) {
pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
&writable, &page);
if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
return -EFAULT;
} else {
ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
if (ret) {
kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
write_fault, false, false);
return ret;
}
}
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
return -EAGAIN;
@ -1292,23 +1308,36 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
if (esr_fsc_is_permission_fault(esr)) {
inject_vncr_perm(vcpu);
} else if (esr_fsc_is_translation_fault(esr)) {
bool valid;
bool valid, is_gmem = false;
int ret;
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
valid = kvm_vncr_tlb_lookup(vcpu);
if (!valid)
ret = kvm_translate_vncr(vcpu);
ret = kvm_translate_vncr(vcpu, &is_gmem);
else
ret = -EPERM;
switch (ret) {
case -EAGAIN:
case -ENOMEM:
/* Let's try again... */
break;
case -ENOMEM:
/*
* For guest_memfd, this indicates that it failed to
* create a folio to back the memory. Inform userspace.
*/
if (is_gmem)
return 0;
/* Otherwise, let's try again... */
break;
case -EFAULT:
case -EIO:
case -EHWPOISON:
if (is_gmem)
return 0;
fallthrough;
case -EINVAL:
case -ENOENT:
case -EACCES:

View File

@ -145,7 +145,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
KVM_X86_OP_OPTIONAL(get_untagged_addr)
KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level)
KVM_X86_OP_OPTIONAL(gmem_invalidate)
#undef KVM_X86_OP

View File

@ -1922,7 +1922,7 @@ struct kvm_x86_ops {
void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
};
struct kvm_x86_nested_ops {
@ -2276,10 +2276,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
#ifdef CONFIG_KVM_PRIVATE_MEM
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#else
#define kvm_arch_has_private_mem(kvm) false
#endif
#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)

View File

@ -46,8 +46,8 @@ config KVM_X86
select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING
select KVM_GENERIC_PRE_FAULT_MEMORY
select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
select KVM_WERROR if WERROR
select KVM_GUEST_MEMFD if X86_64
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
@ -74,7 +74,7 @@ config KVM_WERROR
# FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning.
# Building KVM with -Werror and KASAN is still doable via enabling
# the kernel-wide WERROR=y.
depends on KVM && ((EXPERT && !KASAN) || WERROR)
depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR)
help
Add -Werror to the build flags for KVM.
@ -83,7 +83,8 @@ config KVM_WERROR
config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
depends on KVM && X86_64
depends on KVM_X86 && X86_64
select KVM_GENERIC_MEMORY_ATTRIBUTES
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@ -95,8 +96,6 @@ config KVM_SW_PROTECTED_VM
config KVM_INTEL
tristate "KVM for Intel (and compatible) processors support"
depends on KVM && IA32_FEAT_CTL
select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
help
Provides support for KVM on processors equipped with Intel's VT
extensions, a.k.a. Virtual Machine Extensions (VMX).
@ -135,6 +134,8 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
select KVM_GENERIC_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching Intel Trust Domain Extensions (TDX)
confidential VMs on Intel processors.
@ -157,9 +158,10 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
select KVM_GENERIC_PRIVATE_MEM
select KVM_GENERIC_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching encrypted VMs which use Secure
Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
@ -169,7 +171,7 @@ config KVM_AMD_SEV
config KVM_IOAPIC
bool "I/O APIC, PIC, and PIT emulation"
default y
depends on KVM
depends on KVM_X86
help
Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
for full in-kernel APIC emulation.
@ -179,7 +181,7 @@ config KVM_IOAPIC
config KVM_SMM
bool "System Management Mode emulation"
default y
depends on KVM
depends on KVM_X86
help
Provides support for KVM to emulate System Management Mode (SMM)
in virtual machines. This can be used by the virtual machine
@ -189,7 +191,7 @@ config KVM_SMM
config KVM_HYPERV
bool "Support for Microsoft Hyper-V emulation"
depends on KVM
depends on KVM_X86
default y
help
Provides KVM support for emulating Microsoft Hyper-V. This allows KVM
@ -203,7 +205,7 @@ config KVM_HYPERV
config KVM_XEN
bool "Support for Xen hypercall interface"
depends on KVM
depends on KVM_X86
help
Provides KVM support for the hosting Xen HVM guests and
passing Xen hypercalls to userspace.
@ -213,7 +215,7 @@ config KVM_XEN
config KVM_PROVE_MMU
bool "Prove KVM MMU correctness"
depends on DEBUG_KERNEL
depends on KVM
depends on KVM_X86
depends on EXPERT
help
Enables runtime assertions in KVM's MMU that are too costly to enable
@ -228,7 +230,7 @@ config KVM_EXTERNAL_WRITE_TRACKING
config KVM_MAX_NR_VCPUS
int "Maximum number of vCPUs per KVM guest"
depends on KVM
depends on KVM_X86
range 1024 4096
default 4096 if MAXSMP
default 1024

View File

@ -3285,12 +3285,72 @@ out:
return level;
}
static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot,
gfn_t gfn, int max_level, bool is_private)
static u8 kvm_max_level_for_order(int order)
{
	BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);

	KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
			order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
			order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));

	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
		return PG_LEVEL_1G;

	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
		return PG_LEVEL_2M;

	return PG_LEVEL_4K;
}

static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
				     const struct kvm_memory_slot *slot, gfn_t gfn,
				     bool is_private)
{
	u8 max_level, coco_level;
	kvm_pfn_t pfn;

	/* For faults, use the gmem information that was resolved earlier. */
	if (fault) {
		pfn = fault->pfn;
		max_level = fault->max_level;
	} else {
		/* TODO: Call into guest_memfd once hugepages are supported. */
		WARN_ONCE(1, "Get pfn+order from guest_memfd");
		pfn = KVM_PFN_ERR_FAULT;
		max_level = PG_LEVEL_4K;
	}

	if (max_level == PG_LEVEL_4K)
		return max_level;

	/*
	 * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
	 * restrictions. A return of '0' means "no additional restrictions", to
	 * allow for using an optional "ret0" static call.
	 */
	coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private);
	if (coco_level)
		max_level = min(max_level, coco_level);

	return max_level;
}
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
const struct kvm_memory_slot *slot, gfn_t gfn)
{
struct kvm_lpage_info *linfo;
int host_level;
int host_level, max_level;
bool is_private;
lockdep_assert_held(&kvm->mmu_lock);
if (fault) {
max_level = fault->max_level;
is_private = fault->is_private;
} else {
max_level = PG_LEVEL_NUM;
is_private = kvm_mem_is_private(kvm, gfn);
}
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
@ -3299,25 +3359,17 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
break;
}
if (is_private)
return max_level;
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
host_level = host_pfn_mapping_level(kvm, gfn, slot);
if (is_private || kvm_memslot_is_gmem_only(slot))
host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn,
is_private);
else
host_level = host_pfn_mapping_level(kvm, gfn, slot);
return min(host_level, max_level);
}
int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn)
{
bool is_private = kvm_slot_can_be_private(slot) &&
kvm_mem_is_private(kvm, gfn);
return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
}
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
@ -3338,9 +3390,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
fault->gfn, fault->max_level,
fault->is_private);
fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
fault->slot, fault->gfn);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;
@ -4503,42 +4554,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
vcpu->stat.pf_fixed++;
}
static inline u8 kvm_max_level_for_order(int order)
{
BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
return PG_LEVEL_1G;
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
return PG_LEVEL_2M;
return PG_LEVEL_4K;
}
static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
u8 max_level, int gmem_order)
{
u8 req_max_level;
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
max_level = min(kvm_max_level_for_order(gmem_order), max_level);
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
if (req_max_level)
max_level = min(max_level, req_max_level);
return max_level;
}
static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, int r)
{
@ -4546,12 +4561,12 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
r == RET_PF_RETRY, fault->map_writable);
}
static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
int max_order, r;
if (!kvm_slot_can_be_private(fault->slot)) {
if (!kvm_slot_has_gmem(fault->slot)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}
@ -4564,8 +4579,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
}
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
fault->max_level, max_order);
fault->max_level = kvm_max_level_for_order(max_order);
return RET_PF_CONTINUE;
}
@ -4575,8 +4589,8 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
{
unsigned int foll = fault->write ? FOLL_WRITE : 0;
if (fault->is_private)
return kvm_mmu_faultin_pfn_private(vcpu, fault);
if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot))
return kvm_mmu_faultin_pfn_gmem(vcpu, fault);
foll |= FOLL_NOWAIT;
fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
@ -7165,7 +7179,7 @@ restart:
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_remote_tlbs_range())

View File

@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return r;
}
int kvm_mmu_max_mapping_level(struct kvm *kvm,
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);

View File

@ -1813,7 +1813,7 @@ retry:
if (iter.gfn < start || iter.gfn >= end)
continue;
max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
if (max_mapping_level < iter.level)
continue;

View File

@ -2361,7 +2361,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
mutex_lock(&kvm->slots_lock);
memslot = gfn_to_memslot(kvm, params.gfn_start);
if (!kvm_slot_can_be_private(memslot)) {
if (!kvm_slot_has_gmem(memslot)) {
ret = -EINVAL;
goto out;
}
@ -4715,7 +4715,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
}
slot = gfn_to_memslot(kvm, gfn);
if (!kvm_slot_can_be_private(slot)) {
if (!kvm_slot_has_gmem(slot)) {
pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
gpa);
return;
@ -4943,7 +4943,7 @@ next_pfn:
}
}
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
{
int level, rc;
bool assigned;

View File

@ -5180,7 +5180,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.gmem_prepare = sev_gmem_prepare,
.gmem_invalidate = sev_gmem_invalidate,
.private_max_mapping_level = sev_private_max_mapping_level,
.gmem_max_mapping_level = sev_gmem_max_mapping_level,
};
/*

View File

@ -866,7 +866,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
#else
@ -895,7 +895,7 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in
return 0;
}
static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
{
return 0;
}

View File

@ -831,10 +831,11 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
return tdx_vcpu_ioctl(vcpu, argp);
}
static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
bool is_private)
{
if (is_td(kvm))
return tdx_gmem_private_max_mapping_level(kvm, pfn);
return tdx_gmem_max_mapping_level(kvm, pfn, is_private);
return 0;
}
@ -1005,7 +1006,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
.private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
.gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level)
};
struct kvm_x86_init_ops vt_init_ops __initdata = {

View File

@ -3318,8 +3318,11 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
return ret;
}
int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
{
if (!is_private)
return 0;
return PG_LEVEL_4K;
}

View File

@ -153,7 +153,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
#endif
#endif /* __KVM_X86_VMX_X86_OPS_H */

View File

@ -13521,6 +13521,16 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
#ifdef CONFIG_KVM_GUEST_MEMFD
/*
* KVM doesn't yet support mmap() on guest_memfd for VMs with private memory
* (the private vs. shared tracking needs to be moved into guest_memfd).
*/
bool kvm_arch_supports_gmem_mmap(struct kvm *kvm)
{
return !kvm_arch_has_private_mem(kvm);
}
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
{
@ -13534,6 +13544,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
kvm_x86_call(gmem_invalidate)(start, end);
}
#endif
#endif
int kvm_spec_ctrl_test_value(u64 value)
{

View File

@ -52,9 +52,10 @@
/*
* The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally
* used in kvm, other bits are visible for userspace which are defined in
* include/linux/kvm_h.
* include/uapi/linux/kvm.h.
*/
#define KVM_MEMSLOT_INVALID (1UL << 16)
#define KVM_MEMSLOT_INVALID (1UL << 16)
#define KVM_MEMSLOT_GMEM_ONLY (1UL << 17)
/*
* Bit 63 of the memslot generation number is an "update in-progress flag",
@ -602,7 +603,7 @@ struct kvm_memory_slot {
short id;
u16 as_id;
#ifdef CONFIG_KVM_PRIVATE_MEM
#ifdef CONFIG_KVM_GUEST_MEMFD
struct {
/*
* Writes protected by kvm->slots_lock. Acquiring a
@ -615,7 +616,7 @@ struct kvm_memory_slot {
#endif
};
static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
static inline bool kvm_slot_has_gmem(const struct kvm_memory_slot *slot)
{
return slot && (slot->flags & KVM_MEM_GUEST_MEMFD);
}
@ -719,17 +720,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
/*
* Arch code must define kvm_arch_has_private_mem if support for private memory
* is enabled.
*/
#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
}
#endif
#ifdef CONFIG_KVM_GUEST_MEMFD
bool kvm_arch_supports_gmem_mmap(struct kvm *kvm);
#endif
#ifndef kvm_arch_has_readonly_mem
static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
{
@ -860,7 +861,7 @@ struct kvm {
struct notifier_block pm_notifier;
#endif
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
/* Protected by slots_locks (for writes) and RCU (for reads) */
/* Protected by slots_lock (for writes) and RCU (for reads) */
struct xarray mem_attr_array;
#endif
char stats_id[KVM_STATS_NAME_SIZE];
@ -2490,6 +2491,14 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
}
static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
{
if (!IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
return false;
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
@ -2505,8 +2514,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
}
#else
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
@ -2515,7 +2523,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
}
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_PRIVATE_MEM
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
int *max_order);
@ -2528,13 +2536,13 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
KVM_BUG_ON(1, kvm);
return -EIO;
}
#endif /* CONFIG_KVM_PRIVATE_MEM */
#endif /* CONFIG_KVM_GUEST_MEMFD */
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
#endif
#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
/**
* kvm_gmem_populate() - Populate/prepare a GPA range with guest data
*

View File

@ -962,6 +962,7 @@ struct kvm_enable_cap {
#define KVM_CAP_ARM_EL2_E2H0 241
#define KVM_CAP_RISCV_MP_STATE_RESET 242
#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
#define KVM_CAP_GUEST_MEMFD_MMAP 244
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@ -1598,6 +1599,7 @@ struct kvm_memory_attributes {
#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0)
struct kvm_create_guest_memfd {
__u64 size;

View File

@ -174,6 +174,7 @@ TEST_GEN_PROGS_arm64 += arch_timer
TEST_GEN_PROGS_arm64 += coalesced_io_test
TEST_GEN_PROGS_arm64 += dirty_log_perf_test
TEST_GEN_PROGS_arm64 += get-reg-list
TEST_GEN_PROGS_arm64 += guest_memfd_test
TEST_GEN_PROGS_arm64 += memslot_modification_stress_test
TEST_GEN_PROGS_arm64 += memslot_perf_test
TEST_GEN_PROGS_arm64 += mmu_stress_test

View File

@ -13,12 +13,16 @@
#include <linux/bitmap.h>
#include <linux/falloc.h>
#include <linux/sizes.h>
#include <setjmp.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "kvm_util.h"
#include "test_util.h"
#include "ucall_common.h"
static void test_file_read_write(int fd)
{
@ -34,12 +38,83 @@ static void test_file_read_write(int fd)
"pwrite on a guest_mem fd should fail");
}
static void test_mmap(int fd, size_t page_size)
static void test_mmap_supported(int fd, size_t page_size, size_t total_size)
{
	const char val = 0xaa;
	char *mem;
	size_t i;
	int ret;

	mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	TEST_ASSERT(mem == MAP_FAILED, "Copy-on-write not allowed by guest_memfd.");

	mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed.");

	memset(mem, val, total_size);
	for (i = 0; i < total_size; i++)
		TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);

	ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0,
			page_size);
	TEST_ASSERT(!ret, "fallocate the first page should succeed.");

	for (i = 0; i < page_size; i++)
		TEST_ASSERT_EQ(READ_ONCE(mem[i]), 0x00);
	for (; i < total_size; i++)
		TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);

	memset(mem, val, page_size);
	for (i = 0; i < total_size; i++)
		TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);

	ret = munmap(mem, total_size);
	TEST_ASSERT(!ret, "munmap() should succeed.");
}

static sigjmp_buf jmpbuf;

void fault_sigbus_handler(int signum)
{
	siglongjmp(jmpbuf, 1);
}

static void test_fault_overflow(int fd, size_t page_size, size_t total_size)
{
	struct sigaction sa_old, sa_new = {
		.sa_handler = fault_sigbus_handler,
	};
	size_t map_size = total_size * 4;
	const char val = 0xaa;
	char *mem;
	size_t i;
	int ret;

	mem = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed.");

	sigaction(SIGBUS, &sa_new, &sa_old);
	if (sigsetjmp(jmpbuf, 1) == 0) {
		memset(mem, 0xaa, map_size);
		TEST_ASSERT(false, "memset() should have triggered SIGBUS.");
	}
	sigaction(SIGBUS, &sa_old, NULL);

	for (i = 0; i < total_size; i++)
		TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);

	ret = munmap(mem, map_size);
	TEST_ASSERT(!ret, "munmap() should succeed.");
}

static void test_mmap_not_supported(int fd, size_t page_size, size_t total_size)
{
	char *mem;

	mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	TEST_ASSERT_EQ(mem, MAP_FAILED);

	mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	TEST_ASSERT_EQ(mem, MAP_FAILED);
}
static void test_file_size(int fd, size_t page_size, size_t total_size)
@ -120,80 +195,187 @@ static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size)
}
}
static void test_create_guest_memfd_invalid(struct kvm_vm *vm)
static void test_create_guest_memfd_invalid_sizes(struct kvm_vm *vm,
uint64_t guest_memfd_flags,
size_t page_size)
{
size_t page_size = getpagesize();
uint64_t flag;
size_t size;
int fd;
for (size = 1; size < page_size; size++) {
fd = __vm_create_guest_memfd(vm, size, 0);
TEST_ASSERT(fd == -1 && errno == EINVAL,
fd = __vm_create_guest_memfd(vm, size, guest_memfd_flags);
TEST_ASSERT(fd < 0 && errno == EINVAL,
"guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL",
size);
}
for (flag = BIT(0); flag; flag <<= 1) {
fd = __vm_create_guest_memfd(vm, page_size, flag);
TEST_ASSERT(fd == -1 && errno == EINVAL,
"guest_memfd() with flag '0x%lx' should fail with EINVAL",
flag);
}
}
static void test_create_guest_memfd_multiple(struct kvm_vm *vm)
{
int fd1, fd2, ret;
struct stat st1, st2;
size_t page_size = getpagesize();
fd1 = __vm_create_guest_memfd(vm, 4096, 0);
fd1 = __vm_create_guest_memfd(vm, page_size, 0);
TEST_ASSERT(fd1 != -1, "memfd creation should succeed");
ret = fstat(fd1, &st1);
TEST_ASSERT(ret != -1, "memfd fstat should succeed");
TEST_ASSERT(st1.st_size == 4096, "memfd st_size should match requested size");
TEST_ASSERT(st1.st_size == page_size, "memfd st_size should match requested size");
fd2 = __vm_create_guest_memfd(vm, 8192, 0);
fd2 = __vm_create_guest_memfd(vm, page_size * 2, 0);
TEST_ASSERT(fd2 != -1, "memfd creation should succeed");
ret = fstat(fd2, &st2);
TEST_ASSERT(ret != -1, "memfd fstat should succeed");
TEST_ASSERT(st2.st_size == 8192, "second memfd st_size should match requested size");
TEST_ASSERT(st2.st_size == page_size * 2, "second memfd st_size should match requested size");
ret = fstat(fd1, &st1);
TEST_ASSERT(ret != -1, "memfd fstat should succeed");
TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size");
TEST_ASSERT(st1.st_size == page_size, "first memfd st_size should still match requested size");
TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers");
close(fd2);
close(fd1);
}
int main(int argc, char *argv[])
static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags)
{
size_t page_size;
size_t total_size;
size_t page_size = getpagesize();
uint64_t flag;
int fd;
struct kvm_vm *vm;
TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
for (flag = BIT(0); flag; flag <<= 1) {
fd = __vm_create_guest_memfd(vm, page_size, flag);
if (flag & valid_flags) {
TEST_ASSERT(fd >= 0,
"guest_memfd() with flag '0x%lx' should succeed",
flag);
close(fd);
} else {
TEST_ASSERT(fd < 0 && errno == EINVAL,
"guest_memfd() with flag '0x%lx' should fail with EINVAL",
flag);
}
}
}
static void test_guest_memfd(unsigned long vm_type)
{
uint64_t flags = 0;
struct kvm_vm *vm;
size_t total_size;
size_t page_size;
int fd;
page_size = getpagesize();
total_size = page_size * 4;
vm = vm_create_barebones();
vm = vm_create_barebones_type(vm_type);
if (vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP))
flags |= GUEST_MEMFD_FLAG_MMAP;
test_create_guest_memfd_invalid(vm);
test_create_guest_memfd_multiple(vm);
test_create_guest_memfd_invalid_sizes(vm, flags, page_size);
fd = vm_create_guest_memfd(vm, total_size, 0);
fd = vm_create_guest_memfd(vm, total_size, flags);
test_file_read_write(fd);
test_mmap(fd, page_size);
if (flags & GUEST_MEMFD_FLAG_MMAP) {
test_mmap_supported(fd, page_size, total_size);
test_fault_overflow(fd, page_size, total_size);
} else {
test_mmap_not_supported(fd, page_size, total_size);
}
test_file_size(fd, page_size, total_size);
test_fallocate(fd, page_size, total_size);
test_invalid_punch_hole(fd, page_size, total_size);
test_guest_memfd_flags(vm, flags);
close(fd);
kvm_vm_free(vm);
}
static void guest_code(uint8_t *mem, uint64_t size)
{
size_t i;
for (i = 0; i < size; i++)
__GUEST_ASSERT(mem[i] == 0xaa,
"Guest expected 0xaa at offset %lu, got 0x%x", i, mem[i]);
memset(mem, 0xff, size);
GUEST_DONE();
}
static void test_guest_memfd_guest(void)
{
/*
* Skip the first 4gb and slot0. slot0 maps <1gb and is used to back
* the guest's code, stack, and page tables, and low memory contains
* the PCI hole and other MMIO regions that need to be avoided.
*/
const uint64_t gpa = SZ_4G;
const int slot = 1;
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
uint8_t *mem;
size_t size;
int fd, i;
if (!kvm_has_cap(KVM_CAP_GUEST_MEMFD_MMAP))
return;
vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1, guest_code);
TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP),
"Default VM type should always support guest_memfd mmap()");
size = vm->page_size;
fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP);
vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0);
mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed");
memset(mem, 0xaa, size);
munmap(mem, size);
virt_pg_map(vm, gpa, gpa);
vcpu_args_set(vcpu, 2, gpa, size);
vcpu_run(vcpu);
TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed");
for (i = 0; i < size; i++)
TEST_ASSERT_EQ(mem[i], 0xff);
close(fd);
kvm_vm_free(vm);
}
int main(int argc, char *argv[])
{
unsigned long vm_types, vm_type;
TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
/*
* Not all architectures support KVM_CAP_VM_TYPES. However, those that
* support guest_memfd have that support for the default VM type.
*/
vm_types = kvm_check_cap(KVM_CAP_VM_TYPES);
if (!vm_types)
vm_types = BIT(VM_TYPE_DEFAULT);
for_each_set_bit(vm_type, &vm_types, BITS_PER_TYPE(vm_types))
test_guest_memfd(vm_type);
test_guest_memfd_guest();
}

View File

@ -112,19 +112,18 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES
depends on KVM_GENERIC_MMU_NOTIFIER
bool
config KVM_PRIVATE_MEM
config KVM_GUEST_MEMFD
select XARRAY_MULTI
bool
config KVM_GENERIC_PRIVATE_MEM
select KVM_GENERIC_MEMORY_ATTRIBUTES
select KVM_PRIVATE_MEM
bool
config HAVE_KVM_ARCH_GMEM_PREPARE
bool
depends on KVM_PRIVATE_MEM
depends on KVM_GUEST_MEMFD
config HAVE_KVM_ARCH_GMEM_INVALIDATE
bool
depends on KVM_PRIVATE_MEM
depends on KVM_GUEST_MEMFD
config HAVE_KVM_ARCH_GMEM_POPULATE
bool
depends on KVM_GUEST_MEMFD

View File

@ -12,4 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o
kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o

View File

@ -312,7 +312,74 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
static bool kvm_gmem_supports_mmap(struct inode *inode)
{
	const u64 flags = (u64)inode->i_private;

	return flags & GUEST_MEMFD_FLAG_MMAP;
}

static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;
	vm_fault_t ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
	if (IS_ERR(folio)) {
		int err = PTR_ERR(folio);

		if (err == -EAGAIN)
			return VM_FAULT_RETRY;

		return vmf_error(err);
	}

	if (WARN_ON_ONCE(folio_test_large(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_folio;
	}

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		kvm_gmem_mark_prepared(folio);
	}

	vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
	if (ret != VM_FAULT_LOCKED) {
		folio_unlock(folio);
		folio_put(folio);
	}

	return ret;
}

static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault = kvm_gmem_fault_user_mapping,
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!kvm_gmem_supports_mmap(file_inode(file)))
		return -ENODEV;

	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
	    (VM_SHARED | VM_MAYSHARE)) {
		return -EINVAL;
	}

	vma->vm_ops = &kvm_gmem_vm_ops;

	return 0;
}

static struct file_operations kvm_gmem_fops = {
	.mmap = kvm_gmem_mmap,
	.open = generic_file_open,
	.release = kvm_gmem_release,
	.fallocate = kvm_gmem_fallocate,
@ -391,6 +458,11 @@ static const struct inode_operations kvm_gmem_iops = {
.setattr = kvm_gmem_setattr,
};
bool __weak kvm_arch_supports_gmem_mmap(struct kvm *kvm)
{
return true;
}
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
const char *anon_name = "[kvm-gmem]";
@ -452,6 +524,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
u64 flags = args->flags;
u64 valid_flags = 0;
if (kvm_arch_supports_gmem_mmap(kvm))
valid_flags |= GUEST_MEMFD_FLAG_MMAP;
if (flags & ~valid_flags)
return -EINVAL;
@ -508,6 +583,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
*/
WRITE_ONCE(slot->gmem.file, file);
slot->gmem.pgoff = start;
if (kvm_gmem_supports_mmap(inode))
slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
filemap_invalidate_unlock(inode->i_mapping);
@ -627,7 +704,7 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque)
{
@ -643,7 +720,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
return -EINVAL;
slot = gfn_to_memslot(kvm, start_gfn);
if (!kvm_slot_can_be_private(slot))
if (!kvm_slot_has_gmem(slot))
return -EINVAL;
file = kvm_gmem_get_file(slot);

View File

@ -331,7 +331,7 @@ void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
* All current use cases for flushing the TLBs for a specific memslot
* are related to dirty logging, and many do the TLB flush out of
* mmu_lock. The interaction between the various operations on memslot
* must be serialized by slots_locks to ensure the TLB flush from one
* must be serialized by slots_lock to ensure the TLB flush from one
* operation is observed by any other operation on the same memslot.
*/
lockdep_assert_held(&kvm->slots_lock);
@ -1588,7 +1588,7 @@ static int check_memory_region_flags(struct kvm *kvm,
{
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
if (kvm_arch_has_private_mem(kvm))
if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
valid_flags |= KVM_MEM_GUEST_MEMFD;
/* Dirty logging private memory is not currently supported. */
@ -4915,9 +4915,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_MEMORY_ATTRIBUTES:
return kvm_supported_mem_attributes(kvm);
#endif
#ifdef CONFIG_KVM_PRIVATE_MEM
#ifdef CONFIG_KVM_GUEST_MEMFD
case KVM_CAP_GUEST_MEMFD:
return !kvm || kvm_arch_has_private_mem(kvm);
return 1;
case KVM_CAP_GUEST_MEMFD_MMAP:
return !kvm || kvm_arch_supports_gmem_mmap(kvm);
#endif
default:
break;
@ -5352,7 +5354,7 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_GET_STATS_FD:
r = kvm_vm_ioctl_get_stats_fd(kvm);
break;
#ifdef CONFIG_KVM_PRIVATE_MEM
#ifdef CONFIG_KVM_GUEST_MEMFD
case KVM_CREATE_GUEST_MEMFD: {
struct kvm_create_guest_memfd guest_memfd;

View File

@ -67,7 +67,7 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
}
#endif /* HAVE_KVM_PFNCACHE */
#ifdef CONFIG_KVM_PRIVATE_MEM
#ifdef CONFIG_KVM_GUEST_MEMFD
void kvm_gmem_init(struct module *module);
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
@ -91,6 +91,6 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
WARN_ON_ONCE(1);
}
#endif /* CONFIG_KVM_PRIVATE_MEM */
#endif /* CONFIG_KVM_GUEST_MEMFD */
#endif /* __KVM_MM_H__ */