Merge branch 'guest-memfd-mmap' into HEAD
Add support for host userspace mapping of guest_memfd-backed memory for VM
types that do NOT support KVM_MEMORY_ATTRIBUTE_PRIVATE (which isn't precisely
the same thing as CoCo VMs, since x86's SEV and SEV-ES have no way to detect
private vs. shared).

mmap() support paves the way for several evolving KVM use cases:

 * Allows VMMs like Firecracker to run guests entirely backed by
   guest_memfd [1]. This provides a unified memory management model for
   both confidential and non-confidential guests, simplifying VMM design.

 * Enhanced security via direct map removal: when combined with Patrick's
   series for direct map removal [2], this provides additional hardening
   against Spectre-like transient execution attacks by eliminating the
   need for host kernel direct maps of guest memory.

 * Lays the groundwork for *restricted* mmap() support for
   guest_memfd-backed memory on CoCo platforms [3] that permit in-place
   sharing of guest memory with the host.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
commit a6ad54137a
@@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single
guest_memfd range is not allowed (any number of memory regions can be bound to
a single guest_memfd file, but the bound ranges must not overlap).

When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field
supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation
enables mmap() and faulting of guest_memfd memory to host userspace.

When the KVM MMU performs a PFN lookup to service a guest fault and the backing
guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be
consumed from guest_memfd, regardless of whether it is a shared or a private
fault.

See KVM_SET_USER_MEMORY_REGION2 for additional details.
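For illustration, a minimal userspace sketch of the flow described above. This is not part of the documented ABI text; error handling is elided, the helper name is arbitrary, and it assumes kernel headers that already define KVM_CAP_GUEST_MEMFD_MMAP and GUEST_MEMFD_FLAG_MMAP::

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kvm.h>

    /* Create a mappable guest_memfd for an existing VM fd and mmap() it. */
    static void *map_guest_memfd(int vm_fd, uint64_t size)
    {
            struct kvm_create_guest_memfd gmem = { .size = size };
            int fd;

            /* The capability can be queried per-VM. */
            if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_MEMFD_MMAP) <= 0)
                    return NULL;

            gmem.flags = GUEST_MEMFD_FLAG_MMAP;
            fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
            if (fd < 0)
                    return NULL;

            /* guest_memfd only allows shared mappings. */
            return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    }

The returned mapping can then be registered with KVM_SET_USER_MEMORY_REGION2 as a guest_memfd-backed slot, as described above.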
4.143 KVM_PRE_FAULT_MEMORY

@@ -37,6 +37,7 @@ menuconfig KVM
	select HAVE_KVM_VCPU_RUN_PID_CHANGE
	select SCHED_INFO
	select GUEST_PERF_EVENTS if PERF_EVENTS
	select KVM_GUEST_MEMFD
	help
	  Support hosting virtualized guest machines.

@ -1477,13 +1477,132 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
|
|||
}
|
||||
}
|
||||
|
||||
static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
|
||||
void **memcache)
|
||||
{
|
||||
int min_pages;
|
||||
|
||||
if (!is_protected_kvm_enabled())
|
||||
*memcache = &vcpu->arch.mmu_page_cache;
|
||||
else
|
||||
*memcache = &vcpu->arch.pkvm_memcache;
|
||||
|
||||
if (!topup_memcache)
|
||||
return 0;
|
||||
|
||||
min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
|
||||
|
||||
if (!is_protected_kvm_enabled())
|
||||
return kvm_mmu_topup_memory_cache(*memcache, min_pages);
|
||||
|
||||
return topup_hyp_memcache(*memcache, min_pages);
|
||||
}
|
||||
|
||||
/*
|
||||
* Potentially reduce shadow S2 permissions to match the guest's own S2. For
|
||||
* exec faults, we'd only reach this point if the guest actually allowed it (see
|
||||
* kvm_s2_handle_perm_fault).
|
||||
*
|
||||
* Also encode the level of the original translation in the SW bits of the leaf
|
||||
* entry as a proxy for the span of that translation. This will be retrieved on
|
||||
* TLB invalidation from the guest and used to limit the invalidation scope if a
|
||||
* TTL hint or a range isn't provided.
|
||||
*/
|
||||
static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
|
||||
enum kvm_pgtable_prot *prot,
|
||||
bool *writable)
|
||||
{
|
||||
*writable &= kvm_s2_trans_writable(nested);
|
||||
if (!kvm_s2_trans_readable(nested))
|
||||
*prot &= ~KVM_PGTABLE_PROT_R;
|
||||
|
||||
*prot |= kvm_encode_nested_level(nested);
|
||||
}
|
||||
|
||||
#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
|
||||
|
||||
static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
struct kvm_s2_trans *nested,
|
||||
struct kvm_memory_slot *memslot, bool is_perm)
|
||||
{
|
||||
bool write_fault, exec_fault, writable;
|
||||
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
|
||||
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
|
||||
struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
|
||||
unsigned long mmu_seq;
|
||||
struct page *page;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
void *memcache;
|
||||
kvm_pfn_t pfn;
|
||||
gfn_t gfn;
|
||||
int ret;
|
||||
|
||||
ret = prepare_mmu_memcache(vcpu, true, &memcache);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (nested)
|
||||
gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
|
||||
else
|
||||
gfn = fault_ipa >> PAGE_SHIFT;
|
||||
|
||||
write_fault = kvm_is_write_fault(vcpu);
|
||||
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
|
||||
|
||||
VM_WARN_ON_ONCE(write_fault && exec_fault);
|
||||
|
||||
mmu_seq = kvm->mmu_invalidate_seq;
|
||||
/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
|
||||
smp_rmb();
|
||||
|
||||
ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
|
||||
if (ret) {
|
||||
kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
|
||||
write_fault, exec_fault, false);
|
||||
return ret;
|
||||
}
|
||||
|
||||
writable = !(memslot->flags & KVM_MEM_READONLY);
|
||||
|
||||
if (nested)
|
||||
adjust_nested_fault_perms(nested, &prot, &writable);
|
||||
|
||||
if (writable)
|
||||
prot |= KVM_PGTABLE_PROT_W;
|
||||
|
||||
if (exec_fault ||
|
||||
(cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
|
||||
(!nested || kvm_s2_trans_executable(nested))))
|
||||
prot |= KVM_PGTABLE_PROT_X;
|
||||
|
||||
kvm_fault_lock(kvm);
|
||||
if (mmu_invalidate_retry(kvm, mmu_seq)) {
|
||||
ret = -EAGAIN;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
|
||||
__pfn_to_phys(pfn), prot,
|
||||
memcache, flags);
|
||||
|
||||
out_unlock:
|
||||
kvm_release_faultin_page(kvm, page, !!ret, writable);
|
||||
kvm_fault_unlock(kvm);
|
||||
|
||||
if (writable && !ret)
|
||||
mark_page_dirty_in_slot(kvm, memslot, gfn);
|
||||
|
||||
return ret != -EAGAIN ? ret : 0;
|
||||
}
|
||||
|
||||
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
struct kvm_s2_trans *nested,
|
||||
struct kvm_memory_slot *memslot, unsigned long hva,
|
||||
bool fault_is_perm)
|
||||
{
|
||||
int ret = 0;
|
||||
bool write_fault, writable, force_pte = false;
|
||||
bool topup_memcache;
|
||||
bool write_fault, writable;
|
||||
bool exec_fault, mte_allowed, is_vma_cacheable;
|
||||
bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
|
||||
unsigned long mmu_seq;
|
||||
|
@ -1495,28 +1614,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
|||
gfn_t gfn;
|
||||
kvm_pfn_t pfn;
|
||||
bool logging_active = memslot_is_logging(memslot);
|
||||
bool force_pte = logging_active;
|
||||
long vma_pagesize, fault_granule;
|
||||
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
|
||||
struct kvm_pgtable *pgt;
|
||||
struct page *page;
|
||||
vm_flags_t vm_flags;
|
||||
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
|
||||
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
|
||||
|
||||
if (fault_is_perm)
|
||||
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
|
||||
write_fault = kvm_is_write_fault(vcpu);
|
||||
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
|
||||
VM_BUG_ON(write_fault && exec_fault);
|
||||
|
||||
if (fault_is_perm && !write_fault && !exec_fault) {
|
||||
kvm_err("Unexpected L2 read permission error\n");
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
if (!is_protected_kvm_enabled())
|
||||
memcache = &vcpu->arch.mmu_page_cache;
|
||||
else
|
||||
memcache = &vcpu->arch.pkvm_memcache;
|
||||
VM_WARN_ON_ONCE(write_fault && exec_fault);
|
||||
|
||||
/*
|
||||
* Permission faults just need to update the existing leaf entry,
|
||||
|
@ -1524,17 +1634,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
|||
* only exception to this is when dirty logging is enabled at runtime
|
||||
* and a write fault needs to collapse a block entry into a table.
|
||||
*/
|
||||
if (!fault_is_perm || (logging_active && write_fault)) {
|
||||
int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
|
||||
|
||||
if (!is_protected_kvm_enabled())
|
||||
ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
|
||||
else
|
||||
ret = topup_hyp_memcache(memcache, min_pages);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
topup_memcache = !fault_is_perm || (logging_active && write_fault);
|
||||
ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* Let's check if we will get back a huge page backed by hugetlbfs, or
|
||||
|
@ -1548,16 +1651,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
|||
return -EFAULT;
|
||||
}
|
||||
|
||||
/*
|
||||
* logging_active is guaranteed to never be true for VM_PFNMAP
|
||||
* memslots.
|
||||
*/
|
||||
if (logging_active) {
|
||||
force_pte = true;
|
||||
if (force_pte)
|
||||
vma_shift = PAGE_SHIFT;
|
||||
} else {
|
||||
else
|
||||
vma_shift = get_vma_page_shift(vma, hva);
|
||||
}
|
||||
|
||||
switch (vma_shift) {
|
||||
#ifndef __PAGETABLE_PMD_FOLDED
|
||||
|
@ -1609,7 +1706,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
|||
max_map_size = PAGE_SIZE;
|
||||
|
||||
force_pte = (max_map_size == PAGE_SIZE);
|
||||
vma_pagesize = min(vma_pagesize, (long)max_map_size);
|
||||
vma_pagesize = min_t(long, vma_pagesize, max_map_size);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1642,7 +1739,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
|||
* Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
|
||||
* with the smp_wmb() in kvm_mmu_invalidate_end().
|
||||
*/
|
||||
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
|
||||
mmu_seq = kvm->mmu_invalidate_seq;
|
||||
mmap_read_unlock(current->mm);
|
||||
|
||||
pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
|
||||
|
@ -1698,24 +1795,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
|||
if (exec_fault && s2_force_noncacheable)
|
||||
return -ENOEXEC;
|
||||
|
||||
/*
|
||||
* Potentially reduce shadow S2 permissions to match the guest's own
|
||||
* S2. For exec faults, we'd only reach this point if the guest
|
||||
* actually allowed it (see kvm_s2_handle_perm_fault).
|
||||
*
|
||||
* Also encode the level of the original translation in the SW bits
|
||||
* of the leaf entry as a proxy for the span of that translation.
|
||||
* This will be retrieved on TLB invalidation from the guest and
|
||||
* used to limit the invalidation scope if a TTL hint or a range
|
||||
* isn't provided.
|
||||
*/
|
||||
if (nested) {
|
||||
writable &= kvm_s2_trans_writable(nested);
|
||||
if (!kvm_s2_trans_readable(nested))
|
||||
prot &= ~KVM_PGTABLE_PROT_R;
|
||||
|
||||
prot |= kvm_encode_nested_level(nested);
|
||||
}
|
||||
if (nested)
|
||||
adjust_nested_fault_perms(nested, &prot, &writable);
|
||||
|
||||
kvm_fault_lock(kvm);
|
||||
pgt = vcpu->arch.hw_mmu->pgt;
|
||||
|
@ -1981,8 +2062,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
|
|||
goto out_unlock;
|
||||
}
|
||||
|
||||
ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
|
||||
esr_fsc_is_permission_fault(esr));
|
||||
VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
|
||||
!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
|
||||
|
||||
if (kvm_slot_has_gmem(memslot))
|
||||
ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
|
||||
esr_fsc_is_permission_fault(esr));
|
||||
else
|
||||
ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
|
||||
esr_fsc_is_permission_fault(esr));
|
||||
if (ret == 0)
|
||||
ret = 1;
|
||||
out:
|
||||
|
@ -2214,6 +2302,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
|
|||
if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
|
||||
return -EFAULT;
|
||||
|
||||
/*
|
||||
* Only support guest_memfd backed memslots with mappable memory, since
|
||||
* there aren't any CoCo VMs that support only private memory on arm64.
|
||||
*/
|
||||
if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
|
||||
return -EINVAL;
|
||||
|
||||
hva = new->userspace_addr;
|
||||
reg_end = hva + (new->npages << PAGE_SHIFT);
|
||||
|
||||
|
|
|
@ -1172,8 +1172,9 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
|
|||
return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
|
||||
}
|
||||
|
||||
static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
|
||||
static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
|
||||
{
|
||||
struct kvm_memory_slot *memslot;
|
||||
bool write_fault, writable;
|
||||
unsigned long mmu_seq;
|
||||
struct vncr_tlb *vt;
|
||||
|
@ -1216,10 +1217,25 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
|
|||
smp_rmb();
|
||||
|
||||
gfn = vt->wr.pa >> PAGE_SHIFT;
|
||||
pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page);
|
||||
if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
|
||||
memslot = gfn_to_memslot(vcpu->kvm, gfn);
|
||||
if (!memslot)
|
||||
return -EFAULT;
|
||||
|
||||
*is_gmem = kvm_slot_has_gmem(memslot);
|
||||
if (!*is_gmem) {
|
||||
pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
|
||||
&writable, &page);
|
||||
if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
|
||||
return -EFAULT;
|
||||
} else {
|
||||
ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
|
||||
if (ret) {
|
||||
kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
|
||||
write_fault, false, false);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
|
||||
if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
|
||||
return -EAGAIN;
|
||||
|
@ -1292,23 +1308,36 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
|
|||
if (esr_fsc_is_permission_fault(esr)) {
|
||||
inject_vncr_perm(vcpu);
|
||||
} else if (esr_fsc_is_translation_fault(esr)) {
|
||||
bool valid;
|
||||
bool valid, is_gmem = false;
|
||||
int ret;
|
||||
|
||||
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
|
||||
valid = kvm_vncr_tlb_lookup(vcpu);
|
||||
|
||||
if (!valid)
|
||||
ret = kvm_translate_vncr(vcpu);
|
||||
ret = kvm_translate_vncr(vcpu, &is_gmem);
|
||||
else
|
||||
ret = -EPERM;
|
||||
|
||||
switch (ret) {
|
||||
case -EAGAIN:
|
||||
case -ENOMEM:
|
||||
/* Let's try again... */
|
||||
break;
|
||||
case -ENOMEM:
|
||||
/*
|
||||
* For guest_memfd, this indicates that it failed to
|
||||
* create a folio to back the memory. Inform userspace.
|
||||
*/
|
||||
if (is_gmem)
|
||||
return 0;
|
||||
/* Otherwise, let's try again... */
|
||||
break;
|
||||
case -EFAULT:
|
||||
case -EIO:
|
||||
case -EHWPOISON:
|
||||
if (is_gmem)
|
||||
return 0;
|
||||
fallthrough;
|
||||
case -EINVAL:
|
||||
case -ENOENT:
|
||||
case -EACCES:
|
||||
|
|
|
@ -145,7 +145,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
|
|||
KVM_X86_OP_OPTIONAL(get_untagged_addr)
|
||||
KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
|
||||
KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
|
||||
KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
|
||||
KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level)
|
||||
KVM_X86_OP_OPTIONAL(gmem_invalidate)
|
||||
|
||||
#undef KVM_X86_OP
|
||||
|
|
|
@ -1922,7 +1922,7 @@ struct kvm_x86_ops {
|
|||
void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
|
||||
int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
|
||||
void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
|
||||
int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
|
||||
int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
|
||||
};
|
||||
|
||||
struct kvm_x86_nested_ops {
|
||||
|
@ -2276,10 +2276,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
|
|||
int tdp_max_root_level, int tdp_huge_page_level);
|
||||
|
||||
|
||||
#ifdef CONFIG_KVM_PRIVATE_MEM
|
||||
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
|
||||
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
|
||||
#else
|
||||
#define kvm_arch_has_private_mem(kvm) false
|
||||
#endif
|
||||
|
||||
#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
|
||||
|
|
|
@ -46,8 +46,8 @@ config KVM_X86
|
|||
select HAVE_KVM_PM_NOTIFIER if PM
|
||||
select KVM_GENERIC_HARDWARE_ENABLING
|
||||
select KVM_GENERIC_PRE_FAULT_MEMORY
|
||||
select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
|
||||
select KVM_WERROR if WERROR
|
||||
select KVM_GUEST_MEMFD if X86_64
|
||||
|
||||
config KVM
|
||||
tristate "Kernel-based Virtual Machine (KVM) support"
|
||||
|
@ -74,7 +74,7 @@ config KVM_WERROR
|
|||
# FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning.
|
||||
# Building KVM with -Werror and KASAN is still doable via enabling
|
||||
# the kernel-wide WERROR=y.
|
||||
depends on KVM && ((EXPERT && !KASAN) || WERROR)
|
||||
depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR)
|
||||
help
|
||||
Add -Werror to the build flags for KVM.
|
||||
|
||||
|
@ -83,7 +83,8 @@ config KVM_WERROR
|
|||
config KVM_SW_PROTECTED_VM
|
||||
bool "Enable support for KVM software-protected VMs"
|
||||
depends on EXPERT
|
||||
depends on KVM && X86_64
|
||||
depends on KVM_X86 && X86_64
|
||||
select KVM_GENERIC_MEMORY_ATTRIBUTES
|
||||
help
|
||||
Enable support for KVM software-protected VMs. Currently, software-
|
||||
protected VMs are purely a development and testing vehicle for
|
||||
|
@ -95,8 +96,6 @@ config KVM_SW_PROTECTED_VM
|
|||
config KVM_INTEL
|
||||
tristate "KVM for Intel (and compatible) processors support"
|
||||
depends on KVM && IA32_FEAT_CTL
|
||||
select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
|
||||
select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
|
||||
help
|
||||
Provides support for KVM on processors equipped with Intel's VT
|
||||
extensions, a.k.a. Virtual Machine Extensions (VMX).
|
||||
|
@ -135,6 +134,8 @@ config KVM_INTEL_TDX
|
|||
bool "Intel Trust Domain Extensions (TDX) support"
|
||||
default y
|
||||
depends on INTEL_TDX_HOST
|
||||
select KVM_GENERIC_MEMORY_ATTRIBUTES
|
||||
select HAVE_KVM_ARCH_GMEM_POPULATE
|
||||
help
|
||||
Provides support for launching Intel Trust Domain Extensions (TDX)
|
||||
confidential VMs on Intel processors.
|
||||
|
@ -157,9 +158,10 @@ config KVM_AMD_SEV
|
|||
depends on KVM_AMD && X86_64
|
||||
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
|
||||
select ARCH_HAS_CC_PLATFORM
|
||||
select KVM_GENERIC_PRIVATE_MEM
|
||||
select KVM_GENERIC_MEMORY_ATTRIBUTES
|
||||
select HAVE_KVM_ARCH_GMEM_PREPARE
|
||||
select HAVE_KVM_ARCH_GMEM_INVALIDATE
|
||||
select HAVE_KVM_ARCH_GMEM_POPULATE
|
||||
help
|
||||
Provides support for launching encrypted VMs which use Secure
|
||||
Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
|
||||
|
@ -169,7 +171,7 @@ config KVM_AMD_SEV
|
|||
config KVM_IOAPIC
|
||||
bool "I/O APIC, PIC, and PIT emulation"
|
||||
default y
|
||||
depends on KVM
|
||||
depends on KVM_X86
|
||||
help
|
||||
Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
|
||||
for full in-kernel APIC emulation.
|
||||
|
@ -179,7 +181,7 @@ config KVM_IOAPIC
|
|||
config KVM_SMM
|
||||
bool "System Management Mode emulation"
|
||||
default y
|
||||
depends on KVM
|
||||
depends on KVM_X86
|
||||
help
|
||||
Provides support for KVM to emulate System Management Mode (SMM)
|
||||
in virtual machines. This can be used by the virtual machine
|
||||
|
@ -189,7 +191,7 @@ config KVM_SMM
|
|||
|
||||
config KVM_HYPERV
|
||||
bool "Support for Microsoft Hyper-V emulation"
|
||||
depends on KVM
|
||||
depends on KVM_X86
|
||||
default y
|
||||
help
|
||||
Provides KVM support for emulating Microsoft Hyper-V. This allows KVM
|
||||
|
@ -203,7 +205,7 @@ config KVM_HYPERV
|
|||
|
||||
config KVM_XEN
|
||||
bool "Support for Xen hypercall interface"
|
||||
depends on KVM
|
||||
depends on KVM_X86
|
||||
help
|
||||
Provides KVM support for the hosting Xen HVM guests and
|
||||
passing Xen hypercalls to userspace.
|
||||
|
@ -213,7 +215,7 @@ config KVM_XEN
|
|||
config KVM_PROVE_MMU
|
||||
bool "Prove KVM MMU correctness"
|
||||
depends on DEBUG_KERNEL
|
||||
depends on KVM
|
||||
depends on KVM_X86
|
||||
depends on EXPERT
|
||||
help
|
||||
Enables runtime assertions in KVM's MMU that are too costly to enable
|
||||
|
@ -228,7 +230,7 @@ config KVM_EXTERNAL_WRITE_TRACKING
|
|||
|
||||
config KVM_MAX_NR_VCPUS
|
||||
int "Maximum number of vCPUs per KVM guest"
|
||||
depends on KVM
|
||||
depends on KVM_X86
|
||||
range 1024 4096
|
||||
default 4096 if MAXSMP
|
||||
default 1024
|
||||
|
|
|
@ -3285,12 +3285,72 @@ out:
|
|||
return level;
|
||||
}
|
||||
|
||||
static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
|
||||
const struct kvm_memory_slot *slot,
|
||||
gfn_t gfn, int max_level, bool is_private)
|
||||
static u8 kvm_max_level_for_order(int order)
|
||||
{
|
||||
BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
|
||||
|
||||
KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
|
||||
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
|
||||
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
|
||||
|
||||
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
|
||||
return PG_LEVEL_1G;
|
||||
|
||||
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
|
||||
return PG_LEVEL_2M;
|
||||
|
||||
return PG_LEVEL_4K;
|
||||
}
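For intuition, a standalone sketch of the same order-to-level mapping, assuming x86 with 4KiB base pages (where KVM_HPAGE_GFN_SHIFT() works out to 9 for 2MiB and 18 for 1GiB); the constants below are illustrative stand-ins, not the kernel macros:

#include <stdio.h>

#define ORDER_2M 9	/* 2MiB / 4KiB = 1 << 9 pages  */
#define ORDER_1G 18	/* 1GiB / 4KiB = 1 << 18 pages */

static const char *max_level_for_order(int order)
{
	if (order >= ORDER_1G)
		return "PG_LEVEL_1G";
	if (order >= ORDER_2M)
		return "PG_LEVEL_2M";
	return "PG_LEVEL_4K";
}

int main(void)
{
	const int orders[] = { 0, 4, 9, 18 };

	/* A guest_memfd folio of order 9 or more can be mapped at 2MiB, etc. */
	for (int i = 0; i < 4; i++)
		printf("order %2d -> %s\n", orders[i], max_level_for_order(orders[i]));
	return 0;
}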
|
||||
|
||||
static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
|
||||
const struct kvm_memory_slot *slot, gfn_t gfn,
|
||||
bool is_private)
|
||||
{
|
||||
u8 max_level, coco_level;
|
||||
kvm_pfn_t pfn;
|
||||
|
||||
/* For faults, use the gmem information that was resolved earlier. */
|
||||
if (fault) {
|
||||
pfn = fault->pfn;
|
||||
max_level = fault->max_level;
|
||||
} else {
|
||||
/* TODO: Call into guest_memfd once hugepages are supported. */
|
||||
WARN_ONCE(1, "Get pfn+order from guest_memfd");
|
||||
pfn = KVM_PFN_ERR_FAULT;
|
||||
max_level = PG_LEVEL_4K;
|
||||
}
|
||||
|
||||
if (max_level == PG_LEVEL_4K)
|
||||
return max_level;
|
||||
|
||||
/*
|
||||
* CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
|
||||
* restrictions. A return of '0' means "no additional restrictions", to
|
||||
* allow for using an optional "ret0" static call.
|
||||
*/
|
||||
coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private);
|
||||
if (coco_level)
|
||||
max_level = min(max_level, coco_level);
|
||||
|
||||
return max_level;
|
||||
}
|
||||
|
||||
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
|
||||
const struct kvm_memory_slot *slot, gfn_t gfn)
|
||||
{
|
||||
struct kvm_lpage_info *linfo;
|
||||
int host_level;
|
||||
int host_level, max_level;
|
||||
bool is_private;
|
||||
|
||||
lockdep_assert_held(&kvm->mmu_lock);
|
||||
|
||||
if (fault) {
|
||||
max_level = fault->max_level;
|
||||
is_private = fault->is_private;
|
||||
} else {
|
||||
max_level = PG_LEVEL_NUM;
|
||||
is_private = kvm_mem_is_private(kvm, gfn);
|
||||
}
|
||||
|
||||
max_level = min(max_level, max_huge_page_level);
|
||||
for ( ; max_level > PG_LEVEL_4K; max_level--) {
|
||||
|
@ -3299,25 +3359,17 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
|
|||
break;
|
||||
}
|
||||
|
||||
if (is_private)
|
||||
return max_level;
|
||||
|
||||
if (max_level == PG_LEVEL_4K)
|
||||
return PG_LEVEL_4K;
|
||||
|
||||
host_level = host_pfn_mapping_level(kvm, gfn, slot);
|
||||
if (is_private || kvm_memslot_is_gmem_only(slot))
|
||||
host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn,
|
||||
is_private);
|
||||
else
|
||||
host_level = host_pfn_mapping_level(kvm, gfn, slot);
|
||||
return min(host_level, max_level);
|
||||
}
|
||||
|
||||
int kvm_mmu_max_mapping_level(struct kvm *kvm,
|
||||
const struct kvm_memory_slot *slot, gfn_t gfn)
|
||||
{
|
||||
bool is_private = kvm_slot_can_be_private(slot) &&
|
||||
kvm_mem_is_private(kvm, gfn);
|
||||
|
||||
return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
|
||||
}
|
||||
|
||||
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
{
|
||||
struct kvm_memory_slot *slot = fault->slot;
|
||||
|
@ -3338,9 +3390,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
|
|||
* Enforce the iTLB multihit workaround after capturing the requested
|
||||
* level, which will be used to do precise, accurate accounting.
|
||||
*/
|
||||
fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
|
||||
fault->gfn, fault->max_level,
|
||||
fault->is_private);
|
||||
fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
|
||||
fault->slot, fault->gfn);
|
||||
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
|
||||
return;
|
||||
|
||||
|
@ -4503,42 +4554,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
|
|||
vcpu->stat.pf_fixed++;
|
||||
}
|
||||
|
||||
static inline u8 kvm_max_level_for_order(int order)
|
||||
{
|
||||
BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
|
||||
|
||||
KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
|
||||
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
|
||||
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
|
||||
|
||||
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
|
||||
return PG_LEVEL_1G;
|
||||
|
||||
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
|
||||
return PG_LEVEL_2M;
|
||||
|
||||
return PG_LEVEL_4K;
|
||||
}
|
||||
|
||||
static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
|
||||
u8 max_level, int gmem_order)
|
||||
{
|
||||
u8 req_max_level;
|
||||
|
||||
if (max_level == PG_LEVEL_4K)
|
||||
return PG_LEVEL_4K;
|
||||
|
||||
max_level = min(kvm_max_level_for_order(gmem_order), max_level);
|
||||
if (max_level == PG_LEVEL_4K)
|
||||
return PG_LEVEL_4K;
|
||||
|
||||
req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
|
||||
if (req_max_level)
|
||||
max_level = min(max_level, req_max_level);
|
||||
|
||||
return max_level;
|
||||
}
|
||||
|
||||
static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
|
||||
struct kvm_page_fault *fault, int r)
|
||||
{
|
||||
|
@ -4546,12 +4561,12 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
|
|||
r == RET_PF_RETRY, fault->map_writable);
|
||||
}
|
||||
|
||||
static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
|
||||
struct kvm_page_fault *fault)
|
||||
static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
|
||||
struct kvm_page_fault *fault)
|
||||
{
|
||||
int max_order, r;
|
||||
|
||||
if (!kvm_slot_can_be_private(fault->slot)) {
|
||||
if (!kvm_slot_has_gmem(fault->slot)) {
|
||||
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
@ -4564,8 +4579,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
|
|||
}
|
||||
|
||||
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
|
||||
fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
|
||||
fault->max_level, max_order);
|
||||
fault->max_level = kvm_max_level_for_order(max_order);
|
||||
|
||||
return RET_PF_CONTINUE;
|
||||
}
|
||||
|
@ -4575,8 +4589,8 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
|
|||
{
|
||||
unsigned int foll = fault->write ? FOLL_WRITE : 0;
|
||||
|
||||
if (fault->is_private)
|
||||
return kvm_mmu_faultin_pfn_private(vcpu, fault);
|
||||
if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot))
|
||||
return kvm_mmu_faultin_pfn_gmem(vcpu, fault);
|
||||
|
||||
foll |= FOLL_NOWAIT;
|
||||
fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
|
||||
|
@ -7165,7 +7179,7 @@ restart:
|
|||
* mapping if the indirect sp has level = 1.
|
||||
*/
|
||||
if (sp->role.direct &&
|
||||
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
|
||||
sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
|
||||
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
|
||||
|
||||
if (kvm_available_flush_remote_tlbs_range())
|
||||
|
|
|
@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
|
|||
return r;
|
||||
}
|
||||
|
||||
int kvm_mmu_max_mapping_level(struct kvm *kvm,
|
||||
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
|
||||
const struct kvm_memory_slot *slot, gfn_t gfn);
|
||||
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
|
||||
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
|
||||
|
|
|
@ -1813,7 +1813,7 @@ retry:
|
|||
if (iter.gfn < start || iter.gfn >= end)
|
||||
continue;
|
||||
|
||||
max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
|
||||
max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
|
||||
if (max_mapping_level < iter.level)
|
||||
continue;
|
||||
|
||||
|
|
|
@ -2361,7 +2361,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
|
|||
mutex_lock(&kvm->slots_lock);
|
||||
|
||||
memslot = gfn_to_memslot(kvm, params.gfn_start);
|
||||
if (!kvm_slot_can_be_private(memslot)) {
|
||||
if (!kvm_slot_has_gmem(memslot)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
@ -4715,7 +4715,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
|
|||
}
|
||||
|
||||
slot = gfn_to_memslot(kvm, gfn);
|
||||
if (!kvm_slot_can_be_private(slot)) {
|
||||
if (!kvm_slot_has_gmem(slot)) {
|
||||
pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
|
||||
gpa);
|
||||
return;
|
||||
|
@ -4943,7 +4943,7 @@ next_pfn:
|
|||
}
|
||||
}
|
||||
|
||||
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
|
||||
int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
|
||||
{
|
||||
int level, rc;
|
||||
bool assigned;
|
||||
|
|
|
@ -5180,7 +5180,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
|
|||
|
||||
.gmem_prepare = sev_gmem_prepare,
|
||||
.gmem_invalidate = sev_gmem_invalidate,
|
||||
.private_max_mapping_level = sev_private_max_mapping_level,
|
||||
.gmem_max_mapping_level = sev_gmem_max_mapping_level,
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
|
@ -866,7 +866,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
|
|||
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
|
||||
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
|
||||
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
|
||||
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
|
||||
int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
|
||||
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
|
||||
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
|
||||
#else
|
||||
|
@ -895,7 +895,7 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in
|
|||
return 0;
|
||||
}
|
||||
static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
|
||||
static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
|
||||
static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -831,10 +831,11 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
|
|||
return tdx_vcpu_ioctl(vcpu, argp);
|
||||
}
|
||||
|
||||
static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
|
||||
static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
|
||||
bool is_private)
|
||||
{
|
||||
if (is_td(kvm))
|
||||
return tdx_gmem_private_max_mapping_level(kvm, pfn);
|
||||
return tdx_gmem_max_mapping_level(kvm, pfn, is_private);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1005,7 +1006,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
|
|||
.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
|
||||
.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
|
||||
|
||||
.private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
|
||||
.gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level)
|
||||
};
|
||||
|
||||
struct kvm_x86_init_ops vt_init_ops __initdata = {
|
||||
|
|
|
@ -3318,8 +3318,11 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
|
|||
return ret;
|
||||
}
|
||||
|
||||
int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
|
||||
int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
|
||||
{
|
||||
if (!is_private)
|
||||
return 0;
|
||||
|
||||
return PG_LEVEL_4K;
|
||||
}
|
||||
|
||||
|
|
|
@ -153,7 +153,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
|
|||
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
|
||||
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
|
||||
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
|
||||
int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
|
||||
int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
|
||||
#endif
|
||||
|
||||
#endif /* __KVM_X86_VMX_X86_OPS_H */
|
||||
|
|
|
@ -13521,6 +13521,16 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
|
||||
|
||||
#ifdef CONFIG_KVM_GUEST_MEMFD
|
||||
/*
|
||||
* KVM doesn't yet support mmap() on guest_memfd for VMs with private memory
|
||||
* (the private vs. shared tracking needs to be moved into guest_memfd).
|
||||
*/
|
||||
bool kvm_arch_supports_gmem_mmap(struct kvm *kvm)
|
||||
{
|
||||
return !kvm_arch_has_private_mem(kvm);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
|
||||
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
|
||||
{
|
||||
|
@ -13534,6 +13544,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
|
|||
kvm_x86_call(gmem_invalidate)(start, end);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
int kvm_spec_ctrl_test_value(u64 value)
|
||||
{
|
||||
|
|
|
@ -52,9 +52,10 @@
|
|||
/*
|
||||
* The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally
|
||||
* used in kvm, other bits are visible for userspace which are defined in
|
||||
* include/linux/kvm_h.
|
||||
* include/uapi/linux/kvm.h.
|
||||
*/
|
||||
#define KVM_MEMSLOT_INVALID (1UL << 16)
|
||||
#define KVM_MEMSLOT_INVALID (1UL << 16)
|
||||
#define KVM_MEMSLOT_GMEM_ONLY (1UL << 17)
|
||||
|
||||
/*
|
||||
* Bit 63 of the memslot generation number is an "update in-progress flag",
|
||||
|
@ -602,7 +603,7 @@ struct kvm_memory_slot {
|
|||
short id;
|
||||
u16 as_id;
|
||||
|
||||
#ifdef CONFIG_KVM_PRIVATE_MEM
|
||||
#ifdef CONFIG_KVM_GUEST_MEMFD
|
||||
struct {
|
||||
/*
|
||||
* Writes protected by kvm->slots_lock. Acquiring a
|
||||
|
@ -615,7 +616,7 @@ struct kvm_memory_slot {
|
|||
#endif
|
||||
};
|
||||
|
||||
static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
|
||||
static inline bool kvm_slot_has_gmem(const struct kvm_memory_slot *slot)
|
||||
{
|
||||
return slot && (slot->flags & KVM_MEM_GUEST_MEMFD);
|
||||
}
|
||||
|
@ -719,17 +720,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
|
|||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Arch code must define kvm_arch_has_private_mem if support for private memory
|
||||
* is enabled.
|
||||
*/
|
||||
#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
|
||||
#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
|
||||
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_KVM_GUEST_MEMFD
|
||||
bool kvm_arch_supports_gmem_mmap(struct kvm *kvm);
|
||||
#endif
|
||||
|
||||
#ifndef kvm_arch_has_readonly_mem
|
||||
static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
|
||||
{
|
||||
|
@ -860,7 +861,7 @@ struct kvm {
|
|||
struct notifier_block pm_notifier;
|
||||
#endif
|
||||
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
|
||||
/* Protected by slots_locks (for writes) and RCU (for reads) */
|
||||
/* Protected by slots_lock (for writes) and RCU (for reads) */
|
||||
struct xarray mem_attr_array;
|
||||
#endif
|
||||
char stats_id[KVM_STATS_NAME_SIZE];
|
||||
|
@ -2490,6 +2491,14 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
|
|||
vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
|
||||
}
|
||||
|
||||
static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
|
||||
{
|
||||
if (!IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
|
||||
return false;
|
||||
|
||||
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
|
||||
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
|
||||
{
|
||||
|
@ -2505,8 +2514,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
|
|||
|
||||
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
|
||||
kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
|
||||
return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
|
||||
}
|
||||
#else
|
||||
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
|
||||
|
@ -2515,7 +2523,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
|
|||
}
|
||||
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
|
||||
|
||||
#ifdef CONFIG_KVM_PRIVATE_MEM
|
||||
#ifdef CONFIG_KVM_GUEST_MEMFD
|
||||
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
|
||||
gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
|
||||
int *max_order);
|
||||
|
@ -2528,13 +2536,13 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
|
|||
KVM_BUG_ON(1, kvm);
|
||||
return -EIO;
|
||||
}
|
||||
#endif /* CONFIG_KVM_PRIVATE_MEM */
|
||||
#endif /* CONFIG_KVM_GUEST_MEMFD */
|
||||
|
||||
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
|
||||
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
|
||||
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
|
||||
/**
|
||||
* kvm_gmem_populate() - Populate/prepare a GPA range with guest data
|
||||
*
|
||||
|
|
|
@ -962,6 +962,7 @@ struct kvm_enable_cap {
|
|||
#define KVM_CAP_ARM_EL2_E2H0 241
|
||||
#define KVM_CAP_RISCV_MP_STATE_RESET 242
|
||||
#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
|
||||
#define KVM_CAP_GUEST_MEMFD_MMAP 244
|
||||
|
||||
struct kvm_irq_routing_irqchip {
|
||||
__u32 irqchip;
|
||||
|
@ -1598,6 +1599,7 @@ struct kvm_memory_attributes {
|
|||
#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
|
||||
|
||||
#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
|
||||
#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0)
|
||||
|
||||
struct kvm_create_guest_memfd {
|
||||
__u64 size;
|
||||
|
|
|
@ -174,6 +174,7 @@ TEST_GEN_PROGS_arm64 += arch_timer
|
|||
TEST_GEN_PROGS_arm64 += coalesced_io_test
|
||||
TEST_GEN_PROGS_arm64 += dirty_log_perf_test
|
||||
TEST_GEN_PROGS_arm64 += get-reg-list
|
||||
TEST_GEN_PROGS_arm64 += guest_memfd_test
|
||||
TEST_GEN_PROGS_arm64 += memslot_modification_stress_test
|
||||
TEST_GEN_PROGS_arm64 += memslot_perf_test
|
||||
TEST_GEN_PROGS_arm64 += mmu_stress_test
|
||||
|
|
|
@ -13,12 +13,16 @@
|
|||
|
||||
#include <linux/bitmap.h>
|
||||
#include <linux/falloc.h>
|
||||
#include <linux/sizes.h>
|
||||
#include <setjmp.h>
|
||||
#include <signal.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "kvm_util.h"
|
||||
#include "test_util.h"
|
||||
#include "ucall_common.h"
|
||||
|
||||
static void test_file_read_write(int fd)
|
||||
{
|
||||
|
@ -34,12 +38,83 @@ static void test_file_read_write(int fd)
|
|||
"pwrite on a guest_mem fd should fail");
|
||||
}
|
||||
|
||||
static void test_mmap(int fd, size_t page_size)
|
||||
static void test_mmap_supported(int fd, size_t page_size, size_t total_size)
|
||||
{
|
||||
const char val = 0xaa;
|
||||
char *mem;
|
||||
size_t i;
|
||||
int ret;
|
||||
|
||||
mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
|
||||
TEST_ASSERT(mem == MAP_FAILED, "Copy-on-write not allowed by guest_memfd.");
|
||||
|
||||
mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed.");
|
||||
|
||||
memset(mem, val, total_size);
|
||||
for (i = 0; i < total_size; i++)
|
||||
TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);
|
||||
|
||||
ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0,
|
||||
page_size);
|
||||
TEST_ASSERT(!ret, "fallocate the first page should succeed.");
|
||||
|
||||
for (i = 0; i < page_size; i++)
|
||||
TEST_ASSERT_EQ(READ_ONCE(mem[i]), 0x00);
|
||||
for (; i < total_size; i++)
|
||||
TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);
|
||||
|
||||
memset(mem, val, page_size);
|
||||
for (i = 0; i < total_size; i++)
|
||||
TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);
|
||||
|
||||
ret = munmap(mem, total_size);
|
||||
TEST_ASSERT(!ret, "munmap() should succeed.");
|
||||
}
|
||||
|
||||
static sigjmp_buf jmpbuf;
|
||||
void fault_sigbus_handler(int signum)
|
||||
{
|
||||
siglongjmp(jmpbuf, 1);
|
||||
}
|
||||
|
||||
static void test_fault_overflow(int fd, size_t page_size, size_t total_size)
|
||||
{
|
||||
struct sigaction sa_old, sa_new = {
|
||||
.sa_handler = fault_sigbus_handler,
|
||||
};
|
||||
size_t map_size = total_size * 4;
|
||||
const char val = 0xaa;
|
||||
char *mem;
|
||||
size_t i;
|
||||
int ret;
|
||||
|
||||
mem = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed.");
|
||||
|
||||
sigaction(SIGBUS, &sa_new, &sa_old);
|
||||
if (sigsetjmp(jmpbuf, 1) == 0) {
|
||||
memset(mem, 0xaa, map_size);
|
||||
TEST_ASSERT(false, "memset() should have triggered SIGBUS.");
|
||||
}
|
||||
sigaction(SIGBUS, &sa_old, NULL);
|
||||
|
||||
for (i = 0; i < total_size; i++)
|
||||
TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);
|
||||
|
||||
ret = munmap(mem, map_size);
|
||||
TEST_ASSERT(!ret, "munmap() should succeed.");
|
||||
}
|
||||
|
||||
static void test_mmap_not_supported(int fd, size_t page_size, size_t total_size)
|
||||
{
|
||||
char *mem;
|
||||
|
||||
mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
TEST_ASSERT_EQ(mem, MAP_FAILED);
|
||||
|
||||
mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
TEST_ASSERT_EQ(mem, MAP_FAILED);
|
||||
}
|
||||
|
||||
static void test_file_size(int fd, size_t page_size, size_t total_size)
|
||||
|
@ -120,80 +195,187 @@ static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size)
|
|||
}
|
||||
}
|
||||
|
||||
static void test_create_guest_memfd_invalid(struct kvm_vm *vm)
|
||||
static void test_create_guest_memfd_invalid_sizes(struct kvm_vm *vm,
|
||||
uint64_t guest_memfd_flags,
|
||||
size_t page_size)
|
||||
{
|
||||
size_t page_size = getpagesize();
|
||||
uint64_t flag;
|
||||
size_t size;
|
||||
int fd;
|
||||
|
||||
for (size = 1; size < page_size; size++) {
|
||||
fd = __vm_create_guest_memfd(vm, size, 0);
|
||||
TEST_ASSERT(fd == -1 && errno == EINVAL,
|
||||
fd = __vm_create_guest_memfd(vm, size, guest_memfd_flags);
|
||||
TEST_ASSERT(fd < 0 && errno == EINVAL,
|
||||
"guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL",
|
||||
size);
|
||||
}
|
||||
|
||||
for (flag = BIT(0); flag; flag <<= 1) {
|
||||
fd = __vm_create_guest_memfd(vm, page_size, flag);
|
||||
TEST_ASSERT(fd == -1 && errno == EINVAL,
|
||||
"guest_memfd() with flag '0x%lx' should fail with EINVAL",
|
||||
flag);
|
||||
}
|
||||
}
|
||||
|
||||
static void test_create_guest_memfd_multiple(struct kvm_vm *vm)
|
||||
{
|
||||
int fd1, fd2, ret;
|
||||
struct stat st1, st2;
|
||||
size_t page_size = getpagesize();
|
||||
|
||||
fd1 = __vm_create_guest_memfd(vm, 4096, 0);
|
||||
fd1 = __vm_create_guest_memfd(vm, page_size, 0);
|
||||
TEST_ASSERT(fd1 != -1, "memfd creation should succeed");
|
||||
|
||||
ret = fstat(fd1, &st1);
|
||||
TEST_ASSERT(ret != -1, "memfd fstat should succeed");
|
||||
TEST_ASSERT(st1.st_size == 4096, "memfd st_size should match requested size");
|
||||
TEST_ASSERT(st1.st_size == page_size, "memfd st_size should match requested size");
|
||||
|
||||
fd2 = __vm_create_guest_memfd(vm, 8192, 0);
|
||||
fd2 = __vm_create_guest_memfd(vm, page_size * 2, 0);
|
||||
TEST_ASSERT(fd2 != -1, "memfd creation should succeed");
|
||||
|
||||
ret = fstat(fd2, &st2);
|
||||
TEST_ASSERT(ret != -1, "memfd fstat should succeed");
|
||||
TEST_ASSERT(st2.st_size == 8192, "second memfd st_size should match requested size");
|
||||
TEST_ASSERT(st2.st_size == page_size * 2, "second memfd st_size should match requested size");
|
||||
|
||||
ret = fstat(fd1, &st1);
|
||||
TEST_ASSERT(ret != -1, "memfd fstat should succeed");
|
||||
TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size");
|
||||
TEST_ASSERT(st1.st_size == page_size, "first memfd st_size should still match requested size");
|
||||
TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers");
|
||||
|
||||
close(fd2);
|
||||
close(fd1);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags)
|
||||
{
|
||||
size_t page_size;
|
||||
size_t total_size;
|
||||
size_t page_size = getpagesize();
|
||||
uint64_t flag;
|
||||
int fd;
|
||||
struct kvm_vm *vm;
|
||||
|
||||
TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
|
||||
for (flag = BIT(0); flag; flag <<= 1) {
|
||||
fd = __vm_create_guest_memfd(vm, page_size, flag);
|
||||
if (flag & valid_flags) {
|
||||
TEST_ASSERT(fd >= 0,
|
||||
"guest_memfd() with flag '0x%lx' should succeed",
|
||||
flag);
|
||||
close(fd);
|
||||
} else {
|
||||
TEST_ASSERT(fd < 0 && errno == EINVAL,
|
||||
"guest_memfd() with flag '0x%lx' should fail with EINVAL",
|
||||
flag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_guest_memfd(unsigned long vm_type)
|
||||
{
|
||||
uint64_t flags = 0;
|
||||
struct kvm_vm *vm;
|
||||
size_t total_size;
|
||||
size_t page_size;
|
||||
int fd;
|
||||
|
||||
page_size = getpagesize();
|
||||
total_size = page_size * 4;
|
||||
|
||||
vm = vm_create_barebones();
|
||||
vm = vm_create_barebones_type(vm_type);
|
||||
|
||||
if (vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP))
|
||||
flags |= GUEST_MEMFD_FLAG_MMAP;
|
||||
|
||||
test_create_guest_memfd_invalid(vm);
|
||||
test_create_guest_memfd_multiple(vm);
|
||||
test_create_guest_memfd_invalid_sizes(vm, flags, page_size);
|
||||
|
||||
fd = vm_create_guest_memfd(vm, total_size, 0);
|
||||
fd = vm_create_guest_memfd(vm, total_size, flags);
|
||||
|
||||
test_file_read_write(fd);
|
||||
test_mmap(fd, page_size);
|
||||
|
||||
if (flags & GUEST_MEMFD_FLAG_MMAP) {
|
||||
test_mmap_supported(fd, page_size, total_size);
|
||||
test_fault_overflow(fd, page_size, total_size);
|
||||
} else {
|
||||
test_mmap_not_supported(fd, page_size, total_size);
|
||||
}
|
||||
|
||||
test_file_size(fd, page_size, total_size);
|
||||
test_fallocate(fd, page_size, total_size);
|
||||
test_invalid_punch_hole(fd, page_size, total_size);
|
||||
|
||||
test_guest_memfd_flags(vm, flags);
|
||||
|
||||
close(fd);
|
||||
kvm_vm_free(vm);
|
||||
}
|
||||
|
||||
static void guest_code(uint8_t *mem, uint64_t size)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < size; i++)
|
||||
__GUEST_ASSERT(mem[i] == 0xaa,
|
||||
"Guest expected 0xaa at offset %lu, got 0x%x", i, mem[i]);
|
||||
|
||||
memset(mem, 0xff, size);
|
||||
GUEST_DONE();
|
||||
}
|
||||
|
||||
static void test_guest_memfd_guest(void)
|
||||
{
|
||||
/*
|
||||
* Skip the first 4gb and slot0. slot0 maps <1gb and is used to back
|
||||
* the guest's code, stack, and page tables, and low memory contains
|
||||
* the PCI hole and other MMIO regions that need to be avoided.
|
||||
*/
|
||||
const uint64_t gpa = SZ_4G;
|
||||
const int slot = 1;
|
||||
|
||||
struct kvm_vcpu *vcpu;
|
||||
struct kvm_vm *vm;
|
||||
uint8_t *mem;
|
||||
size_t size;
|
||||
int fd, i;
|
||||
|
||||
if (!kvm_has_cap(KVM_CAP_GUEST_MEMFD_MMAP))
|
||||
return;
|
||||
|
||||
vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1, guest_code);
|
||||
|
||||
TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP),
|
||||
"Default VM type should always support guest_memfd mmap()");
|
||||
|
||||
size = vm->page_size;
|
||||
fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP);
|
||||
vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0);
|
||||
|
||||
mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed");
|
||||
memset(mem, 0xaa, size);
|
||||
munmap(mem, size);
|
||||
|
||||
virt_pg_map(vm, gpa, gpa);
|
||||
vcpu_args_set(vcpu, 2, gpa, size);
|
||||
vcpu_run(vcpu);
|
||||
|
||||
TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
|
||||
|
||||
mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed");
|
||||
for (i = 0; i < size; i++)
|
||||
TEST_ASSERT_EQ(mem[i], 0xff);
|
||||
|
||||
close(fd);
|
||||
kvm_vm_free(vm);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
unsigned long vm_types, vm_type;
|
||||
|
||||
TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
|
||||
|
||||
/*
|
||||
* Not all architectures support KVM_CAP_VM_TYPES. However, those that
|
||||
* support guest_memfd have that support for the default VM type.
|
||||
*/
|
||||
vm_types = kvm_check_cap(KVM_CAP_VM_TYPES);
|
||||
if (!vm_types)
|
||||
vm_types = BIT(VM_TYPE_DEFAULT);
|
||||
|
||||
for_each_set_bit(vm_type, &vm_types, BITS_PER_TYPE(vm_types))
|
||||
test_guest_memfd(vm_type);
|
||||
|
||||
test_guest_memfd_guest();
|
||||
}
|
||||
|
|
|
@ -112,19 +112,18 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES
|
|||
depends on KVM_GENERIC_MMU_NOTIFIER
|
||||
bool
|
||||
|
||||
config KVM_PRIVATE_MEM
|
||||
config KVM_GUEST_MEMFD
|
||||
select XARRAY_MULTI
|
||||
bool
|
||||
|
||||
config KVM_GENERIC_PRIVATE_MEM
|
||||
select KVM_GENERIC_MEMORY_ATTRIBUTES
|
||||
select KVM_PRIVATE_MEM
|
||||
bool
|
||||
|
||||
config HAVE_KVM_ARCH_GMEM_PREPARE
|
||||
bool
|
||||
depends on KVM_PRIVATE_MEM
|
||||
depends on KVM_GUEST_MEMFD
|
||||
|
||||
config HAVE_KVM_ARCH_GMEM_INVALIDATE
|
||||
bool
|
||||
depends on KVM_PRIVATE_MEM
|
||||
depends on KVM_GUEST_MEMFD
|
||||
|
||||
config HAVE_KVM_ARCH_GMEM_POPULATE
|
||||
bool
|
||||
depends on KVM_GUEST_MEMFD
|
||||
|
|
|
@ -12,4 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
|
|||
kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
|
||||
kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
|
||||
kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
|
||||
kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o
|
||||
kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
|
||||
|
|
|
@ -312,7 +312,74 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
|
|||
return gfn - slot->base_gfn + slot->gmem.pgoff;
|
||||
}
|
||||
|
||||
static bool kvm_gmem_supports_mmap(struct inode *inode)
|
||||
{
|
||||
const u64 flags = (u64)inode->i_private;
|
||||
|
||||
return flags & GUEST_MEMFD_FLAG_MMAP;
|
||||
}
|
||||
|
||||
static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
|
||||
{
|
||||
struct inode *inode = file_inode(vmf->vma->vm_file);
|
||||
struct folio *folio;
|
||||
vm_fault_t ret = VM_FAULT_LOCKED;
|
||||
|
||||
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
folio = kvm_gmem_get_folio(inode, vmf->pgoff);
|
||||
if (IS_ERR(folio)) {
|
||||
int err = PTR_ERR(folio);
|
||||
|
||||
if (err == -EAGAIN)
|
||||
return VM_FAULT_RETRY;
|
||||
|
||||
return vmf_error(err);
|
||||
}
|
||||
|
||||
if (WARN_ON_ONCE(folio_test_large(folio))) {
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto out_folio;
|
||||
}
|
||||
|
||||
if (!folio_test_uptodate(folio)) {
|
||||
clear_highpage(folio_page(folio, 0));
|
||||
kvm_gmem_mark_prepared(folio);
|
||||
}
|
||||
|
||||
vmf->page = folio_file_page(folio, vmf->pgoff);
|
||||
|
||||
out_folio:
|
||||
if (ret != VM_FAULT_LOCKED) {
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct vm_operations_struct kvm_gmem_vm_ops = {
|
||||
.fault = kvm_gmem_fault_user_mapping,
|
||||
};
|
||||
|
||||
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
if (!kvm_gmem_supports_mmap(file_inode(file)))
|
||||
return -ENODEV;
|
||||
|
||||
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
|
||||
(VM_SHARED | VM_MAYSHARE)) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
vma->vm_ops = &kvm_gmem_vm_ops;
|
||||
|
||||
return 0;
|
||||
}
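As a hedged userspace-side illustration of the check above (gmem_fd and size are assumed to come from an earlier KVM_CREATE_GUEST_MEMFD call with GUEST_MEMFD_FLAG_MMAP):

#include <assert.h>
#include <stddef.h>
#include <sys/mman.h>

static void check_gmem_mmap_rules(int gmem_fd, size_t size)
{
	/* MAP_PRIVATE lacks VM_SHARED | VM_MAYSHARE, so kvm_gmem_mmap() rejects it. */
	assert(mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE, gmem_fd, 0) == MAP_FAILED);

	/* Shared mappings are accepted and fault pages in via kvm_gmem_fault_user_mapping(). */
	assert(mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, gmem_fd, 0) != MAP_FAILED);
}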
|
||||
|
||||
static struct file_operations kvm_gmem_fops = {
|
||||
.mmap = kvm_gmem_mmap,
|
||||
.open = generic_file_open,
|
||||
.release = kvm_gmem_release,
|
||||
.fallocate = kvm_gmem_fallocate,
|
||||
|
@ -391,6 +458,11 @@ static const struct inode_operations kvm_gmem_iops = {
|
|||
.setattr = kvm_gmem_setattr,
|
||||
};
|
||||
|
||||
bool __weak kvm_arch_supports_gmem_mmap(struct kvm *kvm)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
|
||||
{
|
||||
const char *anon_name = "[kvm-gmem]";
|
||||
|
@ -452,6 +524,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
|
|||
u64 flags = args->flags;
|
||||
u64 valid_flags = 0;
|
||||
|
||||
if (kvm_arch_supports_gmem_mmap(kvm))
|
||||
valid_flags |= GUEST_MEMFD_FLAG_MMAP;
|
||||
|
||||
if (flags & ~valid_flags)
|
||||
return -EINVAL;
|
||||
|
||||
|
@ -508,6 +583,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
|
|||
*/
|
||||
WRITE_ONCE(slot->gmem.file, file);
|
||||
slot->gmem.pgoff = start;
|
||||
if (kvm_gmem_supports_mmap(inode))
|
||||
slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
|
||||
|
||||
xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
|
||||
filemap_invalidate_unlock(inode->i_mapping);
|
||||
|
@ -627,7 +704,7 @@ out:
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
|
||||
|
||||
#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
|
||||
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
|
||||
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
|
||||
kvm_gmem_populate_cb post_populate, void *opaque)
|
||||
{
|
||||
|
@ -643,7 +720,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
|
|||
return -EINVAL;
|
||||
|
||||
slot = gfn_to_memslot(kvm, start_gfn);
|
||||
if (!kvm_slot_can_be_private(slot))
|
||||
if (!kvm_slot_has_gmem(slot))
|
||||
return -EINVAL;
|
||||
|
||||
file = kvm_gmem_get_file(slot);
|
||||
|
|
|
@ -331,7 +331,7 @@ void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
|
|||
* All current use cases for flushing the TLBs for a specific memslot
|
||||
* are related to dirty logging, and many do the TLB flush out of
|
||||
* mmu_lock. The interaction between the various operations on memslot
|
||||
* must be serialized by slots_locks to ensure the TLB flush from one
|
||||
* must be serialized by slots_lock to ensure the TLB flush from one
|
||||
* operation is observed by any other operation on the same memslot.
|
||||
*/
|
||||
lockdep_assert_held(&kvm->slots_lock);
|
||||
|
@ -1588,7 +1588,7 @@ static int check_memory_region_flags(struct kvm *kvm,
|
|||
{
|
||||
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
|
||||
|
||||
if (kvm_arch_has_private_mem(kvm))
|
||||
if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
|
||||
valid_flags |= KVM_MEM_GUEST_MEMFD;
|
||||
|
||||
/* Dirty logging private memory is not currently supported. */
|
||||
|
@ -4915,9 +4915,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
|
|||
case KVM_CAP_MEMORY_ATTRIBUTES:
|
||||
return kvm_supported_mem_attributes(kvm);
|
||||
#endif
|
||||
#ifdef CONFIG_KVM_PRIVATE_MEM
|
||||
#ifdef CONFIG_KVM_GUEST_MEMFD
|
||||
case KVM_CAP_GUEST_MEMFD:
|
||||
return !kvm || kvm_arch_has_private_mem(kvm);
|
||||
return 1;
|
||||
case KVM_CAP_GUEST_MEMFD_MMAP:
|
||||
return !kvm || kvm_arch_supports_gmem_mmap(kvm);
|
||||
#endif
|
||||
default:
|
||||
break;
|
||||
|
@ -5352,7 +5354,7 @@ static long kvm_vm_ioctl(struct file *filp,
|
|||
case KVM_GET_STATS_FD:
|
||||
r = kvm_vm_ioctl_get_stats_fd(kvm);
|
||||
break;
|
||||
#ifdef CONFIG_KVM_PRIVATE_MEM
|
||||
#ifdef CONFIG_KVM_GUEST_MEMFD
|
||||
case KVM_CREATE_GUEST_MEMFD: {
|
||||
struct kvm_create_guest_memfd guest_memfd;
|
||||
|
||||
|
|
|
@ -67,7 +67,7 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
|
|||
}
|
||||
#endif /* HAVE_KVM_PFNCACHE */
|
||||
|
||||
#ifdef CONFIG_KVM_PRIVATE_MEM
|
||||
#ifdef CONFIG_KVM_GUEST_MEMFD
|
||||
void kvm_gmem_init(struct module *module);
|
||||
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
|
||||
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
|
||||
|
@ -91,6 +91,6 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
|
|||
{
|
||||
WARN_ON_ONCE(1);
|
||||
}
|
||||
#endif /* CONFIG_KVM_PRIVATE_MEM */
|
||||
#endif /* CONFIG_KVM_GUEST_MEMFD */
|
||||
|
||||
#endif /* __KVM_MM_H__ */