From 1bcc3f87912779f66cdf1789a066046536ca6ccc Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 24 Sep 2025 10:42:55 -0700 Subject: [PATCH 01/46] KVM: selftests: Test prefault memory during concurrent memslot removal Expand the prefault memory selftest to add a regression test for a KVM bug where KVM's retry logic would result in (breakable) deadlock due to the memslot deletion waiting on prefaulting to release SRCU, and prefaulting waiting on the memslot to fully disappear (KVM uses a two-step process to delete memslots, and KVM x86 retries page faults if a to-be-deleted, a.k.a. INVALID, memslot is encountered). To exercise concurrent memslot removal, spawn a second thread to initiate memslot removal at roughly the same time as prefaulting. Test memslot removal for all testcases, i.e. don't limit concurrent removal to only the success case. There are essentially three prefault scenarios (so far) that are of interest: 1. Success 2. ENOENT due to no memslot 3. EAGAIN due to INVALID memslot For all intents and purposes, #1 and #2 are mutually exclusive, or rather, easier to test via separate testcases since writing to non-existent memory is trivial. But for #3, making it mutually exclusive with #1 _or_ #2 is actually more complex than testing memslot removal for all scenarios. The only requirement to let memslot removal coexist with other scenarios is a way to guarantee a stable result, e.g. that the "no memslot" test observes ENOENT, not EAGAIN, for the final checks. So, rather than make memslot removal mutually exclusive with the ENOENT scenario, simply restore the memslot and retry prefaulting. For the "no memslot" case, KVM_PRE_FAULT_MEMORY should be idempotent, i.e. should always fail with ENOENT regardless of how many times userspace attempts prefaulting. Pass in both the base GPA and the offset (instead of the "full" GPA) so that the worker can recreate the memslot.
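For reference, the userspace-visible contract being exercised reduces to the retry loop below. This is a minimal illustrative sketch, not the selftest code: vcpu_fd, gpa, and size stand in for the test's setup, and only the standard KVM_PRE_FAULT_MEMORY UAPI from <linux/kvm.h> is assumed.

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Prefault [gpa, gpa + size) for the vCPU.  KVM advances range.gpa and
     * shrinks range.size as bytes are prefaulted, so retrying with the same
     * struct resumes where the previous attempt left off.  EINTR means an
     * unrelated signal interrupted the ioctl(); EAGAIN means the ioctl()
     * raced with deletion of a memslot (the to-be-deleted slot was INVALID)
     * and is safe to retry once the memslot layout is stable again.
     */
    static int prefault_range(int vcpu_fd, __u64 gpa, __u64 size)
    {
        struct kvm_pre_fault_memory range = {
            .gpa = gpa,
            .size = size,
        };
        int ret;

        do {
            ret = ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range);
        } while (ret < 0 && (errno == EINTR || errno == EAGAIN));

        return ret < 0 ? -errno : 0;  /* -ENOENT if no memslot backs the range */
    }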
Signed-off-by: Yan Zhao Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20250924174255.2141847-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/pre_fault_memory_test.c | 131 +++++++++++++++--- 1 file changed, 114 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c index 0350a8896a2f..f04768c1d2e4 100644 --- a/tools/testing/selftests/kvm/pre_fault_memory_test.c +++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c @@ -10,6 +10,7 @@ #include #include #include +#include /* Arbitrarily chosen values */ #define TEST_SIZE (SZ_2M + PAGE_SIZE) @@ -30,18 +31,66 @@ static void guest_code(uint64_t base_gpa) GUEST_DONE(); } -static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 gpa, u64 size, - u64 left) +struct slot_worker_data { + struct kvm_vm *vm; + u64 gpa; + uint32_t flags; + bool worker_ready; + bool prefault_ready; + bool recreate_slot; +}; + +static void *delete_slot_worker(void *__data) +{ + struct slot_worker_data *data = __data; + struct kvm_vm *vm = data->vm; + + WRITE_ONCE(data->worker_ready, true); + + while (!READ_ONCE(data->prefault_ready)) + cpu_relax(); + + vm_mem_region_delete(vm, TEST_SLOT); + + while (!READ_ONCE(data->recreate_slot)) + cpu_relax(); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, data->gpa, + TEST_SLOT, TEST_NPAGES, data->flags); + + return NULL; +} + +static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset, + u64 size, u64 expected_left, bool private) { struct kvm_pre_fault_memory range = { - .gpa = gpa, + .gpa = base_gpa + offset, .size = size, .flags = 0, }; - u64 prev; + struct slot_worker_data data = { + .vm = vcpu->vm, + .gpa = base_gpa, + .flags = private ? KVM_MEM_GUEST_MEMFD : 0, + }; + bool slot_recreated = false; + pthread_t slot_worker; int ret, save_errno; + u64 prev; - do { + /* + * Concurrently delete (and recreate) the slot to test KVM's handling + * of a racing memslot deletion with prefaulting. + */ + pthread_create(&slot_worker, NULL, delete_slot_worker, &data); + + while (!READ_ONCE(data.worker_ready)) + cpu_relax(); + + WRITE_ONCE(data.prefault_ready, true); + + for (;;) { prev = range.size; ret = __vcpu_ioctl(vcpu, KVM_PRE_FAULT_MEMORY, &range); save_errno = errno; @@ -49,18 +98,65 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 gpa, u64 size, "%sexpecting range.size to change on %s", ret < 0 ? "not " : "", ret < 0 ? "failure" : "success"); - } while (ret >= 0 ? range.size : save_errno == EINTR); - TEST_ASSERT(range.size == left, - "Completed with %lld bytes left, expected %" PRId64, - range.size, left); + /* + * Immediately retry prefaulting if KVM was interrupted by an + * unrelated signal/event. + */ + if (ret < 0 && save_errno == EINTR) + continue; - if (left == 0) - __TEST_ASSERT_VM_VCPU_IOCTL(!ret, "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm); + /* + * Tell the worker to recreate the slot in order to complete + * prefaulting (if prefault didn't already succeed before the + * slot was deleted) and/or to prepare for the next testcase. + * Wait for the worker to exit so that the next invocation of + * prefaulting is guaranteed to complete (assuming no KVM bugs). + */ + if (!slot_recreated) { + WRITE_ONCE(data.recreate_slot, true); + pthread_join(slot_worker, NULL); + slot_recreated = true; + + /* + * Retry prefaulting to get a stable result, i.e. to + * avoid seeing random EAGAIN failures. 
Don't retry if + prefaulting already succeeded, as KVM disallows + prefaulting with size=0, i.e. blindly retrying would + result in test failures due to EINVAL. KVM should + always return success if all bytes are prefaulted, + i.e. there is no need to guard against EAGAIN being + returned. + */ + if (range.size) + continue; + } + + /* + * All done if there are no remaining bytes to prefault, or if + * prefaulting failed (EINTR was handled above, and EAGAIN due + * to prefaulting a memslot that's being actively deleted should + * be impossible since the memslot has already been recreated). + */ + if (!range.size || ret < 0) + break; + } + + TEST_ASSERT(range.size == expected_left, + "Completed with %llu bytes left, expected %lu", + range.size, expected_left); + + /* + * Assert success if prefaulting the entire range should succeed, i.e. + * complete with no bytes remaining. Otherwise prefaulting should have + * failed due to ENOENT (due to RET_PF_EMULATE for emulated MMIO when + * no memslot exists). + */ + if (!expected_left) + TEST_ASSERT_VM_VCPU_IOCTL(!ret, KVM_PRE_FAULT_MEMORY, ret, vcpu->vm); else - /* No memory slot causes RET_PF_EMULATE. it results in -ENOENT. */ - __TEST_ASSERT_VM_VCPU_IOCTL(ret && save_errno == ENOENT, - "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm); + TEST_ASSERT_VM_VCPU_IOCTL(ret && save_errno == ENOENT, + KVM_PRE_FAULT_MEMORY, ret, vcpu->vm); } static void __test_pre_fault_memory(unsigned long vm_type, bool private) @@ -97,9 +193,10 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private) if (private) vm_mem_set_private(vm, guest_test_phys_mem, TEST_SIZE); - pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, 0); - pre_fault_memory(vcpu, guest_test_phys_mem + SZ_2M, PAGE_SIZE * 2, PAGE_SIZE); - pre_fault_memory(vcpu, guest_test_phys_mem + TEST_SIZE, PAGE_SIZE, PAGE_SIZE); + + pre_fault_memory(vcpu, guest_test_phys_mem, 0, SZ_2M, 0, private); + pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); + pre_fault_memory(vcpu, guest_test_phys_mem, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); vcpu_args_set(vcpu, 1, guest_test_virt_mem); vcpu_run(vcpu); From 034417c1439a533a315562a57bd340d963eaac6b Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 10 Oct 2025 08:52:39 +0800 Subject: [PATCH 02/46] KVM: x86/pmu: Don't try to get perf capabilities for hybrid CPUs Explicitly zero kvm_host_pmu instead of attempting to get the perf PMU capabilities when running on a hybrid CPU to avoid running afoul of perf's sanity check. ------------[ cut here ]------------ WARNING: arch/x86/events/core.c:3089 at perf_get_x86_pmu_capability+0xd/0xc0, Call Trace: kvm_x86_vendor_init+0x1b0/0x1a40 [kvm] vmx_init+0xdb/0x260 [kvm_intel] vt_init+0x12/0x9d0 [kvm_intel] do_one_initcall+0x60/0x3f0 do_init_module+0x97/0x2b0 load_module+0x2d08/0x2e30 init_module_from_file+0x96/0xe0 idempotent_init_module+0x117/0x330 __x64_sys_finit_module+0x73/0xe0 Always read the capabilities for non-hybrid CPUs, i.e. don't entirely revert to reading capabilities if and only if KVM wants to use a PMU, as it may be useful to have the host PMU capabilities available, e.g. if only for debug. Reported-by: Chaitanya Kumar Borah Closes: https://lore.kernel.org/all/70b64347-2aca-4511-af78-a767d5fa8226@intel.com/ Fixes: 51f34b1e650f ("KVM: x86/pmu: Snapshot host (i.e.
perf's) reported PMU capabilities") Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Link: https://lore.kernel.org/r/20251010005239.146953-1-dapeng1.mi@linux.intel.com [sean: rework changelog, call out hybrid CPUs in shortlog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 40ac4cb44ed2..487ad19a236e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -108,16 +108,18 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; - perf_get_x86_pmu_capability(&kvm_host_pmu); - /* * Hybrid PMUs don't play nice with virtualization without careful * configuration by userspace, and KVM's APIs for reporting supported * vPMU features do not account for hybrid PMUs. Disable vPMU support * for hybrid PMUs until KVM gains a way to let userspace opt-in. */ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { enable_pmu = false; + memset(&kvm_host_pmu, 0, sizeof(kvm_host_pmu)); + } else { + perf_get_x86_pmu_capability(&kvm_host_pmu); + } if (enable_pmu) { /* From d2042d8f96ddefdeee823737f813efe3ab4b4e8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:54 -0700 Subject: [PATCH 03/46] KVM: Rework KVM_CAP_GUEST_MEMFD_MMAP into KVM_CAP_GUEST_MEMFD_FLAGS Rework the not-yet-released KVM_CAP_GUEST_MEMFD_MMAP into a more generic KVM_CAP_GUEST_MEMFD_FLAGS capability so that adding new flags doesn't require a new capability, and so that developers aren't tempted to bundle multiple flags into a single capability. Note, kvm_vm_ioctl_check_extension_generic() can only return a 32-bit value, but that limitation can be easily circumvented by adding e.g. KVM_CAP_GUEST_MEMFD_FLAGS2 in the unlikely event guest_memfd supports more than 32 flags. Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-2-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 10 +++++++--- include/uapi/linux/kvm.h | 2 +- tools/testing/selftests/kvm/guest_memfd_test.c | 13 ++++++------- virt/kvm/kvm_main.c | 7 +++++-- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6ae24c5ca559..7ba92f2ced38 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6432,9 +6432,13 @@ most one mapping per page, i.e. binding multiple memory regions to a single guest_memfd range is not allowed (any number of memory regions can be bound to a single guest_memfd file, but the bound ranges must not overlap). -When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field -supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation -enables mmap() and faulting of guest_memfd memory to host userspace. +The capability KVM_CAP_GUEST_MEMFD_FLAGS enumerates the `flags` that can be +specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags: + + ============================ ================================================ + GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file + descriptor. 
+ ============================ ================================================ When the KVM MMU performs a PFN lookup to service a guest fault and the backing guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6efa98a57ec1..b1d52d0c56ec 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -962,7 +962,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 -#define KVM_CAP_GUEST_MEMFD_MMAP 244 +#define KVM_CAP_GUEST_MEMFD_FLAGS 244 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index b3ca6737f304..3e58bd496104 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -262,19 +262,17 @@ static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags) static void test_guest_memfd(unsigned long vm_type) { - uint64_t flags = 0; struct kvm_vm *vm; size_t total_size; size_t page_size; + uint64_t flags; int fd; page_size = getpagesize(); total_size = page_size * 4; vm = vm_create_barebones_type(vm_type); - - if (vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP)) - flags |= GUEST_MEMFD_FLAG_MMAP; + flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); test_create_guest_memfd_multiple(vm); test_create_guest_memfd_invalid_sizes(vm, flags, page_size); @@ -328,13 +326,14 @@ static void test_guest_memfd_guest(void) size_t size; int fd, i; - if (!kvm_has_cap(KVM_CAP_GUEST_MEMFD_MMAP)) + if (!kvm_check_cap(KVM_CAP_GUEST_MEMFD_FLAGS)) return; vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1, guest_code); - TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP), - "Default VM type should always support guest_memfd mmap()"); + TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_MMAP, + "Default VM type should support MMAP, supported flags = 0x%x", + vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); size = vm->page_size; fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 226faeaa8e56..e3a268757621 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4928,8 +4928,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return 1; - case KVM_CAP_GUEST_MEMFD_MMAP: - return !kvm || kvm_arch_supports_gmem_mmap(kvm); + case KVM_CAP_GUEST_MEMFD_FLAGS: + if (!kvm || kvm_arch_supports_gmem_mmap(kvm)) + return GUEST_MEMFD_FLAG_MMAP; + + return 0; #endif default: break; From fe2bf6234e947bf5544db6d386af1df2a8db80f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:55 -0700 Subject: [PATCH 04/46] KVM: guest_memfd: Add INIT_SHARED flag, reject user page faults if not set Add a guest_memfd flag to allow userspace to state that the underlying memory should be configured to be initialized as shared, and reject user page faults if the guest_memfd instance's memory isn't shared. Because KVM doesn't yet support in-place private<=>shared conversions, all guest_memfd memory effectively follows the initial state. Alternatively, KVM could deduce the initial state based on MMAP, which for all intents and purposes is what KVM currently does. 
However, implicitly deriving the default state based on MMAP will result in a messy ABI when support for in-place conversions is added. For x86 CoCo VMs, which don't yet support MMAP, memory is currently private by default (otherwise the memory would be unusable). If MMAP implies memory is shared by default, then the default state for CoCo VMs will vary based on MMAP, and from userspace's perspective, will change when in-place conversion support is added. I.e. to maintain guest<=>host ABI, userspace would need to immediately convert all memory from shared=>private, which is both ugly and inefficient. The inefficiency could be avoided by adding a flag to state that memory is _private_ by default, irrespective of MMAP, but that would lead to an equally messy and hard to document ABI. Bite the bullet and immediately add a flag to control the default state so that the effective behavior is explicit and straightforward. Fixes: 3d3a04fad25a ("KVM: Allow and advertise support for host mmap() on guest_memfd files") Cc: David Hildenbrand Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-3-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 5 +++++ include/uapi/linux/kvm.h | 3 ++- tools/testing/selftests/kvm/guest_memfd_test.c | 15 ++++++++++++--- virt/kvm/guest_memfd.c | 6 +++++- virt/kvm/kvm_main.c | 3 ++- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 7ba92f2ced38..754b662a453c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6438,6 +6438,11 @@ specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags: ============================ ================================================ GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file descriptor. + GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during + KVM_CREATE_GUEST_MEMFD (memory files created + without INIT_SHARED will be marked private). + Shared memory can be faulted into host userspace + page tables. Private memory cannot. 
============================ ================================================ When the KVM MMU performs a PFN lookup to service a guest fault and the backing diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b1d52d0c56ec..52f6000ab020 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1599,7 +1599,8 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) -#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) +#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) +#define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1) struct kvm_create_guest_memfd { __u64 size; diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 3e58bd496104..0de56ce3c4e2 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -239,8 +239,9 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) close(fd1); } -static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags) +static void test_guest_memfd_flags(struct kvm_vm *vm) { + uint64_t valid_flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); size_t page_size = getpagesize(); uint64_t flag; int fd; @@ -274,6 +275,10 @@ static void test_guest_memfd(unsigned long vm_type) vm = vm_create_barebones_type(vm_type); flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); + /* This test doesn't yet support testing mmap() on private memory. */ + if (!(flags & GUEST_MEMFD_FLAG_INIT_SHARED)) + flags &= ~GUEST_MEMFD_FLAG_MMAP; + test_create_guest_memfd_multiple(vm); test_create_guest_memfd_invalid_sizes(vm, flags, page_size); @@ -292,7 +297,7 @@ static void test_guest_memfd(unsigned long vm_type) test_fallocate(fd, page_size, total_size); test_invalid_punch_hole(fd, page_size, total_size); - test_guest_memfd_flags(vm, flags); + test_guest_memfd_flags(vm); close(fd); kvm_vm_free(vm); @@ -334,9 +339,13 @@ static void test_guest_memfd_guest(void) TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_MMAP, "Default VM type should support MMAP, supported flags = 0x%x", vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); + TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_INIT_SHARED, + "Default VM type should support INIT_SHARED, supported flags = 0x%x", + vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); size = vm->page_size; - fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP); + fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP | + GUEST_MEMFD_FLAG_INIT_SHARED); vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 94bafd6c558c..cf3afba23a6b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -328,6 +328,9 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; + if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)) + return VM_FAULT_SIGBUS; + folio = kvm_gmem_get_folio(inode, vmf->pgoff); if (IS_ERR(folio)) { int err = PTR_ERR(folio); @@ -525,7 +528,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) u64 valid_flags = 0; if (kvm_arch_supports_gmem_mmap(kvm)) - valid_flags |= GUEST_MEMFD_FLAG_MMAP; + valid_flags |= GUEST_MEMFD_FLAG_MMAP | + 
GUEST_MEMFD_FLAG_INIT_SHARED; if (flags & ~valid_flags) return -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e3a268757621..5f644ca54af3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4930,7 +4930,8 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return 1; case KVM_CAP_GUEST_MEMFD_FLAGS: if (!kvm || kvm_arch_supports_gmem_mmap(kvm)) - return GUEST_MEMFD_FLAG_MMAP; + return GUEST_MEMFD_FLAG_MMAP | + GUEST_MEMFD_FLAG_INIT_SHARED; return 0; #endif From 5d3341d684be80892d8f6f9812f90f9274b81177 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:56 -0700 Subject: [PATCH 05/46] KVM: guest_memfd: Invalidate SHARED GPAs if gmem supports INIT_SHARED When invalidating gmem ranges, e.g. in response to PUNCH_HOLE, process all possible range types (PRIVATE vs. SHARED) for the gmem instance. Since guest_memfd doesn't yet support in-place conversions, simply pivot on INIT_SHARED as a gmem instance can currently only have private or shared memory, not both. Failure to mark shared GPAs for invalidation is benign in the current code base, as only x86's TDX consumes KVM_FILTER_{PRIVATE,SHARED}, and TDX doesn't yet support INIT_SHARED with guest_memfd. However, invalidating only private GPAs is conceptually wrong and a lurking bug, e.g. could result in missed invalidations if ARM starts filtering invalidations based on attributes. Fixes: 3d3a04fad25a ("KVM: Allow and advertise support for host mmap() on guest_memfd files") Reviewed-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-4-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 64 +++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index cf3afba23a6b..e10d2c71e78c 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -102,8 +102,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) return filemap_grab_folio(inode->i_mapping, index); } -static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, - pgoff_t end) +static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) +{ + if ((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED) + return KVM_FILTER_SHARED; + + return KVM_FILTER_PRIVATE; +} + +static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, + pgoff_t end, + enum kvm_gfn_range_filter attr_filter) { bool flush = false, found_memslot = false; struct kvm_memory_slot *slot; @@ -118,8 +127,7 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff, .slot = slot, .may_block = true, - /* guest memfd is relevant to only private mappings.
*/ - .attr_filter = KVM_FILTER_PRIVATE, + .attr_filter = attr_filter, }; if (!found_memslot) { @@ -139,8 +147,21 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, KVM_MMU_UNLOCK(kvm); } -static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, - pgoff_t end) +static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, + pgoff_t end) +{ + struct list_head *gmem_list = &inode->i_mapping->i_private_list; + enum kvm_gfn_range_filter attr_filter; + struct kvm_gmem *gmem; + + attr_filter = kvm_gmem_get_invalidate_filter(inode); + + list_for_each_entry(gmem, gmem_list, entry) + __kvm_gmem_invalidate_begin(gmem, start, end, attr_filter); +} + +static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, + pgoff_t end) { struct kvm *kvm = gmem->kvm; @@ -151,12 +172,20 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, } } -static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) +static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, + pgoff_t end) { struct list_head *gmem_list = &inode->i_mapping->i_private_list; + struct kvm_gmem *gmem; + + list_for_each_entry(gmem, gmem_list, entry) + __kvm_gmem_invalidate_end(gmem, start, end); +} + +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ pgoff_t start = offset >> PAGE_SHIFT; pgoff_t end = (offset + len) >> PAGE_SHIFT; - struct kvm_gmem *gmem; /* * Bindings must be stable across invalidation to ensure the start+end @@ -164,13 +193,11 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) */ filemap_invalidate_lock(inode->i_mapping); - list_for_each_entry(gmem, gmem_list, entry) - kvm_gmem_invalidate_begin(gmem, start, end); + kvm_gmem_invalidate_begin(inode, start, end); truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); - list_for_each_entry(gmem, gmem_list, entry) - kvm_gmem_invalidate_end(gmem, start, end); + kvm_gmem_invalidate_end(inode, start, end); filemap_invalidate_unlock(inode->i_mapping); @@ -280,8 +307,9 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) * Zap all SPTEs pointed at by this file. Do not free the backing * memory, as its lifetime is associated with the inode, not the file. */ - kvm_gmem_invalidate_begin(gmem, 0, -1ul); - kvm_gmem_invalidate_end(gmem, 0, -1ul); + __kvm_gmem_invalidate_begin(gmem, 0, -1ul, + kvm_gmem_get_invalidate_filter(inode)); + __kvm_gmem_invalidate_end(gmem, 0, -1ul); list_del(&gmem->entry); @@ -403,8 +431,6 @@ static int kvm_gmem_migrate_folio(struct address_space *mapping, static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio) { - struct list_head *gmem_list = &mapping->i_private_list; - struct kvm_gmem *gmem; pgoff_t start, end; filemap_invalidate_lock_shared(mapping); @@ -412,8 +438,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol start = folio->index; end = start + folio_nr_pages(folio); - list_for_each_entry(gmem, gmem_list, entry) - kvm_gmem_invalidate_begin(gmem, start, end); + kvm_gmem_invalidate_begin(mapping->host, start, end); /* * Do not truncate the range, what action is taken in response to the @@ -424,8 +449,7 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol * error to userspace. 
*/ - list_for_each_entry(gmem, gmem_list, entry) - kvm_gmem_invalidate_end(gmem, start, end); + kvm_gmem_invalidate_end(mapping->host, start, end); filemap_invalidate_unlock_shared(mapping); From 9aef71c892a55e004419923ba7129abe3e58d9f1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:57 -0700 Subject: [PATCH 06/46] KVM: Explicitly mark KVM_GUEST_MEMFD as depending on KVM_GENERIC_MMU_NOTIFIER Add KVM_GENERIC_MMU_NOTIFIER as a dependency for selecting KVM_GUEST_MEMFD, as guest_memfd relies on kvm_mmu_invalidate_{begin,end}(), which are defined if and only if the generic mmu_notifier implementation is enabled. The missing dependency is currently benign as s390 is the only KVM arch that doesn't utilize the generic mmu_notifier infrastructure, and s390 doesn't currently support guest_memfd. Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-5-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 1b7d5be0b6c4..a01cc5743137 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -113,6 +113,7 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES bool config KVM_GUEST_MEMFD + depends on KVM_GENERIC_MMU_NOTIFIER select XARRAY_MULTI bool From 44c6cb9fe9888b371e31165b2854bd0f4e2787d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:58 -0700 Subject: [PATCH 07/46] KVM: guest_memfd: Allow mmap() on guest_memfd for x86 VMs with private memory Allow mmap() on guest_memfd instances for x86 VMs with private memory as the need to track private vs. shared state in the guest_memfd instance is only pertinent to INIT_SHARED. Doing mmap() on private memory isn't terribly useful (yet!), but it's now possible, and will be desirable when guest_memfd gains support for other VMA-based syscalls, e.g. mbind() to set NUMA policy. Lift the restriction now, before MMAP support is officially released, so that KVM doesn't need to add another capability to enumerate support for mmap() on private memory. Fixes: 3d3a04fad25a ("KVM: Allow and advertise support for host mmap() on guest_memfd files") Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 7 ++++--- include/linux/kvm_host.h | 12 +++++++++++- virt/kvm/guest_memfd.c | 9 ++------- virt/kvm/kvm_main.c | 6 +----- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b8138bd4857..fe3dc3eb4331 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13942,10 +13942,11 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_GUEST_MEMFD /* - * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory - * (the private vs. shared tracking needs to be moved into guest_memfd). + * KVM doesn't yet support initializing guest_memfd memory as shared for VMs + * with private memory (the private vs. shared tracking needs to be moved into + * guest_memfd).
*/ -bool kvm_arch_supports_gmem_mmap(struct kvm *kvm) +bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm) { return !kvm_arch_has_private_mem(kvm); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 19b8c4bebb9c..680ca838f018 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -729,7 +729,17 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) #endif #ifdef CONFIG_KVM_GUEST_MEMFD -bool kvm_arch_supports_gmem_mmap(struct kvm *kvm); +bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm); + +static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm) +{ + u64 flags = GUEST_MEMFD_FLAG_MMAP; + + if (!kvm || kvm_arch_supports_gmem_init_shared(kvm)) + flags |= GUEST_MEMFD_FLAG_INIT_SHARED; + + return flags; +} #endif #ifndef kvm_arch_has_readonly_mem diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e10d2c71e78c..fbca8c0972da 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -485,7 +485,7 @@ static const struct inode_operations kvm_gmem_iops = { .setattr = kvm_gmem_setattr, }; -bool __weak kvm_arch_supports_gmem_mmap(struct kvm *kvm) +bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) { return true; } @@ -549,13 +549,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) { loff_t size = args->size; u64 flags = args->flags; - u64 valid_flags = 0; - if (kvm_arch_supports_gmem_mmap(kvm)) - valid_flags |= GUEST_MEMFD_FLAG_MMAP | - GUEST_MEMFD_FLAG_INIT_SHARED; - - if (flags & ~valid_flags) + if (flags & ~kvm_gmem_get_supported_flags(kvm)) return -EINVAL; if (size <= 0 || !PAGE_ALIGNED(size)) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5f644ca54af3..b7a0ae2a7b20 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4929,11 +4929,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_GUEST_MEMFD: return 1; case KVM_CAP_GUEST_MEMFD_FLAGS: - if (!kvm || kvm_arch_supports_gmem_mmap(kvm)) - return GUEST_MEMFD_FLAG_MMAP | - GUEST_MEMFD_FLAG_INIT_SHARED; - - return 0; + return kvm_gmem_get_supported_flags(kvm); #endif default: break; From 3a6c08538c742624c60cb6a53dd61eb025e0d1e1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:59 -0700 Subject: [PATCH 08/46] KVM: selftests: Stash the host page size in a global in the guest_memfd test Use a global variable to track the host page size in the guest_memfd test so that the information doesn't need to be constantly passed around. The state is purely a reflection of the underlying system, i.e. can't be set by the test and is constant for a given invocation of the test, and thus explicitly passing the host page size to individual testcases adds no value, e.g. doesn't allow testing different combinations. Making page_size a global will simplify an upcoming change to create a new guest_memfd instance per testcase. No functional change intended. 
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: David Hildenbrand Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Link: https://lore.kernel.org/r/20251003232606.4070510-7-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 0de56ce3c4e2..a7c9601bd31e 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -24,6 +24,8 @@ #include "test_util.h" #include "ucall_common.h" +static size_t page_size; + static void test_file_read_write(int fd) { char buf[64]; @@ -38,7 +40,7 @@ static void test_file_read_write(int fd) "pwrite on a guest_mem fd should fail"); } -static void test_mmap_supported(int fd, size_t page_size, size_t total_size) +static void test_mmap_supported(int fd, size_t total_size) { const char val = 0xaa; char *mem; @@ -78,7 +80,7 @@ void fault_sigbus_handler(int signum) siglongjmp(jmpbuf, 1); } -static void test_fault_overflow(int fd, size_t page_size, size_t total_size) +static void test_fault_overflow(int fd, size_t total_size) { struct sigaction sa_old, sa_new = { .sa_handler = fault_sigbus_handler, @@ -106,7 +108,7 @@ static void test_fault_overflow(int fd, size_t page_size, size_t total_size) TEST_ASSERT(!ret, "munmap() should succeed."); } -static void test_mmap_not_supported(int fd, size_t page_size, size_t total_size) +static void test_mmap_not_supported(int fd, size_t total_size) { char *mem; @@ -117,7 +119,7 @@ static void test_mmap_not_supported(int fd, size_t page_size, size_t total_size) TEST_ASSERT_EQ(mem, MAP_FAILED); } -static void test_file_size(int fd, size_t page_size, size_t total_size) +static void test_file_size(int fd, size_t total_size) { struct stat sb; int ret; @@ -128,7 +130,7 @@ static void test_file_size(int fd, size_t page_size, size_t total_size) TEST_ASSERT_EQ(sb.st_blksize, page_size); } -static void test_fallocate(int fd, size_t page_size, size_t total_size) +static void test_fallocate(int fd, size_t total_size) { int ret; @@ -165,7 +167,7 @@ static void test_fallocate(int fd, size_t page_size, size_t total_size) TEST_ASSERT(!ret, "fallocate to restore punched hole should succeed"); } -static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size) +static void test_invalid_punch_hole(int fd, size_t total_size) { struct { off_t offset; @@ -196,8 +198,7 @@ static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size) } static void test_create_guest_memfd_invalid_sizes(struct kvm_vm *vm, - uint64_t guest_memfd_flags, - size_t page_size) + uint64_t guest_memfd_flags) { size_t size; int fd; @@ -214,7 +215,6 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) { int fd1, fd2, ret; struct stat st1, st2; - size_t page_size = getpagesize(); fd1 = __vm_create_guest_memfd(vm, page_size, 0); TEST_ASSERT(fd1 != -1, "memfd creation should succeed"); @@ -242,7 +242,6 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) static void test_guest_memfd_flags(struct kvm_vm *vm) { uint64_t valid_flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); - size_t page_size = getpagesize(); uint64_t flag; int fd; @@ -265,11 +264,9 @@ static void test_guest_memfd(unsigned long vm_type) { struct kvm_vm *vm; size_t total_size; - size_t page_size; uint64_t flags; int fd; - page_size = getpagesize(); total_size = 
page_size * 4; vm = vm_create_barebones_type(vm_type); @@ -280,22 +277,22 @@ static void test_guest_memfd(unsigned long vm_type) flags &= ~GUEST_MEMFD_FLAG_MMAP; test_create_guest_memfd_multiple(vm); - test_create_guest_memfd_invalid_sizes(vm, flags, page_size); + test_create_guest_memfd_invalid_sizes(vm, flags); fd = vm_create_guest_memfd(vm, total_size, flags); test_file_read_write(fd); if (flags & GUEST_MEMFD_FLAG_MMAP) { - test_mmap_supported(fd, page_size, total_size); - test_fault_overflow(fd, page_size, total_size); + test_mmap_supported(fd, total_size); + test_fault_overflow(fd, total_size); } else { - test_mmap_not_supported(fd, page_size, total_size); + test_mmap_not_supported(fd, total_size); } - test_file_size(fd, page_size, total_size); - test_fallocate(fd, page_size, total_size); - test_invalid_punch_hole(fd, page_size, total_size); + test_file_size(fd, total_size); + test_fallocate(fd, total_size); + test_invalid_punch_hole(fd, total_size); test_guest_memfd_flags(vm); @@ -374,6 +371,8 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); + page_size = getpagesize(); + /* * Not all architectures support KVM_CAP_VM_TYPES. However, those that * support guest_memfd have that support for the default VM type. From 21d602ed616aebae4de568be55db65a1f5a3be10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:26:00 -0700 Subject: [PATCH 09/46] KVM: selftests: Create a new guest_memfd for each testcase Refactor the guest_memfd selftest to improve test isolation by creating a new guest_memfd for each testcase. Currently, the test reuses a single guest_memfd instance for all testcases, and thus creates dependencies between tests, e.g. not truncating folios from the guest_memfd instance at the end of a test could lead to unexpected results (see the PUNCH_HOLE purging that needs to be done by the in-flight NUMA testcases[1]). Invoke each test via a macro wrapper to create and close a guest_memfd to cut down on the boilerplate copy+paste needed to create a test.
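To make the wrapper concrete: with the gmem_test() macro added below, a call such as gmem_test(file_size, vm, flags) expands to roughly the following, so every testcase operates on a freshly created, and subsequently closed, guest_memfd:

    do {
        int fd = vm_create_guest_memfd(vm, page_size * 4, flags);

        test_file_size(fd, page_size * 4);
        close(fd);
    } while (0);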
Link: https://lore.kernel.org/all/20250827175247.83322-10-shivankg@amd.com Reported-by: Ackerley Tng Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: David Hildenbrand Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Link: https://lore.kernel.org/r/20251003232606.4070510-8-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index a7c9601bd31e..afdc4d3a956d 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -26,7 +26,7 @@ static size_t page_size; -static void test_file_read_write(int fd) +static void test_file_read_write(int fd, size_t total_size) { char buf[64]; @@ -260,14 +260,18 @@ static void test_guest_memfd_flags(struct kvm_vm *vm) } } +#define gmem_test(__test, __vm, __flags) \ +do { \ + int fd = vm_create_guest_memfd(__vm, page_size * 4, __flags); \ + \ + test_##__test(fd, page_size * 4); \ + close(fd); \ +} while (0) + static void test_guest_memfd(unsigned long vm_type) { struct kvm_vm *vm; - size_t total_size; uint64_t flags; - int fd; - - total_size = page_size * 4; vm = vm_create_barebones_type(vm_type); flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); @@ -279,24 +283,21 @@ static void test_guest_memfd(unsigned long vm_type) test_create_guest_memfd_multiple(vm); test_create_guest_memfd_invalid_sizes(vm, flags); - fd = vm_create_guest_memfd(vm, total_size, flags); - - test_file_read_write(fd); + gmem_test(file_read_write, vm, flags); if (flags & GUEST_MEMFD_FLAG_MMAP) { - test_mmap_supported(fd, total_size); - test_fault_overflow(fd, total_size); + gmem_test(mmap_supported, vm, flags); + gmem_test(fault_overflow, vm, flags); } else { - test_mmap_not_supported(fd, total_size); + gmem_test(mmap_not_supported, vm, flags); } - test_file_size(fd, total_size); - test_fallocate(fd, total_size); - test_invalid_punch_hole(fd, total_size); + gmem_test(file_size, vm, flags); + gmem_test(fallocate, vm, flags); + gmem_test(invalid_punch_hole, vm, flags); test_guest_memfd_flags(vm); - close(fd); kvm_vm_free(vm); } From df0d9923f7055169cb01b050236e5a9a2b36db02 Mon Sep 17 00:00:00 2001 From: Ackerley Tng Date: Fri, 3 Oct 2025 16:26:01 -0700 Subject: [PATCH 10/46] KVM: selftests: Add test coverage for guest_memfd without GUEST_MEMFD_FLAG_MMAP If a VM type supports KVM_CAP_GUEST_MEMFD_MMAP, the guest_memfd test will run all test cases with GUEST_MEMFD_FLAG_MMAP set. This leaves the code path for creating a non-mmap()-able guest_memfd on a VM that supports mappable guest memfds untested. Refactor the test to run the main test suite with a given set of flags. Then, for VM types that support the mappable capability, invoke the test suite twice: once with no flags, and once with GUEST_MEMFD_FLAG_MMAP set. This ensures both creation paths are properly exercised on capable VMs. Run test_guest_memfd_flags() only once per VM type since it depends only on the set of valid/supported flags, i.e. iterating over an arbitrary set of flags is both unnecessary and wrong. 
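Concretely, the refactored dispatch in test_guest_memfd() (see the diff below) boils down to:

    __test_guest_memfd(vm, 0);

    flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS);

    /* MMAP should always be supported if INIT_SHARED is supported. */
    if (flags & GUEST_MEMFD_FLAG_INIT_SHARED)
        __test_guest_memfd(vm, GUEST_MEMFD_FLAG_MMAP |
                               GUEST_MEMFD_FLAG_INIT_SHARED);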
Signed-off-by: Ackerley Tng [sean: use double-underscores for the inner helper] Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-9-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index afdc4d3a956d..9f98a067ab51 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -268,18 +268,8 @@ do { \ close(fd); \ } while (0) -static void test_guest_memfd(unsigned long vm_type) +static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags) { - struct kvm_vm *vm; - uint64_t flags; - - vm = vm_create_barebones_type(vm_type); - flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); - - /* This test doesn't yet support testing mmap() on private memory. */ - if (!(flags & GUEST_MEMFD_FLAG_INIT_SHARED)) - flags &= ~GUEST_MEMFD_FLAG_MMAP; - test_create_guest_memfd_multiple(vm); test_create_guest_memfd_invalid_sizes(vm, flags); @@ -295,9 +285,24 @@ static void test_guest_memfd(unsigned long vm_type) gmem_test(file_size, vm, flags); gmem_test(fallocate, vm, flags); gmem_test(invalid_punch_hole, vm, flags); +} + +static void test_guest_memfd(unsigned long vm_type) +{ + struct kvm_vm *vm = vm_create_barebones_type(vm_type); + uint64_t flags; test_guest_memfd_flags(vm); + __test_guest_memfd(vm, 0); + + flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); + + /* MMAP should always be supported if INIT_SHARED is supported. */ + if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) + __test_guest_memfd(vm, GUEST_MEMFD_FLAG_MMAP | + GUEST_MEMFD_FLAG_INIT_SHARED); + kvm_vm_free(vm); } From 61cee97f40180312dcca9580a5be0b0aa2217f6e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:26:02 -0700 Subject: [PATCH 11/46] KVM: selftests: Add wrappers for mmap() and munmap() to assert success Add and use wrappers for mmap() and munmap() that assert success to reduce a significant amount of boilerplate code, to ensure all tests assert on failure, and to provide consistent error messages on failure. No functional change intended. 
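As a concrete example of the reduction, the recurring open-coded pattern:

    mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed");

collapses into a single self-asserting call:

    mem = kvm_mmap(size, PROT_READ | PROT_WRITE, MAP_SHARED, fd);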
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: David Hildenbrand Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20251003232606.4070510-10-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 21 +++------ .../testing/selftests/kvm/include/kvm_util.h | 25 +++++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 44 +++++++------------ tools/testing/selftests/kvm/mmu_stress_test.c | 5 +-- .../selftests/kvm/s390/ucontrol_test.c | 16 +++---- .../selftests/kvm/set_memory_region_test.c | 17 ++++--- 6 files changed, 64 insertions(+), 64 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 9f98a067ab51..319fda4f5d53 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -50,8 +50,7 @@ static void test_mmap_supported(int fd, size_t total_size) mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); TEST_ASSERT(mem == MAP_FAILED, "Copy-on-write not allowed by guest_memfd."); - mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed."); + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); memset(mem, val, total_size); for (i = 0; i < total_size; i++) @@ -70,8 +69,7 @@ static void test_mmap_supported(int fd, size_t total_size) for (i = 0; i < total_size; i++) TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); - ret = munmap(mem, total_size); - TEST_ASSERT(!ret, "munmap() should succeed."); + kvm_munmap(mem, total_size); } static sigjmp_buf jmpbuf; @@ -89,10 +87,8 @@ static void test_fault_overflow(int fd, size_t total_size) const char val = 0xaa; char *mem; size_t i; - int ret; - mem = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed."); + mem = kvm_mmap(map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); sigaction(SIGBUS, &sa_new, &sa_old); if (sigsetjmp(jmpbuf, 1) == 0) { @@ -104,8 +100,7 @@ static void test_fault_overflow(int fd, size_t total_size) for (i = 0; i < total_size; i++) TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); - ret = munmap(mem, map_size); - TEST_ASSERT(!ret, "munmap() should succeed."); + kvm_munmap(mem, map_size); } static void test_mmap_not_supported(int fd, size_t total_size) @@ -351,10 +346,9 @@ static void test_guest_memfd_guest(void) GUEST_MEMFD_FLAG_INIT_SHARED); vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); - mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed"); + mem = kvm_mmap(size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); memset(mem, 0xaa, size); - munmap(mem, size); + kvm_munmap(mem, size); virt_pg_map(vm, gpa, gpa); vcpu_args_set(vcpu, 2, gpa, size); @@ -362,8 +356,7 @@ static void test_guest_memfd_guest(void) TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); - mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed"); + mem = kvm_mmap(size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); for (i = 0; i < size; i++) TEST_ASSERT_EQ(mem[i], 0xff); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 26cc30290e76..ee60dbf5208a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ 
b/tools/testing/selftests/kvm/include/kvm_util.h @@ -286,6 +286,31 @@ static inline bool kvm_has_cap(long cap) #define __KVM_SYSCALL_ERROR(_name, _ret) \ "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) +static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, + off_t offset) +{ + void *mem; + + mem = mmap(NULL, size, prot, flags, fd, offset); + TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", + (int)(unsigned long)MAP_FAILED)); + + return mem; +} + +static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) +{ + return __kvm_mmap(size, prot, flags, fd, 0); +} + +static inline void kvm_munmap(void *mem, size_t size) +{ + int ret; + + ret = munmap(mem, size); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); +} + /* * Use the "inner", double-underscore macro when reporting errors from within * other macros so that the name of ioctl() and not its literal numeric value diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 6743fbd9bd67..83a721be7ec5 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -741,13 +741,11 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) int ret; if (vcpu->dirty_gfns) { - ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + kvm_munmap(vcpu->dirty_gfns, vm->dirty_ring_size); vcpu->dirty_gfns = NULL; } - ret = munmap(vcpu->run, vcpu_mmap_sz()); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + kvm_munmap(vcpu->run, vcpu_mmap_sz()); ret = close(vcpu->fd); TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); @@ -783,20 +781,16 @@ void kvm_vm_release(struct kvm_vm *vmp) static void __vm_mem_region_delete(struct kvm_vm *vm, struct userspace_mem_region *region) { - int ret; - rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); rb_erase(®ion->hva_node, &vm->regions.hva_tree); hash_del(®ion->slot_node); sparsebit_free(®ion->unused_phy_pages); sparsebit_free(®ion->protected_phy_pages); - ret = munmap(region->mmap_start, region->mmap_size); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + kvm_munmap(region->mmap_start, region->mmap_size); if (region->fd >= 0) { /* There's an extra map when using shared memory. */ - ret = munmap(region->mmap_alias, region->mmap_size); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + kvm_munmap(region->mmap_alias, region->mmap_size); close(region->fd); } if (region->region.guest_memfd >= 0) @@ -1053,12 +1047,9 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, region->fd = kvm_memfd_alloc(region->mmap_size, src_type == VM_MEM_SRC_SHARED_HUGETLB); - region->mmap_start = mmap(NULL, region->mmap_size, - PROT_READ | PROT_WRITE, - vm_mem_backing_src_alias(src_type)->flag, - region->fd, 0); - TEST_ASSERT(region->mmap_start != MAP_FAILED, - __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); + region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE, + vm_mem_backing_src_alias(src_type)->flag, + region->fd); TEST_ASSERT(!is_backing_src_hugetlb(src_type) || region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz), @@ -1129,12 +1120,10 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, /* If shared memory, create an alias. 
*/ if (region->fd >= 0) { - region->mmap_alias = mmap(NULL, region->mmap_size, - PROT_READ | PROT_WRITE, - vm_mem_backing_src_alias(src_type)->flag, - region->fd, 0); - TEST_ASSERT(region->mmap_alias != MAP_FAILED, - __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); + region->mmap_alias = kvm_mmap(region->mmap_size, + PROT_READ | PROT_WRITE, + vm_mem_backing_src_alias(src_type)->flag, + region->fd); /* Align host alias address */ region->host_alias = align_ptr_up(region->mmap_alias, alignment); @@ -1344,10 +1333,8 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size " "smaller than expected, vcpu_mmap_sz: %zi expected_min: %zi", vcpu_mmap_sz(), sizeof(*vcpu->run)); - vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), - PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); - TEST_ASSERT(vcpu->run != MAP_FAILED, - __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); + vcpu->run = kvm_mmap(vcpu_mmap_sz(), PROT_READ | PROT_WRITE, + MAP_SHARED, vcpu->fd); if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD)) vcpu->stats.fd = vcpu_get_stats_fd(vcpu); @@ -1794,9 +1781,8 @@ void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) page_size * KVM_DIRTY_LOG_PAGE_OFFSET); TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec"); - addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, - page_size * KVM_DIRTY_LOG_PAGE_OFFSET); - TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed"); + addr = __kvm_mmap(size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); vcpu->dirty_gfns = addr; vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn); diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 6a437d2be9fa..37b7e6524533 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -339,8 +339,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb "); fd = kvm_memfd_alloc(slot_size, hugepages); - mem = mmap(NULL, slot_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - TEST_ASSERT(mem != MAP_FAILED, "mmap() failed"); + mem = kvm_mmap(slot_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); TEST_ASSERT(!madvise(mem, slot_size, MADV_NOHUGEPAGE), "madvise() failed"); @@ -413,7 +412,7 @@ int main(int argc, char *argv[]) for (slot = (slot - 1) & ~1ull; slot >= first_slot; slot -= 2) vm_set_user_memory_region(vm, slot, 0, 0, 0, NULL); - munmap(mem, slot_size / 2); + kvm_munmap(mem, slot_size / 2); /* Sanity check that the vCPUs actually ran. 
*/ for (i = 0; i < nr_vcpus; i++) diff --git a/tools/testing/selftests/kvm/s390/ucontrol_test.c b/tools/testing/selftests/kvm/s390/ucontrol_test.c index d265b34c54be..50bc1c38225a 100644 --- a/tools/testing/selftests/kvm/s390/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390/ucontrol_test.c @@ -142,19 +142,17 @@ FIXTURE_SETUP(uc_kvm) self->kvm_run_size = ioctl(self->kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); ASSERT_GE(self->kvm_run_size, sizeof(struct kvm_run)) TH_LOG(KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, self->kvm_run_size)); - self->run = (struct kvm_run *)mmap(NULL, self->kvm_run_size, - PROT_READ | PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0); - ASSERT_NE(self->run, MAP_FAILED); + self->run = kvm_mmap(self->kvm_run_size, PROT_READ | PROT_WRITE, + MAP_SHARED, self->vcpu_fd); /** * For virtual cpus that have been created with S390 user controlled * virtual machines, the resulting vcpu fd can be memory mapped at page * offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of * the virtual cpu's hardware control block. */ - self->sie_block = (struct kvm_s390_sie_block *)mmap(NULL, PAGE_SIZE, - PROT_READ | PROT_WRITE, MAP_SHARED, - self->vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT); - ASSERT_NE(self->sie_block, MAP_FAILED); + self->sie_block = __kvm_mmap(PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, self->vcpu_fd, + KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT); TH_LOG("VM created %p %p", self->run, self->sie_block); @@ -186,8 +184,8 @@ FIXTURE_SETUP(uc_kvm) FIXTURE_TEARDOWN(uc_kvm) { - munmap(self->sie_block, PAGE_SIZE); - munmap(self->run, self->kvm_run_size); + kvm_munmap(self->sie_block, PAGE_SIZE); + kvm_munmap(self->run, self->kvm_run_size); close(self->vcpu_fd); close(self->vm_fd); close(self->kvm_fd); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index ce3ac0fd6dfb..7fe427ff9b38 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -433,10 +433,10 @@ static void test_add_max_memory_regions(void) pr_info("Adding slots 0..%i, each memory region with %dK size\n", (max_mem_slots - 1), MEM_REGION_SIZE >> 10); - mem = mmap(NULL, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); - TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); + + mem = kvm_mmap((size_t)max_mem_slots * MEM_REGION_SIZE + alignment, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1); mem_aligned = (void *)(((size_t) mem + alignment - 1) & ~(alignment - 1)); for (slot = 0; slot < max_mem_slots; slot++) @@ -446,9 +446,8 @@ static void test_add_max_memory_regions(void) mem_aligned + (uint64_t)slot * MEM_REGION_SIZE); /* Check it cannot be added memory slots beyond the limit */ - mem_extra = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - TEST_ASSERT(mem_extra != MAP_FAILED, "Failed to mmap() host"); + mem_extra = kvm_mmap(MEM_REGION_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1); ret = __vm_set_user_memory_region(vm, max_mem_slots, 0, (uint64_t)max_mem_slots * MEM_REGION_SIZE, @@ -456,8 +455,8 @@ static void test_add_max_memory_regions(void) TEST_ASSERT(ret == -1 && errno == EINVAL, "Adding one more memory slot should fail with EINVAL"); - munmap(mem, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment); - munmap(mem_extra, MEM_REGION_SIZE); + kvm_munmap(mem, (size_t)max_mem_slots * 
MEM_REGION_SIZE + alignment); + kvm_munmap(mem_extra, MEM_REGION_SIZE); kvm_vm_free(vm); } From 505c953009ec8260870d41ef8109bb4c7e208e6f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:26:03 -0700 Subject: [PATCH 12/46] KVM: selftests: Isolate the guest_memfd Copy-on-Write negative testcase Move the guest_memfd Copy-on-Write (CoW) testcase to its own function to better separate positive testcases from negative testcases. No functional change intended. Suggested-by: Ackerley Tng Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Link: https://lore.kernel.org/r/20251003232606.4070510-11-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/guest_memfd_test.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 319fda4f5d53..640636c76eb9 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -40,6 +40,14 @@ static void test_file_read_write(int fd, size_t total_size) "pwrite on a guest_mem fd should fail"); } +static void test_mmap_cow(int fd, size_t size) +{ + void *mem; + + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + TEST_ASSERT(mem == MAP_FAILED, "Copy-on-write not allowed by guest_memfd."); +} + static void test_mmap_supported(int fd, size_t total_size) { const char val = 0xaa; @@ -47,9 +55,6 @@ static void test_mmap_supported(int fd, size_t total_size) size_t i; int ret; - mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - TEST_ASSERT(mem == MAP_FAILED, "Copy-on-write not allowed by guest_memfd."); - mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); memset(mem, val, total_size); @@ -272,6 +277,7 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags) if (flags & GUEST_MEMFD_FLAG_MMAP) { gmem_test(mmap_supported, vm, flags); + gmem_test(mmap_cow, vm, flags); gmem_test(fault_overflow, vm, flags); } else { gmem_test(mmap_not_supported, vm, flags); From f91187c0ecc6358ccecf533c5fcc6b7dbb4735cb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:26:04 -0700 Subject: [PATCH 13/46] KVM: selftests: Add wrapper macro to handle and assert on expected SIGBUS Extract the guest_memfd test's SIGBUS handling functionality into a common TEST_EXPECT_SIGBUS() macro in anticipation of adding more SIGBUS testcases. Eating a SIGBUS isn't terribly difficult, but it requires a non-trivial amount of boilerplate code, and using a macro allows selftests to print out the exact action that failed to generate a SIGBUS without the developer needing to remember to add a useful error message. Explicitly mark the SIGBUS handler as "used", as gcc-14 at least likes to discard the function before linking. Opportunistically use TEST_FAIL(...) instead of TEST_ASSERT(false, ...), and fix the write path of the guest_memfd test to use the local "val" instead of hardcoding the literal value a second time.
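For reference, the dance the macro encapsulates can be reproduced in a standalone program (a sketch, not taken from the patch: an ordinary file-backed mapping touched past EOF stands in for guest_memfd as the SIGBUS source):

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf jmpbuf;

static void sigbus_handler(int signum)
{
	siglongjmp(jmpbuf, 1);
}

int main(void)
{
	struct sigaction sa_old, sa_new = { .sa_handler = sigbus_handler };
	long page = sysconf(_SC_PAGESIZE);
	FILE *f = tmpfile();
	char *mem;

	/* Back one page with file data, but map two pages. */
	if (!f || ftruncate(fileno(f), page))
		return 1;

	mem = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE, MAP_SHARED,
		   fileno(f), 0);
	if (mem == MAP_FAILED)
		return 1;

	sigaction(SIGBUS, &sa_new, &sa_old);
	if (sigsetjmp(jmpbuf, 1) == 0) {
		mem[page] = 0xaa;	/* no file backing here => SIGBUS */
		fprintf(stderr, "write should have SIGBUS'd\n");
		return 1;
	}
	sigaction(SIGBUS, &sa_old, NULL);
	printf("caught expected SIGBUS\n");
	return 0;
}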
Suggested-by: Ackerley Tng Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Lisa Wang Tested-by: Lisa Wang Link: https://lore.kernel.org/r/20251003232606.4070510-12-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 18 +----------------- .../testing/selftests/kvm/include/test_util.h | 19 +++++++++++++++++++ tools/testing/selftests/kvm/lib/test_util.c | 7 +++++++ 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 640636c76eb9..73c2e54e7297 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -14,8 +14,6 @@ #include #include #include -#include -#include #include #include #include @@ -77,17 +75,8 @@ static void test_mmap_supported(int fd, size_t total_size) kvm_munmap(mem, total_size); } -static sigjmp_buf jmpbuf; -void fault_sigbus_handler(int signum) -{ - siglongjmp(jmpbuf, 1); -} - static void test_fault_overflow(int fd, size_t total_size) { - struct sigaction sa_old, sa_new = { - .sa_handler = fault_sigbus_handler, - }; size_t map_size = total_size * 4; const char val = 0xaa; char *mem; @@ -95,12 +84,7 @@ static void test_fault_overflow(int fd, size_t total_size) mem = kvm_mmap(map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); - sigaction(SIGBUS, &sa_new, &sa_old); - if (sigsetjmp(jmpbuf, 1) == 0) { - memset(mem, 0xaa, map_size); - TEST_ASSERT(false, "memset() should have triggered SIGBUS."); - } - sigaction(SIGBUS, &sa_old, NULL); + TEST_EXPECT_SIGBUS(memset(mem, val, map_size)); for (i = 0; i < total_size; i++) TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index c6ef895fbd9a..b4872ba8ed12 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -8,6 +8,8 @@ #ifndef SELFTEST_KVM_TEST_UTIL_H #define SELFTEST_KVM_TEST_UTIL_H +#include +#include #include #include #include @@ -78,6 +80,23 @@ do { \ __builtin_unreachable(); \ } while (0) +extern sigjmp_buf expect_sigbus_jmpbuf; +void expect_sigbus_handler(int signum); + +#define TEST_EXPECT_SIGBUS(action) \ +do { \ + struct sigaction sa_old, sa_new = { \ + .sa_handler = expect_sigbus_handler, \ + }; \ + \ + sigaction(SIGBUS, &sa_new, &sa_old); \ + if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) { \ + action; \ + TEST_FAIL("'%s' should have triggered SIGBUS", #action); \ + } \ + sigaction(SIGBUS, &sa_old, NULL); \ +} while (0) + size_t parse_size(const char *size); int64_t timespec_to_ns(struct timespec ts); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 03eb99af9b8d..8a1848586a85 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -18,6 +18,13 @@ #include "test_util.h" +sigjmp_buf expect_sigbus_jmpbuf; + +void __attribute__((used)) expect_sigbus_handler(int signum) +{ + siglongjmp(expect_sigbus_jmpbuf, 1); +} + /* * Random number generator that is usable from guest code. This is the * Park-Miller LCG using standard constants. 
From 19942d4fd9cf022b28a9afb432a6338dcf96fc2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:26:05 -0700 Subject: [PATCH 14/46] KVM: selftests: Verify that faulting in private guest_memfd memory fails Add a guest_memfd testcase to verify that faulting in private memory gets a SIGBUS. For now, test only the case where memory is private by default since KVM doesn't yet support in-place conversion. Deliberately run the CoW test with and without INIT_SHARED set as KVM should disallow MAP_PRIVATE regardless of whether the memory itself is private from a CoCo perspective. Cc: Ackerley Tng Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: David Hildenbrand Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Link: https://lore.kernel.org/r/20251003232606.4070510-13-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 73c2e54e7297..f5372fdf096d 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -75,9 +75,8 @@ static void test_mmap_supported(int fd, size_t total_size) kvm_munmap(mem, total_size); } -static void test_fault_overflow(int fd, size_t total_size) +static void test_fault_sigbus(int fd, size_t accessible_size, size_t map_size) { - size_t map_size = total_size * 4; const char val = 0xaa; char *mem; size_t i; @@ -86,12 +85,22 @@ static void test_fault_overflow(int fd, size_t total_size) TEST_EXPECT_SIGBUS(memset(mem, val, map_size)); - for (i = 0; i < total_size; i++) + for (i = 0; i < accessible_size; i++) TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); kvm_munmap(mem, map_size); } +static void test_fault_overflow(int fd, size_t total_size) +{ + test_fault_sigbus(fd, total_size, total_size * 4); +} + +static void test_fault_private(int fd, size_t total_size) +{ + test_fault_sigbus(fd, 0, total_size); +} + static void test_mmap_not_supported(int fd, size_t total_size) { char *mem; @@ -260,9 +269,14 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags) gmem_test(file_read_write, vm, flags); if (flags & GUEST_MEMFD_FLAG_MMAP) { - gmem_test(mmap_supported, vm, flags); + if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) { + gmem_test(mmap_supported, vm, flags); + gmem_test(fault_overflow, vm, flags); + } else { + gmem_test(fault_private, vm, flags); + } + gmem_test(mmap_cow, vm, flags); - gmem_test(fault_overflow, vm, flags); } else { gmem_test(mmap_not_supported, vm, flags); } @@ -282,6 +296,8 @@ static void test_guest_memfd(unsigned long vm_type) __test_guest_memfd(vm, 0); flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); + if (flags & GUEST_MEMFD_FLAG_MMAP) + __test_guest_memfd(vm, GUEST_MEMFD_FLAG_MMAP); /* MMAP should always be supported if INIT_SHARED is supported. */ if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) From 505f5224b197b77169c977e747cbc18b222f85f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:26:06 -0700 Subject: [PATCH 15/46] KVM: selftests: Verify that reads to inaccessible guest_memfd VMAs SIGBUS Expand the guest_memfd negative testcases for overflow and MAP_PRIVATE to verify that reads to inaccessible memory also get a SIGBUS. 
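One subtlety in the read probe added below: a bare `mem[accessible_size];` statement has no side effect, so the compiler is free to drop the load entirely, and no fault would ever fire. The volatile access forces a real load, and the `(void)` cast documents that the value is deliberately unused. A minimal sketch of the idea (the `READ_ONCE()` here is a simplified stand-in for the tools/ helper, relying on GNU C `__typeof__`):

#include <stddef.h>

#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

static void probe_read(char *mem, size_t idx)
{
	/* Emitted as an actual one-byte load, value discarded. */
	(void)READ_ONCE(mem[idx]);
}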
Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Lisa Wang Tested-by: Lisa Wang Link: https://lore.kernel.org/r/20251003232606.4070510-14-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/guest_memfd_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index f5372fdf096d..e7d9aeb418d3 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -84,6 +84,7 @@ static void test_fault_sigbus(int fd, size_t accessible_size, size_t map_size) mem = kvm_mmap(map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); TEST_EXPECT_SIGBUS(memset(mem, val, map_size)); + TEST_EXPECT_SIGBUS((void)READ_ONCE(mem[accessible_size])); for (i = 0; i < accessible_size; i++) TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); kvm_munmap(mem, map_size); From ed25dcfbc4327570b28f0328a8e17d121434c0ea Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 26 Sep 2025 12:41:07 -0700 Subject: [PATCH 16/46] KVM: arm64: nv: Don't treat ZCR_EL2 as a 'mapped' register Unlike the other mapped EL2 sysregs, ZCR_EL2 isn't guaranteed to be resident when a vCPU is loaded as it actually follows the SVE context. As such, the contents of ZCR_EL1 may belong to another guest if the vCPU has been preempted before reaching sysreg emulation. Unconditionally use the in-memory value of ZCR_EL2 and switch to the memory-only accessors. The in-memory value is guaranteed to be valid as fpsimd_lazy_switch_to_{guest,host}() will restore/save the register appropriately. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 91053aa832d0..4a75e5f0c259 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -203,7 +203,6 @@ static void locate_register(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); - MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL ); MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL ); case CNTHCTL_EL2: @@ -2709,14 +2708,13 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, } if (!p->is_write) { - p->regval = vcpu_read_sys_reg(vcpu, ZCR_EL2); + p->regval = __vcpu_sys_reg(vcpu, ZCR_EL2); return true; } vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; vq = min(vq, vcpu_sve_max_vq(vcpu)); - vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2); - + __vcpu_assign_sys_reg(vcpu, ZCR_EL2, vq - 1); return true; } From 9a1950f97741a23fc68a7b2cfd487e059d389be5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 26 Sep 2025 12:41:08 -0700 Subject: [PATCH 17/46] KVM: arm64: nv: Don't advance PC when pending an SVE exception Jan reports that running a nested guest on Neoverse-V2 leads to a WARN in the host due to simultaneously pending an exception and a PC increment after an access to ZCR_EL2. Returning true from a sysreg accessor is an indication that the sysreg instruction has been retired. Of course this isn't the case when we've pended a synchronous SVE exception for the guest. Fix the return value and let the exception propagate to the guest as usual.
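The convention the fix restores is easiest to see as a schematic accessor (a sketch of the pattern, not the exact KVM code): return true only when the instruction was emulated and should be retired, i.e. the PC advanced; return false after pending an exception so the guest takes it at the trapping instruction.

static bool access_example(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
			   const struct sys_reg_desc *r)
{
	if (guest_hyp_sve_traps_enabled(vcpu)) {
		kvm_inject_nested_sve_trap(vcpu);
		return false;	/* exception pended, don't advance the PC */
	}

	if (!p->is_write)
		p->regval = __vcpu_sys_reg(vcpu, r->reg);
	else
		__vcpu_assign_sys_reg(vcpu, r->reg, p->regval);

	return true;		/* emulated, retire the instruction */
}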
Reported-by: Jan Kotas Closes: https://lore.kernel.org/kvmarm/865xd61tt5.wl-maz@kernel.org/ Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 4a75e5f0c259..ee8a7033c85b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2704,7 +2704,7 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, if (guest_hyp_sve_traps_enabled(vcpu)) { kvm_inject_nested_sve_trap(vcpu); - return true; + return false; } if (!p->is_write) { From a46c09b382eea3f9e3d16576096b987a2171fcca Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 26 Sep 2025 15:42:46 -0700 Subject: [PATCH 18/46] KVM: arm64: Use the in-context stage-1 in __kvm_find_s1_desc_level() Running the external_aborts selftest at EL2 leads to an ugly splat due to the stage-1 MMU being disabled for the walked context, owing to the fact that __kvm_find_s1_desc_level() is hardcoded to the EL1&0 regime. Select the appropriate translation regime for the stage-1 walk based on the current vCPU context. Fixes: b8e625167a32 ("KVM: arm64: Add S1 IPA to page table level walker") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/at.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 20bb9af125b1..e2e06ec8a67b 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1602,13 +1602,17 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) .fn = match_s1_desc, .priv = &dm, }, - .regime = TR_EL10, .as_el0 = false, .pan = false, }; struct s1_walk_result wr = {}; int ret; + if (is_hyp_ctxt(vcpu)) + wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + else + wi.regime = TR_EL10; + ret = setup_s1_walk(vcpu, &wi, &wr, va); if (ret) return ret; From 890c608b4d5e6a616693da92a2d4e7de4ab9e6c5 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 26 Sep 2025 15:44:54 -0700 Subject: [PATCH 19/46] KVM: arm64: selftests: Test effective value of HCR_EL2.AMO A defect against the architecture now allows an implementation to treat AMO as 1 when HCR_EL2.{E2H, TGE} = {1, 0}. KVM now takes advantage of this interpretation to address a quality of emulation issue w.r.t. SError injection. Add a corresponding test case and expect a pending SError to be taken. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier --- .../selftests/kvm/arm64/external_aborts.c | 43 +++++++++++++++++++ .../selftests/kvm/include/arm64/processor.h | 12 +++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/external_aborts.c b/tools/testing/selftests/kvm/arm64/external_aborts.c index 592b26ded779..d8fe17a6cc59 100644 --- a/tools/testing/selftests/kvm/arm64/external_aborts.c +++ b/tools/testing/selftests/kvm/arm64/external_aborts.c @@ -359,6 +359,44 @@ static void test_mmio_ease(void) kvm_vm_free(vm); } +static void test_serror_amo_guest(void) +{ + /* + * The ISB is entirely unnecessary (and highlights how FEAT_NV2 is borked) + * since the write is redirected to memory. But don't write (intentionally) + * broken code! + */ + sysreg_clear_set(hcr_el2, HCR_EL2_AMO | HCR_EL2_TGE, 0); + isb(); + + GUEST_SYNC(0); + GUEST_ASSERT(read_sysreg(isr_el1) & ISR_EL1_A); + + /* + * KVM treats the effective value of AMO as 1 when + * HCR_EL2.{E2H,TGE} = {1, 0}, meaning the SError will be taken when + * unmasked. 
+ */ + local_serror_enable(); + isb(); + local_serror_disable(); + + GUEST_FAIL("Should've taken pending SError exception"); +} + +static void test_serror_amo(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_serror_amo_guest, + unexpected_dabt_handler); + + vm_install_exception_handler(vm, VECTOR_ERROR_CURRENT, expect_serror_handler); + vcpu_run_expect_sync(vcpu); + vcpu_inject_serror(vcpu); + vcpu_run_expect_done(vcpu); + kvm_vm_free(vm); +} + int main(void) { test_mmio_abort(); @@ -369,4 +407,9 @@ int main(void) test_serror_emulated(); test_mmio_ease(); test_s1ptw_abort(); + + if (!test_supports_el2()) + return 0; + + test_serror_amo(); } diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h index 6f481475c135..ff928716574d 100644 --- a/tools/testing/selftests/kvm/include/arm64/processor.h +++ b/tools/testing/selftests/kvm/include/arm64/processor.h @@ -305,7 +305,17 @@ void test_wants_mte(void); void test_disable_default_vgic(void); bool vm_supports_el2(struct kvm_vm *vm); -static bool vcpu_has_el2(struct kvm_vcpu *vcpu) + +static inline bool test_supports_el2(void) +{ + struct kvm_vm *vm = vm_create(1); + bool supported = vm_supports_el2(vm); + + kvm_vm_free(vm); + return supported; +} + +static inline bool vcpu_has_el2(struct kvm_vcpu *vcpu) { return vcpu->init.features[0] & BIT(KVM_ARM_VCPU_HAS_EL2); } From cb49b7b8622e914171e9eb7197c006320889e0fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 26 Sep 2025 08:58:38 -0700 Subject: [PATCH 20/46] KVM: arm64: selftests: Track width of timer counter as "int", not "uint64_t" Store the width of arm64's timer counter as an "int", not a "uint64_t". ilog2() returns an "int", and more importantly using what is an "unsigned long" under the hood makes clang unhappy due to a type mismatch when clamping the width to a sane value. 
arm64/arch_timer_edge_cases.c:1032:10: error: comparison of distinct pointer types ('typeof (width) *' (aka 'unsigned long *') and 'typeof (56) *' (aka 'int *')) [-Werror,-Wcompare-distinct-pointer-types] 1032 | width = clamp(width, 56, 64); | ^~~~~~~~~~~~~~~~~~~~ tools/include/linux/kernel.h:47:45: note: expanded from macro 'clamp' 47 | #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) | ^~~~~~~~~~~~ tools/include/linux/kernel.h:33:17: note: expanded from macro 'max' 33 | (void) (&_max1 == &_max2); \ | ~~~~~~ ^ ~~~~~~ tools/include/linux/kernel.h:39:9: note: expanded from macro 'min' 39 | typeof(x) _min1 = (x); \ | ^ Fixes: fad4cf944839 ("KVM: arm64: selftests: Determine effective counter width in arch_timer_edge_cases") Cc: Sebastian Ott Signed-off-by: Sean Christopherson Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c index 91906414a474..993c9e38e729 100644 --- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c +++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c @@ -1020,7 +1020,7 @@ static void set_counter_defaults(void) { const uint64_t MIN_ROLLOVER_SECS = 40ULL * 365 * 24 * 3600; uint64_t freq = read_sysreg(CNTFRQ_EL0); - uint64_t width = ilog2(MIN_ROLLOVER_SECS * freq); + int width = ilog2(MIN_ROLLOVER_SECS * freq); width = clamp(width, 56, 64); CVAL_MAX = GENMASK_ULL(width - 1, 0); From 0aa1b76fe1429629215a7c79820e4b96233ac4a3 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 30 Sep 2025 01:52:37 -0700 Subject: [PATCH 21/46] KVM: arm64: Prevent access to vCPU events before init Another day, another syzkaller bug. KVM erroneously allows userspace to pend vCPU events for a vCPU that hasn't been initialized yet, leading to KVM interpreting a bunch of uninitialized garbage for routing / injecting the exception. In one case the injection code and the hyp disagree on whether the vCPU has a 32bit EL1 and put the vCPU into an illegal mode for AArch64, tripping the BUG() in exception_target_el() during the next injection: kernel BUG at arch/arm64/kvm/inject_fault.c:40! 
Internal error: Oops - BUG: 00000000f2000800 [#1] SMP CPU: 3 UID: 0 PID: 318 Comm: repro Not tainted 6.17.0-rc4-00104-g10fd0285305d #6 PREEMPT Hardware name: linux,dummy-virt (DT) pstate: 21402009 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : exception_target_el+0x88/0x8c lr : pend_serror_exception+0x18/0x13c sp : ffff800082f03a10 x29: ffff800082f03a10 x28: ffff0000cb132280 x27: 0000000000000000 x26: 0000000000000000 x25: ffff0000c2a99c20 x24: 0000000000000000 x23: 0000000000008000 x22: 0000000000000002 x21: 0000000000000004 x20: 0000000000008000 x19: ffff0000c2a99c20 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 00000000200000c0 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : ffff800082f03af8 x7 : 0000000000000000 x6 : 0000000000000000 x5 : ffff800080f621f0 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 000000000040009b x1 : 0000000000000003 x0 : ffff0000c2a99c20 Call trace: exception_target_el+0x88/0x8c (P) kvm_inject_serror_esr+0x40/0x3b4 __kvm_arm_vcpu_set_events+0xf0/0x100 kvm_arch_vcpu_ioctl+0x180/0x9d4 kvm_vcpu_ioctl+0x60c/0x9f4 __arm64_sys_ioctl+0xac/0x104 invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x34/0xf0 el0t_64_sync_handler+0xa0/0xe4 el0t_64_sync+0x198/0x19c Code: f946bc01 b4fffe61 9101e020 17fffff2 (d4210000) Reject the ioctls outright as no sane VMM would call these before KVM_ARM_VCPU_INIT anyway. Even if it did the exception would've been thrown away by the eventual reset of the vCPU's state. Cc: stable@vger.kernel.org # 6.17 Fixes: b7b27facc7b5 ("arm/arm64: KVM: Add KVM_GET/SET_VCPU_EVENTS") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index f21d1b7f20f8..f01cacb669cf 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1794,6 +1794,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_GET_VCPU_EVENTS: { struct kvm_vcpu_events events; + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + if (kvm_arm_vcpu_get_events(vcpu, &events)) return -EINVAL; @@ -1805,6 +1808,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_VCPU_EVENTS: { struct kvm_vcpu_events events; + if (!kvm_vcpu_initialized(vcpu)) + return -ENOEXEC; + if (copy_from_user(&events, argp, sizeof(events))) return -EFAULT; From cc4309324dc695f62d25d56c0b29805e9724170c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 30 Sep 2025 16:36:20 -0700 Subject: [PATCH 22/46] KVM: arm64: Document vCPU event ioctls as requiring init'ed vCPU KVM rejects calls to KVM_{GET,SET}_VCPU_EVENTS for an uninitialized vCPU as of commit cc96679f3c03 ("KVM: arm64: Prevent access to vCPU events before init"). Update the corresponding API documentation. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/api.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6ae24c5ca559..4973c74db5c6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1229,6 +1229,9 @@ It is not possible to read back a pending external abort (injected via KVM_SET_VCPU_EVENTS or otherwise) because such an exception is always delivered directly to the virtual CPU). +Calling this ioctl on a vCPU that hasn't been initialized will return +-ENOEXEC. 
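From the VMM's point of view, the resulting contract looks like the following hypothetical snippet (vm_fd and vcpu_fd are assumed to come from KVM_CREATE_VM and KVM_CREATE_VCPU):

#include <assert.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void get_events_after_init(int vm_fd, int vcpu_fd)
{
	struct kvm_vcpu_events events = {};
	struct kvm_vcpu_init init;

	/* Rejected: the vCPU's state is undefined before init. */
	assert(ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) == -1 &&
	       errno == ENOEXEC);

	assert(ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init) == 0);
	assert(ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init) == 0);

	/* Now events can be read back (and set) as usual. */
	assert(ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) == 0);
}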
+ :: struct kvm_vcpu_events { @@ -1309,6 +1312,8 @@ exceptions by manipulating individual registers using the KVM_SET_ONE_REG API. See KVM_GET_VCPU_EVENTS for the data structure. +Calling this ioctl on a vCPU that hasn't been initialized will return +-ENOEXEC. 4.33 KVM_GET_DEBUGREGS ---------------------- From a133052666bed0dc0b169952e9d3f9e6b2125f9a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 30 Sep 2025 12:33:02 -0700 Subject: [PATCH 23/46] KVM: selftests: Fix irqfd_test for non-x86 architectures The KVM_IRQFD ioctl fails if no irqchip is present in-kernel, which isn't too surprising as there's not much KVM can do for an IRQ if it cannot resolve a destination. As written the irqfd_test assumes that a 'default' VM created in selftests has an in-kernel irqchip created implicitly. That may be the case on x86 but it isn't necessarily true on other architectures. Add an arch predicate indicating if 'default' VMs get an irqchip and make the irqfd_test depend on it. Work around arm64 VGIC initialization requirements by using vm_create_with_one_vcpu(), ignoring the created vCPU as it isn't used for the test. Reported-by: Sebastian Ott Reported-by: Naresh Kamboju Acked-by: Sean Christopherson Fixes: 7e9b231c402a ("KVM: selftests: Add a KVM_IRQFD test to verify uniqueness requirements") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/kvm_util.h | 2 ++ tools/testing/selftests/kvm/irqfd_test.c | 14 +++++++++++--- tools/testing/selftests/kvm/lib/arm64/processor.c | 5 +++++ tools/testing/selftests/kvm/lib/kvm_util.c | 5 +++++ tools/testing/selftests/kvm/lib/s390/processor.c | 5 +++++ tools/testing/selftests/kvm/lib/x86/processor.c | 5 +++++ 6 files changed, 33 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 26cc30290e76..112d3f443a17 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -1273,4 +1273,6 @@ bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); uint32_t guest_get_vcpuid(void); +bool kvm_arch_has_default_irqchip(void); + #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/irqfd_test.c b/tools/testing/selftests/kvm/irqfd_test.c index 7c301b4c7005..5d7590d01868 100644 --- a/tools/testing/selftests/kvm/irqfd_test.c +++ b/tools/testing/selftests/kvm/irqfd_test.c @@ -89,11 +89,19 @@ static void juggle_eventfd_primary(struct kvm_vm *vm, int eventfd) int main(int argc, char *argv[]) { pthread_t racing_thread; + struct kvm_vcpu *unused; int r, i; - /* Create "full" VMs, as KVM_IRQFD requires an in-kernel IRQ chip. */ - vm1 = vm_create(1); - vm2 = vm_create(1); + TEST_REQUIRE(kvm_arch_has_default_irqchip()); + + /* + * Create "full" VMs, as KVM_IRQFD requires an in-kernel IRQ chip. Also + * create an unused vCPU as certain architectures (like arm64) need to + * complete IRQ chip initialization after all possible vCPUs for a VM + * have been created. 
+ */ + vm1 = vm_create_with_one_vcpu(&unused, NULL); + vm2 = vm_create_with_one_vcpu(&unused, NULL); WRITE_ONCE(__eventfd, kvm_new_eventfd()); diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 369a4c87dd8f..54f6d17c78f7 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -725,3 +725,8 @@ void kvm_arch_vm_release(struct kvm_vm *vm) if (vm->arch.has_gic) close(vm->arch.gic_fd); } + +bool kvm_arch_has_default_irqchip(void) +{ + return request_vgic && kvm_supports_vgic_v3(); +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 6743fbd9bd67..a35adfebfa23 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -2344,3 +2344,8 @@ bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr) pg = paddr >> vm->page_shift; return sparsebit_is_set(region->protected_phy_pages, pg); } + +__weak bool kvm_arch_has_default_irqchip(void) +{ + return false; +} diff --git a/tools/testing/selftests/kvm/lib/s390/processor.c b/tools/testing/selftests/kvm/lib/s390/processor.c index 20cfe970e3e3..8ceeb17c819a 100644 --- a/tools/testing/selftests/kvm/lib/s390/processor.c +++ b/tools/testing/selftests/kvm/lib/s390/processor.c @@ -221,3 +221,8 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) { } + +bool kvm_arch_has_default_irqchip(void) +{ + return true; +} diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index c748cd9b2eef..b418502c5ecc 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -1318,3 +1318,8 @@ bool sys_clocksource_is_based_on_tsc(void) return ret; } + +bool kvm_arch_has_default_irqchip(void) +{ + return true; +} From 05a02490faeb952f1c8d2f5c38346fa0a717a483 Mon Sep 17 00:00:00 2001 From: Osama Abdelkader Date: Tue, 30 Sep 2025 16:56:21 +0300 Subject: [PATCH 24/46] KVM: arm64: Remove unreachable break after return Remove an unnecessary 'break' statement that follows a 'return' in arch/arm64/kvm/at.c. The break is unreachable. Signed-off-by: Osama Abdelkader Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier --- arch/arm64/kvm/at.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index e2e06ec8a67b..be26d5aa668c 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -91,7 +91,6 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o case OP_AT_S1E2W: case OP_AT_S1E2A: return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; - break; default: return (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; From 9a7f87eb587da49993f47f44c4c5535d8de76750 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sun, 12 Oct 2025 23:43:52 +0800 Subject: [PATCH 25/46] KVM: arm64: selftests: Sync ID_AA64PFR1, MPIDR, CLIDR in guest We forgot to sync several registers (ID_AA64PFR1, MPIDR, CLIDR) in the guest to make sure that the guest sees the written value. Add them to the list.
Signed-off-by: Zenghui Yu Reviewed-by: Ben Horgan Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/set_id_regs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 8ff1e853f7f8..5e24f77868b5 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -249,11 +249,14 @@ static void guest_code(void) GUEST_REG_SYNC(SYS_ID_AA64ISAR2_EL1); GUEST_REG_SYNC(SYS_ID_AA64ISAR3_EL1); GUEST_REG_SYNC(SYS_ID_AA64PFR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64PFR1_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR0_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR3_EL1); GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); + GUEST_REG_SYNC(SYS_MPIDR_EL1); + GUEST_REG_SYNC(SYS_CLIDR_EL1); GUEST_REG_SYNC(SYS_CTR_EL0); GUEST_REG_SYNC(SYS_MIDR_EL1); GUEST_REG_SYNC(SYS_REVIDR_EL1); From c35dd838666d47de2848639234ec32e3ba22b49f Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Fri, 10 Oct 2025 23:17:07 +0530 Subject: [PATCH 26/46] KVM: arm64: Guard PMSCR_EL1 initialization with SPE presence check Commit efad60e46057 ("KVM: arm64: Initialize PMSCR_EL1 when in VHE") does not perform a sufficient check before initializing PMSCR_EL1 to 0 when running in VHE mode. On some platforms, this causes the system to hang during boot, as EL3 has not delegated access to the Profiling Buffer to the Non-secure world, nor does it reinject an UNDEF on sysreg trap. To avoid this issue, restrict the PMSCR_EL1 initialization to CPUs that support the Statistical Profiling Extension (FEAT_SPE) and have the Profiling Buffer accessible in Non-secure EL1. This is determined via a new helper `cpu_has_spe()` which checks both PMSVer and PMBIDR_EL1.P. This ensures the initialization only affects CPUs where SPE is implemented and usable, preventing boot failures on platforms where SPE is not properly configured. Fixes: efad60e46057 ("KVM: arm64: Initialize PMSCR_EL1 when in VHE") Signed-off-by: Mukesh Ojha Signed-off-by: Marc Zyngier --- arch/arm64/kvm/debug.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 3515a273eaa2..3ad6b7c6e4ba 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -15,6 +15,12 @@ #include #include +static int cpu_has_spe(u64 dfr0) +{ + return cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && + !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P); +} + /** * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value * @@ -77,13 +83,12 @@ void kvm_init_host_debug_data(void) *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); + if (cpu_has_spe(dfr0)) + host_data_set_flag(HAS_SPE); + if (has_vhe()) return; - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && - !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P)) - host_data_set_flag(HAS_SPE); - /* Check if we have BRBE implemented and available at the host */ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT)) host_data_set_flag(HAS_BRBE); @@ -102,7 +107,7 @@ void kvm_init_host_debug_data(void) void kvm_debug_init_vhe(void) { /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values.
*/ - if (SYS_FIELD_GET(ID_AA64DFR0_EL1, PMSVer, read_sysreg(id_aa64dfr0_el1))) + if (host_data_test_flag(HAS_SPE)) write_sysreg_el1(0, SYS_PMSCR); } From 2192d348c0aa0cc2e7249dc3709f21bfe0a0170c Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 8 Oct 2025 23:45:20 +0800 Subject: [PATCH 27/46] KVM: arm64: selftests: Allocate vcpus with correct size vcpus array contains pointers to struct kvm_vcpu {}. It is way overkill to allocate the array with (nr_cpus * sizeof(struct kvm_vcpu)). Fix the allocation by using the correct size. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c index 87922a89b134..b134a304f0a6 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -331,7 +331,7 @@ static void setup_vm(void) { int i; - vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu)); + vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu *)); TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus); From d5e6310a0d996493b1af9f3eeec418350523388b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 7 Oct 2025 12:52:55 -0700 Subject: [PATCH 28/46] KVM: arm64: selftests: Actually enable IRQs in vgic_lpi_stress vgic_lpi_stress rather hilariously leaves IRQs disabled for the duration of the test. While the ITS translation of MSIs happens regardless of this, for completeness the guest should actually handle the LPIs. Signed-off-by: Oliver Upton Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c index b134a304f0a6..687d04463983 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -123,6 +123,7 @@ static void guest_setup_gic(void) static void guest_code(size_t nr_lpis) { guest_setup_gic(); + local_irq_enable(); GUEST_SYNC(0); From 3193287ddffbce29fd1a79d812f543c0fe4861d1 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Tue, 7 Oct 2025 16:07:13 +0000 Subject: [PATCH 29/46] KVM: arm64: gic-v3: Only set ICH_HCR traps for v2-on-v3 or v3 guests The ICH_HCR_EL2 traps are used when running on GICv3 hardware, or when running a GICv3-based guest using FEAT_GCIE_LEGACY on GICv5 hardware. When running a GICv2 guest on GICv3 hardware the traps are used to ensure that the guest never sees any part of GICv3 (only GICv2 is visible to the guest), and when running a GICv3 guest they are used to trap in specific scenarios. They are not applicable for a GICv2-native guest, and won't be applicable for a(n upcoming) GICv5 guest. The traps themselves are configured in the vGIC CPU IF state, which is stored as a union. Updating the wrong aperture of the union risks corrupting state, and therefore needs to be avoided at all costs. Bail early if we're not running a compatible guest (GICv2 on GICv3 hardware, GICv3 native, GICv3 on GICv5 hardware). Trap everything unconditionally if we're running a GICv2 guest on GICv3 hardware. Otherwise, conditionally set up GICv3-native trapping. 
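The union hazard the early bail avoids can be demonstrated with a standalone toy (field names hypothetical, not the actual vgic layout): a write through the v3 aperture silently clobbers whatever other aperture the guest's state actually lives in.

#include <stdio.h>

/* Toy stand-in for the vGIC CPU IF state; the real layout differs. */
union cpu_if {
	struct { unsigned long vgic_hcr; } v3;
	struct { unsigned long vgic_cr;  } v5;
};

int main(void)
{
	union cpu_if cif = { .v5.vgic_cr = 0x1234 };

	/* A "v3" trap update applied to a non-v3 guest... */
	cif.v3.vgic_hcr |= 1UL << 63;

	/* ...corrupts the v5 view of the same storage. */
	printf("vgic_cr = %#lx\n", cif.v5.vgic_cr);
	return 0;
}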
Signed-off-by: Sascha Bischoff Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v3.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index f1c153106c56..6fbb4b099855 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -297,8 +297,11 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + if (!vgic_is_v3(vcpu->kvm)) + return; + /* Hide GICv3 sysreg if necessary */ - if (!kvm_has_gicv3(vcpu->kvm)) { + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | ICH_HCR_EL2_TC); return; From 164ecbf73c3ea61455e07eefdad8050a7b569558 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Tue, 7 Oct 2025 15:48:54 +0000 Subject: [PATCH 30/46] Documentation: KVM: Update GICv3 docs for GICv5 hosts GICv5 hosts optionally include FEAT_GCIE_LEGACY, which allows them to execute GICv3-based VMs on GICv5 hardware. Update the GICv3 documentation to reflect this now that GICv3 guests are supported on compatible GICv5 hosts. Signed-off-by: Sascha Bischoff Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/devices/arm-vgic-v3.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst b/Documentation/virt/kvm/devices/arm-vgic-v3.rst index ff02102f7141..5395ee66fc32 100644 --- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst +++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst @@ -13,7 +13,8 @@ will act as the VM interrupt controller, requiring emulated user-space devices to inject interrupts to the VGIC instead of directly to CPUs. It is not possible to create both a GICv3 and GICv2 on the same VM. -Creating a guest GICv3 device requires a host GICv3 as well. +Creating a guest GICv3 device requires a GICv3 host, or a GICv5 host with +support for FEAT_GCIE_LEGACY. Groups: From 4cab5c857d1f92b4b322e30349fdc5e2e38e7a2f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:45 +0100 Subject: [PATCH 31/46] KVM: arm64: Hide CNTHV_*_EL2 from userspace for nVHE guests Although we correctly UNDEF any CNTHV_*_EL2 access from the guest when E2H==0, we still expose these registers to userspace, which is a bad idea. Drop the ad-hoc UNDEF injection and switch to a .visibility() callback which will also hide the register from userspace.
Fixes: 0e45981028550 ("KVM: arm64: timer: Don't adjust the EL2 virtual timer offset") Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ee8a7033c85b..9f2f4e0b042e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1594,16 +1594,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } -static bool access_hv_timer(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - if (!vcpu_el2_e2h_is_set(vcpu)) - return undef_access(vcpu, p, r); - - return access_arch_timer(vcpu, p, r); -} - static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -2831,6 +2821,16 @@ static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, return __el2_visibility(vcpu, rd, s1pie_visibility); } +static unsigned int cnthv_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (vcpu_has_nv(vcpu) && + !vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2_E2H0)) + return 0; + + return REG_HIDDEN; +} + static bool access_mdcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -3691,9 +3691,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), - { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer }, - EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0), - EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0), + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer, .visibility = cnthv_visibility }, + EL2_REG_FILTERED(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0, cnthv_visibility), + EL2_REG_FILTERED(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0, cnthv_visibility), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, From aa68975c973ed3b0bd4ff513113495588afb855c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:46 +0100 Subject: [PATCH 32/46] KVM: arm64: Introduce timer_context_to_vcpu() helper We currently have a vcpu pointer nested into each timer context. As we are about to remove this pointer, introduce a helper (aptly named timer_context_to_vcpu()) that returns this pointer, at least until we repaint the data structure. 
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 25 +++++++++++++------------ include/kvm/arm_arch_timer.h | 2 +- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index dbd74e4885e2..e5a25e743f5b 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -66,7 +66,7 @@ static int nr_timers(struct kvm_vcpu *vcpu) u32 timer_get_ctl(struct arch_timer_context *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: @@ -85,7 +85,7 @@ u32 timer_get_ctl(struct arch_timer_context *ctxt) u64 timer_get_cval(struct arch_timer_context *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: @@ -104,7 +104,7 @@ u64 timer_get_cval(struct arch_timer_context *ctxt) static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: @@ -126,7 +126,7 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctxt); switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: @@ -343,7 +343,7 @@ static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) u64 ns; ctx = container_of(hrt, struct arch_timer_context, hrtimer); - vcpu = ctx->vcpu; + vcpu = timer_context_to_vcpu(ctx); trace_kvm_timer_hrtimer_expire(ctx); @@ -436,8 +436,9 @@ static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) * * But hey, it's fast, right? 
*/ - if (is_hyp_ctxt(ctx->vcpu) && - (ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) { + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx); + if (is_hyp_ctxt(vcpu) && + (ctx == vcpu_vtimer(vcpu) || ctx == vcpu_ptimer(vcpu))) { unsigned long val = timer_get_ctl(ctx); __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level); timer_set_ctl(ctx, val); @@ -470,7 +471,7 @@ static void timer_emulate(struct arch_timer_context *ctx) trace_kvm_timer_emulate(ctx, should_fire); if (should_fire != ctx->irq.level) - kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); + kvm_timer_update_irq(timer_context_to_vcpu(ctx), should_fire, ctx); kvm_timer_update_status(ctx, should_fire); @@ -498,7 +499,7 @@ static void set_cntpoff(u64 cntpoff) static void timer_save_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx)); enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; @@ -609,7 +610,7 @@ static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) static void timer_restore_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + struct arch_timer_cpu *timer = vcpu_timer(timer_context_to_vcpu(ctx)); enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; @@ -668,7 +669,7 @@ static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, boo static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { - struct kvm_vcpu *vcpu = ctx->vcpu; + struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx); bool phys_active = false; /* @@ -677,7 +678,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) * this point and the register restoration, we'll take the * interrupt anyway. */ - kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(ctx), ctx); if (irqchip_in_kernel(vcpu->kvm)) phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx)); diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 681cf0c8b9df..d188c716d03c 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -128,7 +128,7 @@ void kvm_timer_init_vhe(void); #define vcpu_hptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HPTIMER]) #define arch_timer_ctx_index(ctx) ((ctx) - vcpu_timer((ctx)->vcpu)->timers) - +#define timer_context_to_vcpu(ctx) ((ctx)->vcpu) #define timer_vm_data(ctx) (&(ctx)->vcpu->kvm->arch.timer_data) #define timer_irq(ctx) (timer_vm_data(ctx)->ppi[arch_timer_ctx_index(ctx)]) From 8625a670afb05f1e1d69d50a74dbcc9d1b855efe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:47 +0100 Subject: [PATCH 33/46] KVM: arm64: Replace timer context vcpu pointer with timer_id Having to follow a pointer to a vcpu is pretty dumb, when the timers are at a fixed offset in the vcpu structure itself. Trade the vcpu pointer for a timer_id, which can then be used to compute the vcpu address as needed.
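The trick that makes the trade possible: container_of() can subtract the offset of a specific array element, so an embedded timer plus its stored index is enough to recover the containing vcpu. A standalone illustration (mirroring the macro this patch introduces; __builtin_offsetof with a variable index is a GNU C extension that both GCC and clang accept):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct timer { int timer_id; };

struct vcpu {
	int vcpu_id;
	struct timer timers[4];
};

/* Recover the containing vcpu from an embedded timer, no back-pointer. */
#define timer_to_vcpu(t) \
	container_of((t), struct vcpu, timers[(t)->timer_id])

int main(void)
{
	struct vcpu v = { .vcpu_id = 7 };

	for (int i = 0; i < 4; i++)
		v.timers[i].timer_id = i;

	printf("vcpu_id = %d\n", timer_to_vcpu(&v.timers[2])->vcpu_id);
	return 0;
}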
Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 4 ++-- include/kvm/arm_arch_timer.h | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index e5a25e743f5b..c832c293676a 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -149,7 +149,7 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset) { if (!ctxt->offset.vm_offset) { - WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt)); + WARN(offset, "timer %d\n", arch_timer_ctx_index(ctxt)); return; } @@ -1064,7 +1064,7 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid); struct kvm *kvm = vcpu->kvm; - ctxt->vcpu = vcpu; + ctxt->timer_id = timerid; if (timerid == TIMER_VTIMER) ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index d188c716d03c..d8e400cb2bff 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -51,8 +51,6 @@ struct arch_timer_vm_data { }; struct arch_timer_context { - struct kvm_vcpu *vcpu; - /* Emulated Timer (may be unused) */ struct hrtimer hrtimer; u64 ns_frac; @@ -71,6 +69,9 @@ struct arch_timer_context { bool level; } irq; + /* Who am I? */ + enum kvm_arch_timers timer_id; + /* Duplicated state from arch_timer.c for convenience */ u32 host_timer_irq; }; @@ -127,9 +128,9 @@ void kvm_timer_init_vhe(void); #define vcpu_hvtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HVTIMER]) #define vcpu_hptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HPTIMER]) -#define arch_timer_ctx_index(ctx) ((ctx) - vcpu_timer((ctx)->vcpu)->timers) -#define timer_context_to_vcpu(ctx) ((ctx)->vcpu) -#define timer_vm_data(ctx) (&(ctx)->vcpu->kvm->arch.timer_data) +#define arch_timer_ctx_index(ctx) ((ctx)->timer_id) +#define timer_context_to_vcpu(ctx) container_of((ctx), struct kvm_vcpu, arch.timer_cpu.timers[(ctx)->timer_id]) +#define timer_vm_data(ctx) (&(timer_context_to_vcpu(ctx)->kvm->arch.timer_data)) #define timer_irq(ctx) (timer_vm_data(ctx)->ppi[arch_timer_ctx_index(ctx)]) u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, From a92d552266890f83126fdef4f777a985cc1302bd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:48 +0100 Subject: [PATCH 34/46] KVM: arm64: Make timer_set_offset() generally accessible Move the timer_set_offset() helper to arm_arch_timer.h, so that it is next to timer_get_offset(), and accessible by the rest of KVM. 
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 10 ---------- include/kvm/arm_arch_timer.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index c832c293676a..27662a3a3043 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -146,16 +146,6 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) } } -static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset) -{ - if (!ctxt->offset.vm_offset) { - WARN(offset, "timer %d\n", arch_timer_ctx_index(ctxt)); - return; - } - - WRITE_ONCE(*ctxt->offset.vm_offset, offset); -} - u64 kvm_phys_timer_read(void) { return timecounter->cc->read(timecounter->cc); diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index d8e400cb2bff..5f7f2ed8817c 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -179,4 +179,14 @@ static inline u64 timer_get_offset(struct arch_timer_context *ctxt) return offset; } +static inline void timer_set_offset(struct arch_timer_context *ctxt, u64 offset) +{ + if (!ctxt->offset.vm_offset) { + WARN(offset, "timer %d\n", arch_timer_ctx_index(ctxt)); + return; + } + + WRITE_ONCE(*ctxt->offset.vm_offset, offset); +} + #endif From 77a0c42eaf03c66936429d190bb2ea1a214bd528 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:49 +0100 Subject: [PATCH 35/46] KVM: arm64: Add timer UAPI workaround to sysreg infrastructure Amongst the numerous bugs that plague the KVM/arm64 UAPI, one of the most annoying things is that the userspace view of the virtual timer has its CVAL and CNT encodings swapped. In order to reduce the amount of code that has to know about this, start by adding handling for this bug in the sys_reg code. Nothing is making use of it yet, as the code responsible for userspace interaction is catching the accesses early.
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 33 ++++++++++++++++++++++++++++++--- arch/arm64/kvm/sys_regs.h | 6 ++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9f2f4e0b042e..8e6f50f54b4b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -5231,15 +5231,28 @@ static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) } } +static u64 kvm_one_reg_to_id(const struct kvm_one_reg *reg) +{ + switch(reg->id) { + case KVM_REG_ARM_TIMER_CVAL: + return TO_ARM64_SYS_REG(CNTV_CVAL_EL0); + case KVM_REG_ARM_TIMER_CNT: + return TO_ARM64_SYS_REG(CNTVCT_EL0); + default: + return reg->id; + } +} + int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num) { u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; + u64 id = kvm_one_reg_to_id(reg); u64 val; int ret; - r = id_to_sys_reg_desc(vcpu, reg->id, table, num); + r = id_to_sys_reg_desc(vcpu, id, table, num); if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; @@ -5272,13 +5285,14 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, { u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; + u64 id = kvm_one_reg_to_id(reg); u64 val; int ret; if (get_user(val, uaddr)) return -EFAULT; - r = id_to_sys_reg_desc(vcpu, reg->id, table, num); + r = id_to_sys_reg_desc(vcpu, id, table, num); if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; @@ -5338,10 +5352,23 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg) static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) { + u64 idx; + if (!*uind) return true; - if (put_user(sys_reg_to_index(reg), *uind)) + switch (reg_to_encoding(reg)) { + case SYS_CNTV_CVAL_EL0: + idx = KVM_REG_ARM_TIMER_CVAL; + break; + case SYS_CNTVCT_EL0: + idx = KVM_REG_ARM_TIMER_CNT; + break; + default: + idx = sys_reg_to_index(reg); + } + + if (put_user(idx, *uind)) return false; (*uind)++; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 317abc490368..b3f904472fac 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -257,4 +257,10 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); (val); \ }) +#define TO_ARM64_SYS_REG(r) ARM64_SYS_REG(sys_reg_Op0(SYS_ ## r), \ + sys_reg_Op1(SYS_ ## r), \ + sys_reg_CRn(SYS_ ## r), \ + sys_reg_CRm(SYS_ ## r), \ + sys_reg_Op2(SYS_ ## r)) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ From 09424d5d7d4e8b427ee4a737fb7765103789e08a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:50 +0100 Subject: [PATCH 36/46] KVM: arm64: Move CNT*_CTL_EL0 userspace accessors to generic infrastructure Remove the handling of CNT*_CTL_EL0 from guest.c, and move it to sys_regs.c, using a new TIMER_REG() definition to encapsulate it. 
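The reason the new set_user path masks ISTATUS is that the bit isn't independent state: architecturally it is derived from the enable bit and the comparator versus the current count, so a userspace-written value would be meaningless. Roughly (a sketch of the architectural relation, not KVM code):

#include <stdbool.h>
#include <stdint.h>

#define ARCH_TIMER_CTRL_ENABLE	(1U << 0)

/* CTL.ISTATUS reads as 1 while the timer is enabled and the condition
 * (CVAL <= count, in signed 64-bit arithmetic) is met; it is never
 * stored as its own state. */
static bool timer_istatus(uint32_t ctl, uint64_t cval, uint64_t cnt)
{
	return (ctl & ARCH_TIMER_CTRL_ENABLE) && (int64_t)(cnt - cval) >= 0;
}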
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 4 ---- arch/arm64/kvm/sys_regs.c | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 16ba5e9ac86c..dea648706fd5 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -592,10 +592,8 @@ static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) } static const u64 timer_reg_list[] = { - KVM_REG_ARM_TIMER_CTL, KVM_REG_ARM_TIMER_CNT, KVM_REG_ARM_TIMER_CVAL, - KVM_REG_ARM_PTIMER_CTL, KVM_REG_ARM_PTIMER_CNT, KVM_REG_ARM_PTIMER_CVAL, }; @@ -605,10 +603,8 @@ static const u64 timer_reg_list[] = { static bool is_timer_reg(u64 index) { switch (index) { - case KVM_REG_ARM_TIMER_CTL: case KVM_REG_ARM_TIMER_CNT: case KVM_REG_ARM_TIMER_CVAL: - case KVM_REG_ARM_PTIMER_CTL: case KVM_REG_ARM_PTIMER_CNT: case KVM_REG_ARM_PTIMER_CVAL: return true; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 8e6f50f54b4b..d97aacf4c1dc 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1594,6 +1594,23 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static int arch_timer_set_user(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + switch (reg_to_encoding(rd)) { + case SYS_CNTV_CTL_EL0: + case SYS_CNTP_CTL_EL0: + case SYS_CNTHV_CTL_EL2: + case SYS_CNTHP_CTL_EL2: + val &= ~ARCH_TIMER_CTRL_IT_STAT; + break; + } + + __vcpu_assign_sys_reg(vcpu, rd->reg, val); + return 0; +} + static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -2496,15 +2513,20 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, "trap of EL2 register redirected to EL1"); } -#define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ +#define SYS_REG_USER_FILTER(name, acc, rst, v, gu, su, filter) { \ SYS_DESC(SYS_##name), \ .access = acc, \ .reset = rst, \ .reg = name, \ + .get_user = gu, \ + .set_user = su, \ .visibility = filter, \ .val = v, \ } +#define EL2_REG_FILTERED(name, acc, rst, v, filter) \ + SYS_REG_USER_FILTER(name, acc, rst, v, NULL, NULL, filter) + #define EL2_REG(name, acc, rst, v) \ EL2_REG_FILTERED(name, acc, rst, v, el2_visibility) @@ -2515,6 +2537,10 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, EL2_REG_VNCR_FILT(name, hidden_visibility) #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) +#define TIMER_REG(name, vis) \ + SYS_REG_USER_FILTER(name, access_arch_timer, reset_val, 0, \ + NULL, arch_timer_set_user, vis) + /* * Since reset() callback and field val are not used for idregs, they will be * used for specific purposes for idregs. 
@@ -3485,11 +3511,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, - { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, + TIMER_REG(CNTP_CTL_EL0, NULL), { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, - { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer }, + TIMER_REG(CNTV_CTL_EL0, NULL), { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer }, /* PMEVCNTRn_EL0 */ @@ -3688,11 +3714,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, - EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), + TIMER_REG(CNTHP_CTL_EL2, el2_visibility), EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer, .visibility = cnthv_visibility }, - EL2_REG_FILTERED(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0, cnthv_visibility), + TIMER_REG(CNTHV_CTL_EL2, cnthv_visibility), EL2_REG_FILTERED(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0, cnthv_visibility), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, From 8af198980eff2ed2a5df3d2ee39f8c9d61f40559 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:51 +0100 Subject: [PATCH 37/46] KVM: arm64: Move CNT*_CVAL_EL0 userspace accessors to generic infrastructure As for the control registers, move the comparator registers to the common infrastructure. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 4 ---- arch/arm64/kvm/sys_regs.c | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index dea648706fd5..c23ec9be4ce2 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -593,9 +593,7 @@ static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) static const u64 timer_reg_list[] = { KVM_REG_ARM_TIMER_CNT, - KVM_REG_ARM_TIMER_CVAL, KVM_REG_ARM_PTIMER_CNT, - KVM_REG_ARM_PTIMER_CVAL, }; #define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) @@ -604,9 +602,7 @@ static bool is_timer_reg(u64 index) { switch (index) { case KVM_REG_ARM_TIMER_CNT: - case KVM_REG_ARM_TIMER_CVAL: case KVM_REG_ARM_PTIMER_CNT: - case KVM_REG_ARM_PTIMER_CVAL: return true; } return false; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d97aacf4c1dc..68e88d5c0dfb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3512,11 +3512,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, TIMER_REG(CNTP_CTL_EL0, NULL), - { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, + TIMER_REG(CNTP_CVAL_EL0, NULL), { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, TIMER_REG(CNTV_CTL_EL0, NULL), - { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer }, + TIMER_REG(CNTV_CVAL_EL0, NULL), /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), @@ -3715,11 +3715,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, TIMER_REG(CNTHP_CTL_EL2, el2_visibility), - EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), + TIMER_REG(CNTHP_CVAL_EL2, el2_visibility), { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer, .visibility = cnthv_visibility }, 
TIMER_REG(CNTHV_CTL_EL2, cnthv_visibility), - EL2_REG_FILTERED(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0, cnthv_visibility), + TIMER_REG(CNTHV_CVAL_EL2, cnthv_visibility), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, From c3be3a48fb18f9d243fac452e0be41469bb246b4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:52 +0100 Subject: [PATCH 38/46] KVM: arm64: Move CNT*CT_EL0 userspace accessors to generic infrastructure Moving the counter registers is a bit more involved than for the control and comparator (there is no shadow data for the counter), but still pretty manageable. Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 7 ------- arch/arm64/kvm/sys_regs.c | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index c23ec9be4ce2..138e5e2dc10c 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -592,19 +592,12 @@ static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) } static const u64 timer_reg_list[] = { - KVM_REG_ARM_TIMER_CNT, - KVM_REG_ARM_PTIMER_CNT, }; #define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) static bool is_timer_reg(u64 index) { - switch (index) { - case KVM_REG_ARM_TIMER_CNT: - case KVM_REG_ARM_PTIMER_CNT: - return true; - } return false; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 68e88d5c0dfb..e67eb39ddc11 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1605,12 +1605,38 @@ static int arch_timer_set_user(struct kvm_vcpu *vcpu, case SYS_CNTHP_CTL_EL2: val &= ~ARCH_TIMER_CTRL_IT_STAT; break; + case SYS_CNTVCT_EL0: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) + timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read() - val); + return 0; + case SYS_CNTPCT_EL0: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) + timer_set_offset(vcpu_ptimer(vcpu), kvm_phys_timer_read() - val); + return 0; } __vcpu_assign_sys_reg(vcpu, rd->reg, val); return 0; } +static int arch_timer_get_user(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 *val) +{ + switch (reg_to_encoding(rd)) { + case SYS_CNTVCT_EL0: + *val = kvm_phys_timer_read() - timer_get_offset(vcpu_vtimer(vcpu)); + break; + case SYS_CNTPCT_EL0: + *val = kvm_phys_timer_read() - timer_get_offset(vcpu_ptimer(vcpu)); + break; + default: + *val = __vcpu_sys_reg(vcpu, rd->reg); + } + + return 0; +} + static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -2539,7 +2565,7 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, #define TIMER_REG(name, vis) \ SYS_REG_USER_FILTER(name, access_arch_timer, reset_val, 0, \ - NULL, arch_timer_set_user, vis) + arch_timer_get_user, arch_timer_set_user, vis) /* * Since reset() callback and field val are not used for idregs, they will be @@ -3506,8 +3532,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVTYPER1_EL0(14), AMU_AMEVTYPER1_EL0(15), - { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer }, - { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTPCT_EL0), .access = access_arch_timer, + .get_user = arch_timer_get_user, .set_user = arch_timer_set_user }, + { SYS_DESC(SYS_CNTVCT_EL0), .access = access_arch_timer, + .get_user = arch_timer_get_user, .set_user = arch_timer_set_user }, { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), 
access_arch_timer }, From 892f7c38ba3b7de19b3dffb8e148d5fbf1228f20 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:53 +0100 Subject: [PATCH 39/46] KVM: arm64: Fix WFxT handling of nested virt The spec for WFxT indicates that the parameter to the WFxT instruction is relative to the reading of CNTVCT_EL0. This means that the implementation needs to take the execution context into account, as CNTVOFF_EL2 does not always affect readings of CNTVCT_EL0 (such as when HCR_EL2.E2H is 1 and we're in the host context). This also rids us of the last instance of KVM_REG_ARM_TIMER_CNT outside of the userspace interaction code. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/handle_exit.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index bca8c80e11da..cc7d5d1709cb 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -147,7 +147,12 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) if (esr & ESR_ELx_WFx_ISS_RV) { u64 val, now; - now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT); + now = kvm_phys_timer_read(); + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + now -= timer_get_offset(vcpu_hvtimer(vcpu)); + else + now -= timer_get_offset(vcpu_vtimer(vcpu)); + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); if (now >= val) From 386aac77da112651a5cdadc4a6b29181592f5aa0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:54 +0100 Subject: [PATCH 40/46] KVM: arm64: Kill leftovers of ad-hoc timer userspace access Now that the whole timer infrastructure is handled as system register accesses, get rid of the now unused ad-hoc infrastructure. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 68 ------------------------------------ arch/arm64/kvm/guest.c | 55 ----------------------------- include/kvm/arm_arch_timer.h | 3 -- 3 files changed, 126 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 27662a3a3043..3f675875abea 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1112,49 +1112,6 @@ void kvm_timer_cpu_down(void) disable_percpu_irq(host_ptimer_irq); } -int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) -{ - struct arch_timer_context *timer; - - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - timer = vcpu_vtimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); - break; - case KVM_REG_ARM_TIMER_CNT: - if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, - &vcpu->kvm->arch.flags)) { - timer = vcpu_vtimer(vcpu); - timer_set_offset(timer, kvm_phys_timer_read() - value); - } - break; - case KVM_REG_ARM_TIMER_CVAL: - timer = vcpu_vtimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); - break; - case KVM_REG_ARM_PTIMER_CTL: - timer = vcpu_ptimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); - break; - case KVM_REG_ARM_PTIMER_CNT: - if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, - &vcpu->kvm->arch.flags)) { - timer = vcpu_ptimer(vcpu); - timer_set_offset(timer, kvm_phys_timer_read() - value); - } - break; - case KVM_REG_ARM_PTIMER_CVAL: - timer = vcpu_ptimer(vcpu); - kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); - break; - - default: - return -1; - } - - return 0; -} - static u64 read_timer_ctl(struct arch_timer_context *timer) { /* @@ -1171,31 +1128,6 @@ static u64 read_timer_ctl(struct arch_timer_context *timer) return ctl; } -u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
-{ - switch (regid) { - case KVM_REG_ARM_TIMER_CTL: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CTL); - case KVM_REG_ARM_TIMER_CNT: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CNT); - case KVM_REG_ARM_TIMER_CVAL: - return kvm_arm_timer_read(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CVAL); - case KVM_REG_ARM_PTIMER_CTL: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CTL); - case KVM_REG_ARM_PTIMER_CNT: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CNT); - case KVM_REG_ARM_PTIMER_CVAL: - return kvm_arm_timer_read(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CVAL); - } - return (u64)-1; -} - static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 138e5e2dc10c..1c87699fd886 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -591,49 +591,6 @@ static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) return copy_core_reg_indices(vcpu, NULL); } -static const u64 timer_reg_list[] = { -}; - -#define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) - -static bool is_timer_reg(u64 index) -{ - return false; -} - -static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - for (int i = 0; i < NUM_TIMER_REGS; i++) { - if (put_user(timer_reg_list[i], uindices)) - return -EFAULT; - uindices++; - } - - return 0; -} - -static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - int ret; - - ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); - if (ret != 0) - return -EFAULT; - - return kvm_arm_timer_set_reg(vcpu, reg->id, val); -} - -static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - - val = kvm_arm_timer_get_reg(vcpu, reg->id); - return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? 
-EFAULT : 0; -} - static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) { const unsigned int slices = vcpu_sve_slices(vcpu); @@ -709,7 +666,6 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) res += num_sve_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); - res += NUM_TIMER_REGS; return res; } @@ -740,11 +696,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return ret; uindices += kvm_arm_get_fw_num_regs(vcpu); - ret = copy_timer_indices(vcpu, uindices); - if (ret < 0) - return ret; - uindices += NUM_TIMER_REGS; - return kvm_arm_copy_sys_reg_indices(vcpu, uindices); } @@ -762,9 +713,6 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); } - if (is_timer_reg(reg->id)) - return get_timer_reg(vcpu, reg); - return kvm_arm_sys_reg_get_reg(vcpu, reg); } @@ -782,9 +730,6 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); } - if (is_timer_reg(reg->id)) - return set_timer_reg(vcpu, reg); - return kvm_arm_sys_reg_set_reg(vcpu, reg); } diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 5f7f2ed8817c..7310841f4512 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -107,9 +107,6 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); void kvm_timer_init_vm(struct kvm *kvm); -u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); -int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); - int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); From 6418330c8478735f625398bc4e96d3ac6ce1e055 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:55 +0100 Subject: [PATCH 41/46] KVM: arm64: selftests: Make dependencies on VHE-specific registers explicit The hyp virtual timer registers only exist when VHE is present. Similarly, VNCR_EL2 only exists when NV2 is present. Make these dependencies explicit. Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/get-reg-list.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index 011fad95dd02..0a4cfb368512 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -65,6 +65,9 @@ static struct feature_id_reg feat_id_regs[] = { REG_FEAT(SCTLR2_EL1, ID_AA64MMFR3_EL1, SCTLRX, IMP), REG_FEAT(VDISR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VSESR_EL2, ID_AA64PFR0_EL1, RAS, IMP), + REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY), + REG_FEAT(CNTHV_CTL_EL2, ID_AA64MMFR1_EL1, VH, IMP), + REG_FEAT(CNTHV_CVAL_EL2,ID_AA64MMFR1_EL1, VH, IMP), }; bool filter_reg(__u64 reg) From 4da5a9af78b74fb771a4d25dc794296d10e170b1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:56 +0100 Subject: [PATCH 42/46] KVM: arm64: selftests: Add an E2H=0-specific configuration to get_reg_list Add yet another configuration, this time dealing with E2H=0.
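For reference, exercising this sublist boils down to the following vCPU setup (a rough sketch rather than the harness's actual plumbing; vm_fd/vcpu_fd are assumed to be open and error handling is elided):

  struct kvm_vcpu_init init;

  /* Only valid when KVM_CAP_ARM_EL2_E2H0 is advertised. */
  ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init);
  init.features[0] |= 1u << KVM_ARM_VCPU_HAS_EL2;       /* EL2 guest */
  init.features[0] |= 1u << KVM_ARM_VCPU_HAS_EL2_E2H0;  /* ... without VHE */
  ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);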
Signed-off-by: Marc Zyngier --- .../selftests/kvm/arm64/get-reg-list.c | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index 0a4cfb368512..7a238755f072 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -758,6 +758,10 @@ static __u64 el2_regs[] = { SYS_REG(VSESR_EL2), }; +static __u64 el2_e2h0_regs[] = { + /* Empty */ +}; + #define BASE_SUBLIST \ { "base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), } #define VREGS_SUBLIST \ @@ -792,6 +796,15 @@ static __u64 el2_regs[] = { .regs = el2_regs, \ .regs_n = ARRAY_SIZE(el2_regs), \ } +#define EL2_E2H0_SUBLIST \ + EL2_SUBLIST, \ + { \ + .name = "EL2 E2H0", \ + .capability = KVM_CAP_ARM_EL2_E2H0, \ + .feature = KVM_ARM_VCPU_HAS_EL2_E2H0, \ + .regs = el2_e2h0_regs, \ + .regs_n = ARRAY_SIZE(el2_e2h0_regs), \ + } static struct vcpu_reg_list vregs_config = { .sublists = { @@ -900,6 +913,65 @@ static struct vcpu_reg_list el2_pauth_pmu_config = { }, }; +static struct vcpu_reg_list el2_e2h0_vregs_config = { + .sublists = { + BASE_SUBLIST, + EL2_E2H0_SUBLIST, + VREGS_SUBLIST, + {0}, + }, +}; + +static struct vcpu_reg_list el2_e2h0_vregs_pmu_config = { + .sublists = { + BASE_SUBLIST, + EL2_E2H0_SUBLIST, + VREGS_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; + +static struct vcpu_reg_list el2_e2h0_sve_config = { + .sublists = { + BASE_SUBLIST, + EL2_E2H0_SUBLIST, + SVE_SUBLIST, + {0}, + }, +}; + +static struct vcpu_reg_list el2_e2h0_sve_pmu_config = { + .sublists = { + BASE_SUBLIST, + EL2_E2H0_SUBLIST, + SVE_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; + +static struct vcpu_reg_list el2_e2h0_pauth_config = { + .sublists = { + BASE_SUBLIST, + EL2_E2H0_SUBLIST, + VREGS_SUBLIST, + PAUTH_SUBLIST, + {0}, + }, +}; + +static struct vcpu_reg_list el2_e2h0_pauth_pmu_config = { + .sublists = { + BASE_SUBLIST, + EL2_E2H0_SUBLIST, + VREGS_SUBLIST, + PAUTH_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; + struct vcpu_reg_list *vcpu_configs[] = { &vregs_config, &vregs_pmu_config, @@ -914,5 +986,12 @@ struct vcpu_reg_list *vcpu_configs[] = { &el2_sve_pmu_config, &el2_pauth_config, &el2_pauth_pmu_config, + + &el2_e2h0_vregs_config, + &el2_e2h0_vregs_pmu_config, + &el2_e2h0_sve_config, + &el2_e2h0_sve_pmu_config, + &el2_e2h0_pauth_config, + &el2_e2h0_pauth_pmu_config, }; int vcpu_configs_n = ARRAY_SIZE(vcpu_configs); From 5c7cf1e44e94a5408b1b5277810502b0f82b77fe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 29 Sep 2025 17:04:57 +0100 Subject: [PATCH 43/46] KVM: arm64: selftests: Fix misleading comment about virtual timer encoding The userspace-visible encodings for CNTV_CVAL_EL0 and CNTVCT_EL0 have been swapped for as long as userspace has had access to the registers. This is documented in arch/arm64/include/uapi/asm/kvm.h. Despite that, the get_reg_list test has unhelpful comments indicating the wrong register for the encoding. Replace this with definitions exposed in the include file, and a comment explaining again the brokenness.
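For context, the relevant uapi definitions read roughly as follows (paraphrased from the header); note how the CVAL index carries CNTVCT_EL0's encoding (CRm=0, op2=2) while the CNT index carries CNTV_CVAL_EL0's (CRm=3, op2=2):

  #define KVM_REG_ARM_TIMER_CTL   ARM64_SYS_REG(3, 3, 14, 3, 1) /* CNTV_CTL_EL0 */
  #define KVM_REG_ARM_TIMER_CVAL  ARM64_SYS_REG(3, 3, 14, 0, 2) /* CNTVCT_EL0's encoding! */
  #define KVM_REG_ARM_TIMER_CNT   ARM64_SYS_REG(3, 3, 14, 3, 2) /* CNTV_CVAL_EL0's encoding! */

This is exactly the swap that kvm_one_reg_to_id() undoes on the kernel side.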
Signed-off-by: Marc Zyngier --- .../testing/selftests/kvm/arm64/get-reg-list.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index 7a238755f072..c9b84eeaab6b 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -348,9 +348,20 @@ static __u64 base_regs[] = { KVM_REG_ARM_FW_FEAT_BMAP_REG(1), /* KVM_REG_ARM_STD_HYP_BMAP */ KVM_REG_ARM_FW_FEAT_BMAP_REG(2), /* KVM_REG_ARM_VENDOR_HYP_BMAP */ KVM_REG_ARM_FW_FEAT_BMAP_REG(3), /* KVM_REG_ARM_VENDOR_HYP_BMAP_2 */ - ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ - ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ - ARM64_SYS_REG(3, 3, 14, 0, 2), + + /* + * EL0 Virtual Timer Registers + * + * WARNING: + * KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT are not defined + * with the appropriate register encodings. Their values have been + * accidentally swapped. As this is set API, the definitions here + * must be used, rather than ones derived from the encodings. + */ + KVM_ARM64_SYS_REG(SYS_CNTV_CTL_EL0), + KVM_REG_ARM_TIMER_CVAL, + KVM_REG_ARM_TIMER_CNT, + ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */ ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */ ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */ From fb10ddf35c1cc3b2888a944c0a3b1aa3baea585e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 24 Sep 2025 16:51:49 -0700 Subject: [PATCH 44/46] KVM: arm64: Compute per-vCPU FGTs at vcpu_load() To date KVM has used the fine-grained traps for the sake of UNDEF enforcement (so-called FGUs), meaning the constituent parts could be computed on a per-VM basis and folded into the effective value when programmed. Prepare for traps changing based on the vCPU context by computing the whole mess of them at vcpu_load(). Aggressively inline all the helpers to preserve the build-time checks that were there before. 
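To make the trap-polarity arithmetic easier to follow, the computation done by __compute_fgt() can be restated as a standalone helper (illustration only, the names here are invented):

  /*
   * mask covers positive-polarity bits (1 = trap), nmask covers
   * negative-polarity bits (0 = trap). FGU bits force traps so that
   * unimplemented registers UNDEF; the L1 guest's own FGT value is
   * folded in on top when running a nested guest.
   */
  static u64 fold_fgt(u64 mask, u64 nmask, u64 fgu, u64 l1_fgt, bool nested)
  {
          u64 set = fgu & mask;
          u64 clear = fgu & nmask;

          if (nested) {
                  set |= l1_fgt & mask;
                  clear |= ~l1_fgt & nmask;
          }

          /* Default: negative-polarity bits set, i.e. no trapping. */
          return (nmask | set) & ~clear;
  }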
Signed-off-by: Oliver Upton Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 50 ++++++++ arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/config.c | 82 +++++++++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 148 +++--------------------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 1 + 5 files changed, 151 insertions(+), 131 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b763293281c8..64302c438355 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -816,6 +816,11 @@ struct kvm_vcpu_arch { u64 hcrx_el2; u64 mdcr_el2; + struct { + u64 r; + u64 w; + } fgt[__NR_FGT_GROUP_IDS__]; + /* Exception Information */ struct kvm_vcpu_fault_info fault; @@ -1600,6 +1605,51 @@ static inline bool kvm_arch_has_irq_bypass(void) void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt); void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1); void check_feature_map(void); +void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu); +static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg reg) +{ + switch (reg) { + case HFGRTR_EL2: + case HFGWTR_EL2: + return HFGRTR_GROUP; + case HFGITR_EL2: + return HFGITR_GROUP; + case HDFGRTR_EL2: + case HDFGWTR_EL2: + return HDFGRTR_GROUP; + case HAFGRTR_EL2: + return HAFGRTR_GROUP; + case HFGRTR2_EL2: + case HFGWTR2_EL2: + return HFGRTR2_GROUP; + case HFGITR2_EL2: + return HFGITR2_GROUP; + case HDFGRTR2_EL2: + case HDFGWTR2_EL2: + return HDFGRTR2_GROUP; + default: + BUILD_BUG_ON(1); + } +} + +#define vcpu_fgt(vcpu, reg) \ + ({ \ + enum fgt_group_id id = __fgt_reg_to_group_id(reg); \ + u64 *p; \ + switch (reg) { \ + case HFGWTR_EL2: \ + case HDFGWTR_EL2: \ + case HFGWTR2_EL2: \ + case HDFGWTR2_EL2: \ + p = &(vcpu)->arch.fgt[id].w; \ + break; \ + default: \ + p = &(vcpu)->arch.fgt[id].r; \ + break; \ + } \ + \ + p; \ + }) #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index f01cacb669cf..870953b4a8a7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -642,6 +642,7 @@ nommu: vcpu->arch.hcr_el2 |= HCR_TWI; vcpu_set_pauth_traps(vcpu); + kvm_vcpu_load_fgt(vcpu); if (is_protected_kvm_enabled()) { kvm_call_hyp_nvhe(__pkvm_vcpu_load, diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index fbd8944a3dea..b1cf7660efe1 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -5,6 +5,8 @@ */ #include +#include +#include #include /* @@ -1428,3 +1430,83 @@ void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *r break; } } + +static __always_inline struct fgt_masks *__fgt_reg_to_masks(enum vcpu_sysreg reg) +{ + switch (reg) { + case HFGRTR_EL2: + return &hfgrtr_masks; + case HFGWTR_EL2: + return &hfgwtr_masks; + case HFGITR_EL2: + return &hfgitr_masks; + case HDFGRTR_EL2: + return &hdfgrtr_masks; + case HDFGWTR_EL2: + return &hdfgwtr_masks; + case HAFGRTR_EL2: + return &hafgrtr_masks; + case HFGRTR2_EL2: + return &hfgrtr2_masks; + case HFGWTR2_EL2: + return &hfgwtr2_masks; + case HFGITR2_EL2: + return &hfgitr2_masks; + case HDFGRTR2_EL2: + return &hdfgrtr2_masks; + case HDFGWTR2_EL2: + return &hdfgwtr2_masks; + default: + BUILD_BUG_ON(1); + } +} + +static __always_inline void __compute_fgt(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg) +{ + u64 fgu = vcpu->kvm->arch.fgu[__fgt_reg_to_group_id(reg)]; + struct fgt_masks *m = __fgt_reg_to_masks(reg); + u64 clear = 0, set = 0, val = m->nmask; + + set |= fgu & m->mask; + clear |= fgu & 
m->nmask; + + if (is_nested_ctxt(vcpu)) { + u64 nested = __vcpu_sys_reg(vcpu, reg); + set |= nested & m->mask; + clear |= ~nested & m->nmask; + } + + val |= set; + val &= ~clear; + *vcpu_fgt(vcpu, reg) = val; +} + +static void __compute_hfgwtr(struct kvm_vcpu *vcpu) +{ + __compute_fgt(vcpu, HFGWTR_EL2); + + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + *vcpu_fgt(vcpu, HFGWTR_EL2) |= HFGWTR_EL2_TCR_EL1; +} + +void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu) +{ + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + return; + + __compute_fgt(vcpu, HFGRTR_EL2); + __compute_hfgwtr(vcpu); + __compute_fgt(vcpu, HFGITR_EL2); + __compute_fgt(vcpu, HDFGRTR_EL2); + __compute_fgt(vcpu, HDFGWTR_EL2); + __compute_fgt(vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __compute_fgt(vcpu, HFGRTR2_EL2); + __compute_fgt(vcpu, HFGWTR2_EL2); + __compute_fgt(vcpu, HFGITR2_EL2); + __compute_fgt(vcpu, HDFGRTR2_EL2); + __compute_fgt(vcpu, HDFGWTR2_EL2); +} diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index b6682202edf3..c5d5e5b86eaf 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -195,123 +195,6 @@ static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) __deactivate_cptr_traps_nvhe(vcpu); } -#define reg_to_fgt_masks(reg) \ - ({ \ - struct fgt_masks *m; \ - switch(reg) { \ - case HFGRTR_EL2: \ - m = &hfgrtr_masks; \ - break; \ - case HFGWTR_EL2: \ - m = &hfgwtr_masks; \ - break; \ - case HFGITR_EL2: \ - m = &hfgitr_masks; \ - break; \ - case HDFGRTR_EL2: \ - m = &hdfgrtr_masks; \ - break; \ - case HDFGWTR_EL2: \ - m = &hdfgwtr_masks; \ - break; \ - case HAFGRTR_EL2: \ - m = &hafgrtr_masks; \ - break; \ - case HFGRTR2_EL2: \ - m = &hfgrtr2_masks; \ - break; \ - case HFGWTR2_EL2: \ - m = &hfgwtr2_masks; \ - break; \ - case HFGITR2_EL2: \ - m = &hfgitr2_masks; \ - break; \ - case HDFGRTR2_EL2: \ - m = &hdfgrtr2_masks; \ - break; \ - case HDFGWTR2_EL2: \ - m = &hdfgwtr2_masks; \ - break; \ - default: \ - BUILD_BUG_ON(1); \ - } \ - \ - m; \ - }) - -#define compute_clr_set(vcpu, reg, clr, set) \ - do { \ - u64 hfg = __vcpu_sys_reg(vcpu, reg); \ - struct fgt_masks *m = reg_to_fgt_masks(reg); \ - set |= hfg & m->mask; \ - clr |= ~hfg & m->nmask; \ - } while(0) - -#define reg_to_fgt_group_id(reg) \ - ({ \ - enum fgt_group_id id; \ - switch(reg) { \ - case HFGRTR_EL2: \ - case HFGWTR_EL2: \ - id = HFGRTR_GROUP; \ - break; \ - case HFGITR_EL2: \ - id = HFGITR_GROUP; \ - break; \ - case HDFGRTR_EL2: \ - case HDFGWTR_EL2: \ - id = HDFGRTR_GROUP; \ - break; \ - case HAFGRTR_EL2: \ - id = HAFGRTR_GROUP; \ - break; \ - case HFGRTR2_EL2: \ - case HFGWTR2_EL2: \ - id = HFGRTR2_GROUP; \ - break; \ - case HFGITR2_EL2: \ - id = HFGITR2_GROUP; \ - break; \ - case HDFGRTR2_EL2: \ - case HDFGWTR2_EL2: \ - id = HDFGRTR2_GROUP; \ - break; \ - default: \ - BUILD_BUG_ON(1); \ - } \ - \ - id; \ - }) - -#define compute_undef_clr_set(vcpu, kvm, reg, clr, set) \ - do { \ - u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)]; \ - struct fgt_masks *m = reg_to_fgt_masks(reg); \ - set |= hfg & m->mask; \ - clr |= hfg & m->nmask; \ - } while(0) - -#define update_fgt_traps_cs(hctxt, vcpu, kvm, reg, clr, set) \ - do { \ - struct fgt_masks *m = reg_to_fgt_masks(reg); \ - u64 c = clr, s = set; \ - u64 val; \ - \ - ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ - if (is_nested_ctxt(vcpu)) \ - compute_clr_set(vcpu, reg, c, s); \ - \ - compute_undef_clr_set(vcpu, kvm, reg, c, s); \ - \ - val = m->nmask; 
\ - val |= s; \ - val &= ~c; \ - write_sysreg_s(val, SYS_ ## reg); \ - } while(0) - -#define update_fgt_traps(hctxt, vcpu, kvm, reg) \ - update_fgt_traps_cs(hctxt, vcpu, kvm, reg, 0, 0) - static inline bool cpu_has_amu(void) { u64 pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); @@ -320,33 +203,36 @@ static inline bool cpu_has_amu(void) ID_AA64PFR0_EL1_AMU_SHIFT); } +#define __activate_fgt(hctxt, vcpu, reg) \ + do { \ + ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ + write_sysreg_s(*vcpu_fgt(vcpu, reg), SYS_ ## reg); \ + } while (0) + static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); - struct kvm *kvm = kern_hyp_va(vcpu->kvm); if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; - update_fgt_traps(hctxt, vcpu, kvm, HFGRTR_EL2); - update_fgt_traps_cs(hctxt, vcpu, kvm, HFGWTR_EL2, 0, - cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) ? - HFGWTR_EL2_TCR_EL1_MASK : 0); - update_fgt_traps(hctxt, vcpu, kvm, HFGITR_EL2); - update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR_EL2); - update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR_EL2); + __activate_fgt(hctxt, vcpu, HFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HFGWTR_EL2); + __activate_fgt(hctxt, vcpu, HFGITR_EL2); + __activate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HDFGWTR_EL2); if (cpu_has_amu()) - update_fgt_traps(hctxt, vcpu, kvm, HAFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HAFGRTR_EL2); if (!cpus_have_final_cap(ARM64_HAS_FGT2)) return; - update_fgt_traps(hctxt, vcpu, kvm, HFGRTR2_EL2); - update_fgt_traps(hctxt, vcpu, kvm, HFGWTR2_EL2); - update_fgt_traps(hctxt, vcpu, kvm, HFGITR2_EL2); - update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR2_EL2); - update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGITR2_EL2); + __activate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __activate_fgt(hctxt, vcpu, HDFGWTR2_EL2); } #define __deactivate_fgt(htcxt, vcpu, reg) \ diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 05774aed09cb..43bde061b65d 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -172,6 +172,7 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) /* Trust the host for non-protected vcpu features. */ vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2; + memcpy(vcpu->arch.fgt, host_vcpu->arch.fgt, sizeof(vcpu->arch.fgt)); return 0; } From e0b5a7967dec05144bc98125f98c47f74fd1152b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 24 Sep 2025 16:51:50 -0700 Subject: [PATCH 45/46] KVM: arm64: nv: Use FGT write trap of MDSCR_EL1 when available Marc reports that the performance of running an L3 guest has regressed by 60% as a result of setting MDCR_EL2.TDA to hide bad architecture. That's of course terrible for the single user of recursive NV ;-) While there's nothing to be done on non-FGT systems, take advantage of the precise write trap of MDSCR_EL1 and leave the rest of the debug registers untrapped. 
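Taken together with the hunks below, the resulting policy for a vCPU in hyp context is roughly (condensed illustration, not literal kernel code):

  if (is_hyp_ctxt(vcpu)) {
          if (cpus_have_final_cap(ARM64_HAS_FGT))
                  /* Precise: only writes to MDSCR_EL1 trap. */
                  *vcpu_fgt(vcpu, HDFGWTR_EL2) |= HDFGWTR_EL2_MDSCR_EL1;
          else
                  /* Big hammer: trap all debug register accesses. */
                  vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
  }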
Reported-by: Marc Zyngier Signed-off-by: Oliver Upton Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 10 +++++++++- arch/arm64/kvm/nested.c | 9 ++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index b1cf7660efe1..24bb3f36e9d5 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -1489,6 +1489,14 @@ static void __compute_hfgwtr(struct kvm_vcpu *vcpu) *vcpu_fgt(vcpu, HFGWTR_EL2) |= HFGWTR_EL2_TCR_EL1; } +static void __compute_hdfgwtr(struct kvm_vcpu *vcpu) +{ + __compute_fgt(vcpu, HDFGWTR_EL2); + + if (is_hyp_ctxt(vcpu)) + *vcpu_fgt(vcpu, HDFGWTR_EL2) |= HDFGWTR_EL2_MDSCR_EL1; +} + void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu) { if (!cpus_have_final_cap(ARM64_HAS_FGT)) @@ -1498,7 +1506,7 @@ void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu) __compute_hfgwtr(vcpu); __compute_fgt(vcpu, HFGITR_EL2); __compute_fgt(vcpu, HDFGRTR_EL2); - __compute_fgt(vcpu, HDFGWTR_EL2); + __compute_hdfgwtr(vcpu); __compute_fgt(vcpu, HAFGRTR_EL2); if (!cpus_have_final_cap(ARM64_HAS_FGT2)) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 7a045cad6bdf..f04cda40545b 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1859,13 +1859,16 @@ void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu) { u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + if (is_nested_ctxt(vcpu)) + vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE); /* * In yet another example where FEAT_NV2 is fscking broken, accesses * to MDSCR_EL1 are redirected to the VNCR despite having an effect * at EL2. Use a big hammer to apply sanity. + * + * Unless of course we have FEAT_FGT, in which case we can precisely + * trap MDSCR_EL1. */ - if (is_hyp_ctxt(vcpu)) + else if (!cpus_have_final_cap(ARM64_HAS_FGT)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; - else - vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE); } From ca88ecdce5f51874a7c151809bd2c936ee0d3805 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 8 Oct 2025 22:35:15 +0100 Subject: [PATCH 46/46] arm64: Revamp HCR_EL2.E2H RES1 detection We currently have two ways to identify CPUs that only implement FEAT_VHE and not FEAT_E2H0: - either they advertise it via ID_AA64MMFR4_EL1.E2H0, - or the HCR_EL2.E2H bit is RAO/WI. However, there is a third category of "cpus" that fall between these two cases: on CPUs that do not implement FEAT_FGT, it is IMPDEF whether an access to ID_AA64MMFR4_EL1 can trap to EL2 when the register value is zero. A consequence of this is that on systems such as Neoverse V2, a NV guest cannot reliably detect that it is in a VHE-only configuration (E2H is writable, and ID_AA64MMFR4_EL1 is 0), despite the hypervisor's best effort to repaint the id register. Replace the RAO/WI test by a sequence that makes use of the VHE register remapping between EL1 and EL2 to detect this situation, and work out whether we get the VHE behaviour even after having set HCR_EL2.E2H to 0. This solves the NV problem, and provides a more reliable acid test for CPUs that do not completely follow the letter of the architecture while providing a RES1 behaviour for HCR_EL2.E2H.
Suggested-by: Mark Rutland Acked-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Oliver Upton Tested-by: Jan Kotas Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/15A85F2B-1A0C-4FA7-9FE4-EEC2203CC09E@global.cadence.com --- arch/arm64/include/asm/el2_setup.h | 38 +++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index b37da3ee8529..99a7c0235e6d 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -24,22 +24,48 @@ * ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it * can reset into an UNKNOWN state and might not read as 1 until it has * been initialized explicitly. - * - * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but - * don't advertise it (they predate this relaxation). - * * Initalize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H * indicating whether the CPU is running in E2H mode. */ mrs_s x1, SYS_ID_AA64MMFR4_EL1 sbfx x1, x1, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH cmp x1, #0 - b.ge .LnVHE_\@ + b.lt .LnE2H0_\@ + /* + * Unfortunately, HCR_EL2.E2H can be RES1 even if not advertised + * as such via ID_AA64MMFR4_EL1.E2H0: + * + * - Fruity CPUs predate the !FEAT_E2H0 relaxation, and seem to + * have HCR_EL2.E2H implemented as RAO/WI. + * + * - On CPUs that lack FEAT_FGT, a hypervisor can't trap guest + * reads of ID_AA64MMFR4_EL1 to advertise !FEAT_E2H0. NV + * guests on these hosts can write to HCR_EL2.E2H without + * trapping to the hypervisor, but these writes have no + * functional effect. + * + * Handle both cases by checking for an essential VHE property + * (system register remapping) to decide whether we're + * effectively VHE-only or not. + */ + msr_hcr_el2 x0 // Setup HCR_EL2 as nVHE + isb + mov x1, #1 // Write something to FAR_EL1 + msr far_el1, x1 + isb + mov x1, #2 // Try to overwrite it via FAR_EL2 + msr far_el2, x1 + isb + mrs x1, far_el1 // If we see the latest write in FAR_EL1, + cmp x1, #2 // we can safely assume we are VHE only. + b.ne .LnVHE_\@ // Otherwise, we know that nVHE works. + +.LnE2H0_\@: orr x0, x0, #HCR_E2H -.LnVHE_\@: msr_hcr_el2 x0 isb +.LnVHE_\@: .endm .macro __init_el2_sctlr