Loading arch/x86/include/asm/kvm_host.h +25 −21 Original line number Diff line number Diff line Loading @@ -254,28 +254,31 @@ enum x86_intercept_stage; KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) #define PFERR_PRESENT_MASK BIT(0) #define PFERR_WRITE_MASK BIT(1) #define PFERR_USER_MASK BIT(2) #define PFERR_RSVD_MASK BIT(3) #define PFERR_FETCH_MASK BIT(4) #define PFERR_PK_MASK BIT(5) #define PFERR_SGX_MASK BIT(15) #define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) #define PFERR_GUEST_PAGE_MASK BIT_ULL(33) #define PFERR_GUEST_ENC_MASK BIT_ULL(34) #define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) #define PFERR_GUEST_VMPL_MASK BIT_ULL(36) #define PFERR_PRESENT_BIT 0 #define PFERR_WRITE_BIT 1 #define PFERR_USER_BIT 2 #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 #define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 #define PFERR_IMPLICIT_ACCESS_BIT 48 #define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) #define PFERR_USER_MASK BIT(PFERR_USER_BIT) #define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) #define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) #define PFERR_PK_MASK BIT(PFERR_PK_BIT) #define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) /* * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks * when emulating instructions that triggers implicit access. */ #define PFERR_IMPLICIT_ACCESS BIT_ULL(48) /* * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred * when the guest was accessing private memory. */ #define PFERR_PRIVATE_ACCESS BIT_ULL(49) #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ Loading Loading @@ -1848,6 +1851,7 @@ struct kvm_arch_async_pf { gfn_t gfn; unsigned long cr3; bool direct_map; u64 error_code; }; extern u32 __read_mostly kvm_nr_uret_msrs; Loading arch/x86/kvm/mmu.h +2 −3 Original line number Diff line number Diff line Loading @@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1; u32 errcode = PFERR_PRESENT_MASK; bool fault; Loading @@ -234,8 +234,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ offset = (pfec & ~1) + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0); pkru_bits &= mmu->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK; Loading arch/x86/kvm/mmu/mmu.c +112 −70 Original line number Diff line number Diff line Loading @@ -3262,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, { gva_t gva = fault->is_tdp ? 0 : fault->addr; if (fault->is_private) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; fault->hva = KVM_HVA_ERR_BAD; /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an Loading Loading @@ -4207,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu) return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, gfn_t gfn) static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); arch.gfn = gfn; arch.gfn = fault->gfn; arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) return; if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; Loading @@ -4237,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); } static inline u8 kvm_max_level_for_order(int order) Loading @@ -4257,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, PAGE_SIZE, fault->write, fault->exec, fault->is_private); } static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { Loading @@ -4291,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; bool async; /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) return RET_PF_RETRY; if (!kvm_is_visible_memslot(slot)) { /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu)) { fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; return RET_PF_CONTINUE; } /* * If the APIC access page exists but is disabled, go directly * to emulation without caching the MMIO access or creating a * MMIO SPTE. That way the cache doesn't need to be purged * when the AVIC is re-enabled. */ if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && !kvm_apicv_activated(vcpu->kvm)) return RET_PF_EMULATE; } if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); if (!async) return RET_PF_CONTINUE; /* *pfn has correct page already */ Loading @@ -4342,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } Loading @@ -4352,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; } static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; int ret; /* * Note that the mmu_invalidate_seq also serves to detect a concurrent * change in attributes. is_page_fault_stale() will detect an * invalidation relate to fault->fn and resume the guest without * installing a mapping in the page tables. */ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); /* * Now that we have a snapshot of mmu_invalidate_seq we can check for a * private vs. shared mismatch. */ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } if (unlikely(!slot)) return kvm_handle_noslot_fault(vcpu, fault, access); /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot->flags & KVM_MEMSLOT_INVALID) return RET_PF_RETRY; if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { /* * Don't map L1's APIC access page into L2, KVM doesn't support * using APICv/AVIC to accelerate L2 accesses to L1's APIC, * i.e. the access needs to be emulated. Emulating access to * L1's APIC is also correct if L1 is accelerating L2's own * virtual APIC, but for some reason L1 also maps _L1's_ APIC * into L2. Note, vcpu_is_mmio_gpa() always treats access to * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM * uses different roots for L1 vs. L2, i.e. there is no danger * of breaking APICv/AVIC for L1. */ if (is_guest_mode(vcpu)) return kvm_handle_noslot_fault(vcpu, fault, access); /* * If the APIC access page exists but is disabled, go directly * to emulation without caching the MMIO access or creating a * MMIO SPTE. That way the cache doesn't need to be purged * when the AVIC is re-enabled. */ if (!kvm_apicv_activated(vcpu->kvm)) return RET_PF_EMULATE; } fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); Loading @@ -4387,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held * to detect retry guarantees the worst case latency for the vCPU. */ if (fault->slot && mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; ret = __kvm_faultin_pfn(vcpu, fault); Loading @@ -4398,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); if (unlikely(!fault->slot)) if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access); /* Loading Loading @@ -4509,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif /* * Legacy #PF exception only have a 32-bit error code. Simply drop the * upper bits as KVM doesn't use them for #PF (because they are never * set), and to ensure there are no collisions with KVM-defined bits. */ if (WARN_ON_ONCE(error_code >> 32)) error_code = lower_32_bits(error_code); /* Ensure the above sanity check also covers KVM-defined flags. */ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { Loading Loading @@ -5794,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; /* * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP * checks when emulating instructions that triggers implicit access. * WARN if hardware generates a fault with an error code that collides * with the KVM-defined value. Clear the flag and continue on, i.e. * don't terminate the VM, as KVM can't possibly be relying on a flag * that KVM doesn't know about. */ if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) error_code &= ~PFERR_IMPLICIT_ACCESS; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; /* * Except for reserved faults (emulated MMIO is shared-only), set the * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's * current attributes, which are the source of truth for such VMs. Note, * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't * currently supported nested virtualization (among many other things) * for software-protected VMs. */ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && !(error_code & PFERR_RSVD_MASK) && vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) error_code |= PFERR_PRIVATE_ACCESS; r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) return -EFAULT; r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false, r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; Loading arch/x86/kvm/mmu/mmu_internal.h +25 −3 Original line number Diff line number Diff line Loading @@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm) struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; const u32 error_code; const u64 error_code; const bool prefetch; /* Derived from error_code. */ Loading Loading @@ -279,8 +279,16 @@ enum { RET_PF_SPURIOUS, }; static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, PAGE_SIZE, fault->write, fault->exec, fault->is_private); } static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, bool prefetch, int *emulation_type) u64 err, bool prefetch, int *emulation_type) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, Loading @@ -298,7 +306,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), .is_private = err & PFERR_PRIVATE_ACCESS, .pfn = KVM_PFN_ERR_FAULT, .hva = KVM_HVA_ERR_BAD, }; int r; Loading @@ -320,6 +331,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, else r = vcpu->arch.mmu->page_fault(vcpu, &fault); /* * Not sure what's happening, but punt to userspace and hope that * they can fix it by changing memory to shared, or they can * provide a better error. */ if (r == RET_PF_EMULATE && fault.is_private) { pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); return -EFAULT; } if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; Loading arch/x86/kvm/mmu/mmutrace.h +1 −1 Original line number Diff line number Diff line Loading @@ -260,7 +260,7 @@ TRACE_EVENT( TP_STRUCT__entry( __field(int, vcpu_id) __field(gpa_t, cr2_or_gpa) __field(u32, error_code) __field(u64, error_code) __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) Loading Loading
arch/x86/include/asm/kvm_host.h +25 −21 Original line number Diff line number Diff line Loading @@ -254,28 +254,31 @@ enum x86_intercept_stage; KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) #define PFERR_PRESENT_MASK BIT(0) #define PFERR_WRITE_MASK BIT(1) #define PFERR_USER_MASK BIT(2) #define PFERR_RSVD_MASK BIT(3) #define PFERR_FETCH_MASK BIT(4) #define PFERR_PK_MASK BIT(5) #define PFERR_SGX_MASK BIT(15) #define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) #define PFERR_GUEST_PAGE_MASK BIT_ULL(33) #define PFERR_GUEST_ENC_MASK BIT_ULL(34) #define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) #define PFERR_GUEST_VMPL_MASK BIT_ULL(36) #define PFERR_PRESENT_BIT 0 #define PFERR_WRITE_BIT 1 #define PFERR_USER_BIT 2 #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 #define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 #define PFERR_IMPLICIT_ACCESS_BIT 48 #define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) #define PFERR_USER_MASK BIT(PFERR_USER_BIT) #define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) #define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) #define PFERR_PK_MASK BIT(PFERR_PK_BIT) #define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) /* * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks * when emulating instructions that triggers implicit access. */ #define PFERR_IMPLICIT_ACCESS BIT_ULL(48) /* * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred * when the guest was accessing private memory. */ #define PFERR_PRIVATE_ACCESS BIT_ULL(49) #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ Loading Loading @@ -1848,6 +1851,7 @@ struct kvm_arch_async_pf { gfn_t gfn; unsigned long cr3; bool direct_map; u64 error_code; }; extern u32 __read_mostly kvm_nr_uret_msrs; Loading
arch/x86/kvm/mmu.h +2 −3 Original line number Diff line number Diff line Loading @@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1; u32 errcode = PFERR_PRESENT_MASK; bool fault; Loading @@ -234,8 +234,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ offset = (pfec & ~1) + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0); pkru_bits &= mmu->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK; Loading
arch/x86/kvm/mmu/mmu.c +112 −70 Original line number Diff line number Diff line Loading @@ -3262,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, { gva_t gva = fault->is_tdp ? 0 : fault->addr; if (fault->is_private) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; fault->hva = KVM_HVA_ERR_BAD; /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an Loading Loading @@ -4207,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu) return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, gfn_t gfn) static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); arch.gfn = gfn; arch.gfn = fault->gfn; arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) return; if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; Loading @@ -4237,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); } static inline u8 kvm_max_level_for_order(int order) Loading @@ -4257,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, PAGE_SIZE, fault->write, fault->exec, fault->is_private); } static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { Loading @@ -4291,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; bool async; /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) return RET_PF_RETRY; if (!kvm_is_visible_memslot(slot)) { /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu)) { fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; return RET_PF_CONTINUE; } /* * If the APIC access page exists but is disabled, go directly * to emulation without caching the MMIO access or creating a * MMIO SPTE. That way the cache doesn't need to be purged * when the AVIC is re-enabled. */ if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && !kvm_apicv_activated(vcpu->kvm)) return RET_PF_EMULATE; } if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); if (!async) return RET_PF_CONTINUE; /* *pfn has correct page already */ Loading @@ -4342,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } Loading @@ -4352,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; } static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; int ret; /* * Note that the mmu_invalidate_seq also serves to detect a concurrent * change in attributes. is_page_fault_stale() will detect an * invalidation relate to fault->fn and resume the guest without * installing a mapping in the page tables. */ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); /* * Now that we have a snapshot of mmu_invalidate_seq we can check for a * private vs. shared mismatch. */ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } if (unlikely(!slot)) return kvm_handle_noslot_fault(vcpu, fault, access); /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot->flags & KVM_MEMSLOT_INVALID) return RET_PF_RETRY; if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { /* * Don't map L1's APIC access page into L2, KVM doesn't support * using APICv/AVIC to accelerate L2 accesses to L1's APIC, * i.e. the access needs to be emulated. Emulating access to * L1's APIC is also correct if L1 is accelerating L2's own * virtual APIC, but for some reason L1 also maps _L1's_ APIC * into L2. Note, vcpu_is_mmio_gpa() always treats access to * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM * uses different roots for L1 vs. L2, i.e. there is no danger * of breaking APICv/AVIC for L1. */ if (is_guest_mode(vcpu)) return kvm_handle_noslot_fault(vcpu, fault, access); /* * If the APIC access page exists but is disabled, go directly * to emulation without caching the MMIO access or creating a * MMIO SPTE. That way the cache doesn't need to be purged * when the AVIC is re-enabled. */ if (!kvm_apicv_activated(vcpu->kvm)) return RET_PF_EMULATE; } fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); Loading @@ -4387,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held * to detect retry guarantees the worst case latency for the vCPU. */ if (fault->slot && mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; ret = __kvm_faultin_pfn(vcpu, fault); Loading @@ -4398,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); if (unlikely(!fault->slot)) if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access); /* Loading Loading @@ -4509,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif /* * Legacy #PF exception only have a 32-bit error code. Simply drop the * upper bits as KVM doesn't use them for #PF (because they are never * set), and to ensure there are no collisions with KVM-defined bits. */ if (WARN_ON_ONCE(error_code >> 32)) error_code = lower_32_bits(error_code); /* Ensure the above sanity check also covers KVM-defined flags. */ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { Loading Loading @@ -5794,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; /* * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP * checks when emulating instructions that triggers implicit access. * WARN if hardware generates a fault with an error code that collides * with the KVM-defined value. Clear the flag and continue on, i.e. * don't terminate the VM, as KVM can't possibly be relying on a flag * that KVM doesn't know about. */ if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) error_code &= ~PFERR_IMPLICIT_ACCESS; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; /* * Except for reserved faults (emulated MMIO is shared-only), set the * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's * current attributes, which are the source of truth for such VMs. Note, * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't * currently supported nested virtualization (among many other things) * for software-protected VMs. */ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && !(error_code & PFERR_RSVD_MASK) && vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) error_code |= PFERR_PRIVATE_ACCESS; r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) return -EFAULT; r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false, r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; Loading
arch/x86/kvm/mmu/mmu_internal.h +25 −3 Original line number Diff line number Diff line Loading @@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm) struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; const u32 error_code; const u64 error_code; const bool prefetch; /* Derived from error_code. */ Loading Loading @@ -279,8 +279,16 @@ enum { RET_PF_SPURIOUS, }; static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, PAGE_SIZE, fault->write, fault->exec, fault->is_private); } static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, bool prefetch, int *emulation_type) u64 err, bool prefetch, int *emulation_type) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, Loading @@ -298,7 +306,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), .is_private = err & PFERR_PRIVATE_ACCESS, .pfn = KVM_PFN_ERR_FAULT, .hva = KVM_HVA_ERR_BAD, }; int r; Loading @@ -320,6 +331,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, else r = vcpu->arch.mmu->page_fault(vcpu, &fault); /* * Not sure what's happening, but punt to userspace and hope that * they can fix it by changing memory to shared, or they can * provide a better error. */ if (r == RET_PF_EMULATE && fault.is_private) { pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); return -EFAULT; } if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; Loading
arch/x86/kvm/mmu/mmutrace.h +1 −1 Original line number Diff line number Diff line Loading @@ -260,7 +260,7 @@ TRACE_EVENT( TP_STRUCT__entry( __field(int, vcpu_id) __field(gpa_t, cr2_or_gpa) __field(u32, error_code) __field(u64, error_code) __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) Loading