Commit f3650842 authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge branch 'kvm-coco-pagefault-prep' into HEAD

A combination of prep work for TDX and SNP, and a clean up of the
page fault path to (hopefully) make it easier to follow the rules for
private memory, noslot faults, writes to read-only slots, etc.
parents 1e21b538 2b1f4355
Loading
Loading
Loading
Loading
+25 −21
Original line number Diff line number Diff line
@@ -254,28 +254,31 @@ enum x86_intercept_stage;
	KVM_GUESTDBG_INJECT_DB | \
	KVM_GUESTDBG_BLOCKIRQ)

#define PFERR_PRESENT_MASK	BIT(0)
#define PFERR_WRITE_MASK	BIT(1)
#define PFERR_USER_MASK		BIT(2)
#define PFERR_RSVD_MASK		BIT(3)
#define PFERR_FETCH_MASK	BIT(4)
#define PFERR_PK_MASK		BIT(5)
#define PFERR_SGX_MASK		BIT(15)
#define PFERR_GUEST_RMP_MASK	BIT_ULL(31)
#define PFERR_GUEST_FINAL_MASK	BIT_ULL(32)
#define PFERR_GUEST_PAGE_MASK	BIT_ULL(33)
#define PFERR_GUEST_ENC_MASK	BIT_ULL(34)
#define PFERR_GUEST_SIZEM_MASK	BIT_ULL(35)
#define PFERR_GUEST_VMPL_MASK	BIT_ULL(36)

#define PFERR_PRESENT_BIT 0
#define PFERR_WRITE_BIT 1
#define PFERR_USER_BIT 2
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
#define PFERR_PK_BIT 5
#define PFERR_SGX_BIT 15
#define PFERR_GUEST_FINAL_BIT 32
#define PFERR_GUEST_PAGE_BIT 33
#define PFERR_IMPLICIT_ACCESS_BIT 48

#define PFERR_PRESENT_MASK	BIT(PFERR_PRESENT_BIT)
#define PFERR_WRITE_MASK	BIT(PFERR_WRITE_BIT)
#define PFERR_USER_MASK		BIT(PFERR_USER_BIT)
#define PFERR_RSVD_MASK		BIT(PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK	BIT(PFERR_FETCH_BIT)
#define PFERR_PK_MASK		BIT(PFERR_PK_BIT)
#define PFERR_SGX_MASK		BIT(PFERR_SGX_BIT)
#define PFERR_GUEST_FINAL_MASK	BIT_ULL(PFERR_GUEST_FINAL_BIT)
#define PFERR_GUEST_PAGE_MASK	BIT_ULL(PFERR_GUEST_PAGE_BIT)
#define PFERR_IMPLICIT_ACCESS	BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
/*
 * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks
 * when emulating instructions that triggers implicit access.
 */
#define PFERR_IMPLICIT_ACCESS	BIT_ULL(48)
/*
 * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred
 * when the guest was accessing private memory.
 */
#define PFERR_PRIVATE_ACCESS   BIT_ULL(49)
#define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)

#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK |	\
				 PFERR_WRITE_MASK |		\
@@ -1848,6 +1851,7 @@ struct kvm_arch_async_pf {
	gfn_t gfn;
	unsigned long cr3;
	bool direct_map;
	u64 error_code;
};

extern u32 __read_mostly kvm_nr_uret_msrs;
+2 −3
Original line number Diff line number Diff line
@@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
	 */
	u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
	bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
	int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
	int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
	u32 errcode = PFERR_PRESENT_MASK;
	bool fault;

@@ -234,8 +234,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
		pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;

		/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
		offset = (pfec & ~1) +
			((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
		offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);

		pkru_bits &= mmu->pkru_mask >> offset;
		errcode |= -pkru_bits & PFERR_PK_MASK;
+112 −70
Original line number Diff line number Diff line
@@ -3262,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
{
	gva_t gva = fault->is_tdp ? 0 : fault->addr;

	if (fault->is_private) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
		return -EFAULT;
	}

	vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
			     access & shadow_mmio_access_mask);

	fault->slot = NULL;
	fault->pfn = KVM_PFN_NOSLOT;
	fault->map_writable = false;
	fault->hva = KVM_HVA_ERR_BAD;

	/*
	 * If MMIO caching is disabled, emulate immediately without
	 * touching the shadow page tables as attempting to install an
@@ -4207,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
}

static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
				    gfn_t gfn)
static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
				    struct kvm_page_fault *fault)
{
	struct kvm_arch_async_pf arch;

	arch.token = alloc_apf_token(vcpu);
	arch.gfn = gfn;
	arch.gfn = fault->gfn;
	arch.error_code = fault->error_code;
	arch.direct_map = vcpu->arch.mmu->root_role.direct;
	arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);

	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
	return kvm_setup_async_pf(vcpu, fault->addr,
				  kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
}

void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
	int r;

	if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
		return;

	if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
	      work->wakeup_all)
		return;
@@ -4237,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
		return;

	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
}

static inline u8 kvm_max_level_for_order(int order)
@@ -4257,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order)
	return PG_LEVEL_4K;
}

static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
					      struct kvm_page_fault *fault)
{
	kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
				      PAGE_SIZE, fault->write, fault->exec,
				      fault->is_private);
}

static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
				   struct kvm_page_fault *fault)
{
@@ -4291,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,

static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_memory_slot *slot = fault->slot;
	bool async;

	/*
	 * Retry the page fault if the gfn hit a memslot that is being deleted
	 * or moved.  This ensures any existing SPTEs for the old memslot will
	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
	 */
	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
		return RET_PF_RETRY;

	if (!kvm_is_visible_memslot(slot)) {
		/* Don't expose private memslots to L2. */
		if (is_guest_mode(vcpu)) {
			fault->slot = NULL;
			fault->pfn = KVM_PFN_NOSLOT;
			fault->map_writable = false;
			return RET_PF_CONTINUE;
		}
		/*
		 * If the APIC access page exists but is disabled, go directly
		 * to emulation without caching the MMIO access or creating a
		 * MMIO SPTE.  That way the cache doesn't need to be purged
		 * when the AVIC is re-enabled.
		 */
		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
		    !kvm_apicv_activated(vcpu->kvm))
			return RET_PF_EMULATE;
	}

	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
		return -EFAULT;
	}

	if (fault->is_private)
		return kvm_faultin_pfn_private(vcpu, fault);

	async = false;
	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
					  fault->write, &fault->map_writable,
					  &fault->hva);
	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
					  &async, fault->write,
					  &fault->map_writable, &fault->hva);
	if (!async)
		return RET_PF_CONTINUE; /* *pfn has correct page already */

@@ -4342,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
			trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
			return RET_PF_RETRY;
		} else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
		} else if (kvm_arch_setup_async_pf(vcpu, fault)) {
			return RET_PF_RETRY;
		}
	}
@@ -4352,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
	 * to wait for IO.  Note, gup always bails if it is unable to quickly
	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
	 */
	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
					  fault->write, &fault->map_writable,
					  &fault->hva);
	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
					  NULL, fault->write,
					  &fault->map_writable, &fault->hva);
	return RET_PF_CONTINUE;
}

static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
			   unsigned int access)
{
	struct kvm_memory_slot *slot = fault->slot;
	int ret;

	/*
	 * Note that the mmu_invalidate_seq also serves to detect a concurrent
	 * change in attributes.  is_page_fault_stale() will detect an
	 * invalidation relate to fault->fn and resume the guest without
	 * installing a mapping in the page tables.
	 */
	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	smp_rmb();

	/*
	 * Now that we have a snapshot of mmu_invalidate_seq we can check for a
	 * private vs. shared mismatch.
	 */
	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
		return -EFAULT;
	}

	if (unlikely(!slot))
		return kvm_handle_noslot_fault(vcpu, fault, access);

	/*
	 * Retry the page fault if the gfn hit a memslot that is being deleted
	 * or moved.  This ensures any existing SPTEs for the old memslot will
	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
	 */
	if (slot->flags & KVM_MEMSLOT_INVALID)
		return RET_PF_RETRY;

	if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
		/*
		 * Don't map L1's APIC access page into L2, KVM doesn't support
		 * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
		 * i.e. the access needs to be emulated.  Emulating access to
		 * L1's APIC is also correct if L1 is accelerating L2's own
		 * virtual APIC, but for some reason L1 also maps _L1's_ APIC
		 * into L2.  Note, vcpu_is_mmio_gpa() always treats access to
		 * the APIC as MMIO.  Allow an MMIO SPTE to be created, as KVM
		 * uses different roots for L1 vs. L2, i.e. there is no danger
		 * of breaking APICv/AVIC for L1.
		 */
		if (is_guest_mode(vcpu))
			return kvm_handle_noslot_fault(vcpu, fault, access);

		/*
		 * If the APIC access page exists but is disabled, go directly
		 * to emulation without caching the MMIO access or creating a
		 * MMIO SPTE.  That way the cache doesn't need to be purged
		 * when the AVIC is re-enabled.
		 */
		if (!kvm_apicv_activated(vcpu->kvm))
			return RET_PF_EMULATE;
	}

	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	smp_rmb();

@@ -4387,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
	 * to detect retry guarantees the worst case latency for the vCPU.
	 */
	if (fault->slot &&
	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
		return RET_PF_RETRY;

	ret = __kvm_faultin_pfn(vcpu, fault);
@@ -4398,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
	if (unlikely(is_error_pfn(fault->pfn)))
		return kvm_handle_error_pfn(vcpu, fault);

	if (unlikely(!fault->slot))
	if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
		return kvm_handle_noslot_fault(vcpu, fault, access);

	/*
@@ -4509,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
	if (WARN_ON_ONCE(fault_address >> 32))
		return -EFAULT;
#endif
	/*
	 * Legacy #PF exception only have a 32-bit error code.  Simply drop the
	 * upper bits as KVM doesn't use them for #PF (because they are never
	 * set), and to ensure there are no collisions with KVM-defined bits.
	 */
	if (WARN_ON_ONCE(error_code >> 32))
		error_code = lower_32_bits(error_code);

	/* Ensure the above sanity check also covers KVM-defined flags. */
	BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));

	vcpu->arch.l1tf_flush_l1d = true;
	if (!flags) {
@@ -5794,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
	int r, emulation_type = EMULTYPE_PF;
	bool direct = vcpu->arch.mmu->root_role.direct;

	/*
	 * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
	 * checks when emulating instructions that triggers implicit access.
	 * WARN if hardware generates a fault with an error code that collides
	 * with the KVM-defined value.  Clear the flag and continue on, i.e.
	 * don't terminate the VM, as KVM can't possibly be relying on a flag
	 * that KVM doesn't know about.
	 */
	if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
		error_code &= ~PFERR_IMPLICIT_ACCESS;

	if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
		return RET_PF_RETRY;

	/*
	 * Except for reserved faults (emulated MMIO is shared-only), set the
	 * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
	 * current attributes, which are the source of truth for such VMs.  Note,
	 * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
	 * currently supported nested virtualization (among many other things)
	 * for software-protected VMs.
	 */
	if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
	    !(error_code & PFERR_RSVD_MASK) &&
	    vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
	    kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
		error_code |= PFERR_PRIVATE_ACCESS;

	r = RET_PF_INVALID;
	if (unlikely(error_code & PFERR_RSVD_MASK)) {
		if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
			return -EFAULT;

		r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
		if (r == RET_PF_EMULATE)
			goto emulate;
	}

	if (r == RET_PF_INVALID) {
		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
					  lower_32_bits(error_code), false,
		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
					  &emulation_type);
		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
			return -EIO;
+25 −3
Original line number Diff line number Diff line
@@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
struct kvm_page_fault {
	/* arguments to kvm_mmu_do_page_fault.  */
	const gpa_t addr;
	const u32 error_code;
	const u64 error_code;
	const bool prefetch;

	/* Derived from error_code.  */
@@ -279,8 +279,16 @@ enum {
	RET_PF_SPURIOUS,
};

static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
						     struct kvm_page_fault *fault)
{
	kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
				      PAGE_SIZE, fault->write, fault->exec,
				      fault->is_private);
}

static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
					u32 err, bool prefetch, int *emulation_type)
					u64 err, bool prefetch, int *emulation_type)
{
	struct kvm_page_fault fault = {
		.addr = cr2_or_gpa,
@@ -298,7 +306,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
		.req_level = PG_LEVEL_4K,
		.goal_level = PG_LEVEL_4K,
		.is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
		.is_private = err & PFERR_PRIVATE_ACCESS,

		.pfn = KVM_PFN_ERR_FAULT,
		.hva = KVM_HVA_ERR_BAD,
	};
	int r;

@@ -320,6 +331,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
	else
		r = vcpu->arch.mmu->page_fault(vcpu, &fault);

	/*
	 * Not sure what's happening, but punt to userspace and hope that
	 * they can fix it by changing memory to shared, or they can
	 * provide a better error.
	 */
	if (r == RET_PF_EMULATE && fault.is_private) {
		pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
		kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
		return -EFAULT;
	}

	if (fault.write_fault_to_shadow_pgtable && emulation_type)
		*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;

+1 −1
Original line number Diff line number Diff line
@@ -260,7 +260,7 @@ TRACE_EVENT(
	TP_STRUCT__entry(
		__field(int, vcpu_id)
		__field(gpa_t, cr2_or_gpa)
		__field(u32, error_code)
		__field(u64, error_code)
		__field(u64 *, sptep)
		__field(u64, old_spte)
		__field(u64, new_spte)
Loading