Commit f3996d4d authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge branch 'kvm-prefault' into HEAD

Pre-population has been requested several times to mitigate KVM page faults
during guest boot or after live migration.  It is also required by TDX
before filling in the initial guest memory with measured contents.
Introduce it as a generic API.
parents eb162c94 9ff0e37c
Loading
Loading
Loading
Loading
+55 −0
Original line number Diff line number Diff line
@@ -6352,6 +6352,61 @@ a single guest_memfd file, but the bound ranges must not overlap).

See KVM_SET_USER_MEMORY_REGION2 for additional details.

4.143 KVM_PRE_FAULT_MEMORY
--------------------------

:Capability: KVM_CAP_PRE_FAULT_MEMORY
:Architectures: none
:Type: vcpu ioctl
:Parameters: struct kvm_pre_fault_memory (in/out)
:Returns: 0 if at least one page is processed, < 0 on error

Errors:

  ========== ===============================================================
  EINVAL     The specified `gpa` and `size` were invalid (e.g. not
             page aligned, causes an overflow, or size is zero).
  ENOENT     The specified `gpa` is outside defined memslots.
  EINTR      An unmasked signal is pending and no page was processed.
  EFAULT     The parameter address was invalid.
  EOPNOTSUPP Mapping memory for a GPA is unsupported by the
             hypervisor, and/or for the current vCPU state/mode.
  EIO        unexpected error conditions (also causes a WARN)
  ========== ===============================================================

::

  struct kvm_pre_fault_memory {
	/* in/out */
	__u64 gpa;
	__u64 size;
	/* in */
	__u64 flags;
	__u64 padding[5];
  };

KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
for the current vCPU state.  KVM maps memory as if the vCPU generated a
stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
CoW.  However, KVM does not mark any newly created stage-2 PTE as Accessed.

In some cases, multiple vCPUs might share the page tables.  In this
case, the ioctl can be called in parallel.

When the ioctl returns, the input values are updated to point to the
remaining range.  If `size` > 0 on return, the caller can just issue
the ioctl again with the same `struct kvm_pre_fault_memory` argument.

Shadow page tables cannot support this ioctl because they
are indexed by virtual address or nested guest physical address.
Calling this ioctl when the guest is using shadow page tables (for
example because it is running a nested guest with nested page tables)
will fail with `EOPNOTSUPP` even if `KVM_CHECK_EXTENSION` reports
the capability to be present.

`flags` must currently be zero.


5. The kvm_run structure
========================

+1 −0
Original line number Diff line number Diff line
@@ -44,6 +44,7 @@ config KVM
	select KVM_VFIO
	select HAVE_KVM_PM_NOTIFIER if PM
	select KVM_GENERIC_HARDWARE_ENABLING
	select KVM_GENERIC_PRE_FAULT_MEMORY
	select KVM_WERROR if WERROR
	help
	  Support hosting fully virtualized guest machines using hardware
+94 −2
Original line number Diff line number Diff line
@@ -4291,7 +4291,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
		return;

	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
	r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
				  true, NULL, NULL);

	/*
	 * Account fixed page faults, otherwise they'll never be counted, but
	 * ignore stats for all other return times.  Page-ready "faults" aren't
	 * truly spurious and never trigger emulation
	 */
	if (r == RET_PF_FIXED)
		vcpu->stat.pf_fixed++;
}

static inline u8 kvm_max_level_for_order(int order)
@@ -4700,6 +4709,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
	return direct_page_fault(vcpu, fault);
}

/*
 * Fault in the mapping for @gpa on behalf of @vcpu, as a prefetch fault
 * with the given @error_code, retrying for as long as the fault handler
 * asks for a retry.
 *
 * Returns 0 if the fault was fixed or turned out to be spurious,
 * -EOPNOTSUPP if the vCPU's current MMU does not use the TDP fault handler,
 * -EINTR if a signal is pending before an attempt, -ENOENT if the handler
 * asks for emulation, -EIO (with a one-time WARN) for any other RET_PF_*
 * result, or the negative value returned by the fault handler itself.
 * @level is handed through to kvm_mmu_do_page_fault() unchanged.
 */
static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
			    u8 *level)
{
	int ret;

	/*
	 * Only the TDP fault handler works on page tables that are indexed
	 * by GPA, so refuse anything else.
	 */
	if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
		return -EOPNOTSUPP;

	for (;;) {
		/* Check for signals before every attempt, including retries. */
		if (signal_pending(current))
			return -EINTR;

		cond_resched();

		ret = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL,
					    level);
		if (ret != RET_PF_RETRY)
			break;
	}

	/* Negative values are errors from the fault path; pass them through. */
	if (ret < 0)
		return ret;

	if (ret == RET_PF_FIXED || ret == RET_PF_SPURIOUS)
		return 0;

	if (ret == RET_PF_EMULATE)
		return -ENOENT;

	/*
	 * RET_PF_RETRY was consumed by the loop above, so it, along with
	 * RET_PF_CONTINUE, RET_PF_INVALID and any unknown value, is
	 * unexpected at this point.
	 */
	WARN_ONCE(1, "could not fix page fault during prefault");
	return -EIO;
}

/*
 * Pre-fault the mapping that covers range->gpa for @vcpu.
 *
 * Only the page (of whatever size the fault handler picked) containing
 * range->gpa is mapped per call.
 *
 * Returns the number of bytes of @range, starting at range->gpa, that are
 * covered by that mapping (never more than range->size), or a negative
 * errno if the MMU could not be loaded or the page could not be mapped.
 */
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
				    struct kvm_pre_fault_memory *range)
{
	u64 error_code = PFERR_GUEST_FINAL_MASK;
	u8 level = PG_LEVEL_4K;
	u64 end;
	int r;

	/*
	 * reload is efficient when called repeatedly, so we can do it on
	 * every iteration.  It can fail, though, in which case there is no
	 * valid root to fault into; bail out instead of running the fault
	 * handler against an unloaded MMU.
	 */
	r = kvm_mmu_reload(vcpu);
	if (r)
		return r;

	/* Fault in the private mapping if the GFN is currently private. */
	if (kvm_arch_has_private_mem(vcpu->kvm) &&
	    kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
		error_code |= PFERR_PRIVATE_ACCESS;

	/*
	 * Shadow paging uses GVA for kvm page fault, so restrict to
	 * two-dimensional paging.
	 */
	r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
	if (r < 0)
		return r;

	/*
	 * If the mapping that covers range->gpa can use a huge page, it
	 * may start below it or end after range->gpa + range->size.
	 * Report only the part that lies inside the requested range.
	 */
	end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
	return min(range->size, end - range->gpa);
}

static void nonpaging_init_context(struct kvm_mmu *context)
{
	context->page_fault = nonpaging_page_fault;
@@ -5925,14 +6007,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
	}

	if (r == RET_PF_INVALID) {
		vcpu->stat.pf_taken++;

		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
					  &emulation_type);
					  &emulation_type, NULL);
		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
			return -EIO;
	}

	if (r < 0)
		return r;

	if (r == RET_PF_FIXED)
		vcpu->stat.pf_fixed++;
	else if (r == RET_PF_EMULATE)
		vcpu->stat.pf_emulate++;
	else if (r == RET_PF_SPURIOUS)
		vcpu->stat.pf_spurious++;

	if (r != RET_PF_EMULATE)
		return 1;

+4 −22
Original line number Diff line number Diff line
@@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
}

static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
					u64 err, bool prefetch, int *emulation_type)
					u64 err, bool prefetch,
					int *emulation_type, u8 *level)
{
	struct kvm_page_fault fault = {
		.addr = cr2_or_gpa,
@@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
		fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
	}

	/*
	 * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
	 * guest perspective and have already been counted at the time of the
	 * original fault.
	 */
	if (!prefetch)
		vcpu->stat.pf_taken++;

	if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
		r = kvm_tdp_page_fault(vcpu, &fault);
	else
@@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,

	if (fault.write_fault_to_shadow_pgtable && emulation_type)
		*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
	if (level)
		*level = fault.goal_level;

	/*
	 * Similar to above, prefetch faults aren't truly spurious, and the
	 * async #PF path doesn't do emulation.  Do count faults that are fixed
	 * by the async #PF handler though, otherwise they'll never be counted.
	 */
	if (r == RET_PF_FIXED)
		vcpu->stat.pf_fixed++;
	else if (prefetch)
		;
	else if (r == RET_PF_EMULATE)
		vcpu->stat.pf_emulate++;
	else if (r == RET_PF_SPURIOUS)
		vcpu->stat.pf_spurious++;
	return r;
}

+3 −0
Original line number Diff line number Diff line
@@ -4705,6 +4705,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
	case KVM_CAP_MEMORY_FAULT_INFO:
		r = 1;
		break;
	case KVM_CAP_PRE_FAULT_MEMORY:
		r = tdp_enabled;
		break;
	case KVM_CAP_EXIT_HYPERCALL:
		r = KVM_EXIT_HYPERCALL_VALID_MASK;
		break;
Loading