Merge branch 'kvm-prefault' into HEAD (f3996d4d) · Commits · git / linux-net

Documentation/virt/kvm/api.rst

+55 −0

Original line number	Diff line number	Diff line
		@@ -6352,6 +6352,61 @@ a single guest_memfd file, but the bound ranges must not overlap).

		See KVM_SET_USER_MEMORY_REGION2 for additional details.

		4.143 KVM_PRE_FAULT_MEMORY
		------------------------

		:Capability: KVM_CAP_PRE_FAULT_MEMORY
		:Architectures: none
		:Type: vcpu ioctl
		:Parameters: struct kvm_pre_fault_memory (in/out)
		:Returns: 0 if at least one page is processed, < 0 on error

		Errors:

		========== ===============================================================
		EINVAL The specified `gpa` and `size` were invalid (e.g. not
		page aligned, causes an overflow, or size is zero).
		ENOENT The specified `gpa` is outside defined memslots.
		EINTR An unmasked signal is pending and no page was processed.
		EFAULT The parameter address was invalid.
		EOPNOTSUPP Mapping memory for a GPA is unsupported by the
		hypervisor, and/or for the current vCPU state/mode.
		EIO unexpected error conditions (also causes a WARN)
		========== ===============================================================

		::

		struct kvm_pre_fault_memory {
		/* in/out */
		__u64 gpa;
		__u64 size;
		/* in */
		__u64 flags;
		__u64 padding[5];
		};

		KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
		for the current vCPU state. KVM maps memory as if the vCPU generated a
		stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
		CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed.

		In some cases, multiple vCPUs might share the page tables. In this
		case, the ioctl can be called in parallel.

		When the ioctl returns, the input values are updated to point to the
		remaining range. If `size` > 0 on return, the caller can just issue
		the ioctl again with the same `struct kvm_pre_fault_memory` argument.

		Shadow page tables cannot support this ioctl because they
		are indexed by virtual address or nested guest physical address.
		Calling this ioctl when the guest is using shadow page tables (for
		example because it is running a nested guest with nested page tables)
		will fail with `EOPNOTSUPP` even if `KVM_CHECK_EXTENSION` reports
		the capability to be present.

		`flags` must currently be zero.


		5. The kvm_run structure
		========================

arch/x86/kvm/Kconfig

+1 −0

Original line number	Diff line number	Diff line
		@@ -44,6 +44,7 @@ config KVM
		select KVM_VFIO
		select HAVE_KVM_PM_NOTIFIER if PM
		select KVM_GENERIC_HARDWARE_ENABLING
		select KVM_GENERIC_PRE_FAULT_MEMORY
		select KVM_WERROR if WERROR
		help
		Support hosting fully virtualized guest machines using hardware

arch/x86/kvm/mmu/mmu.c

+94 −2

Original line number	Diff line number	Diff line
		@@ -4291,7 +4291,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu vcpu, struct kvm_async_pf work)
		work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
		return;

		kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
		r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
		true, NULL, NULL);

		/*
		* Account fixed page faults, otherwise they'll never be counted, but
		* ignore stats for all other return types. Page-ready "faults" aren't
		* truly spurious and never trigger emulation.
		*/
		if (r == RET_PF_FIXED)
		vcpu->stat.pf_fixed++;
		}

		static inline u8 kvm_max_level_for_order(int order)
		@@ -4700,6 +4709,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu vcpu, struct kvm_page_fault fault)
		return direct_page_fault(vcpu, fault);
		}

		/*
		 * Map @gpa into the vCPU's page tables by running the page fault
		 * handler in prefetch mode, retrying for as long as the handler asks
		 * for a retry.
		 *
		 * @vcpu:       vCPU whose MMU is used to handle the fault.
		 * @gpa:        guest physical address to map.
		 * @error_code: page fault error code handed to the fault handler.
		 * @level:      passed through to kvm_mmu_do_page_fault() as its
		 *              level output pointer.
		 *
		 * Returns 0 when the fault was fixed or found to be spurious,
		 * -EOPNOTSUPP when the active MMU does not use the TDP fault handler,
		 * -EINTR when a signal is pending, -ENOENT when the handler asks for
		 * emulation, -EIO (with a one-time WARN) for any other handler result,
		 * or the negative error returned by the handler itself.
		 */
		static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
					    u8 *level)
		{
			int ret;

			/*
			 * Only the TDP page fault handler is accepted, since that's the
			 * only case where the MMU is indexed by GPA.
			 */
			if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
				return -EOPNOTSUPP;

			for (;;) {
				/* Give up before each attempt if a signal is pending. */
				if (signal_pending(current))
					return -EINTR;

				cond_resched();

				ret = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true,
							    NULL, level);
				if (ret != RET_PF_RETRY)
					break;
			}

			/* Negative values are errors from the handler; pass them up. */
			if (ret < 0)
				return ret;

			if (ret == RET_PF_FIXED || ret == RET_PF_SPURIOUS)
				return 0;

			if (ret == RET_PF_EMULATE)
				return -ENOENT;

			/*
			 * Everything else (including RET_PF_CONTINUE and RET_PF_INVALID)
			 * is unexpected at this point.
			 */
			WARN_ONCE(1, "could not fix page fault during prefault");
			return -EIO;
		}

		/*
		 * Pre-fault the mapping that covers range->gpa for @vcpu.
		 *
		 * @vcpu:  vCPU whose MMU is used to create the mapping.
		 * @range: request describing the guest physical range; only
		 *         range->gpa and range->size are read here.
		 *
		 * Returns a negative error code on failure. On success, returns the
		 * number of bytes, counted from range->gpa, that are covered by the
		 * mapping that was processed, capped at range->size.
		 */
		long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
						    struct kvm_pre_fault_memory *range)
		{
			u64 error_code = PFERR_GUEST_FINAL_MASK;
			u8 level = PG_LEVEL_4K;
			u64 end;
			int r;

			/*
			 * reload is efficient when called repeatedly, so we can do it on
			 * every iteration.  Propagate a reload failure instead of
			 * ignoring it and attempting the fault anyway.
			 */
			r = kvm_mmu_reload(vcpu);
			if (r)
				return r;

			/* Flag the access as private when the target GFN is private. */
			if (kvm_arch_has_private_mem(vcpu->kvm) &&
			    kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
				error_code |= PFERR_PRIVATE_ACCESS;

			/*
			 * Shadow paging uses GVA for kvm page fault, so restrict to
			 * two-dimensional paging.
			 */
			r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
			if (r < 0)
				return r;

			/*
			 * If the mapping that covers range->gpa can use a huge page, it
			 * may start below it or end after range->gpa + range->size.
			 * Report only the part that lies inside the requested range.
			 */
			end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
			return min(range->size, end - range->gpa);
		}

		static void nonpaging_init_context(struct kvm_mmu *context)
		{
		context->page_fault = nonpaging_page_fault;
		@@ -5925,14 +6007,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
		}

		if (r == RET_PF_INVALID) {
		vcpu->stat.pf_taken++;

		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
		&emulation_type);
		&emulation_type, NULL);
		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
		return -EIO;
		}

		if (r < 0)
		return r;

		if (r == RET_PF_FIXED)
		vcpu->stat.pf_fixed++;
		else if (r == RET_PF_EMULATE)
		vcpu->stat.pf_emulate++;
		else if (r == RET_PF_SPURIOUS)
		vcpu->stat.pf_spurious++;

		if (r != RET_PF_EMULATE)
		return 1;

arch/x86/kvm/mmu/mmu_internal.h

+4 −22

Original line number	Diff line number	Diff line
		@@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
		}

		static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
		u64 err, bool prefetch, int *emulation_type)
		u64 err, bool prefetch,
		int emulation_type, u8 level)
		{
		struct kvm_page_fault fault = {
		.addr = cr2_or_gpa,
		@@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
		fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
		}

		/*
		* Async #PF "faults", a.k.a. prefetch faults, are not faults from the
		* guest perspective and have already been counted at the time of the
		* original fault.
		*/
		if (!prefetch)
		vcpu->stat.pf_taken++;

		if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
		r = kvm_tdp_page_fault(vcpu, &fault);
		else
		@@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,

		if (fault.write_fault_to_shadow_pgtable && emulation_type)
		*emulation_type \|= EMULTYPE_WRITE_PF_TO_SP;
		if (level)
		*level = fault.goal_level;

		/*
		* Similar to above, prefetch faults aren't truly spurious, and the
		* async #PF path doesn't do emulation. Do count faults that are fixed
		* by the async #PF handler though, otherwise they'll never be counted.
		*/
		if (r == RET_PF_FIXED)
		vcpu->stat.pf_fixed++;
		else if (prefetch)
		;
		else if (r == RET_PF_EMULATE)
		vcpu->stat.pf_emulate++;
		else if (r == RET_PF_SPURIOUS)
		vcpu->stat.pf_spurious++;
		return r;
		}

arch/x86/kvm/x86.c

+3 −0

Original line number	Diff line number	Diff line
		@@ -4705,6 +4705,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
		case KVM_CAP_MEMORY_FAULT_INFO:
		r = 1;
		break;
		case KVM_CAP_PRE_FAULT_MEMORY:
		r = tdp_enabled;
		break;
		case KVM_CAP_EXIT_HYPERCALL:
		r = KVM_EXIT_HYPERCALL_VALID_MASK;
		break;