Commit a3522ac7 authored by Sean Christopherson, committed by Paolo Bonzini
Browse files

KVM: x86/mmu: Enforce guest_memfd's max order when recovering hugepages



Rework kvm_mmu_max_mapping_level() to provide the plumbing to consult
guest_memfd (and relevant vendor code) when recovering hugepages, e.g.
after disabling live migration.  The flaw has existed since guest_memfd was
originally added, but has gone unnoticed due to lack of guest_memfd support
for hugepages or dirty logging.

Don't actually call into guest_memfd at this time, as it's unclear as to
what the API should be.  Ideally, KVM would simply use kvm_gmem_get_pfn(),
but invoking kvm_gmem_get_pfn() would lead to sleeping in atomic context
if guest_memfd needed to allocate memory (mmu_lock is held).  Luckily,
the path isn't actually reachable, so just add a TODO and WARN to ensure
the functionality is added alongside guest_memfd hugepage support, and
punt the guest_memfd API design question to the future.

Note, calling kvm_mem_is_private() in the non-fault path is safe, so long
as mmu_lock is held, as hugepage recovery operates on shadow-present SPTEs,
i.e. calling kvm_mmu_max_mapping_level() with @fault=NULL is mutually
exclusive with kvm_vm_set_mem_attributes() changing the PRIVATE attribute
of the gfn.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Message-ID: <20250729225455.670324-15-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent 1c3fdf13
Loading
Loading
Loading
Loading
+45 −33
Original line number Diff line number Diff line
@@ -3302,31 +3302,54 @@ static u8 kvm_max_level_for_order(int order)
	return PG_LEVEL_4K;
}

static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
					u8 max_level, int gmem_order)
static u8 kvm_max_private_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
					const struct kvm_memory_slot *slot, gfn_t gfn)
{
	u8 req_max_level;
	u8 max_level, coco_level;
	kvm_pfn_t pfn;

	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;
	/* For faults, use the gmem information that was resolved earlier. */
	if (fault) {
		pfn = fault->pfn;
		max_level = fault->max_level;
	} else {
		/* TODO: Call into guest_memfd once hugepages are supported. */
		WARN_ONCE(1, "Get pfn+order from guest_memfd");
		pfn = KVM_PFN_ERR_FAULT;
		max_level = PG_LEVEL_4K;
	}

	max_level = min(kvm_max_level_for_order(gmem_order), max_level);
	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;
		return max_level;

	req_max_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn);
	if (req_max_level)
		max_level = min(max_level, req_max_level);
	/*
	 * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
	 * restrictions.  A return of '0' means "no additional restrictions", to
	 * allow for using an optional "ret0" static call.
	 */
	coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn);
	if (coco_level)
		max_level = min(max_level, coco_level);

	return max_level;
}

static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
				       const struct kvm_memory_slot *slot,
				       gfn_t gfn, int max_level, bool is_private)
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
			      const struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_lpage_info *linfo;
	int host_level;
	int host_level, max_level;
	bool is_private;

	lockdep_assert_held(&kvm->mmu_lock);

	if (fault) {
		max_level = fault->max_level;
		is_private = fault->is_private;
	} else {
		max_level = PG_LEVEL_NUM;
		is_private = kvm_mem_is_private(kvm, gfn);
	}

	max_level = min(max_level, max_huge_page_level);
	for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -3335,25 +3358,16 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
			break;
	}

	if (is_private)
		return max_level;

	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;

	if (is_private)
		host_level = kvm_max_private_mapping_level(kvm, fault, slot, gfn);
	else
		host_level = host_pfn_mapping_level(kvm, gfn, slot);
	return min(host_level, max_level);
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
			      const struct kvm_memory_slot *slot, gfn_t gfn)
{
	bool is_private = kvm_slot_has_gmem(slot) &&
			  kvm_mem_is_private(kvm, gfn);

	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
}

void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_memory_slot *slot = fault->slot;
@@ -3374,9 +3388,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
	 * Enforce the iTLB multihit workaround after capturing the requested
	 * level, which will be used to do precise, accurate accounting.
	 */
	fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
						       fault->gfn, fault->max_level,
						       fault->is_private);
	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
						     fault->slot, fault->gfn);
	if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
		return;

@@ -4564,8 +4577,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
	}

	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
	fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
							 fault->max_level, max_order);
	fault->max_level = kvm_max_level_for_order(max_order);

	return RET_PF_CONTINUE;
}
@@ -7165,7 +7177,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
		 * mapping if the indirect sp has level = 1.
		 */
		if (sp->role.direct &&
		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
		    sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);

			if (kvm_available_flush_remote_tlbs_range())
+1 −1
Original line number Diff line number Diff line
@@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
	return r;
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
			      const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
+1 −1
Original line number Diff line number Diff line
@@ -1813,7 +1813,7 @@ static void recover_huge_pages_range(struct kvm *kvm,
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
		max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
		if (max_mapping_level < iter.level)
			continue;