Commit 55f50b2f authored by Paolo Bonzini
Browse files

Merge branch 'kvm-memslot-zap-quirk' into HEAD



Today whenever a memslot is moved or deleted, KVM invalidates the entire
page tables and generates fresh ones based on the new memslot layout.

This behavior traditionally was kept because of a bug which was never
fully investigated and caused VM instability with assigned GeForce
GPUs.  It generally does not have a huge overhead, because the old
MMU is able to reuse cached page tables and the new one is more
scalable and can resolve EPT violations/nested page faults in parallel,
but it has worse performance if the guest frequently deletes and
adds small memslots, and it is not viable at all for TDX.  This is
because TDX requires re-accepting of private pages after page dropping.

For non-TDX VMs, this series therefore introduces the
KVM_X86_QUIRK_SLOT_ZAP_ALL quirk, enabling users to control the behavior
of memslot zapping when a memslot is moved/deleted.  The quirk is turned
on by default, leading to the zapping of all SPTEs when a memslot is
moved/deleted; users however have the option to turn off the quirk,
which limits the zapping only to those SPTEs that lie within the range
of the memslot being moved/deleted.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parents 356dab4e 61de4c34
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -8082,6 +8082,14 @@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if
                                    guest CPUID on writes to MISC_ENABLE if
                                    KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
                                    disabled.

KVM_X86_QUIRK_SLOT_ZAP_ALL          By default, KVM invalidates all SPTEs in
                                    a fast way for memslot deletion when VM type
                                    is KVM_X86_DEFAULT_VM.
                                    When this quirk is disabled or when VM type
                                    is other than KVM_X86_DEFAULT_VM, KVM zaps
                                    only leaf SPTEs that are within the range of
                                    the memslot being deleted.
=================================== ============================================

7.32 KVM_CAP_MAX_VCPU_ID
+2 −1
Original line number Diff line number Diff line
@@ -2345,7 +2345,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
	 KVM_X86_QUIRK_OUT_7E_INC_RIP |		\
	 KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT |	\
	 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\
	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS |	\
	 KVM_X86_QUIRK_SLOT_ZAP_ALL)

/*
 * KVM previously used a u32 field in kvm_run to indicate the hypercall was
+1 −0
Original line number Diff line number Diff line
@@ -439,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT	(1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN	(1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS	(1 << 6)
#define KVM_X86_QUIRK_SLOT_ZAP_ALL		(1 << 7)

#define KVM_STATE_NESTED_FORMAT_VMX	0
#define KVM_STATE_NESTED_FORMAT_SVM	1
+41 −1
Original line number Diff line number Diff line
@@ -6999,10 +6999,50 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
	kvm_mmu_zap_all(kvm);
}

/*
 * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted.
 *
 * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
 * case scenario we'll have unused shadow pages lying around until they
 * are recycled due to age or when the VM is destroyed.
 */
static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_gfn_range range = {
		.slot = slot,
		.start = slot->base_gfn,
		.end = slot->base_gfn + slot->npages,
		.may_block = true,
	};
	bool flush = false;

	write_lock(&kvm->mmu_lock);

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, &range, kvm_zap_rmap);

	if (tdp_mmu_enabled)
		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush);

	if (flush)
		kvm_flush_remote_tlbs_memslot(kvm, slot);

	write_unlock(&kvm->mmu_lock);
}

static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
{
	return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
	       kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
}

/*
 * Flush the shadow state for @slot.  Depending on
 * kvm_memslot_flush_zap_all(), either zap everything via the fast path or
 * zap only the leaf SPTEs that lie within @slot.
 */
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	if (!kvm_memslot_flush_zap_all(kvm)) {
		kvm_mmu_zap_memslot_leafs(kvm, slot);
		return;
	}

	kvm_mmu_zap_all_fast(kvm);
}

void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
+17 −2
Original line number Diff line number Diff line
@@ -79,6 +79,7 @@ struct test_params {
	useconds_t delay;
	uint64_t nr_iterations;
	bool partition_vcpu_memory_access;
	bool disable_slot_zap_quirk;
};

static void run_test(enum vm_guest_mode mode, void *arg)
@@ -89,6 +90,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
	vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
				 VM_MEM_SRC_ANONYMOUS,
				 p->partition_vcpu_memory_access);
#ifdef __x86_64__
	if (p->disable_slot_zap_quirk)
		vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);

	pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ?
		"disabled" : "enabled");
#endif

	pr_info("Finished creating vCPUs\n");

@@ -107,11 +115,12 @@ static void run_test(enum vm_guest_mode mode, void *arg)
static void help(char *name)
{
	puts("");
	printf("usage: %s [-h] [-m mode] [-d delay_usec]\n"
	printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n"
	       "          [-b memory] [-v vcpus] [-o] [-i iterations]\n", name);
	guest_modes_help();
	printf(" -d: add a delay between each iteration of adding and\n"
	       "     deleting a memslot in usec.\n");
	printf(" -q: Disable memslot zap quirk.\n");
	printf(" -b: specify the size of the memory region which should be\n"
	       "     accessed by each vCPU. e.g. 10M or 3G.\n"
	       "     Default: 1G\n");
@@ -137,7 +146,7 @@ int main(int argc, char *argv[])

	guest_modes_append_default();

	while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) {
	while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) {
		switch (opt) {
		case 'm':
			guest_modes_cmdline(optarg);
@@ -160,6 +169,12 @@ int main(int argc, char *argv[])
		case 'i':
			p.nr_iterations = atoi_positive("Number of iterations", optarg);
			break;
		case 'q':
			p.disable_slot_zap_quirk = true;

			TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
				     KVM_X86_QUIRK_SLOT_ZAP_ALL);
			break;
		case 'h':
		default:
			help(argv[0]);
Loading