Commit d7f4aac2 authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge tag 'kvm-x86-mmu-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MMU changes for 6.17

 - Exempt nested EPT from the !USER + CR0.WP logic, as EPT doesn't interact
   with CR0.WP.

 - Move the TDX hardware setup code to tdx.c to better co-locate TDX code
   and eliminate a few global symbols.

 - Dynamically allocate the shadow MMU's hashed page list, and defer
   allocating the hashed list until it's actually needed (the TDP MMU doesn't
   use the list).
parents 1a14928e 9c4fe6d1
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -1358,7 +1358,7 @@ struct kvm_arch {
	bool has_private_mem;
	bool has_protected_state;
	bool pre_fault_allowed;
	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
	struct hlist_head *mmu_page_hash;
	struct list_head active_mmu_pages;
	/*
	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
@@ -1985,7 +1985,7 @@ void kvm_x86_vendor_exit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
	return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT);
}

#define __KVM_HAVE_ARCH_VM_FREE
@@ -2030,7 +2030,7 @@ void kvm_mmu_vendor_module_exit(void);

void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
int kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);

void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+69 −6
Original line number Diff line number Diff line
@@ -1983,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
	return true;
}

static __ro_after_init HLIST_HEAD(empty_page_hash);

/*
 * Return the hash bucket for @gfn in @kvm's shadow page hash table, or a
 * shared, always-empty bucket if the table has not been allocated.
 *
 * Must be called with kvm->mmu_lock held.
 */
static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
{
	struct hlist_head *table;

	/*
	 * The table pointer may be published at runtime without mmu_lock
	 * held (when the TDP MMU is enabled, the table is only needed once
	 * KVM has to shadow L1's TDP for an L2 guest).  Use an acquire load
	 * so that the pointer is read before any bucket in the table is
	 * walked.  Pairs with the smp_store_release() in
	 * kvm_mmu_alloc_page_hash().
	 */
	table = smp_load_acquire(&kvm->arch.mmu_page_hash);

	lockdep_assert_held(&kvm->mmu_lock);

	/* No table yet: hand back an empty list so walkers find nothing. */
	if (table)
		return &table[kvm_page_table_hashfn(gfn)];

	return &empty_page_hash;
}

/*
 * Iterate over the shadow pages on @_list (linked through their hash_link
 * member), skipping any page for which is_obsolete_sp() returns true.
 *
 * The empty "if {} else" tail makes the statement that follows the macro
 * invocation the loop body, executed only for non-obsolete pages.
 */
#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
		} else

#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)		\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
	for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn))	\
		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else

static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
@@ -2358,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
	struct kvm_mmu_page *sp;
	bool created = false;

	/*
	 * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
	 * mmu_page_hash must be set prior to creating the first shadow root,
	 * i.e. reaching this point is fully serialized by slots_arch_lock.
	 */
	BUG_ON(!kvm->arch.mmu_page_hash);
	sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];

	sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
@@ -3882,6 +3909,28 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
	return r;
}

/*
 * Allocate and publish the shadow page hash table for @kvm if it does not
 * already have one.
 *
 * Returns 0 on success (including when the table already exists), or
 * -ENOMEM if the allocation fails.
 */
static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
{
	struct hlist_head *table;

	if (!kvm->arch.mmu_page_hash) {
		table = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*table),
				 GFP_KERNEL_ACCOUNT);
		if (!table)
			return -ENOMEM;

		/*
		 * Publish the pointer with release semantics so that the
		 * stores zeroing the table are visible before the pointer
		 * itself.  Pairs with the smp_load_acquire() in
		 * kvm_get_mmu_page_hash().  Shadow pages can only be added
		 * or removed with mmu_lock held for write, so a reader sees
		 * an empty list for the duration of its current mmu_lock
		 * critical section.
		 */
		smp_store_release(&kvm->arch.mmu_page_hash, table);
	}

	return 0;
}

static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
	struct kvm_memslots *slots;
@@ -3901,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
	if (kvm_shadow_root_allocated(kvm))
		goto out_unlock;

	r = kvm_mmu_alloc_page_hash(kvm);
	if (r)
		goto out_unlock;

	/*
	 * Check if anything actually needs to be allocated, e.g. all metadata
	 * will be allocated upfront if TDP is disabled.
	 * Check if memslot metadata actually needs to be allocated, e.g. all
	 * metadata will be allocated upfront if TDP is disabled.
	 */
	if (kvm_memslots_have_rmaps(kvm) &&
	    kvm_page_track_write_tracking_enabled(kvm))
@@ -6682,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
		kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}

void kvm_mmu_init_vm(struct kvm *kvm)
int kvm_mmu_init_vm(struct kvm *kvm)
{
	int r;

	kvm->arch.shadow_mmio_value = shadow_mmio_value;
	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

	if (tdp_mmu_enabled)
	if (tdp_mmu_enabled) {
		kvm_mmu_init_tdp_mmu(kvm);
	} else {
		r = kvm_mmu_alloc_page_hash(kvm);
		if (r)
			return r;
	}

	kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
	kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6699,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)

	kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
	kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
	return 0;
}

static void mmu_free_vm_memory_caches(struct kvm *kvm)
@@ -6710,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)

void kvm_mmu_uninit_vm(struct kvm *kvm)
{
	kvfree(kvm->arch.mmu_page_hash);

	if (tdp_mmu_enabled)
		kvm_mmu_uninit_tdp_mmu(kvm);

+6 −2
Original line number Diff line number Diff line
@@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
	if (r != RET_PF_CONTINUE)
		return r;

#if PTTYPE != PTTYPE_EPT
	/*
	 * Do not change pte_access if the pfn is a mmio page, otherwise
	 * we will cache the incorrect access into mmio spte.
	 * Treat the guest PTE protections as writable, supervisor-only if this
	 * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
	 * PTE.W if CR0.WP=0).  Don't change the access type for emulated MMIO,
	 * otherwise KVM will cache incorrect access information in the SPTE.
	 */
	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
	    !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
@@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
		if (is_cr4_smep(vcpu->arch.mmu))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}
#endif

	r = RET_PF_RETRY;
	write_lock(&vcpu->kvm->mmu_lock);
+2 −0
Original line number Diff line number Diff line
@@ -5494,6 +5494,8 @@ static int __init svm_init(void)
{
	int r;

	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);

	__unused_size_checks();

	if (!kvm_is_svm_supported())
+2 −34
Original line number Diff line number Diff line
@@ -29,40 +29,8 @@ static __init int vt_hardware_setup(void)
	if (ret)
		return ret;

	/*
	 * Update vt_x86_ops::vm_size here so it is ready before
	 * kvm_ops_update() is called in kvm_x86_vendor_init().
	 *
	 * Note, the actual bringing up of TDX must be done after
	 * kvm_ops_update() because enabling TDX requires enabling
	 * hardware virtualization first, i.e., all online CPUs must
	 * be in post-VMXON state.  This means the @vm_size here
	 * may be updated to TDX's size but TDX may fail to enable
	 * at later time.
	 *
	 * The VMX/VT code could update kvm_x86_ops::vm_size again
	 * after bringing up TDX, but this would require exporting
	 * either kvm_x86_ops or kvm_ops_update() from the base KVM
	 * module, which looks overkill.  Anyway, the worst case here
	 * is KVM may allocate couple of more bytes than needed for
	 * each VM.
	 */
	if (enable_tdx) {
		vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
				sizeof(struct kvm_tdx));
		/*
		 * Note, TDX may fail to initialize in a later time in
		 * vt_init(), in which case it is not necessary to setup
		 * those callbacks.  But making them valid here even
		 * when TDX fails to init later is fine because those
		 * callbacks won't be called if the VM isn't TDX guest.
		 */
		vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
		vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
		vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
		vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
		vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
	}
	if (enable_tdx)
		tdx_hardware_setup();

	return 0;
}
Loading