Merge tag 'kvm-x86-mmu-6.17' of https://github.com/kvm-x86/linux into HEAD (d7f4aac2) · Commits · git / linux-net

arch/x86/include/asm/kvm_host.h

+3 −3

Original line number	Diff line number	Diff line
		@@ -1358,7 +1358,7 @@ struct kvm_arch {
		bool has_private_mem;
		bool has_protected_state;
		bool pre_fault_allowed;
		struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
		struct hlist_head *mmu_page_hash;
		struct list_head active_mmu_pages;
		/*
		* A list of kvm_mmu_page structs that, if zapped, could possibly be
		@@ -1985,7 +1985,7 @@ void kvm_x86_vendor_exit(void);
		#define __KVM_HAVE_ARCH_VM_ALLOC
		static inline struct kvm *kvm_arch_alloc_vm(void)
		{
		return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT);
		}

		#define __KVM_HAVE_ARCH_VM_FREE
		@@ -2030,7 +2030,7 @@ void kvm_mmu_vendor_module_exit(void);

		void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
		int kvm_mmu_create(struct kvm_vcpu *vcpu);
		void kvm_mmu_init_vm(struct kvm *kvm);
		int kvm_mmu_init_vm(struct kvm *kvm);
		void kvm_mmu_uninit_vm(struct kvm *kvm);

		void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,

arch/x86/kvm/mmu/mmu.c

+69 −6

Original line number	Diff line number	Diff line
		@@ -1983,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
		return true;
		}

		static __ro_after_init HLIST_HEAD(empty_page_hash);

		/*
		 * Return the hash bucket used for shadow pages of @gfn.  If the per-VM hash
		 * table pointer is still NULL, return the file-scope empty_page_hash list
		 * head instead so that callers can walk the result unconditionally.  The
		 * caller must hold mmu_lock.
		 */
		static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
		{
			/*
			 * Ensure the load of the hash table pointer itself is ordered before
			 * loads to walk the table.  The pointer is set at runtime outside of
			 * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
			 * shadow pages becomes necessary only when KVM needs to shadow L1's
			 * TDP for an L2 guest.  Pairs with the smp_store_release() in
			 * kvm_mmu_alloc_page_hash().
			 */
			struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);

			lockdep_assert_held(&kvm->mmu_lock);

			/* No table allocated yet: hand back a bucket with nothing in it. */
			if (!page_hash)
				return &empty_page_hash;

			return &page_hash[kvm_page_table_hashfn(gfn)];
		}

		/*
		 * Walk the hlist @_list through each entry's hash_link member and run the
		 * loop body only for entries for which is_obsolete_sp() is false.  The
		 * empty "if" arm followed by a bare "else" makes the statement written
		 * after the macro become the else branch.
		 */
		#define for_each_valid_sp(_kvm, _sp, _list) \
		hlist_for_each_entry(_sp, _list, hash_link) \
		if (is_obsolete_sp((_kvm), (_sp))) { \
		} else

		#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
		for_each_valid_sp(_kvm, _sp, \
		&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
		for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \
		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else

		static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
		@@ -2358,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
		struct kvm_mmu_page *sp;
		bool created = false;

		/*
		* No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
		* mmu_page_hash must be set prior to creating the first shadow root,
		* i.e. reaching this point is fully serialized by slots_arch_lock.
		*/
		BUG_ON(!kvm->arch.mmu_page_hash);
		sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];

		sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
		@@ -3882,6 +3909,28 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
		return r;
		}

		/*
		 * Allocate the per-VM hash table of shadow pages unless one is already
		 * installed.  Returns 0 on success, including when the table already
		 * exists, and -ENOMEM if the allocation fails.
		 */
		static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
		{
			struct hlist_head *table;

			if (!kvm->arch.mmu_page_hash) {
				table = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*table),
						 GFP_KERNEL_ACCOUNT);
				if (!table)
					return -ENOMEM;

				/*
				 * Publish the pointer with release semantics so that the
				 * stores zeroing the table are retired before the pointer
				 * becomes visible.  Pairs with the smp_load_acquire() in
				 * kvm_get_mmu_page_hash().  Adding or removing shadow pages
				 * requires mmu_lock held for write, so a reader is
				 * guaranteed to see an empty list for its current mmu_lock
				 * critical section.
				 */
				smp_store_release(&kvm->arch.mmu_page_hash, table);
			}

			return 0;
		}

		static int mmu_first_shadow_root_alloc(struct kvm *kvm)
		{
		struct kvm_memslots *slots;
		@@ -3901,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
		if (kvm_shadow_root_allocated(kvm))
		goto out_unlock;

		r = kvm_mmu_alloc_page_hash(kvm);
		if (r)
		goto out_unlock;

		/*
		* Check if anything actually needs to be allocated, e.g. all metadata
		* will be allocated upfront if TDP is disabled.
		* Check if memslot metadata actually needs to be allocated, e.g. all
		* metadata will be allocated upfront if TDP is disabled.
		*/
		if (kvm_memslots_have_rmaps(kvm) &&
		kvm_page_track_write_tracking_enabled(kvm))
		@@ -6682,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
		kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
		}

		void kvm_mmu_init_vm(struct kvm *kvm)
		int kvm_mmu_init_vm(struct kvm *kvm)
		{
		int r;

		kvm->arch.shadow_mmio_value = shadow_mmio_value;
		INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
		INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
		spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

		if (tdp_mmu_enabled)
		if (tdp_mmu_enabled) {
		kvm_mmu_init_tdp_mmu(kvm);
		} else {
		r = kvm_mmu_alloc_page_hash(kvm);
		if (r)
		return r;
		}

		kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
		kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
		@@ -6699,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)

		kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
		kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
		return 0;
		}

		static void mmu_free_vm_memory_caches(struct kvm *kvm)
		@@ -6710,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)

		void kvm_mmu_uninit_vm(struct kvm *kvm)
		{
		kvfree(kvm->arch.mmu_page_hash);

		if (tdp_mmu_enabled)
		kvm_mmu_uninit_tdp_mmu(kvm);

arch/x86/kvm/mmu/paging_tmpl.h

+6 −2

Original line number	Diff line number	Diff line
		@@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
		if (r != RET_PF_CONTINUE)
		return r;

		#if PTTYPE != PTTYPE_EPT
		/*
		* Do not change pte_access if the pfn is a mmio page, otherwise
		* we will cache the incorrect access into mmio spte.
		* Treat the guest PTE protections as writable, supervisor-only if this
		* is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
		* PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
		* otherwise KVM will cache incorrect access information in the SPTE.
		*/
		if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
		!is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
		@@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
		if (is_cr4_smep(vcpu->arch.mmu))
		walker.pte_access &= ~ACC_EXEC_MASK;
		}
		#endif

		r = RET_PF_RETRY;
		write_lock(&vcpu->kvm->mmu_lock);

arch/x86/kvm/svm/svm.c

+2 −0

Original line number	Diff line number	Diff line
		@@ -5494,6 +5494,8 @@ static int __init svm_init(void)
		{
		int r;

		KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);

		__unused_size_checks();

		if (!kvm_is_svm_supported())

arch/x86/kvm/vmx/main.c

+2 −34

Original line number	Diff line number	Diff line
		@@ -29,40 +29,8 @@ static __init int vt_hardware_setup(void)
		if (ret)
		return ret;

		/*
		* Update vt_x86_ops::vm_size here so it is ready before
		* kvm_ops_update() is called in kvm_x86_vendor_init().
		*
		* Note, the actual bringing up of TDX must be done after
		* kvm_ops_update() because enabling TDX requires enabling
		* hardware virtualization first, i.e., all online CPUs must
		* be in post-VMXON state. This means the @vm_size here
		* may be updated to TDX's size but TDX may fail to enable
		* at later time.
		*
		* The VMX/VT code could update kvm_x86_ops::vm_size again
		* after bringing up TDX, but this would require exporting
		* either kvm_x86_ops or kvm_ops_update() from the base KVM
		* module, which looks overkill. Anyway, the worst case here
		* is KVM may allocate couple of more bytes than needed for
		* each VM.
		*/
		if (enable_tdx) {
		vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
		sizeof(struct kvm_tdx));
		/*
		* Note, TDX may fail to initialize in a later time in
		* vt_init(), in which case it is not necessary to setup
		* those callbacks. But making them valid here even
		* when TDX fails to init later is fine because those
		* callbacks won't be called if the VM isn't TDX guest.
		*/
		vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
		vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
		vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
		vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
		vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
		}
		if (enable_tdx)
		tdx_hardware_setup();

		return 0;
		}