Commit 137e0ec0 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull kvm fixes from Paolo Bonzini:
 "KVM GUEST_MEMFD fixes for 6.8:

   - Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY
     to avoid creating an inconsistent ABI (KVM_MEM_GUEST_MEMFD is not
     writable from userspace, so there would be no way to write to a
     read-only guest_memfd).

   - Update documentation for KVM_SW_PROTECTED_VM to make it abundantly
     clear that such VMs are purely for development and testing.

   - Limit KVM_SW_PROTECTED_VM guests to the TDP MMU, as the long term
     plan is to support confidential VMs with deterministic private
     memory (SNP and TDX) only in the TDP MMU.

   - Fix a bug in a GUEST_MEMFD dirty logging test that caused false
     passes.

  x86 fixes:

   - Fix missing marking of a guest page as dirty when emulating an
     atomic access.

   - Check for mmu_notifier invalidation events before faulting in the
     pfn, and before acquiring mmu_lock, to avoid unnecessary work and
     lock contention with preemptible kernels (including
     CONFIG_PREEMPT_DYNAMIC in non-preemptible mode).

   - Disable AMD DebugSwap by default, it breaks VMSA signing and will
     be re-enabled with a better VM creation API in 6.10.

   - Do the cache flush of converted pages in svm_register_enc_region()
     before dropping kvm->lock, to avoid a race with unregistering of
     the same region and the consequent use-after-free issue"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  SEV: disable SEV-ES DebugSwap by default
  KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing
  KVM: SVM: Flush pages under kvm->lock to fix UAF in svm_register_enc_region()
  KVM: selftests: Add a testcase to verify GUEST_MEMFD and READONLY are exclusive
  KVM: selftests: Create GUEST_MEMFD for relevant invalid flags testcases
  KVM: x86/mmu: Restrict KVM_SW_PROTECTED_VM to the TDP MMU
  KVM: x86: Update KVM_SW_PROTECTED_VM docs to make it clear they're a WIP
  KVM: Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY
  KVM: x86: Mark target gfn of emulated atomic instruction as dirty
parents 005f6f34 5abf6dce
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -8791,6 +8791,11 @@ means the VM type with value @n is supported. Possible values of @n are::
  #define KVM_X86_DEFAULT_VM	0
  #define KVM_X86_SW_PROTECTED_VM	1

Note, KVM_X86_SW_PROTECTED_VM is currently only for development and testing.
Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in
production.  The behavior and effective ABI for software-protected VMs is
unstable.

9. Known KVM API problems
=========================

+4 −3
Original line number Diff line number Diff line
@@ -80,9 +80,10 @@ config KVM_SW_PROTECTED_VM
	depends on KVM && X86_64
	select KVM_GENERIC_PRIVATE_MEM
	help
	  Enable support for KVM software-protected VMs.  Currently "protected"
	  means the VM can be backed with memory provided by
	  KVM_CREATE_GUEST_MEMFD.
	  Enable support for KVM software-protected VMs.  Currently, software-
	  protected VMs are purely a development and testing vehicle for
	  KVM_CREATE_GUEST_MEMFD.  Attempting to run a "real" VM workload as a
	  software-protected VM will fail miserably.

	  If unsure, say "N".

+42 −0
Original line number Diff line number Diff line
@@ -4405,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	smp_rmb();

	/*
	 * Check for a relevant mmu_notifier invalidation event before getting
	 * the pfn from the primary MMU, and before acquiring mmu_lock.
	 *
	 * For mmu_lock, if there is an in-progress invalidation and the kernel
	 * allows preemption, the invalidation task may drop mmu_lock and yield
	 * in response to mmu_lock being contended, which is *very* counter-
	 * productive as this vCPU can't actually make forward progress until
	 * the invalidation completes.
	 *
	 * Retrying now can also avoid unnessary lock contention in the primary
	 * MMU, as the primary MMU doesn't necessarily hold a single lock for
	 * the duration of the invalidation, i.e. faulting in a conflicting pfn
	 * can cause the invalidation to take longer by holding locks that are
	 * needed to complete the invalidation.
	 *
	 * Do the pre-check even for non-preemtible kernels, i.e. even if KVM
	 * will never yield mmu_lock in response to contention, as this vCPU is
	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
	 * to detect retry guarantees the worst case latency for the vCPU.
	 */
	if (fault->slot &&
	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
		return RET_PF_RETRY;

	ret = __kvm_faultin_pfn(vcpu, fault);
	if (ret != RET_PF_CONTINUE)
		return ret;
@@ -4415,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
	if (unlikely(!fault->slot))
		return kvm_handle_noslot_fault(vcpu, fault, access);

	/*
	 * Check again for a relevant mmu_notifier invalidation event purely to
	 * avoid contending mmu_lock.  Most invalidations will be detected by
	 * the previous check, but checking is extremely cheap relative to the
	 * overall cost of failing to detect the invalidation until after
	 * mmu_lock is acquired.
	 */
	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
		kvm_release_pfn_clean(fault->pfn);
		return RET_PF_RETRY;
	}

	return RET_PF_CONTINUE;
}

@@ -4442,6 +4479,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
		return true;

	/*
	 * Check for a relevant mmu_notifier invalidation event one last time
	 * now that mmu_lock is held, as the "unsafe" checks performed without
	 * holding mmu_lock can get false negatives.
	 */
	return fault->slot &&
	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
}
+14 −9
Original line number Diff line number Diff line
@@ -57,7 +57,7 @@ static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);

/* enable/disable SEV-ES DebugSwap support */
static bool sev_es_debug_swap_enabled = true;
static bool sev_es_debug_swap_enabled = false;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
#else
#define sev_enabled false
@@ -612,8 +612,11 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
	save->xss  = svm->vcpu.arch.ia32_xss;
	save->dr6  = svm->vcpu.arch.dr6;

	if (sev_es_debug_swap_enabled)
	if (sev_es_debug_swap_enabled) {
		save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
		pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
			     "This will not work starting with Linux 6.10\n");
	}

	pr_debug("Virtual Machine Save Area (VMSA):\n");
	print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
@@ -1975,20 +1978,22 @@ int sev_mem_enc_register_region(struct kvm *kvm,
		goto e_free;
	}

	region->uaddr = range->addr;
	region->size = range->size;

	list_add_tail(&region->list, &sev->regions_list);
	mutex_unlock(&kvm->lock);

	/*
	 * The guest may change the memory encryption attribute from C=0 -> C=1
	 * or vice versa for this memory range. Lets make sure caches are
	 * flushed to ensure that guest data gets written into memory with
	 * correct C-bit.
	 * correct C-bit.  Note, this must be done before dropping kvm->lock,
	 * as region and its array of pages can be freed by a different task
	 * once kvm->lock is released.
	 */
	sev_clflush_pages(region->pages, region->npages);

	region->uaddr = range->addr;
	region->size = range->size;

	list_add_tail(&region->list, &sev->regions_list);
	mutex_unlock(&kvm->lock);

	return ret;

e_free:
+11 −1
Original line number Diff line number Diff line
@@ -4580,7 +4580,7 @@ static bool kvm_is_vm_type_supported(unsigned long type)
{
	return type == KVM_X86_DEFAULT_VM ||
	       (type == KVM_X86_SW_PROTECTED_VM &&
		IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled);
		IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
}

int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -8007,6 +8007,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,

	if (r < 0)
		return X86EMUL_UNHANDLEABLE;

	/*
	 * Mark the page dirty _before_ checking whether or not the CMPXCHG was
	 * successful, as the old value is written back on failure.  Note, for
	 * live migration, this is unnecessarily conservative as CMPXCHG writes
	 * back the original value and the access is atomic, but KVM's ABI is
	 * that all writes are dirty logged, regardless of the value written.
	 */
	kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));

	if (r)
		return X86EMUL_CMPXCHG_FAILED;

Loading