Commit d1e7b461 authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge tag 'kvm-x86-vmx-6.19' of https://github.com/kvm-x86/linux into HEAD

KVM VMX changes for 6.19:

 - Use the root role from kvm_mmu_page to construct EPTPs instead of the
   current vCPU state, partly as worthwhile cleanup, but mostly to pave the
   way for tracking per-root TLB flushes so that KVM can elide EPT flushes on
   pCPU migration if KVM has flushed the root at least once.

 - Add a few missing nested consistency checks.

 - Rip out support for doing "early" consistency checks via hardware as the
   functionality hasn't been used in years and is no longer useful in general,
   and replace it with an off-by-default module param to detect missed
   consistency checks (i.e. WARN if hardware finds a check that KVM does not).

 - Fix a currently-benign bug where KVM would drop the guest's SPEC_CTRL[63:32]
   on VM-Enter.

 - Misc cleanups.
parents de8e8ebb dfd1572a
Loading
Loading
Loading
Loading
+0 −10
Original line number Diff line number Diff line
@@ -39,16 +39,6 @@
#define INVALID_PAE_ROOT	0
#define IS_VALID_PAE_ROOT(x)	(!!(x))

/*
 * Return the host physical address of KVM's "dummy" root: the PFN reported by
 * my_zero_pfn(0), shifted left by PAGE_SHIFT to turn it into an address.
 *
 * NOTE(review): presumably this is the kernel's shared zero page -- confirm
 * against the definition of my_zero_pfn().
 */
static inline hpa_t kvm_mmu_get_dummy_root(void)
{
	return my_zero_pfn(0) << PAGE_SHIFT;
}

/*
 * Report whether @shadow_page refers to the dummy root, i.e. whether the PFN
 * obtained by shifting the address right by PAGE_SHIFT satisfies
 * is_zero_pfn().
 */
static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
{
	const hpa_t root_pfn = shadow_page >> PAGE_SHIFT;

	return is_zero_pfn(root_pfn);
}

typedef u64 __rcu *tdp_ptep_t;

struct kvm_mmu_page {
+10 −0
Original line number Diff line number Diff line
@@ -246,6 +246,16 @@ static inline int spte_index(u64 *sptep)
 */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

/*
 * Return the host physical address of KVM's "dummy" root: the PFN reported by
 * my_zero_pfn(0), shifted left by PAGE_SHIFT to turn it into an address.
 *
 * NOTE(review): presumably this is the kernel's shared zero page -- confirm
 * against the definition of my_zero_pfn().
 */
static inline hpa_t kvm_mmu_get_dummy_root(void)
{
	return my_zero_pfn(0) << PAGE_SHIFT;
}

/*
 * Report whether @shadow_page refers to the dummy root, i.e. whether the PFN
 * obtained by shifting the address right by PAGE_SHIFT satisfies
 * is_zero_pfn().
 */
static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
{
	const hpa_t root_pfn = shadow_page >> PAGE_SHIFT;

	return is_zero_pfn(root_pfn);
}

static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
	struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
+60 −111
Original line number Diff line number Diff line
@@ -23,8 +23,8 @@
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);
static bool __ro_after_init warn_on_missed_cc;
module_param(warn_on_missed_cc, bool, 0444);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

@@ -555,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
		return -EINVAL;

	return 0;
}

@@ -761,7 +764,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
	kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

@@ -780,7 +783,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
	kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

@@ -2296,15 +2299,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
		return;
	vmx->nested.vmcs02_initialized = true;

	/*
	 * We don't care what the EPTP value is we just need to guarantee
	 * it's valid so we don't get a false positive when doing early
	 * consistency checks.
	 */
	if (enable_ept && nested_early_check)
		vmcs_write64(EPT_POINTER,
			     construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));

	if (vmx->ve_info)
		vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));

@@ -2749,7 +2743,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
		vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
	}

	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
@@ -2961,6 +2955,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
		}
	}

	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) &&
	    CC(!vmcs12->tsc_multiplier))
		return -EINVAL;

	return 0;
}

@@ -3078,6 +3076,38 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
	return 0;
}

static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
	u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;

	/*
	 * Don't bother with the consistency checks if KVM isn't configured to
	 * WARN on missed consistency checks, as KVM needs to rely on hardware
	 * to fully detect an illegal vTPR vs. TRP Threshold combination due to
	 * the vTPR being writable by L1 at all times (it's an in-memory value,
	 * not a VMCS field).  I.e. even if the check passes now, it might fail
	 * at the actual VM-Enter.
	 *
	 * Keying off the module param also allows treating an invalid vAPIC
	 * mapping as a consistency check failure without increasing the risk
	 * of breaking a "real" VM.
	 */
	if (!warn_on_missed_cc)
		return 0;

	if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
	    nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    (CC(!vapic) ||
	     CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
@@ -3333,84 +3363,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
	return 0;
}

/*
 * Perform "early" VM-Entry consistency checks by handing the currently loaded
 * VMCS to hardware via __vmx_vcpu_run(), with GUEST_RFLAGS deliberately
 * zeroed (see the comment at that write for the intent).
 *
 * Returns 0 if nested_early_check is disabled or if the VM-Enter attempt did
 * not VM-Fail, and 1 if __vmx_vcpu_run() reported a VM-Fail (after tracing
 * the VM-instruction error).
 */
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;
	bool vm_fail;

	if (!nested_early_check)
		return 0;

	/*
	 * Temporarily zero the MSR auto-load counts for the duration of the
	 * probe; the real counts are written back after __vmx_vcpu_run().
	 */
	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	preempt_disable();

	vmx_prepare_switch_to_guest(vcpu);

	/*
	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
	 * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
	 * there is no need to preserve other bits or save/restore the field.
	 */
	vmcs_writel(GUEST_RFLAGS, 0);

	/*
	 * Refresh HOST_CR3/HOST_CR4 in the VMCS if the current values differ
	 * from the values cached in loaded_vmcs->host_state, and update the
	 * cache to match.
	 */
	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
				 __vmx_vcpu_run_flags(vmx));

	/* Restore the MSR auto-load counts that were zeroed above. */
	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vm_fail) {
		u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);

		preempt_enable();

		/*
		 * Trace the failure and WARN (once) if the error is anything
		 * other than "invalid control field".
		 */
		trace_kvm_nested_vmenter_failed(
			"early hardware check VM-instruction error: ", error);
		WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
		return 1;
	}

	/*
	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
	 */
	if (hw_breakpoint_active())
		set_debugreg(__this_cpu_read(cpu_dr7), 7);
	local_irq_enable();
	preempt_enable();

	/*
	 * A non-failing VMEntry means we somehow entered guest mode with
	 * an illegal RIP, and that's just the tip of the iceberg.  There
	 * is no telling what memory has been modified or what state has
	 * been exposed to unknown code.  Hitting this all but guarantees
	 * a (very critical) hardware issue.
	 */
	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
		VMX_EXIT_REASONS_FAILED_VMENTRY));

	return 0;
}

#ifdef CONFIG_KVM_HYPERV
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
@@ -3667,22 +3619,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
				    &vmx->nested.pre_vmenter_ssp_tbl);

	/*
	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
	 * nested early checks are disabled.  In the event of a "late" VM-Fail,
	 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
	 * software model to the pre-VMEntry host state.  When EPT is disabled,
	 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
	 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
	 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
	 * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
	 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
	 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
	 * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
	 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
	 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
	 * path would need to manually save/restore vmcs01.GUEST_CR3.
	 */
	if (!enable_ept && !nested_early_check)
	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled.  In the
	 * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
	 * not KVM, KVM must unwind its software model to the pre-VM-Entry host
	 * state.  When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
	 * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
	 * corrupt vcpu->arch.cr3.  Stuffing vmcs01.GUEST_CR3 results in the
	 * unwind naturally setting arch.cr3 to the correct value.  Smashing
	 * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
	 * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
	 * overwritten with a shadow CR3 prior to re-entering L1.
	 */
	if (!enable_ept)
		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);

	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
@@ -3695,7 +3643,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
		}

		if (nested_vmx_check_vmentry_hw(vcpu)) {
		if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_VMFAIL;
		}
@@ -5164,12 +5112,13 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
		/*
		 * The only expected VM-instruction error is "VM entry with
		 * invalid control field(s)." Anything else indicates a
		 * problem with L0.  And we should never get here with a
		 * VMFail of any type if early consistency checks are enabled.
		 * problem with L0.
		 */
		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
		WARN_ON_ONCE(nested_early_check);

		/* VM-Fail at VM-Entry means KVM missed a consistency check. */
		WARN_ON_ONCE(warn_on_missed_cc);
	}

	/*
+10 −20
Original line number Diff line number Diff line
@@ -2706,9 +2706,11 @@ DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,

static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
	struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct kvm_tdx_init_vm *init_vm;
	struct td_params *td_params = NULL;
	u32 nr_user_entries;
	int ret;

	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
@@ -2720,28 +2722,16 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
	if (cmd->flags)
		return -EINVAL;

	init_vm = kmalloc(sizeof(*init_vm) +
			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
			  GFP_KERNEL);
	if (!init_vm)
		return -ENOMEM;

	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
		ret = -EFAULT;
		goto out;
	}
	if (get_user(nr_user_entries, &user_data->cpuid.nent))
		return -EFAULT;

	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
		ret = -E2BIG;
		goto out;
	}
	if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
		return -E2BIG;

	if (copy_from_user(init_vm->cpuid.entries,
			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
		ret = -EFAULT;
		goto out;
	}
	init_vm = memdup_user(user_data,
			      struct_size(user_data, cpuid.entries, nr_user_entries));
	if (IS_ERR(init_vm))
		return PTR_ERR(init_vm);

	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
		ret = -EINVAL;
+15 −5
Original line number Diff line number Diff line
@@ -116,13 +116,23 @@ SYM_FUNC_START(__vmx_vcpu_run)
	 * and vmentry.
	 */
	mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
	movl VMX_spec_ctrl(%_ASM_DI), %edi
	movl PER_CPU_VAR(x86_spec_ctrl_current), %esi
	cmp %edi, %esi
#ifdef CONFIG_X86_64
	mov VMX_spec_ctrl(%rdi), %rdx
	cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
	je .Lspec_ctrl_done
	movl %edx, %eax
	shr $32, %rdx
#else
	mov VMX_spec_ctrl(%edi), %eax
	mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
	xor %eax, %ecx
	mov VMX_spec_ctrl + 4(%edi), %edx
	mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi
	xor %edx, %edi
	or %edi, %ecx
	je .Lspec_ctrl_done
#endif
	mov $MSR_IA32_SPEC_CTRL, %ecx
	xor %edx, %edx
	mov %edi, %eax
	wrmsr

.Lspec_ctrl_done:
Loading