Commit 3e89d5fd authored by Paolo Bonzini
Browse files

Merge tag 'kvm-x86-vmx-6.16' of https://github.com/kvm-x86/linux into HEAD

KVM VMX changes for 6.16:

 - Explicitly check MSR load/store list counts to fix a potential overflow on
   32-bit kernels.

 - Flush shadow VMCSes on emergency reboot.

 - Revert mem_enc_ioctl() back to an optional hook, as it's nullified when
   SEV or TDX is disabled via Kconfig.

 - Macrofy the handling of vt_x86_ops to eliminate a pile of boilerplate code
   needed for TDX, and to optimize CONFIG_KVM_INTEL_TDX=n builds.
parents 3e0797f6 907092bf
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -127,7 +127,7 @@ KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
#endif
KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
+103 −105
Original line number Diff line number Diff line
@@ -12,7 +12,6 @@

#ifdef CONFIG_KVM_INTEL_TDX
static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt));
#endif

static void vt_disable_virtualization_cpu(void)
{
@@ -240,7 +239,7 @@ static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
	if (is_td_vcpu(vcpu))
		return tdx_complete_emulated_msr(vcpu, err);

	return kvm_complete_insn_gp(vcpu, err);
	return vmx_complete_emulated_msr(vcpu, err);
}

#ifdef CONFIG_KVM_SMM
@@ -315,14 +314,6 @@ static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
	return vmx_set_virtual_apic_mode(vcpu);
}

/*
 * Reset the posted-interrupt descriptor of @vcpu: call pi_clear_on() on the
 * descriptor and then zero its whole PIR array.  Used as the
 * .apicv_pre_state_restore callback in vt_x86_ops.
 */
static void vt_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi = vcpu_to_pi_desc(vcpu);

	pi_clear_on(pi);
	/* sizeof(pi->pir) covers the full array, so every PIR word is wiped. */
	memset(pi->pir, 0, sizeof(pi->pir));
}

static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
	if (is_td_vcpu(vcpu))
@@ -888,6 +879,13 @@ static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
	return 0;
}

/*
 * Helpers for filling in the vt_x86_ops/vt_init_ops initializers.
 *
 * With CONFIG_KVM_INTEL_TDX enabled, both macros resolve to the vt_<name>
 * wrapper.  Without it, vt_op() resolves directly to the vmx_<name>
 * implementation and vt_op_tdx_only() resolves to NULL, i.e. the hook is
 * left unset.
 */
#define vt_op(name) vt_##name
#define vt_op_tdx_only(name) vt_##name
#else /* CONFIG_KVM_INTEL_TDX */
#define vt_op(name) vmx_##name
#define vt_op_tdx_only(name) NULL
#endif /* CONFIG_KVM_INTEL_TDX */

#define VMX_REQUIRED_APICV_INHIBITS				\
	(BIT(APICV_INHIBIT_REASON_DISABLED) |			\
	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
@@ -905,113 +903,113 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
	.hardware_unsetup = vmx_hardware_unsetup,

	.enable_virtualization_cpu = vmx_enable_virtualization_cpu,
	.disable_virtualization_cpu = vt_disable_virtualization_cpu,
	.disable_virtualization_cpu = vt_op(disable_virtualization_cpu),
	.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,

	.has_emulated_msr = vt_has_emulated_msr,
	.has_emulated_msr = vt_op(has_emulated_msr),

	.vm_size = sizeof(struct kvm_vmx),

	.vm_init = vt_vm_init,
	.vm_pre_destroy = vt_vm_pre_destroy,
	.vm_destroy = vt_vm_destroy,
	.vm_init = vt_op(vm_init),
	.vm_destroy = vt_op(vm_destroy),
	.vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy),

	.vcpu_precreate = vt_vcpu_precreate,
	.vcpu_create = vt_vcpu_create,
	.vcpu_free = vt_vcpu_free,
	.vcpu_reset = vt_vcpu_reset,
	.vcpu_precreate = vt_op(vcpu_precreate),
	.vcpu_create = vt_op(vcpu_create),
	.vcpu_free = vt_op(vcpu_free),
	.vcpu_reset = vt_op(vcpu_reset),

	.prepare_switch_to_guest = vt_prepare_switch_to_guest,
	.vcpu_load = vt_vcpu_load,
	.vcpu_put = vt_vcpu_put,
	.prepare_switch_to_guest = vt_op(prepare_switch_to_guest),
	.vcpu_load = vt_op(vcpu_load),
	.vcpu_put = vt_op(vcpu_put),

	.update_exception_bitmap = vt_update_exception_bitmap,
	.update_exception_bitmap = vt_op(update_exception_bitmap),
	.get_feature_msr = vmx_get_feature_msr,
	.get_msr = vt_get_msr,
	.set_msr = vt_set_msr,

	.get_segment_base = vt_get_segment_base,
	.get_segment = vt_get_segment,
	.set_segment = vt_set_segment,
	.get_cpl = vt_get_cpl,
	.get_cpl_no_cache = vt_get_cpl_no_cache,
	.get_cs_db_l_bits = vt_get_cs_db_l_bits,
	.is_valid_cr0 = vt_is_valid_cr0,
	.set_cr0 = vt_set_cr0,
	.is_valid_cr4 = vt_is_valid_cr4,
	.set_cr4 = vt_set_cr4,
	.set_efer = vt_set_efer,
	.get_idt = vt_get_idt,
	.set_idt = vt_set_idt,
	.get_gdt = vt_get_gdt,
	.set_gdt = vt_set_gdt,
	.set_dr6 = vt_set_dr6,
	.set_dr7 = vt_set_dr7,
	.sync_dirty_debug_regs = vt_sync_dirty_debug_regs,
	.cache_reg = vt_cache_reg,
	.get_rflags = vt_get_rflags,
	.set_rflags = vt_set_rflags,
	.get_if_flag = vt_get_if_flag,

	.flush_tlb_all = vt_flush_tlb_all,
	.flush_tlb_current = vt_flush_tlb_current,
	.flush_tlb_gva = vt_flush_tlb_gva,
	.flush_tlb_guest = vt_flush_tlb_guest,

	.vcpu_pre_run = vt_vcpu_pre_run,
	.vcpu_run = vt_vcpu_run,
	.handle_exit = vt_handle_exit,
	.get_msr = vt_op(get_msr),
	.set_msr = vt_op(set_msr),

	.get_segment_base = vt_op(get_segment_base),
	.get_segment = vt_op(get_segment),
	.set_segment = vt_op(set_segment),
	.get_cpl = vt_op(get_cpl),
	.get_cpl_no_cache = vt_op(get_cpl_no_cache),
	.get_cs_db_l_bits = vt_op(get_cs_db_l_bits),
	.is_valid_cr0 = vt_op(is_valid_cr0),
	.set_cr0 = vt_op(set_cr0),
	.is_valid_cr4 = vt_op(is_valid_cr4),
	.set_cr4 = vt_op(set_cr4),
	.set_efer = vt_op(set_efer),
	.get_idt = vt_op(get_idt),
	.set_idt = vt_op(set_idt),
	.get_gdt = vt_op(get_gdt),
	.set_gdt = vt_op(set_gdt),
	.set_dr6 = vt_op(set_dr6),
	.set_dr7 = vt_op(set_dr7),
	.sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs),
	.cache_reg = vt_op(cache_reg),
	.get_rflags = vt_op(get_rflags),
	.set_rflags = vt_op(set_rflags),
	.get_if_flag = vt_op(get_if_flag),

	.flush_tlb_all = vt_op(flush_tlb_all),
	.flush_tlb_current = vt_op(flush_tlb_current),
	.flush_tlb_gva = vt_op(flush_tlb_gva),
	.flush_tlb_guest = vt_op(flush_tlb_guest),

	.vcpu_pre_run = vt_op(vcpu_pre_run),
	.vcpu_run = vt_op(vcpu_run),
	.handle_exit = vt_op(handle_exit),
	.skip_emulated_instruction = vmx_skip_emulated_instruction,
	.update_emulated_instruction = vmx_update_emulated_instruction,
	.set_interrupt_shadow = vt_set_interrupt_shadow,
	.get_interrupt_shadow = vt_get_interrupt_shadow,
	.patch_hypercall = vt_patch_hypercall,
	.inject_irq = vt_inject_irq,
	.inject_nmi = vt_inject_nmi,
	.inject_exception = vt_inject_exception,
	.cancel_injection = vt_cancel_injection,
	.interrupt_allowed = vt_interrupt_allowed,
	.nmi_allowed = vt_nmi_allowed,
	.get_nmi_mask = vt_get_nmi_mask,
	.set_nmi_mask = vt_set_nmi_mask,
	.enable_nmi_window = vt_enable_nmi_window,
	.enable_irq_window = vt_enable_irq_window,
	.update_cr8_intercept = vt_update_cr8_intercept,
	.set_interrupt_shadow = vt_op(set_interrupt_shadow),
	.get_interrupt_shadow = vt_op(get_interrupt_shadow),
	.patch_hypercall = vt_op(patch_hypercall),
	.inject_irq = vt_op(inject_irq),
	.inject_nmi = vt_op(inject_nmi),
	.inject_exception = vt_op(inject_exception),
	.cancel_injection = vt_op(cancel_injection),
	.interrupt_allowed = vt_op(interrupt_allowed),
	.nmi_allowed = vt_op(nmi_allowed),
	.get_nmi_mask = vt_op(get_nmi_mask),
	.set_nmi_mask = vt_op(set_nmi_mask),
	.enable_nmi_window = vt_op(enable_nmi_window),
	.enable_irq_window = vt_op(enable_irq_window),
	.update_cr8_intercept = vt_op(update_cr8_intercept),

	.x2apic_icr_is_split = false,
	.set_virtual_apic_mode = vt_set_virtual_apic_mode,
	.set_apic_access_page_addr = vt_set_apic_access_page_addr,
	.refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl,
	.load_eoi_exitmap = vt_load_eoi_exitmap,
	.apicv_pre_state_restore = vt_apicv_pre_state_restore,
	.set_virtual_apic_mode = vt_op(set_virtual_apic_mode),
	.set_apic_access_page_addr = vt_op(set_apic_access_page_addr),
	.refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl),
	.load_eoi_exitmap = vt_op(load_eoi_exitmap),
	.apicv_pre_state_restore = pi_apicv_pre_state_restore,
	.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
	.hwapic_isr_update = vt_hwapic_isr_update,
	.sync_pir_to_irr = vt_sync_pir_to_irr,
	.deliver_interrupt = vt_deliver_interrupt,
	.hwapic_isr_update = vt_op(hwapic_isr_update),
	.sync_pir_to_irr = vt_op(sync_pir_to_irr),
	.deliver_interrupt = vt_op(deliver_interrupt),
	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,

	.set_tss_addr = vt_set_tss_addr,
	.set_identity_map_addr = vt_set_identity_map_addr,
	.set_tss_addr = vt_op(set_tss_addr),
	.set_identity_map_addr = vt_op(set_identity_map_addr),
	.get_mt_mask = vmx_get_mt_mask,

	.get_exit_info = vt_get_exit_info,
	.get_entry_info = vt_get_entry_info,
	.get_exit_info = vt_op(get_exit_info),
	.get_entry_info = vt_op(get_entry_info),

	.vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid,
	.vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid),

	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

	.get_l2_tsc_offset = vt_get_l2_tsc_offset,
	.get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier,
	.write_tsc_offset = vt_write_tsc_offset,
	.write_tsc_multiplier = vt_write_tsc_multiplier,
	.get_l2_tsc_offset = vt_op(get_l2_tsc_offset),
	.get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier),
	.write_tsc_offset = vt_op(write_tsc_offset),
	.write_tsc_multiplier = vt_op(write_tsc_multiplier),

	.load_mmu_pgd = vt_load_mmu_pgd,
	.load_mmu_pgd = vt_op(load_mmu_pgd),

	.check_intercept = vmx_check_intercept,
	.handle_exit_irqoff = vmx_handle_exit_irqoff,

	.update_cpu_dirty_logging = vt_update_cpu_dirty_logging,
	.update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging),

	.nested_ops = &vmx_nested_ops,

@@ -1019,38 +1017,38 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
	.pi_start_assignment = vmx_pi_start_assignment,

#ifdef CONFIG_X86_64
	.set_hv_timer = vt_set_hv_timer,
	.cancel_hv_timer = vt_cancel_hv_timer,
	.set_hv_timer = vt_op(set_hv_timer),
	.cancel_hv_timer = vt_op(cancel_hv_timer),
#endif

	.setup_mce = vt_setup_mce,
	.setup_mce = vt_op(setup_mce),

#ifdef CONFIG_KVM_SMM
	.smi_allowed = vt_smi_allowed,
	.enter_smm = vt_enter_smm,
	.leave_smm = vt_leave_smm,
	.enable_smi_window = vt_enable_smi_window,
	.smi_allowed = vt_op(smi_allowed),
	.enter_smm = vt_op(enter_smm),
	.leave_smm = vt_op(leave_smm),
	.enable_smi_window = vt_op(enable_smi_window),
#endif

	.check_emulate_instruction = vt_check_emulate_instruction,
	.apic_init_signal_blocked = vt_apic_init_signal_blocked,
	.check_emulate_instruction = vt_op(check_emulate_instruction),
	.apic_init_signal_blocked = vt_op(apic_init_signal_blocked),
	.migrate_timers = vmx_migrate_timers,

	.msr_filter_changed = vt_msr_filter_changed,
	.complete_emulated_msr = vt_complete_emulated_msr,
	.msr_filter_changed = vt_op(msr_filter_changed),
	.complete_emulated_msr = vt_op(complete_emulated_msr),

	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,

	.get_untagged_addr = vmx_get_untagged_addr,

	.mem_enc_ioctl = vt_mem_enc_ioctl,
	.vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
	.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
	.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),

	.private_max_mapping_level = vt_gmem_private_max_mapping_level
	.private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
};

struct kvm_x86_init_ops vt_init_ops __initdata = {
	.hardware_setup = vt_hardware_setup,
	.hardware_setup = vt_op(hardware_setup),
	.handle_intel_pt_intr = NULL,

	.runtime_ops = &vt_x86_ops,
+20 −11
Original line number Diff line number Diff line
@@ -824,12 +824,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
	return 0;
}

/*
 * Return the maximum number of entries allowed in an atomic-switch MSR list
 * for @vcpu.  The limit is derived from the vCPU's nested VMX "misc" MSR
 * value (low and high halves combined via vmx_control_msr()): the value
 * extracted by vmx_misc_max_msr(), plus one, scaled by
 * VMX_MISC_MSR_LIST_MULTIPLIER.
 */
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	/*
	 * Exceeding the limit results in architecturally _undefined_ behavior,
	 * i.e. KVM is allowed to do literally anything in response to a bad
	 * limit.  Immediately generate a consistency check so that code that
	 * consumes the count doesn't need to worry about extreme edge cases.
	 */
	if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
		return -EINVAL;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;
@@ -940,15 +958,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
	return 0;
}

/*
 * Return the maximum number of entries allowed in an atomic-switch MSR list
 * for @vcpu.  The limit is derived from the vCPU's nested VMX "misc" MSR
 * value (low and high halves combined via vmx_control_msr()): the value
 * extracted by vmx_misc_max_msr(), plus one, scaled by
 * VMX_MISC_MSR_LIST_MULTIPLIER.
 */
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
@@ -965,7 +974,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
@@ -1053,7 +1062,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+9 −1
Original line number Diff line number Diff line
@@ -34,7 +34,7 @@ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);

#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING

struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
	return &(to_vt(vcpu)->pi_desc);
}
@@ -263,6 +263,14 @@ void __init pi_init_cpu(int cpu)
	raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}

/*
 * Reset the posted-interrupt descriptor of @vcpu: call pi_clear_on() on the
 * descriptor and then zero its whole PIR array.  Installed as the
 * .apicv_pre_state_restore callback in vt_x86_ops.
 */
void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi = vcpu_to_pi_desc(vcpu);

	pi_clear_on(pi);
	/* sizeof(pi->pir) covers the full array, so every PIR word is wiped. */
	memset(pi->pir, 0, sizeof(pi->pir));
}

bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+1 −2
Original line number Diff line number Diff line
@@ -5,12 +5,11 @@
#include <linux/bitmap.h>
#include <asm/posted_intr.h>

struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu);

void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
		       uint32_t guest_irq, bool set);
Loading