Commit 1a14928e authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge tag 'kvm-x86-misc-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM x86 misc changes for 6.17

 - Preserve the host's DEBUGCTL.FREEZE_IN_SMM (Intel only) when running the
   guest.  Failure to honor FREEZE_IN_SMM can bleed host state into the guest.

 - Explicitly check vmcs12.GUEST_DEBUGCTL on nested VM-Enter (Intel only) to
   prevent L1 from running L2 with features that KVM doesn't support, e.g. BTF.

 - Intercept SPEC_CTRL on AMD if the MSR shouldn't exist according to the
   vCPU's CPUID model.

 - Rework the MSR interception code so that the SVM and VMX APIs are more or
   less identical.

 - Recalculate all MSR intercepts from the "source" on MSR filter changes, and
   drop the dedicated "shadow" bitmaps (and their awful "max" size defines).

 - WARN and reject loading kvm-amd.ko instead of panicking the kernel if the
   nested SVM MSRPM offsets tracker can't handle an MSR.

 - Advertise support for LKGS (Load Kernel GS base), a new instruction that's
   loosely related to FRED, but is supported and enumerated independently.

 - Fix a user-triggerable WARN that syzkaller found by stuffing INIT_RECEIVED,
   a.k.a. WFS, and then putting the vCPU into VMX Root Mode (post-VMXON).  Use
   the same approach KVM uses for dealing with "impossible" emulation when
   running a !URG guest, and simply wait until KVM_RUN to detect that the vCPU
   has architecturally impossible state.

 - Add KVM_X86_DISABLE_EXITS_APERFMPERF to allow disabling interception of
   APERF/MPERF reads, so that a "properly" configured VM can "virtualize"
   APERF/MPERF (with many caveats).

 - Reject KVM_SET_TSC_KHZ if vCPUs have been created, as changing the "default"
   frequency is unsupported for VMs with a "secure" TSC, and there's no known
   use case for changing the default frequency for other VM types.
parents 9de13951 dcbe5a46
Loading
Loading
Loading
Loading
+24 −1
Original line number Diff line number Diff line
@@ -2006,7 +2006,7 @@ frequency is KHz.

If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
be used as a vm ioctl to set the initial tsc frequency of subsequently
created vCPUs.
created vCPUs.  Note, the vm ioctl is only allowed prior to creating vCPUs.

For TSC protected Confidential Computing (CoCo) VMs where TSC frequency
is configured once at VM scope and remains unchanged during VM's
@@ -7851,6 +7851,7 @@ Valid bits in args[0] are::
  #define KVM_X86_DISABLE_EXITS_HLT              (1 << 1)
  #define KVM_X86_DISABLE_EXITS_PAUSE            (1 << 2)
  #define KVM_X86_DISABLE_EXITS_CSTATE           (1 << 3)
  #define KVM_X86_DISABLE_EXITS_APERFMPERF       (1 << 4)

Enabling this capability on a VM provides userspace with a way to no
longer intercept some instructions for improved latency in some
@@ -7861,6 +7862,28 @@ all such vmexits.

Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.

Virtualizing the ``IA32_APERF`` and ``IA32_MPERF`` MSRs requires more
than just disabling APERF/MPERF exits. While both Intel and AMD
document strict usage conditions for these MSRs--emphasizing that only
the ratio of their deltas over a time interval (T0 to T1) is
architecturally defined--simply passing through the MSRs can still
produce an incorrect ratio.

This erroneous ratio can occur if, between T0 and T1:

1. The vCPU thread migrates between logical processors.
2. Live migration or suspend/resume operations take place.
3. Another task shares the vCPU's logical processor.
4. C-states lower than C0 are emulated (e.g., via HLT interception).
5. The guest TSC frequency doesn't match the host TSC frequency.

Due to these complexities, KVM does not automatically associate this
passthrough capability with the guest CPUID bit,
``CPUID.6:ECX.APERFMPERF[bit 0]``. Userspace VMMs that deem this
mechanism adequate for virtualizing the ``IA32_APERF`` and
``IA32_MPERF`` MSRs must set the guest CPUID bit explicitly.


7.14 KVM_CAP_S390_HPAGE_1M
--------------------------

+1 −2
Original line number Diff line number Diff line
@@ -49,7 +49,6 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
@@ -139,7 +138,7 @@ KVM_X86_OP(check_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(recalc_msr_intercepts)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
+15 −7
Original line number Diff line number Diff line
@@ -1408,10 +1408,7 @@ struct kvm_arch {

	gpa_t wall_clock;

	bool mwait_in_guest;
	bool hlt_in_guest;
	bool pause_in_guest;
	bool cstate_in_guest;
	u64 disabled_exits;

	s64 kvmclock_offset;

@@ -1687,6 +1684,12 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
	return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
}

/*
 * Flags for a single guest run.  Each enumerator is defined with BIT(n) for a
 * distinct n, i.e. each occupies its own bit position.
 *
 * NOTE(review): presumably these are the values carried in the u64 run_flags
 * argument of kvm_x86_ops.vcpu_run() and may be OR'd together -- confirm
 * against the callers.
 */
enum kvm_x86_run_flags {
	KVM_RUN_FORCE_IMMEDIATE_EXIT	= BIT(0),
	KVM_RUN_LOAD_GUEST_DR6		= BIT(1),
	KVM_RUN_LOAD_DEBUGCTL		= BIT(2),
};

struct kvm_x86_ops {
	const char *name;

@@ -1715,6 +1718,12 @@ struct kvm_x86_ops {
	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
	void (*vcpu_put)(struct kvm_vcpu *vcpu);

	/*
	 * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
	 * match the host's value even while the guest is active.
	 */
	const u64 HOST_OWNED_DEBUGCTL;

	void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
	int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -1737,7 +1746,6 @@ struct kvm_x86_ops {
	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
	void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
	void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
	void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -1768,7 +1776,7 @@ struct kvm_x86_ops {

	int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
	enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
						  bool force_immediate_exit);
						  u64 run_flags);
	int (*handle_exit)(struct kvm_vcpu *vcpu,
		enum exit_fastpath_completion exit_fastpath);
	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1900,7 +1908,7 @@ struct kvm_x86_ops {
	int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);

	void (*migrate_timers)(struct kvm_vcpu *vcpu);
	void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
	void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu);
	int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);

	void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+1 −0
Original line number Diff line number Diff line
@@ -419,6 +419,7 @@
#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI	(1UL << 12)
#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT	14
#define DEBUGCTLMSR_FREEZE_IN_SMM	(1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
#define DEBUGCTLMSR_RTM_DEBUG		BIT(15)

#define MSR_PEBS_FRONTEND		0x000003f7

+1 −0
Original line number Diff line number Diff line
@@ -979,6 +979,7 @@ void kvm_set_cpu_caps(void)
		F(FSRS),
		F(FSRC),
		F(WRMSRNS),
		X86_64_F(LKGS),
		F(AMX_FP16),
		F(AVX_IFMA),
		F(LAM),
Loading