Commit e9025cdd authored by Paolo Bonzini's avatar Paolo Bonzini
Browse files

Merge tag 'kvm-x86-pmu-6.9' of https://github.com/kvm-x86/linux into HEAD

KVM x86 PMU changes for 6.9:

 - Fix several bugs where KVM speciously prevents the guest from utilizing
   fixed counters and architectural event encodings based on whether or not
   guest CPUID reports support for the _architectural_ encoding.

 - Fix a variety of bugs in KVM's emulation of RDPMC, e.g. for "fast" reads,
   priority of VMX interception vs #GP, PMC types in architectural PMUs, etc.

 - Add a selftest to verify KVM correctly emulates RDPMC, counter availability,
   and a variety of other PMC-related behaviors that depend on guest CPUID,
   i.e. are difficult to validate via KVM-Unit-Tests.

 - Zero out PMU metadata on AMD if the virtual PMU is disabled to avoid wasting
   cycles, e.g. when checking if a PMC event needs to be synthesized when
   skipping an instruction.

 - Optimize triggering of emulated events, e.g. for "count instructions" events
   when skipping an instruction, which yields a ~10% performance improvement in
   VM-Exit microbenchmarks when a vPMU is exposed to the guest.

 - Tighten the check for "PMI in guest" to reduce false positives if an NMI
   arrives in the host while KVM is handling an IRQ VM-Exit.
parents b00471a5 812d4323
Loading
Loading
Loading
Loading
+1 −3
Original line number Diff line number Diff line
@@ -12,11 +12,9 @@ BUILD_BUG_ON(1)
 * a NULL definition, for example if "static_call_cond()" will be used
 * at the call sites.
 */
KVM_X86_PMU_OP(hw_event_available)
KVM_X86_PMU_OP(pmc_idx_to_pmc)
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
KVM_X86_PMU_OP(msr_idx_to_pmc)
KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early)
KVM_X86_PMU_OP(is_valid_msr)
KVM_X86_PMU_OP(get_msr)
KVM_X86_PMU_OP(set_msr)
+10 −1
Original line number Diff line number Diff line
@@ -536,6 +536,7 @@ struct kvm_pmc {
#define KVM_PMC_MAX_FIXED	3
#define MSR_ARCH_PERFMON_FIXED_CTR_MAX	(MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC	6

struct kvm_pmu {
	u8 version;
	unsigned nr_arch_gp_counters;
@@ -1889,8 +1890,16 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
}
#endif /* CONFIG_HYPERV */

/*
 * Kind of interrupt being handled on behalf of the guest.  The specific
 * values carry no meaning, but zero must stay unused: kvm_arch_pmi_in_guest()
 * treats a zero handling_intr_from_guest as "not handling anything".
 */
enum kvm_intr_type {
	KVM_HANDLING_IRQ = 1,
	KVM_HANDLING_NMI = 2,
};

/* Enable perf NMI and timer modes to work, and minimise false positives. */
#define kvm_arch_pmi_in_guest(vcpu) \
	((vcpu) && (vcpu)->arch.handling_intr_from_guest)
	((vcpu) && (vcpu)->arch.handling_intr_from_guest && \
	 (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI)))

void __init kvm_mmu_x86_module_init(void);
int kvm_mmu_vendor_module_init(void);
+1 −1
Original line number Diff line number Diff line
@@ -3955,7 +3955,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
	 * protected mode.
	 */
	if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
	    ctxt->ops->check_pmc(ctxt, rcx))
	    ctxt->ops->check_rdpmc_early(ctxt, rcx))
		return emulate_gp(ctxt, 0);

	return X86EMUL_CONTINUE;
+1 −1
Original line number Diff line number Diff line
@@ -208,7 +208,7 @@ struct x86_emulate_ops {
	int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
	int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
	int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
	int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
	int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc);
	int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
	void (*halt)(struct x86_emulate_ctxt *ctxt);
	void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
+103 −60
Original line number Diff line number Diff line
@@ -29,6 +29,9 @@
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
@@ -67,7 +70,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
 *        all perf counters (both gp and fixed). The mapping relationship
 *        between pmc and perf counters is as the following:
 *        * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *                 [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *                 [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *          and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */
@@ -411,7 +414,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
				   int idx)
{
	int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

	if (filter->action == KVM_PMU_EVENT_DENY &&
	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
@@ -441,11 +444,10 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
	return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
	       static_call(kvm_x86_pmu_hw_event_available)(pmc) &&
	       check_pmu_event_filter(pmc);
}

static void reprogram_counter(struct kvm_pmc *pmc)
static int reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
@@ -456,7 +458,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
	emulate_overflow = pmc_pause_counter(pmc);

	if (!pmc_event_is_allowed(pmc))
		goto reprogram_complete;
		return 0;

	if (emulate_overflow)
		__kvm_perf_overflow(pmc, false);
@@ -466,7 +468,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)

	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - INTEL_PMC_IDX_FIXED);
						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		if (fixed_ctr_ctrl & 0x1)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & 0x2)
@@ -477,43 +479,45 @@ static void reprogram_counter(struct kvm_pmc *pmc)
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		goto reprogram_complete;
		return 0;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	/*
	 * If reprogramming fails, e.g. due to contention, leave the counter's
	 * reprogram bit set, i.e. opportunistically try again on the next PMU
	 * refresh.  Don't make a new request as doing so can stall the guest
	 * if reprogramming repeatedly fails.
	 */
	if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				     (eventsel & pmu->raw_event_mask),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				  eventsel & ARCH_PERFMON_EVENTSEL_INT))
		return;

reprogram_complete:
	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int bit;

	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

		if (unlikely(!pmc)) {
			clear_bit(bit, pmu->reprogram_pmi);
			continue;
		}
	/*
	 * The reprogramming bitmap can be written asynchronously by something
	 * other than the task that holds vcpu->mutex, take care to clear only
	 * the bits that will actually be processed.
	 */
	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

		reprogram_counter(pmc);
	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
		/*
		 * If reprogramming fails, e.g. due to contention, re-set the
		 * reprogram bit, i.e. opportunistically try again on the
		 * next PMU refresh.  Don't make a new request as doing so can
		 * stall the guest if reprogramming repeatedly fails.
		 */
		if (reprogram_counter(pmc))
			set_bit(pmc->idx, pmu->reprogram_pmi);
	}

	/*
@@ -525,10 +529,20 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
		kvm_pmu_cleanup(vcpu);
}

/* check if idx is a valid index to access PMU */
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
	return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
	/*
	 * On Intel, VMX interception has priority over RDPMC exceptions that
	 * aren't already handled by the emulator, i.e. there are no additional
	 * checks needed for Intel PMUs.
	 *
	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
	 */
	if (!kvm_pmu_ops.check_rdpmc_early)
		return 0;

	return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -567,10 +581,9 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;
	u64 mask = ~0ull;

	if (!pmu->version)
		return 1;
@@ -716,11 +729,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)

	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
		if (!pmc)
			continue;

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		pmc_stop_counter(pmc);
		pmc->counter = 0;
		pmc->emulated_counter = 0;
@@ -741,6 +750,8 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
		return;

@@ -750,7 +761,21 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
	 */
	kvm_pmu_reset(vcpu);

	bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
	pmu->version = 0;
	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;
	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
	pmu->global_ctrl_mask = ~0ull;
	pmu->global_status_mask = ~0ull;
	pmu->fixed_ctr_ctrl_mask = ~0ull;
	pmu->pebs_enable_mask = ~0ull;
	pmu->pebs_data_cfg_mask = ~0ull;
	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

	if (vcpu->kvm->arch.enable_pmu)
		static_call(kvm_x86_pmu_refresh)(vcpu);
}

@@ -776,10 +801,8 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);

		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
		if (pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

@@ -799,13 +822,6 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
	kvm_pmu_request_counter_reprogram(pmc);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
	unsigned int perf_hw_id)
{
	return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
		AMD64_RAW_EVENT_MASK_NB);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
@@ -817,28 +833,55 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - INTEL_PMC_IDX_FIXED);
					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	/*
	 * Skip the CPL lookup, which isn't free on Intel, if the result will
	 * be the same regardless of the CPL.
	 */
	if (select_os == select_user)
		return select_os;

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
	BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);

		if (!pmc || !pmc_event_is_allowed(pmc))
	if (!kvm_pmu_has_perf_global_ctrl(pmu))
		bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
	else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
			     (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
		return;

	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
		/*
		 * Ignore checks for edge detect (all events currently emulated
		 * by KVM are always rising edges), pin control (unsupported
		 * by modern CPUs), and counter mask and its invert flag (KVM
		 * doesn't emulate multiple events in a single clock cycle).
		 *
		 * Note, the uppermost nibble of AMD's mask overlaps Intel's
		 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
		 * bits (bits 35:34).  Checking the "in HLE/RTM transaction"
		 * flags is correct as the vCPU can't be in a transaction if
		 * KVM is emulating an instruction.  Checking the reserved bits
		 * might be wrong if they are defined in the future, but so
		 * could ignoring them, so do the simple thing for now.
		 */
		if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
		    !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
			continue;

		/* Ignore checks for edge detect, pin control, invert and CMASK bits */
		if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
		kvm_pmu_incr_counter(pmc);
	}
}
Loading