Commit 766331f2 authored by Linus Torvalds
Browse files

Merge tag 'perf-urgent-2025-02-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf event fixes from Ingo Molnar:
 "Miscellaneous perf events fixes and a minor HW enablement change:

   - Fix missing RCU protection in perf_iterate_ctx()

   - Fix pmu_ctx_list ordering bug

   - Reject the zero page in uprobes

   - Fix a family of bugs related to low frequency sampling

   - Add Intel Arrow Lake U CPUs to the generic Arrow Lake RAPL support
     table

   - Fix a lockdep-assert false positive in uretprobes"

* tag 'perf-urgent-2025-02-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  uprobes: Remove too strict lockdep_assert() condition in hprobe_expire()
  perf/x86/rapl: Add support for Intel Arrow Lake U
  perf/x86/intel: Use better start period for frequency mode
  perf/core: Fix low freq setting via IOC_PERIOD
  perf/x86: Fix low freqency setting issue
  uprobes: Reject the shared zeropage in uprobe_write_opcode()
  perf/core: Order the PMU list to fix warning about unordered pmu_ctx_list
  perf/core: Add RCU read lock protection to perf_iterate_ctx()
parents ad69e021 f8c85723
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -628,7 +628,7 @@ int x86_pmu_hw_config(struct perf_event *event)
	if (event->attr.type == event->pmu->type)
		event->hw.config |= x86_pmu_get_event_config(event);

	if (event->attr.sample_period && x86_pmu.limit_period) {
	if (!event->attr.freq && x86_pmu.limit_period) {
		s64 left = event->attr.sample_period;
		x86_pmu.limit_period(event, &left);
		if (left > event->attr.sample_period)
+85 −0
Original line number Diff line number Diff line
@@ -3952,6 +3952,85 @@ static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
	return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
}

/*
 * Estimate the initial sampling period for an event opened in frequency
 * mode.
 *
 * A per-event-class factor is divided by the requested sample frequency;
 * the result is capped at x86_pmu.max_period and, when the PMU provides a
 * limit_period() callback, passed through it before being returned.
 *
 * event->attr.sample_freq is used as a divisor and is not checked here.
 */
static u64 intel_pmu_freq_start_period(struct perf_event *event)
{
	/*
	 * Fallback factor, used for every event that is not classified
	 * below.
	 *
	 * 127 is the lowest possible recommended SAV (sample after value)
	 * for a 4000 freq (the default freq), according to the event list
	 * JSON file. Also, assume the workload is idle 50% of the time.
	 */
	u64 factor = 64 * 4000;
	s64 period;

	/*
	 * The estimation of the start period in freq mode is based on
	 * the assumptions below.
	 *
	 * For a cycles or an instructions event: a 1 GHz underlying
	 * platform, 1 IPC, and a workload that is idle 50% of the time.
	 * The start period = 1,000,000,000 * 1 / freq / 2
	 *		    = 500,000,000 / freq
	 *
	 * Branch-related events usually occur less often than the
	 * instructions event. According to the Intel event list JSON
	 * file, the SAV (sample after value) of a branch-related event
	 * is usually 1/4 of that of an instructions event.
	 * The start period of branch-related events = 125,000,000 / freq.
	 *
	 * Cache-related events occur even less often. Their SAV is
	 * usually 1/20 of that of an instructions event.
	 * The start period of cache-related events = 25,000,000 / freq.
	 */
	switch (event->attr.type) {
	case PERF_TYPE_HARDWARE:
		switch (event->attr.config & PERF_HW_EVENT_MASK) {
		case PERF_COUNT_HW_CPU_CYCLES:
		case PERF_COUNT_HW_INSTRUCTIONS:
		case PERF_COUNT_HW_BUS_CYCLES:
		case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND:
		case PERF_COUNT_HW_STALLED_CYCLES_BACKEND:
		case PERF_COUNT_HW_REF_CPU_CYCLES:
			factor = 500000000;
			break;
		case PERF_COUNT_HW_BRANCH_INSTRUCTIONS:
		case PERF_COUNT_HW_BRANCH_MISSES:
			factor = 125000000;
			break;
		case PERF_COUNT_HW_CACHE_REFERENCES:
		case PERF_COUNT_HW_CACHE_MISSES:
			factor = 25000000;
			break;
		default:
			/* Unclassified generic event: keep the fallback factor. */
			break;
		}
		break;
	case PERF_TYPE_HW_CACHE:
		factor = 25000000;
		break;
	default:
		/* Any other event type: keep the fallback factor. */
		break;
	}

	/*
	 * Usually, a prime or a number with few factors (close to prime)
	 * is chosen as an SAV, which makes it less likely that the sampling
	 * period synchronizes with some periodic event in the workload.
	 * Subtract 1 so that, for the default freq at least, values near
	 * powers of two are avoided.
	 */
	period = DIV_ROUND_UP_ULL(factor, event->attr.sample_freq) - 1;

	/* Never start with more than the PMU's maximum period. */
	if (period > x86_pmu.max_period)
		period = x86_pmu.max_period;

	/* Let the PMU-specific hook adjust the period when one is installed. */
	if (x86_pmu.limit_period)
		x86_pmu.limit_period(event, &period);

	return period;
}

static int intel_pmu_hw_config(struct perf_event *event)
{
	int ret = x86_pmu_hw_config(event);
@@ -3963,6 +4042,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
	if (ret)
		return ret;

	if (event->attr.freq && event->attr.sample_freq) {
		event->hw.sample_period = intel_pmu_freq_start_period(event);
		event->hw.last_period = event->hw.sample_period;
		local64_set(&event->hw.period_left, event->hw.sample_period);
	}

	if (event->attr.precise_ip) {
		if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT)
			return -EINVAL;
+1 −0
Original line number Diff line number Diff line
@@ -879,6 +879,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_VFM(INTEL_METEORLAKE_L,	&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE_H,	&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE,		&model_skl),
	X86_MATCH_VFM(INTEL_ARROWLAKE_U,	&model_skl),
	X86_MATCH_VFM(INTEL_LUNARLAKE_M,	&model_skl),
	{},
};
+20 −11
Original line number Diff line number Diff line
@@ -4950,7 +4950,7 @@ static struct perf_event_pmu_context *
find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
		     struct perf_event *event)
{
	struct perf_event_pmu_context *new = NULL, *epc;
	struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
	void *task_ctx_data = NULL;

	if (!ctx->task) {
@@ -5007,12 +5007,19 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
			atomic_inc(&epc->refcount);
			goto found_epc;
		}
		/* Make sure the pmu_ctx_list is sorted by PMU type: */
		if (!pos && epc->pmu->type > pmu->type)
			pos = epc;
	}

	epc = new;
	new = NULL;

	list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
	if (!pos)
		list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
	else
		list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev);

	epc->ctx = ctx;

found_epc:
@@ -5962,14 +5969,15 @@ static int _perf_event_period(struct perf_event *event, u64 value)
	if (!value)
		return -EINVAL;

	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
	if (event->attr.freq) {
		if (value > sysctl_perf_event_sample_rate)
			return -EINVAL;

	} else {
		if (perf_event_check_period(event, value))
			return -EINVAL;

	if (!event->attr.freq && (value & (1ULL << 63)))
		if (value & (1ULL << 63))
			return -EINVAL;
	}

	event_function_call(event, __perf_event_period, &value);

@@ -8321,6 +8329,7 @@ void perf_event_exec(void)

	perf_event_enable_on_exec(ctx);
	perf_event_remove_on_exec(ctx);
	scoped_guard(rcu)
		perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);

	perf_unpin_context(ctx);
+12 −3
Original line number Diff line number Diff line
@@ -495,6 +495,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
	if (ret <= 0)
		goto put_old;

	if (is_zero_page(old_page)) {
		ret = -EINVAL;
		goto put_old;
	}

	if (WARN(!is_register && PageCompound(old_page),
		 "uprobe unregister should never work on compound page\n")) {
		ret = -EINVAL;
@@ -762,10 +767,14 @@ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
	enum hprobe_state hstate;

	/*
	 * return_instance's hprobe is protected by RCU.
	 * Underlying uprobe is itself protected from reuse by SRCU.
	 * Caller should guarantee that return_instance is not going to be
	 * freed from under us. This can be achieved either through holding
	 * rcu_read_lock() or by owning return_instance in the first place.
	 *
	 * Underlying uprobe is itself protected from reuse by SRCU, so ensure
	 * SRCU lock is held properly.
	 */
	lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu));
	lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));

	hstate = READ_ONCE(hprobe->state);
	switch (hstate) {