Commit 327ecdbc authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'perf-core-2025-03-22' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull performance events updates from Ingo Molnar:
 "Core:
   - Move perf_event sysctls into kernel/events/ (Joel Granados)
   - Use POLLHUP for pinned events in error (Namhyung Kim)
   - Avoid the read if the count is already updated (Peter Zijlstra)
   - Allow the EPOLLRDNORM flag for poll (Tao Chen)
   - locking/percpu-rwsem: Add guard support [ NOTE: this got
     (mis-)merged into the perf tree due to related work ] (Peter
     Zijlstra)

  perf_pmu_unregister() related improvements: (Peter Zijlstra)
   - Simplify the perf_event_alloc() error path
   - Simplify the perf_pmu_register() error path
   - Simplify perf_pmu_register()
   - Simplify perf_init_event()
   - Simplify perf_event_alloc()
   - Merge struct pmu::pmu_disable_count into struct
     perf_cpu_pmu_context::pmu_disable_count
   - Add this_cpc() helper
   - Introduce perf_free_addr_filters()
   - Robustify perf_event_free_bpf_prog()
   - Simplify the perf_mmap() control flow
   - Further simplify perf_mmap()
   - Remove retry loop from perf_mmap()
   - Lift event->mmap_mutex in perf_mmap()
   - Detach 'struct perf_cpu_pmu_context' and 'struct pmu' lifetimes
   - Fix perf_mmap() failure path

  Uprobes:
   - Harden x86 uretprobe syscall trampoline check (Jiri Olsa)
   - Remove redundant spinlock in uprobe_deny_signal() (Liao Chang)
   - Remove the spinlock within handle_singlestep() (Liao Chang)

  x86 Intel PMU enhancements:
   - Support PEBS counters snapshotting (Kan Liang)
   - Fix intel_pmu_read_event() (Kan Liang)
   - Extend per event callchain limit to branch stack (Kan Liang)
   - Fix system-wide LBR profiling (Kan Liang)
   - Allocate bts_ctx only if necessary (Li RongQing)
   - Apply static call for drain_pebs (Peter Zijlstra)

  x86 AMD PMU enhancements: (Ravi Bangoria)
   - Remove pointless sample period check
   - Fix ->config to sample period calculation for OP PMU
   - Fix perf_ibs_op.cnt_mask for CurCnt
   - Don't allow freq mode event creation through ->config interface
   - Add PMU specific minimum period
   - Add ->check_period() callback
   - Ceil sample_period to min_period
   - Add support for OP Load Latency Filtering
   - Update DTLB/PageSize decode logic

  Hardware breakpoints:
   - Return EOPNOTSUPP for unsupported breakpoint type (Saket Kumar
     Bhaskar)

  Hardlockup detector improvements: (Li Huafei)
   - perf_event memory leak
   - Warn if watchdog_ev is leaked

  Fixes and cleanups:
   - Misc fixes and cleanups (Andy Shevchenko, Kan Liang, Peter
     Zijlstra, Ravi Bangoria, Thorsten Blum, XieLudan)"

* tag 'perf-core-2025-03-22' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (55 commits)
  perf: Fix __percpu annotation
  perf: Clean up pmu specific data
  perf/x86: Remove swap_task_ctx()
  perf/x86/lbr: Fix shorter LBRs call stacks for the system-wide mode
  perf: Supply task information to sched_task()
  perf: attach/detach PMU specific data
  locking/percpu-rwsem: Add guard support
  perf: Save PMU specific data in task_struct
  perf: Extend per event callchain limit to branch stack
  perf/ring_buffer: Allow the EPOLLRDNORM flag for poll
  perf/core: Use POLLHUP for pinned events in error
  perf/core: Use sysfs_emit() instead of scnprintf()
  perf/core: Remove optional 'size' arguments from strscpy() calls
  perf/x86/intel/bts: Check if bts_ctx is allocated when calling BTS functions
  uprobes/x86: Harden uretprobe syscall trampoline check
  watchdog/hardlockup/perf: Warn if watchdog_ev is leaked
  watchdog/hardlockup/perf: Fix perf_event memory leak
  perf/x86: Annotate struct bts_buffer::buf with __counted_by()
  perf/core: Clean up perf_try_init_event()
  perf/core: Fix perf_mmap() failure path
  ...
parents 32b22538 12e766d1
Loading
Loading
Loading
Loading
+6 −2
Original line number Diff line number Diff line
@@ -132,7 +132,10 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)

static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {}
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
				 struct task_struct *task, bool sched_in)
{
}
static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
static void pmao_restore_workaround(bool ebb) { }
#endif /* CONFIG_PPC32 */
@@ -444,7 +447,8 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
/* Called from ctxsw to prevent one process's branch entries to
 * mingle with the other process's entries during context switch.
 */
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
				 struct task_struct *task, bool sched_in)
{
	if (!ppmu->bhrb_nr)
		return;
+2 −1
Original line number Diff line number Diff line
@@ -518,7 +518,8 @@ static void paicrypt_have_samples(void)
/* Called on schedule-in and schedule-out. No access to event structure,
 * but for sampling only event CRYPTO_ALL is allowed.
 */
static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx,
				struct task_struct *task, bool sched_in)
{
	/* We started with a clean page on event installation. So read out
	 * results on schedule_out and if page was dirty, save old values.
+2 −1
Original line number Diff line number Diff line
@@ -542,7 +542,8 @@ static void paiext_have_samples(void)
/* Called on schedule-in and schedule-out. No access to event structure,
 * but for sampling only event NNPA_ALL is allowed.
 */
static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx,
			      struct task_struct *task, bool sched_in)
{
	/* We started with a clean page on event installation. So read out
	 * results on schedule_out and if page was dirty, save old values.
+2 −1
Original line number Diff line number Diff line
@@ -381,7 +381,8 @@ static void amd_brs_poison_buffer(void)
 * On ctxswin, sched_in = true, called after the PMU has started
 * On ctxswout, sched_in = false, called before the PMU is stopped
 */
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx,
			    struct task_struct *task, bool sched_in)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

+175 −31
Original line number Diff line number Diff line
@@ -28,9 +28,6 @@ static u32 ibs_caps;
#include <asm/nmi.h>
#include <asm/amd-ibs.h>

#define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT

/* attr.config2 */
#define IBS_SW_FILTER_MASK	1

@@ -89,6 +86,7 @@ struct perf_ibs {
	u64				cnt_mask;
	u64				enable_mask;
	u64				valid_mask;
	u16				min_period;
	u64				max_period;
	unsigned long			offset_mask[1];
	int				offset_max;
@@ -270,11 +268,19 @@ static int validate_group(struct perf_event *event)
	return 0;
}

static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs,
				 struct perf_event *event)
{
	return perf_ibs == &perf_ibs_op &&
	       (ibs_caps & IBS_CAPS_OPLDLAT) &&
	       (event->attr.config1 & 0xFFF);
}

static int perf_ibs_init(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	struct perf_ibs *perf_ibs;
	u64 max_cnt, config;
	u64 config;
	int ret;

	perf_ibs = get_ibs_pmu(event->attr.type);
@@ -310,25 +316,47 @@ static int perf_ibs_init(struct perf_event *event)
		if (config & perf_ibs->cnt_mask)
			/* raw max_cnt may not be set */
			return -EINVAL;
		if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
			/*
			 * lower 4 bits can not be set in ibs max cnt,
			 * but allowing it in case we adjust the
			 * sample period to set a frequency.
			 */
			return -EINVAL;

		if (event->attr.freq) {
			hwc->sample_period = perf_ibs->min_period;
		} else {
			/* Silently mask off lower nibble. IBS hw mandates it. */
			hwc->sample_period &= ~0x0FULL;
		if (!hwc->sample_period)
			hwc->sample_period = 0x10;
			if (hwc->sample_period < perf_ibs->min_period)
				return -EINVAL;
		}
	} else {
		u64 period = 0;

		if (event->attr.freq)
			return -EINVAL;

		if (perf_ibs == &perf_ibs_op) {
			period = (config & IBS_OP_MAX_CNT) << 4;
			if (ibs_caps & IBS_CAPS_OPCNTEXT)
				period |= config & IBS_OP_MAX_CNT_EXT_MASK;
		} else {
		max_cnt = config & perf_ibs->cnt_mask;
			period = (config & IBS_FETCH_MAX_CNT) << 4;
		}

		config &= ~perf_ibs->cnt_mask;
		event->attr.sample_period = max_cnt << 4;
		hwc->sample_period = event->attr.sample_period;
		event->attr.sample_period = period;
		hwc->sample_period = period;

		if (hwc->sample_period < perf_ibs->min_period)
			return -EINVAL;
	}

	if (!hwc->sample_period)
	if (perf_ibs_ldlat_event(perf_ibs, event)) {
		u64 ldlat = event->attr.config1 & 0xFFF;

		if (ldlat < 128 || ldlat > 2048)
			return -EINVAL;
		ldlat >>= 7;

		config |= (ldlat - 1) << 59;
		config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN;
	}

	/*
	 * If we modify hwc->sample_period, we also need to update
@@ -349,7 +377,8 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
	int overflow;

	/* ignore lower 4 bits in min count: */
	overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
	overflow = perf_event_set_period(hwc, perf_ibs->min_period,
					 perf_ibs->max_period, period);
	local64_set(&hwc->prev_count, 0);

	return overflow;
@@ -447,6 +476,9 @@ static void perf_ibs_start(struct perf_event *event, int flags)
	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
	hwc->state = 0;

	if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
		hwc->sample_period = perf_ibs->min_period;

	perf_ibs_set_period(perf_ibs, hwc, &period);
	if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
		config |= period & IBS_OP_MAX_CNT_EXT_MASK;
@@ -554,6 +586,28 @@ static void perf_ibs_del(struct perf_event *event, int flags)

static void perf_ibs_read(struct perf_event *event) { }

static int perf_ibs_check_period(struct perf_event *event, u64 value)
{
	struct perf_ibs *perf_ibs;
	u64 low_nibble;

	if (event->attr.freq)
		return 0;

	perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	low_nibble = value & 0xFULL;

	/*
	 * This contradicts with perf_ibs_init() which allows sample period
	 * with lower nibble bits set but silently masks them off. Whereas
	 * this returns error.
	 */
	if (low_nibble || value < perf_ibs->min_period)
		return -EINVAL;

	return 0;
}

/*
 * We need to initialize with empty group if all attributes in the
 * group are dynamic.
@@ -572,7 +626,10 @@ PMU_FORMAT_ATTR(cnt_ctl, "config:19");
PMU_FORMAT_ATTR(swfilt,		"config2:0");
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11");
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1");
PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1");

static umode_t
zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
@@ -580,6 +637,18 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int
	return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
}

static umode_t
ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
	return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0;
}

static umode_t
ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
	return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0;
}

static struct attribute *fetch_attrs[] = {
	&format_attr_rand_en.attr,
	&format_attr_swfilt.attr,
@@ -596,6 +665,16 @@ static struct attribute *zen4_ibs_extensions_attrs[] = {
	NULL,
};

static struct attribute *ibs_op_ldlat_cap_attrs[] = {
	&ibs_op_ldlat_cap.attr.attr,
	NULL,
};

static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = {
	&ibs_op_dtlb_pgsize_cap.attr.attr,
	NULL,
};

static struct attribute_group group_fetch_formats = {
	.name = "format",
	.attrs = fetch_attrs,
@@ -613,6 +692,18 @@ static struct attribute_group group_zen4_ibs_extensions = {
	.is_visible = zen4_ibs_extensions_is_visible,
};

static struct attribute_group group_ibs_op_ldlat_cap = {
	.name = "caps",
	.attrs = ibs_op_ldlat_cap_attrs,
	.is_visible = ibs_op_ldlat_is_visible,
};

static struct attribute_group group_ibs_op_dtlb_pgsize_cap = {
	.name = "caps",
	.attrs = ibs_op_dtlb_pgsize_cap_attrs,
	.is_visible = ibs_op_dtlb_pgsize_is_visible,
};

static const struct attribute_group *fetch_attr_groups[] = {
	&group_fetch_formats,
	&empty_caps_group,
@@ -651,6 +742,11 @@ static struct attribute_group group_op_formats = {
	.attrs = op_attrs,
};

static struct attribute *ibs_op_ldlat_format_attrs[] = {
	&ibs_op_ldlat_format.attr.attr,
	NULL,
};

static struct attribute_group group_cnt_ctl = {
	.name = "format",
	.attrs = cnt_ctl_attrs,
@@ -669,10 +765,19 @@ static const struct attribute_group *op_attr_groups[] = {
	NULL,
};

static struct attribute_group group_ibs_op_ldlat_format = {
	.name = "format",
	.attrs = ibs_op_ldlat_format_attrs,
	.is_visible = ibs_op_ldlat_is_visible,
};

static const struct attribute_group *op_attr_update[] = {
	&group_cnt_ctl,
	&group_op_l3missonly,
	&group_zen4_ibs_extensions,
	&group_ibs_op_ldlat_cap,
	&group_ibs_op_ldlat_format,
	&group_ibs_op_dtlb_pgsize_cap,
	NULL,
};

@@ -686,12 +791,14 @@ static struct perf_ibs perf_ibs_fetch = {
		.start		= perf_ibs_start,
		.stop		= perf_ibs_stop,
		.read		= perf_ibs_read,
		.check_period	= perf_ibs_check_period,
	},
	.msr			= MSR_AMD64_IBSFETCHCTL,
	.config_mask		= IBS_FETCH_CONFIG_MASK,
	.config_mask		= IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN,
	.cnt_mask		= IBS_FETCH_MAX_CNT,
	.enable_mask		= IBS_FETCH_ENABLE,
	.valid_mask		= IBS_FETCH_VAL,
	.min_period		= 0x10,
	.max_period		= IBS_FETCH_MAX_CNT << 4,
	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,
@@ -709,13 +816,15 @@ static struct perf_ibs perf_ibs_op = {
		.start		= perf_ibs_start,
		.stop		= perf_ibs_stop,
		.read		= perf_ibs_read,
		.check_period	= perf_ibs_check_period,
	},
	.msr			= MSR_AMD64_IBSOPCTL,
	.config_mask		= IBS_OP_CONFIG_MASK,
	.config_mask		= IBS_OP_MAX_CNT,
	.cnt_mask		= IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
				  IBS_OP_CUR_CNT_RAND,
	.enable_mask		= IBS_OP_ENABLE,
	.valid_mask		= IBS_OP_VAL,
	.min_period		= 0x90,
	.max_period		= IBS_OP_MAX_CNT << 4,
	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,
@@ -917,6 +1026,10 @@ static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3,
	if (!op_data3->dc_lin_addr_valid)
		return;

	if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) &&
	    !op_data3->dc_phy_addr_valid)
		return;

	if (!op_data3->dc_l1tlb_miss) {
		data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT;
		return;
@@ -1023,15 +1136,25 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type,
	}
}

static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type,
static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs,
					struct perf_event *event)
{
	u64 sample_type = event->attr.sample_type;

	return perf_ibs == &perf_ibs_op &&
	       sample_type & (PERF_SAMPLE_DATA_SRC |
			      PERF_SAMPLE_WEIGHT_TYPE |
			      PERF_SAMPLE_ADDR |
			      PERF_SAMPLE_PHYS_ADDR);
}

static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs,
				   struct perf_event *event,
				   int check_rip)
{
	if (sample_type & PERF_SAMPLE_RAW ||
	    (perf_ibs == &perf_ibs_op &&
	     (sample_type & PERF_SAMPLE_DATA_SRC ||
	      sample_type & PERF_SAMPLE_WEIGHT_TYPE ||
	      sample_type & PERF_SAMPLE_ADDR ||
	      sample_type & PERF_SAMPLE_PHYS_ADDR)))
	if (event->attr.sample_type & PERF_SAMPLE_RAW ||
	    perf_ibs_is_mem_sample_type(perf_ibs, event) ||
	    perf_ibs_ldlat_event(perf_ibs, event))
		return perf_ibs->offset_max;
	else if (check_rip)
		return 3;
@@ -1148,7 +1271,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
	offset = 1;
	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));

	offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip);
	offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip);

	do {
		rdmsrl(msr + offset, *buf++);
@@ -1157,6 +1280,22 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
				       perf_ibs->offset_max,
				       offset + 1);
	} while (offset < offset_max);

	if (perf_ibs_ldlat_event(perf_ibs, event)) {
		union ibs_op_data3 op_data3;

		op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
		/*
		 * Opening event is errored out if load latency threshold is
		 * outside of [128, 2048] range. Since the event has reached
		 * interrupt handler, we can safely assume the threshold is
		 * within [128, 2048] range.
		 */
		if (!op_data3.ld_op || !op_data3.dc_miss ||
		    op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF))
			goto out;
	}

	/*
	 * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
	 * depending on their availability.
@@ -1229,6 +1368,10 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
	perf_sample_save_callchain(&data, event, iregs);

	throttle = perf_event_overflow(event, &data, &regs);

	if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
		hwc->sample_period = perf_ibs->min_period;

out:
	if (throttle) {
		perf_ibs_stop(event, 0);
@@ -1318,7 +1461,8 @@ static __init int perf_ibs_op_init(void)
	if (ibs_caps & IBS_CAPS_OPCNTEXT) {
		perf_ibs_op.max_period  |= IBS_OP_MAX_CNT_EXT_MASK;
		perf_ibs_op.config_mask	|= IBS_OP_MAX_CNT_EXT_MASK;
		perf_ibs_op.cnt_mask    |= IBS_OP_MAX_CNT_EXT_MASK;
		perf_ibs_op.cnt_mask    |= (IBS_OP_MAX_CNT_EXT_MASK |
					    IBS_OP_CUR_CNT_EXT_MASK);
	}

	if (ibs_caps & IBS_CAPS_ZEN4)
Loading