Commit 17ef32ae authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'perf_urgent_for_v6.16_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Borislav Petkov:

 - Avoid a crash on a heterogeneous machine where not all cores support
   the same hw events features

 - Avoid a deadlock when throttling events

 - Document the perf event states more

 - Make sure a number of perf paths switching off or rescheduling events
   call perf_cgroup_event_disable()

 - Make sure perf does task sampling before its userspace mapping is
   torn down, and not after

* tag 'perf_urgent_for_v6.16_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86/intel: Fix crash in icl_update_topdown_event()
  perf: Fix the throttle error of some clock events
  perf: Add comment to enum perf_event_state
  perf/core: Fix WARN in perf_cgroup_switch()
  perf: Fix dangling cgroup pointer in cpuctx
  perf: Fix cgroup state vs ERROR
  perf: Fix sample vs do_exit()
parents aff2a7e2 b0823d5f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2826,7 +2826,7 @@ static void intel_pmu_read_event(struct perf_event *event)
		 * If the PEBS counters snapshotting is enabled,
		 * the topdown event is available in PEBS records.
		 */
		if (is_topdown_event(event) && !is_pebs_counter_event_group(event))
		if (is_topdown_count(event) && !is_pebs_counter_event_group(event))
			static_call(intel_pmu_update_topdown_event)(event, NULL);
		else
			intel_pmu_drain_pebs_buffer();
+40 −2
Original line number Diff line number Diff line
@@ -635,8 +635,46 @@ struct perf_addr_filter_range {
	unsigned long			size;
};

/**
 * enum perf_event_state - the states of an event:
/*
 * The normal states are:
 *
 *            ACTIVE    --.
 *               ^        |
 *               |        |
 *       sched_{in,out}() |
 *               |        |
 *               v        |
 *      ,---> INACTIVE  --+ <-.
 *      |                 |   |
 *      |                {dis,en}able()
 *   sched_in()           |   |
 *      |       OFF    <--' --+
 *      |                     |
 *      `--->  ERROR    ------'
 *
 * That is:
 *
 * sched_in:       INACTIVE          -> {ACTIVE,ERROR}
 * sched_out:      ACTIVE            -> INACTIVE
 * disable:        {ACTIVE,INACTIVE} -> OFF
 * enable:         {OFF,ERROR}       -> INACTIVE
 *
 * Where {OFF,ERROR} are disabled states.
 *
 * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of
 * defunct events:
 *
 *  - EXIT means the task that the event was assigned to died, but child events
 *    still live, and further children can still be created. But the event
 *    itself will never be active again. It can only transition to
 *    {REVOKED,DEAD};
 *
 *  - REVOKED means the PMU the event was associated with is gone; all
 *    functionality is stopped but the event is still alive. Can only
 *    transition to DEAD;
 *
 *  - DEAD event really is DYING: tearing down state and freeing bits.
 *
 */
enum perf_event_state {
	PERF_EVENT_STATE_DEAD		= -5,
+74 −42
Original line number Diff line number Diff line
@@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
	__perf_ctx_unlock(&cpuctx->ctx);
}

/*
 * State carried by the perf_ctx_lock scope class: the (cpuctx, ctx) pair
 * that was passed to perf_ctx_lock(), kept so that the matching
 * perf_ctx_unlock() can be issued with the same two arguments.
 */
typedef struct {
	struct perf_cpu_context *cpuctx;	/* first argument to perf_ctx_{lock,unlock}() */
	struct perf_event_context *ctx;		/* second argument to perf_ctx_{lock,unlock}() */
} class_perf_ctx_lock_t;

/*
 * Release half of the perf_ctx_lock scope class: hands the pair recorded
 * in @_T back to perf_ctx_unlock().
 */
static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
{
	struct perf_cpu_context *cpuctx = _T->cpuctx;
	struct perf_event_context *ctx = _T->ctx;

	perf_ctx_unlock(cpuctx, ctx);
}

static inline class_perf_ctx_lock_t
class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx)
{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }

#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
@@ -944,7 +957,13 @@ static void perf_cgroup_switch(struct task_struct *task)
	if (READ_ONCE(cpuctx->cgrp) == cgrp)
		return;

	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
	guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
	/*
	 * Re-check, could've raced vs perf_remove_from_context().
	 */
	if (READ_ONCE(cpuctx->cgrp) == NULL)
		return;

	perf_ctx_disable(&cpuctx->ctx, true);

	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
@@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task)
	ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);

	perf_ctx_enable(&cpuctx->ctx, true);
	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -2120,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
	if (event->group_leader == event)
		del_event_from_groups(event, ctx);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF) {
		perf_cgroup_event_disable(event, ctx);
		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
	}

	ctx->generation++;
	event->pmu_ctx->nr_events--;
}
@@ -2149,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
}

static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
			    struct perf_event_context *ctx);
static void __event_disable(struct perf_event *event,
			    struct perf_event_context *ctx,
			    enum perf_event_state state);

static void perf_put_aux_event(struct perf_event *event)
{
@@ -2183,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event)
		 * state so that we don't try to schedule it again. Note
		 * that perf_event_enable() will clear the ERROR status.
		 */
		event_sched_out(iter, ctx);
		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
		__event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
	}
}

@@ -2242,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event)
				    &event->pmu_ctx->flexible_active;
}

/*
 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
 * cannot exist on their own, schedule them out and move them into the ERROR
 * state. Also see _perf_event_enable(), it will not be able to recover
 * this ERROR state.
 *
 * This goes through __event_disable() rather than open-coding
 * event_sched_out() + perf_event_set_state(), so that
 * perf_cgroup_event_disable() is also called for the event before its
 * state changes, the same as on the other disable paths.
 */
static inline void perf_remove_sibling_event(struct perf_event *event)
{
	__event_disable(event, event->ctx, PERF_EVENT_STATE_ERROR);
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
@@ -2289,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event)
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

		/*
		 * Events that have PERF_EV_CAP_SIBLING require being part of
		 * a group and cannot exist on their own, schedule them out
		 * and move them into the ERROR state. Also see
		 * _perf_event_enable(), it will not be able to recover this
		 * ERROR state.
		 */
		if (sibling->event_caps & PERF_EV_CAP_SIBLING)
			perf_remove_sibling_event(sibling);
			__event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);

		sibling->group_leader = sibling;
		list_del_init(&sibling->sibling_list);
@@ -2493,11 +2494,14 @@ __perf_remove_from_context(struct perf_event *event,
		state = PERF_EVENT_STATE_EXIT;
	if (flags & DETACH_REVOKE)
		state = PERF_EVENT_STATE_REVOKED;
	if (flags & DETACH_DEAD) {
		event->pending_disable = 1;
	if (flags & DETACH_DEAD)
		state = PERF_EVENT_STATE_DEAD;
	}

	event_sched_out(event, ctx);

	if (event->state > PERF_EVENT_STATE_OFF)
		perf_cgroup_event_disable(event, ctx);

	perf_event_set_state(event, min(event->state, state));

	if (flags & DETACH_GROUP)
@@ -2562,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
	event_function_call(event, __perf_remove_from_context, (void *)flags);
}

/*
 * Take @event out of service and record @state as its new state.
 *
 * The order matters: the event is first scheduled out of @ctx, then
 * perf_cgroup_event_disable() is called for it, and only after that is the
 * state updated. The callers in view pass PERF_EVENT_STATE_OFF or
 * PERF_EVENT_STATE_ERROR as @state.
 */
static void __event_disable(struct perf_event *event,
			    struct perf_event_context *ctx,
			    enum perf_event_state state)
{
	event_sched_out(event, ctx);
	perf_cgroup_event_disable(event, ctx);
	perf_event_set_state(event, state);
}

/*
 * Cross CPU call to disable a performance event
 */
@@ -2576,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event,
	perf_pmu_disable(event->pmu_ctx->pmu);
	ctx_time_update_event(ctx, event);

	/*
	 * When disabling a group leader, the whole group becomes ineligible
	 * to run, so schedule out the full group.
	 */
	if (event == event->group_leader)
		group_sched_out(event, ctx);
	else
		event_sched_out(event, ctx);

	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
	perf_cgroup_event_disable(event, ctx);
	/*
	 * But only mark the leader OFF; the siblings will remain
	 * INACTIVE.
	 */
	__event_disable(event, ctx, PERF_EVENT_STATE_OFF);

	perf_pmu_enable(event->pmu_ctx->pmu);
}
@@ -2656,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start)

/*
 * Throttle @event: mark it as having hit MAX_INTERRUPTS, then stop it.
 *
 * hw.interrupts must be written before ->stop() is invoked, because a stop
 * callback may look at it to detect that it is being called for a throttle
 * (perf_swevent_cancel_hrtimer() tests hw.interrupts != MAX_INTERRUPTS
 * before touching the hrtimer). ->stop() is therefore called exactly once,
 * after the store.
 *
 * perf_log_throttle() is only called when @event is its own group leader.
 */
static void perf_event_throttle(struct perf_event *event)
{
	event->hw.interrupts = MAX_INTERRUPTS;
	event->pmu->stop(event, 0);
	if (event == event->group_leader)
		perf_log_throttle(event, 0);
}
@@ -7439,6 +7457,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
	if (!regs)
		return 0;

	/* No mm, no stack, no dump. */
	if (!current->mm)
		return 0;

	/*
	 * Check if we fit in with the requested stack size into the:
	 * - TASK_SIZE
@@ -8150,6 +8172,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
	const u32 max_stack = event->attr.sample_max_stack;
	struct perf_callchain_entry *callchain;

	if (!current->mm)
		user = false;

	if (!kernel && !user)
		return &__empty_callchain;

@@ -11749,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (is_sampling_event(event)) {
	/*
	 * The throttle can be triggered in the hrtimer handler.
	 * The HRTIMER_NORESTART should be used to stop the timer,
	 * rather than hrtimer_cancel(). See perf_swevent_hrtimer()
	 */
	if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
		local64_set(&hwc->period_left, ktime_to_ns(remaining));

@@ -11804,6 +11834,7 @@ static void cpu_clock_event_start(struct perf_event *event, int flags)
/*
 * ->stop() handler: always calls perf_swevent_cancel_hrtimer(); calls
 * cpu_clock_event_update() afterwards only when PERF_EF_UPDATE is set in
 * @flags.
 */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
	perf_swevent_cancel_hrtimer(event);

	if (!(flags & PERF_EF_UPDATE))
		return;

	cpu_clock_event_update(event);
}

@@ -11882,6 +11913,7 @@ static void task_clock_event_start(struct perf_event *event, int flags)
/*
 * ->stop() handler: always calls perf_swevent_cancel_hrtimer(); calls
 * task_clock_event_update() with the context time afterwards only when
 * PERF_EF_UPDATE is set in @flags.
 */
static void task_clock_event_stop(struct perf_event *event, int flags)
{
	perf_swevent_cancel_hrtimer(event);

	if (!(flags & PERF_EF_UPDATE))
		return;

	task_clock_event_update(event, event->ctx->time);
}

+9 −8
Original line number Diff line number Diff line
@@ -940,6 +940,15 @@ void __noreturn do_exit(long code)
	taskstats_exit(tsk, group_dead);
	trace_sched_process_exit(tsk, group_dead);

	/*
	 * Since sampling can touch ->mm, make sure to stop everything before we
	 * tear it down.
	 *
	 * Also flushes inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 */
	perf_event_exit_task(tsk);

	exit_mm();

	if (group_dead)
@@ -955,14 +964,6 @@ void __noreturn do_exit(long code)
	exit_task_work(tsk);
	exit_thread(tsk);

	/*
	 * Flush inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 *
	 * because of cgroup mode, must be called before cgroup_exit()
	 */
	perf_event_exit_task(tsk);

	sched_autogroup_exit_task(tsk);
	cgroup_exit(tsk);