Commit c69993ec authored by Peter Zijlstra
Browse files

perf: Support deferred user unwind



Add support for deferred userspace unwind to perf.

Where perf currently relies on in-place stack unwinding (from NMI
context and all that), this moves the userspace part of the unwind to
right before the return-to-userspace.

This has two distinct benefits. The biggest is that it moves the
unwind to a faultable context, so it becomes possible to fault in debug
info (.eh_frame, SFrame etc.) that might not otherwise be readily
available. Secondly, it de-duplicates the user callchain where
multiple samples happen during the same kernel entry.

To facilitate this the perf interface is extended with a new record
type:

  PERF_RECORD_CALLCHAIN_DEFERRED

and two new attribute flags:

  perf_event_attr::defer_callchain - to request the user unwind be deferred
  perf_event_attr::defer_output    - to request PERF_RECORD_CALLCHAIN_DEFERRED records

The existing PERF_RECORD_SAMPLE callchain section gets a new
context type:

  PERF_CONTEXT_USER_DEFERRED

After which will come a single entry, denoting the 'cookie' of the
deferred callchain that should be attached here, matching the 'cookie'
field of the above mentioned PERF_RECORD_CALLCHAIN_DEFERRED.

The 'defer_callchain' flag is expected on all events with
PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expected on the event
responsible for collecting side-band events (like mmap, comm etc.).
Setting 'defer_output' on multiple events will get you duplicated
PERF_RECORD_CALLCHAIN_DEFERRED records.

Based on earlier patches by Josh and Steven.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net
parent ae25884a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
		   u32 max_stack, bool crosstask, bool add_mark);
		   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
+0 −12
Original line number Diff line number Diff line
@@ -6,18 +6,6 @@
#include <linux/unwind_user.h>
#include <linux/unwind_deferred_types.h>

struct unwind_work;

/*
 * Function-pointer type for an unwind callback.  The callback is handed a
 * pointer to a struct unwind_work, a pointer to a struct unwind_stacktrace
 * and a 64-bit cookie, and returns nothing.
 * NOTE(review): who invokes the callback, and in which context, is not
 * visible here -- confirm against the deferred unwind implementation.
 */
typedef void (*unwind_callback_t)(struct unwind_work *work,
				  struct unwind_stacktrace *trace,
				  u64 cookie);

/*
 * Bundles an unwind_callback_t with a list node and an integer bit.
 * NOTE(review): which list the node is linked into and what 'bit' indexes
 * are not visible here -- confirm against the code that registers the work.
 */
struct unwind_work {
	/* List linkage for this work item. */
	struct list_head		list;
	/* Callback associated with this work item. */
	unwind_callback_t		func;
	/* NOTE(review): presumably a bit number identifying this work -- confirm. */
	int				bit;
};

#ifdef CONFIG_UNWIND_USER

enum {
+13 −0
Original line number Diff line number Diff line
@@ -39,4 +39,17 @@ struct unwind_task_info {
	union unwind_task_id	id;
};

struct unwind_work;
struct unwind_stacktrace;

/*
 * Function-pointer type for an unwind callback.  The callback is handed a
 * pointer to a struct unwind_work, a pointer to a struct unwind_stacktrace
 * and a 64-bit cookie, and returns nothing.
 * NOTE(review): who invokes the callback, and in which context, is not
 * visible here -- confirm against the deferred unwind implementation.
 */
typedef void (*unwind_callback_t)(struct unwind_work *work,
				  struct unwind_stacktrace *trace,
				  u64 cookie);

/*
 * Bundles an unwind_callback_t with a list node and an integer bit.
 * NOTE(review): which list the node is linked into and what 'bit' indexes
 * are not visible here -- confirm against the code that registers the work.
 */
struct unwind_work {
	/* List linkage for this work item. */
	struct list_head		list;
	/* Callback associated with this work item. */
	unwind_callback_t		func;
	/* NOTE(review): presumably a bit number identifying this work -- confirm. */
	int				bit;
};

#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
+20 −1
Original line number Diff line number Diff line
@@ -463,7 +463,9 @@ struct perf_event_attr {
				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
				remove_on_exec :  1, /* event is removed from task on exec */
				sigtrap        :  1, /* send synchronous SIGTRAP on event */
				__reserved_1   : 26;
				defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
				defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
				__reserved_1   : 24;

	union {
		__u32		wakeup_events;	  /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
	 */
	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,

	/*
	 * This user callchain capture was deferred until shortly before
	 * returning to user space.  Previous samples would have kernel
	 * callchains only and they need to be stitched with this to make full
	 * callchains.
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				cookie;
	 *	u64				nr;
	 *	u64				ips[nr];
	 *	struct sample_id		sample_id;
	 * };
	 */
	PERF_RECORD_CALLCHAIN_DEFERRED		= 22,

	PERF_RECORD_MAX,			/* non-ABI */
};

@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
	PERF_CONTEXT_HV				= (__u64)-32,
	PERF_CONTEXT_KERNEL			= (__u64)-128,
	PERF_CONTEXT_USER			= (__u64)-512,
	PERF_CONTEXT_USER_DEFERRED		= (__u64)-640,

	PERF_CONTEXT_GUEST			= (__u64)-2048,
	PERF_CONTEXT_GUEST_KERNEL		= (__u64)-2176,
+2 −2
Original line number Diff line number Diff line
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
		max_depth = sysctl_perf_event_max_stack;

	trace = get_perf_callchain(regs, kernel, user, max_depth,
				   false, false);
				   false, false, 0);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
		trace = get_callchain_entry_for_task(task, max_depth);
	else
		trace = get_perf_callchain(regs, kernel, user, max_depth,
					   crosstask, false);
					   crosstask, false, 0);

	if (unlikely(!trace) || trace->nr < skip) {
		if (may_fault)
Loading