Commit ef60b8f5 authored by Namhyung Kim, committed by Arnaldo Carvalho de Melo
Browse files

perf trace: Support --summary-mode=cgroup



Add a new summary mode to collect stats for each cgroup.

  $ sudo ./perf trace -as --bpf-summary --summary-mode=cgroup -- sleep 1

   Summary of events:

   cgroup /user.slice/user-657345.slice/user@657345.service/session.slice/org.gnome.Shell@x11.service, 535 events

     syscall            calls  errors  total       min       avg       max       stddev
                                       (msec)    (msec)    (msec)    (msec)        (%)
     --------------- --------  ------ -------- --------- --------- ---------     ------
     ppoll                 15      0   373.600     0.004    24.907   197.491     55.26%
     poll                  15      0     1.325     0.001     0.088     0.369     38.76%
     close                 66      0     0.567     0.007     0.009     0.026      3.55%
     write                150      0     0.471     0.001     0.003     0.010      3.29%
     recvmsg               94     83     0.290     0.000     0.003     0.037     16.39%
     ioctl                 26      0     0.237     0.001     0.009     0.096     50.13%
     timerfd_create        66      0     0.236     0.003     0.004     0.024      8.92%
     timerfd_settime       70      0     0.160     0.001     0.002     0.012      7.66%
     writev                10      0     0.118     0.001     0.012     0.019     18.17%
     read                   9      0     0.021     0.001     0.002     0.004     14.07%
     getpid                14      0     0.019     0.000     0.001     0.004     20.28%

   cgroup /system.slice/polkit.service, 94 events

     syscall            calls  errors  total       min       avg       max       stddev
                                       (msec)    (msec)    (msec)    (msec)        (%)
     --------------- --------  ------ -------- --------- --------- ---------     ------
     ppoll                 22      0    19.811     0.000     0.900     9.273     63.88%
     write                 30      0     0.040     0.001     0.001     0.003     12.09%
     recvmsg               12      0     0.018     0.001     0.002     0.006     28.15%
     read                  18      0     0.013     0.000     0.001     0.003     21.99%
     poll                  12      0     0.006     0.000     0.001     0.001      4.48%

   cgroup /user.slice/user-657345.slice/user@657345.service/app.slice/app-org.gnome.Terminal.slice/gnome-terminal-server.service, 21 events

     syscall            calls  errors  total       min       avg       max       stddev
                                       (msec)    (msec)    (msec)    (msec)        (%)
     --------------- --------  ------ -------- --------- --------- ---------     ------
     ppoll                  4      0    17.476     0.003     4.369    13.298     69.65%
     recvmsg               15     12     0.068     0.002     0.005     0.014     26.53%
     writev                 1      0     0.033     0.033     0.033     0.033      0.00%
     poll                   1      0     0.005     0.005     0.005     0.005      0.00%

   ...

It works only for --bpf-summary for now.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20250501225337.928470-1-namhyung@kernel.org


Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent 39922dc5
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -152,7 +152,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.

--summary-mode=mode::
	To be used with -s or -S, to select how to show summary.  By default it'll
	show the syscall summary by thread.  Possible values are: thread, total.
	show the syscall summary by thread.  Possible values are: thread, total,
	cgroup.

--tool_stats::
	Show tool stats such as number of times fd->pathname was discovered thru
+9 −1
Original line number Diff line number Diff line
@@ -5302,6 +5302,8 @@ static int trace__parse_summary_mode(const struct option *opt, const char *str,
		trace->summary_mode = SUMMARY__BY_THREAD;
	} else if (!strcmp(str, "total")) {
		trace->summary_mode = SUMMARY__BY_TOTAL;
	} else if (!strcmp(str, "cgroup")) {
		trace->summary_mode = SUMMARY__BY_CGROUP;
	} else {
		pr_err("Unknown summary mode: %s\n", str);
		return -1;
@@ -5461,7 +5463,7 @@ int cmd_trace(int argc, const char **argv)
	OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
		    "Show errno stats per syscall, use with -s or -S"),
	OPT_CALLBACK(0, "summary-mode", &trace, "mode",
		     "How to show summary: select thread (default) or total",
		     "How to show summary: select thread (default), total or cgroup",
		     trace__parse_summary_mode),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
@@ -5775,6 +5777,12 @@ int cmd_trace(int argc, const char **argv)
		symbol_conf.keep_exited_threads = true;
		if (trace.summary_mode == SUMMARY__NONE)
			trace.summary_mode = SUMMARY__BY_THREAD;

		if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) {
			pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n");
			err = -EINVAL;
			goto out;
		}
	}

	if (output_name != NULL) {
+117 −6
Original line number Diff line number Diff line
@@ -6,10 +6,12 @@

#include "dwarf-regs.h" /* for EM_HOST */
#include "syscalltbl.h"
#include "util/cgroup.h"
#include "util/hashmap.h"
#include "util/trace.h"
#include "util/util.h"
#include <bpf/bpf.h>
#include <linux/rbtree.h>
#include <linux/time64.h>
#include <tools/libc_compat.h> /* reallocarray */

@@ -18,6 +20,7 @@


static struct syscall_summary_bpf *skel;
static struct rb_root cgroups = RB_ROOT;

int trace_prepare_bpf_summary(enum trace_summary_mode mode)
{
@@ -29,9 +32,14 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)

	if (mode == SUMMARY__BY_THREAD)
		skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
	else if (mode == SUMMARY__BY_CGROUP)
		skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
	else
		skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;

	if (cgroup_is_v2("perf_event") > 0)
		skel->rodata->use_cgroup_v2 = 1;

	if (syscall_summary_bpf__load(skel) < 0) {
		fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
		return -1;
@@ -42,6 +50,9 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
		return -1;
	}

	if (mode == SUMMARY__BY_CGROUP)
		read_all_cgroups(&cgroups);

	return 0;
}

@@ -88,9 +99,13 @@ static double rel_stddev(struct syscall_stats *stat)
 * per-cpu analysis so it's keyed by the syscall number to combine stats
 * from different CPUs.  And syscall_data always has a syscall_node so
 * it can effectively work as flat hierarchy.
 *
 * For per-cgroup stats, it uses a two-level data structure like the
 * per-thread case: syscall_data is keyed by the cgroup ID and has an
 * array of nodes, each of which represents one syscall for the cgroup.
 */
struct syscall_data {
	int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
	u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
	int nr_events;
	int nr_nodes;
	u64 total_time;
@@ -191,7 +206,7 @@ static int print_thread_stat(struct syscall_data *data, FILE *fp)

	qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);

	printed += fprintf(fp, " thread (%d), ", data->key);
	printed += fprintf(fp, " thread (%d), ", (int)data->key);
	printed += fprintf(fp, "%d events\n\n", data->nr_events);

	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
@@ -283,6 +298,75 @@ static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
	return printed;
}

/*
 * Fold one BPF map entry into the per-cgroup summary.
 *
 * @hash:     summary table keyed by cgroup ID
 * @map_key:  key of the BPF map entry; ->cgroup selects the syscall_data
 *            and ->nr is recorded as the syscall number of the new node
 * @map_data: stats of the BPF map entry
 *
 * Looks up (or creates and inserts) the syscall_data for the cgroup,
 * appends a node holding a copy of @map_data and adds its count and
 * total time to the cgroup totals.
 *
 * Returns 0 on success, or -ENOMEM if an allocation or the hashmap
 * insertion fails.
 */
static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
			       struct syscall_stats *map_data)
{
	struct syscall_data *data;
	struct syscall_node *nodes;

	if (!hashmap__find(hash, map_key->cgroup, &data)) {
		data = zalloc(sizeof(*data));
		if (data == NULL)
			return -ENOMEM;

		data->key = map_key->cgroup;
		if (hashmap__add(hash, data->key, data) < 0) {
			free(data);
			return -ENOMEM;
		}
	}

	/*
	 * Grow the node array before touching the totals so that a failed
	 * allocation leaves the cgroup totals consistent with the nodes
	 * that are actually stored.
	 */
	nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
	if (nodes == NULL)
		return -ENOMEM;

	data->nodes = nodes;
	nodes = &data->nodes[data->nr_nodes++];
	nodes->syscall_nr = map_key->nr;

	/*
	 * Presumably each cgroup has a single map entry per syscall (the
	 * BPF side leaves cpu_or_tid at 0 in cgroup mode), so the stat is
	 * copied as-is rather than merged.
	 */
	memcpy(&nodes->stats, map_data, sizeof(*map_data));

	/* update cgroup total stats */
	data->nr_events += map_data->count;
	data->total_time += map_data->total_time;
	return 0;
}

/*
 * Print the syscall summary table of a single cgroup to @fp.
 *
 * @data: per-cgroup summary; ->key is the cgroup ID, ->nodes holds one
 *        entry per syscall and is sorted in place with nodecmp
 * @fp:   output stream
 *
 * The heading shows the cgroup name when the ID is found in the cgroups
 * tree, and falls back to the numeric ID otherwise.
 *
 * Returns the accumulated return values of the fprintf() calls and of
 * print_common_stats().
 */
static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
{
	int printed = 0;
	struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);

	qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);

	if (cgrp)
		printed += fprintf(fp, " cgroup %s,", cgrp->name);
	else
		/*
		 * The key is a u64: print it as unsigned long long so the
		 * ID is not truncated where unsigned long is only 32 bits.
		 */
		printed += fprintf(fp, " cgroup id:%llu,", (unsigned long long)data->key);

	printed += fprintf(fp, " %d events\n\n", data->nr_events);

	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

	printed += print_common_stats(data, fp);
	printed += fprintf(fp, "\n\n");

	return printed;
}

/*
 * Print the summary of every cgroup in @data (an array of @nr_data
 * entries) to @fp, in array order.  Returns the sum of what each
 * print_cgroup_stat() call returned.
 */
static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
{
	int total = 0;
	int idx = 0;

	while (idx < nr_data) {
		total += print_cgroup_stat(data[idx], fp);
		idx++;
	}

	return total;
}

int trace_print_bpf_summary(FILE *fp)
{
	struct bpf_map *map = skel->maps.syscall_stats_map;
@@ -305,10 +389,19 @@ int trace_print_bpf_summary(FILE *fp)
		struct syscall_stats stat;

		if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
			if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
			switch (skel->rodata->aggr_mode) {
			case SYSCALL_AGGR_THREAD:
				update_thread_stats(&schash, &key, &stat);
			else
				break;
			case SYSCALL_AGGR_CPU:
				update_total_stats(&schash, &key, &stat);
				break;
			case SYSCALL_AGGR_CGROUP:
				update_cgroup_stats(&schash, &key, &stat);
				break;
			default:
				break;
			}
		}

		prev_key = &key;
@@ -325,10 +418,19 @@ int trace_print_bpf_summary(FILE *fp)

	qsort(data, nr_data, sizeof(*data), datacmp);

	if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
	switch (skel->rodata->aggr_mode) {
	case SYSCALL_AGGR_THREAD:
		printed += print_thread_stats(data, nr_data, fp);
	else
		break;
	case SYSCALL_AGGR_CPU:
		printed += print_total_stats(data, nr_data, fp);
		break;
	case SYSCALL_AGGR_CGROUP:
		printed += print_cgroup_stats(data, nr_data, fp);
		break;
	default:
		break;
	}

	for (i = 0; i < nr_data && data; i++) {
		free(data[i]->nodes);
@@ -343,5 +445,14 @@ int trace_print_bpf_summary(FILE *fp)

void trace_cleanup_bpf_summary(void)
{
	if (!RB_EMPTY_ROOT(&cgroups)) {
		struct cgroup *cgrp, *tmp;

		rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
			cgroup__put(cgrp);

		cgroups = RB_ROOT;
	}

	syscall_summary_bpf__destroy(skel);
}
+39 −4
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* This is to calculate a delta between sys-enter and sys-exit for each thread */
struct syscall_trace {
@@ -35,10 +36,41 @@ struct syscall_stats_map {
int enabled; /* controlled from userspace */

const volatile enum syscall_aggr_mode aggr_mode;
const volatile int use_cgroup_v2;

static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
int perf_subsys_id = -1;

/*
 * Return the cgroup ID of the current task.
 *
 * When use_cgroup_v2 is set (by userspace before load), the ID comes
 * straight from the bpf_get_current_cgroup_id() helper.  Otherwise it is
 * read from the task's css_set through the perf_event subsystem entry:
 * task->cgroups->subsys[perf_subsys_id]->cgroup->kn->id.
 */
static inline __u64 get_current_cgroup_id(void)
{
	struct task_struct *task;
	struct cgroup *cgrp;

	if (use_cgroup_v2)
		return bpf_get_current_cgroup_id();

	task = bpf_get_current_task_btf();

	/* resolve the perf_event subsystem index once; -1 means "not yet" */
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		/* take the enum value through a CO-RE relocation when the compiler supports it */
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		/* otherwise use the enum constant as seen at compile time */
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
			 long ret)
{
	struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
	struct syscall_key key = {
		.cpu_or_tid = cpu_or_tid,
		.cgroup = cgroup_id,
		.nr = nr,
	};
	struct syscall_stats *stats;

	stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
@@ -90,7 +122,8 @@ SEC("tp_btf/sys_exit")
int sys_exit(u64 *ctx)
{
	int tid;
	int key;
	int key = 0;
	u64 cgroup = 0;
	long ret = ctx[1]; /* return value of the syscall */
	struct syscall_trace *st;
	s64 delta;
@@ -105,11 +138,13 @@ int sys_exit(u64 *ctx)

	if (aggr_mode == SYSCALL_AGGR_THREAD)
		key = tid;
	else if (aggr_mode == SYSCALL_AGGR_CGROUP)
		cgroup = get_current_cgroup_id();
	else
		key = bpf_get_smp_processor_id();

	delta = bpf_ktime_get_ns() - st->timestamp;
	update_stats(key, st->nr, delta, ret);
	update_stats(key, cgroup, st->nr, delta, ret);

	bpf_map_delete_elem(&syscall_trace_map, &tid);
	return 0;
+2 −0
Original line number Diff line number Diff line
@@ -6,9 +6,11 @@
enum syscall_aggr_mode {
	SYSCALL_AGGR_THREAD,
	SYSCALL_AGGR_CPU,
	SYSCALL_AGGR_CGROUP,
};

/* Key of an entry in the syscall stats BPF map. */
struct syscall_key {
	u64 cgroup;		/* cgroup ID if AGGR_CGROUP, 0 otherwise */
	int cpu_or_tid;		/* tid if AGGR_THREAD, CPU if AGGR_CPU, 0 if AGGR_CGROUP */
	int nr;			/* syscall number */
};
Loading