Commit aab667ca authored by K Prateek Nayak's avatar K Prateek Nayak Committed by Arnaldo Carvalho de Melo
Browse files

perf stat: Add "--per-cache" aggregation option and document it



This patch adds support for "--per-cache" option for aggregation at a
particular cache level and documents the same.

Following is the output of 'perf stat' with aggregation at L3 for the
event "ls_dmnd_fills_from_sys.ext_cache_remote" on a dual socket 3rd
Generation EPYC Processor (2 x 64C/128T - 16 LLCs) when running
hackbench pinned to 4 LLCs:

  $ sudo perf stat --per-cache=L3 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \
    taskset -c 0-15,64-79,128-143,192-207 \
    perf bench sched messaging -p -t -l 100000 -g 8

  ...

   Performance counter stats for 'system wide':

  S0-D0-L3-ID0             16          9,500,803      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID8             16          6,338,099      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID16            16            355,005      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID24            16             22,067      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID32            16             16,321      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID40            16             11,619      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID48            16              4,238      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID56            16             31,158      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID64            16         28,242,452      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID72            16         22,906,973      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID80            16             72,898      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID88            16             56,907      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID96            16             20,456      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID104           16             40,913      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID112           16             78,113      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID120           16             37,897      ls_dmnd_fills_from_sys.ext_cache_remote

Also support 'perf stat record' and 'perf stat report' with the ability
to specify a different cache level to aggregate data at when running
'perf stat report'.

  $ sudo perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \
    taskset -c 0-15,64-79,128-143,192-207 \
    perf bench sched messaging -p -t -l 100000 -g 8

  ...

   Performance counter stats for 'system wide':

  S0-D0-L2-ID0              2          1,442,061      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID1              2          1,548,994      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID2              2          1,553,557      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID3              2          1,420,122      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID4              2          1,465,461      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID5              2          1,455,153      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID6              2          1,595,237      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID7              2          1,499,321      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID8              2          1,919,025      ls_dmnd_fills_from_sys.ext_cache_remote
  ...
  S1-D1-L2-ID127            2             21,295      ls_dmnd_fills_from_sys.ext_cache_remote

  $ sudo perf stat report --per-cache=L3

   Performance counter stats for 'perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote --\
                                  taskset -c 0-15,64-79,128-143,192-207 \
                                  perf bench sched messaging -p -t -l 100000 -g 8':

  S0-D0-L3-ID0             16         11,979,906      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID8             16         14,257,202      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID16            16            377,484      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID24            16             27,224      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID32            16             26,816      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID40            16             14,461      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID48            16             10,499      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID56            16             53,817      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID64            16         27,361,987      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID72            16         37,299,024      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID80            16             84,125      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID88            16             64,561      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID96            16             13,403      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID104           16             20,138      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID112           16             93,220      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID120           16             35,465      ls_dmnd_fills_from_sys.ext_cache_remote

On the above system, the domain covered by S0-D0-L3-ID0 contains
S0-D0-L2-ID0 to S0-D0-L2-ID7, the corresponding count for L3-ID0 is
equal to the sum of counts for L2-ID0 to L2-ID7.

Add documentation for the newly introduced "--per-cache" option.

Suggested-by: default avatarGautham Shenoy <gautham.shenoy@amd.com>
Signed-off-by: default avatarK Prateek Nayak <kprateek.nayak@amd.com>
Acked-by: default avatarIan Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Wen Pu <puwen@hygon.cn>
Link: https://lore.kernel.org/r/20230517172745.5833-5-kprateek.nayak@amd.com


Signed-off-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
parent 4b87406a
Loading
Loading
Loading
Loading
+16 −0
Original line number Diff line number Diff line
@@ -308,6 +308,14 @@ use --per-die in addition to -a. (system-wide). The output includes the
die number and the number of online processors on that die. This is
useful to gauge the amount of aggregation.

--per-cache::
Aggregate counts per cache instance for system-wide mode measurements.  By
default, the aggregation happens for the cache level at the highest index
in the system. To specify a particular level, mention the cache level
alongside the option in the format [Ll][1-9][0-9]*. For example:
Using option "--per-cache=l3" or "--per-cache=L3" will aggregate the
information at the boundary of the level 3 cache in the system.

--per-core::
Aggregate counts per physical processor for system-wide mode measurements.  This
is a useful mode to detect imbalance between physical cores.  To enable this mode,
@@ -379,6 +387,14 @@ Aggregate counts per processor socket for system-wide mode measurements.
--per-die::
Aggregate counts per processor die for system-wide mode measurements.

--per-cache::
Aggregate counts per cache instance for system-wide mode measurements.  By
default, the aggregation happens for the cache level at the highest index
in the system. To specify a particular level, mention the cache level
alongside the option in the format [Ll][1-9][0-9]*. For example: Using
option "--per-cache=l3" or "--per-cache=L3" will aggregate the
information at the boundary of the level 3 cache in the system.

--per-core::
Aggregate counts per physical processor for system-wide mode measurements.

+56 −0
Original line number Diff line number Diff line
@@ -1113,6 +1113,55 @@ static int parse_cputype(const struct option *opt,
	return 0;
}

static int parse_cache_level(const struct option *opt,
			     const char *str,
			     int unset __maybe_unused)
{
	int level;
	u32 *aggr_mode = (u32 *)opt->value;
	u32 *aggr_level = (u32 *)opt->data;

	/*
	 * If no string is specified, aggregate based on the topology of
	 * Last Level Cache (LLC). Since the LLC level can change from
	 * architecture to architecture, set level greater than
	 * MAX_CACHE_LVL which will be interpreted as LLC.
	 */
	if (str == NULL) {
		level = MAX_CACHE_LVL + 1;
		goto out;
	}

	/*
	 * The format to specify cache level is LX or lX where X is the
	 * cache level.
	 */
	if (strlen(str) != 2 || (str[0] != 'l' && str[0] != 'L')) {
		pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
		       MAX_CACHE_LVL,
		       MAX_CACHE_LVL);
		return -EINVAL;
	}

	level = atoi(&str[1]);
	if (level < 1) {
		pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
		       MAX_CACHE_LVL,
		       MAX_CACHE_LVL);
		return -EINVAL;
	}

	if (level > MAX_CACHE_LVL) {
		pr_err("perf only supports max cache level of %d.\n"
		       "Consider increasing MAX_CACHE_LVL\n", MAX_CACHE_LVL);
		return -EINVAL;
	}
out:
	*aggr_mode = AGGR_CACHE;
	*aggr_level = level;
	return 0;
}

static struct option stat_options[] = {
	OPT_BOOLEAN('T', "transaction", &transaction_run,
		    "hardware transaction statistics"),
@@ -1190,6 +1239,9 @@ static struct option stat_options[] = {
		     "aggregate counts per processor socket", AGGR_SOCKET),
	OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode,
		     "aggregate counts per processor die", AGGR_DIE),
	OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level,
			    "cache level", "aggregate count at this cache level (Default: LLC)",
			    parse_cache_level),
	OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode,
		     "aggregate counts per physical processor core", AGGR_CORE),
	OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
@@ -2346,6 +2398,10 @@ static int __cmd_report(int argc, const char **argv)
		     "aggregate counts per processor socket", AGGR_SOCKET),
	OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode,
		     "aggregate counts per processor die", AGGR_DIE),
	OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level,
			    "cache level",
			    "aggregate count at this cache level (Default: LLC)",
			    parse_cache_level),
	OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode,
		     "aggregate counts per physical processor core", AGGR_CORE),
	OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode,