Commit fa1332a8, authored by Ravi Bangoria, committed by Arnaldo Carvalho de Melo
Browse files

perf mem/c2c amd: Add ldlat support



'perf mem/c2c' uses IBS Op PMU on AMD platforms.

IBS Op PMU on Zen5 uarch has added support for Load Latency filtering.

Implement 'perf mem/c2c' --ldlat using IBS Op Load Latency filtering
capability.

Some subtle differences between AMD and other arch:

o --ldlat is disabled by default on AMD

o Supported values are 128 to 2048.

Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Joe Mario <jmario@redhat.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20250429035938.1301-4-ravi.bangoria@amd.com


Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent fc481adc
Loading
Loading
Loading
Loading
+9 −2
Original line number Diff line number Diff line
@@ -54,8 +54,15 @@ RECORD OPTIONS

-l::
--ldlat::
	Configure mem-loads latency. Supported on Intel and Arm64 processors
	only. Ignored on other archs.
	Configure mem-loads latency. Supported on Intel, Arm64 and some AMD
	processors. Ignored on other archs.

	On supported AMD processors:
	- /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
	- Supported latency values are 128 to 2048 (both inclusive).
	- Latency values that are a multiple of 128 incur slightly less profiling
	  overhead than other values.
	- Load latency filtering is disabled by default.

-k::
--all-kernel::
+11 −2
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide.
Due to the statistical nature of SPE sampling, not every memory operation will
be sampled.

On AMD, this uses the IBS Op PMU to sample load-store operations.

COMMON OPTIONS
--------------
-f::
@@ -67,8 +69,15 @@ RECORD OPTIONS
	Configure all used events to run in user space.

--ldlat <n>::
	Specify desired latency for loads event. Supported on Intel and Arm64
	processors only. Ignored on other archs.
	Specify desired latency for loads event. Supported on Intel, Arm64 and
	some AMD processors. Ignored on other archs.

	On supported AMD processors:
	- /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
	- Supported latency values are 128 to 2048 (both inclusive).
	- Latency values that are a multiple of 128 incur slightly less profiling
	  overhead than other values.
	- Load latency filtering is disabled by default.

REPORT OPTIONS
--------------
+6 −0
Original line number Diff line number Diff line
@@ -26,3 +26,9 @@ struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
	E(NULL,		NULL,		NULL,	false,	0),
	E("mem-ldst",	"%s//",		NULL,	false,	0),
};

/*
 * Memory-event table for AMD IBS Op PMUs that advertise load-latency
 * filtering.  perf_pmu__arch_init() installs this table in place of
 * perf_mem_events_amd when the "ibs_op" PMU's "ldlat" capability reads "1".
 *
 * Only the third slot is populated: the "mem-ldst" event, whose event string
 * carries an "ldlat=%u" term so a latency threshold can be formatted in.
 *
 * NOTE(review): the fourth E() argument is 'true' here but 'false' for the
 * same slot in perf_mem_events_amd; presumably it marks the event as taking
 * a ldlat value -- confirm against the E() macro definition.
 * NOTE(review): slot order presumably follows the PERF_MEM_EVENTS__* enum
 * (the array is sized by PERF_MEM_EVENTS__MAX) -- confirm.
 */
struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX] = {
	E(NULL,		NULL,		NULL,	false,	0),
	E(NULL,		NULL,		NULL,	false,	0),
	E("mem-ldst",	"%s/ldlat=%u/",	NULL,	true,	0),
};
+1 −0
Original line number Diff line number Diff line
@@ -6,5 +6,6 @@ extern struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX];
extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];

extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
extern struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX];

#endif /* _X86_MEM_EVENTS_H */
+17 −3
Original line number Diff line number Diff line
@@ -18,8 +18,10 @@
#include "mem-events.h"
#include "util/env.h"

void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
void perf_pmu__arch_init(struct perf_pmu *pmu)
{
	struct perf_pmu_caps *ldlat_cap;

#ifdef HAVE_AUXTRACE_SUPPORT
	if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
		pmu->auxtrace = true;
@@ -33,8 +35,20 @@ void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
#endif

	if (x86__is_amd_cpu()) {
		if (!strcmp(pmu->name, "ibs_op"))
		if (strcmp(pmu->name, "ibs_op"))
			return;

		pmu->mem_events = perf_mem_events_amd;

		if (!perf_pmu__caps_parse(pmu))
			return;

		ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
		if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
			return;

		perf_mem_events__loads_ldlat = 0;
		pmu->mem_events = perf_mem_events_amd_ldlat;
	} else if (pmu->is_core) {
		if (perf_pmu__have_event(pmu, "mem-loads-aux"))
			pmu->mem_events = perf_mem_events_intel_aux;
Loading