Merge tag 'perf-tools-for-v6.11-2024-07-16' of... (68b59730) · Commits · git / linux-net

tools/lib/api/io.h

+38 −31

Original line number	Diff line number	Diff line
		@@ -43,17 +43,14 @@ static inline void io__init(struct io *io, int fd,
		io->eof = false;
		}

		/* Reads one character from the "io" file with similar semantics to fgetc. */
		static inline int io__get_char(struct io *io)
		/* Read from fd filling the buffer. Called when io->data == io->end. */
		static inline int io__fill_buffer(struct io *io)
		{
		char *ptr = io->data;
		ssize_t n;

		if (io->eof)
		return -1;

		if (ptr == io->end) {
		ssize_t n;

		if (io->timeout_ms != 0) {
		struct pollfd pfds[] = {
		{
		@@ -80,11 +77,21 @@ static inline int io__get_char(struct io *io)
		io->eof = true;
		return -1;
		}
		ptr = &io->buf[0];
		io->data = &io->buf[0];
		io->end = &io->buf[n];
		return 0;
		}

		/* Reads one character from the "io" file with similar semantics to fgetc. */
		static inline int io__get_char(struct io *io)
		{
		if (io->data == io->end) {
		int ret = io__fill_buffer(io);

		if (ret)
		return ret;
		}
		io->data = ptr + 1;
		return *ptr;
		return *io->data++;
		}

		/* Read a hexadecimal value with no 0x prefix into the out argument hex. If the

tools/lib/perf/include/perf/event.h

+6 −0

Original line number	Diff line number	Diff line
		@@ -77,6 +77,12 @@ struct perf_record_lost_samples {
		__u64 lost;
		};

		/* Number of __u64 slots in the sample_ids[] array of the struct below. */
		#define MAX_ID_HDR_ENTRIES 6
		/*
		 * A perf_record_lost_samples record immediately followed by
		 * MAX_ID_HDR_ENTRIES 64-bit slots.
		 * NOTE(review): presumably the slots carry the sample-id fields that
		 * trail the record -- confirm against the code that fills them in.
		 */
		struct perf_record_lost_samples_and_ids {
		struct perf_record_lost_samples lost;
		__u64 sample_ids[MAX_ID_HDR_ENTRIES];
		};

		/*
		* PERF_FORMAT_ENABLED \| PERF_FORMAT_RUNNING \| PERF_FORMAT_ID \| PERF_FORMAT_LOST
		*/

tools/perf/Build

+8 −6

Original line number	Diff line number	Diff line
		perf-y += builtin-bench.o
		perf-bench-y += builtin-bench.o
		perf-y += builtin-annotate.o
		perf-y += builtin-config.o
		perf-y += builtin-diff.o
		@@ -35,8 +35,8 @@ endif

		perf-$(CONFIG_LIBELF) += builtin-probe.o

		perf-y += bench/
		perf-y += tests/
		perf-bench-y += bench/
		perf-test-y += tests/

		perf-y += perf.o

		@@ -53,10 +53,12 @@ CFLAGS_builtin-trace.o += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_
		CFLAGS_builtin-report.o += -DTIPDIR="BUILD_STR($(tipdir_SQ))"
		CFLAGS_builtin-report.o += -DDOCDIR="BUILD_STR($(srcdir_SQ)/Documentation)"

		perf-y += util/
		perf-util-y += util/
		perf-util-y += arch/
		perf-y += arch/
		perf-y += ui/
		perf-y += scripts/
		perf-test-y += arch/
		perf-ui-y += ui/
		perf-util-y += scripts/

		gtk-y += ui/gtk/

tools/perf/Documentation/perf-amd-ibs.txt

0 → 100644

+189 −0

Original line number	Diff line number	Diff line
		perf-amd-ibs(1)
		===============

		NAME
		----
		perf-amd-ibs - Support for AMD Instruction-Based Sampling (IBS) with perf tool

		SYNOPSIS
		--------
		[verse]
		'perf record' -e ibs_op//
		'perf record' -e ibs_fetch//

		DESCRIPTION
		-----------

		Instruction-Based Sampling (IBS) provides precise Instruction Pointer (IP)
		profiling support on AMD platforms. IBS has two independent components: IBS
		Op and IBS Fetch. IBS Op sampling provides information about instruction
		execution (micro-op execution to be precise) with details like d-cache
		hit/miss, d-TLB hit/miss, cache miss latency, load/store data source, branch
		behavior etc. IBS Fetch sampling provides information about instruction fetch
		with details like i-cache hit/miss, i-TLB hit/miss, fetch latency etc. IBS is
		per-smt-thread i.e. each SMT hardware thread contains standalone IBS units.

		Both IBS Op and IBS Fetch are exposed as PMUs by Linux and can be exploited
		using the Linux perf utility. The following files will be created at boot time
		if IBS is supported by the hardware and kernel.

		/sys/bus/event_source/devices/ibs_op/
		/sys/bus/event_source/devices/ibs_fetch/

		IBS Op PMU supports two events: cycles and micro ops. IBS Fetch PMU supports
		one event: fetch ops.

		IBS PMUs do not have user/kernel filtering capability and thus require
		CAP_SYS_ADMIN or CAP_PERFMON privilege.

		IBS VS. REGULAR CORE PMU
		------------------------

		IBS gives samples with precise IP, i.e. the IP recorded with IBS sample has
		no skid. Whereas the IP recorded by regular core PMU will have some skid
		(sample was generated at IP X but perf would record it at IP X+n). Hence,
		regular core PMU might not help for profiling with instruction level
		precision. Further, IBS provides additional information about the sample in
		question. On the other hand, regular core PMU has its own advantages like
		plethora of events, counting mode (less interference), up to 6 parallel
		counters, event grouping support, filtering capabilities etc.

		Three regular core PMU events are internally forwarded to IBS Op PMU when
		precise_ip attribute is set:

		-e cpu-cycles:p becomes -e ibs_op//
		-e r076:p becomes -e ibs_op//
		-e r0C1:p becomes -e ibs_op/cnt_ctl=1/

		EXAMPLES
		--------

		IBS Op PMU
		~~~~~~~~~~

		System-wide profile, cycles event, sampling period: 100000

		# perf record -e ibs_op// -c 100000 -a

		Per-cpu profile (cpu10), cycles event, sampling period: 100000

		# perf record -e ibs_op// -c 100000 -C 10

		Per-cpu profile (cpu10), cycles event, sampling freq: 1000

		# perf record -e ibs_op// -F 1000 -C 10

		System-wide profile, uOps event, sampling period: 100000

		# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -a

		Same command, but also capture IBS register raw dump along with perf sample:

		# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -a --raw-samples

		System-wide profile, uOps event, sampling period: 100000, L3MissOnly (Zen4 onward)

		# perf record -e ibs_op/cnt_ctl=1,l3missonly=1/ -c 100000 -a

		Per-process profile of an existing process (upstream v6.2 onward), uOps event, sampling period: 100000

		# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -p 1234

		Per-process profile of a launched command (upstream v6.2 onward), uOps event, sampling period: 100000

		# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -- ls

		To analyse recorded profile in aggregate mode

		# perf report
		/* Select a line and press 'a' to drill down at instruction level. */

		To go over each sample

		# perf script

		Raw dump of IBS registers when profiled with --raw-samples

		# perf report -D
		/* Look for PERF_RECORD_SAMPLE */

		Example register raw dump:

		ibs_op_ctl: 000002c30006186a MaxCnt 100000 L3MissOnly 0 En 1
		Val 1 CntCtl 0=cycles CurCnt 707
		IbsOpRip: ffffffff8204aea7
		ibs_op_data: 0000010002550001 CompToRetCtr 1 TagToRetCtr 597
		BrnRet 0 RipInvalid 0 BrnFuse 0 Microcode 1
		ibs_op_data2: 0000000000000013 RmtNode 1 DataSrc 3=DRAM
		ibs_op_data3: 0000000031960092 LdOp 0 StOp 1 DcL1TlbMiss 0
		DcL2TlbMiss 0 DcL1TlbHit2M 1 DcL1TlbHit1G 0 DcL2TlbHit2M 0
		DcMiss 1 DcMisAcc 0 DcWcMemAcc 0 DcUcMemAcc 0 DcLockedOp 0
		DcMissNoMabAlloc 0 DcLinAddrValid 1 DcPhyAddrValid 1
		DcL2TlbHit1G 0 L2Miss 1 SwPf 0 OpMemWidth 32 bytes
		OpDcMissOpenMemReqs 12 DcMissLat 0 TlbRefillLat 0
		IbsDCLinAd: ff110008a5398920
		IbsDCPhysAd: 00000008a5398920

		IBS applied in a real-world use case

		A ~90% regression was observed in tbench with a specific scheduler hint,
		which was counterintuitive. IBS profiles of the good and bad runs, captured
		using perf, helped identify the exact cause of the problem:

		https://lore.kernel.org/r/20220921063638.2489-1-kprateek.nayak@amd.com

		IBS Fetch PMU
		~~~~~~~~~~~~~

		Similar commands can be used with Fetch PMU as well.

		System-wide profile, fetch ops event, sampling period: 100000

		# perf record -e ibs_fetch// -c 100000 -a

		System-wide profile, fetch ops event, sampling period: 100000, Random enable

		# perf record -e ibs_fetch/rand_en=1/ -c 100000 -a

		Random enable adds a small degree of variability to the sample period. This
		helps in cases like long-running loops where the PMU keeps tagging the same
		instruction over and over because of a fixed sample period.

		etc.

		PERF MEM AND PERF C2C
		---------------------

		perf mem is a memory access profiler tool and perf c2c is a shared data
		cacheline analyser tool. Both of them internally use the IBS Op PMU on AMD.
		Below is a simple example of the perf mem tool.

		# perf mem record -c 100000 -- make
		# perf mem report

		A normal perf mem report output will provide detailed memory access profile.
		However, it can also be aggregated based on output fields. For example:

		# perf mem report -F mem,sample,snoop
		Samples: 3M of event 'ibs_op//', Event count (approx.): 23524876
		Memory access Samples Snoop
		N/A 1903343 N/A
		L1 hit 1056754 N/A
		L2 hit 75231 N/A
		L3 hit 9496 HitM
		L3 hit 2270 N/A
		RAM hit 8710 N/A
		Remote node, same socket RAM hit 3241 N/A
		Remote core, same node Any cache hit 1572 HitM
		Remote core, same node Any cache hit 514 N/A
		Remote node, same socket Any cache hit 1216 HitM
		Remote node, same socket Any cache hit 350 N/A
		Uncached hit 18 N/A

		Please refer to their man pages for more detail.

		SEE ALSO
		--------

		linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1],
		linkperf:perf-mem[1], linkperf:perf-c2c[1]

tools/perf/Documentation/perf-kwork.txt

+2 −2

Original line number	Diff line number	Diff line
		perf-kowrk(1)
		perf-kwork(1)
		=============

		NAME
		@@ -35,7 +35,7 @@ There are several variants of 'perf kwork':
		perf kwork top
		perf kwork top -b

		By default it shows the individual work events such as irq, workqeueu,
		By default it shows the individual work events such as irq, workqueue,
		including the run time and delay (time between raise and actually entry):

		Runtime start Runtime end Cpu Kwork name Runtime Delaytime