Commit 1bbeaf83 authored Mar 14, 2024 by Linus Torvalds

Merge tag 'perf-tools-for-v6.9-2024-03-13' of...

Merge tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools

Pull perf tools updates from Namhyung Kim:
 "perf stat:

   - Support new 'cluster' aggregation mode for shared resources
     depending on the hardware configuration:

        $ sudo perf stat -a --per-cluster -e cycles,instructions sleep 1

         Performance counter stats for 'system wide':

        S0-D0-CLS0    2         85,051,822      cycles
        S0-D0-CLS0    2         73,909,908      instructions      #    0.87  insn per cycle
        S0-D0-CLS2    2         93,365,918      cycles
        S0-D0-CLS2    2         83,006,158      instructions      #    0.89  insn per cycle
        S0-D0-CLS4    2        104,157,523      cycles
        S0-D0-CLS4    2         53,234,396      instructions      #    0.51  insn per cycle
        S0-D0-CLS6    2         65,891,079      cycles
        S0-D0-CLS6    2         41,478,273      instructions      #    0.63  insn per cycle

               1.002407989 seconds time elapsed

   - Various fixes and cleanups for event metrics including NaN handling

  perf script:

   - Use libcapstone if available to disassemble the instructions. This
     enables 'perf script -F disasm' and 'perf script --insn-trace=disasm'
     (for Intel-PT):

        $ perf script -F event,ip,disasm
        cycles:P:  ffffffffa988d428             wrmsr
        cycles:P:  ffffffffa9839d25             movq %rax, %r14
        cycles:P:  ffffffffa9cdcaf0             endbr64
        cycles:P:  ffffffffa988d428             wrmsr
        cycles:P:  ffffffffa988d428             wrmsr
        cycles:P:  ffffffffaa401f86             iretq
        cycles:P:  ffffffffa99c4de5             movq 0x30(%rcx), %r8
        cycles:P:  ffffffffa988d428             wrmsr
        cycles:P:  ffffffffaa401f86             iretq
        cycles:P:  ffffffffa9907983             movl 0x68(%rbx), %eax
        cycles:P:  ffffffffa988d428             wrmsr

   - Expose sample ID / stream ID to python scripts

  perf test:

   - Add more perf test cases from Red Hat internal test suites. This
     time it adds the base infra and a few perf probe tests. More to
     come. :)

   - Add 'perf test -p' for parallel execution and fix some issues found
     by the parallel test

   - Support symbol test to print symbols in given (active) module:

        $ perf test -F -v Symbols --dso /lib/modules/$(uname -r)/kernel/fs/ext4/ext4.ko
        --- start ---
        Testing /lib/modules/6.5.13-1rodete2-amd64/kernel/fs/ext4/ext4.ko
        Overlapping symbols:
         7a990-7a9a0 l __pfx_ext4_exit_fs
         7a990-7a9a0 g __pfx_cleanup_module
        Overlapping symbols:
         7a9a0-7aa1c l ext4_exit_fs
         7a9a0-7aa1c g cleanup_module
        ...

  JSON metric updates:

   - A new round of Intel metric updates

   - Support Power11 PVR (compatible with Power10)

   - Fix cache latency events on Zen 4 to set SliceId properly

  Internal:

   - Fix reference counting for 'map' data structure, tireless work from
     Ian!

   - More memory optimization for struct thread and annotate histogram.
     Now, 'perf report' (TUI) and 'perf annotate' should be much
     lighter-weight in terms of memory footprint

   - Support cross-arch perf register access. Clean up the build
     configuration so that it can detect arch-register support at
     runtime. This allows parsing register data in samples that were
     recorded on a different arch

  Others:

   - Sync task state in 'perf sched' to kernel using trace event fields.
     The task states have been changed so tools cannot assume a fixed
     encoding

   - Clean up 'perf mem' to generalize the arch-specific events

   - Add support for local and global variables to data type profiling.
     This would increase the success rate of type resolution with DWARF

   - Add short option -H for --hierarchy in 'perf report' and 'perf top'"

* tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (154 commits)
  perf annotate: Add comments in the data structures
  perf annotate: Remove sym_hist.addr[] array
  perf annotate: Calculate instruction overhead using hashmap
  perf annotate: Add a hashmap for symbol histogram
  perf threads: Reduce table size from 256 to 8
  perf threads: Switch from rbtree to hashmap
  perf threads: Move threads to its own files
  perf machine: Move machine's threads into its own abstraction
  perf machine: Move fprintf to for_each loop and a callback
  perf trace: Ignore thread hashing in summary
  perf report: Sort child tasks by tid
  perf vendor events amd: Fix Zen 4 cache latency events
  perf version: Display availability of OpenCSD support
  perf vendor events intel: Add umasks/occ_sel to PCU events.
  perf map: Fix map reference count issues
  libperf evlist: Avoid out-of-bounds access
  perf lock contention: Account contending locks too
  perf metrics: Fix segv for metrics with no events
  perf metrics: Fix metric matching
  perf pmu: Fix a potential memory leak in perf_pmu__lookup()
  ...

parents 63bd30f2 0f66dfe7

tools/build/Makefile.feature

+2 −0

Original line number	Diff line number	Diff line
		@@ -87,6 +87,7 @@ FEATURE_TESTS_EXTRA := \
		gtk2-infobar \
		hello \
		libbabeltrace \
		libcapstone \
		libbfd-liberty \
		libbfd-liberty-z \
		libopencsd \
		@@ -134,6 +135,7 @@ FEATURE_DISPLAY ?= \
		libcrypto \
		libunwind \
		libdw-dwarf-unwind \
		libcapstone \
		zlib \
		lzma \
		get_cpuid \

tools/build/feature/Makefile

+4 −0

Original line number	Diff line number	Diff line
		@@ -54,6 +54,7 @@ FILES= \
		test-timerfd.bin \
		test-libdw-dwarf-unwind.bin \
		test-libbabeltrace.bin \
		test-libcapstone.bin \
		test-compile-32.bin \
		test-compile-x32.bin \
		test-zlib.bin \
		@@ -286,6 +287,9 @@ $(OUTPUT)test-libdw-dwarf-unwind.bin:
		$(OUTPUT)test-libbabeltrace.bin:
		$(BUILD) # -lbabeltrace provided by $(FEATURE_CHECK_LDFLAGS-libbabeltrace)

		$(OUTPUT)test-libcapstone.bin:
		$(BUILD) # -lcapstone provided by $(FEATURE_CHECK_LDFLAGS-libcapstone)

		$(OUTPUT)test-compile-32.bin:
		$(CC) -m32 -o $@ test-compile.c

tools/build/feature/test-all.c

+4 −0

Original line number	Diff line number	Diff line
		@@ -134,6 +134,10 @@
		#undef main
		#endif

		#define main main_test_libcapstone
		# include "test-libcapstone.c"
		#undef main

		#define main main_test_lzma
		# include "test-lzma.c"
		#undef main

tools/build/feature/test-libcapstone.c

0 → 100644

+11 −0

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0

		#include <capstone/capstone.h>

		/*
		 * Build-feature probe for libcapstone.
		 *
		 * References cs_open() so that this program only compiles and links
		 * when the capstone header and library are usable. The value returned
		 * by cs_open() is not inspected; the program always exits with 0.
		 */
		int main(void)
		{
			csh cs_handle;

			(void)cs_open(CS_ARCH_X86, CS_MODE_64, &cs_handle);

			return 0;
		}

tools/lib/perf/evlist.c

+12 −6

Original line number	Diff line number	Diff line
		@@ -248,10 +248,10 @@ u64 perf_evlist__read_format(struct perf_evlist *evlist)

		static void perf_evlist__id_hash(struct perf_evlist *evlist,
		struct perf_evsel *evsel,
		int cpu, int thread, u64 id)
		int cpu_map_idx, int thread, u64 id)
		{
		int hash;
		struct perf_sample_id *sid = SID(evsel, cpu, thread);
		struct perf_sample_id *sid = SID(evsel, cpu_map_idx, thread);

		sid->id = id;
		sid->evsel = evsel;
		@@ -269,21 +269,27 @@ void perf_evlist__reset_id_hash(struct perf_evlist *evlist)

		void perf_evlist__id_add(struct perf_evlist *evlist,
		struct perf_evsel *evsel,
		int cpu, int thread, u64 id)
		int cpu_map_idx, int thread, u64 id)
		{
		perf_evlist__id_hash(evlist, evsel, cpu, thread, id);
		if (!SID(evsel, cpu_map_idx, thread))
		return;

		perf_evlist__id_hash(evlist, evsel, cpu_map_idx, thread, id);
		evsel->id[evsel->ids++] = id;
		}

		int perf_evlist__id_add_fd(struct perf_evlist *evlist,
		struct perf_evsel *evsel,
		int cpu, int thread, int fd)
		int cpu_map_idx, int thread, int fd)
		{
		u64 read_data[4] = { 0, };
		int id_idx = 1; /* The first entry is the counter value */
		u64 id;
		int ret;

		if (!SID(evsel, cpu_map_idx, thread))
		return -1;

		ret = ioctl(fd, PERF_EVENT_IOC_ID, &id);
		if (!ret)
		goto add;
		@@ -312,7 +318,7 @@ int perf_evlist__id_add_fd(struct perf_evlist *evlist,
		id = read_data[id_idx];

		add:
		perf_evlist__id_add(evlist, evsel, cpu, thread, id);
		perf_evlist__id_add(evlist, evsel, cpu_map_idx, thread, id);
		return 0;
		}