Commit df8f6181 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'perf-tools-for-v7.1-2026-04-17' of...

Merge tag 'perf-tools-for-v7.1-2026-04-17' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools

Pull perf tools updates from Namhyung Kim:
 "perf report:

   - Add 'comm_nodigit' sort key to combine similar threads that only
     have different numbers in the comm. In the following example, the
     'comm_nodigit' will have samples from all threads starting with
     "bpfrb/" into an entry "bpfrb/<N>".

        $ perf report -s comm_nodigit,comm -H
        ...
        #
        #    Overhead  CommandNoDigit / Command
        # ...........  ........................
        #
            20.30%     swapper
               20.30%     swapper
            13.37%     chrome
               13.37%     chrome
            10.07%     bpfrb/<N>
                7.47%     bpfrb/0
                0.70%     bpfrb/1
                0.47%     bpfrb/3
                0.46%     bpfrb/2
                0.25%     bpfrb/4
                0.23%     bpfrb/5
                0.20%     bpfrb/6
                0.14%     bpfrb/10
                0.07%     bpfrb/7

   - Support flat layout for symfs. The --symfs option is to specify the
     location of debugging symbol files. The default 'hierarchy' layout
     would search the symbol file using the same path of the original
     file under the symfs root. The new 'flat' layout would search only
     in the root directory.

   - Update 'simd' sort key for ARM SIMD flags to cover ASE/SME and more
     predicate flags.

  perf stat:

   - Add --pmu-filter option to select specific PMUs. This would be
     useful when you measure metrics from multiple instance of uncore
     PMUs with similar names.

        # perf stat -M cpa_p0_avg_bw
         Performance counter stats for 'system wide':

            19,417,779,115      hisi_sicl0_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
                         0      hisi_sicl0_cpa0/cpa_p0_wr_dat/
                         0      hisi_sicl0_cpa0/cpa_p0_rd_dat_64b/
                         0      hisi_sicl0_cpa0/cpa_p0_rd_dat_32b/
            19,417,751,103      hisi_sicl10_cpa0/cpa_cycles/     #     0.00 cpa_p0_avg_bw
                         0      hisi_sicl10_cpa0/cpa_p0_wr_dat/
                         0      hisi_sicl10_cpa0/cpa_p0_rd_dat_64b/
                         0      hisi_sicl10_cpa0/cpa_p0_rd_dat_32b/
            19,417,730,679      hisi_sicl2_cpa0/cpa_cycles/      #     0.31 cpa_p0_avg_bw
                75,635,749      hisi_sicl2_cpa0/cpa_p0_wr_dat/
                18,520,640      hisi_sicl2_cpa0/cpa_p0_rd_dat_64b/
                         0      hisi_sicl2_cpa0/cpa_p0_rd_dat_32b/
            19,417,674,227      hisi_sicl8_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
                         0      hisi_sicl8_cpa0/cpa_p0_wr_dat/
                         0      hisi_sicl8_cpa0/cpa_p0_rd_dat_64b/
                         0      hisi_sicl8_cpa0/cpa_p0_rd_dat_32b/

              19.417734480 seconds time elapsed

     With --pmu-filter, users can select only hisi_sicl2_cpa0 PMU.

        # perf stat --pmu-filter hisi_sicl2_cpa0 -M cpa_p0_avg_bw
         Performance counter stats for 'system wide':

             6,234,093,559      cpa_cycles                       #     0.60 cpa_p0_avg_bw
                50,548,465      cpa_p0_wr_dat
                 7,552,182      cpa_p0_rd_dat_64b
                         0      cpa_p0_rd_dat_32b

               6.234139320 seconds time elapsed

  Data type profiling:

   - Quality improvements by tracking register state more precisely

   - Ensure array members to get the type

   - Handle more cases for global variables

  Vendor event/metric updates:

   - Update various Intel events and metrics

   - Add NVIDIA Tegra 410 Olympus events

  Internal changes:

   - Verify perf.data header for maliciously crafted files

   - Update perf test to cover more usages and make them robust

   - Move a couple of copied kernel headers not to annoy objtool build

   - Fix a bug in map sorting in name order

   - Remove some unused codes

  Misc:

   - Fix module symbol resolution with non-zero text address

   - Add -t/--threads option to `perf bench mem mmap`

   - Track duration of exit*() syscall by `perf trace -s`

   - Add core.addr2line-timeout and core.addr2line-disable-warn config
     items"

* tag 'perf-tools-for-v7.1-2026-04-17' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (131 commits)
  perf loongarch: Fix build failure with CONFIG_LIBDW_DWARF_UNWIND
  perf annotate: Use jump__delete when freeing LoongArch jumps
  perf test: Fixes for check branch stack sampling
  perf test: Fix inet_pton probe failure and unroll call graph
  perf build: fix "argument list too long" in second location
  perf header: Add sanity checks to HEADER_BPF_BTF processing
  perf header: Sanity check HEADER_BPF_PROG_INFO
  perf header: Sanity check HEADER_PMU_CAPS
  perf header: Sanity check HEADER_HYBRID_TOPOLOGY
  perf header: Sanity check HEADER_CACHE
  perf header: Sanity check HEADER_GROUP_DESC
  perf header: Sanity check HEADER_PMU_MAPPINGS
  perf header: Sanity check HEADER_MEM_TOPOLOGY
  perf header: Sanity check HEADER_NUMA_TOPOLOGY
  perf header: Sanity check HEADER_CPU_TOPOLOGY
  perf header: Sanity check HEADER_NRCPUS and HEADER_CPU_DOMAIN_INFO
  perf header: Bump up the max number of command line args allowed
  perf header: Validate nr_domains when reading HEADER_CPU_DOMAIN_INFO
  perf sample: Fix documentation typo
  perf arm_spe: Improve SIMD flags setting
  ...
parents 8541d8f7 9a683fe0
Loading
Loading
Loading
Loading
+8 −2
Original line number Diff line number Diff line
@@ -104,12 +104,18 @@ else
  endif
endif

ifeq ($(findstring -static,${LDFLAGS}),-static)
  PKG_CONFIG += --static
endif

all: $(FILES)

__BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS)
  BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1
  BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl
  BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd -lssl
  BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang \
	      $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd \
	      $(shell $(PKG_CONFIG) --libs --cflags openssl 2>/dev/null)

__BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
  BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
@@ -388,7 +394,7 @@ $(OUTPUT)test-libpfm4.bin:
	$(BUILD) -lpfm

$(OUTPUT)test-libopenssl.bin:
	$(BUILD) -lssl
	$(BUILD) $(shell $(PKG_CONFIG) --libs --cflags openssl 2>/dev/null)

$(OUTPUT)test-bpftool-skeletons.bin:
	$(SYSTEM_BPFTOOL) version | grep '^features:.*skeletons' \
+23 −26
Original line number Diff line number Diff line
@@ -15,12 +15,12 @@

#define MAX_NR_CPUS 4096

void perf_cpu_map__set_nr(struct perf_cpu_map *map, int nr_cpus)
void perf_cpu_map__set_nr(struct perf_cpu_map *map, unsigned int nr_cpus)
{
	RC_CHK_ACCESS(map)->nr = nr_cpus;
}

struct perf_cpu_map *perf_cpu_map__alloc(int nr_cpus)
struct perf_cpu_map *perf_cpu_map__alloc(unsigned int nr_cpus)
{
	RC_STRUCT(perf_cpu_map) *cpus;
	struct perf_cpu_map *result;
@@ -78,7 +78,7 @@ void perf_cpu_map__put(struct perf_cpu_map *map)
static struct perf_cpu_map *cpu_map__new_sysconf(void)
{
	struct perf_cpu_map *cpus;
	int nr_cpus, nr_cpus_conf;
	long nr_cpus, nr_cpus_conf;

	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (nr_cpus < 0)
@@ -86,15 +86,13 @@ static struct perf_cpu_map *cpu_map__new_sysconf(void)

	nr_cpus_conf = sysconf(_SC_NPROCESSORS_CONF);
	if (nr_cpus != nr_cpus_conf) {
		pr_warning("Number of online CPUs (%d) differs from the number configured (%d) the CPU map will only cover the first %d CPUs.",
		pr_warning("Number of online CPUs (%ld) differs from the number configured (%ld) the CPU map will only cover the first %ld CPUs.",
			nr_cpus, nr_cpus_conf, nr_cpus);
	}

	cpus = perf_cpu_map__alloc(nr_cpus);
	if (cpus != NULL) {
		int i;

		for (i = 0; i < nr_cpus; ++i)
		for (long i = 0; i < nr_cpus; ++i)
			RC_CHK_ACCESS(cpus)->map[i].cpu = i;
	}

@@ -132,23 +130,23 @@ static int cmp_cpu(const void *a, const void *b)
	return cpu_a->cpu - cpu_b->cpu;
}

static struct perf_cpu __perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx)
static struct perf_cpu __perf_cpu_map__cpu(const struct perf_cpu_map *cpus, unsigned int idx)
{
	return RC_CHK_ACCESS(cpus)->map[idx];
}

static struct perf_cpu_map *cpu_map__trim_new(int nr_cpus, const struct perf_cpu *tmp_cpus)
static struct perf_cpu_map *cpu_map__trim_new(unsigned int nr_cpus, const struct perf_cpu *tmp_cpus)
{
	size_t payload_size = nr_cpus * sizeof(struct perf_cpu);
	struct perf_cpu_map *cpus = perf_cpu_map__alloc(nr_cpus);
	int i, j;

	if (cpus != NULL) {
		unsigned int j = 0;

		memcpy(RC_CHK_ACCESS(cpus)->map, tmp_cpus, payload_size);
		qsort(RC_CHK_ACCESS(cpus)->map, nr_cpus, sizeof(struct perf_cpu), cmp_cpu);
		/* Remove dups */
		j = 0;
		for (i = 0; i < nr_cpus; i++) {
		for (unsigned int i = 0; i < nr_cpus; i++) {
			if (i == 0 ||
			    __perf_cpu_map__cpu(cpus, i).cpu !=
			    __perf_cpu_map__cpu(cpus, i - 1).cpu) {
@@ -167,9 +165,8 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
	struct perf_cpu_map *cpus = NULL;
	unsigned long start_cpu, end_cpu = 0;
	char *p = NULL;
	int i, nr_cpus = 0;
	unsigned int nr_cpus = 0, max_entries = 0;
	struct perf_cpu *tmp_cpus = NULL, *tmp;
	int max_entries = 0;

	if (!cpu_list)
		return perf_cpu_map__new_online_cpus();
@@ -208,9 +205,10 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)

		for (; start_cpu <= end_cpu; start_cpu++) {
			/* check for duplicates */
			for (i = 0; i < nr_cpus; i++)
			for (unsigned int i = 0; i < nr_cpus; i++) {
				if (tmp_cpus[i].cpu == (int16_t)start_cpu)
					goto invalid;
			}

			if (nr_cpus == max_entries) {
				max_entries += max(end_cpu - start_cpu + 1, 16UL);
@@ -252,12 +250,12 @@ struct perf_cpu_map *perf_cpu_map__new_int(int cpu)
	return cpus;
}

static int __perf_cpu_map__nr(const struct perf_cpu_map *cpus)
static unsigned int __perf_cpu_map__nr(const struct perf_cpu_map *cpus)
{
	return RC_CHK_ACCESS(cpus)->nr;
}

struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx)
struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, unsigned int idx)
{
	struct perf_cpu result = {
		.cpu = -1
@@ -269,7 +267,7 @@ struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx)
	return result;
}

int perf_cpu_map__nr(const struct perf_cpu_map *cpus)
unsigned int perf_cpu_map__nr(const struct perf_cpu_map *cpus)
{
	return cpus ? __perf_cpu_map__nr(cpus) : 1;
}
@@ -294,7 +292,7 @@ bool perf_cpu_map__is_empty(const struct perf_cpu_map *map)

int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu)
{
	int low, high;
	unsigned int low, high;

	if (!cpus)
		return -1;
@@ -324,7 +322,7 @@ bool perf_cpu_map__has(const struct perf_cpu_map *cpus, struct perf_cpu cpu)

bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, const struct perf_cpu_map *rhs)
{
	int nr;
	unsigned int nr;

	if (lhs == rhs)
		return true;
@@ -336,7 +334,7 @@ bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, const struct perf_cpu_m
	if (nr != __perf_cpu_map__nr(rhs))
		return false;

	for (int idx = 0; idx < nr; idx++) {
	for (unsigned int idx = 0; idx < nr; idx++) {
		if (__perf_cpu_map__cpu(lhs, idx).cpu != __perf_cpu_map__cpu(rhs, idx).cpu)
			return false;
	}
@@ -353,7 +351,7 @@ struct perf_cpu perf_cpu_map__min(const struct perf_cpu_map *map)
	struct perf_cpu cpu, result = {
		.cpu = -1
	};
	int idx;
	unsigned int idx;

	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, map) {
		result = cpu;
@@ -384,7 +382,7 @@ bool perf_cpu_map__is_subset(const struct perf_cpu_map *a, const struct perf_cpu
	if (!a || __perf_cpu_map__nr(b) > __perf_cpu_map__nr(a))
		return false;

	for (int i = 0, j = 0; i < __perf_cpu_map__nr(a); i++) {
	for (unsigned int i = 0, j = 0; i < __perf_cpu_map__nr(a); i++) {
		if (__perf_cpu_map__cpu(a, i).cpu > __perf_cpu_map__cpu(b, j).cpu)
			return false;
		if (__perf_cpu_map__cpu(a, i).cpu == __perf_cpu_map__cpu(b, j).cpu) {
@@ -410,8 +408,7 @@ bool perf_cpu_map__is_subset(const struct perf_cpu_map *a, const struct perf_cpu
int perf_cpu_map__merge(struct perf_cpu_map **orig, struct perf_cpu_map *other)
{
	struct perf_cpu *tmp_cpus;
	int tmp_len;
	int i, j, k;
	unsigned int tmp_len, i, j, k;
	struct perf_cpu_map *merged;

	if (perf_cpu_map__is_subset(*orig, other))
@@ -455,7 +452,7 @@ int perf_cpu_map__merge(struct perf_cpu_map **orig, struct perf_cpu_map *other)
struct perf_cpu_map *perf_cpu_map__intersect(struct perf_cpu_map *orig,
					     struct perf_cpu_map *other)
{
	int i, j, k;
	unsigned int i, j, k;
	struct perf_cpu_map *merged;

	if (perf_cpu_map__is_subset(other, orig))
+6 −4
Original line number Diff line number Diff line
@@ -127,7 +127,8 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
		     struct perf_thread_map *threads)
{
	struct perf_cpu cpu;
	int idx, thread, err = 0;
	unsigned int idx;
	int thread, err = 0;

	if (cpus == NULL) {
		static struct perf_cpu_map *empty_cpu_map;
@@ -460,7 +461,7 @@ int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx)
int perf_evsel__enable_thread(struct perf_evsel *evsel, int thread)
{
	struct perf_cpu cpu __maybe_unused;
	int idx;
	unsigned int idx;
	int err;

	perf_cpu_map__for_each_cpu(cpu, idx, evsel->cpus) {
@@ -499,12 +500,13 @@ int perf_evsel__disable(struct perf_evsel *evsel)

int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter)
{
	int err = 0, i;
	int err = 0;

	for (i = 0; i < perf_cpu_map__nr(evsel->cpus) && !err; i++)
	for (unsigned int i = 0; i < perf_cpu_map__nr(evsel->cpus) && !err; i++) {
		err = perf_evsel__run_ioctl(evsel,
				     PERF_EVENT_IOC_SET_FILTER,
				     (void *)filter, i);
	}
	return err;
}

+3 −3
Original line number Diff line number Diff line
@@ -16,16 +16,16 @@
DECLARE_RC_STRUCT(perf_cpu_map) {
	refcount_t	refcnt;
	/** Length of the map array. */
	int		nr;
	unsigned int	nr;
	/** The CPU values. */
	struct perf_cpu	map[];
};

struct perf_cpu_map *perf_cpu_map__alloc(int nr_cpus);
struct perf_cpu_map *perf_cpu_map__alloc(unsigned int nr_cpus);
int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu);
bool perf_cpu_map__is_subset(const struct perf_cpu_map *a, const struct perf_cpu_map *b);

void perf_cpu_map__set_nr(struct perf_cpu_map *map, int nr_cpus);
void perf_cpu_map__set_nr(struct perf_cpu_map *map, unsigned int nr_cpus);

static inline refcount_t *perf_cpu_map__refcnt(struct perf_cpu_map *map)
{
+2 −2
Original line number Diff line number Diff line
@@ -49,7 +49,7 @@ LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map);
 * perf_cpu_map__cpu - get the CPU value at the given index. Returns -1 if index
 *                     is invalid.
 */
LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, unsigned int idx);
/**
 * perf_cpu_map__nr - for an empty map returns 1, as perf_cpu_map__cpu returns a
 *                    cpu of -1 for an invalid index, this makes an empty map
@@ -57,7 +57,7 @@ LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, i
 *                    the result is the number CPUs in the map plus one if the
 *                    "any CPU"/dummy value is present.
 */
LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
LIBPERF_API unsigned int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
/**
 * perf_cpu_map__has_any_cpu_or_is_empty - is map either empty or has the "any CPU"/dummy value.
 */
Loading