Commit 13f35928, authored by Namhyung Kim, committed by Arnaldo Carvalho de Melo
Browse files

perf lock contention: Symbolize zone->lock using BTF



The struct zone is embedded in struct pglist_data which can be allocated
for each NUMA node early in the boot process.  As it's not a slab object
nor a global lock, this was not symbolized.

Since the zone->lock is often contended, it'd be nice if we can
symbolize it.  On NUMA systems, node_data array will have pointers for
struct pglist_data.  By following the pointer, it can calculate the
address of each zone and its lock using BTF.  On UMA, it can just use
contig_page_data and its zones.

The following example shows the zone lock contention at the end.

  $ sudo ./perf lock con -abl -E 5 -- ./perf bench sched messaging
  # Running 'sched/messaging' benchmark:
  # 20 sender and receiver processes per group
  # 10 groups == 400 processes run

       Total time: 0.038 [sec]
   contended   total wait     max wait     avg wait            address   symbol

        5167     18.17 ms     10.27 us      3.52 us   ffff953340052d00   &kmem_cache_node (spinlock)
          38     11.75 ms    465.49 us    309.13 us   ffff95334060c480   &sock_inode_cache (spinlock)
        3916     10.13 ms     10.43 us      2.59 us   ffff953342aecb40   &kmem_cache_node (spinlock)
        2963     10.02 ms     13.75 us      3.38 us   ffff9533d2344098   &kmalloc-rnd-08-2k (spinlock)
         216      5.05 ms     99.49 us     23.39 us   ffff9542bf7d65d0   zone_lock (spinlock)

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: bpf@vger.kernel.org
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20250401063055.7431-1-namhyung@kernel.org


Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent 2d099cca
Loading
Loading
Loading
Loading
+82 −6
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#include "util/lock-contention.h"
#include <linux/zalloc.h>
#include <linux/string.h>
#include <api/fs/fs.h>
#include <bpf/bpf.h>
#include <bpf/btf.h>
#include <inttypes.h>
@@ -35,28 +36,26 @@ static bool slab_cache_equal(long key1, long key2, void *ctx __maybe_unused)

static void check_slab_cache_iter(struct lock_contention *con)
{
	struct btf *btf = btf__load_vmlinux_btf();
	s32 ret;

	hashmap__init(&slab_hash, slab_cache_hash, slab_cache_equal, /*ctx=*/NULL);

	if (btf == NULL) {
	con->btf = btf__load_vmlinux_btf();
	if (con->btf == NULL) {
		pr_debug("BTF loading failed: %s\n", strerror(errno));
		return;
	}

	ret = btf__find_by_name_kind(btf, "bpf_iter__kmem_cache", BTF_KIND_STRUCT);
	ret = btf__find_by_name_kind(con->btf, "bpf_iter__kmem_cache", BTF_KIND_STRUCT);
	if (ret < 0) {
		bpf_program__set_autoload(skel->progs.slab_cache_iter, false);
		pr_debug("slab cache iterator is not available: %d\n", ret);
		goto out;
		return;
	}

	has_slab_iter = true;

	bpf_map__set_max_entries(skel->maps.slab_caches, con->map_nr_entries);
out:
	btf__free(btf);
}

static void run_slab_cache_iter(void)
@@ -109,6 +108,75 @@ static void exit_slab_cache_iter(void)
	hashmap__clear(&slab_hash);
}

/*
 * Fill in the BPF read-only data used to locate every zone->lock.
 *
 * Stores the BTF-resolved size of 'struct zone' in sizeof_zone, then
 * either the address of 'contig_page_data' (when that symbol exists) or
 * the address of 'node_data' together with nr_nodes.
 *
 * Every failure is non-fatal: the function returns early and leaves the
 * rodata fields it did not reach untouched.
 */
static void init_numa_data(struct lock_contention *con)
{
	struct symbol *sym;
	struct map *kmap;
	char *buf = NULL, *p;
	size_t len;
	long last = -1;
	int ret;

	/*
	 * check_slab_cache_iter() leaves con->btf NULL when the vmlinux BTF
	 * cannot be loaded.  Do not hand a NULL handle to the lookups below.
	 */
	if (con->btf == NULL)
		return;

	/*
	 * 'struct zone' is embedded in 'struct pglist_data' as an array.
	 * As we may not have full information of the struct zone in the
	 * (fake) vmlinux.h, let's get the actual size from BTF.
	 */
	ret = btf__find_by_name_kind(con->btf, "zone", BTF_KIND_STRUCT);
	if (ret < 0) {
		pr_debug("cannot get type of struct zone: %d\n", ret);
		return;
	}

	ret = btf__resolve_size(con->btf, ret);
	if (ret < 0) {
		pr_debug("cannot get size of struct zone: %d\n", ret);
		return;
	}
	skel->rodata->sizeof_zone = ret;

	/* UMA system doesn't have 'node_data[]' - just use contig_page_data. */
	sym = machine__find_kernel_symbol_by_name(con->machine,
						  "contig_page_data",
						  &kmap);
	if (sym) {
		skel->rodata->contig_page_data_addr = map__unmap_ip(kmap, sym->start);
		map__put(kmap);
		return;
	}

	/*
	 * The 'node_data' is an array of pointers to struct pglist_data.
	 * It needs to follow the pointer for each node in BPF to get the
	 * address of struct pglist_data and its zones.
	 */
	sym = machine__find_kernel_symbol_by_name(con->machine,
						  "node_data",
						  &kmap);
	if (sym == NULL)
		return;

	skel->rodata->node_data_addr = map__unmap_ip(kmap, sym->start);
	map__put(kmap);

	/* get the number of online nodes using the last node number + 1 */
	ret = sysfs__read_str("devices/system/node/online", &buf, &len);
	if (ret < 0) {
		pr_debug("failed to read online node: %d\n", ret);
		return;
	}

	/*
	 * The text is a list of numbers separated by ',' or '-' and ended by
	 * a newline.  Only the last number parsed is kept.
	 */
	p = buf;
	while (p && *p) {
		char *end;
		long node;

		node = strtol(p, &end, 0);

		/*
		 * Nothing was consumed: stop here rather than retrying the
		 * same byte forever, and keep the last valid node number.
		 */
		if (end == p)
			break;

		last = node;
		p = end;

		if (*p == ',' || *p == '-' || *p == '\n')
			p++;
	}
	skel->rodata->nr_nodes = last + 1;
	free(buf);
}

int lock_contention_prepare(struct lock_contention *con)
{
	int i, fd;
@@ -218,6 +286,8 @@ int lock_contention_prepare(struct lock_contention *con)

	bpf_map__set_max_entries(skel->maps.slab_filter, nslabs);

	init_numa_data(con);

	if (lock_contention_bpf__load(skel) < 0) {
		pr_err("Failed to load lock-contention BPF skeleton\n");
		return -1;
@@ -505,6 +575,11 @@ static const char *lock_contention_get_name(struct lock_contention *con,
				return "rq_lock";
		}

		if (!bpf_map_lookup_elem(lock_fd, &key->lock_addr_or_cgroup, &flags)) {
			if (flags == LOCK_CLASS_ZONE_LOCK)
				return "zone_lock";
		}

		/* look slab_hash for dynamic locks in a slab object */
		if (hashmap__find(&slab_hash, flags & LCB_F_SLAB_ID_MASK, &slab_data)) {
			snprintf(name_buf, sizeof(name_buf), "&%s", slab_data->name);
@@ -743,6 +818,7 @@ int lock_contention_finish(struct lock_contention *con)
	}

	exit_slab_cache_iter();
	btf__free(con->btf);

	return 0;
}
+64 −0
Original line number Diff line number Diff line
@@ -11,6 +11,9 @@
/* for collect_lock_syms().  4096 was rejected by the verifier */
#define MAX_CPUS  1024

/*
 * Fixed loop bound for collect_zone_lock().  It must be at least the
 * number of zones in a node, or the zones past the bound are not collected.
 */
#define MAX_ZONES  10

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
@@ -801,6 +804,11 @@ int contention_end(u64 *ctx)

extern struct rq runqueues __ksym;

/*
 * Read-only inputs for collect_zone_lock().  User space writes them via
 * skel->rodata in init_numa_data(), before the skeleton is loaded.
 */
/* address of 'contig_page_data'; when non-zero the single-pgdat path is used */
const volatile __u64 contig_page_data_addr;
/* address of 'node_data', an array of pointers to struct pglist_data */
const volatile __u64 node_data_addr;
/* last online node number + 1, used as the node_data[] loop bound */
const volatile int nr_nodes;
/* size of the kernel's 'struct zone' as resolved from BTF */
const volatile int sizeof_zone;

/*
 * Partial view of a runqueue that declares only a 'lock' member.
 * NOTE(review): presumably the older 'struct rq' layout, paired with
 * rq___new for the renamed member - confirm against collect_lock_syms().
 */
struct rq___old {
	raw_spinlock_t lock;
} __attribute__((preserve_access_index));
@@ -809,6 +817,59 @@ struct rq___new {
	raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

/*
 * Record the address of every zone->lock in the lock_syms map, tagged
 * with LOCK_CLASS_ZONE_LOCK.
 *
 * When contig_page_data_addr is set, the zones of that single
 * pglist_data are walked.  Otherwise, when nr_nodes is positive, each
 * node_data[] pointer is followed and the zones of every node found
 * are walked.  With neither set, nothing is recorded.
 */
static void collect_zone_lock(void)
{
	__u64 nr_zones, zone_off;
	__u64 lock_addr, lock_off;
	__u32 lock_flag = LOCK_CLASS_ZONE_LOCK;

	zone_off = offsetof(struct pglist_data, node_zones);
	lock_off = offsetof(struct zone, lock);

	if (contig_page_data_addr) {
		struct pglist_data *contig_page_data;

		contig_page_data = (void *)(long)contig_page_data_addr;
		nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);

		/* fixed upper bound; the real count ends the loop early */
		for (int i = 0; i < MAX_ZONES; i++) {
			__u64 zone_addr;

			if (i >= nr_zones)
				break;

			/*
			 * The local 'struct zone' declares only the lock, so
			 * the array stride comes from sizeof_zone instead of
			 * sizeof(struct zone).
			 */
			zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
			lock_addr = zone_addr + lock_off;

			bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
		}
	} else if (nr_nodes > 0) {
		struct pglist_data **node_data = (void *)(long)node_data_addr;

		for (int i = 0; i < nr_nodes; i++) {
			struct pglist_data *pgdat = NULL;
			int err;

			err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
			if (err < 0)
				break;

			/*
			 * nr_nodes is the last online node number + 1, so the
			 * range may contain unused node IDs with a NULL entry.
			 * Skip the hole; later nodes still need to be visited.
			 */
			if (pgdat == NULL)
				continue;

			nr_zones = BPF_CORE_READ(pgdat, nr_zones);
			for (int k = 0; k < MAX_ZONES; k++) {
				__u64 zone_addr;

				if (k >= nr_zones)
					break;

				zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
				lock_addr = zone_addr + lock_off;

				bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
			}
		}
	}
}

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
@@ -830,6 +891,9 @@ int BPF_PROG(collect_lock_syms)
		lock_flag = LOCK_CLASS_RQLOCK;
		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
	}

	collect_zone_lock();

	return 0;
}

+1 −0
Original line number Diff line number Diff line
@@ -67,6 +67,7 @@ enum lock_aggr_mode {
enum lock_class_sym {
	LOCK_CLASS_NONE,
	LOCK_CLASS_RQLOCK,
	LOCK_CLASS_ZONE_LOCK,
};

struct slab_cache_data {
+9 −0
Original line number Diff line number Diff line
@@ -203,4 +203,13 @@ struct bpf_iter__kmem_cache {
	struct kmem_cache *s;
} __attribute__((preserve_access_index));

/*
 * Minimal view of the kernel's 'struct zone': only the lock is declared.
 * sizeof() of this type therefore does not give the real size, which
 * is taken from BTF at run time instead.
 */
struct zone {
	spinlock_t lock;
} __attribute__((preserve_access_index));

/*
 * Minimal view of the kernel's 'struct pglist_data': the embedded zone
 * array and nr_zones, which bounds the zone loop in collect_zone_lock().
 */
struct pglist_data {
	/*
	 * NOTE(review): 6 looks like a placeholder meant to cover the zone
	 * count of every kernel config - confirm against MAX_NR_ZONES.
	 */
	struct zone node_zones[6]; /* value for all possible config */
	int nr_zones;
} __attribute__((preserve_access_index));

#endif // __VMLINUX_H
+1 −0
Original line number Diff line number Diff line
@@ -142,6 +142,7 @@ struct lock_contention {
	struct lock_filter *filters;
	struct lock_contention_fails fails;
	struct rb_root cgroups;
	void *btf;
	unsigned long map_nr_entries;
	int max_stack;
	int stack_skip;