Commit 3d8dc43e authored Dec 06, 2024 by Hou Tao Committed by Alexei Starovoitov Dec 06, 2024

bpf: Switch to bpf mem allocator for LPM trie

Multiple syzbot warnings have been reported. These warnings are mainly
about the lock order between trie->lock and kmalloc()'s internal lock.
See report [1] as an example:

======================================================
WARNING: possible circular locking dependency detected
6.10.0-rc7-syzkaller-00003-g4376e966ecb7 #0 Not tainted
------------------------------------------------------
syz.3.2069/15008 is trying to acquire lock:
ffff88801544e6d8 (&n->list_lock){-.-.}-{2:2}, at: get_partial_node ...

but task is already holding lock:
ffff88802dcc89f8 (&trie->lock){-.-.}-{2:2}, at: trie_update_elem ...

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&trie->lock){-.-.}-{2:2}:
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x3a/0x60
       trie_delete_elem+0xb0/0x820
       ___bpf_prog_run+0x3e51/0xabd0
       __bpf_prog_run32+0xc1/0x100
       bpf_dispatcher_nop_func
       ......
       bpf_trace_run2+0x231/0x590
       __bpf_trace_contention_end+0xca/0x110
       trace_contention_end.constprop.0+0xea/0x170
       __pv_queued_spin_lock_slowpath+0x28e/0xcc0
       pv_queued_spin_lock_slowpath
       queued_spin_lock_slowpath
       queued_spin_lock
       do_raw_spin_lock+0x210/0x2c0
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x42/0x60
       __put_partials+0xc3/0x170
       qlink_free
       qlist_free_all+0x4e/0x140
       kasan_quarantine_reduce+0x192/0x1e0
       __kasan_slab_alloc+0x69/0x90
       kasan_slab_alloc
       slab_post_alloc_hook
       slab_alloc_node
       kmem_cache_alloc_node_noprof+0x153/0x310
       __alloc_skb+0x2b1/0x380
       ......

-> #0 (&n->list_lock){-.-.}-{2:2}:
       check_prev_add
       check_prevs_add
       validate_chain
       __lock_acquire+0x2478/0x3b30
       lock_acquire
       lock_acquire+0x1b1/0x560
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x3a/0x60
       get_partial_node.part.0+0x20/0x350
       get_partial_node
       get_partial
       ___slab_alloc+0x65b/0x1870
       __slab_alloc.constprop.0+0x56/0xb0
       __slab_alloc_node
       slab_alloc_node
       __do_kmalloc_node
       __kmalloc_node_noprof+0x35c/0x440
       kmalloc_node_noprof
       bpf_map_kmalloc_node+0x98/0x4a0
       lpm_trie_node_alloc
       trie_update_elem+0x1ef/0xe00
       bpf_map_update_value+0x2c1/0x6c0
       map_update_elem+0x623/0x910
       __sys_bpf+0x90c/0x49a0
       ...

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&trie->lock);
                               lock(&n->list_lock);
                               lock(&trie->lock);
  lock(&n->list_lock);

 *** DEADLOCK ***

[1]: https://syzkaller.appspot.com/bug?extid=9045c0a3d5a7f1b119f7



A bpf program attached to trace_contention_end() triggers after
acquiring &n->list_lock. The program invokes trie_delete_elem(), which
then acquires trie->lock. However, it is possible that another
process is invoking trie_update_elem(). trie_update_elem() will acquire
trie->lock first, then invoke kmalloc_node(). kmalloc_node() may invoke
get_partial_node() and try to acquire &n->list_lock (not necessarily the
same lock object). Therefore, lockdep warns about the circular locking
dependency.

Invoking kmalloc() before acquiring trie->lock could fix the warning.
However, since BPF programs call be invoked from any context (e.g.,
through kprobe/tracepoint/fentry), there may still be lock ordering
problems for internal locks in kmalloc() or trie->lock itself.

To eliminate these potential lock ordering problems with kmalloc()'s
internal locks, replacing kmalloc()/kfree()/kfree_rcu() with equivalent
BPF memory allocator APIs that can be invoked in any context. The lock
ordering problems with trie->lock (e.g., reentrance) will be handled
separately.

Three aspects of this change require explanation:

1. Intermediate and leaf nodes are allocated from the same allocator.
Since the value size of LPM trie is usually small, using a single
alocator reduces the memory overhead of the BPF memory allocator.

2. Leaf nodes are allocated before disabling IRQs. This handles cases
where leaf_size is large (e.g., > 4KB - 8) and updates require
intermediate node allocation. If leaf nodes were allocated in
IRQ-disabled region, the free objects in BPF memory allocator would not
be refilled timely and the intermediate node allocation may fail.

3. Paired migrate_{disable|enable}() calls for node alloc and free. The
BPF memory allocator uses per-CPU struct internally, these paired calls
are necessary to guarantee correctness.

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20241206110622.1161752-7-houtao@huaweicloud.com


Signed-off-by: Alexei Starovoitov <ast@kernel.org>

parent 27abc7b3

kernel/bpf/lpm_trie.c

+48 −23

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@
		#include <net/ipv6.h>
		#include <uapi/linux/btf.h>
		#include <linux/btf_ids.h>
		#include <linux/bpf_mem_alloc.h>

		/* Intermediate node */
		#define LPM_TREE_NODE_FLAG_IM BIT(0)
		@@ -22,7 +23,6 @@
		struct lpm_trie_node;

		struct lpm_trie_node {
		struct rcu_head rcu;
		struct lpm_trie_node __rcu *child[2];
		u32 prefixlen;
		u32 flags;
		@@ -32,6 +32,7 @@ struct lpm_trie_node {
		struct lpm_trie {
		struct bpf_map map;
		struct lpm_trie_node __rcu *root;
		struct bpf_mem_alloc ma;
		size_t n_entries;
		size_t max_prefixlen;
		size_t data_size;
		@@ -287,17 +288,18 @@ static void trie_lookup_elem(struct bpf_map map, void *_key)
		return found->data + trie->data_size;
		}

		static struct lpm_trie_node lpm_trie_node_alloc(const struct lpm_trie trie,
		const void *value)
		static struct lpm_trie_node lpm_trie_node_alloc(struct lpm_trie trie,
		const void *value,
		bool disable_migration)
		{
		struct lpm_trie_node *node;
		size_t size = sizeof(struct lpm_trie_node) + trie->data_size;

		if (value)
		size += trie->map.value_size;
		if (disable_migration)
		migrate_disable();
		node = bpf_mem_cache_alloc(&trie->ma);
		if (disable_migration)
		migrate_enable();

		node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT \| __GFP_NOWARN,
		trie->map.numa_node);
		if (!node)
		return NULL;

		@@ -325,7 +327,7 @@ static long trie_update_elem(struct bpf_map *map,
		void _key, void value, u64 flags)
		{
		struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
		struct lpm_trie_node node, im_node, *new_node = NULL;
		struct lpm_trie_node node, im_node, *new_node;
		struct lpm_trie_node *free_node = NULL;
		struct lpm_trie_node __rcu **slot;
		struct bpf_lpm_trie_key_u8 *key = _key;
		@@ -340,14 +342,14 @@ static long trie_update_elem(struct bpf_map *map,
		if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

		spin_lock_irqsave(&trie->lock, irq_flags);
		/* Allocate and fill a new node. Need to disable migration before
		* invoking bpf_mem_cache_alloc().
		*/
		new_node = lpm_trie_node_alloc(trie, value, true);
		if (!new_node)
		return -ENOMEM;

		/* Allocate and fill a new node */
		new_node = lpm_trie_node_alloc(trie, value);
		if (!new_node) {
		ret = -ENOMEM;
		goto out;
		}
		spin_lock_irqsave(&trie->lock, irq_flags);

		new_node->prefixlen = key->prefixlen;
		RCU_INIT_POINTER(new_node->child[0], NULL);
		@@ -423,7 +425,8 @@ static long trie_update_elem(struct bpf_map *map,
		goto out;
		}

		im_node = lpm_trie_node_alloc(trie, NULL);
		/* migration is disabled within the locked scope */
		im_node = lpm_trie_node_alloc(trie, NULL, false);
		if (!im_node) {
		trie->n_entries--;
		ret = -ENOMEM;
		@@ -447,10 +450,13 @@ static long trie_update_elem(struct bpf_map *map,
		rcu_assign_pointer(*slot, im_node);

		out:
		if (ret)
		kfree(new_node);
		spin_unlock_irqrestore(&trie->lock, irq_flags);
		kfree_rcu(free_node, rcu);

		migrate_disable();
		if (ret)
		bpf_mem_cache_free(&trie->ma, new_node);
		bpf_mem_cache_free_rcu(&trie->ma, free_node);
		migrate_enable();

		return ret;
		}
		@@ -548,8 +554,11 @@ static long trie_delete_elem(struct bpf_map map, void _key)

		out:
		spin_unlock_irqrestore(&trie->lock, irq_flags);
		kfree_rcu(free_parent, rcu);
		kfree_rcu(free_node, rcu);

		migrate_disable();
		bpf_mem_cache_free_rcu(&trie->ma, free_parent);
		bpf_mem_cache_free_rcu(&trie->ma, free_node);
		migrate_enable();

		return ret;
		}
		@@ -571,6 +580,8 @@ static long trie_delete_elem(struct bpf_map map, void _key)
		static struct bpf_map trie_alloc(union bpf_attr attr)
		{
		struct lpm_trie *trie;
		size_t leaf_size;
		int err;

		/* check sanity of attributes */
		if (attr->max_entries == 0 \|\|
		@@ -595,7 +606,17 @@ static struct bpf_map trie_alloc(union bpf_attr attr)

		spin_lock_init(&trie->lock);

		/* Allocate intermediate and leaf nodes from the same allocator */
		leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
		trie->map.value_size;
		err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
		if (err)
		goto free_out;
		return &trie->map;

		free_out:
		bpf_map_area_free(trie);
		return ERR_PTR(err);
		}

		static void trie_free(struct bpf_map *map)
		@@ -627,13 +648,17 @@ static void trie_free(struct bpf_map *map)
		continue;
		}

		kfree(node);
		/* No bpf program may access the map, so freeing the
		* node without waiting for the extra RCU GP.
		*/
		bpf_mem_cache_raw_free(node);
		RCU_INIT_POINTER(*slot, NULL);
		break;
		}
		}

		out:
		bpf_mem_alloc_destroy(&trie->ma);
		bpf_map_area_free(trie);
		}