Commit 8c0d9e17 authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'bpf-cpumap-devmap-fix-per-cpu-bulk-queue-races-on-preempt_rt'

Jiayuan Chen says:

====================
bpf: Fix per-CPU bulk queue races on PREEMPT_RT

On PREEMPT_RT kernels, local_bh_disable() only calls migrate_disable()
(when PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable
preemption. This means CFS scheduling can preempt a task inside the
per-CPU bulk queue (bq) operations in cpumap and devmap, allowing
another task on the same CPU to concurrently access the same bq,
leading to use-after-free, list corruption, and kernel panics.

Patch 1 fixes the cpumap race in bq_flush_to_queue(), originally
reported by syzbot [1].

Patch 2 fixes the same class of race in devmap's bq_xmit_all(),
identified by code inspection after Sebastian Andrzej Siewior pointed
out that devmap has the same per-CPU bulk queue pattern [2].

Both patches use local_lock_nested_bh() to serialize access to the
per-CPU bq. On non-RT this is a pure lockdep annotation with no
overhead; on PREEMPT_RT it provides a per-CPU sleeping lock.

To reproduce the devmap race, insert an mdelay(100) in bq_xmit_all()
after "cnt = bq->count" and before the actual transmit loop. Then pin
two threads to the same CPU, each running BPF_PROG_TEST_RUN with an XDP
program that redirects to a DEVMAP entry (e.g. a veth pair). CFS
timeslicing during the mdelay window causes interleaving. Without the
fix, KASAN reports null-ptr-deref due to operating on freed frames:

  BUG: KASAN: null-ptr-deref in __build_skb_around+0x22d/0x340
  Write of size 32 at addr 0000000000000d50 by task devmap_race_rep/449

  CPU: 0 UID: 0 PID: 449 Comm: devmap_race_rep Not tainted 6.19.0+ #31 PREEMPT_RT
  Call Trace:
   <TASK>
   __build_skb_around+0x22d/0x340
   build_skb_around+0x25/0x260
   __xdp_build_skb_from_frame+0x103/0x860
   veth_xdp_rcv_bulk_skb.isra.0+0x162/0x320
   veth_xdp_rcv.constprop.0+0x61e/0xbb0
   veth_poll+0x280/0xb50
   __napi_poll.constprop.0+0xa5/0x590
   net_rx_action+0x4b0/0xea0
   handle_softirqs.isra.0+0x1b3/0x780
   __local_bh_enable_ip+0x12a/0x240
   xdp_test_run_batch.constprop.0+0xedd/0x1f60
   bpf_test_run_xdp_live+0x304/0x640
   bpf_prog_test_run_xdp+0xd24/0x1b70
   __sys_bpf+0x61c/0x3e00
   </TASK>

  Kernel panic - not syncing: Fatal exception in interrupt

[1] https://lore.kernel.org/all/69369331.a70a0220.38f243.009d.GAE@google.com/T/
[2] https://lore.kernel.org/bpf/20260212023634.366343-1-jiayuan.chen@linux.dev/

v3 -> v4: https://lore.kernel.org/all/20260213034018.284146-1-jiayuan.chen@linux.dev/
- Move panic trace to cover letter. (Sebastian Andrzej Siewior)
- Add Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> to both patches
  from cover letter.

v2 -> v3: https://lore.kernel.org/bpf/20260212023634.366343-1-jiayuan.chen@linux.dev/
- Fix commit message: remove incorrect "spin_lock() becomes rt_mutex"
claim, the per-CPU bq has no spin_lock at all. (Sebastian Andrzej Siewior)
- Fix commit message: accurately describe local_lock_nested_bh()
behavior instead of referencing local_lock(). (Sebastian Andrzej Siewior)
- Remove incomplete discussion of snapshot alternative.
(Sebastian Andrzej Siewior)
- Remove panic trace from commit message. (Sebastian Andrzej Siewior)
- Add patch 2/2 for devmap, same race pattern. (Sebastian Andrzej Siewior)

v1 -> v2: https://lore.kernel.org/bpf/20260211064417.196401-1-jiayuan.chen@linux.dev/
- Use local_lock_nested_bh()/local_unlock_nested_bh() instead of
local_lock()/local_unlock(), since these paths already run under
local_bh_disable(). (Sebastian Andrzej Siewior)
- Replace "Caller must hold bq->bq_lock" comment with
lockdep_assert_held() in bq_flush_to_queue(). (Sebastian Andrzej Siewior)
- Fix Fixes tag to 3253cb49 ("softirq: Allow to drop the
softirq-BKL lock on PREEMPT_RT") which is the actual commit that
makes the race possible. (Sebastian Andrzej Siewior)
====================

Link: https://patch.msgid.link/20260225121459.183121-1-jiayuan.chen@linux.dev


Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 5263e30f 1872e753
Loading
Loading
Loading
Loading
+15 −2
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/local_lock.h>
#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
@@ -52,6 +53,7 @@ struct xdp_bulk_queue {
	struct list_head flush_node;
	struct bpf_cpu_map_entry *obj;
	unsigned int count;
	local_lock_t bq_lock;
};

/* Struct for every remote "destination" CPU in map */
@@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
	for_each_possible_cpu(i) {
		bq = per_cpu_ptr(rcpu->bulkq, i);
		bq->obj = rcpu;
		local_lock_init(&bq->bq_lock);
	}

	/* Alloc queue */
@@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
	struct ptr_ring *q;
	int i;

	lockdep_assert_held(&bq->bq_lock);

	if (unlikely(!bq->count))
		return;

@@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
}

/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 * Thus, safe percpu variable access. PREEMPT_RT relies on
 * local_lock_nested_bh() to serialise access to the per-CPU bq.
 */
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
	struct xdp_bulk_queue *bq;

	local_lock_nested_bh(&rcpu->bulkq->bq_lock);
	bq = this_cpu_ptr(rcpu->bulkq);

	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
		bq_flush_to_queue(bq);
@@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)

		list_add(&bq->flush_node, flush_list);
	}

	local_unlock_nested_bh(&rcpu->bulkq->bq_lock);
}

int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list)
	struct xdp_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		local_lock_nested_bh(&bq->obj->bulkq->bq_lock);
		bq_flush_to_queue(bq);
		local_unlock_nested_bh(&bq->obj->bulkq->bq_lock);

		/* If already running, costs spin_lock_irqsave + smb_mb */
		wake_up_process(bq->obj->kthread);
+21 −4
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@
 * types of devmap; only the lookup and insertion is different.
 */
#include <linux/bpf.h>
#include <linux/local_lock.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
@@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue {
	struct net_device *dev_rx;
	struct bpf_prog *xdp_prog;
	unsigned int count;
	local_lock_t bq_lock;
};

struct bpf_dtab_netdev {
@@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
	int to_send = cnt;
	int i;

	lockdep_assert_held(&bq->bq_lock);

	if (unlikely(!cnt))
		return;

@@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list)
	struct xdp_dev_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
		bq_xmit_all(bq, XDP_XMIT_FLUSH);
		bq->dev_rx = NULL;
		bq->xdp_prog = NULL;
		__list_del_clearprev(&bq->flush_node);
		local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
	}
}

@@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)

/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
 * variable access, and map elements stick around. See comment above
 * xdp_do_flush() in filter.c.
 * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh()
 * to serialise access to the per-CPU bq.
 */
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
	struct xdp_dev_bulk_queue *bq;

	local_lock_nested_bh(&dev->xdp_bulkq->bq_lock);
	bq = this_cpu_ptr(dev->xdp_bulkq);

	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
		bq_xmit_all(bq, 0);
@@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
	}

	bq->q[bq->count++] = xdpf;

	local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock);
}

static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
@@ -1127,8 +1139,13 @@ static int dev_map_notification(struct notifier_block *notifier,
		if (!netdev->xdp_bulkq)
			return NOTIFY_BAD;

		for_each_possible_cpu(cpu)
			per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
		for_each_possible_cpu(cpu) {
			struct xdp_dev_bulk_queue *bq;

			bq = per_cpu_ptr(netdev->xdp_bulkq, cpu);
			bq->dev = netdev;
			local_lock_init(&bq->bq_lock);
		}
		break;
	case NETDEV_UNREGISTER:
		/* This rcu_read_lock/unlock pair is needed because