Commit a819ff0c authored by Paolo Abeni's avatar Paolo Abeni
Browse files
Daniel Borkmann says:

====================
pull-request: bpf 2024-07-11

The following pull-request contains BPF updates for your *net* tree.

We've added 4 non-merge commits during the last 2 day(s) which contain
a total of 4 files changed, 262 insertions(+), 19 deletions(-).

The main changes are:

1) Fixes for a BPF timer lockup and a use-after-free scenario when timers
   are used concurrently, from Kumar Kartikeya Dwivedi.

2) Fix the argument order in the call to bpf_map_kvcalloc() which could
   otherwise lead to a compilation error, from Mohammad Shehar Yaar Tausif.

bpf-for-netdev

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  selftests/bpf: Add timer lockup selftest
  bpf: Defer work in bpf_timer_cancel_and_free
  bpf: Fail bpf_timer_cancel when callback is being cancelled
  bpf: fix order of args in call to bpf_map_kvcalloc
====================

Link: https://patch.msgid.link/20240711084016.25757-1-daniel@iogearbox.net


Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parents 626dfed5 50bd5a0c
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
	nbuckets = max_t(u32, 2, nbuckets);
	smap->bucket_log = ilog2(nbuckets);

	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
					 nbuckets, GFP_USER | __GFP_NOWARN);
	smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
					 sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
	if (!smap->buckets) {
		err = -ENOMEM;
		goto free_smap;
+82 −17
Original line number Diff line number Diff line
@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
	struct bpf_prog *prog;
	void __rcu *callback_fn;
	void *value;
	union {
		struct rcu_head rcu;
		struct work_struct delete_work;
	};
	u64 flags;
};

@@ -1107,6 +1110,7 @@ struct bpf_async_cb {
struct bpf_hrtimer {
	struct bpf_async_cb cb;
	struct hrtimer timer;
	atomic_t cancelling;
};

struct bpf_work {
@@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
	kfree_rcu(w, cb.rcu);
}

static void bpf_timer_delete_work(struct work_struct *work)
{
	struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);

	/* Cancel the timer and wait for callback to complete if it was running.
	 * If hrtimer_cancel() can be safely called it's safe to call
	 * kfree_rcu(t) right after for both preallocated and non-preallocated
	 * maps.  The async->cb = NULL was already done and no code path can see
	 * address 't' anymore. Timer if armed for existing bpf_hrtimer before
	 * bpf_timer_cancel_and_free will have been cancelled.
	 */
	hrtimer_cancel(&t->timer);
	kfree_rcu(t, cb.rcu);
}

static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
			    enum bpf_async_type type)
{
@@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
		clockid = flags & (MAX_CLOCKS - 1);
		t = (struct bpf_hrtimer *)cb;

		atomic_set(&t->cancelling, 0);
		INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
		t->timer.function = bpf_timer_cb;
		cb->value = (void *)async - map->record->timer_off;
@@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)

BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
	struct bpf_hrtimer *t;
	struct bpf_hrtimer *t, *cur_t;
	bool inc = false;
	int ret = 0;

	if (in_nmi())
@@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
		ret = -EINVAL;
		goto out;
	}
	if (this_cpu_read(hrtimer_running) == t) {

	cur_t = this_cpu_read(hrtimer_running);
	if (cur_t == t) {
		/* If bpf callback_fn is trying to bpf_timer_cancel()
		 * its own timer the hrtimer_cancel() will deadlock
		 * since it waits for callback_fn to finish
		 * since it waits for callback_fn to finish.
		 */
		ret = -EDEADLK;
		goto out;
	}

	/* Only account in-flight cancellations when invoked from a timer
	 * callback, since we want to avoid waiting only if other _callbacks_
	 * are waiting on us, to avoid introducing lockups. Non-callback paths
	 * are ok, since nobody would synchronously wait for their completion.
	 */
	if (!cur_t)
		goto drop;
	atomic_inc(&t->cancelling);
	/* Need full barrier after relaxed atomic_inc */
	smp_mb__after_atomic();
	inc = true;
	if (atomic_read(&cur_t->cancelling)) {
		/* We're cancelling timer t, while some other timer callback is
		 * attempting to cancel us. In such a case, it might be possible
		 * that timer t belongs to the other callback, or some other
		 * callback waiting upon it (creating transitive dependencies
		 * upon us), and we will enter a deadlock if we continue
		 * cancelling and waiting for it synchronously, since it might
		 * do the same. Bail!
		 */
		ret = -EDEADLK;
		goto out;
	}
drop:
	drop_prog_refcnt(&t->cb);
out:
	__bpf_spin_unlock_irqrestore(&timer->lock);
@@ -1467,6 +1516,8 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
	 * if it was running.
	 */
	ret = ret ?: hrtimer_cancel(&t->timer);
	if (inc)
		atomic_dec(&t->cancelling);
	rcu_read_unlock();
	return ret;
}
@@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)

	if (!t)
		return;
	/* Cancel the timer and wait for callback to complete if it was running.
	 * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
	 * right after for both preallocated and non-preallocated maps.
	 * The async->cb = NULL was already done and no code path can
	 * see address 't' anymore.
	 *
	 * Check that bpf_map_delete/update_elem() wasn't called from timer
	 * callback_fn. In such case don't call hrtimer_cancel() (since it will
	 * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
	 * return -1). Though callback_fn is still running on this cpu it's
	/* We check that bpf_map_delete/update_elem() was called from timer
	 * callback_fn. In such case we don't call hrtimer_cancel() (since it
	 * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
	 * just return -1). Though callback_fn is still running on this cpu it's
	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
	 * since async->cb = NULL was already done. The timer will be
	 * effectively cancelled because bpf_timer_cb() will return
	 * HRTIMER_NORESTART.
	 *
	 * However, it is possible the timer callback_fn calling us armed the
	 * timer _before_ calling us, such that failing to cancel it here will
	 * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
	 * Therefore, we _need_ to cancel any outstanding timers before we do
	 * kfree_rcu, even though no more timers can be armed.
	 *
	 * Moreover, we need to schedule work even if timer does not belong to
	 * the calling callback_fn, as on two different CPUs, we can end up in a
	 * situation where both sides run in parallel, try to cancel one
	 * another, and we end up waiting on both sides in hrtimer_cancel
	 * without making forward progress, since timer1 depends on time2
	 * callback to finish, and vice versa.
	 *
	 *  CPU 1 (timer1_cb)			CPU 2 (timer2_cb)
	 *  bpf_timer_cancel_and_free(timer2)	bpf_timer_cancel_and_free(timer1)
	 *
	 * To avoid these issues, punt to workqueue context when we are in a
	 * timer callback.
	 */
	if (this_cpu_read(hrtimer_running) != t)
		hrtimer_cancel(&t->timer);
	kfree_rcu(t, cb.rcu);
	if (this_cpu_read(hrtimer_running))
		queue_work(system_unbound_wq, &t->cb.delete_work);
	else
		bpf_timer_delete_work(&t->cb.delete_work);
}

/* This function is called by map_delete/update_elem for individual element and
+91 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0

#define _GNU_SOURCE
#include <sched.h>
#include <test_progs.h>
#include <pthread.h>
#include <network_helpers.h>

#include "timer_lockup.skel.h"

static long cpu;
static int *timer1_err;
static int *timer2_err;
static bool skip;

volatile int k = 0;

static void *timer_lockup_thread(void *arg)
{
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in	= &pkt_v4,
		.data_size_in	= sizeof(pkt_v4),
		.repeat		= 1000,
	);
	int i, prog_fd = *(int *)arg;
	cpu_set_t cpuset;

	CPU_ZERO(&cpuset);
	CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset);
	ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset),
				         &cpuset),
		  "cpu affinity");

	for (i = 0; !READ_ONCE(*timer1_err) && !READ_ONCE(*timer2_err); i++) {
		bpf_prog_test_run_opts(prog_fd, &opts);
		/* Skip the test if we can't reproduce the race in a reasonable
		 * amount of time.
		 */
		if (i > 50) {
			WRITE_ONCE(skip, true);
			break;
		}
	}

	return NULL;
}

void test_timer_lockup(void)
{
	int timer1_prog, timer2_prog;
	struct timer_lockup *skel;
	pthread_t thrds[2];
	void *ret;

	skel = timer_lockup__open_and_load();
	if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load"))
		return;

	timer1_prog = bpf_program__fd(skel->progs.timer1_prog);
	timer2_prog = bpf_program__fd(skel->progs.timer2_prog);

	timer1_err = &skel->bss->timer1_err;
	timer2_err = &skel->bss->timer2_err;

	if (!ASSERT_OK(pthread_create(&thrds[0], NULL, timer_lockup_thread,
				      &timer1_prog),
		       "pthread_create thread1"))
		goto out;
	if (!ASSERT_OK(pthread_create(&thrds[1], NULL, timer_lockup_thread,
				      &timer2_prog),
		       "pthread_create thread2")) {
		pthread_exit(&thrds[0]);
		goto out;
	}

	pthread_join(thrds[1], &ret);
	pthread_join(thrds[0], &ret);

	if (skip) {
		test__skip();
		goto out;
	}

	if (*timer1_err != -EDEADLK && *timer1_err != 0)
		ASSERT_FAIL("timer1_err bad value");
	if (*timer2_err != -EDEADLK && *timer2_err != 0)
		ASSERT_FAIL("timer2_err bad value");
out:
	timer_lockup__destroy(skel);
	return;
}
+87 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0

#include <linux/bpf.h>
#include <time.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_misc.h"

char _license[] SEC("license") = "GPL";

struct elem {
	struct bpf_timer t;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} timer1_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} timer2_map SEC(".maps");

int timer1_err;
int timer2_err;

static int timer_cb1(void *map, int *k, struct elem *v)
{
	struct bpf_timer *timer;
	int key = 0;

	timer = bpf_map_lookup_elem(&timer2_map, &key);
	if (timer)
		timer2_err = bpf_timer_cancel(timer);

	return 0;
}

static int timer_cb2(void *map, int *k, struct elem *v)
{
	struct bpf_timer *timer;
	int key = 0;

	timer = bpf_map_lookup_elem(&timer1_map, &key);
	if (timer)
		timer1_err = bpf_timer_cancel(timer);

	return 0;
}

SEC("tc")
int timer1_prog(void *ctx)
{
	struct bpf_timer *timer;
	int key = 0;

	timer = bpf_map_lookup_elem(&timer1_map, &key);
	if (timer) {
		bpf_timer_init(timer, &timer1_map, CLOCK_BOOTTIME);
		bpf_timer_set_callback(timer, timer_cb1);
		bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN);
	}

	return 0;
}

SEC("tc")
int timer2_prog(void *ctx)
{
	struct bpf_timer *timer;
	int key = 0;

	timer = bpf_map_lookup_elem(&timer2_map, &key);
	if (timer) {
		bpf_timer_init(timer, &timer2_map, CLOCK_BOOTTIME);
		bpf_timer_set_callback(timer, timer_cb2);
		bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN);
	}

	return 0;
}