Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf (a819ff0c) · Commits · git / linux-net

kernel/bpf/bpf_local_storage.c

+2 −2

Original line number	Diff line number	Diff line
		@@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
		nbuckets = max_t(u32, 2, nbuckets);
		smap->bucket_log = ilog2(nbuckets);

		smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
		nbuckets, GFP_USER \| __GFP_NOWARN);
		smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
		sizeof(*smap->buckets), GFP_USER \| __GFP_NOWARN);
		if (!smap->buckets) {
		err = -ENOMEM;
		goto free_smap;

kernel/bpf/helpers.c

+82 −17

Original line number	Diff line number	Diff line
		@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
		struct bpf_prog *prog;
		void __rcu *callback_fn;
		void *value;
		union {
		struct rcu_head rcu;
		struct work_struct delete_work;
		};
		u64 flags;
		};

		@@ -1107,6 +1110,7 @@ struct bpf_async_cb {
		struct bpf_hrtimer {
		struct bpf_async_cb cb;
		struct hrtimer timer;
		atomic_t cancelling;
		};

		struct bpf_work {
		@@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
		kfree_rcu(w, cb.rcu);
		}

		static void bpf_timer_delete_work(struct work_struct *work)
		{
		struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);

		/* Cancel the timer and wait for callback to complete if it was running.
		* If hrtimer_cancel() can be safely called it's safe to call
		* kfree_rcu(t) right after for both preallocated and non-preallocated
		* maps. The async->cb = NULL was already done and no code path can see
		* address 't' anymore. Timer if armed for existing bpf_hrtimer before
		* bpf_timer_cancel_and_free will have been cancelled.
		*/
		hrtimer_cancel(&t->timer);
		kfree_rcu(t, cb.rcu);
		}

		static int __bpf_async_init(struct bpf_async_kern async, struct bpf_map map, u64 flags,
		enum bpf_async_type type)
		{
		@@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern async, struct bpf_map map, u
		clockid = flags & (MAX_CLOCKS - 1);
		t = (struct bpf_hrtimer *)cb;

		atomic_set(&t->cancelling, 0);
		INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
		t->timer.function = bpf_timer_cb;
		cb->value = (void *)async - map->record->timer_off;
		@@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)

		BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
		{
		struct bpf_hrtimer *t;
		struct bpf_hrtimer t, cur_t;
		bool inc = false;
		int ret = 0;

		if (in_nmi())
		@@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
		ret = -EINVAL;
		goto out;
		}
		if (this_cpu_read(hrtimer_running) == t) {

		cur_t = this_cpu_read(hrtimer_running);
		if (cur_t == t) {
		/* If bpf callback_fn is trying to bpf_timer_cancel()
		* its own timer the hrtimer_cancel() will deadlock
		* since it waits for callback_fn to finish
		* since it waits for callback_fn to finish.
		*/
		ret = -EDEADLK;
		goto out;
		}

		/* Only account in-flight cancellations when invoked from a timer
		* callback, since we want to avoid waiting only if other _callbacks_
		* are waiting on us, to avoid introducing lockups. Non-callback paths
		* are ok, since nobody would synchronously wait for their completion.
		*/
		if (!cur_t)
		goto drop;
		atomic_inc(&t->cancelling);
		/* Need full barrier after relaxed atomic_inc */
		smp_mb__after_atomic();
		inc = true;
		if (atomic_read(&cur_t->cancelling)) {
		/* We're cancelling timer t, while some other timer callback is
		* attempting to cancel us. In such a case, it might be possible
		* that timer t belongs to the other callback, or some other
		* callback waiting upon it (creating transitive dependencies
		* upon us), and we will enter a deadlock if we continue
		* cancelling and waiting for it synchronously, since it might
		* do the same. Bail!
		*/
		ret = -EDEADLK;
		goto out;
		}
		drop:
		drop_prog_refcnt(&t->cb);
		out:
		__bpf_spin_unlock_irqrestore(&timer->lock);
		@@ -1467,6 +1516,8 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
		* if it was running.
		*/
		ret = ret ?: hrtimer_cancel(&t->timer);
		if (inc)
		atomic_dec(&t->cancelling);
		rcu_read_unlock();
		return ret;
		}
		@@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)

		if (!t)
		return;
		/* Cancel the timer and wait for callback to complete if it was running.
		* If hrtimer_cancel() can be safely called it's safe to call kfree(t)
		* right after for both preallocated and non-preallocated maps.
		* The async->cb = NULL was already done and no code path can
		* see address 't' anymore.
		*
		* Check that bpf_map_delete/update_elem() wasn't called from timer
		* callback_fn. In such case don't call hrtimer_cancel() (since it will
		* deadlock) and don't call hrtimer_try_to_cancel() (since it will just
		* return -1). Though callback_fn is still running on this cpu it's
		/* We check that bpf_map_delete/update_elem() was called from timer
		* callback_fn. In such case we don't call hrtimer_cancel() (since it
		* will deadlock) and don't call hrtimer_try_to_cancel() (since it will
		* just return -1). Though callback_fn is still running on this cpu it's
		* safe to do kfree(t) because bpf_timer_cb() read everything it needed
		* from 't'. The bpf subprog callback_fn won't be able to access 't',
		* since async->cb = NULL was already done. The timer will be
		* effectively cancelled because bpf_timer_cb() will return
		* HRTIMER_NORESTART.
		*
		* However, it is possible the timer callback_fn calling us armed the
		* timer _before_ calling us, such that failing to cancel it here will
		* cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
		* Therefore, we _need_ to cancel any outstanding timers before we do
		* kfree_rcu, even though no more timers can be armed.
		*
		* Moreover, we need to schedule work even if timer does not belong to
		* the calling callback_fn, as on two different CPUs, we can end up in a
		* situation where both sides run in parallel, try to cancel one
		* another, and we end up waiting on both sides in hrtimer_cancel
		* without making forward progress, since timer1 depends on time2
		* callback to finish, and vice versa.
		*
		* CPU 1 (timer1_cb) CPU 2 (timer2_cb)
		* bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
		*
		* To avoid these issues, punt to workqueue context when we are in a
		* timer callback.
		*/
		if (this_cpu_read(hrtimer_running) != t)
		hrtimer_cancel(&t->timer);
		kfree_rcu(t, cb.rcu);
		if (this_cpu_read(hrtimer_running))
		queue_work(system_unbound_wq, &t->cb.delete_work);
		else
		bpf_timer_delete_work(&t->cb.delete_work);
		}

		/* This function is called by map_delete/update_elem for individual element and

tools/testing/selftests/bpf/prog_tests/timer_lockup.c

0 → 100644

+91 −0

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0

		#define _GNU_SOURCE
		#include <sched.h>
		#include <test_progs.h>
		#include <pthread.h>
		#include <network_helpers.h>

		#include "timer_lockup.skel.h"

		static long cpu;
		static int *timer1_err;
		static int *timer2_err;
		static bool skip;

		volatile int k = 0;

		static void timer_lockup_thread(void arg)
		{
		LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = &pkt_v4,
		.data_size_in = sizeof(pkt_v4),
		.repeat = 1000,
		);
		int i, prog_fd = (int )arg;
		cpu_set_t cpuset;

		CPU_ZERO(&cpuset);
		CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset);
		ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset),
		&cpuset),
		"cpu affinity");

		for (i = 0; !READ_ONCE(timer1_err) && !READ_ONCE(timer2_err); i++) {
		bpf_prog_test_run_opts(prog_fd, &opts);
		/* Skip the test if we can't reproduce the race in a reasonable
		* amount of time.
		*/
		if (i > 50) {
		WRITE_ONCE(skip, true);
		break;
		}
		}

		return NULL;
		}

		void test_timer_lockup(void)
		{
		int timer1_prog, timer2_prog;
		struct timer_lockup *skel;
		pthread_t thrds[2];
		void *ret;

		skel = timer_lockup__open_and_load();
		if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load"))
		return;

		timer1_prog = bpf_program__fd(skel->progs.timer1_prog);
		timer2_prog = bpf_program__fd(skel->progs.timer2_prog);

		timer1_err = &skel->bss->timer1_err;
		timer2_err = &skel->bss->timer2_err;

		if (!ASSERT_OK(pthread_create(&thrds[0], NULL, timer_lockup_thread,
		&timer1_prog),
		"pthread_create thread1"))
		goto out;
		if (!ASSERT_OK(pthread_create(&thrds[1], NULL, timer_lockup_thread,
		&timer2_prog),
		"pthread_create thread2")) {
		pthread_exit(&thrds[0]);
		goto out;
		}

		pthread_join(thrds[1], &ret);
		pthread_join(thrds[0], &ret);

		if (skip) {
		test__skip();
		goto out;
		}

		if (timer1_err != -EDEADLK && timer1_err != 0)
		ASSERT_FAIL("timer1_err bad value");
		if (timer2_err != -EDEADLK && timer2_err != 0)
		ASSERT_FAIL("timer2_err bad value");
		out:
		timer_lockup__destroy(skel);
		return;
		}

tools/testing/selftests/bpf/progs/timer_lockup.c

0 → 100644

+87 −0

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0

		#include <linux/bpf.h>
		#include <time.h>
		#include <errno.h>
		#include <bpf/bpf_helpers.h>
		#include <bpf/bpf_tracing.h>
		#include "bpf_misc.h"

		char _license[] SEC("license") = "GPL";

		struct elem {
		struct bpf_timer t;
		};

		struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct elem);
		} timer1_map SEC(".maps");

		struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct elem);
		} timer2_map SEC(".maps");

		int timer1_err;
		int timer2_err;

		static int timer_cb1(void map, int k, struct elem *v)
		{
		struct bpf_timer *timer;
		int key = 0;

		timer = bpf_map_lookup_elem(&timer2_map, &key);
		if (timer)
		timer2_err = bpf_timer_cancel(timer);

		return 0;
		}

		static int timer_cb2(void map, int k, struct elem *v)
		{
		struct bpf_timer *timer;
		int key = 0;

		timer = bpf_map_lookup_elem(&timer1_map, &key);
		if (timer)
		timer1_err = bpf_timer_cancel(timer);

		return 0;
		}

		SEC("tc")
		int timer1_prog(void *ctx)
		{
		struct bpf_timer *timer;
		int key = 0;

		timer = bpf_map_lookup_elem(&timer1_map, &key);
		if (timer) {
		bpf_timer_init(timer, &timer1_map, CLOCK_BOOTTIME);
		bpf_timer_set_callback(timer, timer_cb1);
		bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN);
		}

		return 0;
		}

		SEC("tc")
		int timer2_prog(void *ctx)
		{
		struct bpf_timer *timer;
		int key = 0;

		timer = bpf_map_lookup_elem(&timer2_map, &key);
		if (timer) {
		bpf_timer_init(timer, &timer2_map, CLOCK_BOOTTIME);
		bpf_timer_set_callback(timer, timer_cb2);
		bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN);
		}

		return 0;
		}