Commit b4cd2ee5 authored by Jakub Kicinski
Browse files
Martin KaFai Lau says:

====================
pull-request: bpf-next 2025-05-02

We've added 14 non-merge commits during the last 10 day(s) which contain
a total of 13 files changed, 740 insertions(+), 121 deletions(-).

The main changes are:

1) Avoid skipping or repeating a sk when using a UDP bpf_iter,
   from Jordan Rife.

2) Fixed a crash when a bpf qdisc is set in
   the net.core.default_qdisc, from Amery Hung.

3) A few other fixes in the bpf qdisc, from Amery Hung.
   - Always call qdisc_watchdog_init() in the .init prologue such that
     the .reset/.destroy epilogue can always call qdisc_watchdog_cancel()
     without issue.
   - bpf_qdisc_init_prologue() was incorrectly returning an error
     when the bpf qdisc is set as the default_qdisc and the mq is creating
     the default_qdisc. It is now fixed.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  selftests/bpf: Cleanup bpf qdisc selftests
  selftests/bpf: Test attaching a bpf qdisc with incomplete operators
  bpf: net_sched: Make some Qdisc_ops ops mandatory
  selftests/bpf: Test setting and creating bpf qdisc as default qdisc
  bpf: net_sched: Fix bpf qdisc init prologue when set as default qdisc
  selftests/bpf: Add tests for bucket resume logic in UDP socket iterators
  selftests/bpf: Return socket cookies from sock_iter_batch progs
  bpf: udp: Avoid socket skips and repeats during iteration
  bpf: udp: Use bpf_udp_iter_batch_item for bpf_udp_iter_state batch items
  bpf: udp: Get rid of st_bucket_done
  bpf: udp: Make sure iter->batch always contains a full bucket snapshot
  bpf: udp: Make mem flags configurable through bpf_iter_udp_realloc_batch
  bpf: net_sched: Fix using bpf qdisc as default qdisc
  selftests/bpf: Fix compilation errors
====================

Link: https://patch.msgid.link/20250503010755.4030524-1-martin.lau@linux.dev


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 836b313a 30190f82
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -216,6 +216,9 @@ static inline void udp_allow_gso(struct sock *sk)
/* Iterate __sk over every entry of the hlist headed by @list, following the
 * __sk_common.skc_portaddr_node linkage.
 */
#define udp_portaddr_for_each_entry(__sk, list) \
	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)

/* Same linkage as above, but takes no list head: wraps
 * hlist_for_each_entry_from(), so the walk presumably starts at the entry
 * __sk already points to. Callers must set __sk before using it.
 */
#define udp_portaddr_for_each_entry_from(__sk) \
	hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node)

/* Variant of udp_portaddr_for_each_entry() built on
 * hlist_for_each_entry_rcu().
 */
#define udp_portaddr_for_each_entry_rcu(__sk, list) \
	hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)

+117 −56
Original line number Diff line number Diff line
@@ -93,6 +93,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/sock_diag.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -3413,34 +3414,55 @@ struct bpf_iter__udp {
	int bucket __aligned(8);
};

/* One slot of the UDP iterator batch. Both members share storage, so a slot
 * holds either a socket pointer or a socket cookie, never both:
 *
 *  - bpf_iter_udp_batch() stores ->sk after taking a reference with
 *    sock_hold().
 *  - bpf_iter_udp_put_batch() drops that reference and overwrites the slot
 *    with the value of sock_gen_cookie(), so the not-yet-visited sockets
 *    can be looked up again by bpf_iter_udp_resume().
 */
union bpf_udp_iter_batch_item {
	struct sock *sk;	/* valid while the batch holds a reference */
	__u64 cookie;		/* valid after the reference was released */
};

struct bpf_udp_iter_state {
	struct udp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	int offset;
	struct sock **batch;
	bool st_bucket_done;
	union bpf_udp_iter_batch_item *batch;
};

static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
				      unsigned int new_batch_sz);
				      unsigned int new_batch_sz, gfp_t flags);
static struct sock *bpf_iter_udp_resume(struct sock *first_sk,
					union bpf_udp_iter_batch_item *cookies,
					int n_cookies)
{
	struct sock *sk = NULL;
	int i;

	for (i = 0; i < n_cookies; i++) {
		sk = first_sk;
		udp_portaddr_for_each_entry_from(sk)
			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
				goto done;
	}
done:
	return sk;
}

static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
{
	struct bpf_udp_iter_state *iter = seq->private;
	struct udp_iter_state *state = &iter->state;
	unsigned int find_cookie, end_cookie;
	struct net *net = seq_file_net(seq);
	int resume_bucket, resume_offset;
	struct udp_table *udptable;
	unsigned int batch_sks = 0;
	bool resized = false;
	int resume_bucket;
	int resizes = 0;
	struct sock *sk;
	int err = 0;

	resume_bucket = state->bucket;
	resume_offset = iter->offset;

	/* The current batch is done, so advance the bucket. */
	if (iter->st_bucket_done)
	if (iter->cur_sk == iter->end_sk)
		state->bucket++;

	udptable = udp_get_table_seq(seq, net);
@@ -3453,62 +3475,89 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
	 * before releasing the bucket lock. This allows BPF programs that are
	 * called in seq_show to acquire the bucket lock if needed.
	 */
	find_cookie = iter->cur_sk;
	end_cookie = iter->end_sk;
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;
	batch_sks = 0;

	for (; state->bucket <= udptable->mask; state->bucket++) {
		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;

		if (hlist_empty(&hslot2->head))
			continue;
			goto next_bucket;

		iter->offset = 0;
		spin_lock_bh(&hslot2->lock);
		udp_portaddr_for_each_entry(sk, &hslot2->head) {
		sk = hlist_entry_safe(hslot2->head.first, struct sock,
				      __sk_common.skc_portaddr_node);
		/* Resume from the first (in iteration order) unseen socket from
		 * the last batch that still exists in resume_bucket. Most of
		 * the time this will just be where the last iteration left off
		 * in resume_bucket unless that socket disappeared between
		 * reads.
		 */
		if (state->bucket == resume_bucket)
			sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie],
						 end_cookie - find_cookie);
fill_batch:
		udp_portaddr_for_each_entry_from(sk) {
			if (seq_sk_match(seq, sk)) {
				/* Resume from the last iterated socket at the
				 * offset in the bucket before iterator was stopped.
				 */
				if (state->bucket == resume_bucket &&
				    iter->offset < resume_offset) {
					++iter->offset;
					continue;
				}
				if (iter->end_sk < iter->max_sk) {
					sock_hold(sk);
					iter->batch[iter->end_sk++] = sk;
					iter->batch[iter->end_sk++].sk = sk;
				}
				batch_sks++;
			}
		}
		spin_unlock_bh(&hslot2->lock);

		if (iter->end_sk)
			break;
	}

	/* All done: no batch made. */
	if (!iter->end_sk)
		return NULL;
		/* Allocate a larger batch and try again. */
		if (unlikely(resizes <= 1 && iter->end_sk &&
			     iter->end_sk != batch_sks)) {
			resizes++;

	if (iter->end_sk == batch_sks) {
		/* Batching is done for the current bucket; return the first
		 * socket to be iterated from the batch.
			/* First, try with GFP_USER to maximize the chances of
			 * grabbing more memory.
			 */
		iter->st_bucket_done = true;
		goto done;
			if (resizes == 1) {
				spin_unlock_bh(&hslot2->lock);
				err = bpf_iter_udp_realloc_batch(iter,
								 batch_sks * 3 / 2,
								 GFP_USER);
				if (err)
					return ERR_PTR(err);
				/* Start over. */
				goto again;
			}
	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
		resized = true;
		/* After allocating a larger batch, retry one more time to grab
		 * the whole bucket.

			/* Next, hold onto the lock, so the bucket doesn't
			 * change while we get the rest of the sockets.
			 */
		goto again;
			err = bpf_iter_udp_realloc_batch(iter, batch_sks,
							 GFP_NOWAIT);
			if (err) {
				spin_unlock_bh(&hslot2->lock);
				return ERR_PTR(err);
			}
done:
	return iter->batch[0];

			/* Pick up where we left off. */
			sk = iter->batch[iter->end_sk - 1].sk;
			sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
					      struct sock,
					      __sk_common.skc_portaddr_node);
			batch_sks = iter->end_sk;
			goto fill_batch;
		}

		spin_unlock_bh(&hslot2->lock);

		if (iter->end_sk)
			break;
next_bucket:
		resizes = 0;
	}

	WARN_ON_ONCE(iter->end_sk != batch_sks);
	return iter->end_sk ? iter->batch[0].sk : NULL;
}

static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -3519,16 +3568,14 @@ static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so unref the iter->cur_sk.
	 */
	if (iter->cur_sk < iter->end_sk) {
		sock_put(iter->batch[iter->cur_sk++]);
		++iter->offset;
	}
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++].sk);

	/* After updating iter->cur_sk, check if there are more sockets
	 * available in the current bucket batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
		sk = iter->batch[iter->cur_sk].sk;
	else
		/* Prepare a new batch. */
		sk = bpf_iter_udp_batch(seq);
@@ -3592,8 +3639,19 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)

static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
	union bpf_udp_iter_batch_item *item;
	unsigned int cur_sk = iter->cur_sk;
	__u64 cookie;

	/* Remember the cookies of the sockets we haven't seen yet, so we can
	 * pick up where we left off next time around.
	 */
	while (cur_sk < iter->end_sk) {
		item = &iter->batch[cur_sk++];
		cookie = sock_gen_cookie(item->sk);
		sock_put(item->sk);
		item->cookie = cookie;
	}
}

static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
@@ -3609,10 +3667,8 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
			(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
	if (iter->cur_sk < iter->end_sk)
		bpf_iter_udp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}

static const struct seq_operations bpf_iter_udp_seq_ops = {
@@ -3863,16 +3919,19 @@ DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
		     struct udp_sock *udp_sk, uid_t uid, int bucket)

static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
				      unsigned int new_batch_sz)
				      unsigned int new_batch_sz, gfp_t flags)
{
	struct sock **new_batch;
	union bpf_udp_iter_batch_item *new_batch;

	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
				   GFP_USER | __GFP_NOWARN);
				   flags | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	if (flags != GFP_NOWAIT)
		bpf_iter_udp_put_batch(iter);

	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;
@@ -3891,10 +3950,12 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
	if (ret)
		return ret;

	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
	if (ret)
		bpf_iter_fini_seq_net(priv_data);

	iter->state.bucket = -1;

	return ret;
}

+19 −5
Original line number Diff line number Diff line
@@ -234,18 +234,20 @@ __bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch,
	struct net_device *dev = qdisc_dev(sch);
	struct Qdisc *p;

	qdisc_watchdog_init(&q->watchdog, sch);

	if (sch->parent != TC_H_ROOT) {
		/* If qdisc_lookup() returns NULL, it means .init is called by
		 * qdisc_create_dflt() in mq/mqprio_init and the parent qdisc
		 * has not been added to qdisc_hash yet.
		 */
		p = qdisc_lookup(dev, TC_H_MAJ(sch->parent));
		if (!p)
			return -ENOENT;

		if (!(p->flags & TCQ_F_MQROOT)) {
		if (p && !(p->flags & TCQ_F_MQROOT)) {
			NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq");
			return -EINVAL;
		}
	}

	qdisc_watchdog_init(&q->watchdog, sch);
	return 0;
}

@@ -393,6 +395,17 @@ static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link)
	return unregister_qdisc(kdata);
}

static int bpf_qdisc_validate(void *kdata)
{
	struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata;

	if (!ops->enqueue || !ops->dequeue || !ops->init ||
	    !ops->reset || !ops->destroy)
		return -EINVAL;

	return 0;
}

static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch,
			      struct sk_buff **to_free)
{
@@ -430,6 +443,7 @@ static struct bpf_struct_ops bpf_Qdisc_ops = {
	.verifier_ops = &bpf_qdisc_verifier_ops,
	.reg = bpf_qdisc_reg,
	.unreg = bpf_qdisc_unreg,
	.validate = bpf_qdisc_validate,
	.init_member = bpf_qdisc_init_member,
	.init = bpf_qdisc_init,
	.name = "Qdisc_ops",
+2 −2
Original line number Diff line number Diff line
@@ -208,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name)

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
			if (!bpf_try_module_get(q, q->owner))
				q = NULL;
			break;
		}
@@ -238,7 +238,7 @@ int qdisc_set_default(const char *name)

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);
+2 −2
Original line number Diff line number Diff line
@@ -1002,14 +1002,14 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner)) {
	if (!bpf_try_module_get(ops, ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		bpf_module_put(ops, ops->owner);
		return NULL;
	}
	sch->parent = parentid;
Loading