Commit 3e8ec344 authored by Jakub Kicinski
Browse files

Merge branch 'xsk-fix-bugs-around-xsk-skb-allocation'

Jason Xing says:

====================
xsk: fix bugs around xsk skb allocation

There are rare issues around xsk_build_skb(). Some of them
were found by Sashiko[1][2].

[1]: https://lore.kernel.org/all/20260415082654.21026-1-kerneljasonxing@gmail.com/
[2]: https://lore.kernel.org/all/20260418045644.28612-1-kerneljasonxing@gmail.com/
====================

Link: https://patch.msgid.link/20260502200722.53960-1-kerneljasonxing@gmail.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 22675f07 203cee64
Loading
Loading
Loading
Loading
+74 −41
Original line number Diff line number Diff line
@@ -646,9 +646,42 @@ static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
	return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
}

static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
/*
 * Allocate a zeroed xsk_addrs node from xsk_tx_generic_cache, store @addr
 * in its first address slot and attach the node to @skb through
 * skb_shinfo(skb)->destructor_arg.
 *
 * This helper does not touch num_descs; it stays at the zero written by
 * kmem_cache_zalloc() until a caller sets it.
 *
 * Return: the new node, or NULL if the allocation failed (in which case
 * @skb's destructor_arg is left unmodified).
 */
static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr)
{
	struct xsk_addrs *node = kmem_cache_zalloc(xsk_tx_generic_cache,
						   GFP_KERNEL);

	if (likely(node)) {
		node->addrs[0] = addr;
		skb_shinfo(skb)->destructor_arg = node;
	}

	return node;
}

static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb)
{
	struct xsk_addrs *xsk_addr;

	if (!xsk_skb_destructor_is_addr(skb))
		return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;

	xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb));
	if (likely(xsk_addr))
		xsk_addr->num_descs = 1;
	return xsk_addr;
}

/*
 * Record the descriptor address @addr on @skb.
 *
 * With CONFIG_64BIT the address is stored directly in destructor_arg with
 * bit 0 set as a tag. Without CONFIG_64BIT an xsk_addrs node is allocated
 * and attached instead (presumably because a pointer-sized field cannot
 * carry the full u64 there; confirm against the upstream commit message).
 *
 * Return: 0 on success, -ENOMEM if the node allocation failed.
 */
static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
{
	if (!IS_ENABLED(CONFIG_64BIT)) {
		struct xsk_addrs *node = __xsk_addrs_alloc(skb, addr);

		return likely(node) ? 0 : -ENOMEM;
	}

	skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
	return 0;
}

static void xsk_inc_num_desc(struct sk_buff *skb)
@@ -685,7 +718,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
	spin_lock_irqsave(&pool->cq_prod_lock, flags);
	idx = xskq_get_prod(pool->cq);

	if (unlikely(num_descs > 1)) {
	if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;

		for (i = 0; i < num_descs; i++) {
@@ -724,14 +757,20 @@ void xsk_destruct_skb(struct sk_buff *skb)
	sock_wfree(skb);
}

static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
			     u64 addr)
{
	int err;

	err = xsk_skb_destructor_set_addr(skb, addr);
	if (unlikely(err))
		return err;

	skb->dev = xs->dev;
	skb->priority = READ_ONCE(xs->sk.sk_priority);
	skb->mark = READ_ONCE(xs->sk.sk_mark);
	skb->destructor = xsk_destruct_skb;
	xsk_skb_destructor_set_addr(skb, addr);
	return 0;
}

static void xsk_consume_skb(struct sk_buff *skb)
@@ -740,7 +779,7 @@ static void xsk_consume_skb(struct sk_buff *skb)
	u32 num_descs = xsk_get_num_desc(skb);
	struct xsk_addrs *xsk_addr;

	if (unlikely(num_descs > 1)) {
	if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
	}
@@ -819,29 +858,20 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
			return ERR_PTR(err);

		skb_reserve(skb, hr);

		xsk_skb_init_misc(skb, xs, desc->addr);
		if (desc->options & XDP_TX_METADATA) {
			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
			if (unlikely(err))
			if (unlikely(err)) {
				kfree_skb(skb);
				return ERR_PTR(err);
			}
		}
	} else {
		struct xsk_addrs *xsk_addr;

		if (xsk_skb_destructor_is_addr(skb)) {
			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
						     GFP_KERNEL);
		xsk_addr = xsk_addrs_alloc(skb);
		if (!xsk_addr)
			return ERR_PTR(-ENOMEM);

			xsk_addr->num_descs = 1;
			xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
			skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
		} else {
			xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
		}

		/* in case of -EOVERFLOW that could happen below,
		 * xsk_consume_skb() will release this node as whole skb
		 * would be dropped, which implies freeing all list elements
@@ -856,8 +886,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
	addr = buffer - pool->addrs;

	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
		if (unlikely(i >= MAX_SKB_FRAGS))
		if (unlikely(i >= MAX_SKB_FRAGS)) {
			if (!xs->skb)
				kfree_skb(skb);
			return ERR_PTR(-EOVERFLOW);
		}

		page = pool->umem->pgs[addr >> PAGE_SHIFT];
		get_page(page);
@@ -914,7 +947,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
			if (unlikely(err))
				goto free_err;

			xsk_skb_init_misc(skb, xs, desc->addr);
			if (desc->options & XDP_TX_METADATA) {
				err = xsk_skb_metadata(skb, buffer, desc,
						       xs->pool, hr);
@@ -927,21 +959,12 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
			struct page *page;
			u8 *vaddr;

			if (xsk_skb_destructor_is_addr(skb)) {
				xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
							     GFP_KERNEL);
			xsk_addr = xsk_addrs_alloc(skb);
			if (!xsk_addr) {
				err = -ENOMEM;
				goto free_err;
			}

				xsk_addr->num_descs = 1;
				xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
				skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
			} else {
				xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
			}

			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
				err = -EOVERFLOW;
				goto free_err;
@@ -964,18 +987,28 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
		}
	}

	if (!xs->skb) {
		err = xsk_skb_init_misc(skb, xs, desc->addr);
		if (unlikely(err))
			goto free_err;
	}
	xsk_inc_num_desc(skb);

	return skb;

free_err:
	if (skb && !skb_shinfo(skb)->nr_frags)
	if (skb && !xs->skb)
		kfree_skb(skb);

	if (err == -EOVERFLOW) {
		if (xs->skb) {
			/* Drop the packet */
			xsk_inc_num_desc(xs->skb);
			xsk_drop_skb(xs->skb);
		} else {
			xsk_cq_cancel_locked(xs->pool, 1);
			xs->tx->invalid_descs++;
		}
		xskq_cons_release(xs->tx);
	} else {
		/* Let application retry */
+3 −0
Original line number Diff line number Diff line
@@ -175,6 +175,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
	if (force_zc && force_copy)
		return -EINVAL;

	if (pool->tx_sw_csum && (netdev->priv_flags & IFF_TX_SKB_NO_LINEAR))
		return -EOPNOTSUPP;

	if (xsk_get_pool_from_qid(netdev, queue_id))
		return -EBUSY;