Commit bc57c7d3 authored by Pavel Begunkov, committed by Jens Axboe
Browse files

io_uring/zcrx: add copy fallback



There are scenarios in which the zerocopy path can get a kernel buffer
instead of a net_iov and needs to copy it to the user, whether because
of mis-steering or simply because it got an skb with a linear part.
In this case, grab a net_iov, copy into it and return it to the user as
usual.

At the moment the user doesn't get any indication of whether there was a
copy or not; that is left for follow-up work.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20250215000947.789731-10-dw@davidwei.uk


Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 931dfae1
Loading
Loading
Loading
Loading
+114 −6
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
@@ -134,6 +135,13 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
	atomic_inc(io_get_user_counter(niov));
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->pages[net_iov_idx(niov)];
}

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd)
@@ -448,6 +456,11 @@ static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

@@ -686,13 +699,93 @@ static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
	return true;
}

/*
 * Take a net_iov from @area's freelist for use by the copy fallback path.
 *
 * The freelist is only touched under freelist_lock with bottom halves
 * disabled. Returns NULL when the freelist is empty.
 */
static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov;

	spin_lock_bh(&area->freelist_lock);
	niov = area->free_count ? __io_zcrx_get_free_niov(area) : NULL;
	spin_unlock_bh(&area->freelist_lock);

	if (!niov)
		return NULL;

	/* NOTE(review): presumably sets the netmem's fragment count to one - confirm */
	page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

/*
 * Copy @len bytes of kernel data into fallback net_iovs and post one CQE
 * per net_iov filled.
 *
 * The source is @src_page when it is non-NULL (mapped with
 * kmap_local_page() for the duration of each copy), otherwise the already
 * addressable @src_base. @src_offset is the byte offset into whichever of
 * the two is used. Each iteration copies at most PAGE_SIZE bytes to the
 * start of a freshly allocated net_iov.
 *
 * Returns the number of bytes copied if any were copied; otherwise -ENOMEM
 * when no fallback net_iov was available, or -ENOSPC when the CQE could not
 * be queued (the net_iov is handed back in that case).
 */
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t total = 0;
	int err = 0;

	while (len) {
		size_t n = min_t(size_t, PAGE_SIZE, len);
		struct net_iov *niov = io_zcrx_alloc_fallback(area);
		void *dst, *src;

		if (!niov) {
			err = -ENOMEM;
			break;
		}

		dst = kmap_local_page(io_zcrx_iov_page(niov));
		src = src_page ? kmap_local_page(src_page) : src_base;

		memcpy(dst, src + src_offset, n);

		/* unmap in the reverse order of mapping */
		if (src_page)
			kunmap_local(src);
		kunmap_local(dst);

		/* data always lands at offset 0 of the destination niov */
		if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) {
			io_zcrx_return_niov(niov);
			err = -ENOSPC;
			break;
		}

		/* the user reference is taken only once the CQE has been queued */
		io_zcrx_get_niov_uref(niov);
		src_offset += n;
		len -= n;
		total += n;
	}

	/* partial progress takes precedence over the error code */
	if (total)
		return total;
	return err;
}

/*
 * Copy fallback for a single skb fragment: walk the fragment with
 * skb_frag_foreach_page() and copy each piece via io_zcrx_copy_chunk().
 *
 * @off is relative to the fragment's data; the fragment's own offset is
 * added here. Returns the number of bytes copied, or a negative error from
 * io_zcrx_copy_chunk() if nothing had been copied before it failed.
 */
static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *pg = skb_frag_page(frag);
	u32 pg_off, pg_len, walked;
	u32 total = 0;
	int frag_off = off + skb_frag_off(frag);

	skb_frag_foreach_page(frag, frag_off, len,
			      pg, pg_off, pg_len, walked) {
		int n = io_zcrx_copy_chunk(req, ifq, NULL, pg, pg_off, pg_len);

		/* report partial progress in preference to the error */
		if (n < 0)
			return total ? total : n;
		total += n;
	}
	return total;
}

static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return -EOPNOTSUPP;
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
@@ -719,18 +812,33 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	start = skb_headlen(skb);
	start_off = offset;
	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

	if (offset < start)
		return -EOPNOTSUPP;
		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;