Commit f8f115ba authored by Jens Axboe
Browse files

Merge branch 'zcrx-updates-6.19' into for-6.19/io_uring

Merge zcrx updates from Pavel:

"Zcrx updates for 6.19. It includes a bunch of small patches,
 IORING_REGISTER_ZCRX_CTRL and RQ flushing (Patches 4-5) and David's
 work on sharing zcrx b/w multiple io_uring instances."

Link: https://lore.kernel.org/io-uring/cover.1763029704.git.asml.silence@gmail.com/


Signed-off-by: Jens Axboe <axboe@kernel.dk>

* zcrx-updates-6.19:
  io_uring/zcrx: share an ifq between rings
  io_uring/zcrx: add io_fill_zcrx_offsets()
  io_uring/zcrx: export zcrx via a file
  io_uring/zcrx: move io_zcrx_scrub() and dependencies up
  io_uring/zcrx: count zcrx users
  io_uring/zcrx: add sync refill queue flushing
  io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
  io_uring/zcrx: elide passing msg flags
  io_uring/zcrx: use folio_nr_pages() instead of shift operation
  io_uring/zcrx: convert to use netmem_desc
parents 5bd38e18 00d91481
Loading
Loading
Loading
Loading
+34 −0
Original line number Diff line number Diff line
@@ -697,6 +697,9 @@ enum io_uring_register_op {
	/* query various aspects of io_uring, see linux/io_uring/query.h */
	IORING_REGISTER_QUERY			= 35,

	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
	IORING_REGISTER_ZCRX_CTRL		= 36,

	/* this goes last */
	IORING_REGISTER_LAST,

@@ -1060,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
	__u64	__resv2[2];
};

/* Flag bits for the flags field of struct io_uring_zcrx_ifq_reg */
enum zcrx_reg_flags {
	/*
	 * Register by importing an existing zcrx instance; the if_idx field
	 * then carries the file descriptor of the exported instance.
	 */
	ZCRX_REG_IMPORT	= 1 << 0,
};

/*
 * Argument for IORING_REGISTER_ZCRX_IFQ
 */
@@ -1078,6 +1085,33 @@ struct io_uring_zcrx_ifq_reg {
	__u64	__resv[3];
};

/* Operations for IORING_REGISTER_ZCRX_CTRL, selected via the op field of struct zcrx_ctrl */
enum zcrx_ctrl_op {
	/* flush the refill queue, argument is struct zcrx_ctrl_flush_rq */
	ZCRX_CTRL_FLUSH_RQ	= 0,
	/* export the zcrx instance as a file, argument is struct zcrx_ctrl_export */
	ZCRX_CTRL_EXPORT	= 1,

	/* this goes last */
	__ZCRX_CTRL_LAST,
};

/*
 * Argument for ZCRX_CTRL_FLUSH_RQ. Carries no parameters yet; all reserved
 * words must be zero or the request is rejected with -EINVAL.
 */
struct zcrx_ctrl_flush_rq {
	__u64		__resv[6];
};

/*
 * Argument for ZCRX_CTRL_EXPORT. The whole structure must be zeroed on
 * input, otherwise the request is rejected with -EINVAL.
 */
struct zcrx_ctrl_export {
	/* output: file descriptor chosen by the kernel for the exported zcrx */
	__u32		zcrx_fd;
	__u32 		__resv1[11];
};

/*
 * Argument for IORING_REGISTER_ZCRX_CTRL
 */
struct zcrx_ctrl {
	/* id of the zcrx instance registered with the ring */
	__u32	zcrx_id;
	__u32	op; /* see enum zcrx_ctrl_op */
	/* reserved, must be zero */
	__u64	__resv[2];

	/* per-operation payload, interpreted according to op */
	union {
		struct zcrx_ctrl_export		zc_export;
		struct zcrx_ctrl_flush_rq	zc_flush;
	};
};

#ifdef __cplusplus
}
#endif
+2 −5
Original line number Diff line number Diff line
@@ -110,7 +110,6 @@ enum sr_retry_flags {

struct io_recvzc {
	struct file			*file;
	unsigned			msg_flags;
	u16				flags;
	u32				len;
	struct io_zcrx_ifq		*ifq;
@@ -1253,8 +1252,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

	zc->len = READ_ONCE(sqe->len);
	zc->flags = READ_ONCE(sqe->ioprio);
	zc->msg_flags = READ_ONCE(sqe->msg_flags);
	if (zc->msg_flags)
	if (READ_ONCE(sqe->msg_flags))
		return -EINVAL;
	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
		return -EINVAL;
@@ -1283,8 +1281,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
		return -ENOTSOCK;

	len = zc->len;
	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
			   issue_flags, &zc->len);
	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
	if (len && zc->len == 0) {
		io_req_set_res(req, 0, 0);

+3 −0
Original line number Diff line number Diff line
@@ -815,6 +815,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
	case IORING_REGISTER_QUERY:
		ret = io_query(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
+270 −56
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>
#include <linux/anon_inodes.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
@@ -170,7 +171,7 @@ static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pag
		if (folio == last_folio)
			continue;
		last_folio = folio;
		res += 1UL << folio_order(folio);
		res += folio_nr_pages(folio);
	}
	return res;
}
@@ -344,6 +345,13 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
	atomic_inc(io_get_user_counter(niov));
}

/*
 * Describe the refill ring layout in @offsets: where the head and tail
 * indices sit inside struct io_uring, and the L1 cache line aligned offset
 * at which the rqe array begins.
 */
static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
{
	offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
	offsets->tail = offsetof(struct io_uring, tail);
	offsets->head = offsetof(struct io_uring, head);
}

static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
				 struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
@@ -355,7 +363,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
	void *ptr;
	int ret;

	off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
	io_fill_zcrx_offsets(&reg->offsets);
	off = reg->offsets.rqes;
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;
@@ -371,9 +380,6 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);

	reg->offsets.head = offsetof(struct io_uring, head);
	reg->offsets.tail = offsetof(struct io_uring, tail);
	reg->offsets.rqes = off;
	return 0;
}

@@ -482,6 +488,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->pp_lock);
	refcount_set(&ifq->refs, 1);
	refcount_set(&ifq->user_refs, 1);
	return ifq;
}

@@ -543,6 +550,57 @@ static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
		io_zcrx_ifq_free(ifq);
}

/* Push the index of @niov back onto the free list of the area it belongs to. */
static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *owner = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&owner->freelist_lock);
	owner->freelist[owner->free_count] = net_iov_idx(niov);
	owner->free_count++;
	spin_unlock_bh(&owner->freelist_lock);
}

/*
 * Give @niov back to wherever it came from: to its page pool when one is
 * attached, otherwise to the area free list.
 */
static void io_zcrx_return_niov(struct net_iov *niov)
{
	struct page_pool *pp = niov->desc.pp;

	if (pp) {
		page_pool_put_unrefed_netmem(pp, net_iov_to_netmem(niov),
					     -1, false);
		return;
	}

	/* copy fallback allocated niovs */
	io_zcrx_return_niov_freelist(niov);
}

static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

static void zcrx_unregister(struct io_zcrx_ifq *ifq)
{
	if (refcount_dec_and_test(&ifq->user_refs)) {
		io_close_queue(ifq);
		io_zcrx_scrub(ifq);
	}
	io_put_zcrx_ifq(ifq);
}

struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
					    unsigned int id)
{
@@ -553,6 +611,112 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
	return ifq ? &ifq->region : NULL;
}

/*
 * ->release() for an exported zcrx file: drops the references the file held
 * on the ifq stored in private_data. Warns once and returns -EFAULT if the
 * file carries no ifq.
 */
static int zcrx_box_release(struct inode *inode, struct file *file)
{
	struct io_zcrx_ifq *ifq = file->private_data;

	if (!WARN_ON_ONCE(!ifq)) {
		zcrx_unregister(ifq);
		return 0;
	}
	return -EFAULT;
}

/*
 * File operations of the anonymous file wrapping an exported ifq; also used
 * to recognise such files when importing.
 */
static const struct file_operations zcrx_box_fops = {
	.release	= zcrx_box_release,
	.owner		= THIS_MODULE,
};

/*
 * ZCRX_CTRL_EXPORT: wrap @ifq in an anonymous "[zcrx]" file and hand its
 * descriptor back to user space through @arg.
 *
 * The export payload in @ctrl must be all zeroes on entry. The descriptor
 * number is copied to user space before the file is created and installed.
 * The file holds one reference and one user reference on @ifq, both dropped
 * through zcrx_unregister().
 *
 * Returns 0 on success or a negative errno.
 */
static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
		       struct zcrx_ctrl *ctrl, void __user *arg)
{
	struct zcrx_ctrl_export *ce = &ctrl->zc_export;
	struct file *file;
	int ret, fd;

	if (!mem_is_zero(ce, sizeof(*ce)))
		return -EINVAL;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	ce->zcrx_fd = fd;
	if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
		ret = -EFAULT;
		goto err_put_fd;
	}

	/* references owned by the file from here on */
	refcount_inc(&ifq->refs);
	refcount_inc(&ifq->user_refs);

	file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
					 ifq, O_CLOEXEC, NULL);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err_unregister;
	}

	fd_install(fd, file);
	return 0;

err_unregister:
	zcrx_unregister(ifq);
err_put_fd:
	put_unused_fd(fd);
	return ret;
}

/*
 * Attach an already exported zcrx instance to @ctx.
 *
 * @reg is the kernel copy of the registration argument and @arg the user
 * pointer it came from. reg->if_idx is interpreted as a file descriptor
 * that must refer to a file using zcrx_box_fops. On success a newly
 * allocated id in ctx->zcrx_ctxs maps to the shared ifq, and @reg with
 * zcrx_id and offsets filled in has been copied back to @arg.
 *
 * Returns 0 on success or a negative errno.
 */
static int import_zcrx(struct io_ring_ctx *ctx,
		       struct io_uring_zcrx_ifq_reg __user *arg,
		       struct io_uring_zcrx_ifq_reg *reg)
{
	struct io_zcrx_ifq *ifq;
	struct file *file;
	int fd, ret;
	u32 id;

	/* the importing ring needs DEFER_TASKRUN and CQE32 or CQE_MIXED */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
		return -EINVAL;
	/* these fields are rejected when set for an import */
	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
		return -EINVAL;

	fd = reg->if_idx;
	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	/* only accept files created by zcrx_export() */
	file = fd_file(f);
	if (file->f_op != &zcrx_box_fops || !file->private_data)
		return -EBADF;

	/* references for this ring's registration, dropped by zcrx_unregister() */
	ifq = file->private_data;
	refcount_inc(&ifq->refs);
	refcount_inc(&ifq->user_refs);

	/* allocate the id with a NULL entry first, the ifq is stored later */
	scoped_guard(mutex, &ctx->mmap_lock) {
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto err;
	}

	reg->zcrx_id = id;
	io_fill_zcrx_offsets(&reg->offsets);
	if (copy_to_user(arg, reg, sizeof(*reg))) {
		ret = -EFAULT;
		goto err_xa_erase;
	}

	/* the ifq pointer is only stored once the copy to user space succeeded */
	scoped_guard(mutex, &ctx->mmap_lock) {
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err_xa_erase;
	}

	return 0;
err_xa_erase:
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
err:
	/* drops the references taken above */
	zcrx_unregister(ifq);
	return ret;
}

int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			  struct io_uring_zcrx_ifq_reg __user *arg)
{
@@ -578,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.flags & ZCRX_REG_IMPORT)
		return import_zcrx(ctx, arg, &reg);
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
@@ -683,48 +849,6 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
	return &area->nia.niovs[niov_idx];
}

/* Push the index of @niov back onto the free list of the area it belongs to. */
static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

/*
 * Give @niov back to wherever it came from: to its page pool when one is
 * attached, otherwise to the area free list.
 */
static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

/*
 * Reclaim every buffer of the ifq's area that user space still holds
 * references to. Does nothing when no area is attached.
 */
static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		/* skip buffers without user references before exchanging */
		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		/* take over all user references in one go */
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;
@@ -741,10 +865,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
		}
		if (!ifq)
			break;

		io_close_queue(ifq);
		io_zcrx_scrub(ifq);
		io_put_zcrx_ifq(ifq);
		zcrx_unregister(ifq);
	}

	xa_destroy(&ctx->zcrx_ctxs);
@@ -815,7 +936,7 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
		if (!page_pool_unref_and_test(netmem))
			continue;

		if (unlikely(niov->pp != pp)) {
		if (unlikely(niov->desc.pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}
@@ -941,6 +1062,97 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.uninstall		= io_pp_uninstall,
};

/*
 * Consume up to @nr entries from the refill queue of @zcrx under rq_lock and
 * store the netmem of each parsed entry in @netmem_array. Parsing stops at
 * the first entry io_parse_rqe() rejects. The ring head is published
 * afterwards.
 *
 * Returns the number of slots filled in @netmem_array.
 */
static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
			      struct io_zcrx_ifq *zcrx)
{
	unsigned int mask = zcrx->rq_entries - 1;
	unsigned int filled = 0;

	guard(spinlock_bh)(&zcrx->rq_lock);

	/* never read past what the ring currently holds */
	nr = min(nr, io_zcrx_rqring_entries(zcrx));
	while (filled < nr) {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
		struct net_iov *niov;

		if (!io_parse_rqe(rqe, zcrx, &niov))
			break;
		netmem_array[filled] = net_iov_to_netmem(niov);
		filled++;
	}

	smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
	return filled;
}

/* number of refill queue entries zcrx_flush_rq() handles per batch */
#define ZCRX_FLUSH_BATCH 32

/*
 * Drop one user reference for each of the @nr entries in @netmems. A buffer
 * is returned only when both io_zcrx_put_niov_uref() and the subsequent
 * page_pool_unref_and_test() succeed.
 */
static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
{
	unsigned idx;

	for (idx = 0; idx < nr; idx++) {
		netmem_ref netmem = netmems[idx];
		struct net_iov *niov = netmem_to_net_iov(netmem);

		if (io_zcrx_put_niov_uref(niov) &&
		    page_pool_unref_and_test(netmem))
			io_zcrx_return_niov(niov);
	}
}

/*
 * ZCRX_CTRL_FLUSH_RQ: drain the refill queue of @zcrx in batches of
 * ZCRX_FLUSH_BATCH entries and return the referenced buffers.
 *
 * Stops when a batch comes back short, when rq_entries entries have been
 * processed, or when a fatal signal is pending. Reserved fields of the
 * flush payload must be zero.
 *
 * Returns 0, or -EINVAL for non-zero reserved fields.
 */
static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
			 struct zcrx_ctrl *ctrl)
{
	struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
	netmem_ref netmems[ZCRX_FLUSH_BATCH];
	unsigned total = 0;
	unsigned nr;

	if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
		return -EINVAL;

	for (;;) {
		nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
		zcrx_return_buffers(netmems, nr);
		total += nr;

		if (fatal_signal_pending(current))
			break;
		cond_resched();

		/* short batch or a full ring's worth processed: done */
		if (nr != ZCRX_FLUSH_BATCH || total >= zcrx->rq_entries)
			break;
	}

	return 0;
}

/*
 * Handle IORING_REGISTER_ZCRX_CTRL.
 *
 * @arg points to a struct zcrx_ctrl in user space and @nr_args must be 0.
 * The zcrx instance is looked up by ctrl.zcrx_id in ctx->zcrx_ctxs and the
 * operation selected by ctrl.op is dispatched to its handler.
 *
 * Returns the handler's result, or:
 *   -EINVAL     @nr_args is non-zero or reserved fields are not zero
 *   -EFAULT     @arg could not be read
 *   -ENXIO      no zcrx instance with the given id
 *   -EOPNOTSUPP unknown operation
 */
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
	struct zcrx_ctrl ctrl;
	struct io_zcrx_ifq *zcrx;

	if (nr_args)
		return -EINVAL;
	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
		return -EFAULT;
	/*
	 * Non-zero reserved fields are an invalid argument, not a bad
	 * address; matches the reserved field checks of the sub-operations.
	 */
	if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
		return -EINVAL;

	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
	if (!zcrx)
		return -ENXIO;

	switch (ctrl.op) {
	case ZCRX_CTRL_FLUSH_RQ:
		return zcrx_flush_rq(ctx, zcrx, &ctrl);
	case ZCRX_CTRL_EXPORT:
		return zcrx_export(ctx, zcrx, &ctrl, arg);
	default:
		break;
	}

	return -EOPNOTSUPP;
}

static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
@@ -1082,13 +1294,15 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;
	struct page_pool *pp;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
	pp = niov->desc.pp;

	if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
+8 −0
Original line number Diff line number Diff line
@@ -55,6 +55,8 @@ struct io_zcrx_ifq {
	struct net_device		*netdev;
	netdevice_tracker		netdev_tracker;
	refcount_t			refs;
	/* counts userspace facing users like io_uring */
	refcount_t			user_refs;

	/*
	 * Page pool and net configuration lock, can be taken deeper in the
@@ -65,6 +67,7 @@ struct io_zcrx_ifq {
};

#if defined(CONFIG_IO_URING_ZCRX)
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -93,6 +96,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct
{
	return NULL;
}
/* Stub for builds without zcrx support (presumably the #else branch of CONFIG_IO_URING_ZCRX): always -EOPNOTSUPP. */
static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned nr_arg)
{
	return -EOPNOTSUPP;
}
#endif

int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);