Commit 27cb27b6 authored by Keith Busch, committed by Jens Axboe
Browse files

io_uring: add support for kernel registered bvecs



Provide an interface for the kernel to leverage the existing
pre-registered buffers that io_uring provides. User space can reference
these later to achieve zero-copy IO.

User space must register an empty fixed buffer table with io_uring in
order for the kernel to make use of it.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20250227223916.143006-5-kbusch@meta.com


Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 99fde895
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@

#include <uapi/linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <linux/blk-mq.h>

/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
#define IORING_URING_CMD_CANCELABLE	(1U << 30)
@@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur
	return cmd_to_io_kiocb(cmd)->async_data;
}

/**
 * io_buffer_register_bvec - install a request's bvecs as a kernel fixed buffer
 * @cmd:	uring_cmd whose ring owns the buffer table to populate
 * @rq:		block request whose bio_vecs are copied into the new buffer;
 *		also passed back as the argument to @release
 * @release:	callback stored with the buffer and invoked once its last
 *		reference is dropped
 * @index:	slot in the ring's fixed buffer table; must be in range and empty
 * @issue_flags: forwarded to io_ring_submit_lock()/io_ring_submit_unlock()
 *
 * The buffer is marked as a kernel buffer and is restricted to a single
 * direction: IO_IMU_SOURCE when @rq is a write operation, IO_IMU_DEST
 * otherwise.
 *
 * Return: 0 on success, -EINVAL if @index is outside the table, -EBUSY if the
 * slot is already occupied, -ENOMEM if an allocation fails.
 */
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags);

/**
 * io_buffer_unregister_bvec - drop a kernel fixed buffer from the table
 * @cmd:	uring_cmd whose ring owns the buffer table
 * @index:	slot in the ring's fixed buffer table to clear
 * @issue_flags: forwarded to io_ring_submit_lock()/io_ring_submit_unlock()
 *
 * Puts the node in slot @index and clears the slot. Does nothing, and reports
 * nothing, when @index is outside the table, the slot is empty, or the buffer
 * in the slot is not a kernel buffer.
 */
void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			       unsigned int issue_flags);

#endif /* _LINUX_IO_URING_CMD_H */
+3 −0
Original line number Diff line number Diff line
@@ -3940,6 +3940,9 @@ static int __init io_uring_init(void)

	io_uring_optable_init();

	/* imu->dir is u8 */
	BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX);

	/*
	 * Allow user copy in the per-command field, which starts after the
	 * file in io_kiocb and until the opcode field. The openat2 handling
+116 −7
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

@@ -101,17 +102,23 @@ static int io_buffer_validate(struct iovec *iov)
	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = node->buf;
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	if (!refcount_dec_and_test(&imu->refs))
		return;
	for (i = 0; i < imu->nr_bvecs; i++)
		unpin_user_page(imu->bvec[i].bv_page);
}

/*
 * Drop one reference on @imu. Only the caller that drops the final reference
 * tears the buffer down: any accounted pages are returned to @ctx, the
 * buffer's own release callback is run on its private data, and the
 * descriptor itself is freed.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (refcount_dec_and_test(&imu->refs)) {
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);

		/* How the backing memory is given up depends on who registered it */
		imu->release(imu->priv);
		kvfree(imu);
	}
}

@@ -451,7 +458,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
		break;
	case IORING_RSRC_BUFFER:
		if (node->buf)
			io_buffer_unmap(ctx, node);
			io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
@@ -761,6 +768,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
	imu->len = iov->iov_len;
	imu->nr_bvecs = nr_pages;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
@@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
	return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;

	if (op_is_write(req_op(rq)))
		imu->dir = IO_IMU_SOURCE;
	else
		imu->dir = IO_IMU_DEST;

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

/*
 * Remove the kernel-provided buffer held in slot @index of the ring's fixed
 * buffer table. The request is silently ignored when @index is outside the
 * table, the slot is empty, or the slot holds something other than a kernel
 * buffer.
 */
void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			       unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *table = &ctx->buf_table;

	io_ring_submit_lock(ctx, issue_flags);
	if (index < table->nr) {
		struct io_rsrc_node *victim;

		index = array_index_nospec(index, table->nr);
		victim = table->nodes[index];

		/* Only buffers flagged as kernel buffers are removed here */
		if (victim && victim->buf->is_kbuf) {
			io_put_rsrc_node(ctx, victim);
			table->nodes[index] = NULL;
		}
	}
	io_ring_submit_unlock(ctx, issue_flags);
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
@@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
@@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 * over each segment manually. We can cheat a bit here for user
		 * registered nodes, because we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
@@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
		 */
		const struct bio_vec *bvec = imu->bvec;

		/*
		 * Kernel buffer bvecs, on the other hand, don't necessarily
		 * have the size property of user registered ones, so we have
		 * to use the slow iter advance.
		 */
		if (offset < bvec->bv_len) {
			iter->iov_offset = offset;
		} else if (imu->is_kbuf) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

+9 −0
Original line number Diff line number Diff line
@@ -20,6 +20,11 @@ struct io_rsrc_node {
	};
};

/*
 * Direction bits kept in io_mapped_ubuf::dir. Each bit is 1 shifted by the
 * matching iov_iter direction, so a buffer can be tested against a transfer
 * direction with "dir & (1 << ddir)". Both values have to fit in the u8 dir
 * field.
 */
enum {
	IO_IMU_DEST	= 1 << ITER_DEST,
	IO_IMU_SOURCE	= 1 << ITER_SOURCE,
};

struct io_mapped_ubuf {
	u64		ubuf;
	unsigned int	len;
@@ -27,6 +32,10 @@ struct io_mapped_ubuf {
	unsigned int    folio_shift;
	refcount_t	refs;
	unsigned long	acct_pages;
	void		(*release)(void *);
	void		*priv;
	bool		is_kbuf;
	u8		dir;
	struct bio_vec	bvec[] __counted_by(nr_bvecs);
};

+3 −0
Original line number Diff line number Diff line
@@ -629,6 +629,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct io_kiocb *req = cmd_to_io_kiocb(rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
@@ -644,6 +645,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;
	if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf)
		return -EFAULT;

	ppos = io_kiocb_ppos(kiocb);