Commit eff5f16b authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
 "Final separate updates for io_uring.

  This started out as a series of cleanups and improvements
  for registered buffers, but as the last series of the io_uring changes
  for 6.15, it also collected a few fixes for the other branches on top:

   - Add support for vectored fixed/registered buffers.

     Previously only single segments have been supported for commands,
     now vectored variants are supported as well. This series includes
     networking and file read/write support.

   - Small series unifying return codes across multi and single shot.

   - Small series cleaning up registered buffer importing.

   - Adding support for vectored registered buffers for uring_cmd.

   - Fix for io-wq handling of command reissue.

   - Various little fixes and tweaks"

* tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux: (25 commits)
  io_uring/net: fix io_req_post_cqe abuse by send bundle
  io_uring/net: use REQ_F_IMPORT_BUFFER for send_zc
  io_uring: move min_events sanitisation
  io_uring: rename "min" arg in io_iopoll_check()
  io_uring: open code __io_post_aux_cqe()
  io_uring: defer iowq cqe overflow via task_work
  io_uring: fix retry handling off iowq
  io_uring/net: only import send_zc buffer once
  io_uring/cmd: introduce io_uring_cmd_import_fixed_vec
  io_uring/cmd: add iovec cache for commands
  io_uring/cmd: don't expose entire cmd async data
  io_uring: rename the data cmd cache
  io_uring: rely on io_prep_reg_vec for iovec placement
  io_uring: introduce io_prep_reg_iovec()
  io_uring: unify STOP_MULTISHOT with IOU_OK
  io_uring: return -EAGAIN to continue multishot
  io_uring: cap cached iovec/bvec size
  io_uring/net: implement vectored reg bufs for zctx
  io_uring/net: convert to struct iou_vec
  io_uring/net: pull vec alloc out of msghdr import
  ...
parents 6df9d086 6889ae1b
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
@@ -43,6 +43,11 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
			      struct iov_iter *iter,
			      struct io_uring_cmd *ioucmd,
			      unsigned int issue_flags);
/*
 * Vectored registered-buffer import for a uring_cmd: takes a user iovec
 * array (@uvec with @uvec_segs entries), a data direction (@ddir) and an
 * iov_iter (@iter), and returns an int status.
 * NOTE(review): presumably @iter is set up on success and a negative errno
 * is returned on failure; the definition is not visible here - confirm.
 */
int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
				  const struct iovec __user *uvec,
				  size_t uvec_segs,
				  int ddir, struct iov_iter *iter,
				  unsigned issue_flags);

/*
 * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd
@@ -76,6 +81,14 @@ io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
{
	return -EOPNOTSUPP;
}
/*
 * Stub variant of io_uring_cmd_import_fixed_vec(): ignores every argument
 * and always fails with -EOPNOTSUPP.
 * NOTE(review): presumably the fallback used when io_uring is compiled out;
 * the enclosing preprocessor conditional is not visible here - confirm.
 */
static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
						const struct iovec __user *uvec,
						size_t uvec_segs,
						int ddir, struct iov_iter *iter,
						unsigned issue_flags)
{
	return -EOPNOTSUPP;
}
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
		u64 ret2, unsigned issue_flags)
{
+18 −1
Original line number Diff line number Diff line
@@ -110,6 +110,14 @@ struct io_uring_task {
	} ____cacheline_aligned_in_smp;
};

/*
 * A vector array together with its capacity. The same pointer slot can be
 * read either as an array of struct iovec or as an array of struct bio_vec;
 * both members share storage through the anonymous union.
 */
struct iou_vec {
	union {
		struct iovec	*iovec;
		struct bio_vec	*bvec;
	};
	unsigned		nr; /* number of struct iovec it can hold */
};

struct io_uring {
	u32 head;
	u32 tail;
@@ -310,7 +318,7 @@ struct io_ring_ctx {
		struct io_alloc_cache	apoll_cache;
		struct io_alloc_cache	netmsg_cache;
		struct io_alloc_cache	rw_cache;
		struct io_alloc_cache	uring_cache;
		struct io_alloc_cache	cmd_cache;

		/*
		 * Any cancelable uring_cmd is added to this list in
@@ -482,6 +490,7 @@ enum {
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_MULTISHOT_BIT,
	REQ_F_APOLL_MULTISHOT_BIT,
	REQ_F_CLEAR_POLLIN_BIT,
	/* keep async read/write and isreg together and in order */
@@ -494,6 +503,7 @@ enum {
	REQ_F_BUFFERS_COMMIT_BIT,
	REQ_F_BUF_NODE_BIT,
	REQ_F_HAS_METADATA_BIT,
	REQ_F_IMPORT_BUFFER_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
@@ -558,6 +568,8 @@ enum {
	REQ_F_SINGLE_POLL	= IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
	/* double poll may active */
	REQ_F_DOUBLE_POLL	= IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
	/* request posts multiple completions, should be set at prep time */
	REQ_F_MULTISHOT		= IO_REQ_FLAG(REQ_F_MULTISHOT_BIT),
	/* fast poll multishot mode */
	REQ_F_APOLL_MULTISHOT	= IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
	/* recvmsg special flag, clear EPOLLIN */
@@ -576,6 +588,11 @@ enum {
	REQ_F_BUF_NODE		= IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
	/* request has read/write metadata assigned */
	REQ_F_HAS_METADATA	= IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
	/*
	 * For vectored fixed buffers, resolve iovec to registered buffers.
	 * For SEND_ZC, whether to import buffers (i.e. the first issue).
	 */
	REQ_F_IMPORT_BUFFER	= IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);
+2 −0
Original line number Diff line number Diff line
@@ -281,6 +281,8 @@ enum io_uring_op {
	IORING_OP_LISTEN,
	IORING_OP_RECV_ZC,
	IORING_OP_EPOLL_WAIT,
	IORING_OP_READV_FIXED,
	IORING_OP_WRITEV_FIXED,

	/* this goes last, obviously */
	IORING_OP_LAST,
+0 −9
Original line number Diff line number Diff line
@@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache,

void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp);

/*
 * When CONFIG_KASAN is enabled, free the iovec array that @iov points to and
 * reset both the pointer (*@iov) and the count (*@nr) to empty. Without
 * KASAN this is a no-op and the array is left in place.
 * NOTE(review): presumably done so that KASAN can catch stale uses of a
 * cached iovec - confirm against the callers.
 */
static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr)
{
	if (IS_ENABLED(CONFIG_KASAN)) {
		kfree(*iov);
		/* clear the caller's view so the freed array cannot be reused */
		*iov = NULL;
		*nr = 0;
	}
}

static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
				      void *entry)
{
+27 −38
Original line number Diff line number Diff line
@@ -289,7 +289,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx)
	io_alloc_cache_free(&ctx->apoll_cache, kfree);
	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
	io_alloc_cache_free(&ctx->uring_cache, kfree);
	io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
	io_alloc_cache_free(&ctx->msg_cache, kfree);
	io_futex_cache_free(ctx);
	io_rsrc_cache_free(ctx);
@@ -334,8 +334,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
	ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct io_async_rw),
			    offsetof(struct io_async_rw, clear));
	ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct io_uring_cmd_data), 0);
	ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct io_async_cmd),
			    sizeof(struct io_async_cmd));
	spin_lock_init(&ctx->msg_lock);
	ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct io_kiocb), 0);
@@ -833,24 +834,14 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
	return false;
}

static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			      u32 cflags)
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
	bool filled;

	io_cq_lock(ctx);
	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
	if (!filled)
		filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);

	return filled;
}

bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
	bool filled;

	io_cq_lock(ctx);
	filled = __io_post_aux_cqe(ctx, user_data, res, cflags);
	io_cq_unlock_post(ctx);
	return filled;
}
@@ -891,6 +882,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool completed = true;

	/*
	 * All execution paths but io-wq use the deferred completions by
@@ -903,19 +895,21 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
	 * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
	 * the submitter task context, IOPOLL protects with uring_lock.
	 */
	if (ctx->lockless_cq) {
	if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) {
defer_complete:
		req->io_task_work.func = io_req_task_complete;
		io_req_task_work_add(req);
		return;
	}

	io_cq_lock(ctx);
	if (!(req->flags & REQ_F_CQE_SKIP)) {
		if (!io_fill_cqe_req(ctx, req))
			io_req_cqe_overflow(req);
	}
	if (!(req->flags & REQ_F_CQE_SKIP))
		completed = io_fill_cqe_req(ctx, req);
	io_cq_unlock_post(ctx);

	if (!completed)
		goto defer_complete;

	/*
	 * We don't free the request here because we know it's called from
	 * io-wq only, which holds a reference, so it cannot be the last put.
@@ -1511,11 +1505,13 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
	mutex_unlock(&ctx->uring_lock);
}

static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
{
	unsigned int nr_events = 0;
	unsigned long check_cq;

	min_events = min(min_events, ctx->cq_entries);

	lockdep_assert_held(&ctx->uring_lock);

	if (!io_allowed_run_tw(ctx))
@@ -1557,7 +1553,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
		    io_task_work_pending(ctx)) {
			u32 tail = ctx->cached_cq_tail;

			(void) io_run_local_work_locked(ctx, min);
			(void) io_run_local_work_locked(ctx, min_events);

			if (task_work_pending(current) ||
			    wq_list_empty(&ctx->iopoll_list)) {
@@ -1570,7 +1566,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
			    wq_list_empty(&ctx->iopoll_list))
				break;
		}
		ret = io_do_iopoll(ctx, !min);
		ret = io_do_iopoll(ctx, !min_events);
		if (unlikely(ret < 0))
			return ret;

@@ -1580,7 +1576,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
			break;

		nr_events += ret;
	} while (nr_events < min);
	} while (nr_events < min_events);

	return 0;
}
@@ -1791,10 +1787,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)

	ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);

	WARN_ON_ONCE(ret == IOU_OK);

	if (ret == IOU_ISSUE_SKIP_COMPLETE)
		ret = 0;
	WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE);
	return ret;
}

@@ -1847,7 +1840,7 @@ void io_wq_submit_work(struct io_wq_work *work)
	 * Don't allow any multishot execution from io-wq. It's more restrictive
	 * than necessary and also cleaner.
	 */
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
	if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) {
		err = -EBADFD;
		if (!io_file_can_poll(req))
			goto fail;
@@ -1858,7 +1851,7 @@ void io_wq_submit_work(struct io_wq_work *work)
				goto fail;
			return;
		} else {
			req->flags &= ~REQ_F_APOLL_MULTISHOT;
			req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT);
		}
	}

@@ -2549,6 +2542,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
	ktime_t start_time;
	int ret;

	min_events = min_t(int, min_events, ctx->cq_entries);

	if (!io_allowed_run_tw(ctx))
		return -EEXIST;
	if (io_local_work_pending(ctx))
@@ -3435,23 +3430,17 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
			mutex_lock(&ctx->uring_lock);
iopoll_locked:
			ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
			if (likely(!ret2))
				ret2 = io_iopoll_check(ctx, min_complete);
			}
			mutex_unlock(&ctx->uring_lock);
		} else {
			struct ext_arg ext_arg = { .argsz = argsz };

			ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
			if (likely(!ret2))
				ret2 = io_cqring_wait(ctx, min_complete, flags,
						      &ext_arg);
		}
		}

		if (!ret) {
			ret = ret2;
Loading