Commit dd54fcce authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'io_uring-6.13-20242901' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:

 - Remove a leftover struct from when the cqwait registered waiting was
   transitioned to regions.

 - Fix for an issue introduced in this merge window, where nop->fd might
   be used uninitialized. Ensure it's always set.

 - Add capping of the task_work run in local task_work mode, to prevent
   bursty and long chains from adding too much latency.

 - Work around xa_store() leaving ->head non-NULL if it encounters an
   allocation error during storing. Just a debug trigger, and can go
   away once xa_store() behaves in a more expected way for this
   condition. Not a major thing as it basically requires fault injection
   to trigger it.

 - Fix a few mapping corner cases

 - Fix KCSAN complaint on reading the table size post unlock. Again not
   a "real" issue, but it's easy to silence by just keeping the reading
   inside the lock that protects it.

* tag 'io_uring-6.13-20242901' of git://git.kernel.dk/linux:
  io_uring/tctx: work around xa_store() allocation error issue
  io_uring: fix corner case forgetting to vunmap
  io_uring: fix task_work cap overshooting
  io_uring: check for overflows in io_pin_pages
  io_uring/nop: ensure nop->fd is always initialized
  io_uring: limit local tw done
  io_uring: add io_local_work_pending()
  io_uring/region: return negative -E2BIG in io_create_region()
  io_uring: protect register tracing
  io_uring: remove io_uring_cqwait_reg_arg
parents 133577ca 7eb75ce7
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -336,6 +336,7 @@ struct io_ring_ctx {
	 */
	struct {
		struct llist_head	work_llist;
		struct llist_head	retry_llist;
		unsigned long		check_cq;
		atomic_t		cq_wait_nr;
		atomic_t		cq_timeouts;
+0 −14
Original line number Diff line number Diff line
@@ -873,20 +873,6 @@ enum {
	IORING_REG_WAIT_TS		= (1U << 0),
};

/*
 * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
 * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
 * called rather than pass in a wait argument structure separately.
 */
struct io_uring_cqwait_reg_arg {
	__u32		flags;
	__u32		struct_size;
	__u32		nr_entries;
	__u32		pad;
	__u64		user_addr;
	__u64		pad2[3];
};

/*
 * Argument for io_uring_enter(2) with
 * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
+50 −25
Original line number Diff line number Diff line
@@ -121,6 +121,7 @@

#define IO_COMPL_BATCH			32
#define IO_REQ_ALLOC_BATCH		8
#define IO_LOCAL_TW_DEFAULT_MAX		20

struct io_defer_entry {
	struct list_head	list;
@@ -1255,12 +1256,14 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
	struct llist_node *node = llist_del_all(&ctx->work_llist);

	__io_fallback_tw(node, false);
	node = llist_del_all(&ctx->retry_llist);
	__io_fallback_tw(node, false);
}

static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
				       int min_events)
{
	if (llist_empty(&ctx->work_llist))
	if (!io_local_work_pending(ctx))
		return false;
	if (events < min_events)
		return true;
@@ -1269,8 +1272,29 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
	return false;
}

static int __io_run_local_work_loop(struct llist_node **node,
				    struct io_tw_state *ts,
				    int events)
{
	int ret = 0;

	while (*node) {
		struct llist_node *next = (*node)->next;
		struct io_kiocb *req = container_of(*node, struct io_kiocb,
						    io_task_work.node);
		INDIRECT_CALL_2(req->io_task_work.func,
				io_poll_task_func, io_req_rw_complete,
				req, ts);
		*node = next;
		if (++ret >= events)
			break;
	}

	return ret;
}

static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
			       int min_events)
			       int min_events, int max_events)
{
	struct llist_node *node;
	unsigned int loops = 0;
@@ -1281,25 +1305,23 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
again:
	min_events -= ret;
	ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events);
	if (ctx->retry_llist.first)
		goto retry_done;

	/*
	 * llists are in reverse order, flip it back the right way before
	 * running the pending items.
	 */
	node = llist_reverse_order(llist_del_all(&ctx->work_llist));
	while (node) {
		struct llist_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);
		INDIRECT_CALL_2(req->io_task_work.func,
				io_poll_task_func, io_req_rw_complete,
				req, ts);
		ret++;
		node = next;
	}
	ret += __io_run_local_work_loop(&node, ts, max_events - ret);
	ctx->retry_llist.first = node;
	loops++;

	if (io_run_local_work_continue(ctx, ret, min_events))
		goto again;
retry_done:
	io_submit_flush_completions(ctx);
	if (io_run_local_work_continue(ctx, ret, min_events))
		goto again;
@@ -1313,18 +1335,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
{
	struct io_tw_state ts = {};

	if (llist_empty(&ctx->work_llist))
	if (!io_local_work_pending(ctx))
		return 0;
	return __io_run_local_work(ctx, &ts, min_events);
	return __io_run_local_work(ctx, &ts, min_events,
					max(IO_LOCAL_TW_DEFAULT_MAX, min_events));
}

static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
static int io_run_local_work(struct io_ring_ctx *ctx, int min_events,
			     int max_events)
{
	struct io_tw_state ts = {};
	int ret;

	mutex_lock(&ctx->uring_lock);
	ret = __io_run_local_work(ctx, &ts, min_events);
	ret = __io_run_local_work(ctx, &ts, min_events, max_events);
	mutex_unlock(&ctx->uring_lock);
	return ret;
}
@@ -2328,9 +2352,9 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,

int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
	if (!llist_empty(&ctx->work_llist)) {
	if (io_local_work_pending(ctx)) {
		__set_current_state(TASK_RUNNING);
		if (io_run_local_work(ctx, INT_MAX) > 0)
		if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0)
			return 0;
	}
	if (io_run_task_work() > 0)
@@ -2459,7 +2483,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
{
	if (unlikely(READ_ONCE(ctx->check_cq)))
		return 1;
	if (unlikely(!llist_empty(&ctx->work_llist)))
	if (unlikely(io_local_work_pending(ctx)))
		return 1;
	if (unlikely(task_work_pending(current)))
		return 1;
@@ -2493,8 +2517,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,

	if (!io_allowed_run_tw(ctx))
		return -EEXIST;
	if (!llist_empty(&ctx->work_llist))
		io_run_local_work(ctx, min_events);
	if (io_local_work_pending(ctx))
		io_run_local_work(ctx, min_events,
				  max(IO_LOCAL_TW_DEFAULT_MAX, min_events));
	io_run_task_work();

	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
@@ -2564,8 +2589,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
		 * If we got woken because of task_work being processed, run it
		 * now rather than let the caller do another wait loop.
		 */
		if (!llist_empty(&ctx->work_llist))
			io_run_local_work(ctx, nr_wait);
		if (io_local_work_pending(ctx))
			io_run_local_work(ctx, nr_wait, nr_wait);
		io_run_task_work();

		/*
@@ -3077,7 +3102,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,

	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
	    io_allowed_defer_tw_run(ctx))
		ret |= io_run_local_work(ctx, INT_MAX) > 0;
		ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
	ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
	mutex_lock(&ctx->uring_lock);
	ret |= io_poll_remove_all(ctx, tctx, cancel_all);
@@ -3158,7 +3183,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
		io_run_task_work();
		io_uring_drop_tctx_refs(current);
		xa_for_each(&tctx->xa, index, node) {
			if (!llist_empty(&node->ctx->work_llist)) {
			if (io_local_work_pending(node->ctx)) {
				WARN_ON_ONCE(node->ctx->submitter_task &&
					     node->ctx->submitter_task != current);
				goto end_wait;
+7 −2
Original line number Diff line number Diff line
@@ -347,9 +347,14 @@ static inline int io_run_task_work(void)
	return ret;
}

static inline bool io_local_work_pending(struct io_ring_ctx *ctx)
{
	return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist);
}

static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
{
	return task_work_pending(current) || !llist_empty(&ctx->work_llist);
	return task_work_pending(current) || io_local_work_pending(ctx);
}

static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
@@ -484,6 +489,6 @@ enum {
static inline bool io_has_work(struct io_ring_ctx *ctx)
{
	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
	       !llist_empty(&ctx->work_llist);
	       io_local_work_pending(ctx);
}
#endif
+10 −3
Original line number Diff line number Diff line
@@ -73,6 +73,8 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages,
	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
	if (!IS_ERR(ret))
		goto done;
	if (nr_pages == 1)
		goto fail;

	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
	if (!IS_ERR(ret)) {
@@ -81,7 +83,7 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages,
		*npages = nr_pages;
		return ret;
	}

fail:
	kvfree(pages);
	*out_pages = NULL;
	*npages = 0;
@@ -136,7 +138,12 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
	struct page **pages;
	int ret;

	end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (check_add_overflow(uaddr, len, &end))
		return ERR_PTR(-EOVERFLOW);
	if (check_add_overflow(end, PAGE_SIZE - 1, &end))
		return ERR_PTR(-EOVERFLOW);

	end = end >> PAGE_SHIFT;
	start = uaddr >> PAGE_SHIFT;
	nr_pages = end - start;
	if (WARN_ON_ONCE(!nr_pages))
@@ -229,7 +236,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
	if (!reg->size || reg->mmap_offset || reg->id)
		return -EINVAL;
	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
		return E2BIG;
		return -E2BIG;
	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
		return -EINVAL;
	if (check_add_overflow(reg->user_addr, reg->size, &end))
Loading