Commit cfd40392 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'io_uring-6.19-20251208' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:
 "Followup set of fixes for io_uring for this merge window. These are
  either later fixes, or cleanups that don't make sense to defer. This
  pull request contains:

   - Fix for a recent regression in io-wq worker creation

   - Tracing cleanup

   - Use READ_ONCE/WRITE_ONCE consistently for ring mapped kbufs. Mostly
     for documentation purposes, indicating that they are shared with
     userspace

   - Fix for POLL_ADD losing a completion, if the request is updated and
     now is triggerable - eg, if POLLIN is set with the updated, and the
     polled file is readable

   - In conjunction with the above fix, also unify how poll wait queue
     entries are deleted with the head update. We had 3 different spots
     doing both the list deletion and head write, with one of them
     nicely documented. Abstract that into a helper and use it
     consistently

   - Small series from Joanne fixing an issue with buffer cloning, and
     cleaning up the arg validation"

* tag 'io_uring-6.19-20251208' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring/poll: unify poll waitqueue entry and list removal
  io_uring/kbuf: use WRITE_ONCE() for userspace-shared buffer ring fields
  io_uring/kbuf: use READ_ONCE() for userspace-mapped memory
  io_uring/rsrc: fix lost entries after cloned range
  io_uring/rsrc: rename misleading src_node variable in io_clone_buffers()
  io_uring/rsrc: clean up buffer cloning arg validation
  io_uring/trace: rename io_uring_queue_async_work event "rw" field
  io_uring/io-wq: always retry worker create on ERESTART*
  io_uring/poll: correctly handle io_poll_add() return value on update
parents 4482ebb2 55d57b3b
Loading
Loading
Loading
Loading
+6 −6
Original line number Diff line number Diff line
@@ -133,15 +133,15 @@ TRACE_EVENT(io_uring_file_get,
 * io_uring_queue_async_work - called before submitting a new async work
 *
 * @req:	pointer to a submitted request
 * @rw:		type of workqueue, hashed or normal
 * @hashed:	whether async work is hashed
 *
 * Allows to trace asynchronous work submission.
 */
TRACE_EVENT(io_uring_queue_async_work,

	TP_PROTO(struct io_kiocb *req, int rw),
	TP_PROTO(struct io_kiocb *req, bool hashed),

	TP_ARGS(req, rw),
	TP_ARGS(req, hashed),

	TP_STRUCT__entry (
		__field(  void *,			ctx		)
@@ -150,7 +150,7 @@ TRACE_EVENT(io_uring_queue_async_work,
		__field(  u8,				opcode		)
		__field(  unsigned long long,		flags		)
		__field(  struct io_wq_work *,		work		)
		__field(  int,				rw		)
		__field(  bool,				hashed		)

		__string( op_str, io_uring_get_opcode(req->opcode)	)
	),
@@ -162,7 +162,7 @@ TRACE_EVENT(io_uring_queue_async_work,
		__entry->flags		= (__force unsigned long long) req->flags;
		__entry->opcode		= req->opcode;
		__entry->work		= &req->work;
		__entry->rw		= rw;
		__entry->hashed		= hashed;

		__assign_str(op_str);
	),
@@ -170,7 +170,7 @@ TRACE_EVENT(io_uring_queue_async_work,
	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
		__entry->ctx, __entry->req, __entry->user_data,
		__get_str(op_str), __entry->flags,
		__entry->rw ? "hashed" : "normal", __entry->work)
		__entry->hashed ? "hashed" : "normal", __entry->work)
);

/**
+3 −2
Original line number Diff line number Diff line
@@ -805,11 +805,12 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
	 */
	if (fatal_signal_pending(current))
		return false;
	if (worker->init_retries++ >= WORKER_INIT_LIMIT)
		return false;

	worker->init_retries++;
	switch (err) {
	case -EAGAIN:
		return worker->init_retries <= WORKER_INIT_LIMIT;
	/* Analogous to a fork() syscall, always retry on a restartable error */
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
+8 −8
Original line number Diff line number Diff line
@@ -44,11 +44,11 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
		buf_len -= this_len;
		/* Stop looping for invalid buffer length of 0 */
		if (buf_len || !this_len) {
			buf->addr += this_len;
			buf->len = buf_len;
			WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
			WRITE_ONCE(buf->len, buf_len);
			return false;
		}
		buf->len = 0;
		WRITE_ONCE(buf->len, 0);
		bl->head++;
		len -= this_len;
	}
@@ -198,9 +198,9 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
	if (*len == 0 || *len > buf_len)
		*len = buf_len;
	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
	req->buf_index = buf->bid;
	req->buf_index = READ_ONCE(buf->bid);
	sel.buf_list = bl;
	sel.addr = u64_to_user_ptr(buf->addr);
	sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));

	if (io_should_commit(req, issue_flags)) {
		io_kbuf_commit(req, sel.buf_list, *len, 1);
@@ -280,7 +280,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
	if (!arg->max_len)
		arg->max_len = INT_MAX;

	req->buf_index = buf->bid;
	req->buf_index = READ_ONCE(buf->bid);
	do {
		u32 len = READ_ONCE(buf->len);

@@ -291,11 +291,11 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
				arg->partial_map = 1;
				if (iov != arg->iovs)
					break;
				buf->len = len;
				WRITE_ONCE(buf->len, len);
			}
		}

		iov->iov_base = u64_to_user_ptr(buf->addr);
		iov->iov_base = u64_to_user_ptr(READ_ONCE(buf->addr));
		iov->iov_len = len;
		iov++;

+29 −23
Original line number Diff line number Diff line
@@ -138,14 +138,32 @@ static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
}

static void io_poll_remove_waitq(struct io_poll *poll)
{
	/*
	 * If the waitqueue is being freed early but someone is already holds
	 * ownership over it, we have to tear down the request as best we can.
	 * That means immediately removing the request from its waitqueue and
	 * preventing all further accesses to the waitqueue via the request.
	 */
	list_del_init(&poll->wait.entry);

	/*
	 * Careful: this *must* be the last step, since as soon as req->head is
	 * NULL'ed out, the request can be completed and freed, since
	 * io_poll_remove_entry() will no longer need to take the waitqueue
	 * lock.
	 */
	smp_store_release(&poll->head, NULL);
}

static inline void io_poll_remove_entry(struct io_poll *poll)
{
	struct wait_queue_head *head = smp_load_acquire(&poll->head);

	if (head) {
		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		poll->head = NULL;
		io_poll_remove_waitq(poll);
		spin_unlock_irq(&head->lock);
	}
}
@@ -368,23 +386,7 @@ static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
	io_poll_mark_cancelled(req);
	/* we have to kick tw in case it's not already */
	io_poll_execute(req, 0);

	/*
	 * If the waitqueue is being freed early but someone is already
	 * holds ownership over it, we have to tear down the request as
	 * best we can. That means immediately removing the request from
	 * its waitqueue and preventing all further accesses to the
	 * waitqueue via the request.
	 */
	list_del_init(&poll->wait.entry);

	/*
	 * Careful: this *must* be the last step, since as soon
	 * as req->head is NULL'ed out, the request can be
	 * completed and freed, since aio_poll_complete_work()
	 * will no longer need to take the waitqueue lock.
	 */
	smp_store_release(&poll->head, NULL);
	io_poll_remove_waitq(poll);
	return 1;
}

@@ -413,8 +415,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,

		/* optional, saves extra locking for removal in tw handler */
		if (mask && poll->events & EPOLLONESHOT) {
			list_del_init(&poll->wait.entry);
			poll->head = NULL;
			io_poll_remove_waitq(poll);
			if (wqe_is_double(wait))
				req->flags &= ~REQ_F_DOUBLE_POLL;
			else
@@ -937,12 +938,17 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)

		ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
		/* successfully updated, don't complete poll request */
		if (!ret2 || ret2 == -EIOCBQUEUED)
		if (ret2 == IOU_ISSUE_SKIP_COMPLETE)
			goto out;
		/* request completed as part of the update, complete it */
		else if (ret2 == IOU_COMPLETE)
			goto complete;
	}

	req_set_fail(preq);
	io_req_set_res(preq, -ECANCELED, 0);
complete:
	if (preq->cqe.res < 0)
		req_set_fail(preq);
	preq->io_task_work.func = io_req_task_complete;
	io_req_task_work_add(preq);
out:
+21 −26
Original line number Diff line number Diff line
@@ -1186,12 +1186,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		return -ENXIO;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
		return -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
@@ -1201,31 +1205,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	/* Copy original dst nodes from before the cloned range */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
@@ -1238,8 +1227,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
				io_rsrc_data_free(ctx, &data);
				return -ENOMEM;
			}

			refcount_inc(&src_node->buf->refs);
@@ -1249,6 +1238,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
		i++;
	}

	/* Copy original dst nodes from after the cloned range */
	for (i = nbufs; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
@@ -1265,10 +1264,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*