Commit 9d88bb92 authored by Linus Torvalds
Browse files

Merge tag 'io_uring-7.1-20260430' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

 - Remove dead struct io_buffer_list member

 - Fix for incrementally consumed buffers with recvmsg multishot, which
   requires a minimum amount of space left in a buffer to hold the
   headers for any receive. If there's still a bit of buffer left but
   it's smaller than that minimum, then userspace will see a spurious
   -EFAULT returned in the CQE

 - Locking fix for the DEFER_TASKRUN retry list, which otherwise could
   race with fallback cancelations. If the task is exiting with
   task_work left in both the normal and retry list AND the exit cleanup
   races with the task running task work, then entries could either be
   doubly completed or lost

 - Cap NAPI busy poll timeout to something sane, to avoid syzbot running
   into excessive polling and triggering warnings around that

* tag 'io_uring-7.1-20260430' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring/tw: serialize ctx->retry_llist with ->uring_lock
  io_uring/napi: cap busy_poll_to 10 msec
  io_uring/kbuf: support min length left for incremental buffers
  io_uring/kbuf: kill dead struct io_buffer_list 'nr_entries' member
parents 33d0c9c5 17666e2d
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -905,7 +905,8 @@ struct io_uring_buf_reg {
	__u32	ring_entries;
	__u16	bgid;
	__u16	flags;
	__u64	resv[3];
	__u32	min_left;
	__u32	resv[5];
};

/* argument for IORING_REGISTER_PBUF_STATUS */
+7 −2
Original line number Diff line number Diff line
@@ -47,7 +47,7 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
		this_len = min_t(u32, len, buf_len);
		buf_len -= this_len;
		/* Stop looping for invalid buffer length of 0 */
		if (buf_len || !this_len) {
		if (buf_len > bl->min_left_sub_one || !this_len) {
			WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
			WRITE_ONCE(buf->len, buf_len);
			return false;
@@ -637,6 +637,10 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	/* minimum left byte count is a property of incremental buffers */
	if (!(reg.flags & IOU_PBUF_RING_INC) && reg.min_left)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
@@ -680,10 +684,11 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
	}
#endif

	bl->nr_entries = reg.ring_entries;
	bl->mask = reg.ring_entries - 1;
	bl->flags |= IOBL_BUF_RING;
	bl->buf_ring = br;
	if (reg.min_left)
		bl->min_left_sub_one = reg.min_left - 1;
	if (reg.flags & IOU_PBUF_RING_INC)
		bl->flags |= IOBL_INC;
	ret = io_buffer_add_list(ctx, bl, reg.bgid);
+7 −1
Original line number Diff line number Diff line
@@ -27,12 +27,18 @@ struct io_buffer_list {
	__u16 bgid;

	/* below is for ring provided buffers */
	__u16 nr_entries;
	__u16 head;
	__u16 mask;

	__u16 flags;

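	/*
	 * Minimum amount that must be left in an incrementally consumed
	 * buffer for it to be reused. If less than this is left at
	 * consumption time, the buffer is done and head is incremented to
	 * the next buffer.
	 *
	 * Stored as the registered min_left minus one, so the commit path
	 * can test "remaining > min_left_sub_one". It is only assigned
	 * when min_left is non-zero; presumably it is 0 otherwise, which
	 * keeps the plain "any bytes left" behaviour (verify that the
	 * list is zero-initialised at allocation).
	 */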
	__u32 min_left_sub_one;

	struct io_mapped_region region;
};

+2 −0
Original line number Diff line number Diff line
@@ -276,6 +276,8 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx,
	/* clean the napi list for new settings */
	io_napi_free(ctx);
	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
	/* cap NAPI at 10 msec of spin time */
	napi->busy_poll_to = min(10000, napi->busy_poll_to);
	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
	return 0;
+11 −1
Original line number Diff line number Diff line
@@ -273,8 +273,18 @@ void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags)

void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{
	struct llist_node *node = llist_del_all(&ctx->work_llist);
	struct llist_node *node;

	/*
	 * Running the work items may utilize ->retry_llist as a means
	 * for capping the number of task_work entries run at the same
	 * time. But that list can potentially race with moving the work
	 * from here, if the task is exiting. As any normal task_work
	 * running holds ->uring_lock already, just guard this slow path
	 * with ->uring_lock to avoid racing on ->retry_llist.
	 */
	guard(mutex)(&ctx->uring_lock);
	node = llist_del_all(&ctx->work_llist);
	__io_fallback_tw(node, false);
	node = llist_del_all(&ctx->retry_llist);
	__io_fallback_tw(node, false);