Commit 5832d264 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:

 - Store ring provided buffers locally for the users, rather than stuff
   them into struct io_kiocb.

   These types of buffers must always be fully consumed or recycled in
   the current context, and leaving them in struct io_kiocb is hence not
   a good idea, as that struct has a vastly different lifetime.

   Basically just an architecture cleanup that can help prevent issues
   with ring provided buffers in the future.

 - Support for mixed CQE sizes in the same ring.

   Before this change, a CQ ring either used the default 16b CQEs, or it
   was set up with 32b CQEs using IORING_SETUP_CQE32. For use cases where
   a few 32b CQEs were needed, this caused everything else to use big
   CQEs. This is wasteful in terms of both memory usage and memory
   bandwidth for the posted CQEs.

   With IORING_SETUP_CQE_MIXED, applications may use request types that
   post both normal 16b and big 32b CQEs on the same ring.

 - Add helpers for async data management, to make it harder for opcode
   handlers to mess it up.

 - Add support for multishot for uring_cmd, which ublk can use. This
   helps improve efficiency, by providing a persistent request type that
   can trigger multiple CQEs.

 - Add initial support for ring feature querying.

   We had basic support for probe operations, but the API isn't great.
   Rather than expand that, add support for QUERY which is easily
   expandable and can cover a lot more cases than the existing probe
   support. This will help applications get a better idea of what
   operations are supported on a given host.

 - zcrx improvements from Pavel:
        - Improve refill entry alignment for better caching
        - Various cleanups, especially around deduplicating normal
          memory vs dmabuf setup.
        - Generalisation of the niov size (Patch 12). It's still hard
          coded to PAGE_SIZE on init, but will let the user specify
          the rx buffer length on setup.
        - Syscall / synchronous buffer return. It'll be used as a slow
          fallback path for returning buffers when the refill queue is
          full. Useful for tolerating slight queue size misconfiguration
          or with inconsistent load.
        - Accounting more memory to cgroups.
        - Additional independent cleanups that will also be useful for
          multi-area support.

 - Various fixes and cleanups

* tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits)
  io_uring/cmd: drop unused res2 param from io_uring_cmd_done()
  io_uring: fix nvme's 32b cqes on mixed cq
  io_uring/query: cap number of queries
  io_uring/query: prevent infinite loops
  io_uring/zcrx: account niov arrays to cgroup
  io_uring/zcrx: allow synchronous buffer return
  io_uring/zcrx: introduce io_parse_rqe()
  io_uring/zcrx: don't adjust free cache space
  io_uring/zcrx: use guards for the refill lock
  io_uring/zcrx: reduce netmem scope in refill
  io_uring/zcrx: protect netdev with pp_lock
  io_uring/zcrx: rename dma lock
  io_uring/zcrx: make niov size variable
  io_uring/zcrx: set sgt for umem area
  io_uring/zcrx: remove dmabuf_offset
  io_uring/zcrx: deduplicate area mapping
  io_uring/zcrx: pass ifq to io_zcrx_alloc_fallback()
  io_uring/zcrx: check all niovs filled with dma addresses
  io_uring/zcrx: move area reg checks into io_import_area
  io_uring/zcrx: don't pass slot to io_zcrx_create_area
  ...
parents 77633c77 ef9f603f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -75,7 +75,7 @@ Create an io_uring instance with the following required setup flags::

  IORING_SETUP_SINGLE_ISSUER
  IORING_SETUP_DEFER_TASKRUN
  IORING_SETUP_CQE32
  IORING_SETUP_CQE32 or IORING_SETUP_CQE_MIXED

Create memory area
------------------
+1 −1
Original line number Diff line number Diff line
@@ -776,7 +776,7 @@ static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
	if (bic->res == -EAGAIN && bic->nowait)
		io_uring_cmd_issue_blocking(cmd);
	else
		io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
		io_uring_cmd_done(cmd, bic->res, issue_flags);
}

static void bio_cmd_bio_end_io(struct bio *bio)
+3 −3
Original line number Diff line number Diff line
@@ -1189,7 +1189,7 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(cmd, res, 0, issue_flags);
	io_uring_cmd_done(cmd, res, issue_flags);
}

#define UBLK_REQUEUE_DELAY_MS	3
@@ -1873,7 +1873,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
	spin_unlock(&ubq->cancel_lock);

	if (!done)
		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
}

/*
@@ -2520,7 +2520,7 @@ static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);

	if (ret != -EIOCBQUEUED)
		io_uring_cmd_done(cmd, ret, 0, issue_flags);
		io_uring_cmd_done(cmd, ret, issue_flags);
}

static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+1 −1
Original line number Diff line number Diff line
@@ -410,7 +410,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,

	if (pdu->bio)
		blk_rq_unmap_user(pdu->bio);
	io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags);
	io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags);
}

static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
+1 −1
Original line number Diff line number Diff line
@@ -4695,7 +4695,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
	btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);

	io_uring_cmd_done(cmd, ret, 0, issue_flags);
	io_uring_cmd_done(cmd, ret, issue_flags);
	add_rchar(current, ret);

	for (index = 0; index < priv->nr_pages; index++)
Loading