Commit 3a4d319a authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - NAPI fixes and cleanups (Pavel, Olivier)

 - Add support for absolute timeouts (Pavel)

 - Fixes for io-wq/sqpoll affinities (Felix)

 - Efficiency improvements for dealing with huge pages (Chenliang)

 - Support for a minwait mode, where the application essentially has two
   timeouts - one smaller one that defines the batch timeout, and the
   overall large one similar to what we had before. This enables
   efficient use of batching based on count + timeout, while still
   working well with periods of less intensive workloads

 - Use ITER_UBUF for single segment sends

 - Add support for incremental buffer consumption. Right now each
   operation will always consume a full buffer. With incremental
   consumption, a recv/read operation only consumes the part of the
   buffer that it needs to satisfy the operation

 - Add support for GCOV for io_uring, to help retain a high coverage of
   test to code ratio

 - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly
   converted to a blocking retry

 - Add support for cloning registered buffers from one ring to another

 - Misc cleanups (Anuj, me)

* tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits)
  io_uring: add IORING_REGISTER_COPY_BUFFERS method
  io_uring/register: provide helper to get io_ring_ctx from 'fd'
  io_uring/rsrc: add reference count to struct io_mapped_ubuf
  io_uring/rsrc: clear 'slot' entry upfront
  io_uring/io-wq: inherit cpuset of cgroup in io worker
  io_uring/io-wq: do not allow pinning outside of cpuset
  io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common()
  io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN
  io_uring/sqpoll: do not allow pinning outside of cpuset
  io_uring/eventfd: move refs to refcount_t
  io_uring: remove unused rsrc_put_fn
  io_uring: add new line after variable declaration
  io_uring: add GCOV_PROFILE_URING Kconfig option
  io_uring/kbuf: add support for incremental buffer consumption
  io_uring/kbuf: pass in 'len' argument for buffer commit
  Revert "io_uring: Require zeroed sqe->len on provided-buffers send"
  io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h
  io_uring/kbuf: add io_kbuf_commit() helper
  io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  io_uring: wire up min batch wake timeout
  ...
parents 69a3a0a4 7cc2a6ea
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -239,6 +239,9 @@ struct io_ring_ctx {
		struct io_rings		*rings;
		struct percpu_ref	refs;

		clockid_t		clockid;
		enum tk_offsets		clock_offset;

		enum task_work_notify_mode	notify_method;
		unsigned			sq_thread_idle;
	} ____cacheline_aligned_in_smp;
+41 −1
Original line number Diff line number Diff line
@@ -440,11 +440,21 @@ struct io_uring_cqe {
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinguish
 * 			them from sends.
 * IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
 *			more completions. In other words, the buffer is being
 *			partially consumed, and will be used by the kernel for
 *			more completions. This is only set for buffers used via
 *			the incremental buffer consumption, as provided by
 *			a ring buffer setup with IOU_PBUF_RING_INC. For any
 *			other provided buffer type, a buffer passed back in a
 *			completion is automatically returned to the
 *			application.
 */
#define IORING_CQE_F_BUFFER		(1U << 0)
#define IORING_CQE_F_MORE		(1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
#define IORING_CQE_F_NOTIF		(1U << 3)
#define IORING_CQE_F_BUF_MORE		(1U << 4)

#define IORING_CQE_BUFFER_SHIFT		16

@@ -507,6 +517,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_SQ_WAIT		(1U << 2)
#define IORING_ENTER_EXT_ARG		(1U << 3)
#define IORING_ENTER_REGISTERED_RING	(1U << 4)
#define IORING_ENTER_ABS_TIMER		(1U << 5)

/*
 * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -542,6 +553,7 @@ struct io_uring_params {
#define IORING_FEAT_LINKED_FILE		(1U << 12)
#define IORING_FEAT_REG_REG_RING	(1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE	(1U << 14)
#define IORING_FEAT_MIN_TIMEOUT		(1U << 15)

/*
 * io_uring_register(2) opcodes and arguments
@@ -595,6 +607,11 @@ enum io_uring_register_op {
	IORING_REGISTER_NAPI			= 27,
	IORING_UNREGISTER_NAPI			= 28,

	IORING_REGISTER_CLOCK			= 29,

	/* copy registered buffers from source ring to current ring */
	IORING_REGISTER_COPY_BUFFERS		= 30,

	/* this goes last */
	IORING_REGISTER_LAST,

@@ -675,6 +692,21 @@ struct io_uring_restriction {
	__u32 resv2[3];
};

/*
 * NOTE(review): looks like the argument structure for the
 * IORING_REGISTER_CLOCK opcode -- confirm against the register handler.
 * Four 32-bit words, 16 bytes in total.
 */
struct io_uring_clock_register {
	__u32	clockid;	/* clock identifier; accepted values are not visible here */
	__u32	__resv[3];	/* presumably reserved and expected to be zero -- TODO confirm */
};

/*
 * NOTE(review): presumably a bit for the 'flags' member of
 * struct io_uring_copy_buffers, indicating that 'src_fd' refers to a
 * registered ring rather than a normal file descriptor -- confirm
 * against the IORING_REGISTER_COPY_BUFFERS implementation.
 */
enum {
	IORING_REGISTER_SRC_REGISTERED = 1,
};

/*
 * NOTE(review): looks like the argument structure for
 * IORING_REGISTER_COPY_BUFFERS, which copies registered buffers from a
 * source ring to the current ring -- confirm against the register handler.
 * Eight 32-bit words, 32 bytes in total.
 */
struct io_uring_copy_buffers {
	__u32	src_fd;		/* identifies the source ring */
	__u32	flags;		/* presumably takes IORING_REGISTER_SRC_REGISTERED -- TODO confirm */
	__u32	pad[6];		/* padding; presumably expected to be zero -- TODO confirm */
};

struct io_uring_buf {
	__u64	addr;
	__u32	len;
@@ -707,9 +739,17 @@ struct io_uring_buf_ring {
 *			mmap(2) with the offset set as:
 *			IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
 *			to get a virtual mapping for the ring.
 * IOU_PBUF_RING_INC:	If set, buffers consumed from this buffer ring can be
 *			consumed incrementally. Normally one (or more) buffers
 *			are fully consumed. With incremental consumption, it's
 *			feasible to register big ranges of buffers, and each
 *			use of it will consume only as much as it needs. This
 *			requires that both the kernel and application keep
 *			track of where the current read/recv index is at.
 */
/* Flag values for provided-buffer ring registration; see descriptions above. */
enum io_uring_register_pbuf_ring_flags {
	IOU_PBUF_RING_MMAP	= 1,	/* ring mapping is obtained via mmap(2) */
	IOU_PBUF_RING_INC	= 2,	/* buffers may be consumed incrementally */
};

/* argument for IORING_(UN)REGISTER_PBUF_RING */
@@ -758,7 +798,7 @@ enum io_uring_register_restriction_op {
/*
 * Argument block describing how to wait for completion events.
 *
 * 'min_wait_usec' occupies the 32-bit slot that was previously named
 * 'pad', so the structure stays 24 bytes and 'ts' stays at offset 16.
 * Keeping both members would grow the structure and move 'ts', breaking
 * the layout existing callers were built against.
 *
 * NOTE(review): presumably passed to io_uring_enter(2) when
 * IORING_ENTER_EXT_ARG is set -- confirm against the syscall handler.
 */
struct io_uring_getevents_arg {
	__u64	sigmask;	/* presumably a user pointer to a signal mask -- TODO confirm */
	__u32	sigmask_sz;	/* size of the signal mask, going by the name */
	__u32	min_wait_usec;	/* presumably the minimum batch wait in usec (see IORING_FEAT_MIN_TIMEOUT) -- confirm */
	__u64	ts;		/* presumably a user pointer to a timeout value -- TODO confirm */
};

+13 −0
Original line number Diff line number Diff line
@@ -1687,6 +1687,19 @@ config IO_URING
	  applications to submit and complete IO through submission and
	  completion rings that are shared between the kernel and application.

# Opt-in coverage instrumentation scoped to io_uring. Only selectable when
# GCOV_KERNEL is enabled; the io_uring Makefile tests this symbol to turn
# on GCOV_PROFILE for its objects.
config GCOV_PROFILE_URING
	bool "Enable GCOV profiling on the io_uring subsystem"
	depends on GCOV_KERNEL
	help
	  Enable GCOV profiling on the io_uring subsystem, to facilitate
	  code coverage testing.

	  If unsure, say N.

	  Note that this will have a negative impact on the performance of
	  the io_uring subsystem, hence this should only be enabled for
	  specific test purposes.

config ADVISE_SYSCALLS
	bool "Enable madvise/fadvise syscalls" if EXPERT
	default y
+4 −0
Original line number Diff line number Diff line
@@ -2,6 +2,10 @@
#
# Makefile for io_uring

# Turn on gcov profiling for the objects built from this Makefile, but only
# when the dedicated CONFIG_GCOV_PROFILE_URING option has been selected.
ifdef CONFIG_GCOV_PROFILE_URING
GCOV_PROFILE := y
endif

obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
					tctx.o filetable.o rw.o net.o poll.o \
					eventfd.o uring_cmd.o openclose.o \
+7 −6
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ struct io_ev_fd {
	struct eventfd_ctx	*cq_ev_fd;
	unsigned int		eventfd_async: 1;
	struct rcu_head		rcu;
	atomic_t		refs;
	refcount_t		refs;
	atomic_t		ops;
};

@@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)

	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);

	if (atomic_dec_and_test(&ev_fd->refs))
	if (refcount_dec_and_test(&ev_fd->refs))
		io_eventfd_free(rcu);
}

@@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
	 */
	if (unlikely(!ev_fd))
		return;
	if (!atomic_inc_not_zero(&ev_fd->refs))
	if (!refcount_inc_not_zero(&ev_fd->refs))
		return;
	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
		goto out;
@@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
		}
	}
out:
	if (atomic_dec_and_test(&ev_fd->refs))
	if (refcount_dec_and_test(&ev_fd->refs))
		call_rcu(&ev_fd->rcu, io_eventfd_free);
}

@@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}
@@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	atomic_set(&ev_fd->refs, 1);
	refcount_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	return 0;
@@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (atomic_dec_and_test(&ev_fd->refs))
		if (refcount_dec_and_test(&ev_fd->refs))
			call_rcu(&ev_fd->rcu, io_eventfd_free);
		return 0;
	}
Loading