Commit 8350142a authored by Linus Torvalds
Browse files

Merge tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - Cleanups of the eventfd handling code, making it fully private.

 - Support for sending a sync message to another ring, without having a
   ring available to send a normal async message.

 - Get rid of the separate unlocked hash table, unify everything around
   the single locked one.

 - Add support for ring resizing. It can be hard to appropriately size
   the CQ ring upfront, if the application doesn't know how busy it will
   be. This results in applications sizing rings for the most busy case,
   which can be wasteful. With ring resizing, they can start small and
   grow the ring, if needed.

 - Add support for fixed wait regions, rather than needing to copy the
   same wait data tons of times for each wait operation.

 - Rewrite the resource node handling, which before was serialized per
   ring. This caused issues with particularly fixed files, where one
   file waiting on IO could hold up putting and freeing of other
   unrelated files. Now each node is handled separately. New code is
   much simpler too, and was a net 250 line reduction in code.

 - Add support for just doing partial buffer clones, rather than always
   cloning the entire buffer table.

 - Series adding static NAPI support, where a specific NAPI instance is
   used rather than having a list of them available that need lookup.

 - Add support for mapped regions, and also convert the fixed wait
   support mentioned above to that concept. This avoids doing special
   mappings for various planned features, and folds the existing
   registered wait into that too.

 - Add support for hybrid IO polling, which is a variant of strict
   IOPOLL but with an initial sleep delay to avoid spinning too early
   and wasting resources on devices that aren't necessarily in the < 5
   usec category wrt latencies.

 - Various cleanups and little fixes.

* tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux: (79 commits)
  io_uring/region: fix error codes after failed vmap
  io_uring: restore back registered wait arguments
  io_uring: add memory region registration
  io_uring: introduce concept of memory regions
  io_uring: temporarily disable registered waits
  io_uring: disable ENTER_EXT_ARG_REG for IOPOLL
  io_uring: fortify io_pin_pages with a warning
  switch io_msg_ring() to CLASS(fd)
  io_uring: fix invalid hybrid polling ctx leaks
  io_uring/uring_cmd: fix buffer index retrieval
  io_uring/rsrc: add & apply io_req_assign_buf_node()
  io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node'
  io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers
  io_uring: avoid normal tw intermediate fallback
  io_uring/napi: add static napi tracking strategy
  io_uring/napi: clean up __io_napi_do_busy_loop
  io_uring/napi: Use lock guards
  io_uring/napi: improve __io_napi_add
  io_uring/napi: fix io_napi_entry RCU accesses
  io_uring/napi: protect concurrent io_napi_entry timeout accesses
  ...
parents 77a0cfaf a6529588
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -110,7 +110,7 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,

/*
 * Return the task_struct associated with the request that backs @cmd.
 *
 * NOTE(review): this listing is a diff with its +/- markers stripped
 * (the hunk adds one line and removes one), so the two return
 * statements below are most likely the removed line followed by its
 * replacement. Confirm against the merged tree before reading this as
 * live code.
 */
static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
{
	return cmd_to_io_kiocb(cmd)->task;
	return cmd_to_io_kiocb(cmd)->tctx->task;
}

#endif /* _LINUX_IO_URING_CMD_H */
+56 −32
Original line number Diff line number Diff line
@@ -56,19 +56,18 @@ struct io_wq_work {
	int cancel_seq;
};

struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
/*
 * A table of resource nodes: @nodes holds the node pointers and @nr is
 * presumably the number of slots in it (sizing happens outside this
 * view; confirm).
 */
struct io_rsrc_data {
	unsigned int			nr;
	struct io_rsrc_node		**nodes;
};

struct io_file_table {
	struct io_fixed_file *files;
	struct io_rsrc_data data;
	unsigned long *bitmap;
	unsigned int alloc_hint;
};

/*
 * One hash bucket, aligned to a cacheline on SMP builds.
 *
 * NOTE(review): the hunk header counts (19 lines before, 18 after)
 * suggest the `lock` member is a removed line in this diff; confirm
 * against the merged tree.
 */
struct io_hash_bucket {
	spinlock_t		lock;
	struct hlist_head	list;
} ____cacheline_aligned_in_smp;

@@ -77,6 +76,12 @@ struct io_hash_table {
	unsigned		hash_bits;
};

/*
 * Bookkeeping for one mapped memory region: an array of page pointers,
 * the number of pages, and a pointer to the mapping.
 */
struct io_mapped_region {
	struct page		**pages;
	/* presumably the address obtained from vmap() of @pages; confirm */
	void			*vmap_ptr;
	size_t			nr_pages;
};

/*
 * Arbitrary limit, can be raised if need be
 */
@@ -86,6 +91,7 @@ struct io_uring_task {
	/* submission side */
	int				cached_refs;
	const struct io_ring_ctx 	*last;
	struct task_struct		*task;
	struct io_wq			*io_wq;
	struct file			*registered_rings[IO_RINGFD_REG_MAX];

@@ -271,7 +277,6 @@ struct io_ring_ctx {
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		struct io_rsrc_node	*rsrc_node;
		atomic_t		cancel_seq;

		/*
@@ -284,15 +289,13 @@ struct io_ring_ctx {
		struct io_wq_work_list	iopoll_list;

		struct io_file_table	file_table;
		struct io_mapped_ubuf	**user_bufs;
		unsigned		nr_user_files;
		unsigned		nr_user_bufs;
		struct io_rsrc_data	buf_table;

		struct io_submit_state	submit_state;

		struct xarray		io_bl_xa;

		struct io_hash_table	cancel_table_locked;
		struct io_hash_table	cancel_table;
		struct io_alloc_cache	apoll_cache;
		struct io_alloc_cache	netmsg_cache;
		struct io_alloc_cache	rw_cache;
@@ -303,6 +306,11 @@ struct io_ring_ctx {
		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
		 */
		struct hlist_head	cancelable_uring_cmd;
		/*
		 * For Hybrid IOPOLL, runtime in hybrid polling, without
		 * scheduling time
		 */
		u64					hybrid_poll_time;
	} ____cacheline_aligned_in_smp;

	struct {
@@ -317,6 +325,9 @@ struct io_ring_ctx {
		unsigned		cq_entries;
		struct io_ev_fd	__rcu	*io_ev_fd;
		unsigned		cq_extra;

		void			*cq_wait_arg;
		size_t			cq_wait_size;
	} ____cacheline_aligned_in_smp;

	/*
@@ -343,7 +354,6 @@ struct io_ring_ctx {

	struct list_head	io_buffers_comp;
	struct list_head	cq_overflow_list;
	struct io_hash_table	cancel_table;

	struct hlist_head	waitid_list;

@@ -367,16 +377,6 @@ struct io_ring_ctx {
	struct wait_queue_head		poll_wq;
	struct io_restriction		restrictions;

	/* slow path rsrc auxiliary data, used by update/register */
	struct io_rsrc_data		*file_data;
	struct io_rsrc_data		*buf_data;

	/* protected by ->uring_lock */
	struct list_head		rsrc_ref_list;
	struct io_alloc_cache		rsrc_node_cache;
	struct wait_queue_head		rsrc_quiesce_wq;
	unsigned			rsrc_quiesce;

	u32			pers_next;
	struct xarray		personalities;

@@ -410,7 +410,7 @@ struct io_ring_ctx {
	/* napi busy poll default timeout */
	ktime_t			napi_busy_poll_dt;
	bool			napi_prefer_busy_poll;
	bool			napi_enabled;
	u8			napi_track_mode;

	DECLARE_HASHTABLE(napi_ht, 4);
#endif
@@ -418,6 +418,13 @@ struct io_ring_ctx {
	/* protected by ->completion_lock */
	unsigned			evfd_last_cq_tail;

	/*
	 * Protection for resize vs mmap races - both the mmap and resize
	 * side will need to grab this lock, to prevent either side from
	 * being run concurrently with the other.
	 */
	struct mutex			resize_lock;

	/*
	 * If IORING_SETUP_NO_MMAP is used, then the below holds
	 * the gup'ed pages for the two rings, and the sqes.
@@ -426,6 +433,9 @@ struct io_ring_ctx {
	unsigned short			n_sqe_pages;
	struct page			**ring_pages;
	struct page			**sqe_pages;

	/* used for optimised request parameter and wait argument passing  */
	struct io_mapped_region		param_region;
};

struct io_tw_state {
@@ -448,6 +458,7 @@ enum {
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_HYBRID_IOPOLL_STATE_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_BUFFER_RING_BIT,
	REQ_F_REISSUE_BIT,
@@ -460,7 +471,6 @@ enum {
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_APOLL_MULTISHOT_BIT,
	REQ_F_CLEAR_POLLIN_BIT,
	REQ_F_HASH_LOCKED_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,
@@ -469,6 +479,7 @@ enum {
	REQ_F_BL_EMPTY_BIT,
	REQ_F_BL_NO_RECYCLE_BIT,
	REQ_F_BUFFERS_COMMIT_BIT,
	REQ_F_BUF_NODE_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
@@ -507,6 +518,8 @@ enum {
	REQ_F_NEED_CLEANUP	= IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= IO_REQ_FLAG(REQ_F_POLLED_BIT),
	/* every req only blocks once in hybrid poll */
	REQ_F_IOPOLL_STATE        = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
	/* buffer selected from ring, needs commit */
@@ -535,8 +548,6 @@ enum {
	REQ_F_APOLL_MULTISHOT	= IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
	/* recvmsg special flag, clear EPOLLIN */
	REQ_F_CLEAR_POLLIN	= IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
	REQ_F_HASH_LOCKED	= IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
	/* don't use lazy poll wake for this request */
	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
	/* file is pollable */
@@ -547,6 +558,8 @@ enum {
	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
	/* buffer ring head needs incrementing on put */
	REQ_F_BUFFERS_COMMIT	= IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
	/* buf node is valid */
	REQ_F_BUF_NODE		= IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -616,12 +629,9 @@ struct io_kiocb {
	struct io_cqe			cqe;

	struct io_ring_ctx		*ctx;
	struct task_struct		*task;
	struct io_uring_task		*tctx;

	union {
		/* store used ubuf, so we can prevent reloading */
		struct io_mapped_ubuf	*imu;

		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
		struct io_buffer	*kbuf;

@@ -630,6 +640,8 @@ struct io_kiocb {
		 * REQ_F_BUFFER_RING is set.
		 */
		struct io_buffer_list	*buf_list;

		struct io_rsrc_node	*buf_node;
	};

	union {
@@ -639,13 +651,20 @@ struct io_kiocb {
		__poll_t apoll_events;
	};

	struct io_rsrc_node		*rsrc_node;
	struct io_rsrc_node		*file_node;

	atomic_t			refs;
	bool				cancel_seq_set;
	struct io_task_work		io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	union {
		/*
		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
		 * poll
		 */
		struct hlist_node	hash_node;
		/* For IOPOLL setup queues, with hybrid polling */
		u64                     iopoll_start;
	};
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll		*apoll;
	/* opcode allocated if it needs to store data for async defer */
@@ -668,4 +687,9 @@ struct io_overflow_cqe {
	struct io_uring_cqe cqe;
};

/*
 * Report whether @ctx has the IORING_SETUP_CQE32 setup flag set.
 */
static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
{
	const bool big_cqes = ctx->flags & IORING_SETUP_CQE32;

	return big_cqes;
}

#endif
+9 −15
Original line number Diff line number Diff line
@@ -315,20 +315,14 @@ TRACE_EVENT(io_uring_fail_link,
 * io_uring_complete - called when completing an SQE
 *
 * @ctx:		pointer to a ring context structure
 * @req:		pointer to a submitted request
 * @user_data:		user data associated with the request
 * @res:		result of the request
 * @cflags:		completion flags
 * @extra1:		extra 64-bit data for CQE32
 * @extra2:		extra 64-bit data for CQE32
 *
 * @req:		(optional) pointer to a submitted request
 * @cqe:		pointer to the filled in CQE being posted
 */
TRACE_EVENT(io_uring_complete,

	TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
		 u64 extra1, u64 extra2),
TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),

	TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
	TP_ARGS(ctx, req, cqe),

	TP_STRUCT__entry (
		__field(  void *,	ctx		)
@@ -343,11 +337,11 @@ TRACE_EVENT(io_uring_complete,
	TP_fast_assign(
		__entry->ctx		= ctx;
		__entry->req		= req;
		__entry->user_data	= user_data;
		__entry->res		= res;
		__entry->cflags		= cflags;
		__entry->extra1		= extra1;
		__entry->extra2		= extra2;
		__entry->user_data	= cqe->user_data;
		__entry->res		= cqe->res;
		__entry->cflags		= cqe->flags;
		__entry->extra1		= io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
		__entry->extra2		= io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
	),

	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
+115 −4
Original line number Diff line number Diff line
@@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit {
 */
#define IORING_SETUP_NO_SQARRAY		(1U << 16)

/* Use hybrid poll in iopoll process */
#define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)

enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
@@ -416,6 +419,9 @@ enum io_uring_msg_ring_flags {
 * IORING_NOP_INJECT_RESULT	Inject result from sqe->result
 */
#define IORING_NOP_INJECT_RESULT	(1U << 0)
#define IORING_NOP_FILE			(1U << 1)
#define IORING_NOP_FIXED_FILE		(1U << 2)
#define IORING_NOP_FIXED_BUFFER		(1U << 3)

/*
 * IO completion data structure (Completion Queue Entry)
@@ -518,6 +524,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_EXT_ARG		(1U << 3)
#define IORING_ENTER_REGISTERED_RING	(1U << 4)
#define IORING_ENTER_ABS_TIMER		(1U << 5)
#define IORING_ENTER_EXT_ARG_REG	(1U << 6)

/*
 * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -612,6 +619,16 @@ enum io_uring_register_op {
	/* clone registered buffers from source ring to current ring */
	IORING_REGISTER_CLONE_BUFFERS		= 30,

	/* send MSG_RING without having a ring */
	IORING_REGISTER_SEND_MSG_RING		= 31,

	/* 32 reserved for zc rx */

	/* resize CQ ring */
	IORING_REGISTER_RESIZE_RINGS		= 33,

	IORING_REGISTER_MEM_REGION		= 34,

	/* this goes last */
	IORING_REGISTER_LAST,

@@ -632,6 +649,31 @@ struct io_uring_files_update {
	__aligned_u64 /* __s32 * */ fds;
};

/* Region type flags; presumably for struct io_uring_region_desc flags */
enum {
	/* initialise with user provided memory pointed by user_addr */
	IORING_MEM_REGION_TYPE_USER		= 1,
};

/*
 * Describes one memory region. Field order and sizes are part of the
 * userspace ABI and must not change.
 */
struct io_uring_region_desc {
	/* user memory backing the region when IORING_MEM_REGION_TYPE_USER is used */
	__u64 user_addr;
	/* presumably the region size in bytes; TODO confirm units */
	__u64 size;
	/* presumably IORING_MEM_REGION_TYPE_* bits; confirm */
	__u32 flags;
	__u32 id;
	__u64 mmap_offset;
	/* reserved space */
	__u64 __resv[4];
};

/* Registration flags; presumably for struct io_uring_mem_region_reg flags */
enum {
	/* expose the region as registered wait arguments */
	IORING_MEM_REGION_REG_WAIT_ARG		= 1,
};

/*
 * Memory region registration argument, presumably the payload of
 * IORING_REGISTER_MEM_REGION (confirm against the register handler).
 */
struct io_uring_mem_region_reg {
	__u64 region_uptr; /* struct io_uring_region_desc * */
	/* presumably IORING_MEM_REGION_REG_* bits; confirm */
	__u64 flags;
	/* reserved space */
	__u64 __resv[2];
};

/*
 * Register a fully sparse file space, rather than pass in an array of all
 * -1 file descriptors.
@@ -698,13 +740,17 @@ struct io_uring_clock_register {
};

enum {
	IORING_REGISTER_SRC_REGISTERED = 1,
	IORING_REGISTER_SRC_REGISTERED	= (1U << 0),
	IORING_REGISTER_DST_REPLACE	= (1U << 1),
};

struct io_uring_clone_buffers {
	__u32	src_fd;
	__u32	flags;
	__u32	pad[6];
	__u32	src_off;
	__u32	dst_off;
	__u32	nr;
	__u32	pad[3];
};

struct io_uring_buf {
@@ -768,12 +814,40 @@ struct io_uring_buf_status {
	__u32	resv[8];
};

/* Values for the opcode field of struct io_uring_napi */
enum io_uring_napi_op {
	/* register/unregister backward compatible opcode */
	IO_URING_NAPI_REGISTER_OP = 0,

	/* opcodes to update napi_list when static tracking is used */
	IO_URING_NAPI_STATIC_ADD_ID = 1,
	IO_URING_NAPI_STATIC_DEL_ID = 2
};

/*
 * NAPI tracking strategies. A value from this enum is carried in the
 * op_param field of struct io_uring_napi when the opcode is
 * IO_URING_NAPI_REGISTER_OP.
 */
enum io_uring_napi_tracking_strategy {
	/* value must be 0 for backward compatibility */
	IO_URING_NAPI_TRACKING_DYNAMIC = 0,
	IO_URING_NAPI_TRACKING_STATIC = 1,
	IO_URING_NAPI_TRACKING_INACTIVE = 255
};

/* argument for IORING_(UN)REGISTER_NAPI */
struct io_uring_napi {
	__u32	busy_poll_to;
	__u8	prefer_busy_poll;
	__u8	pad[3];
	__u64	resv;

	/* a io_uring_napi_op value */
	__u8	opcode;
	__u8	pad[2];

	/*
	 * for IO_URING_NAPI_REGISTER_OP, it is a
	 * io_uring_napi_tracking_strategy value.
	 *
	 * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID
	 * it is the napi id to add/del from napi_list.
	 */
	__u32	op_param;
	__u32	resv;
};

/*
@@ -795,6 +869,43 @@ enum io_uring_register_restriction_op {
	IORING_RESTRICTION_LAST
};

/* Flag bits; presumably for the flags field of struct io_uring_reg_wait */
enum {
	/* NOTE(review): looks like this marks the ts field as in use; confirm */
	IORING_REG_WAIT_TS		= (1U << 0),
};

/*
 * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
 * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
 * called rather than pass in a wait argument structure separately.
 *
 * NOTE(review): IORING_REGISTER_CQWAIT_REG is not among the register
 * opcodes visible in this listing; verify that it still exists before
 * relying on this structure.
 */
struct io_uring_cqwait_reg_arg {
	__u32		flags;
	/* presumably sizeof(struct io_uring_reg_wait) as seen by the caller; confirm */
	__u32		struct_size;
	/* presumably the number of io_uring_reg_wait entries at user_addr; confirm */
	__u32		nr_entries;
	__u32		pad;
	/* presumably the user address of the region being registered; confirm */
	__u64		user_addr;
	__u64		pad2[3];
};

/*
 * Argument for io_uring_enter(2) with
 * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
 * is an index into a previously registered fixed wait region described by
 * the below structure.
 *
 * Field order and sizes are part of the userspace ABI and must not change.
 */
struct io_uring_reg_wait {
	/* wait timeout; presumably honoured only with IORING_REG_WAIT_TS set (confirm) */
	struct __kernel_timespec	ts;
	/* minimum wait time, in microseconds going by the field name */
	__u32				min_wait_usec;
	/* presumably IORING_REG_WAIT_* bits; confirm */
	__u32				flags;
	/* signal mask and its size; same field names as io_uring_getevents_arg */
	__u64				sigmask;
	__u32				sigmask_sz;
	/* padding / reserved */
	__u32				pad[3];
	__u64				pad2[2];
};

/*
 * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
 */
struct io_uring_getevents_arg {
	__u64	sigmask;
	__u32	sigmask_sz;
+6 −14
Original line number Diff line number Diff line
@@ -205,7 +205,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
		.opcode	= cancel->opcode,
		.seq	= atomic_inc_return(&req->ctx->cancel_seq),
	};
	struct io_uring_task *tctx = req->task->io_uring;
	struct io_uring_task *tctx = req->tctx;
	int ret;

	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
@@ -232,16 +232,6 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
	return IOU_OK;
}

/*
 * Prepare the first @size buckets of @table for use: initialise each
 * bucket's spinlock and reset its list head to empty.
 */
void init_hash_table(struct io_hash_table *table, unsigned size)
{
	unsigned int bucket = 0;

	while (bucket < size) {
		spin_lock_init(&table->hbs[bucket].lock);
		INIT_HLIST_HEAD(&table->hbs[bucket].list);
		bucket++;
	}
}

static int __io_sync_cancel(struct io_uring_task *tctx,
			    struct io_cancel_data *cd, int fd)
{
@@ -250,10 +240,12 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
	/* fixed must be grabbed every time since we drop the uring_lock */
	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
		if (unlikely(fd >= ctx->nr_user_files))
		struct io_rsrc_node *node;

		node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
		if (unlikely(!node))
			return -EBADF;
		fd = array_index_nospec(fd, ctx->nr_user_files);
		cd->file = io_file_from_index(&ctx->file_table, fd);
		cd->file = io_slot_file(node);
		if (!cd->file)
			return -EBADF;
	}
Loading