Commit d2c84bdc authored by Linus Torvalds
Browse files

Merge tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - Make running of task_work internal loops more fair, and unify how the
   different methods deal with them (me)

 - Support for per-ring NAPI. The two minor networking patches are in a
   shared branch with netdev (Stefan)

 - Add support for truncate (Tony)

 - Export SQPOLL utilization stats (Xiaobing)

 - Multishot fixes (Pavel)

 - Fix for a race in manipulating the request flags via poll (Pavel)

 - Cleanup the multishot checking by making it generic, moving it out of
   opcode handlers (Pavel)

 - Various tweaks and cleanups (me, Kunwu, Alexander)

* tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux: (53 commits)
  io_uring: Fix sqpoll utilization check racing with dying sqpoll
  io_uring/net: dedup io_recv_finish req completion
  io_uring: refactor DEFER_TASKRUN multishot checks
  io_uring: fix mshot io-wq checks
  io_uring/net: add io_req_msg_cleanup() helper
  io_uring/net: simplify msghd->msg_inq checking
  io_uring/kbuf: rename REQ_F_PARTIAL_IO to REQ_F_BL_NO_RECYCLE
  io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io
  io_uring/net: correctly handle multishot recvmsg retry setup
  io_uring/net: clear REQ_F_BL_EMPTY in the multishot retry handler
  io_uring: fix io_queue_proc modifying req->flags
  io_uring: fix mshot read defer taskrun cqe posting
  io_uring/net: fix overflow check in io_recvmsg_mshot_prep()
  io_uring/net: correct the type of variable
  io_uring/sqpoll: statistics of the true utilization of sq threads
  io_uring/net: move recv/recvmsg flags out of retry loop
  io_uring/kbuf: flag request if buffer pool is empty after buffer pick
  io_uring/net: improve the usercopy for sendmsg/recvmsg
  io_uring/net: move receive multishot out of the generic msghdr path
  io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
  ...
parents 0f1a8766 606559dc
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);

long do_ftruncate(struct file *file, loff_t length, int small);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
+28 −25
Original line number Diff line number Diff line
@@ -154,49 +154,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif

/*
 * do_ftruncate - truncate an already-resolved open file to @length bytes.
 * @file:   open file to operate on; this function neither takes nor drops
 *          a reference on it
 * @length: requested size in bytes; not checked for negative values here,
 *          do_sys_ftruncate() rejects those before calling
 * @small:  non-zero when the caller has no large-file support; cleared
 *          below if the file was opened with O_LARGEFILE
 *
 * Returns -EINVAL if the file is not a regular file opened for writing or
 * if a "small" caller asks for more than MAX_NON_LFS bytes, -EPERM if the
 * inode is append-only, otherwise the result of security_file_truncate()
 * or, when that succeeds, of do_truncate().
 */
long do_ftruncate(struct file *file, loff_t length, int small)
{
	struct inode *inode;
	struct dentry *dentry;
	int error;

	/* explicitly opened as large or we are on 64-bit box */
	if (file->f_flags & O_LARGEFILE)
		small = 0;

	dentry = file->f_path.dentry;
	inode = dentry->d_inode;
	if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
		return -EINVAL;

	/* Cannot ftruncate over 2^31 bytes without large file support */
	if (small && length > MAX_NON_LFS)
		return -EINVAL;

	/* Check IS_APPEND on real upper inode */
	if (IS_APPEND(file_inode(file)))
		return -EPERM;

	/* The truncate itself runs between sb_start_write()/sb_end_write(). */
	sb_start_write(inode->i_sb);
	error = security_file_truncate(file);
	if (!error)
		error = do_truncate(file_mnt_idmap(file), dentry, length,
				    ATTR_MTIME | ATTR_CTIME, file);
	sb_end_write(inode->i_sb);

	return error;
}

/*
 * do_sys_ftruncate - fd-based entry point for ftruncate.
 * @fd:     file descriptor to truncate
 * @length: requested size in bytes
 * @small:  passed through unchanged to do_ftruncate()
 *
 * Returns -EINVAL for a negative @length, -EBADF if @fd does not resolve
 * to a file, otherwise whatever do_ftruncate() returns.
 */
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
	struct fd f;
	int error;

	/* Validate the length before touching the descriptor table. */
	if (length < 0)
		return -EINVAL;
	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	error = do_ftruncate(f.file, length, small);

	/* Pair the successful fdget() above. */
	fdput(f);
	return error;
}

+84 −53
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
#define IO_URING_TYPES_H

#include <linux/blkdev.h>
#include <linux/hashtable.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
@@ -240,12 +241,14 @@ struct io_ring_ctx {
		unsigned int		poll_activated: 1;
		unsigned int		drain_disabled: 1;
		unsigned int		compat: 1;
		unsigned int		iowq_limits_set : 1;

		struct task_struct	*submitter_task;
		struct io_rings		*rings;
		struct percpu_ref	refs;

		enum task_work_notify_mode	notify_method;
		unsigned			sq_thread_idle;
	} ____cacheline_aligned_in_smp;

	/* submission data */
@@ -274,10 +277,20 @@ struct io_ring_ctx {
		 */
		struct io_rsrc_node	*rsrc_node;
		atomic_t		cancel_seq;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		bool			poll_multi_queue;
		struct io_wq_work_list	iopoll_list;

		struct io_file_table	file_table;
		struct io_mapped_ubuf	**user_bufs;
		unsigned		nr_user_files;
		unsigned		nr_user_bufs;
		struct io_mapped_ubuf	**user_bufs;

		struct io_submit_state	submit_state;

@@ -288,15 +301,6 @@ struct io_ring_ctx {
		struct io_alloc_cache	apoll_cache;
		struct io_alloc_cache	netmsg_cache;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct io_wq_work_list	iopoll_list;
		bool			poll_multi_queue;

		/*
		 * Any cancelable uring_cmd is added to this list in
		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
@@ -343,8 +347,8 @@ struct io_ring_ctx {
	spinlock_t		completion_lock;

	/* IRQ completion list, under ->completion_lock */
	struct io_wq_work_list	locked_free_list;
	unsigned int		locked_free_nr;
	struct io_wq_work_list	locked_free_list;

	struct list_head	io_buffers_comp;
	struct list_head	cq_overflow_list;
@@ -366,9 +370,6 @@ struct io_ring_ctx {
	unsigned int		file_alloc_start;
	unsigned int		file_alloc_end;

	struct xarray		personalities;
	u32			pers_next;

	struct list_head	io_buffers_cache;

	/* deferred free list, protected by ->uring_lock */
@@ -389,6 +390,9 @@ struct io_ring_ctx {
	struct wait_queue_head		rsrc_quiesce_wq;
	unsigned			rsrc_quiesce;

	u32			pers_next;
	struct xarray		personalities;

	/* hashed buffered write serialization */
	struct io_wq_hash		*hash_map;

@@ -405,11 +409,22 @@ struct io_ring_ctx {

	/* io-wq management, e.g. thread count */
	u32				iowq_limits[2];
	bool				iowq_limits_set;

	struct callback_head		poll_wq_task_work;
	struct list_head		defer_list;
	unsigned			sq_thread_idle;

#ifdef CONFIG_NET_RX_BUSY_POLL
	struct list_head	napi_list;	/* track busy poll napi_id */
	spinlock_t		napi_lock;	/* napi_list lock */

	/* napi busy poll default timeout */
	unsigned int		napi_busy_poll_to;
	bool			napi_prefer_busy_poll;
	bool			napi_enabled;

	DECLARE_HASHTABLE(napi_ht, 4);
#endif

	/* protected by ->completion_lock */
	unsigned			evfd_last_cq_tail;

@@ -455,7 +470,6 @@ enum {
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_PARTIAL_IO_BIT,
	REQ_F_APOLL_MULTISHOT_BIT,
	REQ_F_CLEAR_POLLIN_BIT,
	REQ_F_HASH_LOCKED_BIT,
@@ -463,75 +477,88 @@ enum {
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,
	REQ_F_POLL_NO_LAZY_BIT,
	REQ_F_CANCEL_SEQ_BIT,
	REQ_F_CAN_POLL_BIT,
	REQ_F_BL_EMPTY_BIT,
	REQ_F_BL_NO_RECYCLE_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};

/*
 * Request flags are carried in a dedicated 64-bit type. IO_REQ_FLAG()
 * converts a REQ_F_*_BIT index into the matching single-bit mask of that
 * type via BIT_ULL().
 */
typedef u64 __bitwise io_req_flags_t;
#define IO_REQ_FLAG(bitno)	((__force io_req_flags_t) BIT_ULL((bitno)))

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	REQ_F_FIXED_FILE	= IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	REQ_F_IO_DRAIN		= IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
	REQ_F_LINK		= IO_REQ_FLAG(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
	REQ_F_HARDLINK		= IO_REQ_FLAG(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
	REQ_F_FORCE_ASYNC	= IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
	REQ_F_BUFFER_SELECT	= IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
	/* IOSQE_CQE_SKIP_SUCCESS */
	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
	REQ_F_CQE_SKIP		= IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),

	/* fail rest of links */
	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
	REQ_F_FAIL		= IO_REQ_FLAG(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
	REQ_F_INFLIGHT		= IO_REQ_FLAG(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
	REQ_F_CUR_POS		= IO_REQ_FLAG(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
	REQ_F_NOWAIT		= IO_REQ_FLAG(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
	REQ_F_LINK_TIMEOUT	= IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	REQ_F_NEED_CLEANUP	= IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
	REQ_F_POLLED		= IO_REQ_FLAG(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
	REQ_F_BUFFER_SELECTED	= IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
	/* buffer selected from ring, needs commit */
	REQ_F_BUFFER_RING	= BIT(REQ_F_BUFFER_RING_BIT),
	REQ_F_BUFFER_RING	= IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
	REQ_F_REISSUE		= IO_REQ_FLAG(REQ_F_REISSUE_BIT),
	/* supports async reads/writes */
	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
	REQ_F_SUPPORT_NOWAIT	= IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT),
	/* regular file */
	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
	REQ_F_ISREG		= IO_REQ_FLAG(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
	REQ_F_CREDS		= IO_REQ_FLAG(REQ_F_CREDS_BIT),
	/* skip refcounting if not set */
	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
	REQ_F_REFCOUNT		= IO_REQ_FLAG(REQ_F_REFCOUNT_BIT),
	/* there is a linked timeout that has to be armed */
	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
	REQ_F_ARM_LTIMEOUT	= IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT),
	/* ->async_data allocated */
	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
	REQ_F_ASYNC_DATA	= IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT),
	/* don't post CQEs while failing linked requests */
	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
	REQ_F_SKIP_LINK_CQES	= IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT),
	/* single poll may be active */
	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
	REQ_F_SINGLE_POLL	= IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
	/* double poll may active */
	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
	/* request has already done partial IO */
	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
	REQ_F_DOUBLE_POLL	= IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
	/* fast poll multishot mode */
	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT),
	REQ_F_APOLL_MULTISHOT	= IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
	/* recvmsg special flag, clear EPOLLIN */
	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
	REQ_F_CLEAR_POLLIN	= IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
	REQ_F_HASH_LOCKED	= IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
	/* don't use lazy poll wake for this request */
	REQ_F_POLL_NO_LAZY	= BIT(REQ_F_POLL_NO_LAZY_BIT),
	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
	/* cancel sequence is set and valid */
	REQ_F_CANCEL_SEQ	= IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT),
	/* file is pollable */
	REQ_F_CAN_POLL		= IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
	/* buffer list was empty after selection of buffer */
	REQ_F_BL_EMPTY		= IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
	/* don't recycle provided buffers for this request */
	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -592,15 +619,17 @@ struct io_kiocb {
	 * and after selection it points to the buffer ID itself.
	 */
	u16				buf_index;
	unsigned int			flags;

	unsigned			nr_tw;

	/* REQ_F_* flags */
	io_req_flags_t			flags;

	struct io_cqe			cqe;

	struct io_ring_ctx		*ctx;
	struct task_struct		*task;

	struct io_rsrc_node		*rsrc_node;

	union {
		/* store used ubuf, so we can prevent reloading */
		struct io_mapped_ubuf	*imu;
@@ -621,10 +650,12 @@ struct io_kiocb {
		/* cache ->apoll->events */
		__poll_t apoll_events;
	};

	struct io_rsrc_node		*rsrc_node;

	atomic_t			refs;
	atomic_t			poll_refs;
	struct io_task_work		io_task_work;
	unsigned			nr_tw;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	/* internal polling, see IORING_FEAT_FAST_POLL */
+4 −0
Original line number Diff line number Diff line
@@ -48,6 +48,10 @@ void napi_busy_loop(unsigned int napi_id,
		    bool (*loop_end)(void *, unsigned long),
		    void *loop_end_arg, bool prefer_busy_poll, u16 budget);

void napi_busy_loop_rcu(unsigned int napi_id,
			bool (*loop_end)(void *, unsigned long),
			void *loop_end_arg, bool prefer_busy_poll, u16 budget);

#else /* CONFIG_NET_RX_BUSY_POLL */
static inline unsigned long net_busy_loop_on(void)
{
+13 −17
Original line number Diff line number Diff line
@@ -148,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work,
		__field(  void *,			req		)
		__field(  u64,				user_data	)
		__field(  u8,				opcode		)
		__field(  unsigned int,			flags		)
		__field(  unsigned long long,		flags		)
		__field(  struct io_wq_work *,		work		)
		__field(  int,				rw		)

@@ -159,7 +159,7 @@ TRACE_EVENT(io_uring_queue_async_work,
		__entry->ctx		= req->ctx;
		__entry->req		= req;
		__entry->user_data	= req->cqe.user_data;
		__entry->flags		= req->flags;
		__entry->flags		= (__force unsigned long long) req->flags;
		__entry->opcode		= req->opcode;
		__entry->work		= &req->work;
		__entry->rw		= rw;
@@ -167,10 +167,10 @@ TRACE_EVENT(io_uring_queue_async_work,
		__assign_str(op_str, io_uring_get_opcode(req->opcode));
	),

	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
		__entry->ctx, __entry->req, __entry->user_data,
		__get_str(op_str),
		__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
		__get_str(op_str), __entry->flags,
		__entry->rw ? "hashed" : "normal", __entry->work)
);

/**
@@ -378,7 +378,7 @@ TRACE_EVENT(io_uring_submit_req,
		__field(  void *,		req		)
		__field(  unsigned long long,	user_data	)
		__field(  u8,			opcode		)
		__field(  u32,			flags		)
		__field(  unsigned long long,	flags		)
		__field(  bool,			sq_thread	)

		__string( op_str, io_uring_get_opcode(req->opcode) )
@@ -389,16 +389,16 @@ TRACE_EVENT(io_uring_submit_req,
		__entry->req		= req;
		__entry->user_data	= req->cqe.user_data;
		__entry->opcode		= req->opcode;
		__entry->flags		= req->flags;
		__entry->flags		= (__force unsigned long long) req->flags;
		__entry->sq_thread	= req->ctx->flags & IORING_SETUP_SQPOLL;

		__assign_str(op_str, io_uring_get_opcode(req->opcode));
	),

	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, "
		  "sq_thread %d", __entry->ctx, __entry->req,
		  __entry->user_data, __get_str(op_str),
		  __entry->flags, __entry->sq_thread)
		  __entry->user_data, __get_str(op_str), __entry->flags,
		  __entry->sq_thread)
);

/*
@@ -602,29 +602,25 @@ TRACE_EVENT(io_uring_cqe_overflow,
 *
 * @tctx:		pointer to a io_uring_task
 * @count:		how many functions it ran
 * @loops:		how many loops it ran
 *
 */
TRACE_EVENT(io_uring_task_work_run,

	TP_PROTO(void *tctx, unsigned int count),

	TP_ARGS(tctx, count),

	/* Recorded fields: the tctx pointer and the number of functions run. */
	TP_STRUCT__entry (
		__field(  void *,		tctx		)
		__field(  unsigned int,		count		)
	),

	TP_fast_assign(
		__entry->tctx		= tctx;
		__entry->count		= count;
	),

	TP_printk("tctx %p, count %u", __entry->tctx, __entry->count)
);

TRACE_EVENT(io_uring_short_write,
Loading