Commit f5d4feed authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:

 - Clean up the IORING_SETUP_R_DISABLED and submitter task checking,
   mostly just in preparation for relaxing the locking for SINGLE_ISSUER
   in the future.

 - Improve IOPOLL by using a doubly linked list to manage completions.

   Previously it was singly linked, which meant that to complete request
   N in the chain, requests 0..N-1 had to have completed first. With a
   doubly linked list we can complete requests in whatever order they
   finish, rather than needing to wait for a consecutive range to be
   available. This reduces latencies.

 - Improve the restriction setup and checking. Mostly in preparation for
   adding further features on top of that. Those features will come in a
   separate pull request.

 - Split out task_work and wait handling into separate files. These are
   mostly nicely abstracted already, but still remained in the
   io_uring.c file which is on the larger side.

 - Use GFP_KERNEL_ACCOUNT in a few more spots, where appropriate.

 - Ensure even the idle io-wq worker exits if a task no longer has any
   rings open.

 - Add support for a non-circular submission queue.

   By default, the SQ ring keeps moving around, even if only a few
   entries are used for each submission. This can be wasteful in terms
   of cachelines.

   If IORING_SETUP_SQ_REWIND is set for the ring when created, each
   submission will start at offset 0 instead of where we last left off
   doing submissions.

 - Various little cleanups

* tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (30 commits)
  io_uring/kbuf: fix memory leak if io_buffer_add_list fails
  io_uring: Add SPDX id lines to remaining source files
  io_uring: allow io-wq workers to exit when unused
  io_uring/io-wq: add exit-on-idle state
  io_uring/net: don't continue send bundle if poll was required for retry
  io_uring/rsrc: use GFP_KERNEL_ACCOUNT consistently
  io_uring/futex: use GFP_KERNEL_ACCOUNT for futex data allocation
  io_uring/io-wq: handle !sysctl_hung_task_timeout_secs
  io_uring: fix bad indentation for setup flags if statement
  io_uring/rsrc: take unsigned index in io_rsrc_node_lookup()
  io_uring: introduce non-circular SQ
  io_uring: split out CQ waiting code into wait.c
  io_uring: split out task work code into tw.c
  io_uring/io-wq: don't trigger hung task for syzbot craziness
  io_uring: add IO_URING_EXIT_WAIT_MAX definition
  io_uring/sync: validate passed in offset
  io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member
  io_uring/timeout: annotate data race in io_flush_timeouts()
  io_uring/uring_cmd: explicitly disallow cancelations for IOPOLL
  io_uring: fix IOPOLL with passthrough I/O
  ...
parents 26c9342b 442ae406
Loading
Loading
Loading
Loading
+19 −10
Original line number Diff line number Diff line
@@ -224,7 +224,10 @@ struct io_restriction {
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
	/* IORING_OP_* restrictions exist */
	bool op_registered;
	/* IORING_REGISTER_* restrictions exist */
	bool reg_registered;
};

struct io_submit_link {
@@ -259,7 +262,8 @@ struct io_ring_ctx {
	struct {
		unsigned int		flags;
		unsigned int		drain_next: 1;
		unsigned int		restricted: 1;
		unsigned int		op_restricted: 1;
		unsigned int		reg_restricted: 1;
		unsigned int		off_timeout_used: 1;
		unsigned int		drain_active: 1;
		unsigned int		has_evfd: 1;
@@ -316,7 +320,7 @@ struct io_ring_ctx {
		 * manipulate the list, hence no extra locking is needed there.
		 */
		bool			poll_multi_queue;
		struct io_wq_work_list	iopoll_list;
		struct list_head	iopoll_list;

		struct io_file_table	file_table;
		struct io_rsrc_data	buf_table;
@@ -444,6 +448,9 @@ struct io_ring_ctx {
	struct list_head		defer_list;
	unsigned			nr_drained;

	/* protected by ->completion_lock */
	unsigned			nr_req_allocated;

#ifdef CONFIG_NET_RX_BUSY_POLL
	struct list_head	napi_list;	/* track busy poll napi_id */
	spinlock_t		napi_lock;	/* napi_list lock */
@@ -456,10 +463,6 @@ struct io_ring_ctx {
	DECLARE_HASHTABLE(napi_ht, 4);
#endif

	/* protected by ->completion_lock */
	unsigned			evfd_last_cq_tail;
	unsigned			nr_req_allocated;

	/*
	 * Protection for resize vs mmap races - both the mmap and resize
	 * side will need to grab this lock, to prevent either side from
@@ -714,15 +717,21 @@ struct io_kiocb {

	atomic_t			refs;
	bool				cancel_seq_set;

	union {
		struct io_task_work	io_task_work;
		/* For IOPOLL setup queues, with hybrid polling */
		u64                     iopoll_start;
	};

	union {
		/*
		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
		 * poll
		 */
		struct hlist_node	hash_node;
		/* For IOPOLL setup queues, with hybrid polling */
		u64                     iopoll_start;
		/* IOPOLL completion handling */
		struct list_head	iopoll_node;
		/* for private io_kiocb freeing */
		struct rcu_head		rcu_head;
	};
+12 −0
Original line number Diff line number Diff line
@@ -237,6 +237,18 @@ enum io_uring_sqe_flags_bit {
 */
#define IORING_SETUP_SQE_MIXED		(1U << 19)

/*
 * When set, io_uring ignores the SQ head and tail and fetches SQEs to submit
 * starting from index 0 instead of from the index stored in the head pointer.
 * IOW, the user should place all SQEs at the beginning of the SQ memory
 * before issuing a submission syscall.
 *
 * It requires IORING_SETUP_NO_SQARRAY and is incompatible with
 * IORING_SETUP_SQPOLL. The user must also never change the SQ head and tail
 * values and must keep them set to 0. Any other value is undefined behaviour.
 */
#define IORING_SETUP_SQ_REWIND		(1U << 20)

enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
+8 −6
Original line number Diff line number Diff line
@@ -8,12 +8,14 @@ endif

obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
					tctx.o filetable.o rw.o poll.o \
					eventfd.o uring_cmd.o openclose.o \
					sqpoll.o xattr.o nop.o fs.o splice.o \
					sync.o msg_ring.o advise.o openclose.o \
					statx.o timeout.o cancel.o \
					waitid.o register.o truncate.o \
					memmap.o alloc_cache.o query.o
					tw.o wait.o eventfd.o uring_cmd.o \
					openclose.o sqpoll.o xattr.o nop.o \
					fs.o splice.o sync.o msg_ring.o \
					advise.o openclose.o statx.o timeout.o \
					cancel.o waitid.o register.o \
					truncate.o memmap.o alloc_cache.o \
					query.o

obj-$(CONFIG_IO_URING_ZCRX)	+= zcrx.o
obj-$(CONFIG_IO_WQ)		+= io-wq.o
obj-$(CONFIG_FUTEX)		+= futex.o
+2 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOU_ALLOC_CACHE_H
#define IOU_ALLOC_CACHE_H

#include <linux/io_uring_types.h>
#include <linux/kasan.h>

/*
 * Don't allow the cache to grow beyond this size.
+2 −3
Original line number Diff line number Diff line
@@ -2,10 +2,8 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>

@@ -21,6 +19,7 @@
#include "waitid.h"
#include "futex.h"
#include "cancel.h"
#include "wait.h"

struct io_cancel {
	struct file			*file;
@@ -539,7 +538,7 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
	/* SQPOLL thread does its own polling */
	if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
	    is_sqpoll_thread) {
		while (!wq_list_empty(&ctx->iopoll_list)) {
		while (!list_empty(&ctx->iopoll_list)) {
			io_iopoll_try_reap_events(ctx);
			ret = true;
			cond_resched();
Loading