Commit 23acda7c authored by Linus Torvalds
Browse files

Merge tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:

 - Add a callback driven main loop for io_uring, and BPF struct_ops
   on top to allow implementing custom event loop logic

 - Decouple IOPOLL from being a ring-wide all-or-nothing setting,
   allowing IOPOLL use cases to also issue certain whitelisted
   non-polled opcodes

 - Timeout improvements. Migrate internal timeout storage from
   timespec64 to ktime_t for simpler arithmetic and to avoid copying
   timespec data

 - Zero-copy receive (zcrx) updates:

      - Add a device-less mode (ZCRX_REG_NODEV) for testing and
        experimentation where data flows through the copy fallback path

      - Fix two-step unregistration regression, DMA length calculations,
        xarray mark usage, and a potential 32-bit overflow in id
        shifting

      - Refactoring toward multi-area support: dedicated refill queue
        struct, consolidated DMA syncing, netmem array refilling format,
        and guard-based locking

 - Zero-copy transmit (zctx) cleanup:

      - Unify io_send_zc() and io_sendmsg_zc() into a single function

      - Add vectorized registered buffer send for IORING_OP_SEND_ZC

      - Add separate notification user_data via sqe->addr3 so
        notification and completion CQEs can be distinguished without
        extra reference counting

 - Switch struct io_ring_ctx internal bitfields to explicit flag bits
   with atomic-safe accessors, and annotate the known harmless races on
   those flags

 - Various optimizations caching ctx and other request fields in local
   variables to avoid repeated loads, and cleanups for tctx setup, ring
   fd registration, and read path early returns

* tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (58 commits)
  io_uring: unify getting ctx from passed in file descriptor
  io_uring/register: don't get a reference to the registered ring fd
  io_uring/tctx: clean up __io_uring_add_tctx_node() error handling
  io_uring/tctx: have io_uring_alloc_task_context() return tctx
  io_uring/timeout: use 'ctx' consistently
  io_uring/rw: clean up __io_read() obsolete comment and early returns
  io_uring/zcrx: use correct mmap off constants
  io_uring/zcrx: use dma_len for chunk size calculation
  io_uring/zcrx: don't clear not allocated niovs
  io_uring/zcrx: don't use mark0 for allocating xarray
  io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
  io_uring/zcrx: reject REG_NODEV with large rx_buf_size
  io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
  io_uring/rsrc: use io_cache_free() to free node
  io_uring/zcrx: rename zcrx [un]register functions
  io_uring/zcrx: check ctrl op payload struct sizes
  io_uring/zcrx: cache fallback availability in zcrx ctx
  io_uring/zcrx: warn on a repeated area append
  io_uring/zcrx: consolidate dma syncing
  io_uring/zcrx: netmem array as refiling format
  ...
parents 7fe6ac15 c5e9f6a9
Loading
Loading
Loading
Loading
+0 −4
Original line number Diff line number Diff line
@@ -786,10 +786,6 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
	struct nvme_ctrl *ctrl = ioucmd->file->private_data;
	int ret;

	/* IOPOLL not supported yet */
	if (issue_flags & IO_URING_F_IOPOLL)
		return -EOPNOTSUPP;

	ret = nvme_uring_cmd_checks(issue_flags);
	if (ret)
		return ret;
+33 −14
Original line number Diff line number Diff line
@@ -8,6 +8,9 @@
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>

struct iou_loop_params;
struct io_uring_bpf_ops;

enum {
	/*
	 * A hint to not wake right away but delay until there are enough of
@@ -41,6 +44,8 @@ enum io_uring_cmd_flags {
	IO_URING_F_COMPAT		= (1 << 12),
};

struct iou_loop_params;

struct io_wq_work_node {
	struct io_wq_work_node *next;
};
@@ -268,24 +273,30 @@ struct io_alloc_cache {
	unsigned int		init_clear;
};

/*
 * Internal ring state flags, kept in io_ring_ctx::int_flags. Every constant
 * is a distinct single bit (BIT(0) through BIT(12)), so any combination of
 * them can be OR-ed into that one unsigned int.
 */
enum {
	IO_RING_F_DRAIN_NEXT		= BIT(0),
	IO_RING_F_OP_RESTRICTED		= BIT(1),
	IO_RING_F_REG_RESTRICTED	= BIT(2),
	IO_RING_F_OFF_TIMEOUT_USED	= BIT(3),
	IO_RING_F_DRAIN_ACTIVE		= BIT(4),
	IO_RING_F_HAS_EVFD		= BIT(5),
	/* all CQEs should be posted only by the submitter task */
	IO_RING_F_TASK_COMPLETE		= BIT(6),
	IO_RING_F_LOCKLESS_CQ		= BIT(7),
	IO_RING_F_SYSCALL_IOPOLL	= BIT(8),
	IO_RING_F_POLL_ACTIVATED	= BIT(9),
	IO_RING_F_DRAIN_DISABLED	= BIT(10),
	IO_RING_F_COMPAT		= BIT(11),
	IO_RING_F_IOWQ_LIMITS_SET	= BIT(12),
};

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		/* ring setup flags */
		unsigned int		flags;
		unsigned int		drain_next: 1;
		unsigned int		op_restricted: 1;
		unsigned int		reg_restricted: 1;
		unsigned int		off_timeout_used: 1;
		unsigned int		drain_active: 1;
		unsigned int		has_evfd: 1;
		/* all CQEs should be posted only by the submitter task */
		unsigned int		task_complete: 1;
		unsigned int		lockless_cq: 1;
		unsigned int		syscall_iopoll: 1;
		unsigned int		poll_activated: 1;
		unsigned int		drain_disabled: 1;
		unsigned int		compat: 1;
		unsigned int		iowq_limits_set : 1;
		/* internal state flags (IO_RING_F_*), mostly read-only */
		unsigned int		int_flags;

		struct task_struct	*submitter_task;
		struct io_rings		*rings;
@@ -355,6 +366,9 @@ struct io_ring_ctx {
		struct io_alloc_cache	rw_cache;
		struct io_alloc_cache	cmd_cache;

		int (*loop_step)(struct io_ring_ctx *ctx,
				 struct iou_loop_params *);

		/*
		 * Any cancelable uring_cmd is added to this list in
		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
@@ -477,6 +491,8 @@ struct io_ring_ctx {
	DECLARE_HASHTABLE(napi_ht, 4);
#endif

	struct io_uring_bpf_ops		*bpf_ops;

	/*
	 * Protection for resize vs mmap races - both the mmap and resize
	 * side will need to grab this lock, to prevent either side from
@@ -545,6 +561,7 @@ enum {
	REQ_F_HAS_METADATA_BIT,
	REQ_F_IMPORT_BUFFER_BIT,
	REQ_F_SQE_COPIED_BIT,
	REQ_F_IOPOLL_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
@@ -638,6 +655,8 @@ enum {
	REQ_F_IMPORT_BUFFER	= IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
	/* ->sqe_copy() has been called, if necessary */
	REQ_F_SQE_COPIED	= IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
	/* request must be iopolled to completion (set in ->issue()) */
	REQ_F_IOPOLL		= IO_REQ_FLAG(REQ_F_IOPOLL_BIT),
};

struct io_tw_req {
+7 −94
Original line number Diff line number Diff line
@@ -10,6 +10,8 @@

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/io_uring/zcrx.h>

/*
 * this file is shared with liburing and that has to autodetect
 * if linux/time_types.h is available or not, it can
@@ -341,6 +343,10 @@ enum io_uring_op {

/*
 * sqe->timeout_flags
 *
 * IORING_TIMEOUT_IMMEDIATE_ARG:	If set, sqe->addr stores the timeout
 *					value in nanoseconds instead of
 *					pointing to a timespec.
 */
#define IORING_TIMEOUT_ABS		(1U << 0)
#define IORING_TIMEOUT_UPDATE		(1U << 1)
@@ -349,6 +355,7 @@ enum io_uring_op {
#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
#define IORING_TIMEOUT_MULTISHOT	(1U << 6)
#define IORING_TIMEOUT_IMMEDIATE_ARG	(1U << 7)
#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
@@ -1050,100 +1057,6 @@ struct io_timespec {
	__u64		tv_nsec;
};

/* Zero copy receive refill queue entry */
struct io_uring_zcrx_rqe {
	__u64	off;
	__u32	len;
	__u32	__pad;
};

struct io_uring_zcrx_cqe {
	__u64	off;
	__u64	__pad;
};

/* The bit from which area id is encoded into offsets */
#define IORING_ZCRX_AREA_SHIFT	48
#define IORING_ZCRX_AREA_MASK	(~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))

struct io_uring_zcrx_offsets {
	__u32	head;
	__u32	tail;
	__u32	rqes;
	__u32	__resv2;
	__u64	__resv[2];
};

enum io_uring_zcrx_area_flags {
	IORING_ZCRX_AREA_DMABUF		= 1,
};

struct io_uring_zcrx_area_reg {
	__u64	addr;
	__u64	len;
	__u64	rq_area_token;
	__u32	flags;
	__u32	dmabuf_fd;
	__u64	__resv2[2];
};

enum zcrx_reg_flags {
	ZCRX_REG_IMPORT	= 1,
};

enum zcrx_features {
	/*
	 * The user can ask for the desired rx page size by passing the
	 * value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
	 */
	ZCRX_FEATURE_RX_PAGE_SIZE	= 1 << 0,
};

/*
 * Argument for IORING_REGISTER_ZCRX_IFQ
 */
struct io_uring_zcrx_ifq_reg {
	__u32	if_idx;
	__u32	if_rxq;
	__u32	rq_entries;
	__u32	flags;

	__u64	area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
	__u64	region_ptr; /* struct io_uring_region_desc * */

	struct io_uring_zcrx_offsets offsets;
	__u32	zcrx_id;
	__u32	rx_buf_len;
	__u64	__resv[3];
};

enum zcrx_ctrl_op {
	ZCRX_CTRL_FLUSH_RQ,
	ZCRX_CTRL_EXPORT,

	__ZCRX_CTRL_LAST,
};

struct zcrx_ctrl_flush_rq {
	__u64		__resv[6];
};

struct zcrx_ctrl_export {
	__u32		zcrx_fd;
	__u32 		__resv1[11];
};

struct zcrx_ctrl {
	__u32	zcrx_id;
	__u32	op; /* see enum zcrx_ctrl_op */
	__u64	__resv[2];

	union {
		struct zcrx_ctrl_export		zc_export;
		struct zcrx_ctrl_flush_rq	zc_flush;
	};
};

#ifdef __cplusplus
}
#endif
+115 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
 * Header file for the io_uring zerocopy receive (zcrx) interface.
 *
 * Copyright (C) 2026 Pavel Begunkov
 * Copyright (C) 2026 David Wei
 * Copyright (C) Meta Platforms, Inc.
 */
#ifndef LINUX_IO_ZCRX_H
#define LINUX_IO_ZCRX_H

#include <linux/types.h>

/*
 * Zero copy receive refill queue entry.
 *
 * 16 bytes: a 64-bit offset, a 32-bit length and 32 bits of explicit padding.
 * NOTE(review): 'off' presumably follows the area-id encoding described by
 * IORING_ZCRX_AREA_SHIFT / IORING_ZCRX_AREA_MASK - confirm against the users.
 */
struct io_uring_zcrx_rqe {
	__u64	off;
	__u32	len;
	__u32	__pad;
};

/*
 * zcrx completion data: a 64-bit offset followed by 64 bits of padding
 * (16 bytes in total).
 * NOTE(review): going by the name, this is presumably the zcrx-specific part
 * of a completion queue entry - confirm where it is placed.
 */
struct io_uring_zcrx_cqe {
	__u64	off;
	__u64	__pad;
};

/*
 * The bit from which area id is encoded into offsets.
 *
 * IORING_ZCRX_AREA_MASK has bits 48..63 set and bits 0..47 clear, i.e. it
 * selects the area id portion of a 64-bit offset.
 */
#define IORING_ZCRX_AREA_SHIFT	48
#define IORING_ZCRX_AREA_MASK	(~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))

/*
 * Three 32-bit offsets (head, tail, rqes) plus reserved space, 32 bytes in
 * total. Embedded in struct io_uring_zcrx_ifq_reg as 'offsets'.
 * NOTE(review): presumably these locate the refill queue head, tail and
 * entry array inside the region given by io_uring_zcrx_ifq_reg::region_ptr -
 * confirm.
 */
struct io_uring_zcrx_offsets {
	__u32	head;
	__u32	tail;
	__u32	rqes;
	__u32	__resv2;
	__u64	__resv[2];
};

/*
 * Area flag values.
 * NOTE(review): presumably set in io_uring_zcrx_area_reg::flags, with DMABUF
 * meaning the area is backed by io_uring_zcrx_area_reg::dmabuf_fd - confirm.
 */
enum io_uring_zcrx_area_flags {
	IORING_ZCRX_AREA_DMABUF		= 1,
};

/*
 * Description of a zcrx area; io_uring_zcrx_ifq_reg::area_ptr points to one
 * of these. 48 bytes: three 64-bit fields, two 32-bit fields and 16 reserved
 * bytes.
 * NOTE(review): addr/len presumably describe the memory range of the area and
 * flags presumably takes enum io_uring_zcrx_area_flags values - confirm.
 */
struct io_uring_zcrx_area_reg {
	__u64	addr;
	__u64	len;
	__u64	rq_area_token;
	__u32	flags;
	__u32	dmabuf_fd;
	__u64	__resv2[2];
};

/*
 * zcrx registration flags. The two values (1 and 2) are distinct bits.
 * NOTE(review): presumably passed in io_uring_zcrx_ifq_reg::flags - confirm.
 */
enum zcrx_reg_flags {
	ZCRX_REG_IMPORT		= 1,

	/*
	 * Register a zcrx instance without a net device. All data will be
	 * copied. The refill queue entries might not be automatically
	 * consumed and need to be flushed, see ZCRX_CTRL_FLUSH_RQ.
	 */
	ZCRX_REG_NODEV		= 2,
};

/*
 * zcrx feature bits. Only bit 0 is defined here.
 */
enum zcrx_features {
	/*
	 * The user can ask for the desired rx page size by passing the
	 * value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
	 */
	ZCRX_FEATURE_RX_PAGE_SIZE	= 1 << 0,
};

/*
 * Argument for IORING_REGISTER_ZCRX_IFQ
 *
 * Layout: four 32-bit fields, two 64-bit pointer values, the embedded
 * struct io_uring_zcrx_offsets, two more 32-bit fields and 24 reserved bytes.
 * rx_buf_len carries the desired rx page size, see ZCRX_FEATURE_RX_PAGE_SIZE.
 * NOTE(review): if_idx/if_rxq presumably select the net device and its rx
 * queue, and flags presumably takes enum zcrx_reg_flags values - confirm.
 */
struct io_uring_zcrx_ifq_reg {
	__u32	if_idx;
	__u32	if_rxq;
	__u32	rq_entries;
	__u32	flags;

	__u64	area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
	__u64	region_ptr; /* struct io_uring_region_desc * */

	struct io_uring_zcrx_offsets offsets;
	__u32	zcrx_id;
	__u32	rx_buf_len;
	__u64	__resv[3];
};

/*
 * Control operations, selected via zcrx_ctrl::op. Values are assigned
 * sequentially from 0, so __ZCRX_CTRL_LAST equals the number of operations
 * listed before it.
 */
enum zcrx_ctrl_op {
	ZCRX_CTRL_FLUSH_RQ,
	ZCRX_CTRL_EXPORT,

	__ZCRX_CTRL_LAST,
};

/*
 * Member of the zcrx_ctrl payload union (zc_flush). Contains nothing but
 * 48 reserved bytes.
 * NOTE(review): presumably the payload for ZCRX_CTRL_FLUSH_RQ - confirm.
 */
struct zcrx_ctrl_flush_rq {
	__u64		__resv[6];
};

/*
 * Member of the zcrx_ctrl payload union (zc_export): a 32-bit zcrx_fd
 * followed by 44 reserved bytes, 48 bytes in total (the same size as
 * struct zcrx_ctrl_flush_rq).
 * NOTE(review): presumably the payload for ZCRX_CTRL_EXPORT - confirm.
 */
struct zcrx_ctrl_export {
	__u32		zcrx_fd;
	__u32 		__resv1[11];
};

/*
 * zcrx control request: a 24-byte header (zcrx_id, op and 16 reserved bytes)
 * followed by an anonymous union of the per-operation structs, each of which
 * is 48 bytes.
 */
struct zcrx_ctrl {
	__u32	zcrx_id;
	__u32	op; /* see enum zcrx_ctrl_op */
	__u64	__resv[2];

	union {
		struct zcrx_ctrl_export		zc_export;
		struct zcrx_ctrl_flush_rq	zc_flush;
	};
};

#endif /* LINUX_IO_ZCRX_H */
+5 −0
Original line number Diff line number Diff line
@@ -14,3 +14,8 @@ config IO_URING_BPF
	def_bool y
	depends on BPF
	depends on NET

# Promptless option: defaults to y whenever IO_URING, BPF_SYSCALL, BPF_JIT
# and DEBUG_INFO_BTF are all enabled.
config IO_URING_BPF_OPS
	def_bool y
	depends on IO_URING
	depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
Loading