Commit 6df9d086 authored by Linus Torvalds
Browse files

Merge tag 'for-6.15/io_uring-epoll-wait-20250325' of git://git.kernel.dk/linux

Pull io_uring epoll support from Jens Axboe:
 "This adds support for reading epoll events via io_uring.

  While this may seem counter-intuitive (and/or productive), the
  reasoning here is that quite a few existing epoll event loops can
  easily do a partial conversion to a completion based model, but are
  still stuck with one (or few) event types that remain readiness based.

  For that case, they then need to add the io_uring fd to the epoll
  context, and continue to rely on epoll_wait(2) for waiting on events.
  This misses out on the finer grained waiting that io_uring can do, to
  reduce context switches and wait for multiple events in one batch
  reliably.

  By adding support for reaping epoll events via io_uring, all of the
  legacy readiness based event types can still be reaped via epoll, with
  the overall waiting in the loop being driven by io_uring"

* tag 'for-6.15/io_uring-epoll-wait-20250325' of git://git.kernel.dk/linux:
  io_uring/epoll: add support for IORING_OP_EPOLL_WAIT
  io_uring/epoll: remove CONFIG_EPOLL guards
parents ca0b04ba 19f7e942
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -280,6 +280,7 @@ enum io_uring_op {
	IORING_OP_BIND,
	IORING_OP_LISTEN,
	IORING_OP_RECV_ZC,
	IORING_OP_EPOLL_WAIT,

	/* this goes last, obviously */
	IORING_OP_LAST,
+5 −4
Original line number Diff line number Diff line
@@ -11,10 +11,11 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
					eventfd.o uring_cmd.o openclose.o \
					sqpoll.o xattr.o nop.o fs.o splice.o \
					sync.o msg_ring.o advise.o openclose.o \
					epoll.o statx.o timeout.o fdinfo.o \
					cancel.o waitid.o register.o \
					truncate.o memmap.o alloc_cache.o
					statx.o timeout.o fdinfo.o cancel.o \
					waitid.o register.o truncate.o \
					memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX)	+= zcrx.o
obj-$(CONFIG_IO_WQ)		+= io-wq.o
obj-$(CONFIG_FUTEX)		+= futex.o
obj-$(CONFIG_EPOLL)		+= epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL)	+= napi.o
+33 −2
Original line number Diff line number Diff line
@@ -12,7 +12,6 @@
#include "io_uring.h"
#include "epoll.h"

#if defined(CONFIG_EPOLL)
struct io_epoll {
	struct file			*file;
	int				epfd;
@@ -21,6 +20,12 @@ struct io_epoll {
	struct epoll_event		event;
};

/*
 * Per-request command data for IORING_OP_EPOLL_WAIT, obtained from the
 * request via io_kiocb_to_cmd(). Filled in by io_epoll_wait_prep() and
 * consumed by io_epoll_wait().
 */
struct io_epoll_wait {
	/*
	 * Not touched by the handlers in this file (they use req->file).
	 * NOTE(review): presumably must stay the first member to match the
	 * layout io_kiocb_to_cmd() expects - confirm before reordering.
	 */
	struct file			*file;
	/* Taken from sqe->len; passed as the event limit to epoll_sendevents() */
	int				maxevents;
	/* Userspace destination buffer, taken from sqe->addr */
	struct epoll_event __user	*events;
};
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll);
@@ -58,4 +63,30 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
#endif

/*
 * Prep handler for IORING_OP_EPOLL_WAIT.
 *
 * Fails with -EINVAL if the SQE has any of off, rw_flags, buf_index or
 * splice_fd_in set. Otherwise records the user event buffer (sqe->addr)
 * and the maximum event count (sqe->len) in the request's command area
 * for the issue handler, and returns 0.
 */
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_epoll_wait *wait;

	/* SQE fields this opcode does not accept must be zero */
	if (sqe->off || sqe->rw_flags ||
	    sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;

	wait = io_kiocb_to_cmd(req, struct io_epoll_wait);
	wait->maxevents = READ_ONCE(sqe->len);
	wait->events = u64_to_user_ptr(READ_ONCE(sqe->addr));

	return 0;
}

/*
 * Issue handler for IORING_OP_EPOLL_WAIT.
 *
 * Calls epoll_sendevents() on the request's file with the user buffer and
 * event limit saved at prep time. A result of zero is turned into -EAGAIN
 * and no completion result is set. Any other result is stored as the
 * request's result (with the request marked failed when it is negative)
 * and IOU_OK is returned.
 */
int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_epoll_wait *wait = io_kiocb_to_cmd(req, struct io_epoll_wait);
	int nr_events = epoll_sendevents(req->file, wait->events,
					 wait->maxevents);

	/*
	 * Nothing was delivered.
	 * NOTE(review): presumably -EAGAIN makes the core arm poll and retry
	 * later - confirm against the io_uring issue path.
	 */
	if (!nr_events)
		return -EAGAIN;

	if (nr_events < 0)
		req_set_fail(req);
	io_req_set_res(req, nr_events, 0);

	return IOU_OK;
}
+2 −0
Original line number Diff line number Diff line
@@ -3,4 +3,6 @@
/* The epoll opcode handlers are only declared when CONFIG_EPOLL is set */
#if defined(CONFIG_EPOLL)
/* Prep and issue handlers for the epoll ctl opcode */
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags);
/* Prep and issue handlers for IORING_OP_EPOLL_WAIT */
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags);
#endif
+14 −0
Original line number Diff line number Diff line
@@ -527,6 +527,17 @@ const struct io_issue_def io_issue_defs[] = {
		.issue			= io_recvzc,
#else
		.prep			= io_eopnotsupp_prep,
#endif
	},
	[IORING_OP_EPOLL_WAIT] = {
		.needs_file		= 1,
		.audit_skip		= 1,
		.pollin			= 1,
#if defined(CONFIG_EPOLL)
		.prep			= io_epoll_wait_prep,
		.issue			= io_epoll_wait,
#else
		.prep			= io_eopnotsupp_prep,
#endif
	},
};
@@ -761,6 +772,9 @@ const struct io_cold_def io_cold_defs[] = {
	[IORING_OP_RECV_ZC] = {
		.name			= "RECV_ZC",
	},
	[IORING_OP_EPOLL_WAIT] = {
		.name			= "EPOLL_WAIT",
	},
};

const char *io_uring_get_opcode(u8 opcode)