Commit 3b1509f2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-5.18/io_uring-2022-04-01' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:
 "A little bit all over the map, some regression fixes for this merge
  window, and some general fixes that are stable bound. In detail:

   - Fix an SQPOLL memory ordering issue (Almog)

   - Accept fixes (Dylan)

   - Poll fixes (me)

   - Fixes for provided buffers and recycling (me)

   - Tweak to IORING_OP_MSG_RING command added in this merge window (me)

   - Memory leak fix (Pavel)

   - Misc fixes and tweaks (Pavel, me)"

* tag 'for-5.18/io_uring-2022-04-01' of git://git.kernel.dk/linux-block:
  io_uring: defer msg-ring file validity check until command issue
  io_uring: fail links if msg-ring doesn't succeeed
  io_uring: fix memory leak of uid in files registration
  io_uring: fix put_kbuf without proper locking
  io_uring: fix invalid flags for io_put_kbuf()
  io_uring: improve req fields comments
  io_uring: enable EPOLLEXCLUSIVE for accept poll
  io_uring: improve task work cache utilization
  io_uring: fix async accept on O_NONBLOCK sockets
  io_uring: remove IORING_CQE_F_MSG
  io_uring: add flag for disabling provided buffer recycling
  io_uring: ensure recv and recvmsg handle MSG_WAITALL correctly
  io_uring: don't recycle provided buffer if punted to async worker
  io_uring: fix assuming triggered poll waitqueue is the single poll
  io_uring: bump poll refs to full 31-bits
  io_uring: remove poll entry from list when canceling all
  io_uring: fix memory ordering when SQPOLL thread goes to sleep
  io_uring: ensure that fsnotify is always called
  io_uring: recycle provided before arming poll
parents fe35fdb3 3f1d52ab
Loading
Loading
Loading
Loading
+110 −23
Original line number Diff line number Diff line
@@ -611,6 +611,7 @@ struct io_sr_msg {
	int				msg_flags;
	int				bgid;
	size_t				len;
	size_t				done_io;
};

struct io_open {
@@ -781,6 +782,7 @@ enum {
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_PARTIAL_IO_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,
@@ -843,6 +845,8 @@ enum {
	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
	/* double poll may active */
	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
	/* request has already done partial IO */
	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
};

struct async_poll {
@@ -923,7 +927,6 @@ struct io_kiocb {
	struct io_wq_work_node		comp_list;
	atomic_t			refs;
	atomic_t			poll_refs;
	struct io_kiocb			*link;
	struct io_task_work		io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
@@ -931,9 +934,11 @@ struct io_kiocb {
	struct async_poll		*apoll;
	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
	struct io_buffer		*kbuf;
	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
	struct io_kiocb			*link;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred		*creds;
	struct io_wq_work		work;
};
@@ -962,6 +967,7 @@ struct io_op_def {
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	unsigned		poll_exclusive : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if is going to be punted */
@@ -1056,6 +1062,7 @@ static const struct io_op_def io_op_defs[] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.poll_exclusive		= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {
		.audit_skip		= 1,
@@ -1330,6 +1337,8 @@ static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)

static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
	lockdep_assert_held(&req->ctx->completion_lock);

	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return 0;
	return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
@@ -1362,6 +1371,8 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
		cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
	}

@@ -1382,7 +1393,7 @@ static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
	return NULL;
}

static void io_kbuf_recycle(struct io_kiocb *req)
static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
@@ -1390,6 +1401,12 @@ static void io_kbuf_recycle(struct io_kiocb *req)

	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return;
	/* don't recycle if we already did IO to this buffer */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_lock(&ctx->uring_lock);

	lockdep_assert_held(&ctx->uring_lock);

@@ -1398,6 +1415,9 @@ static void io_kbuf_recycle(struct io_kiocb *req)
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->kbuf = NULL;

	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_unlock(&ctx->uring_lock);
}

static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
@@ -2104,6 +2124,12 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
			}
		}
		io_req_put_rsrc(req, ctx);
		/*
		 * Selected buffer deallocation in io_clean_op() assumes that
		 * we don't hold ->completion_lock. Clean them here to avoid
		 * deadlocks.
		 */
		io_put_kbuf_comp(req);
		io_dismantle_req(req);
		io_put_task(req->task, 1);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
@@ -2148,7 +2174,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res)
static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
	req_set_fail(req);
	io_req_complete_post(req, res, io_put_kbuf(req, 0));
	io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
}

static void io_req_complete_fail_submit(struct io_kiocb *req)
@@ -2437,6 +2463,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			if (unlikely(!*uring_locked && *ctx))
				ctx_commit_and_unlock(*ctx);
@@ -2469,6 +2497,8 @@ static void handle_tw_list(struct io_wq_work_node *node,
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			ctx_flush_and_put(*ctx, locked);
			*ctx = req->ctx;
@@ -2974,8 +3004,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req)

static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
	if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
		kiocb_end_write(req);
		fsnotify_modify(req->file);
	} else {
		fsnotify_access(req->file);
	}
	if (unlikely(res != req->result)) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
@@ -4439,9 +4473,6 @@ static int io_msg_ring_prep(struct io_kiocb *req,
		     sqe->splice_fd_in || sqe->buf_index || sqe->personality))
		return -EINVAL;

	if (req->file->f_op != &io_uring_fops)
		return -EBADFD;

	req->msg.user_data = READ_ONCE(sqe->off);
	req->msg.len = READ_ONCE(sqe->len);
	return 0;
@@ -4451,14 +4482,18 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *target_ctx;
	struct io_msg *msg = &req->msg;
	int ret = -EOVERFLOW;
	bool filled;
	int ret;

	ret = -EBADFD;
	if (req->file->f_op != &io_uring_fops)
		goto done;

	ret = -EOVERFLOW;
	target_ctx = req->file->private_data;

	spin_lock(&target_ctx->completion_lock);
	filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len,
					IORING_CQE_F_MSG);
	filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
	io_commit_cqring(target_ctx);
	spin_unlock(&target_ctx->completion_lock);

@@ -4467,6 +4502,9 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
		ret = 0;
	}

done:
	if (ret < 0)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
@@ -4537,6 +4575,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
				req->sync.len);
	if (ret < 0)
		req_set_fail(req);
	else
		fsnotify_modify(req->file);
	io_req_complete(req, ret);
	return 0;
}
@@ -5419,12 +5459,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	sr->done_io = 0;
	return 0;
}

static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr iomsg, *kmsg;
	struct io_sr_msg *sr = &req->sr_msg;
	struct socket *sock;
	struct io_buffer *kbuf;
	unsigned flags;
@@ -5467,6 +5516,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
			return io_setup_async_msg(req, kmsg);
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg);
		}
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
@@ -5476,6 +5530,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
	if (kmsg->free_iov)
		kfree(kmsg->free_iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
	return 0;
}
@@ -5526,12 +5584,23 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return -EAGAIN;
		}
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
	return 0;
}
@@ -5569,9 +5638,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
	struct file *file;
	int ret, fd;

	if (req->file->f_flags & O_NONBLOCK)
		req->flags |= REQ_F_NOWAIT;

	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
@@ -5801,7 +5867,7 @@ struct io_poll_table {
};

#define IO_POLL_CANCEL_FLAG	BIT(31)
#define IO_POLL_REF_MASK	((1u << 20)-1)
#define IO_POLL_REF_MASK	GENMASK(30, 0)

/*
 * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
@@ -6035,10 +6101,13 @@ static void io_poll_cancel_req(struct io_kiocb *req)
	io_poll_execute(req, 0, 0);
}

#define wqe_to_req(wait)	((void *)((unsigned long) (wait)->private & ~1))
#define wqe_is_double(wait)	((unsigned long) (wait)->private & 1)

static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key)
{
	struct io_kiocb *req = wait->private;
	struct io_kiocb *req = wqe_to_req(wait);
	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
						 wait);
	__poll_t mask = key_to_poll(key);
@@ -6076,6 +6145,9 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
		if (mask && poll->events & EPOLLONESHOT) {
			list_del_init(&poll->wait.entry);
			poll->head = NULL;
			if (wqe_is_double(wait))
				req->flags &= ~REQ_F_DOUBLE_POLL;
			else
				req->flags &= ~REQ_F_SINGLE_POLL;
		}
		__io_poll_execute(req, mask, poll->events);
@@ -6088,6 +6160,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
			    struct io_poll_iocb **poll_ptr)
{
	struct io_kiocb *req = pt->req;
	unsigned long wqe_private = (unsigned long) req;

	/*
	 * The file being polled uses multiple waitqueues for poll handling
@@ -6113,6 +6186,8 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
			pt->error = -ENOMEM;
			return;
		}
		/* mark as double wq entry */
		wqe_private |= 1;
		req->flags |= REQ_F_DOUBLE_POLL;
		io_init_poll_iocb(poll, first->events, first->wait.func);
		*poll_ptr = poll;
@@ -6123,7 +6198,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
	req->flags |= REQ_F_SINGLE_POLL;
	pt->nr_entries++;
	poll->head = head;
	poll->wait.private = req;
	poll->wait.private = (void *) wqe_private;

	if (poll->events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(head, &poll->wait);
@@ -6150,7 +6225,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
	INIT_HLIST_NODE(&req->hash_node);
	io_init_poll_iocb(poll, mask, io_poll_wake);
	poll->file = req->file;
	poll->wait.private = req;

	ipt->pt._key = mask;
	ipt->req = req;
@@ -6238,7 +6312,8 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
	} else {
		mask |= POLLOUT | POLLWRNORM;
	}

	if (def->poll_exclusive)
		mask |= EPOLLEXCLUSIVE;
	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
	    !list_empty(&ctx->apoll_cache)) {
		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
@@ -6254,6 +6329,8 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
	req->flags |= REQ_F_POLLED;
	ipt.pt._qproc = io_async_queue_proc;

	io_kbuf_recycle(req, issue_flags);

	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
	if (ret || ipt.error)
		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
@@ -6281,6 +6358,7 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
		list = &ctx->cancel_hash[i];
		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
			if (io_match_task_safe(req, tsk, cancel_all)) {
				hlist_del_init(&req->hash_node);
				io_poll_cancel_req(req);
				found = true;
			}
@@ -7075,8 +7153,11 @@ static __cold void io_drain_req(struct io_kiocb *req)

static void io_clean_op(struct io_kiocb *req)
{
	if (req->flags & REQ_F_BUFFER_SELECTED)
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		spin_lock(&req->ctx->completion_lock);
		io_put_kbuf_comp(req);
		spin_unlock(&req->ctx->completion_lock);
	}

	if (req->flags & REQ_F_NEED_CLEANUP) {
		switch (req->opcode) {
@@ -7505,11 +7586,9 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
		 * Queued up for async execution, worker will release
		 * submit reference when the iocb is actually submitted.
		 */
		io_kbuf_recycle(req);
		io_queue_async_work(req, NULL);
		break;
	case IO_APOLL_OK:
		io_kbuf_recycle(req);
		break;
	}

@@ -8053,6 +8132,13 @@ static int io_sq_thread(void *data)
					needs_sched = false;
					break;
				}

				/*
				 * Ensure the store of the wakeup flag is not
				 * reordered with the load of the SQ tail
				 */
				smp_mb();

				if (io_sqring_entries(ctx)) {
					needs_sched = false;
					break;
@@ -8782,6 +8868,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
			fput(fpl->fp[i]);
	} else {
		kfree_skb(skb);
		free_uid(fpl->user);
		kfree(fpl);
	}

+0 −2
Original line number Diff line number Diff line
@@ -201,11 +201,9 @@ struct io_uring_cqe {
 *
 * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_MSG	If set, CQE was generated with IORING_OP_MSG_RING
 */
#define IORING_CQE_F_BUFFER		(1U << 0)
#define IORING_CQE_F_MORE		(1U << 1)
#define IORING_CQE_F_MSG		(1U << 2)

enum {
	IORING_CQE_BUFFER_SHIFT		= 16,