Commit 3de7361f authored by Jens Axboe's avatar Jens Axboe
Browse files

Merge branch 'io_uring-6.15' into for-6.16/io_uring

Merge in 6.15 io_uring fixes, mostly so that the fdinfo changes can
get easily extended without causing merge conflicts.

* io_uring-6.15:
  io_uring/fdinfo: grab ctx->uring_lock around io_uring_show_fdinfo()
  io_uring/memmap: don't use page_address() on a highmem page
  io_uring/uring_cmd: fix hybrid polling initialization issue
  io_uring/sqpoll: Increase task_work submission batch size
  io_uring: ensure deferred completions are flushed for multishot
  io_uring: always arm linked timeouts prior to issue
  io_uring/fdinfo: annotate racy sq/cq head/tail reads
  io_uring: fix 'sync' handling of io_fallback_tw()
  io_uring: don't duplicate flushing in io_req_post_cqe
parents 2b61bb1d d871198e
Loading
Loading
Loading
Loading
+27 −25
Original line number Diff line number Diff line
@@ -86,13 +86,8 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx,
}
#endif

/*
 * Caller holds a reference to the file already, we don't need to do
 * anything else to get an extra reference.
 */
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
	struct io_ring_ctx *ctx = file->private_data;
	struct io_overflow_cqe *ocqe;
	struct io_rings *r = ctx->rings;
	struct rusage sq_usage;
@@ -106,7 +101,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
	unsigned int sq_entries, cq_entries;
	int sq_pid = -1, sq_cpu = -1;
	u64 sq_total_time = 0, sq_work_time = 0;
	bool has_lock;
	unsigned int i;

	if (ctx->flags & IORING_SETUP_CQE32)
@@ -123,11 +117,11 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
	seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
	seq_printf(m, "SqHead:\t%u\n", sq_head);
	seq_printf(m, "SqTail:\t%u\n", sq_tail);
	seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
	seq_printf(m, "CachedSqHead:\t%u\n", data_race(ctx->cached_sq_head));
	seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
	seq_printf(m, "CqHead:\t%u\n", cq_head);
	seq_printf(m, "CqTail:\t%u\n", cq_tail);
	seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
	seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
	seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
	for (i = 0; i < sq_entries; i++) {
@@ -176,15 +170,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
		seq_printf(m, "\n");
	}

	/*
	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
	 * since fdinfo case grabs it in the opposite direction of normal use
	 * cases. If we fail to get the lock, we just don't iterate any
	 * structures that could be going away outside the io_uring mutex.
	 */
	has_lock = mutex_trylock(&ctx->uring_lock);

	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		struct io_sq_data *sq = ctx->sq_data;

		/*
@@ -206,7 +192,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
	seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
	seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
	seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
	for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) {
	for (i = 0; i < ctx->file_table.data.nr; i++) {
		struct file *f = NULL;

		if (ctx->file_table.data.nodes[i])
@@ -218,7 +204,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
		}
	}
	seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
	for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_mapped_ubuf *buf = NULL;

		if (ctx->buf_table.nodes[i])
@@ -228,7 +214,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
		else
			seq_printf(m, "%5u: <none>\n", i);
	}
	if (has_lock && !xa_empty(&ctx->personalities)) {
	if (!xa_empty(&ctx->personalities)) {
		unsigned long index;
		const struct cred *cred;

@@ -238,7 +224,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
	}

	seq_puts(m, "PollList:\n");
	for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) {
	for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
		struct io_kiocb *req;

@@ -247,9 +233,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
					task_work_pending(req->tctx->task));
	}

	if (has_lock)
		mutex_unlock(&ctx->uring_lock);

	seq_puts(m, "CqOverflowList:\n");
	spin_lock(&ctx->completion_lock);
	list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
@@ -262,4 +245,23 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
	spin_unlock(&ctx->completion_lock);
	napi_show_fdinfo(ctx, m);
}

/*
 * Caller holds a reference to the file already, we don't need to do
 * anything else to get an extra reference.
 */
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	/*
	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
	 * since fdinfo case grabs it in the opposite direction of normal use
	 * cases.
	 */
	if (mutex_trylock(&ctx->uring_lock)) {
		__io_uring_show_fdinfo(ctx, m);
		mutex_unlock(&ctx->uring_lock);
	}
}
#endif
+38 −44
Original line number Diff line number Diff line
@@ -430,24 +430,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}

static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
	io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
		__io_arm_ltimeout(req);
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
@@ -500,7 +482,6 @@ static void io_prep_async_link(struct io_kiocb *req)

static void io_queue_iowq(struct io_kiocb *req)
{
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->tctx;

	BUG_ON(!tctx);
@@ -525,8 +506,6 @@ static void io_queue_iowq(struct io_kiocb *req)

	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw)
@@ -864,13 +843,26 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
	struct io_ring_ctx *ctx = req->ctx;
	bool posted;

	/*
	 * If multishot has already posted deferred completions, ensure that
	 * those are flushed first before posting this one. If not, CQEs
	 * could get reordered.
	 */
	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
		__io_submit_flush_completions(ctx);

	lockdep_assert(!io_wq_current_is_worker());
	lockdep_assert_held(&ctx->uring_lock);

	__io_cq_lock(ctx);
	if (!ctx->lockless_cq) {
		spin_lock(&ctx->completion_lock);
		posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
		spin_unlock(&ctx->completion_lock);
	} else {
		posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
	}

	ctx->submit_state.cq_flush = true;
	__io_cq_unlock_post(ctx);
	return posted;
}

@@ -1058,20 +1050,21 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
	while (node) {
		req = container_of(node, struct io_kiocb, io_task_work.node);
		node = node->next;
		if (sync && last_ctx != req->ctx) {
		if (last_ctx != req->ctx) {
			if (last_ctx) {
				if (sync)
					flush_delayed_work(&last_ctx->fallback_work);
				percpu_ref_put(&last_ctx->refs);
			}
			last_ctx = req->ctx;
			percpu_ref_get(&last_ctx->refs);
		}
		if (llist_add(&req->io_task_work.node,
			      &req->ctx->fallback_llist))
			schedule_delayed_work(&req->ctx->fallback_work, 1);
		if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist))
			schedule_delayed_work(&last_ctx->fallback_work, 1);
	}

	if (last_ctx) {
		if (sync)
			flush_delayed_work(&last_ctx->fallback_work);
		percpu_ref_put(&last_ctx->refs);
	}
@@ -1684,15 +1677,22 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
	return !!req->file;
}

#define REQ_ISSUE_SLOW_FLAGS	(REQ_F_CREDS | REQ_F_ARM_LTIMEOUT)

static inline int __io_issue_sqe(struct io_kiocb *req,
				 unsigned int issue_flags,
				 const struct io_issue_def *def)
{
	const struct cred *creds = NULL;
	struct io_kiocb *link = NULL;
	int ret;

	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
	if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) {
		if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
			creds = override_creds(req->creds);
		if (req->flags & REQ_F_ARM_LTIMEOUT)
			link = __io_prep_linked_timeout(req);
	}

	if (!def->audit_skip)
		audit_uring_entry(req->opcode);
@@ -1702,8 +1702,12 @@ static inline int __io_issue_sqe(struct io_kiocb *req,
	if (!def->audit_skip)
		audit_uring_exit(!ret, ret);

	if (unlikely(creds || link)) {
		if (creds)
			revert_creds(creds);
		if (link)
			io_queue_linked_timeout(link);
	}

	return ret;
}
@@ -1729,7 +1733,6 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)

	if (ret == IOU_ISSUE_SKIP_COMPLETE) {
		ret = 0;
		io_arm_ltimeout(req);

		/* If the op doesn't have a file, we're not polling for it */
		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
@@ -1784,8 +1787,6 @@ void io_wq_submit_work(struct io_wq_work *work)
	else
		req_ref_get(req);

	io_arm_ltimeout(req);

	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
	if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) {
fail:
@@ -1902,15 +1903,11 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd)
static void io_queue_async(struct io_kiocb *req, int ret)
	__must_hold(&req->ctx->uring_lock)
{
	struct io_kiocb *linked_timeout;

	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
		io_req_defer_failed(req, ret);
		return;
	}

	linked_timeout = io_prep_linked_timeout(req);

	switch (io_arm_poll_handler(req, 0)) {
	case IO_APOLL_READY:
		io_kbuf_recycle(req, 0);
@@ -1923,9 +1920,6 @@ static void io_queue_async(struct io_kiocb *req, int ret)
	case IO_APOLL_OK:
		break;
	}

	if (linked_timeout)
		io_queue_linked_timeout(linked_timeout);
}

static inline void io_queue_sqe(struct io_kiocb *req)
+1 −1
Original line number Diff line number Diff line
@@ -117,7 +117,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr)
	void *ptr;

	if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
		if (ifd.nr_folios == 1) {
		if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) {
			mr->ptr = page_address(mr->pages[0]);
			return 0;
		}
+1 −1
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@
#include "sqpoll.h"

#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
#define IORING_TW_CAP_ENTRIES_VALUE	8
#define IORING_TW_CAP_ENTRIES_VALUE	32

enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
+5 −0
Original line number Diff line number Diff line
@@ -251,6 +251,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
			return -EOPNOTSUPP;
		issue_flags |= IO_URING_F_IOPOLL;
		req->iopoll_completed = 0;
		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
			/* make sure every req only blocks once */
			req->flags &= ~REQ_F_IOPOLL_STATE;
			req->iopoll_start = ktime_get_ns();
		}
	}

	ret = file->f_op->uring_cmd(ioucmd, issue_flags);