Commit 812e7eb2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'io_uring-6.18-20251023' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

 - Add MAINTAINERS entry for zcrx, mostly so that netdev gets
   automatically CC'ed by default on any changes there too.

 - Fix for the SQPOLL busy vs work time accounting.

   It was using getrusage(), which was both broken from a thread point
   of view (we only care about the SQPOLL thread itself), and vastly
   overkill as only the systime was used. On top of that, also be a bit
   smarter in when it's queried. It used excessive CPU before this
   change. Marked for stable as well.

 - Fix provided ring buffer auto commit for uring_cmd.

 - Fix a few style issues and sparse annotation for a lock.

* tag 'io_uring-6.18-20251023' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring: fix buffer auto-commit for multishot uring_cmd
  io_uring: correct __must_hold annotation in io_install_fixed_file
  io_uring zcrx: add MAINTAINERS entry
  io_uring: Fix code indentation error
  io_uring/sqpoll: be smarter on when to update the stime usage
  io_uring/sqpoll: switch away from getrusage() for CPU accounting
  io_uring: fix incorrect unlikely() usage in io_waitid_prep()
parents 66cd8e9c 6f1cbf6d
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -13116,6 +13116,15 @@ F: include/uapi/linux/io_uring.h
F:	include/uapi/linux/io_uring/
F:	io_uring/
IO_URING ZCRX
M:	Pavel Begunkov <asml.silence@gmail.com>
L:	io-uring@vger.kernel.org
L:	netdev@vger.kernel.org
T:	git https://github.com/isilence/linux.git zcrx/for-next
T:	git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git
S:	Maintained
F:	io_uring/zcrx.*
IPMI SUBSYSTEM
M:	Corey Minyard <corey@minyard.net>
L:	openipmi-developer@lists.sourceforge.net (moderated for non-subscribers)
+4 −4
Original line number Diff line number Diff line
@@ -59,7 +59,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
	struct io_overflow_cqe *ocqe;
	struct io_rings *r = ctx->rings;
	struct rusage sq_usage;
	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
	unsigned int sq_head = READ_ONCE(r->sq.head);
	unsigned int sq_tail = READ_ONCE(r->sq.tail);
@@ -152,14 +151,15 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
		 * thread termination.
		 */
		if (tsk) {
			u64 usec;

			get_task_struct(tsk);
			rcu_read_unlock();
			getrusage(tsk, RUSAGE_SELF, &sq_usage);
			usec = io_sq_cpu_usec(tsk);
			put_task_struct(tsk);
			sq_pid = sq->task_pid;
			sq_cpu = sq->sq_cpu;
			sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
					 + sq_usage.ru_stime.tv_usec);
			sq_total_time = usec;
			sq_work_time = sq->work_time;
		} else {
			rcu_read_unlock();
+1 −1
Original line number Diff line number Diff line
@@ -57,7 +57,7 @@ void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table)

static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
				 u32 slot_index)
	__must_hold(&req->ctx->uring_lock)
	__must_hold(&ctx->uring_lock)
{
	struct io_rsrc_node *node;

+22 −11
Original line number Diff line number Diff line
@@ -155,6 +155,27 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
	return 1;
}

static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags)
{
	/*
	* If we came in unlocked, we have no choice but to consume the
	* buffer here, otherwise nothing ensures that the buffer won't
	* get used by others. This does mean it'll be pinned until the
	* IO completes, coming in unlocked means we're being called from
	* io-wq context and there may be further retries in async hybrid
	* mode. For the locked case, the caller must call commit when
	* the transfer completes (or if we get -EAGAIN and must poll of
	* retry).
	*/
	if (issue_flags & IO_URING_F_UNLOCKED)
		return true;

	/* uring_cmd commits kbuf upfront, no need to auto-commit */
	if (!io_file_can_poll(req) && req->opcode != IORING_OP_URING_CMD)
		return true;
	return false;
}

static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl,
					      unsigned int issue_flags)
@@ -181,17 +202,7 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
	sel.buf_list = bl;
	sel.addr = u64_to_user_ptr(buf->addr);

	if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes, coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll of
		 * retry).
		 */
	if (io_should_commit(req, issue_flags)) {
		io_kbuf_commit(req, sel.buf_list, *len, 1);
		sel.buf_list = NULL;
	}
+45 −20
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@
#include <linux/audit.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/sched/cputime.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>
@@ -169,7 +170,38 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
	return READ_ONCE(sqd->state);
}

static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
struct io_sq_time {
	bool started;
	u64 usec;
};

u64 io_sq_cpu_usec(struct task_struct *tsk)
{
	u64 utime, stime;

	task_cputime_adjusted(tsk, &utime, &stime);
	do_div(stime, 1000);
	return stime;
}

static void io_sq_update_worktime(struct io_sq_data *sqd, struct io_sq_time *ist)
{
	if (!ist->started)
		return;
	ist->started = false;
	sqd->work_time += io_sq_cpu_usec(current) - ist->usec;
}

static void io_sq_start_worktime(struct io_sq_time *ist)
{
	if (ist->started)
		return;
	ist->started = true;
	ist->usec = io_sq_cpu_usec(current);
}

static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd,
			  bool cap_entries, struct io_sq_time *ist)
{
	unsigned int to_submit;
	int ret = 0;
@@ -182,6 +214,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
	if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
		const struct cred *creds = NULL;

		io_sq_start_worktime(ist);

		if (ctx->sq_creds != current_cred())
			creds = override_creds(ctx->sq_creds);

@@ -255,23 +289,11 @@ static bool io_sq_tw_pending(struct llist_node *retry_list)
	return retry_list || !llist_empty(&tctx->task_list);
}

static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
{
	struct rusage end;

	getrusage(current, RUSAGE_SELF, &end);
	end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
	end.ru_stime.tv_usec -= start->ru_stime.tv_usec;

	sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
}

static int io_sq_thread(void *data)
{
	struct llist_node *retry_list = NULL;
	struct io_sq_data *sqd = data;
	struct io_ring_ctx *ctx;
	struct rusage start;
	unsigned long timeout = 0;
	char buf[TASK_COMM_LEN] = {};
	DEFINE_WAIT(wait);
@@ -309,6 +331,7 @@ static int io_sq_thread(void *data)
	mutex_lock(&sqd->lock);
	while (1) {
		bool cap_entries, sqt_spin = false;
		struct io_sq_time ist = { };

		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
			if (io_sqd_handle_event(sqd))
@@ -317,9 +340,8 @@ static int io_sq_thread(void *data)
		}

		cap_entries = !list_is_singular(&sqd->ctx_list);
		getrusage(current, RUSAGE_SELF, &start);
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			int ret = __io_sq_thread(ctx, cap_entries);
			int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);

			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
				sqt_spin = true;
@@ -327,15 +349,18 @@ static int io_sq_thread(void *data)
		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
			sqt_spin = true;

		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
			if (io_napi(ctx))
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			if (io_napi(ctx)) {
				io_sq_start_worktime(&ist);
				io_napi_sqpoll_busy_poll(ctx);
			}
		}

		io_sq_update_worktime(sqd, &ist);

		if (sqt_spin || !time_after(jiffies, timeout)) {
			if (sqt_spin) {
				io_sq_update_worktime(sqd, &start);
			if (sqt_spin)
				timeout = jiffies + sqd->sq_thread_idle;
			}
			if (unlikely(need_resched())) {
				mutex_unlock(&sqd->lock);
				cond_resched();
Loading