Commit 6d13760e authored by Linus Torvalds
Browse files

Merge tag 'io_uring-6.16-20250614' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Fix for a race between SQPOLL exit and fdinfo reading.

   The race window is slim, and I was only able to reproduce it with an
   artificial delay in the kernel. Includes a follow-up sparse fix that
   unifies access to ->thread.

 - Fix for multiple buffer peeking, avoiding truncation if possible.

 - Run local task_work for IOPOLL reaping when the ring is exiting.

   This currently isn't done due to an assumption that polled IO will
   never need task_work, but a fix on the block side is going to change
   that.

* tag 'io_uring-6.16-20250614' of git://git.kernel.dk/linux:
  io_uring: run local task_work from ring exit IOPOLL reaping
  io_uring/kbuf: don't truncate end buffer for multiple buffer peeks
  io_uring: consistently use rcu semantics with sqpoll thread
  io_uring: fix use-after-free of sq->thread in __io_uring_show_fdinfo()
parents 588adb24 b62e0efd
Loading
Loading
Loading
Loading
+10 −2
Original line number Diff line number Diff line
@@ -141,18 +141,26 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		struct io_sq_data *sq = ctx->sq_data;
		struct task_struct *tsk;

		rcu_read_lock();
		tsk = rcu_dereference(sq->thread);
		/*
		 * sq->thread might be NULL if we raced with the sqpoll
		 * thread termination.
		 */
		if (sq->thread) {
		if (tsk) {
			get_task_struct(tsk);
			rcu_read_unlock();
			getrusage(tsk, RUSAGE_SELF, &sq_usage);
			put_task_struct(tsk);
			sq_pid = sq->task_pid;
			sq_cpu = sq->sq_cpu;
			getrusage(sq->thread, RUSAGE_SELF, &sq_usage);
			sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
					 + sq_usage.ru_stime.tv_usec);
			sq_work_time = sq->work_time;
		} else {
			rcu_read_unlock();
		}
	}

+5 −2
Original line number Diff line number Diff line
@@ -1523,6 +1523,9 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
		}
	}
	mutex_unlock(&ctx->uring_lock);

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
		io_move_task_work_from_local(ctx);
}

static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
@@ -2906,7 +2909,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
			struct task_struct *tsk;

			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			tsk = sqpoll_task_locked(sqd);
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
@@ -3142,7 +3145,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
	s64 inflight;
	DEFINE_WAIT(wait);

	WARN_ON_ONCE(sqd && sqd->thread != current);
	WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current);

	if (!current->io_uring)
		return;
+4 −1
Original line number Diff line number Diff line
@@ -270,9 +270,12 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
		/* truncate end piece, if needed, for non partial buffers */
		if (len > arg->max_len) {
			len = arg->max_len;
			if (!(bl->flags & IOBL_INC))
			if (!(bl->flags & IOBL_INC)) {
				if (iov != arg->iovs)
					break;
				buf->len = len;
			}
		}

		iov->iov_base = u64_to_user_ptr(buf->addr);
		iov->iov_len = len;
+5 −2
Original line number Diff line number Diff line
@@ -273,6 +273,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
@@ -282,8 +284,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
+28 −15
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ enum {
void io_sq_thread_unpark(struct io_sq_data *sqd)
	__releases(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);
	WARN_ON_ONCE(sqpoll_task_locked(sqd) == current);

	/*
	 * Do the dance but not conditional clear_bit() because it'd race with
@@ -46,24 +46,32 @@ void io_sq_thread_unpark(struct io_sq_data *sqd)
/*
 * Ask the SQPOLL thread to park and take sqd->lock.
 *
 * Bumps sqd->park_pending, sets IO_SQ_THREAD_SHOULD_PARK and then acquires
 * sqd->lock. The function returns with sqd->lock still held (see the
 * __acquires annotation); no unlock happens here.
 *
 * The sqpoll task is only looked up once sqd->lock is held, through
 * sqpoll_task_locked(), rather than by touching sqd->thread directly. The
 * lookup may yield NULL, in which case there is nothing to wake. Calling
 * this from the sqpoll task itself is a bug and triggers a one-time warning.
 */
void io_sq_thread_park(struct io_sq_data *sqd)
	__acquires(&sqd->lock)
{
	struct task_struct *tsk;

	atomic_inc(&sqd->park_pending);
	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_lock(&sqd->lock);

	tsk = sqpoll_task_locked(sqd);
	if (tsk) {
		WARN_ON_ONCE(tsk == current);
		wake_up_process(tsk);
	}
}

/*
 * Tell the SQPOLL thread to stop and wait until it has exited.
 *
 * Sets IO_SQ_THREAD_SHOULD_STOP (warning once if it was already set), wakes
 * the sqpoll task under sqd->lock if one is present, drops the lock and then
 * blocks on the sqd->exited completion.
 *
 * The sqpoll task is only looked up with sqd->lock held, through
 * sqpoll_task_locked(), rather than by touching sqd->thread directly.
 * Calling this from the sqpoll task itself is a bug and triggers a one-time
 * warning.
 */
void io_sq_thread_stop(struct io_sq_data *sqd)
{
	struct task_struct *tsk;

	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));

	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
	mutex_lock(&sqd->lock);
	tsk = sqpoll_task_locked(sqd);
	if (tsk) {
		WARN_ON_ONCE(tsk == current);
		wake_up_process(tsk);
	}
	mutex_unlock(&sqd->lock);
	wait_for_completion(&sqd->exited);
}
@@ -270,7 +278,8 @@ static int io_sq_thread(void *data)
	/* offload context creation failed, just exit */
	if (!current->io_uring) {
		mutex_lock(&sqd->lock);
		sqd->thread = NULL;
		rcu_assign_pointer(sqd->thread, NULL);
		put_task_struct(current);
		mutex_unlock(&sqd->lock);
		goto err_out;
	}
@@ -379,7 +388,8 @@ static int io_sq_thread(void *data)
		io_sq_tw(&retry_list, UINT_MAX);

	io_uring_cancel_generic(true, sqd);
	sqd->thread = NULL;
	rcu_assign_pointer(sqd->thread, NULL);
	put_task_struct(current);
	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
		atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
	io_run_task_work();
@@ -484,7 +494,10 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
			goto err_sqpoll;
		}

		sqd->thread = tsk;
		mutex_lock(&sqd->lock);
		rcu_assign_pointer(sqd->thread, tsk);
		mutex_unlock(&sqd->lock);

		task_to_put = get_task_struct(tsk);
		ret = io_uring_alloc_task_context(tsk, ctx);
		wake_up_new_task(tsk);
@@ -495,9 +508,6 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
		ret = -EINVAL;
		goto err;
	}

	if (task_to_put)
		put_task_struct(task_to_put);
	return 0;
err_sqpoll:
	complete(&ctx->sq_data->exited);
@@ -515,10 +525,13 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
	int ret = -EINVAL;

	if (sqd) {
		struct task_struct *tsk;

		io_sq_thread_park(sqd);
		/* Don't set affinity for a dying thread */
		if (sqd->thread)
			ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
		tsk = sqpoll_task_locked(sqd);
		if (tsk)
			ret = io_wq_cpu_affinity(tsk->io_uring, mask);
		io_sq_thread_unpark(sqd);
	}

Loading