Commit b55945c5 authored by Peter Zijlstra
Browse files

sched: Fix pick_next_task_fair() vs try_to_wake_up() race



Syzkaller robot reported KCSAN tripping over the
ASSERT_EXCLUSIVE_WRITER(p->on_rq) in __block_task().

The report noted that both pick_next_task_fair() and try_to_wake_up()
were concurrently trying to write to the same p->on_rq, violating the
assertion -- even though both paths hold rq->__lock.

The logical consequence is that both code paths end up holding a
different rq->__lock. And looking through ttwu(), this is possible
when the __block_task() 'p->on_rq = 0' store is visible to the ttwu()
'p->on_rq' load, which then assumes the task is not queued and
continues to migrate it.

Rearrange things such that __block_task() releases @p with the store
and no code thereafter will use @p again.

Fixes: 152e11f6 ("sched/fair: Implement delayed dequeue")
Reported-by: <syzbot+0ec1e96c2cdf5c0e512a@syzkaller.appspotmail.com>
Reported-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Marco Elver <elver@google.com>
Link: https://lkml.kernel.org/r/20241023093641.GE16066@noisy.programming.kicks-ass.net
parent 42f7652d
Loading
Loading
Loading
Loading
+14 −7
Original line number Diff line number Diff line
@@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
	struct sched_entity *se = pick_eevdf(cfs_rq);
	if (se->sched_delayed) {
		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
		SCHED_WARN_ON(se->sched_delayed);
		SCHED_WARN_ON(se->on_rq);
		/*
		 * Must not reference @se again, see __block_task().
		 */
		return NULL;
	}
	return se;
@@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
		/* Fix-up what dequeue_task_fair() skipped */
		hrtick_update(rq);

		/* Fix-up what block_task() skipped. */
		/*
		 * Fix-up what block_task() skipped.
		 *
		 * Must be last, @p might not be valid after this.
		 */
		__block_task(rq, p);
	}

@@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
		util_est_dequeue(&rq->cfs, p);

	if (dequeue_entities(rq, &p->se, flags) < 0) {
		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
	if (dequeue_entities(rq, &p->se, flags) < 0)
		return false;
	}

	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
	/*
	 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
	 */

	hrtick_update(rq);
	return true;
}
+32 −2
Original line number Diff line number Diff line
@@ -2769,8 +2769,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)

static inline void __block_task(struct rq *rq, struct task_struct *p)
{
	WRITE_ONCE(p->on_rq, 0);
	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible++;

@@ -2778,6 +2776,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p)
		atomic_inc(&rq->nr_iowait);
		delayacct_blkio_start();
	}

	ASSERT_EXCLUSIVE_WRITER(p->on_rq);

	/*
	 * The moment this write goes through, ttwu() can swoop in and migrate
	 * this task, rendering our rq->__lock ineffective.
	 *
	 * __schedule()				try_to_wake_up()
	 *   LOCK rq->__lock			  LOCK p->pi_lock
	 *   pick_next_task()
	 *     pick_next_task_fair()
	 *       pick_next_entity()
	 *         dequeue_entities()
	 *           __block_task()
	 *             RELEASE p->on_rq = 0	  if (p->on_rq && ...)
	 *					    break;
	 *
	 *					  ACQUIRE (after ctrl-dep)
	 *
	 *					  cpu = select_task_rq();
	 *					  set_task_cpu(p, cpu);
	 *					  ttwu_queue()
	 *					    ttwu_do_activate()
	 *					      LOCK rq->__lock
	 *					      activate_task()
	 *					        STORE p->on_rq = 1
	 *   UNLOCK rq->__lock
	 *
	 * Callers must ensure to not reference @p after this -- we no longer
	 * own it.
	 */
	smp_store_release(&p->on_rq, 0);
}

extern void activate_task(struct rq *rq, struct task_struct *p, int flags);