Commit c6508124 authored by Johannes Weiner's avatar Johannes Weiner Committed by Ingo Molnar
Browse files

sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug



Since sched_delayed tasks remain queued even after blocking, the load
balancer can migrate them between runqueues while PSI considers them
to be asleep. As a result, it misreads the migration requeue followed
by a wakeup as a double queue:

  psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4

First, call psi_enqueue() after p->sched_class->enqueue_task(). A
wakeup will clear p->se.sched_delayed while a migration will not, so
psi can use that flag to tell them apart.

Then teach psi to migrate any "sleep" state when delayed-dequeue tasks
are being migrated.

Delayed-dequeue tasks can be revived by ttwu_runnable(), which will
call down with a new ENQUEUE_DELAYED. Instead of further complicating
the wakeup conditional in enqueue_task(), identify migration contexts
instead and default to wakeup handling for all other cases.

It's not just the warning in dmesg, the task state corruption causes a
permanent CPU pressure indication, which messes with workload/machine
health monitoring.

Debugged-by-and-original-fix-by: default avatarK Prateek Nayak <kprateek.nayak@amd.com>
Fixes: 152e11f6 ("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/


Signed-off-by: default avatarJohannes Weiner <hannes@cmpxchg.org>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
Tested-by: default avatarK Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20241010193712.GC181795@cmpxchg.org
parent f5aaff7b
Loading
Loading
Loading
Loading
+6 −6
Original line number Diff line number Diff line
@@ -2012,11 +2012,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & ENQUEUE_RESTORE)) {
		sched_info_enqueue(rq, p);
		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
	}

	p->sched_class->enqueue_task(rq, p, flags);
	/*
	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
@@ -2024,6 +2019,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
	 */
	uclamp_rq_inc(rq, p);

	if (!(flags & ENQUEUE_RESTORE)) {
		sched_info_enqueue(rq, p);
		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
	}

	if (sched_core_enabled(rq))
		sched_core_enqueue(rq, p);
}
@@ -2041,7 +2041,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)

	if (!(flags & DEQUEUE_SAVE)) {
		sched_info_dequeue(rq, p);
		psi_dequeue(p, flags & DEQUEUE_SLEEP);
		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
	}

	/*
+33 −15
Original line number Diff line number Diff line
@@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
/*
 * PSI tracks state that persists across sleeps, such as iowaits and
 * memory stalls. As a result, it has to distinguish between sleeps,
 * where a task's runnable state changes, and requeues, where a task
 * and its state are being moved between CPUs and runqueues.
 * where a task's runnable state changes, and migrations, where a task
 * and its runnable state are being moved between CPUs and runqueues.
 *
 * A notable case is a task whose dequeue is delayed. PSI considers
 * those sleeping, but because they are still on the runqueue they can
 * go through migration requeues. In this case, *sleeping* states need
 * to be transferred.
 */
static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_enqueue(struct task_struct *p, bool migrate)
{
	int clear = 0, set = TSK_RUNNING;
	int clear = 0, set = 0;

	if (static_branch_likely(&psi_disabled))
		return;

	if (p->in_memstall)
		set |= TSK_MEMSTALL_RUNNING;

	if (!wakeup) {
	if (p->se.sched_delayed) {
		/* CPU migration of "sleeping" task */
		SCHED_WARN_ON(!migrate);
		if (p->in_memstall)
			set |= TSK_MEMSTALL;
		if (p->in_iowait)
			set |= TSK_IOWAIT;
	} else if (migrate) {
		/* CPU migration of runnable task */
		set = TSK_RUNNING;
		if (p->in_memstall)
			set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
	} else {
		/* Wakeup of new or sleeping task */
		if (p->in_iowait)
			clear |= TSK_IOWAIT;
		set = TSK_RUNNING;
		if (p->in_memstall)
			set |= TSK_MEMSTALL_RUNNING;
	}

	psi_task_change(p, clear, set);
}

static inline void psi_dequeue(struct task_struct *p, bool sleep)
static inline void psi_dequeue(struct task_struct *p, bool migrate)
{
	if (static_branch_likely(&psi_disabled))
		return;

	/*
	 * When migrating a task to another CPU, clear all psi
	 * state. The enqueue callback above will work it out.
	 */
	if (migrate)
		psi_task_change(p, p->psi_flags, 0);

	/*
	 * A voluntary sleep is a dequeue followed by a task switch. To
	 * avoid walking all ancestors twice, psi_task_switch() handles
	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
	 * Do nothing here.
	 */
	if (sleep)
		return;

	psi_task_change(p, p->psi_flags, 0);
}

static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev,
}

#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
				    struct task_struct *next,