Commit 837c8180 authored by Linus Torvalds
Browse files

Merge tag 'sched-urgent-2026-01-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Misc deadline scheduler fixes, mainly for a new category of bugs that
  were discovered and fixed recently:

   - Fix a race condition in the DL server

   - Fix a DL server bug which can result in incorrectly going idle when
     there's work available

   - Fix a DL server bug which triggers a WARN() due to broken
     get_prio_dl() logic and subsequent misbehavior

   - Fix double update_rq_clock() calls

   - Fix setscheduler() assumption about static priorities

   - Make sure balancing callbacks are always called

   - Plus a handful of preparatory commits for the fixes"

* tag 'sched-urgent-2026-01-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Use ENQUEUE_MOVE to allow priority change
  sched: Deadline has dynamic priority
  sched: Audit MOVE vs balance_callbacks
  sched: Fold rq-pin swizzle into __balance_callbacks()
  sched/deadline: Avoid double update_rq_clock()
  sched/deadline: Ensure get_prio_dl() is up-to-date
  sched/deadline: Fix server stopping with runnable tasks
  sched: Provide idle_rq() helper
  sched/deadline: Fix potential race in dl_add_task_root_domain()
  sched/deadline: Remove unnecessary comment in dl_add_task_root_domain()
parents cee47579 627cc25f
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -1874,7 +1874,6 @@ static inline int task_nice(const struct task_struct *p)
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
+11 −7
Original line number Diff line number Diff line
@@ -4950,9 +4950,13 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
	return __splice_balance_callbacks(rq, true);
}

/*
 * Splice the callbacks off @rq via __splice_balance_callbacks(rq, false)
 * and hand them to do_balance_callbacks().
 *
 * @rq: runqueue whose callbacks are processed.
 * @rf: optional rq_flags. When non-NULL, the lock pin is dropped with
 *      rq_unpin_lock() before the callbacks are processed and taken again
 *      with rq_repin_lock() afterwards. When NULL, no pin handling is done
 *      here and the caller is responsible for it.
 */
static void __balance_callbacks(struct rq *rq)
void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
	if (rf)
		rq_unpin_lock(rq, rf);
	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
	if (rf)
		rq_repin_lock(rq, rf);
}

void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -4991,7 +4995,7 @@ static inline void finish_lock_switch(struct rq *rq)
	 * prev into current:
	 */
	spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
	__balance_callbacks(rq);
	__balance_callbacks(rq, NULL);
	raw_spin_rq_unlock_irq(rq);
}

@@ -6867,7 +6871,7 @@ static void __sched notrace __schedule(int sched_mode)
			proxy_tag_curr(rq, next);

		rq_unpin_lock(rq, &rf);
		__balance_callbacks(rq);
		__balance_callbacks(rq, NULL);
		raw_spin_rq_unlock_irq(rq);
	}
	trace_sched_exit_tp(is_switch);
@@ -7316,7 +7320,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
	trace_sched_pi_setprio(p, pi_task);
	oldprio = p->prio;

	if (oldprio == prio)
	if (oldprio == prio && !dl_prio(prio))
		queue_flag &= ~DEQUEUE_MOVE;

	prev_class = p->sched_class;
@@ -7362,9 +7366,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
	/* Caller holds task_struct::pi_lock, IRQs are still disabled */

	rq_unpin_lock(rq, &rf);
	__balance_callbacks(rq);
	rq_repin_lock(rq, &rf);
	__balance_callbacks(rq, &rf);
	__task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_RT_MUTEXES */
@@ -9124,6 +9126,8 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)

	if (resched)
		resched_curr(rq);

	__balance_callbacks(rq, &rq_guard.rf);
}

static struct cgroup_subsys_state *
+19 −17
Original line number Diff line number Diff line
@@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
	struct rq *rq = rq_of_dl_rq(dl_rq);

	update_rq_clock(rq);

	WARN_ON(is_dl_boosted(dl_se));
	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));

@@ -1420,7 +1418,7 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int

static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
	bool idle = rq->curr == rq->idle;
	bool idle = idle_rq(rq);
	s64 scaled_delta_exec;

	if (unlikely(delta_exec <= 0)) {
@@ -1604,7 +1602,7 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
 * |   |                                | <---+    |
 * |   +--------------------------------+          |
 * |     |              ^         ^       2        |
 * |     | 7            | 2   +--------------------+
 * |     | 7            | 2, 1    +----------------+
 * |     v              |
 * |   +-------------+  |
 * +-- | C:idle-wait | -+
@@ -1649,8 +1647,11 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
 *   dl_defer_idle = 0
 *
 *
 * [1] A->B, A->D
 * [1] A->B, A->D, C->B
 * dl_server_start()
 *   dl_defer_idle = 0;
 *   if (dl_server_active)
 *     return; // [B]
 *   dl_server_active = 1;
 *   enqueue_dl_entity()
 *     update_dl_entity(WAKEUP)
@@ -1759,6 +1760,7 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
 *   "B:zero_laxity-wait" -> "C:idle-wait"        [label="7:dl_server_update_idle"]
 *   "B:zero_laxity-wait" -> "D:running"          [label="3:dl_server_timer"]
 *   "C:idle-wait" -> "A:init"                    [label="8:dl_server_timer"]
 *   "C:idle-wait" -> "B:zero_laxity-wait"        [label="1:dl_server_start"]
 *   "C:idle-wait" -> "B:zero_laxity-wait"        [label="2:dl_server_update"]
 *   "C:idle-wait" -> "C:idle-wait"               [label="7:dl_server_update_idle"]
 *   "D:running" -> "A:init"                      [label="4:pick_task_dl"]
@@ -1784,6 +1786,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
{
	struct rq *rq = dl_se->rq;

	dl_se->dl_defer_idle = 0;
	if (!dl_server(dl_se) || dl_se->dl_server_active)
		return;

@@ -1834,6 +1837,7 @@ void sched_init_dl_servers(void)
		rq = cpu_rq(cpu);

		guard(rq_lock_irq)(rq);
		update_rq_clock(rq);

		dl_se = &rq->fair_server;

@@ -2210,7 +2214,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
		update_dl_entity(dl_se);
	} else if (flags & ENQUEUE_REPLENISH) {
		replenish_dl_entity(dl_se);
	} else if ((flags & ENQUEUE_RESTORE) &&
	} else if ((flags & ENQUEUE_MOVE) &&
		   !is_dl_boosted(dl_se) &&
		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
		setup_new_dl_entity(dl_se);
@@ -3154,7 +3158,7 @@ void dl_add_task_root_domain(struct task_struct *p)
	struct rq *rq;
	struct dl_bw *dl_b;
	unsigned int cpu;
	struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
	struct cpumask *msk;

	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
	if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
@@ -3162,20 +3166,12 @@ void dl_add_task_root_domain(struct task_struct *p)
		return;
	}

	/*
	 * Get an active rq, whose rq->rd traces the correct root
	 * domain.
	 * Ideally this would be under cpuset reader lock until rq->rd is
	 * fetched.  However, sleepable locks cannot nest inside pi_lock, so we
	 * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex'
	 * to guarantee the CPU stays in the cpuset.
	 */
	msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
	dl_get_task_effective_cpus(p, msk);
	cpu = cpumask_first_and(cpu_active_mask, msk);
	BUG_ON(cpu >= nr_cpu_ids);
	rq = cpu_rq(cpu);
	dl_b = &rq->rd->dl_bw;
	/* End of fetching rd */

	raw_spin_lock(&dl_b->lock);
	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
@@ -3299,6 +3295,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)

/*
 * Return the value used as @p's deadline-class priority: p->dl.deadline.
 *
 * When @p is the current donor of @rq (task_current_donor()), update_curr_dl()
 * is called first so the deadline read below is not stale. The update must
 * stay ahead of the read.
 */
static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
{
	/*
	 * Make sure to update current so we don't return a stale value.
	 */
	if (task_current_donor(rq, p))
		update_curr_dl(rq);

	return p->dl.deadline;
}

+1 −0
Original line number Diff line number Diff line
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
	if (iter->locked_task) {
		__balance_callbacks(iter->rq, &iter->rf);
		task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
		iter->locked_task = NULL;
	}
+26 −1
Original line number Diff line number Diff line
@@ -1364,6 +1364,28 @@ static inline u32 sched_rng(void)
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		raw_cpu_ptr(&runqueues)

/*
 * idle_rq - test whether @rq is idle.
 *
 * True only when all three hold: the task in rq->curr is rq->idle,
 * rq->nr_running is zero, and rq->ttwu_pending is zero.
 */
static inline bool idle_rq(struct rq *rq)
{
	/* Something other than the idle task is current. */
	if (rq->curr != rq->idle)
		return false;

	/* Tasks are accounted on this runqueue. */
	if (rq->nr_running)
		return false;

	/* Idle unless ttwu_pending is set. */
	return !rq->ttwu_pending;
}

/**
 * available_idle_cpu - is a given CPU idle and available for enqueuing work.
 * @cpu: the CPU in question.
 *
 * A CPU qualifies only when its runqueue passes idle_rq() and
 * vcpu_is_preempted() does not report the CPU as preempted.
 *
 * Return: true if the CPU is idle and not preempted, false otherwise.
 */
static inline bool available_idle_cpu(int cpu)
{
	/*
	 * The runqueue check comes first; vcpu_is_preempted() is only
	 * consulted once the runqueue is known to be idle.
	 */
	return idle_rq(cpu_rq(cpu)) && !vcpu_is_preempted(cpu);
}

#ifdef CONFIG_SCHED_PROXY_EXEC
static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
{
@@ -2366,7 +2388,8 @@ extern const u32 sched_prio_to_wmult[40];
 *                should preserve as much state as possible.
 *
 * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
 *        in the runqueue.
 *        in the runqueue. IOW the priority is allowed to change. Callers
 *        must expect to deal with balance callbacks.
 *
 * NOCLOCK - skip the update_rq_clock() (avoids double updates)
 *
@@ -3947,6 +3970,8 @@ extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);

extern struct balance_callback *splice_balance_callbacks(struct rq *rq);

extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);

/*
Loading