Commit 33e83ffe authored by Linus Torvalds
Browse files

Merge tag 'sched-urgent-2024-11-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

 - Plug a race between pick_next_task_fair() and try_to_wake_up() where
   both write to the same task. Each path holds a runqueue lock, but the
   locks belong to different runqueues and so do not serialize them.

   The problem is that the store to task::on_rq in __block_task() is
   visible to try_to_wake_up() which assumes that the task is not
   queued. Both sides then operate on the same task.

   Cure it by rearranging __block_task() so that the store to task::on_rq
   is the last operation on the task.

 - Prevent a potential NULL pointer dereference in task_numa_work()

   task_numa_work() iterates the VMAs of a process. A concurrent unmap
   of the address space can result in a NULL pointer return from
   vma_next() which is unchecked.

   Add the missing NULL pointer check to prevent this.

 - Operate on the correct scheduler policy in task_should_scx()

   task_should_scx() returns true when a task should be handled by sched
   EXT. It checks the task's scheduling policy.

   This fails when the check is done before a policy has been set.

   Cure it by handing the policy into task_should_scx() so it operates
   on the requested value.

 - Add the missing handling of sched EXT in the delayed dequeue
   mechanism. This was simply forgotten.

* tag 'sched-urgent-2024-11-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/ext: Fix scx vs sched_delayed
  sched: Pass correct scheduling policy to __setscheduler_class
  sched/numa: Fix the potential null pointer dereference in task_numa_work()
  sched: Fix pick_next_task_fair() vs try_to_wake_up() race
parents 68f05b25 69d5e722
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -4711,7 +4711,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
	if (rt_prio(p->prio)) {
		p->sched_class = &rt_sched_class;
#ifdef CONFIG_SCHED_CLASS_EXT
	} else if (task_should_scx(p)) {
	} else if (task_should_scx(p->policy)) {
		p->sched_class = &ext_sched_class;
#endif
	} else {
@@ -7025,7 +7025,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);

const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
const struct sched_class *__setscheduler_class(int policy, int prio)
{
	if (dl_prio(prio))
		return &dl_sched_class;
@@ -7034,7 +7034,7 @@ const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
		return &rt_sched_class;

#ifdef CONFIG_SCHED_CLASS_EXT
	if (task_should_scx(p))
	if (task_should_scx(policy))
		return &ext_sched_class;
#endif

@@ -7142,7 +7142,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
		queue_flag &= ~DEQUEUE_MOVE;

	prev_class = p->sched_class;
	next_class = __setscheduler_class(p, prio);
	next_class = __setscheduler_class(p->policy, prio);

	if (prev_class != next_class && p->se.sched_delayed)
		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+14 −4
Original line number Diff line number Diff line
@@ -4257,14 +4257,14 @@ static const struct kset_uevent_ops scx_uevent_ops = {
 * Used by sched_fork() and __setscheduler_prio() to pick the matching
 * sched_class. dl/rt are already handled.
 */
bool task_should_scx(struct task_struct *p)
bool task_should_scx(int policy)
{
	if (!scx_enabled() ||
	    unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
		return false;
	if (READ_ONCE(scx_switching_all))
		return true;
	return p->policy == SCHED_EXT;
	return policy == SCHED_EXT;
}

/**
@@ -4494,11 +4494,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
	scx_task_iter_start(&sti);
	while ((p = scx_task_iter_next_locked(&sti))) {
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class =
			__setscheduler_class(p->policy, p->prio);
		struct sched_enq_and_set_ctx ctx;

		if (old_class != new_class && p->se.sched_delayed)
			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);

		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

		p->sched_class = __setscheduler_class(p, p->prio);
		p->sched_class = new_class;
		check_class_changing(task_rq(p), p, old_class);

		sched_enq_and_set_task(&ctx);
@@ -5204,12 +5209,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
	scx_task_iter_start(&sti);
	while ((p = scx_task_iter_next_locked(&sti))) {
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class =
			__setscheduler_class(p->policy, p->prio);
		struct sched_enq_and_set_ctx ctx;

		if (old_class != new_class && p->se.sched_delayed)
			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);

		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

		p->scx.slice = SCX_SLICE_DFL;
		p->sched_class = __setscheduler_class(p, p->prio);
		p->sched_class = new_class;
		check_class_changing(task_rq(p), p, old_class);

		sched_enq_and_set_task(&ctx);
+1 −1
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq);
void scx_rq_activate(struct rq *rq);
void scx_rq_deactivate(struct rq *rq);
int scx_check_setscheduler(struct task_struct *p, int policy);
bool task_should_scx(struct task_struct *p);
bool task_should_scx(int policy);
void init_sched_ext_class(void);

static inline u32 scx_cpuperf_target(s32 cpu)
+16 −9
Original line number Diff line number Diff line
@@ -3369,7 +3369,7 @@ static void task_numa_work(struct callback_head *work)
		vma = vma_next(&vmi);
	}

	do {
	for (; vma; vma = vma_next(&vmi)) {
		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
@@ -3491,7 +3491,7 @@ static void task_numa_work(struct callback_head *work)
		 */
		if (vma_pids_forced)
			break;
	} for_each_vma(vmi, vma);
	}

	/*
	 * If no VMAs are remaining and VMAs were skipped due to the PID
@@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
	struct sched_entity *se = pick_eevdf(cfs_rq);
	if (se->sched_delayed) {
		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
		SCHED_WARN_ON(se->sched_delayed);
		SCHED_WARN_ON(se->on_rq);
		/*
		 * Must not reference @se again, see __block_task().
		 */
		return NULL;
	}
	return se;
@@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
		/* Fix-up what dequeue_task_fair() skipped */
		hrtick_update(rq);

		/* Fix-up what block_task() skipped. */
		/*
		 * Fix-up what block_task() skipped.
		 *
		 * Must be last, @p might not be valid after this.
		 */
		__block_task(rq, p);
	}

@@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
		util_est_dequeue(&rq->cfs, p);

	if (dequeue_entities(rq, &p->se, flags) < 0) {
		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
	if (dequeue_entities(rq, &p->se, flags) < 0)
		return false;
	}

	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
	/*
	 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
	 */

	hrtick_update(rq);
	return true;
}
+33 −3
Original line number Diff line number Diff line
@@ -2769,8 +2769,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)

static inline void __block_task(struct rq *rq, struct task_struct *p)
{
	WRITE_ONCE(p->on_rq, 0);
	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible++;

@@ -2778,6 +2776,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p)
		atomic_inc(&rq->nr_iowait);
		delayacct_blkio_start();
	}

	ASSERT_EXCLUSIVE_WRITER(p->on_rq);

	/*
	 * The moment this write goes through, ttwu() can swoop in and migrate
	 * this task, rendering our rq->__lock ineffective.
	 *
	 * __schedule()				try_to_wake_up()
	 *   LOCK rq->__lock			  LOCK p->pi_lock
	 *   pick_next_task()
	 *     pick_next_task_fair()
	 *       pick_next_entity()
	 *         dequeue_entities()
	 *           __block_task()
	 *             RELEASE p->on_rq = 0	  if (p->on_rq && ...)
	 *					    break;
	 *
	 *					  ACQUIRE (after ctrl-dep)
	 *
	 *					  cpu = select_task_rq();
	 *					  set_task_cpu(p, cpu);
	 *					  ttwu_queue()
	 *					    ttwu_do_activate()
	 *					      LOCK rq->__lock
	 *					      activate_task()
	 *					        STORE p->on_rq = 1
	 *   UNLOCK rq->__lock
	 *
	 * Callers must ensure to not reference @p after this -- we no longer
	 * own it.
	 */
	smp_store_release(&p->on_rq, 0);
}

extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -3800,7 +3830,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)

extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
extern const struct sched_class *__setscheduler_class(int policy, int prio);
extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
Loading