Commit 3c7b4d19 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched_ext-for-6.19-rc8-fixes' of...

Merge tag 'sched_ext-for-6.19-rc8-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fix from Tejun Heo:

 - Fix race where sched_class operations (sched_setscheduler() and
   friends) could be invoked on dead tasks after sched_ext_dead()
   already ran, causing invalid SCX task state transitions and NULL
   pointer dereferences.

   This was a regression from the cgroup exit ordering fix which
   moved sched_ext_free() to finish_task_switch().

* tag 'sched_ext-for-6.19-rc8-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Short-circuit sched_class operations on dead tasks
parents 27db1ae6 0eca95cb
Loading
Loading
Loading
Loading
+48 −0
Original line number Diff line number Diff line
@@ -194,6 +194,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
#include <trace/events/sched_ext.h>

static void process_ddsp_deferred_locals(struct rq *rq);
static bool task_dead_and_done(struct task_struct *p);
static u32 reenq_local(struct rq *rq);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -2619,6 +2620,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,

	set_cpus_allowed_common(p, ac);

	if (task_dead_and_done(p))
		return;

	/*
	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -3034,10 +3038,45 @@ void scx_cancel_fork(struct task_struct *p)
	percpu_up_read(&scx_fork_rwsem);
}

/**
 * task_dead_and_done - Is a task dead and done running?
 * @p: target task
 *
 * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
 * task no longer exists from SCX's POV. However, certain sched_class ops may be
 * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
 * may try to switch a task which finished sched_ext_dead() back into SCX
 * triggering invalid SCX task state transitions and worse.
 *
 * Once a task has finished the final switch, sched_ext_dead() is the only thing
 * that needs to happen on the task. Use this test to short-circuit sched_class
 * operations which may be called on dead tasks.
 */
static bool task_dead_and_done(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	lockdep_assert_rq_held(rq);

	/*
	 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
	 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p
	 * won't ever run again.
	 */
	return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
		!task_on_cpu(rq, p);
}

void sched_ext_dead(struct task_struct *p)
{
	unsigned long flags;

	/*
	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
	 * for the last time and then dropped the rq lock - task_dead_and_done()
	 * should be returning %true nullifying the straggling sched_class ops.
	 * Remove from scx_tasks and exit @p.
	 */
	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
	list_del_init(&p->scx.tasks_node);
	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
@@ -3063,6 +3102,9 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,

	lockdep_assert_rq_held(task_rq(p));

	if (task_dead_and_done(p))
		return;

	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
	if (SCX_HAS_OP(sch, set_weight))
		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
@@ -3077,6 +3119,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
	struct scx_sched *sch = scx_root;

	if (task_dead_and_done(p))
		return;

	scx_enable_task(p);

	/*
@@ -3090,6 +3135,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)

static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
	if (task_dead_and_done(p))
		return;

	scx_disable_task(p);
}