Commit de95ad90 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched_ext-for-7.1-rc2-fixes' of...

Merge tag 'sched_ext-for-7.1-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Fix idle CPU selection returning prev_cpu outside the task's cpus_ptr
   when the BPF caller's allowed mask was wider. Stable backport.

 - Fix two opposite-direction gaps in scx_task_iter's cgroup-scoped mode
   versus the global mode:

    - Tasks past exit_signals() are filtered out by the cgroup walk but
      kept by the global walk, so a sub-scheduler enable abort leaked
      __scx_init_task() state. Add a CSS_TASK_ITER_WITH_DEAD flag to
      cgroup's task iterator (scx_task_iter is its only user) and use it.

    - Tasks past sched_ext_dead() are still returned, tripping
      WARN_ON_ONCE() in callers or making them touch torn-down state.
      Mark such tasks and skip them under the per-task rq lock.

* tag 'sched_ext-for-7.1-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: idle: Recheck prev_cpu after narrowing allowed mask
  sched_ext: Skip past-sched_ext_dead() tasks in scx_task_iter_next_locked()
  cgroup, sched_ext: Include exiting tasks in cgroup iter
parents 50fb0bcc b34c8277
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@ struct kernel_clone_args;
enum css_task_iter_flags {
	/* walk only threadgroup leaders */
	CSS_TASK_ITER_PROCS	= 1U << 0,
	/* walk all threaded css_sets in the domain */
	CSS_TASK_ITER_THREADED	= 1U << 1,
	/* also walk exiting tasks, which are hidden by default */
	CSS_TASK_ITER_WITH_DEAD	= 1U << 2,
	/* internal flags */
	CSS_TASK_ITER_SKIPPED	= 1U << 16,
};

+1 −0
Original line number Diff line number Diff line
@@ -101,6 +101,7 @@ enum scx_ent_flags {
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
	SCX_TASK_SUB_INIT	= 1 << 4, /* task being initialized for a sub sched */
	SCX_TASK_IMMED		= 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */
	SCX_TASK_OFF_TASKS	= 1 << 6, /* removed from scx_tasks by sched_ext_dead() */

	/*
	 * Bits 8 and 9 are used to carry task state:
+5 −3
Original line number Diff line number Diff line
@@ -5067,10 +5067,12 @@ static void css_task_iter_advance(struct css_task_iter *it)

	task = list_entry(it->task_pos, struct task_struct, cg_list);
	/*
	 * Hide tasks that are exiting but not yet removed. Keep zombie
	 * leaders with live threads visible.
	 * Hide tasks that are exiting but not yet removed by default. Keep
	 * zombie leaders with live threads visible. Usages that need to walk
	 * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD.
	 */
	if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
	if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) &&
	    (task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
		goto repeat;

	if (it->flags & CSS_TASK_ITER_PROCS) {
+29 −10
Original line number Diff line number Diff line
@@ -766,7 +766,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
		lockdep_assert_held(&cgroup_mutex);
		iter->cgrp = cgrp;
		iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
		css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
		css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
				    &iter->css_iter);
		return;
	}
#endif
@@ -866,7 +867,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
			iter->css_pos = css_next_descendant_pre(iter->css_pos,
								&iter->cgrp->self);
			if (iter->css_pos)
				css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
				css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
						    &iter->css_iter);
		}
		return NULL;
	}
@@ -926,17 +928,28 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
		 *
		 * Test for idle_sched_class as only init_tasks are on it.
		 */
		if (p->sched_class != &idle_sched_class)
			break;
	}
	if (!p)
		return NULL;
		if (p->sched_class == &idle_sched_class)
			continue;

		iter->rq = task_rq_lock(p, &iter->rf);
		iter->locked_task = p;

		/*
		 * cgroup_task_dead() removes the dead tasks from cset->tasks
		 * after sched_ext_dead() and cgroup iteration may see tasks
		 * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS
		 * is set by sched_ext_dead() under @p's rq lock. Test it to
		 * avoid visiting tasks which are already dead from SCX POV.
		 */
		if (p->scx.flags & SCX_TASK_OFF_TASKS) {
			__scx_task_iter_rq_unlock(iter);
			continue;
		}

		return p;
	}
	return NULL;
}

/**
 * scx_add_event - Increase an event counter for 'name' by 'cnt'
@@ -3848,6 +3861,11 @@ void sched_ext_dead(struct task_struct *p)
	/*
	 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
	 * ENABLED transitions can't race us. Disable ops for @p.
	 *
	 * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see
	 * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup
	 * iteration is only used from sub-sched paths, which require root
	 * enabled. Root enable transitions every live task to at least READY.
	 */
	if (scx_get_task_state(p) != SCX_TASK_NONE) {
		struct rq_flags rf;
@@ -3855,6 +3873,7 @@ void sched_ext_dead(struct task_struct *p)

		rq = task_rq_lock(p, &rf);
		scx_disable_and_exit_task(scx_task_sched(p), p);
		p->scx.flags |= SCX_TASK_OFF_TASKS;
		task_rq_unlock(rq, p, &rf);
	}
}
+6 −6
Original line number Diff line number Diff line
@@ -465,12 +465,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,

	preempt_disable();

	/*
	 * Check whether @prev_cpu is still within the allowed set. If not,
	 * we can still try selecting a nearby CPU.
	 */
	is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);

	/*
	 * Determine the subset of CPUs usable by @p within @cpus_allowed.
	 */
@@ -487,6 +481,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
		}
	}

	/*
	 * Check whether @prev_cpu is still within the allowed set. If not,
	 * we can still try selecting a nearby CPU.
	 */
	is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);

	/*
	 * This is necessary to protect llc_cpus.
	 */