Commit 3022e9d0 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched_ext-for-6.12-rc7-fixes' of...

Merge tag 'sched_ext-for-6.12-rc7-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - The fair sched class currently has a bug where its balance() returns
   true, telling the sched core that it has tasks to run, but then its
   pick_task() returns NULL. This makes sched core call sched_ext's
   pick_task() without a preceding balance(), which can lead to stalls
   in partial mode.

   For now, work around this by detecting the condition and forcing the
   CPU to go through another scheduling cycle.

 - Add a missing newline to an error message and fix the drgn
   introspection tool, which had gone out of sync.

* tag 'sched_ext-for-6.12-rc7-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
  sched_ext: Update scx_show_state.py to match scx_ops_bypass_depth's new type
  sched_ext: Add a missing newline at the end of an error message
parents 0ccd733a a6250aa2
Loading
Loading
Loading
Loading
+8 −5
Original line number Diff line number Diff line
@@ -5920,12 +5920,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,

#ifdef CONFIG_SCHED_CLASS_EXT
	/*
	 * SCX requires a balance() call before every pick_next_task() including
	 * when waking up from SCHED_IDLE. If @start_class is below SCX, start
	 * from SCX instead.
	 * SCX requires a balance() call before every pick_task() including when
	 * waking up from SCHED_IDLE. If @start_class is below SCX, start from
	 * SCX instead. Also, set a flag to detect missing balance() call.
	 */
	if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
	if (scx_enabled()) {
		rq->scx.flags |= SCX_RQ_BAL_PENDING;
		if (sched_class_above(&ext_sched_class, start_class))
			start_class = &ext_sched_class;
	}
#endif

	/*
+32 −14
Original line number Diff line number Diff line
@@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)

	lockdep_assert_rq_held(rq);
	rq->scx.flags |= SCX_RQ_IN_BALANCE;
	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
	rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);

	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
	    unlikely(rq->scx.cpu_released)) {
@@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
{
	struct task_struct *prev = rq->curr;
	struct task_struct *p;
	bool prev_on_scx = prev->sched_class == &ext_sched_class;
	bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
	bool kick_idle = false;

	/*
	 * If balance_scx() is telling us to keep running @prev, replenish slice
	 * if necessary and keep running @prev. Otherwise, pop the first one
	 * from the local DSQ.
	 *
	 * WORKAROUND:
	 *
	 * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
	 * which then ends up calling pick_task_scx() without preceding
	 * balance_scx().
	 *
	 * For now, ignore cases where $prev is not on SCX. This isn't great and
	 * can theoretically lead to stalls. However, for switch_all cases, this
	 * happens only while a BPF scheduler is being loaded or unloaded, and,
	 * for partial cases, fair will likely keep triggering this CPU.
	 * Keep running @prev if possible and avoid stalling from entering idle
	 * without balancing.
	 *
	 * Once fair is fixed, restore WARN_ON_ONCE().
	 * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
	 * if pick_task_scx() is called without preceding balance_scx().
	 */
	if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
	    prev->sched_class == &ext_sched_class) {
	if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
		if (prev_on_scx) {
			keep_prev = true;
		} else {
			keep_prev = false;
			kick_idle = true;
		}
	} else if (unlikely(keep_prev && !prev_on_scx)) {
		/* only allowed during transitions */
		WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
		keep_prev = false;
	}

	/*
	 * If balance_scx() is telling us to keep running @prev, replenish slice
	 * if necessary and keep running @prev. Otherwise, pop the first one
	 * from the local DSQ.
	 */
	if (keep_prev) {
		p = prev;
		if (!p->scx.slice)
			p->scx.slice = SCX_SLICE_DFL;
	} else {
		p = first_local_task(rq);
		if (!p)
		if (!p) {
			if (kick_idle)
				scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
			return NULL;
		}

		if (unlikely(!p->scx.slice)) {
			if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
@@ -4979,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

	if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
			   cpu_possible_mask)) {
		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
		return -EINVAL;
	}

+3 −2
Original line number Diff line number Diff line
@@ -751,8 +751,9 @@ enum scx_rq_flags {
	 */
	SCX_RQ_ONLINE		= 1 << 0,
	SCX_RQ_CAN_STOP_TICK	= 1 << 1,
	SCX_RQ_BAL_KEEP		= 1 << 2, /* balance decided to keep current */
	SCX_RQ_BYPASSING	= 1 << 3,
	SCX_RQ_BAL_PENDING	= 1 << 2, /* balance hasn't run yet */
	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
	SCX_RQ_BYPASSING	= 1 << 4,

	SCX_RQ_IN_WAKEUP	= 1 << 16,
	SCX_RQ_IN_BALANCE	= 1 << 17,
+1 −1
Original line number Diff line number Diff line
@@ -35,6 +35,6 @@ print(f'enabled : {read_static_key("__scx_ops_enabled")}')
print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all  : {read_static_key("__scx_switched_all")}')
print(f'enable_state  : {ops_state_str(enable_state)} ({enable_state})')
print(f'bypass_depth  : {read_atomic("scx_ops_bypass_depth")}')
print(f'bypass_depth  : {prog["scx_ops_bypass_depth"].value_()}')
print(f'nr_rejected   : {read_atomic("scx_nr_rejected")}')
print(f'enable_seq    : {read_atomic("scx_enable_seq")}')