Commit 04f41cbf authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched_ext-for-6.14-rc2-fixes' of...

Merge tag 'sched_ext-for-6.14-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Fix lock imbalance in a corner case of dispatch_to_local_dsq()

 - Migration disabled tasks were confusing some BPF schedulers and its
   handling had a bug. Fix it and simplify the default behavior by
   dispatching them automatically

 - ops.tick(), ops.disable() and ops.exit_task() were incorrectly
   disallowing kfuncs that require the task argument to be the rq
   operation is currently operating on and thus is rq-locked.
   Allow them.

 - Fix autogroup migration handling bug which was occasionally
   triggering a warning in the cgroup migration path

 - tools/sched_ext, selftest and other misc updates

* tag 'sched_ext-for-6.14-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Use SCX_CALL_OP_TASK in task_tick_scx
  sched_ext: Fix the incorrect bpf_list kfunc API in common.bpf.h.
  sched_ext: selftests: Fix grammar in tests description
  sched_ext: Fix incorrect assumption about migration disabled tasks in task_can_run_on_remote_rq()
  sched_ext: Fix migration disabled handling in targeted dispatches
  sched_ext: Implement auto local dispatching of migration disabled tasks
  sched_ext: Fix incorrect time delta calculation in time_delta()
  sched_ext: Fix lock imbalance in dispatch_to_local_dsq()
  sched_ext: selftests/dsp_local_on: Fix selftest on UP systems
  tools/sched_ext: Add helper to check task migration state
  sched_ext: Fix incorrect autogroup migration detection
  sched_ext: selftests/dsp_local_on: Fix sporadic failures
  selftests/sched_ext: Fix enum resolution
  sched_ext: Include task weight in the error state dump
  sched_ext: Fixes typos in comments
parents 80868f5d f5717c93
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p)
	 * see this thread after that: we can no longer use signal->autogroup.
	 * See the PF_EXITING check in task_wants_autogroup().
	 */
	sched_move_task(p);
	sched_move_task(p, true);
}

static void
@@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
	 * sched_autogroup_exit_task().
	 */
	for_each_thread(p, t)
		sched_move_task(t);
		sched_move_task(t, true);

	unlock_task_sighand(p, &flags);
	autogroup_kref_put(prev);
+4 −3
Original line number Diff line number Diff line
@@ -9050,7 +9050,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group
 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
 * its new group.
 */
void sched_move_task(struct task_struct *tsk)
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
	int queued, running, queue_flags =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -9079,7 +9079,8 @@ void sched_move_task(struct task_struct *tsk)
		put_prev_task(rq, tsk);

	sched_change_group(tsk, group);
	scx_move_task(tsk);
	if (!for_autogroup)
		scx_cgroup_move_task(tsk);

	if (queued)
		enqueue_task(rq, tsk, queue_flags);
@@ -9180,7 +9181,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		sched_move_task(task);
		sched_move_task(task, false);

	scx_cgroup_finish_attach();
}
+76 −37
Original line number Diff line number Diff line
@@ -122,6 +122,19 @@ enum scx_ops_flags {
	 */
	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,

	/*
	 * A migration disabled task can only execute on its current CPU. By
	 * default, such tasks are automatically put on the CPU's local DSQ with
	 * the default slice on enqueue. If this ops flag is set, they also go
	 * through ops.enqueue().
	 *
	 * A migration disabled task never invokes ops.select_cpu() as it can
	 * only select the current CPU. Also, p->cpus_ptr will only contain its
	 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
	 * and thus may disagree with cpumask_weight(p->cpus_ptr).
	 */
	SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,

	/*
	 * CPU cgroup support flags
	 */
@@ -130,6 +143,7 @@ enum scx_ops_flags {
	SCX_OPS_ALL_FLAGS	= SCX_OPS_KEEP_BUILTIN_IDLE |
				  SCX_OPS_ENQ_LAST |
				  SCX_OPS_ENQ_EXITING |
				  SCX_OPS_ENQ_MIGRATION_DISABLED |
				  SCX_OPS_SWITCH_PARTIAL |
				  SCX_OPS_HAS_CGROUP_WEIGHT,
};
@@ -416,7 +430,7 @@ struct sched_ext_ops {

	/**
	 * @update_idle: Update the idle state of a CPU
	 * @cpu: CPU to udpate the idle state for
	 * @cpu: CPU to update the idle state for
	 * @idle: whether entering or exiting the idle state
	 *
	 * This operation is called when @rq's CPU goes or leaves the idle
@@ -882,6 +896,7 @@ static bool scx_warned_zero_slice;

static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);

@@ -1214,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void)

/**
 * nldsq_next_task - Iterate to the next task in a non-local DSQ
 * @dsq: user dsq being interated
 * @dsq: user dsq being iterated
 * @cur: current position, %NULL to start iteration
 * @rev: walk backwards
 *
@@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
	    unlikely(p->flags & PF_EXITING))
		goto local;

	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
	if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
	    is_migration_disabled(p))
		goto local;

	if (!SCX_HAS_OP(enqueue))
		goto global;

@@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)

	/*
	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
	 * appened to the runnable_list.
	 * appended to the runnable_list.
	 */
	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}
@@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
 *
 * - The BPF scheduler is bypassed while the rq is offline and we can always say
 *   no to the BPF scheduler initiated migrations while offline.
 *
 * The caller must ensure that @p and @rq are on different CPUs.
 */
static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
				      bool trigger_error)
{
	int cpu = cpu_of(rq);

	SCHED_WARN_ON(task_cpu(p) == cpu);

	/*
	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
	 * the pinned CPU in migrate_disable_switch() while @p is being switched
	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
	 * updated and thus another CPU may see @p on a DSQ inbetween leading to
	 * @p passing the below task_allowed_on_cpu() check while migration is
	 * disabled.
	 *
	 * Test the migration disabled state first as the race window is narrow
	 * and the BPF scheduler failing to check migration disabled state can
	 * easily be masked if task_allowed_on_cpu() is done first.
	 */
	if (unlikely(is_migration_disabled(p))) {
		if (trigger_error)
			scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
				      p->comm, p->pid, task_cpu(p), cpu);
		return false;
	}

	/*
	 * We don't require the BPF scheduler to avoid dispatching to offline
	 * CPUs mostly for convenience but also because CPUs can go offline
@@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
	 */
	if (!task_allowed_on_cpu(p, cpu)) {
		if (trigger_error)
			scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
				      cpu_of(rq), p->comm, p->pid);
			scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
				      cpu, p->comm, p->pid);
		return false;
	}

	if (unlikely(is_migration_disabled(p)))
		return false;

	if (!scx_rq_online(rq))
		return false;

@@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,

	if (dst_dsq->id == SCX_DSQ_LOCAL) {
		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
		if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
		if (src_rq != dst_rq &&
		    unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
			dst_dsq = find_global_dsq(p);
			dst_rq = src_rq;
		}
@@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
/*
 * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
 * banging on the same DSQ on a large NUMA system to the point where switching
 * to the bypass mode can take a long time. Inject artifical delays while the
 * to the bypass mode can take a long time. Inject artificial delays while the
 * bypass mode is switching to guarantee timely completion.
 */
static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
{
	struct rq *src_rq = task_rq(p);
	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
#ifdef CONFIG_SMP
	struct rq *locked_rq = rq;
#endif

	/*
	 * We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
	}

#ifdef CONFIG_SMP
	if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
	if (src_rq != dst_rq &&
	    unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
		dispatch_enqueue(find_global_dsq(p), p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS);
		return;
@@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

	/* switch to @src_rq lock */
	if (rq != src_rq) {
		raw_spin_rq_unlock(rq);
	if (locked_rq != src_rq) {
		raw_spin_rq_unlock(locked_rq);
		locked_rq = src_rq;
		raw_spin_rq_lock(src_rq);
	}

@@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
		} else {
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
			/* task has been moved to dst_rq, which is now locked */
			locked_rq = dst_rq;
		}

		/* if the destination CPU is idle, wake it up */
@@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
	}

	/* switch back to @rq lock */
	if (rq != dst_rq) {
		raw_spin_rq_unlock(dst_rq);
	if (locked_rq != rq) {
		raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
#else	/* CONFIG_SMP */
@@ -3144,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 *
 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
 * to implement the default task ordering. The older the timestamp, the higher
 * prority the task - the global FIFO ordering matching the default scheduling
 * priority the task - the global FIFO ordering matching the default scheduling
 * behavior.
 *
 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
@@ -3851,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
		curr->scx.slice = 0;
		touch_core_sched(rq, curr);
	} else if (SCX_HAS_OP(tick)) {
		SCX_CALL_OP(SCX_KF_REST, tick, curr);
		SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
	}

	if (!curr->scx.slice)
@@ -3998,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p)
	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);

	if (SCX_HAS_OP(disable))
		SCX_CALL_OP(SCX_KF_REST, disable, p);
		SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
	scx_set_task_state(p, SCX_TASK_READY);
}

@@ -4027,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p)
	}

	if (SCX_HAS_OP(exit_task))
		SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
		SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
	scx_set_task_state(p, SCX_TASK_NONE);
}

@@ -4323,24 +4371,11 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
	return ops_sanitize_err("cgroup_prep_move", ret);
}

void scx_move_task(struct task_struct *p)
void scx_cgroup_move_task(struct task_struct *p)
{
	if (!scx_cgroup_enabled)
		return;

	/*
	 * We're called from sched_move_task() which handles both cgroup and
	 * autogroup moves. Ignore the latter.
	 *
	 * Also ignore exiting tasks, because in the exit path tasks transition
	 * from the autogroup to the root group, so task_group_is_autogroup()
	 * alone isn't able to catch exiting autogroup tasks. This is safe for
	 * cgroup_move(), because cgroup migrations never happen for PF_EXITING
	 * tasks.
	 */
	if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
		return;

	/*
	 * @p must have ops.cgroup_prep_move() called on it and thus
	 * cgrp_moving_from set.
@@ -4590,7 +4625,7 @@ static int scx_cgroup_init(void)
	cgroup_warned_missing_idle = false;

	/*
	 * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
	 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
	 * cgroups and init, all online cgroups are initialized.
	 */
	rcu_read_lock();
@@ -5059,6 +5094,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
		static_branch_disable(&scx_has_op[i]);
	static_branch_disable(&scx_ops_enq_last);
	static_branch_disable(&scx_ops_enq_exiting);
	static_branch_disable(&scx_ops_enq_migration_disabled);
	static_branch_disable(&scx_ops_cpu_preempt);
	static_branch_disable(&scx_builtin_idle_enabled);
	synchronize_rcu();
@@ -5277,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
		  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
		  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
	dump_line(s, "      sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu",
		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
		  p->scx.dsq_vtime, p->scx.slice);
	dump_line(s, "      sticky/holding_cpu=%d/%d dsq_id=%s",
		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
	dump_line(s, "      dsq_vtime=%llu slice=%llu weight=%u",
		  p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
	dump_line(s, "      cpus=%*pb", cpumask_pr_args(p->cpus_ptr));

	if (SCX_HAS_OP(dump_task)) {
@@ -5667,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

	if (ops->flags & SCX_OPS_ENQ_EXITING)
		static_branch_enable(&scx_ops_enq_exiting);
	if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
		static_branch_enable(&scx_ops_enq_migration_disabled);
	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
		static_branch_enable(&scx_ops_cpu_preempt);

+2 −2
Original line number Diff line number Diff line
@@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
void scx_move_task(struct task_struct *p);
void scx_cgroup_move_task(struct task_struct *p);
void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
@@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle);
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
static inline void scx_move_task(struct task_struct *p) {}
static inline void scx_cgroup_move_task(struct task_struct *p) {}
static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
+1 −1
Original line number Diff line number Diff line
@@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg,
extern void sched_destroy_group(struct task_group *tg);
extern void sched_release_group(struct task_group *tg);

extern void sched_move_task(struct task_struct *tsk);
extern void sched_move_task(struct task_struct *tsk, bool for_autogroup);

#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
Loading