Commit f07b806a authored by Tejun Heo's avatar Tejun Heo
Browse files

Merge branch 'for-6.12-fixes' into for-6.13



Pull sched_ext/for-6.12-fixes to receive 0e7ffff1 ("scx: Fix raciness in
scx_ops_bypass()"). Planned updates for scx_ops_bypass() depends on it.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
parents 6d594af5 a759bf0d
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -66,7 +66,7 @@ BPF scheduler and reverts all tasks back to CFS.
.. code-block:: none

    # make -j16 -C tools/sched_ext
    # tools/sched_ext/scx_simple
    # tools/sched_ext/build/bin/scx_simple
    local=0 global=3
    local=5 global=24
    local=9 global=44
+14 −7
Original line number Diff line number Diff line
@@ -3518,14 +3518,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
{
	lockdep_assert_held(&p->pi_lock);

	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
		cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
	else
	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
		cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
		*wake_flags |= WF_RQ_SELECTED;
	} else {
		cpu = cpumask_any(p->cpus_ptr);
	}

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
@@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
		rq->nr_uninterruptible--;

#ifdef CONFIG_SMP
	if (wake_flags & WF_RQ_SELECTED)
		en_flags |= ENQUEUE_RQ_SELECTED;
	if (wake_flags & WF_MIGRATED)
		en_flags |= ENQUEUE_MIGRATED;
	else
@@ -4120,6 +4124,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
	guard(preempt)();
	int cpu, success = 0;

	wake_flags |= WF_TTWU;

	if (p == current) {
		/*
		 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4252,7 +4258,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
		 */
		smp_cond_load_acquire(&p->on_cpu, !VAL);

		cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
		cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
		if (task_cpu(p) != cpu) {
			if (p->in_iowait) {
				delayacct_blkio_end(p);
@@ -4793,6 +4799,7 @@ void wake_up_new_task(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;
	int wake_flags = WF_FORK;

	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
	WRITE_ONCE(p->__state, TASK_RUNNING);
@@ -4807,7 +4814,7 @@ void wake_up_new_task(struct task_struct *p)
	 */
	p->recent_used_cpu = task_cpu(p);
	rseq_migrate(p);
	__set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
#endif
	rq = __task_rq_lock(p, &rf);
	update_rq_clock(rq);
@@ -4815,7 +4822,7 @@ void wake_up_new_task(struct task_struct *p)

	activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
	trace_sched_wakeup_new(p);
	wakeup_preempt(rq, p, WF_FORK);
	wakeup_preempt(rq, p, wake_flags);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
+139 −110
Original line number Diff line number Diff line
@@ -9,7 +9,6 @@
#define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))

enum scx_consts {
	SCX_SLICE_BYPASS		= SCX_SLICE_DFL / 4,
	SCX_DSP_DFL_MAX_BATCH		= 32,
	SCX_DSP_MAX_LOOPS		= 32,
	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
@@ -19,6 +18,12 @@ enum scx_consts {
	SCX_EXIT_DUMP_DFL_LEN		= 32768,

	SCX_CPUPERF_ONE			= SCHED_CAPACITY_SCALE,

	/*
	 * Iterating all tasks may take a while. Periodically drop
	 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
	 */
	SCX_OPS_TASK_ITER_BATCH		= 32,
};

enum scx_exit_kind {
@@ -630,6 +635,10 @@ struct sched_ext_ops {
	/**
	 * exit - Clean up after the BPF scheduler
	 * @info: Exit info
	 *
	 * ops.exit() is also called on ops.init() failure, which is a bit
	 * unusual. This is to allow rich reporting through @info on how
	 * ops.init() failed.
	 */
	void (*exit)(struct scx_exit_info *info);

@@ -697,6 +706,7 @@ enum scx_enq_flags {
	/* expose select ENQUEUE_* flags as enums */
	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
	SCX_ENQ_HEAD		= ENQUEUE_HEAD,
	SCX_ENQ_CPU_SELECTED	= ENQUEUE_RQ_SELECTED,

	/* high 32bits are SCX specific */

@@ -857,7 +867,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
static int scx_ops_bypass_depth;
static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -1279,86 +1290,104 @@ struct scx_task_iter {
	struct task_struct		*locked;
	struct rq			*rq;
	struct rq_flags			rf;
	u32				cnt;
};

/**
 * scx_task_iter_init - Initialize a task iterator
 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
 * @iter: iterator to init
 *
 * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
 * @iter must eventually be exited with scx_task_iter_exit().
 * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
 * must eventually be stopped with scx_task_iter_stop().
 *
 * scx_tasks_lock may be released between this and the first next() call or
 * between any two next() calls. If scx_tasks_lock is released between two
 * next() calls, the caller is responsible for ensuring that the task being
 * iterated remains accessible either through RCU read lock or obtaining a
 * reference count.
 * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
 * between this and the first next() call or between any two next() calls. If
 * the locks are released between two next() calls, the caller is responsible
 * for ensuring that the task being iterated remains accessible either through
 * RCU read lock or obtaining a reference count.
 *
 * All tasks which existed when the iteration started are guaranteed to be
 * visited as long as they still exist.
 */
static void scx_task_iter_init(struct scx_task_iter *iter)
static void scx_task_iter_start(struct scx_task_iter *iter)
{
	lockdep_assert_held(&scx_tasks_lock);

	BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
		     ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));

	spin_lock_irq(&scx_tasks_lock);

	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
	list_add(&iter->cursor.tasks_node, &scx_tasks);
	iter->locked = NULL;
	iter->cnt = 0;
}

static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
	if (iter->locked) {
		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
		iter->locked = NULL;
	}
}

/**
 * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
 * @iter: iterator to unlock rq for
 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
 * @iter: iterator to unlock
 *
 * If @iter is in the middle of a locked iteration, it may be locking the rq of
 * the task currently being visited. Unlock the rq if so. This function can be
 * safely called anytime during an iteration.
 *
 * Returns %true if the rq @iter was locking is unlocked. %false if @iter was
 * not locking an rq.
 * the task currently being visited in addition to scx_tasks_lock. Unlock both.
 * This function can be safely called anytime during an iteration.
 */
static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
	if (iter->locked) {
		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
		iter->locked = NULL;
		return true;
	} else {
		return false;
	__scx_task_iter_rq_unlock(iter);
	spin_unlock_irq(&scx_tasks_lock);
}

/**
 * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
 * @iter: iterator to re-lock
 *
 * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
 * doesn't re-lock the rq lock. Must be called before other iterator operations.
 */
static void scx_task_iter_relock(struct scx_task_iter *iter)
{
	spin_lock_irq(&scx_tasks_lock);
}

/**
 * scx_task_iter_exit - Exit a task iterator
 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
 * @iter: iterator to exit
 *
 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
 * If the iterator holds a task's rq lock, that rq lock is released. See
 * scx_task_iter_init() for details.
 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
 * which is released on return. If the iterator holds a task's rq lock, that rq
 * lock is also released. See scx_task_iter_start() for details.
 */
static void scx_task_iter_exit(struct scx_task_iter *iter)
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
	lockdep_assert_held(&scx_tasks_lock);

	scx_task_iter_rq_unlock(iter);
	list_del_init(&iter->cursor.tasks_node);
	scx_task_iter_unlock(iter);
}

/**
 * scx_task_iter_next - Next task
 * @iter: iterator to walk
 *
 * Visit the next task. See scx_task_iter_init() for details.
 * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
 * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
 * stalls by holding scx_tasks_lock for too long.
 */
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
	struct list_head *cursor = &iter->cursor.tasks_node;
	struct sched_ext_entity *pos;

	lockdep_assert_held(&scx_tasks_lock);
	if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
		scx_task_iter_unlock(iter);
		cond_resched();
		scx_task_iter_relock(iter);
	}

	list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
@@ -1379,14 +1408,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 * @include_dead: Whether we should include dead tasks in the iteration
 *
 * Visit the non-idle task with its rq lock held. Allows callers to specify
 * whether they would like to filter out dead tasks. See scx_task_iter_init()
 * whether they would like to filter out dead tasks. See scx_task_iter_start()
 * for details.
 */
static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
{
	struct task_struct *p;

	scx_task_iter_rq_unlock(iter);
	__scx_task_iter_rq_unlock(iter);

	while ((p = scx_task_iter_next(iter))) {
		/*
@@ -1954,7 +1983,6 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
			    int sticky_cpu)
{
	bool bypassing = scx_rq_bypassing(rq);
	struct task_struct **ddsp_taskp;
	unsigned long qseq;

@@ -1972,7 +2000,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
	if (!scx_rq_online(rq))
		goto local;

	if (bypassing)
	if (scx_rq_bypassing(rq))
		goto global;

	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2027,7 +2055,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,

global:
	touch_core_sched(rq, p);	/* see the comment in local: */
	p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL;
	p->scx.slice = SCX_SLICE_DFL;
	dispatch_enqueue(find_global_dsq(p), p, enq_flags);
}

@@ -3030,8 +3058,8 @@ static struct task_struct *pick_task_scx(struct rq *rq)

		if (unlikely(!p->scx.slice)) {
			if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
						p->comm, p->pid);
				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
						p->comm, p->pid, __func__);
				scx_warned_zero_slice = true;
			}
			p->scx.slice = SCX_SLICE_DFL;
@@ -3274,11 +3302,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,

	*found = false;

	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
		scx_ops_error("built-in idle tracking is disabled");
		return prev_cpu;
	}

	/*
	 * Determine the scheduling domain only if the task is allowed to run
	 * on all CPUs.
@@ -3435,7 +3458,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
	if (unlikely(wake_flags & WF_EXEC))
		return prev_cpu;

	if (SCX_HAS_OP(select_cpu)) {
	if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
		s32 cpu;
		struct task_struct **ddsp_taskp;

@@ -3500,7 +3523,7 @@ void __scx_update_idle(struct rq *rq, bool idle)
{
	int cpu = cpu_of(rq);

	if (SCX_HAS_OP(update_idle)) {
	if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
		if (!static_branch_unlikely(&scx_builtin_idle_enabled))
			return;
@@ -4358,7 +4381,6 @@ static void scx_cgroup_exit(void)

	percpu_rwsem_assert_held(&scx_cgroup_rwsem);

	WARN_ON_ONCE(!scx_cgroup_enabled);
	scx_cgroup_enabled = false;

	/*
@@ -4427,6 +4449,7 @@ static int scx_cgroup_init(void)
				      css->cgroup, &args);
		if (ret) {
			css_put(css);
			scx_ops_error("ops.cgroup_init() failed (%d)", ret);
			return ret;
		}
		tg->scx_flags |= SCX_TG_INITED;
@@ -4566,36 +4589,40 @@ bool task_should_scx(struct task_struct *p)
 * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
 * to force global FIFO scheduling.
 *
 * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
 * - ops.select_cpu() is ignored and the default select_cpu() is used.
 *
 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
 *   %SCX_OPS_ENQ_LAST is also ignored.
 *
 * b. ops.dispatch() is ignored.
 * - ops.dispatch() is ignored.
 *
 * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
 * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
 *   can't be trusted. Whenever a tick triggers, the running task is rotated to
 *   the tail of the queue with core_sched_at touched.
 *
 * d. pick_next_task() suppresses zero slice warning.
 * - pick_next_task() suppresses zero slice warning.
 *
 * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
 * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
 *   operations.
 *
 * f. scx_prio_less() reverts to the default core_sched_at order.
 * - scx_prio_less() reverts to the default core_sched_at order.
 */
static void scx_ops_bypass(bool bypass)
{
	int depth, cpu;
	int cpu;
	unsigned long flags;

	raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
	if (bypass) {
		depth = atomic_inc_return(&scx_ops_bypass_depth);
		WARN_ON_ONCE(depth <= 0);
		if (depth != 1)
			return;
		scx_ops_bypass_depth++;
		WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
		if (scx_ops_bypass_depth != 1)
			goto unlock;
	} else {
		depth = atomic_dec_return(&scx_ops_bypass_depth);
		WARN_ON_ONCE(depth < 0);
		if (depth != 0)
			return;
		scx_ops_bypass_depth--;
		WARN_ON_ONCE(scx_ops_bypass_depth < 0);
		if (scx_ops_bypass_depth != 0)
			goto unlock;
	}

	/*
@@ -4612,7 +4639,7 @@ static void scx_ops_bypass(bool bypass)
		struct rq_flags rf;
		struct task_struct *p, *n;

		rq_lock_irqsave(rq, &rf);
		rq_lock(rq, &rf);

		if (bypass) {
			WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4648,11 +4675,13 @@ static void scx_ops_bypass(bool bypass)
			sched_enq_and_set_task(&ctx);
		}

		rq_unlock_irqrestore(rq, &rf);
		rq_unlock(rq, &rf);

		/* kick to restore ticks */
		/* resched to restore ticks and idle state */
		resched_cpu(cpu);
	}
unlock:
	raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
}

static void free_exit_info(struct scx_exit_info *ei)
@@ -4772,15 +4801,13 @@ static void scx_ops_disable_workfn(struct kthread_work *work)

	scx_ops_init_task_enabled = false;

	spin_lock_irq(&scx_tasks_lock);
	scx_task_iter_init(&sti);
	scx_task_iter_start(&sti);
	while ((p = scx_task_iter_next_locked(&sti))) {
		const struct sched_class *old_class = p->sched_class;
		struct sched_enq_and_set_ctx ctx;

		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

		p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
		__setscheduler_prio(p, p->prio);
		check_class_changing(task_rq(p), p, old_class);

@@ -4789,8 +4816,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
		check_class_changed(task_rq(p), p, old_class, p->prio);
		scx_ops_exit_task(p);
	}
	scx_task_iter_exit(&sti);
	spin_unlock_irq(&scx_tasks_lock);
	scx_task_iter_stop(&sti);
	percpu_up_write(&scx_fork_rwsem);

	/* no task is on scx, turn off all the switches and flush in-progress calls */
@@ -5258,7 +5284,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

	if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
			   cpu_possible_mask)) {
		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
		return -EINVAL;
	}

@@ -5351,6 +5377,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		if (ret) {
			ret = ops_sanitize_err("init", ret);
			cpus_read_unlock();
			scx_ops_error("ops.init() failed (%d)", ret);
			goto err_disable;
		}
	}
@@ -5443,8 +5470,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
	if (ret)
		goto err_disable_unlock_all;

	spin_lock_irq(&scx_tasks_lock);
	scx_task_iter_init(&sti);
	scx_task_iter_start(&sti);
	while ((p = scx_task_iter_next_locked(&sti))) {
		/*
		 * @p may already be dead, have lost all its usages counts and
@@ -5454,16 +5480,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		if (!tryget_task_struct(p))
			continue;

		scx_task_iter_rq_unlock(&sti);
		spin_unlock_irq(&scx_tasks_lock);
		scx_task_iter_unlock(&sti);

		ret = scx_ops_init_task(p, task_group(p), false);
		if (ret) {
			put_task_struct(p);
			spin_lock_irq(&scx_tasks_lock);
			scx_task_iter_exit(&sti);
			spin_unlock_irq(&scx_tasks_lock);
			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
			scx_task_iter_relock(&sti);
			scx_task_iter_stop(&sti);
			scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
				      ret, p->comm, p->pid);
			goto err_disable_unlock_all;
		}
@@ -5471,10 +5495,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		scx_set_task_state(p, SCX_TASK_READY);

		put_task_struct(p);
		spin_lock_irq(&scx_tasks_lock);
		scx_task_iter_relock(&sti);
	}
	scx_task_iter_exit(&sti);
	spin_unlock_irq(&scx_tasks_lock);
	scx_task_iter_stop(&sti);
	scx_cgroup_unlock();
	percpu_up_write(&scx_fork_rwsem);

@@ -5491,14 +5514,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
	 * scx_tasks_lock.
	 */
	percpu_down_write(&scx_fork_rwsem);
	spin_lock_irq(&scx_tasks_lock);
	scx_task_iter_init(&sti);
	scx_task_iter_start(&sti);
	while ((p = scx_task_iter_next_locked(&sti))) {
		const struct sched_class *old_class = p->sched_class;
		struct sched_enq_and_set_ctx ctx;

		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

		p->scx.slice = SCX_SLICE_DFL;
		__setscheduler_prio(p, p->prio);
		check_class_changing(task_rq(p), p, old_class);

@@ -5506,20 +5529,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

		check_class_changed(task_rq(p), p, old_class, p->prio);
	}
	scx_task_iter_exit(&sti);
	spin_unlock_irq(&scx_tasks_lock);
	scx_task_iter_stop(&sti);
	percpu_up_write(&scx_fork_rwsem);

	scx_ops_bypass(false);

	/*
	 * Returning an error code here would lose the recorded error
	 * information. Exit indicating success so that the error is notified
	 * through ops.exit() with all the details.
	 */
	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
		ret = 0;
		goto err_disable;
	}

@@ -5554,10 +5570,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
	scx_ops_bypass(false);
err_disable:
	mutex_unlock(&scx_ops_enable_mutex);
	/* must be fully disabled before returning */
	scx_ops_disable(SCX_EXIT_ERROR);
	/*
	 * Returning an error code here would not pass all the error information
	 * to userspace. Record errno using scx_ops_error() for cases
	 * scx_ops_error() wasn't already invoked and exit indicating success so
	 * that the error is notified through ops.exit() with all the details.
	 *
	 * Flush scx_ops_disable_work to ensure that error is reported before
	 * init completion.
	 */
	scx_ops_error("scx_ops_enable() failed (%d)", ret);
	kthread_flush_work(&scx_ops_disable_work);
	return ret;
	return 0;
}


@@ -6108,16 +6132,21 @@ __bpf_kfunc_start_defs();
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
				       u64 wake_flags, bool *is_idle)
{
	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
		*is_idle = false;
		return prev_cpu;
	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
		scx_ops_error("built-in idle tracking is disabled");
		goto prev_cpu;
	}

	if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
		goto prev_cpu;

#ifdef CONFIG_SMP
	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
#else
#endif

prev_cpu:
	*is_idle = false;
	return prev_cpu;
#endif
}

__bpf_kfunc_end_defs();
+3 −0
Original line number Diff line number Diff line
@@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC			0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED		0x20 /* Internal use, task got migrated */
#define WF_CURRENT_CPU		0x40 /* Prefer to move the wakee to the current CPU. */
#define WF_RQ_SELECTED		0x80 /* ->select_task_rq() was called */

#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40];
 * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
 * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
 * ENQUEUE_MIGRATED  - the task was migrated during wakeup
 * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
 *
 */

@@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_INITIAL		0x80
#define ENQUEUE_MIGRATING	0x100
#define ENQUEUE_DELAYED		0x200
#define ENQUEUE_RQ_SELECTED	0x400

#define RETRY_TASK		((void *)-1UL)

+4 −4
Original line number Diff line number Diff line
@@ -41,8 +41,8 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
u32 scx_bpf_reenqueue_local(void) __ksym;
@@ -71,7 +71,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;

/*
 * Use the following as @it__iter when calling
@@ -320,7 +320,7 @@ u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
/*
 * Access a cpumask in read-only mode (typically to check bits).
 */
const struct cpumask *cast_mask(struct bpf_cpumask *mask)
static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
{
	return (const struct cpumask *)mask;
}
Loading