Merge branch 'for-6.12-fixes' into for-6.13 (f07b806a) · Commits · git / linux-net

Documentation/scheduler/sched-ext.rst

+1 −1

Original line number	Diff line number	Diff line
		@@ -66,7 +66,7 @@ BPF scheduler and reverts all tasks back to CFS.
		.. code-block:: none

		# make -j16 -C tools/sched_ext
		# tools/sched_ext/scx_simple
		# tools/sched_ext/build/bin/scx_simple
		local=0 global=3
		local=5 global=24
		local=9 global=44

kernel/sched/core.c

+14 −7

Original line number	Diff line number	Diff line
		@@ -3518,14 +3518,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
		* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
		*/
		static inline
		int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
		int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
		{
		lockdep_assert_held(&p->pi_lock);

		if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
		cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
		else
		if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
		cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
		*wake_flags |= WF_RQ_SELECTED;
		} else {
		cpu = cpumask_any(p->cpus_ptr);
		}

		/*
		* In order not to call set_task_cpu() on a blocking task we need
		@@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
		rq->nr_uninterruptible--;

		#ifdef CONFIG_SMP
		if (wake_flags & WF_RQ_SELECTED)
		en_flags |= ENQUEUE_RQ_SELECTED;
		if (wake_flags & WF_MIGRATED)
		en_flags |= ENQUEUE_MIGRATED;
		else
		@@ -4120,6 +4124,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
		guard(preempt)();
		int cpu, success = 0;

		wake_flags |= WF_TTWU;

		if (p == current) {
		/*
		* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
		@@ -4252,7 +4258,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
		*/
		smp_cond_load_acquire(&p->on_cpu, !VAL);

		cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
		cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
		if (task_cpu(p) != cpu) {
		if (p->in_iowait) {
		delayacct_blkio_end(p);
		@@ -4793,6 +4799,7 @@ void wake_up_new_task(struct task_struct *p)
		{
		struct rq_flags rf;
		struct rq *rq;
		int wake_flags = WF_FORK;

		raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
		WRITE_ONCE(p->__state, TASK_RUNNING);
		@@ -4807,7 +4814,7 @@ void wake_up_new_task(struct task_struct *p)
		*/
		p->recent_used_cpu = task_cpu(p);
		rseq_migrate(p);
		__set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
		__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
		#endif
		rq = __task_rq_lock(p, &rf);
		update_rq_clock(rq);
		@@ -4815,7 +4822,7 @@ void wake_up_new_task(struct task_struct *p)

		activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
		trace_sched_wakeup_new(p);
		wakeup_preempt(rq, p, WF_FORK);
		wakeup_preempt(rq, p, wake_flags);
		#ifdef CONFIG_SMP
		if (p->sched_class->task_woken) {
		/*

kernel/sched/ext.c

+139 −110

Original line number	Diff line number	Diff line
		@@ -9,7 +9,6 @@
		#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))

		enum scx_consts {
		SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4,
		SCX_DSP_DFL_MAX_BATCH = 32,
		SCX_DSP_MAX_LOOPS = 32,
		SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
		@@ -19,6 +18,12 @@ enum scx_consts {
		SCX_EXIT_DUMP_DFL_LEN = 32768,

		SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,

		/*
		* Iterating all tasks may take a while. Periodically drop
		* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
		*/
		SCX_OPS_TASK_ITER_BATCH = 32,
		};

		enum scx_exit_kind {
		@@ -630,6 +635,10 @@ struct sched_ext_ops {
		/**
		* exit - Clean up after the BPF scheduler
		* @info: Exit info
		*
		* ops.exit() is also called on ops.init() failure, which is a bit
		* unusual. This is to allow rich reporting through @info on how
		* ops.init() failed.
		*/
		void (*exit)(struct scx_exit_info *info);

		@@ -697,6 +706,7 @@ enum scx_enq_flags {
		/* expose select ENQUEUE_* flags as enums */
		SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
		SCX_ENQ_HEAD = ENQUEUE_HEAD,
		SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,

		/* high 32bits are SCX specific */

		@@ -857,7 +867,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
		DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
		DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
		static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
		static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
		static int scx_ops_bypass_depth;
		static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
		static bool scx_ops_init_task_enabled;
		static bool scx_switching_all;
		DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
		@@ -1279,86 +1290,104 @@ struct scx_task_iter {
		struct task_struct *locked;
		struct rq *rq;
		struct rq_flags rf;
		u32 cnt;
		};

		/**
		* scx_task_iter_init - Initialize a task iterator
		* scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
		* @iter: iterator to init
		*
		* Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
		* @iter must eventually be exited with scx_task_iter_exit().
		* Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
		* must eventually be stopped with scx_task_iter_stop().
		*
		* scx_tasks_lock may be released between this and the first next() call or
		* between any two next() calls. If scx_tasks_lock is released between two
		* next() calls, the caller is responsible for ensuring that the task being
		* iterated remains accessible either through RCU read lock or obtaining a
		* reference count.
		* scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
		* between this and the first next() call or between any two next() calls. If
		* the locks are released between two next() calls, the caller is responsible
		* for ensuring that the task being iterated remains accessible either through
		* RCU read lock or obtaining a reference count.
		*
		* All tasks which existed when the iteration started are guaranteed to be
		* visited as long as they still exist.
		*/
		static void scx_task_iter_init(struct scx_task_iter *iter)
		static void scx_task_iter_start(struct scx_task_iter *iter)
		{
		lockdep_assert_held(&scx_tasks_lock);

		BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
		((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));

		spin_lock_irq(&scx_tasks_lock);

		iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
		list_add(&iter->cursor.tasks_node, &scx_tasks);
		iter->locked = NULL;
		iter->cnt = 0;
		}

		static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
		{
		if (iter->locked) {
		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
		iter->locked = NULL;
		}
		}

		/**
		* scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
		* @iter: iterator to unlock rq for
		* scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
		* @iter: iterator to unlock
		*
		* If @iter is in the middle of a locked iteration, it may be locking the rq of
		* the task currently being visited. Unlock the rq if so. This function can be
		* safely called anytime during an iteration.
		*
		* Returns %true if the rq @iter was locking is unlocked. %false if @iter was
		* not locking an rq.
		* the task currently being visited in addition to scx_tasks_lock. Unlock both.
		* This function can be safely called anytime during an iteration.
		*/
		static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
		static void scx_task_iter_unlock(struct scx_task_iter *iter)
		{
		if (iter->locked) {
		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
		iter->locked = NULL;
		return true;
		} else {
		return false;
		__scx_task_iter_rq_unlock(iter);
		spin_unlock_irq(&scx_tasks_lock);
		}

		/**
		* scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
		* @iter: iterator to re-lock
		*
		* Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
		* doesn't re-lock the rq lock. Must be called before other iterator operations.
		*/
		static void scx_task_iter_relock(struct scx_task_iter *iter)
		{
		spin_lock_irq(&scx_tasks_lock);
		}

		/**
		* scx_task_iter_exit - Exit a task iterator
		* scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
		* @iter: iterator to exit
		*
		* Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
		* If the iterator holds a task's rq lock, that rq lock is released. See
		* scx_task_iter_init() for details.
		* Exit a previously initialized @iter. Must be called with scx_tasks_lock held
		* which is released on return. If the iterator holds a task's rq lock, that rq
		* lock is also released. See scx_task_iter_start() for details.
		*/
		static void scx_task_iter_exit(struct scx_task_iter *iter)
		static void scx_task_iter_stop(struct scx_task_iter *iter)
		{
		lockdep_assert_held(&scx_tasks_lock);

		scx_task_iter_rq_unlock(iter);
		list_del_init(&iter->cursor.tasks_node);
		scx_task_iter_unlock(iter);
		}

		/**
		* scx_task_iter_next - Next task
		* @iter: iterator to walk
		*
		* Visit the next task. See scx_task_iter_init() for details.
		* Visit the next task. See scx_task_iter_start() for details. Locks are dropped
		* and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
		* stalls by holding scx_tasks_lock for too long.
		*/
		static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
		{
		struct list_head *cursor = &iter->cursor.tasks_node;
		struct sched_ext_entity *pos;

		lockdep_assert_held(&scx_tasks_lock);
		if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
		scx_task_iter_unlock(iter);
		cond_resched();
		scx_task_iter_relock(iter);
		}

		list_for_each_entry(pos, cursor, tasks_node) {
		if (&pos->tasks_node == &scx_tasks)
		@@ -1379,14 +1408,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
		* @include_dead: Whether we should include dead tasks in the iteration
		*
		* Visit the non-idle task with its rq lock held. Allows callers to specify
		* whether they would like to filter out dead tasks. See scx_task_iter_init()
		* whether they would like to filter out dead tasks. See scx_task_iter_start()
		* for details.
		*/
		static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
		{
		struct task_struct *p;

		scx_task_iter_rq_unlock(iter);
		__scx_task_iter_rq_unlock(iter);

		while ((p = scx_task_iter_next(iter))) {
		/*
		@@ -1954,7 +1983,6 @@ static bool scx_rq_online(struct rq *rq)
		static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
		int sticky_cpu)
		{
		bool bypassing = scx_rq_bypassing(rq);
		struct task_struct **ddsp_taskp;
		unsigned long qseq;

		@@ -1972,7 +2000,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
		if (!scx_rq_online(rq))
		goto local;

		if (bypassing)
		if (scx_rq_bypassing(rq))
		goto global;

		if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
		@@ -2027,7 +2055,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,

		global:
		touch_core_sched(rq, p); /* see the comment in local: */
		p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL;
		p->scx.slice = SCX_SLICE_DFL;
		dispatch_enqueue(find_global_dsq(p), p, enq_flags);
		}

		@@ -3030,8 +3058,8 @@ static struct task_struct *pick_task_scx(struct rq *rq)

		if (unlikely(!p->scx.slice)) {
		if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
		printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
		p->comm, p->pid);
		printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
		p->comm, p->pid, __func__);
		scx_warned_zero_slice = true;
		}
		p->scx.slice = SCX_SLICE_DFL;
		@@ -3274,11 +3302,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,

		*found = false;

		if (!static_branch_likely(&scx_builtin_idle_enabled)) {
		scx_ops_error("built-in idle tracking is disabled");
		return prev_cpu;
		}

		/*
		* Determine the scheduling domain only if the task is allowed to run
		* on all CPUs.
		@@ -3435,7 +3458,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
		if (unlikely(wake_flags & WF_EXEC))
		return prev_cpu;

		if (SCX_HAS_OP(select_cpu)) {
		if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
		s32 cpu;
		struct task_struct **ddsp_taskp;

		@@ -3500,7 +3523,7 @@ void __scx_update_idle(struct rq *rq, bool idle)
		{
		int cpu = cpu_of(rq);

		if (SCX_HAS_OP(update_idle)) {
		if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
		if (!static_branch_unlikely(&scx_builtin_idle_enabled))
		return;
		@@ -4358,7 +4381,6 @@ static void scx_cgroup_exit(void)

		percpu_rwsem_assert_held(&scx_cgroup_rwsem);

		WARN_ON_ONCE(!scx_cgroup_enabled);
		scx_cgroup_enabled = false;

		/*
		@@ -4427,6 +4449,7 @@ static int scx_cgroup_init(void)
		css->cgroup, &args);
		if (ret) {
		css_put(css);
		scx_ops_error("ops.cgroup_init() failed (%d)", ret);
		return ret;
		}
		tg->scx_flags |= SCX_TG_INITED;
		@@ -4566,36 +4589,40 @@ bool task_should_scx(struct task_struct *p)
		* the DISABLING state and then cycling the queued tasks through dequeue/enqueue
		* to force global FIFO scheduling.
		*
		* a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
		* - ops.select_cpu() is ignored and the default select_cpu() is used.
		*
		* - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
		* %SCX_OPS_ENQ_LAST is also ignored.
		*
		* b. ops.dispatch() is ignored.
		* - ops.dispatch() is ignored.
		*
		* c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
		* - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
		* can't be trusted. Whenever a tick triggers, the running task is rotated to
		* the tail of the queue with core_sched_at touched.
		*
		* d. pick_next_task() suppresses zero slice warning.
		* - pick_next_task() suppresses zero slice warning.
		*
		* e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
		* - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
		* operations.
		*
		* f. scx_prio_less() reverts to the default core_sched_at order.
		* - scx_prio_less() reverts to the default core_sched_at order.
		*/
		static void scx_ops_bypass(bool bypass)
		{
		int depth, cpu;
		int cpu;
		unsigned long flags;

		raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
		if (bypass) {
		depth = atomic_inc_return(&scx_ops_bypass_depth);
		WARN_ON_ONCE(depth <= 0);
		if (depth != 1)
		return;
		scx_ops_bypass_depth++;
		WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
		if (scx_ops_bypass_depth != 1)
		goto unlock;
		} else {
		depth = atomic_dec_return(&scx_ops_bypass_depth);
		WARN_ON_ONCE(depth < 0);
		if (depth != 0)
		return;
		scx_ops_bypass_depth--;
		WARN_ON_ONCE(scx_ops_bypass_depth < 0);
		if (scx_ops_bypass_depth != 0)
		goto unlock;
		}

		/*
		@@ -4612,7 +4639,7 @@ static void scx_ops_bypass(bool bypass)
		struct rq_flags rf;
		struct task_struct *p, *n;

		rq_lock_irqsave(rq, &rf);
		rq_lock(rq, &rf);

		if (bypass) {
		WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
		@@ -4648,11 +4675,13 @@ static void scx_ops_bypass(bool bypass)
		sched_enq_and_set_task(&ctx);
		}

		rq_unlock_irqrestore(rq, &rf);
		rq_unlock(rq, &rf);

		/* kick to restore ticks */
		/* resched to restore ticks and idle state */
		resched_cpu(cpu);
		}
		unlock:
		raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
		}

		static void free_exit_info(struct scx_exit_info *ei)
		@@ -4772,15 +4801,13 @@ static void scx_ops_disable_workfn(struct kthread_work *work)

		scx_ops_init_task_enabled = false;

		spin_lock_irq(&scx_tasks_lock);
		scx_task_iter_init(&sti);
		scx_task_iter_start(&sti);
		while ((p = scx_task_iter_next_locked(&sti))) {
		const struct sched_class *old_class = p->sched_class;
		struct sched_enq_and_set_ctx ctx;

		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

		p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
		__setscheduler_prio(p, p->prio);
		check_class_changing(task_rq(p), p, old_class);

		@@ -4789,8 +4816,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
		check_class_changed(task_rq(p), p, old_class, p->prio);
		scx_ops_exit_task(p);
		}
		scx_task_iter_exit(&sti);
		spin_unlock_irq(&scx_tasks_lock);
		scx_task_iter_stop(&sti);
		percpu_up_write(&scx_fork_rwsem);

		/* no task is on scx, turn off all the switches and flush in-progress calls */
		@@ -5258,7 +5284,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

		if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
		cpu_possible_mask)) {
		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
		return -EINVAL;
		}

		@@ -5351,6 +5377,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		if (ret) {
		ret = ops_sanitize_err("init", ret);
		cpus_read_unlock();
		scx_ops_error("ops.init() failed (%d)", ret);
		goto err_disable;
		}
		}
		@@ -5443,8 +5470,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		if (ret)
		goto err_disable_unlock_all;

		spin_lock_irq(&scx_tasks_lock);
		scx_task_iter_init(&sti);
		scx_task_iter_start(&sti);
		while ((p = scx_task_iter_next_locked(&sti))) {
		/*
		* @p may already be dead, have lost all its usages counts and
		@@ -5454,16 +5480,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		if (!tryget_task_struct(p))
		continue;

		scx_task_iter_rq_unlock(&sti);
		spin_unlock_irq(&scx_tasks_lock);
		scx_task_iter_unlock(&sti);

		ret = scx_ops_init_task(p, task_group(p), false);
		if (ret) {
		put_task_struct(p);
		spin_lock_irq(&scx_tasks_lock);
		scx_task_iter_exit(&sti);
		spin_unlock_irq(&scx_tasks_lock);
		pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
		scx_task_iter_relock(&sti);
		scx_task_iter_stop(&sti);
		scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
		ret, p->comm, p->pid);
		goto err_disable_unlock_all;
		}
		@@ -5471,10 +5495,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		scx_set_task_state(p, SCX_TASK_READY);

		put_task_struct(p);
		spin_lock_irq(&scx_tasks_lock);
		scx_task_iter_relock(&sti);
		}
		scx_task_iter_exit(&sti);
		spin_unlock_irq(&scx_tasks_lock);
		scx_task_iter_stop(&sti);
		scx_cgroup_unlock();
		percpu_up_write(&scx_fork_rwsem);

		@@ -5491,14 +5514,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		* scx_tasks_lock.
		*/
		percpu_down_write(&scx_fork_rwsem);
		spin_lock_irq(&scx_tasks_lock);
		scx_task_iter_init(&sti);
		scx_task_iter_start(&sti);
		while ((p = scx_task_iter_next_locked(&sti))) {
		const struct sched_class *old_class = p->sched_class;
		struct sched_enq_and_set_ctx ctx;

		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

		p->scx.slice = SCX_SLICE_DFL;
		__setscheduler_prio(p, p->prio);
		check_class_changing(task_rq(p), p, old_class);

		@@ -5506,20 +5529,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)

		check_class_changed(task_rq(p), p, old_class, p->prio);
		}
		scx_task_iter_exit(&sti);
		spin_unlock_irq(&scx_tasks_lock);
		scx_task_iter_stop(&sti);
		percpu_up_write(&scx_fork_rwsem);

		scx_ops_bypass(false);

		/*
		* Returning an error code here would lose the recorded error
		* information. Exit indicating success so that the error is notified
		* through ops.exit() with all the details.
		*/
		if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
		ret = 0;
		goto err_disable;
		}

		@@ -5554,10 +5570,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		scx_ops_bypass(false);
		err_disable:
		mutex_unlock(&scx_ops_enable_mutex);
		/* must be fully disabled before returning */
		scx_ops_disable(SCX_EXIT_ERROR);
		/*
		* Returning an error code here would not pass all the error information
		* to userspace. Record errno using scx_ops_error() for cases
		* scx_ops_error() wasn't already invoked and exit indicating success so
		* that the error is notified through ops.exit() with all the details.
		*
		* Flush scx_ops_disable_work to ensure that error is reported before
		* init completion.
		*/
		scx_ops_error("scx_ops_enable() failed (%d)", ret);
		kthread_flush_work(&scx_ops_disable_work);
		return ret;
		return 0;
		}


		@@ -6108,16 +6132,21 @@ __bpf_kfunc_start_defs();
		__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
		u64 wake_flags, bool *is_idle)
		{
		if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
		*is_idle = false;
		return prev_cpu;
		if (!static_branch_likely(&scx_builtin_idle_enabled)) {
		scx_ops_error("built-in idle tracking is disabled");
		goto prev_cpu;
		}

		if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
		goto prev_cpu;

		#ifdef CONFIG_SMP
		return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
		#else
		#endif

		prev_cpu:
		*is_idle = false;
		return prev_cpu;
		#endif
		}

		__bpf_kfunc_end_defs();

kernel/sched/sched.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
		#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
		#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
		#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
		#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */

		#ifdef CONFIG_SMP
		static_assert(WF_EXEC == SD_BALANCE_EXEC);
		@@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40];
		* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
		* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
		* ENQUEUE_MIGRATED - the task was migrated during wakeup
		* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
		*
		*/

		@@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40];
		#define ENQUEUE_INITIAL 0x80
		#define ENQUEUE_MIGRATING 0x100
		#define ENQUEUE_DELAYED 0x200
		#define ENQUEUE_RQ_SELECTED 0x400

		#define RETRY_TASK ((void *)-1UL)

tools/sched_ext/include/scx/common.bpf.h

+4 −4

Original line number	Diff line number	Diff line
		@@ -41,8 +41,8 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt
		u32 scx_bpf_dispatch_nr_slots(void) __ksym;
		void scx_bpf_dispatch_cancel(void) __ksym;
		bool scx_bpf_consume(u64 dsq_id) __ksym;
		void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
		void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
		void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
		void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
		bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
		bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
		u32 scx_bpf_reenqueue_local(void) __ksym;
		@@ -71,7 +71,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
		bool scx_bpf_task_running(const struct task_struct *p) __ksym;
		s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
		struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
		struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
		struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;

		/*
		* Use the following as @it__iter when calling
		@@ -320,7 +320,7 @@ u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
		/*
		* Access a cpumask in read-only mode (typically to check bits).
		*/
		const struct cpumask *cast_mask(struct bpf_cpumask *mask)
		static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
		{
		return (const struct cpumask *)mask;
		}