Merge tag 'sched_ext-for-7.0-rc3-fixes' of... (8369b2e9) · Commits · git / linux-net

Documentation/scheduler/sched-ext.rst

+27 −3

Original line number	Diff line number	Diff line
		@@ -43,7 +43,6 @@ options should be enabled to use sched_ext:
		CONFIG_DEBUG_INFO_BTF=y
		CONFIG_BPF_JIT_ALWAYS_ON=y
		CONFIG_BPF_JIT_DEFAULT_ON=y
		CONFIG_PAHOLE_HAS_BTF_TAG=y

		sched_ext is used only when the BPF scheduler is loaded and running.

		@@ -58,7 +57,8 @@ in ``ops->flags``, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE``, and
		However, when the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is
		set in ``ops->flags``, only tasks with the ``SCHED_EXT`` policy are scheduled
		by sched_ext, while tasks with ``SCHED_NORMAL``, ``SCHED_BATCH`` and
		``SCHED_IDLE`` policies are scheduled by the fair-class scheduler.
		``SCHED_IDLE`` policies are scheduled by the fair-class scheduler, which has
		higher sched_class precedence than ``SCHED_EXT``.

		Terminating the sched_ext scheduler program, triggering `SysRq-S`, or
		detection of any internal error including stalled runnable tasks aborts the
		@@ -345,6 +345,8 @@ Where to Look
		The functions prefixed with ``scx_bpf_`` can be called from the BPF
		scheduler.

		* ``kernel/sched/ext_idle.c`` contains the built-in idle CPU selection policy.

		* ``tools/sched_ext/`` hosts example BPF scheduler implementations.

		* ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a
		@@ -353,13 +355,35 @@ Where to Look
		* ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five
		levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``.

		* ``scx_central[.bpf].c``: A central FIFO scheduler where all scheduling
		decisions are made on one CPU, demonstrating ``LOCAL_ON`` dispatching,
		tickless operation, and kthread preemption.

		* ``scx_cpu0[.bpf].c``: A scheduler that queues all tasks to a shared DSQ
		and only dispatches them on CPU0 in FIFO order. Useful for testing bypass
		behavior.

		* ``scx_flatcg[.bpf].c``: A flattened cgroup hierarchy scheduler
		implementing hierarchical weight-based cgroup CPU control by compounding
		each cgroup's share at every level into a single flat scheduling layer.

		* ``scx_pair[.bpf].c``: A core-scheduling example that always makes
		sibling CPU pairs execute tasks from the same CPU cgroup.

		* ``scx_sdt[.bpf].c``: A variation of ``scx_simple`` demonstrating BPF
		arena memory management for per-task data.

		* ``scx_userland[.bpf].c``: A minimal scheduler demonstrating user space
		scheduling. Tasks with CPU affinity are direct-dispatched in FIFO order;
		all others are scheduled in user space by a simple vruntime scheduler.

		ABI Instability
		===============

		The APIs provided by sched_ext to BPF scheduler programs have no stability
		guarantees. This includes the ops table callbacks and constants defined in
		``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in
		``kernel/sched/ext.c``.
		``kernel/sched/ext.c`` and ``kernel/sched/ext_idle.c``.

		While we will attempt to provide a relatively stable API surface when
		possible, they are subject to change without warning between kernel

kernel/sched/ext.c

+11 −11

Original line number	Diff line number	Diff line
		@@ -1103,7 +1103,7 @@ static void dispatch_enqueue(struct scx_sched sch, struct scx_dispatch_q dsq,
		}

		/* seq records the order tasks are queued, used by BPF DSQ iterator */
		dsq->seq++;
		WRITE_ONCE(dsq->seq, dsq->seq + 1);
		p->scx.dsq_seq = dsq->seq;

		dsq_mod_nr(dsq, 1);
		@@ -1470,16 +1470,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
		p->scx.flags \|= SCX_TASK_RESET_RUNNABLE_AT;
		}

		static void enqueue_task_scx(struct rq rq, struct task_struct p, int enq_flags)
		static void enqueue_task_scx(struct rq rq, struct task_struct p, int core_enq_flags)
		{
		struct scx_sched *sch = scx_root;
		int sticky_cpu = p->scx.sticky_cpu;
		u64 enq_flags = core_enq_flags \| rq->scx.extra_enq_flags;

		if (enq_flags & ENQUEUE_WAKEUP)
		rq->scx.flags \|= SCX_RQ_IN_WAKEUP;

		enq_flags \|= rq->scx.extra_enq_flags;

		if (sticky_cpu >= 0)
		p->scx.sticky_cpu = -1;

		@@ -3908,8 +3907,8 @@ static u32 bypass_lb_cpu(struct scx_sched sch, struct rq rq,
		* consider offloading iff the total queued duration is over the
		* threshold.
		*/
		min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
		if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
		min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
		if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
		return 0;

		raw_spin_rq_lock_irq(rq);
		@@ -4137,7 +4136,7 @@ static void scx_bypass(bool bypass)
		WARN_ON_ONCE(scx_bypass_depth <= 0);
		if (scx_bypass_depth != 1)
		goto unlock;
		WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
		WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
		bypass_timestamp = ktime_get_ns();
		if (sch)
		scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
		@@ -5259,13 +5258,14 @@ static int scx_enable(struct sched_ext_ops ops, struct bpf_link link)
		if (!READ_ONCE(helper)) {
		mutex_lock(&helper_mutex);
		if (!helper) {
		helper = kthread_run_worker(0, "scx_enable_helper");
		if (IS_ERR_OR_NULL(helper)) {
		helper = NULL;
		struct kthread_worker *w =
		kthread_run_worker(0, "scx_enable_helper");
		if (IS_ERR_OR_NULL(w)) {
		mutex_unlock(&helper_mutex);
		return -ENOMEM;
		}
		sched_set_fifo(helper->task);
		sched_set_fifo(w->task);
		WRITE_ONCE(helper, w);
		}
		mutex_unlock(&helper_mutex);
		}

kernel/sched/ext_internal.h

+98 −16

Original line number	Diff line number	Diff line
		@@ -1035,26 +1035,108 @@ static const char *scx_enable_state_str[] = {
		};

		/*
		* sched_ext_entity->ops_state
		*
		* Used to track the task ownership between the SCX core and the BPF scheduler.
		* State transitions look as follows:
		*
		* NONE -> QUEUEING -> QUEUED -> DISPATCHING
		*  ^               |               |
		*  |               v               v
		*  \-------------------------------/
		*
		* QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
		* sites for explanations on the conditions being waited upon and why they are
		* safe. Transitions out of them into NONE or QUEUED must store_release and the
		* waiters should load_acquire.
		*
		* Tracking scx_ops_state enables sched_ext core to reliably determine whether
		* any given task can be dispatched by the BPF scheduler at all times and thus
		* relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
		* to try to dispatch any task anytime regardless of its state as the SCX core
		* can safely reject invalid dispatches.
		* Task Ownership State Machine (sched_ext_entity->ops_state)
		*
		* The sched_ext core uses this state machine to track task ownership
		* between the SCX core and the BPF scheduler. This allows the BPF
		* scheduler to dispatch tasks without strict ordering requirements, while
		* the SCX core safely rejects invalid dispatches.
		*
		* State Transitions
		*
		*   .------------> NONE (owned by SCX core)
		*   |               |           ^
		*   |       enqueue |           | direct dispatch
		*   |               v           |
		*   |           QUEUEING -------'
		*   |               |
		*   |       enqueue |
		*   |     completes |
		*   |               v
		*   |            QUEUED (owned by BPF scheduler)
		*   |               |
		*   |      dispatch |
		*   |               |
		*   |               v
		*   |          DISPATCHING
		*   |               |
		*   |      dispatch |
		*   |     completes |
		*   `---------------'
		*
		* State Descriptions
		*
		* - %SCX_OPSS_NONE:
		* Task is owned by the SCX core. It's either on a run queue, running,
		* or being manipulated by the core scheduler. The BPF scheduler has no
		* claim on this task.
		*
		* - %SCX_OPSS_QUEUEING:
		* Transitional state while transferring a task from the SCX core to
		* the BPF scheduler. The task's rq lock is held during this state.
		* Since QUEUEING is both entered and exited under the rq lock, dequeue
		* can never observe this state (it would be a BUG). When finishing a
		* dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion
		* path busy-waits for it to leave this state (via wait_ops_state())
		* before retrying.
		*
		* - %SCX_OPSS_QUEUED:
		* Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue)
		* and the BPF scheduler is responsible for dispatching it. A QSEQ
		* (queue sequence number) is embedded in this state to detect
		* dispatch/dequeue races: if a task is dequeued and re-enqueued, the
		* QSEQ changes and any in-flight dispatch operations targeting the old
		* QSEQ are safely ignored.
		*
		* - %SCX_OPSS_DISPATCHING:
		* Transitional state while transferring a task from the BPF scheduler
		* back to the SCX core. This state indicates the BPF scheduler has
		* selected the task for execution. When dequeue needs to take the task
		* off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path
		* busy-waits for it to leave this state (via wait_ops_state()) before
		* proceeding. Exits to %SCX_OPSS_NONE when dispatch completes.
		*
		* Memory Ordering
		*
		* Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into
		* %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release()
		* and waiters must use atomic_long_read_acquire(). This ensures proper
		* synchronization between concurrent operations.
		*
		* Cross-CPU Task Migration
		*
		* When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply
		* grab the target CPU's rq lock because a concurrent dequeue might be
		* waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock
		* (deadlock).
		*
		* The sched_ext core uses a "lock dancing" protocol coordinated by
		* p->scx.holding_cpu. When moving a task to a different rq:
		*
		* 1. Verify task can be moved (CPU affinity, migration_disabled, etc.)
		* 2. Set p->scx.holding_cpu to the current CPU
		* 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING
		* is set, so clearing DISPATCHING first prevents the circular wait
		* (safe to lock the rq we need)
		* 4. Unlock the current CPU's rq
		* 5. Lock src_rq (where the task currently lives)
		* 6. Verify p->scx.holding_cpu == current CPU; if not, dequeue won the
		* race (dequeue clears holding_cpu to -1 when it takes the task) and
		* the migration is aborted
		* 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly
		* into dst_rq's local DSQ (no lock swap needed)
		* 8. Otherwise: call move_remote_task_to_local_dsq(), which releases
		* src_rq, locks dst_rq, and performs the deactivate/activate
		* migration cycle (dst_rq is held on return)
		* 9. Unlock dst_rq and re-lock the current CPU's rq to restore
		* the lock state expected by the caller
		*
		* If any verification fails, abort the migration.
		*
		* This state tracking allows the BPF scheduler to try to dispatch any task
		* at any time regardless of its state. The SCX core can safely
		* reject/ignore invalid dispatches, simplifying the BPF scheduler
		* implementation.
		*/
		enum scx_ops_state {
		SCX_OPSS_NONE, /* owned by the SCX core */

tools/testing/selftests/sched_ext/util.c

+2 −2

Original line number	Diff line number	Diff line
		@@ -60,11 +60,11 @@ int file_write_long(const char *path, long val)
		char buf[64];
		int ret;

		ret = sprintf(buf, "%lu", val);
		ret = sprintf(buf, "%ld", val);
		if (ret < 0)
		return ret;

		if (write_text(path, buf, sizeof(buf)) <= 0)
		if (write_text(path, buf, ret) <= 0)
		return -1;

		return 0;