Commit 8369b2e9 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched_ext-for-7.0-rc3-fixes' of...

Merge tag 'sched_ext-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Fix data races flagged by KCSAN: add missing READ_ONCE()/WRITE_ONCE()
   annotations for lock-free accesses to module parameters and dsq->seq

 - Fix silent truncation of upper 32 enqueue flags (SCX_ENQ_PREEMPT and
   above) when passed through the int sched_class interface

 - Documentation updates: scheduling class precedence, task ownership
   state machine, example scheduler descriptions, config list cleanup

 - Selftest fix for format specifier and buffer length in
   file_write_long()

* tag 'sched_ext-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Use WRITE_ONCE() for the write side of scx_enable helper pointer
  sched_ext: Fix enqueue_task_scx() truncation of upper enqueue flags
  sched_ext: Documentation: Update sched-ext.rst
  sched_ext: Use READ_ONCE() for scx_slice_bypass_us in scx_bypass()
  sched_ext: Documentation: Mention scheduling class precedence
  sched_ext: Document task ownership state machine
  sched_ext: Use READ_ONCE() for lock-free reads of module param variables
  sched_ext/selftests: Fix format specifier and buffer length in file_write_long()
  sched_ext: Use WRITE_ONCE() for the write side of dsq->seq update
parents 8040dc41 2fcfe595
Loading
Loading
Loading
Loading
+27 −3
Original line number Diff line number Diff line
@@ -43,7 +43,6 @@ options should be enabled to use sched_ext:
    CONFIG_DEBUG_INFO_BTF=y
    CONFIG_BPF_JIT_ALWAYS_ON=y
    CONFIG_BPF_JIT_DEFAULT_ON=y
    CONFIG_PAHOLE_HAS_BTF_TAG=y

sched_ext is used only when the BPF scheduler is loaded and running.

@@ -58,7 +57,8 @@ in ``ops->flags``, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE``, and
However, when the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is
set in ``ops->flags``, only tasks with the ``SCHED_EXT`` policy are scheduled
by sched_ext, while tasks with ``SCHED_NORMAL``, ``SCHED_BATCH`` and
``SCHED_IDLE`` policies are scheduled by the fair-class scheduler.
``SCHED_IDLE`` policies are scheduled by the fair-class scheduler, which has
higher sched_class precedence than ``SCHED_EXT``.

Terminating the sched_ext scheduler program, triggering `SysRq-S`, or
detection of any internal error including stalled runnable tasks aborts the
@@ -345,6 +345,8 @@ Where to Look
  The functions prefixed with ``scx_bpf_`` can be called from the BPF
  scheduler.

* ``kernel/sched/ext_idle.c`` contains the built-in idle CPU selection policy.

* ``tools/sched_ext/`` hosts example BPF scheduler implementations.

  * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a
@@ -353,13 +355,35 @@ Where to Look
  * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five
    levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``.

  * ``scx_central[.bpf].c``: A central FIFO scheduler where all scheduling
    decisions are made on one CPU, demonstrating ``LOCAL_ON`` dispatching,
    tickless operation, and kthread preemption.

  * ``scx_cpu0[.bpf].c``: A scheduler that queues all tasks to a shared DSQ
    and only dispatches them on CPU0 in FIFO order. Useful for testing bypass
    behavior.

  * ``scx_flatcg[.bpf].c``: A flattened cgroup hierarchy scheduler
    implementing hierarchical weight-based cgroup CPU control by compounding
    each cgroup's share at every level into a single flat scheduling layer.

  * ``scx_pair[.bpf].c``: A core-scheduling example that always makes
    sibling CPU pairs execute tasks from the same CPU cgroup.

  * ``scx_sdt[.bpf].c``: A variation of ``scx_simple`` demonstrating BPF
    arena memory management for per-task data.

  * ``scx_userland[.bpf].c``: A minimal scheduler demonstrating user space
    scheduling. Tasks with CPU affinity are direct-dispatched in FIFO order;
    all others are scheduled in user space by a simple vruntime scheduler.

ABI Instability
===============

The APIs provided by sched_ext to BPF scheduler programs have no stability
guarantees. This includes the ops table callbacks and constants defined in
``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in
``kernel/sched/ext.c``.
``kernel/sched/ext.c`` and ``kernel/sched/ext_idle.c``.

While we will attempt to provide a relatively stable API surface when
possible, they are subject to change without warning between kernel
+11 −11
Original line number Diff line number Diff line
@@ -1103,7 +1103,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
	}

	/* seq records the order tasks are queued, used by BPF DSQ iterator */
	dsq->seq++;
	WRITE_ONCE(dsq->seq, dsq->seq + 1);
	p->scx.dsq_seq = dsq->seq;

	dsq_mod_nr(dsq, 1);
@@ -1470,16 +1470,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}

static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
	struct scx_sched *sch = scx_root;
	int sticky_cpu = p->scx.sticky_cpu;
	u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;

	if (enq_flags & ENQUEUE_WAKEUP)
		rq->scx.flags |= SCX_RQ_IN_WAKEUP;

	enq_flags |= rq->scx.extra_enq_flags;

	if (sticky_cpu >= 0)
		p->scx.sticky_cpu = -1;

@@ -3908,8 +3907,8 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
	 * consider offloading iff the total queued duration is over the
	 * threshold.
	 */
	min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
	if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
	min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
	if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
		return 0;

	raw_spin_rq_lock_irq(rq);
@@ -4137,7 +4136,7 @@ static void scx_bypass(bool bypass)
		WARN_ON_ONCE(scx_bypass_depth <= 0);
		if (scx_bypass_depth != 1)
			goto unlock;
		WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
		WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
		bypass_timestamp = ktime_get_ns();
		if (sch)
			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -5259,13 +5258,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
	if (!READ_ONCE(helper)) {
		mutex_lock(&helper_mutex);
		if (!helper) {
			helper = kthread_run_worker(0, "scx_enable_helper");
			if (IS_ERR_OR_NULL(helper)) {
				helper = NULL;
			struct kthread_worker *w =
				kthread_run_worker(0, "scx_enable_helper");
			if (IS_ERR_OR_NULL(w)) {
				mutex_unlock(&helper_mutex);
				return -ENOMEM;
			}
			sched_set_fifo(helper->task);
			sched_set_fifo(w->task);
			WRITE_ONCE(helper, w);
		}
		mutex_unlock(&helper_mutex);
	}
+98 −16
Original line number Diff line number Diff line
@@ -1035,26 +1035,108 @@ static const char *scx_enable_state_str[] = {
};

/*
 * sched_ext_entity->ops_state
 *
 * Used to track the task ownership between the SCX core and the BPF scheduler.
 * State transitions look as follows:
 *
 * NONE -> QUEUEING -> QUEUED -> DISPATCHING
 *   ^              |                 |
 *   |              v                 v
 *   \-------------------------------/
 *
 * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
 * sites for explanations on the conditions being waited upon and why they are
 * safe. Transitions out of them into NONE or QUEUED must store_release and the
 * waiters should load_acquire.
 *
 * Tracking scx_ops_state enables sched_ext core to reliably determine whether
 * any given task can be dispatched by the BPF scheduler at all times and thus
 * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
 * to try to dispatch any task anytime regardless of its state as the SCX core
 * can safely reject invalid dispatches.
 * Task Ownership State Machine (sched_ext_entity->ops_state)
 *
 * The sched_ext core uses this state machine to track task ownership
 * between the SCX core and the BPF scheduler. This allows the BPF
 * scheduler to dispatch tasks without strict ordering requirements, while
 * the SCX core safely rejects invalid dispatches.
 *
 * State Transitions
 *
 *       .------------> NONE (owned by SCX core)
 *       |               |           ^
 *       |       enqueue |           | direct dispatch
 *       |               v           |
 *       |           QUEUEING -------'
 *       |               |
 *       |       enqueue |
 *       |     completes |
 *       |               v
 *       |            QUEUED (owned by BPF scheduler)
 *       |               |
 *       |      dispatch |
 *       |               |
 *       |               v
 *       |          DISPATCHING
 *       |               |
 *       |      dispatch |
 *       |     completes |
 *       `---------------'
 *
 * State Descriptions
 *
 * - %SCX_OPSS_NONE:
 *     Task is owned by the SCX core. It's either on a run queue, running,
 *     or being manipulated by the core scheduler. The BPF scheduler has no
 *     claim on this task.
 *
 * - %SCX_OPSS_QUEUEING:
 *     Transitional state while transferring a task from the SCX core to
 *     the BPF scheduler. The task's rq lock is held during this state.
 *     Since QUEUEING is both entered and exited under the rq lock, dequeue
 *     can never observe this state (it would be a BUG). When finishing a
 *     dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion
 *     path busy-waits for it to leave this state (via wait_ops_state())
 *     before retrying.
 *
 * - %SCX_OPSS_QUEUED:
 *     Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue)
 *     and the BPF scheduler is responsible for dispatching it. A QSEQ
 *     (queue sequence number) is embedded in this state to detect
 *     dispatch/dequeue races: if a task is dequeued and re-enqueued, the
 *     QSEQ changes and any in-flight dispatch operations targeting the old
 *     QSEQ are safely ignored.
 *
 * - %SCX_OPSS_DISPATCHING:
 *     Transitional state while transferring a task from the BPF scheduler
 *     back to the SCX core. This state indicates the BPF scheduler has
 *     selected the task for execution. When dequeue needs to take the task
 *     off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path
 *     busy-waits for it to leave this state (via wait_ops_state()) before
 *     proceeding. Exits to %SCX_OPSS_NONE when dispatch completes.
 *
 * Memory Ordering
 *
 * Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into
 * %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release()
 * and waiters must use atomic_long_read_acquire(). This ensures proper
 * synchronization between concurrent operations.
 *
 * Cross-CPU Task Migration
 *
 * When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply
 * grab the target CPU's rq lock because a concurrent dequeue might be
 * waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock
 * (deadlock).
 *
 * The sched_ext core uses a "lock dancing" protocol coordinated by
 * p->scx.holding_cpu. When moving a task to a different rq:
 *
 *   1. Verify task can be moved (CPU affinity, migration_disabled, etc.)
 *   2. Set p->scx.holding_cpu to the current CPU
 *   3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING
 *      is set, so clearing DISPATCHING first prevents the circular wait
 *      (safe to lock the rq we need)
 *   4. Unlock the current CPU's rq
 *   5. Lock src_rq (where the task currently lives)
 *   6. Verify p->scx.holding_cpu == current CPU; if not, dequeue won the
 *      race (dequeue clears holding_cpu to -1 when it takes the task), and
 *      the migration is aborted
 *   7. If src_rq == dst_rq: clear holding_cpu and enqueue directly
 *      into dst_rq's local DSQ (no lock swap needed)
 *   8. Otherwise: call move_remote_task_to_local_dsq(), which releases
 *      src_rq, locks dst_rq, and performs the deactivate/activate
 *      migration cycle (dst_rq is held on return)
 *   9. Unlock dst_rq and re-lock the current CPU's rq to restore
 *      the lock state expected by the caller
 *
 * If any verification fails, abort the migration.
 *
 * This state tracking allows the BPF scheduler to try to dispatch any task
 * at any time regardless of its state. The SCX core can safely
 * reject/ignore invalid dispatches, simplifying the BPF scheduler
 * implementation.
 */
enum scx_ops_state {
	SCX_OPSS_NONE,		/* owned by the SCX core */
+2 −2
Original line number Diff line number Diff line
@@ -60,11 +60,11 @@ int file_write_long(const char *path, long val)
	char buf[64];
	int ret;

	ret = sprintf(buf, "%lu", val);
	ret = sprintf(buf, "%ld", val);
	if (ret < 0)
		return ret;

	if (write_text(path, buf, sizeof(buf)) <= 0)
	if (write_text(path, buf, ret) <= 0)
		return -1;

	return 0;