Commit 4f8b1228 authored by Tejun Heo
Browse files

sched_ext: Add basic building blocks for nested sub-scheduler dispatching



This is an early-stage partial implementation that demonstrates the core
building blocks for nested sub-scheduler dispatching. While significant
work remains in the enqueue path and other areas, this patch establishes
the fundamental mechanisms needed for hierarchical scheduler operation.

The key building blocks introduced include:

- Private stack support for ops.dispatch() to prevent stack overflow when
  walking down nested schedulers during dispatch operations

- scx_bpf_sub_dispatch() kfunc that allows parent schedulers to trigger
  dispatch operations on their direct child schedulers

- Proper parent-child relationship validation to ensure dispatch requests
  are only made to legitimate child schedulers

- Updated scx_dispatch_sched() to handle both nested and non-nested
  invocations with appropriate kf_mask handling

The qmap scheduler is updated to demonstrate the functionality by calling
scx_bpf_sub_dispatch() on registered child schedulers when it has no
tasks in its own queues.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
parent 25037af7
Loading
Loading
Loading
Loading
+105 −15
Original line number Diff line number Diff line
@@ -2444,8 +2444,14 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
}

static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
			       struct task_struct *prev)
/*
 * One user of this function is scx_bpf_sub_dispatch() which can be called
 * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
 * from the call frame.
 */
static __always_inline bool
scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
		   struct task_struct *prev, bool nested)
{
	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
	int nr_loops = SCX_DSP_MAX_LOOPS;
@@ -2499,8 +2505,23 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
	do {
		dspc->nr_tasks = 0;

		if (nested) {
			/*
			 * If nested, don't update kf_mask as the originating
			 * invocation would already have set it up.
			 */
			SCX_CALL_OP(sch, 0, dispatch, rq, cpu,
				    prev_on_sch ? prev : NULL);
		} else {
			/*
			 * If not nested, stash @prev so that nested invocations
			 * can access it.
			 */
			rq->scx.sub_dispatch_prev = prev;
			SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
				    prev_on_sch ? prev : NULL);
			rq->scx.sub_dispatch_prev = NULL;
		}

		flush_dispatch_buf(sch, rq);

@@ -2541,7 +2562,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,

static int balance_one(struct rq *rq, struct task_struct *prev)
{
	struct scx_sched *sch = scx_root, *pos;
	struct scx_sched *sch = scx_root;
	s32 cpu = cpu_of(rq);

	lockdep_assert_rq_held(rq);
@@ -2585,12 +2606,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
	if (rq->scx.local_dsq.nr)
		goto has_tasks;

	/*
	 * TEMPORARY - Dispatch all scheds. This will be replaced by BPF-driven
	 * hierarchical operation.
	 */
	list_for_each_entry_rcu(pos, &scx_sched_all, all)
		if (scx_dispatch_sched(pos, rq, prev))
	if (scx_dispatch_sched(sch, rq, prev, false))
		goto has_tasks;

	/*
@@ -4942,9 +4958,8 @@ static void scx_sub_disable(struct scx_sched *sch)

	/*
	 * Guarantee forward progress and wait for descendants to be disabled.
	 * To limit
	 * disruptions, $parent is not bypassed. Tasks are fully prepped and
	 * then inserted back into $parent.
	 * To limit disruptions, $parent is not bypassed. Tasks are fully
	 * prepped and then inserted back into $parent.
	 */
	scx_bypass(sch, true);
	drain_descendants(sch);
@@ -6580,6 +6595,20 @@ static int bpf_scx_init_member(const struct btf_type *t,
	return 0;
}

#ifdef CONFIG_EXT_SUB_SCHED
/*
 * Recursion callback installed on ops.dispatch() programs (see the
 * ->recursion_detected assignment in bpf_scx_check_member()). Looks up the
 * scheduler that owns @prog under RCU and, if one is found, raises a
 * scheduler error on it. Nothing is reported when no scheduler is
 * associated with @prog.
 */
static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
{
	struct scx_sched *owner;

	guard(rcu)();

	owner = scx_prog_sched(prog->aux);
	if (likely(owner))
		scx_error(owner, "dispatch recursion detected");
}
#endif	/* CONFIG_EXT_SUB_SCHED */

static int bpf_scx_check_member(const struct btf_type *t,
				const struct btf_member *member,
				const struct bpf_prog *prog)
@@ -6605,6 +6634,22 @@ static int bpf_scx_check_member(const struct btf_type *t,
			return -EINVAL;
	}

#ifdef CONFIG_EXT_SUB_SCHED
	/*
	 * Enable private stack for operations that can nest along the
	 * hierarchy.
	 *
	 * XXX - Ideally, we should only do this for scheds that allow
	 * sub-scheds and sub-scheds themselves but I don't know how to access
	 * struct_ops from here.
	 */
	switch (moff) {
	case offsetof(struct sched_ext_ops, dispatch):
		prog->aux->priv_stack_requested = true;
		prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
	}
#endif	/* CONFIG_EXT_SUB_SCHED */

	return 0;
}

@@ -7583,6 +7628,48 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
			    p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}

#ifdef CONFIG_EXT_SUB_SCHED
/**
 * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
 * @cgroup_id: cgroup ID of the child scheduler to dispatch
 * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
 *
 * Allows a parent scheduler to trigger dispatching on one of its direct
 * child schedulers. The child scheduler runs its dispatch operation to
 * move tasks from dispatch queues to the local runqueue.
 *
 * The child is dispatched on the current CPU's rq as a nested invocation of
 * scx_dispatch_sched(), using the @prev task stashed in
 * rq->scx.sub_dispatch_prev.
 *
 * Returns: the result of the nested scx_dispatch_sched() call on the child.
 * %false is also returned, without dispatching, if the calling program has no
 * associated scheduler, SCX_KF_DISPATCH is not allowed for the caller,
 * @cgroup_id does not resolve to a sub-scheduler, or the resolved scheduler is
 * not a direct child of the caller. The last case additionally raises
 * scx_error() on the caller.
 *
 * NOTE(review): callers visible here treat a %true return as "the dispatch
 * found tasks" rather than plain success - confirm against
 * scx_dispatch_sched().
 */
__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
{
	struct rq *this_rq = this_rq();
	struct scx_sched *parent, *child;

	/* the lookups below and the nested dispatch all run under RCU */
	guard(rcu)();
	parent = scx_prog_sched(aux);
	if (unlikely(!parent))
		return false;

	/* the calling scheduler must be allowed SCX_KF_DISPATCH */
	if (!scx_kf_allowed(parent, SCX_KF_DISPATCH))
		return false;

	child = scx_find_sub_sched(cgroup_id);

	if (unlikely(!child))
		return false;

	/* only direct children may be dispatched; anything else is an error */
	if (unlikely(scx_parent(child) != parent)) {
		scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
			  cgroup_id);
		return false;
	}

	/* nested == true: the call is made with the nested flag set */
	return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
				  true);
}
#endif	/* CONFIG_EXT_SUB_SCHED */

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
@@ -7593,6 +7680,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
#ifdef CONFIG_EXT_SUB_SCHED
BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS)
#endif
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)

static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
+3 −0
Original line number Diff line number Diff line
@@ -805,6 +805,9 @@ struct scx_rq {
	cpumask_var_t		cpus_to_preempt;
	cpumask_var_t		cpus_to_wait;
	unsigned long		kick_sync;

	struct task_struct	*sub_dispatch_prev;

	struct llist_head	deferred_reenq_locals;
	struct balance_callback	deferred_bal_cb;
	struct irq_work		deferred_irq_work;
+1 −0
Original line number Diff line number Diff line
@@ -101,6 +101,7 @@ struct rq *scx_bpf_locked_rq(void) __ksym;
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
bool scx_bpf_sub_dispatch(u64 cgroup_id) __ksym __weak;

/*
 * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
+36 −1
Original line number Diff line number Diff line
@@ -48,6 +48,9 @@ const volatile bool suppress_dump;
u64 nr_highpri_queued;
u32 test_error_cnt;

/*
 * cgroup IDs of the attached sub-schedulers. A slot holding 0 is free:
 * qmap_sub_attach() claims the first zero slot, qmap_sub_detach() zeroes the
 * matching slot, and qmap_dispatch() skips zero entries.
 */
#define MAX_SUB_SCHEDS		8
u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];

UEI_DEFINE(uei);

struct qmap {
@@ -451,6 +454,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
		cpuc->dsp_cnt = 0;
	}

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (sub_sched_cgroup_ids[i] &&
		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
			return;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
@@ -895,8 +904,33 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)

/*
 * ops.sub_attach(): remember a newly attached sub-scheduler so that
 * qmap_dispatch() can later call scx_bpf_sub_dispatch() on it.
 *
 * Stores args->ops->sub_cgroup_id in the first free (zero) slot of
 * sub_sched_cgroup_ids[] and logs the slot index and cgroup path.
 *
 * Returns 0 on success, -ENOSPC if all MAX_SUB_SCHEDS slots are in use.
 */
s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
{
	s32 i;

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		/* a zero entry marks an unused slot */
		if (!sub_sched_cgroup_ids[i]) {
			sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
			bpf_printk("attaching sub-sched[%d] on %s",
				   i, args->cgroup_path);
			return 0;
		}
	}

	/* every slot is already occupied */
	return -ENOSPC;
}

/*
 * ops.sub_detach(): forget a sub-scheduler recorded by qmap_sub_attach().
 *
 * Clears the first sub_sched_cgroup_ids[] slot whose value equals
 * args->ops->sub_cgroup_id and logs the slot index and cgroup path. Nothing
 * happens if no slot matches.
 */
void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
{
	s32 i;

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
			/* zero marks the slot free for a later attach */
			sub_sched_cgroup_ids[i] = 0;
			bpf_printk("detaching sub-sched[%d] on %s",
				   i, args->cgroup_path);
			break;
		}
	}
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
@@ -914,6 +948,7 @@ SCX_OPS_DEFINE(qmap_ops,
	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
	       .sub_attach		= (void *)qmap_sub_attach,
	       .sub_detach		= (void *)qmap_sub_detach,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,