Commit 61debc25 authored by Tejun Heo
Browse files

sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode



Bypass mode routes tasks through fallback dispatch queues. Originally a single
global DSQ, b7b3b2db ("sched_ext: Split the global DSQ per NUMA node")
changed this to per-node DSQs to resolve NUMA-related livelocks.

Dan Schatzberg found per-node DSQs can still livelock when many threads are
pinned to different small CPU subsets: each CPU must scan many incompatible
tasks to find runnable ones, causing severe contention with high CPU counts.

Switch to per-CPU bypass DSQs. Each task queues on its current CPU. Default
idle CPU selection and direct dispatch handle most cases well.

This introduces a failure mode when tasks concentrate on one CPU in
over-saturated systems. If the BPF scheduler severely skews placement before
triggering bypass, that CPU's queue may be too long to drain, causing RCU
stalls. A load balancer in a future patch will address this. The bypass DSQ is
separate from local DSQ to enable load balancing: local DSQs use rq locks,
preventing efficient scanning and transfer across CPUs, especially problematic
when systems are already contended.

v2: Clarified why bypass DSQ is separate from local DSQ (Andrea Righi).

Reported-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
parent 3546119f
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -57,6 +57,7 @@ enum scx_dsq_id_flags {
	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};
+13 −3
Original line number Diff line number Diff line
@@ -1298,7 +1298,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,

	if (scx_rq_bypassing(rq)) {
		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
		goto global;
		goto bypass;
	}

	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1356,6 +1356,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
global:
	dsq = find_global_dsq(sch, p);
	goto enqueue;
bypass:
	dsq = &task_rq(p)->scx.bypass_dsq;
	goto enqueue;

enqueue:
	/*
@@ -2154,8 +2157,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
	if (consume_global_dsq(sch, rq))
		goto has_tasks;

	if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
	    scx_rq_bypassing(rq) || !scx_rq_online(rq))
	if (scx_rq_bypassing(rq)) {
		if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
			goto has_tasks;
		else
			goto no_tasks;
	}

	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
		goto no_tasks;

	dspc->rq = rq;
@@ -5371,6 +5380,7 @@ void __init init_sched_ext_class(void)
		int  n = cpu_to_node(cpu);

		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
		init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
		INIT_LIST_HEAD(&rq->scx.runnable_list);
		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);

+1 −0
Original line number Diff line number Diff line
@@ -808,6 +808,7 @@ struct scx_rq {
	struct balance_callback	deferred_bal_cb;
	struct irq_work		deferred_irq_work;
	struct irq_work		kick_cpus_irq_work;
	struct scx_dispatch_q	bypass_dsq;
};
#endif /* CONFIG_SCHED_CLASS_EXT */