Commit 3539c641 authored by Tejun Heo
Browse files

sched_ext: Implement SCX_OPS_ALLOW_QUEUED_WAKEUP



A task wakeup can be either processed on the waker's CPU or bounced to the
wakee's previous CPU using an IPI (ttwu_queue). Bouncing to the wakee's CPU
avoids the waker's CPU locking and accessing the wakee's rq which can be
expensive across cache and node boundaries.

When ttwu_queue path is taken, select_task_rq() and thus ops.select_cpu()
may be skipped in some cases (racing against the wakee switching out). As
this confused some BPF schedulers, there wasn't a good way for a BPF
scheduler to tell whether idle CPU selection has been skipped, ops.enqueue()
couldn't insert tasks into foreign local DSQs, and the performance
difference on machines with simple topologies was minimal, sched_ext
disabled ttwu_queue.

However, this optimization makes a noticeable difference on more complex
topologies and a BPF scheduler now has an easy way to tell whether
ops.select_cpu() was skipped since 9b671793 ("sched_ext, scx_qmap: Add
and use SCX_ENQ_CPU_SELECTED") and can insert tasks into foreign local DSQs
since 5b26f7b9 ("sched_ext: Allow SCX_DSQ_LOCAL_ON for direct
dispatches").

Implement SCX_OPS_ALLOW_QUEUED_WAKEUP which allows BPF schedulers to choose
to enable ttwu_queue optimization.

v2: Update the patch description and comment re. ops.select_cpu() being
    skipped in some cases as opposed to always as per Neel.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Neel Natu <neelnatu@google.com>
Reported-by: Barret Rhoden <brho@google.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
parent 78e4690d
Loading
Loading
Loading
Loading
+2 −7
Original line number Diff line number Diff line
@@ -3921,13 +3921,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)

static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
	/*
	 * The BPF scheduler may depend on select_task_rq() being invoked during
	 * wakeups. In addition, @p may end up executing on a different CPU
	 * regardless of what happens in the wakeup path making the ttwu_queue
	 * optimization less meaningful. Skip if on SCX.
	 */
	if (task_on_scx(p))
	/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
	if (!scx_allow_ttwu_queue(p))
		return false;

	/*
+26 −6
Original line number Diff line number Diff line
@@ -138,6 +138,22 @@ enum scx_ops_flags {
	 */
	SCX_OPS_ENQ_MIGRATION_DISABLED	= 1LLU << 4,

	/*
	 * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
	 * ops.enqueue() on the ops.select_cpu() selected or the wakee's
	 * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
	 * transfers. When this optimization is enabled, ops.select_cpu() is
	 * skipped in some cases (when racing against the wakee switching out).
	 * As the BPF scheduler may depend on ops.select_cpu() being invoked
	 * during wakeups, queued wakeup is disabled by default.
	 *
	 * If this ops flag is set, queued wakeup optimization is enabled and
	 * the BPF scheduler must be able to handle ops.enqueue() invoked on the
	 * wakee's CPU without preceding ops.select_cpu() even for tasks which
	 * may be executed on multiple CPUs.
	 */
	SCX_OPS_ALLOW_QUEUED_WAKEUP	= 1LLU << 5,

	/*
	 * CPU cgroup support flags
	 */
@@ -147,6 +163,7 @@ enum scx_ops_flags {
				  SCX_OPS_ENQ_LAST |
				  SCX_OPS_ENQ_EXITING |
				  SCX_OPS_ENQ_MIGRATION_DISABLED |
				  SCX_OPS_ALLOW_QUEUED_WAKEUP |
				  SCX_OPS_SWITCH_PARTIAL |
				  SCX_OPS_HAS_CGROUP_WEIGHT,
};
@@ -897,6 +914,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
static struct sched_ext_ops scx_ops;
static bool scx_warned_zero_slice;

DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
@@ -4717,6 +4735,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
	static_branch_disable(&__scx_ops_enabled);
	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
		static_branch_disable(&scx_has_op[i]);
	static_branch_disable(&scx_ops_allow_queued_wakeup);
	static_branch_disable(&scx_ops_enq_last);
	static_branch_disable(&scx_ops_enq_exiting);
	static_branch_disable(&scx_ops_enq_migration_disabled);
@@ -5348,9 +5367,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		if (((void (**)(void))ops)[i])
			static_branch_enable(&scx_has_op[i]);

	if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
		static_branch_enable(&scx_ops_allow_queued_wakeup);
	if (ops->flags & SCX_OPS_ENQ_LAST)
		static_branch_enable(&scx_ops_enq_last);

	if (ops->flags & SCX_OPS_ENQ_EXITING)
		static_branch_enable(&scx_ops_enq_exiting);
	if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+10 −0
Original line number Diff line number Diff line
@@ -8,6 +8,8 @@
 */
#ifdef CONFIG_SCHED_CLASS_EXT

DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);

void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p)
	return scx_enabled() && p->sched_class == &ext_sched_class;
}

/*
 * scx_allow_ttwu_queue - may the queued wakeup (ttwu_queue) path be used for @p?
 * @p: task being woken up
 *
 * Returns %true when any of the following holds:
 *  - scx_enabled() is false,
 *  - the scx_ops_allow_queued_wakeup static key is enabled (it is turned on
 *    when the ops flags contain SCX_OPS_ALLOW_QUEUED_WAKEUP),
 *  - @p's sched_class is not ext_sched_class.
 *
 * Returns %false otherwise.
 */
static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
{
	if (!scx_enabled())
		return true;

	/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
	if (static_branch_likely(&scx_ops_allow_queued_wakeup))
		return true;

	return p->sched_class != &ext_sched_class;
}

#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
		   bool in_fi);
@@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {}
static inline void scx_rq_deactivate(struct rq *rq) {}
static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
static inline bool task_on_scx(const struct task_struct *p) { return false; }
/* Stub counterpart of scx_allow_ttwu_queue(): always permits queued wakeup. */
static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
{
	return true;
}
static inline void init_sched_ext_class(void) {}

#endif	/* CONFIG_SCHED_CLASS_EXT */