sched_ext: Implement core-sched support (7b0888b7) · Commits · git / linux-nf

include/linux/sched/ext.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -129,6 +129,9 @@ struct sched_ext_entity {
		struct list_head runnable_node; /* rq->scx.runnable_list */
		unsigned long runnable_at;

		#ifdef CONFIG_SCHED_CORE
		u64 core_sched_at; /* see scx_prio_less() */
		#endif
		u64 ddsp_dsq_id;
		u64 ddsp_enq_flags;

kernel/Kconfig.preempt

+1 −1

Original line number	Diff line number	Diff line
		@@ -135,7 +135,7 @@ config SCHED_CORE

		config SCHED_CLASS_EXT
		bool "Extensible Scheduling Class"
		depends on BPF_SYSCALL && BPF_JIT && !SCHED_CORE
		depends on BPF_SYSCALL && BPF_JIT
		help
		This option enables a new scheduler class sched_ext (SCX), which
		allows scheduling policies to be implemented as BPF programs to

kernel/sched/core.c

+9 −1

Original line number	Diff line number	Diff line
		@@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p)
		if (p->sched_class == &idle_sched_class)
		return MAX_RT_PRIO + NICE_WIDTH; /* 140 */

		return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
		if (task_on_scx(p))
		return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */

		return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
		}

		/*
		@@ -198,6 +201,11 @@ static inline bool prio_less(const struct task_struct *a,
		if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
		return cfs_prio_less(a, b, in_fi);

		#ifdef CONFIG_SCHED_CLASS_EXT
		if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */
		return scx_prio_less(a, b, in_fi);
		#endif

		return false;
		}

kernel/sched/ext.c

+238 −12

Original line number	Diff line number	Diff line
		@@ -344,6 +344,24 @@ struct sched_ext_ops {
		*/
		bool (yield)(struct task_struct from, struct task_struct *to);

		/**
		* core_sched_before - Task ordering for core-sched
		* @a: task A
		* @b: task B
		*
		* Used by core-sched to determine the ordering between two tasks. See
		* Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
		* core-sched.
		*
		* Both @a and @b are runnable and may or may not currently be queued on
		* the BPF scheduler. Should return %true if @a should run before @b, and
		* %false if there's no required ordering or @b should run before @a.
		*
		* If not specified, the default is ordering them according to when they
		* became runnable.
		*/
		bool (core_sched_before)(struct task_struct a, struct task_struct *b);

		/**
		* set_weight - Set task weight
		* @p: task to set weight for
		@@ -625,6 +643,14 @@ enum scx_enq_flags {
		enum scx_deq_flags {
		/* expose select DEQUEUE_* flags as enums */
		SCX_DEQ_SLEEP = DEQUEUE_SLEEP,

		/* high 32bits are SCX specific */

		/*
		* The generic core-sched layer decided to execute the task even though
		* it hasn't been dispatched yet. Dequeue from the BPF side.
		*/
		SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
		};

		enum scx_pick_idle_cpu_flags {
		@@ -1260,6 +1286,49 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
		return -EPROTO;
		}

		/**
		 * touch_core_sched - Update timestamp used for core-sched task ordering
		 * @rq: rq to read clock from, must be locked
		 * @p: task to update the timestamp for
		 *
		 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
		 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
		 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
		 * exhaustion).
		 *
		 * Compiles to a no-op when CONFIG_SCHED_CORE is not set.
		 */
		static void touch_core_sched(struct rq *rq, struct task_struct *p)
		{
		#ifdef CONFIG_SCHED_CORE
			/*
			 * It's okay to update the timestamp spuriously. Use
			 * sched_core_disabled() which is cheaper than enabled().
			 */
			if (!sched_core_disabled())
				p->scx.core_sched_at = rq_clock_task(rq);
		#endif
		}

		/**
		 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
		 * @rq: rq to read clock from, must be locked
		 * @p: task being dispatched
		 *
		 * If the BPF scheduler implements custom core-sched ordering via
		 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
		 * ordering within each local DSQ. This function is called from dispatch paths
		 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
		 */
		static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
		{
			/* These checks run regardless of CONFIG_SCHED_CORE. */
			lockdep_assert_rq_held(rq);
			assert_clock_updated(rq);

		#ifdef CONFIG_SCHED_CORE
			/* Only the custom-ordering case refreshes the timestamp on dispatch. */
			if (SCX_HAS_OP(core_sched_before))
				touch_core_sched(rq, p);
		#endif
		}

		static void update_curr_scx(struct rq *rq)
		{
		struct task_struct *curr = rq->curr;
		@@ -1275,8 +1344,11 @@ static void update_curr_scx(struct rq *rq)
		account_group_exec_runtime(curr, delta_exec);
		cgroup_account_cputime(curr, delta_exec);

		if (curr->scx.slice != SCX_SLICE_INF)
		if (curr->scx.slice != SCX_SLICE_INF) {
		curr->scx.slice -= min(curr->scx.slice, delta_exec);
		if (!curr->scx.slice)
		touch_core_sched(rq, curr);
		}
		}

		static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
		@@ -1469,6 +1541,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
		{
		struct scx_dispatch_q *dsq;

		touch_core_sched_dispatch(task_rq(p), p);

		enq_flags \|= (p->scx.ddsp_enq_flags \| SCX_ENQ_CLEAR_OPSS);
		dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
		dispatch_enqueue(dsq, p, enq_flags);
		@@ -1550,12 +1624,19 @@ static void do_enqueue_task(struct rq rq, struct task_struct p, u64 enq_flags,
		return;

		local:
		/*
		* For task-ordering, slice refill must be treated as implying the end
		* of the current slice. Otherwise, the longer @p stays on the CPU, the
		* higher priority it becomes from scx_prio_less()'s POV.
		*/
		touch_core_sched(rq, p);
		p->scx.slice = SCX_SLICE_DFL;
		local_norefill:
		dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
		return;

		global:
		touch_core_sched(rq, p); /* see the comment in local: */
		p->scx.slice = SCX_SLICE_DFL;
		dispatch_enqueue(&scx_dsq_global, p, enq_flags);
		}
		@@ -1619,6 +1700,9 @@ static void enqueue_task_scx(struct rq rq, struct task_struct p, int enq_flags
		if (SCX_HAS_OP(runnable))
		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);

		if (enq_flags & SCX_ENQ_WAKEUP)
		touch_core_sched(rq, p);

		do_enqueue_task(rq, p, enq_flags, sticky_cpu);
		}

		@@ -2106,6 +2190,7 @@ static void finish_dispatch(struct rq rq, struct rq_flags rf,
		struct scx_dispatch_q *dsq;
		unsigned long opss;

		touch_core_sched_dispatch(rq, p);
		retry:
		/*
		* No need for _acquire here. @p is accessed only after a successful
		@@ -2183,8 +2268,8 @@ static void flush_dispatch_buf(struct rq rq, struct rq_flags rf)
		dspc->cursor = 0;
		}

		static int balance_scx(struct rq rq, struct task_struct prev,
		struct rq_flags *rf)
		static int balance_one(struct rq rq, struct task_struct prev,
		struct rq_flags *rf, bool local)
		{
		struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
		bool prev_on_scx = prev->sched_class == &ext_sched_class;
		@@ -2208,7 +2293,7 @@ static int balance_scx(struct rq rq, struct task_struct prev,
		}

		if (prev_on_scx) {
		WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP);
		WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP));
		update_curr_scx(rq);

		/*
		@@ -2220,9 +2305,15 @@ static int balance_scx(struct rq rq, struct task_struct prev,
		*
		* See scx_ops_disable_workfn() for the explanation on the
		* bypassing test.
		*
		* When balancing a remote CPU for core-sched, there won't be a
		* following put_prev_task_scx() call and we don't own
		* %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the
		* same conditions later and pick @rq->curr accordingly.
		*/
		if ((prev->scx.flags & SCX_TASK_QUEUED) &&
		prev->scx.slice && !scx_ops_bypassing()) {
		if (local)
		prev->scx.flags \|= SCX_TASK_BAL_KEEP;
		goto has_tasks;
		}
		@@ -2285,10 +2376,56 @@ static int balance_scx(struct rq rq, struct task_struct prev,
		return has_tasks;
		}

		/*
		 * Balance @rq through balance_one() with @local == true. When core-sched is
		 * enabled, additionally run balance_one() with @local == false on every other
		 * CPU in @rq's SMT mask. Only the result for @rq itself is returned.
		 */
		static int balance_scx(struct rq *rq, struct task_struct *prev,
				       struct rq_flags *rf)
		{
			int ret;

			ret = balance_one(rq, prev, rf, true);

		#ifdef CONFIG_SCHED_SMT
			/*
			 * When core-sched is enabled, this ops.balance() call will be followed
			 * by put_prev_task_scx() and pick_task_scx() on this CPU and
			 * pick_task_scx() on the SMT siblings. Balance the siblings too.
			 */
			if (sched_core_enabled(rq)) {
				const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
				int scpu;

				for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
					struct rq *srq = cpu_rq(scpu);
					struct rq_flags srf;
					struct task_struct *sprev = srq->curr;

					/*
					 * While core-scheduling, rq lock is shared among
					 * siblings but the debug annotations and rq clock
					 * aren't. Do pinning dance to transfer the ownership.
					 */
					WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
					rq_unpin_lock(rq, rf);
					rq_pin_lock(srq, &srf);

					update_rq_clock(srq);
					/* The sibling's return value is intentionally not used. */
					balance_one(srq, sprev, &srf, false);

					rq_unpin_lock(srq, &srf);
					rq_repin_lock(rq, rf);
				}
			}
		#endif
			return ret;
		}

		static void set_next_task_scx(struct rq rq, struct task_struct p, bool first)
		{
		if (p->scx.flags & SCX_TASK_QUEUED) {
		WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
		/*
		* Core-sched might decide to execute @p before it is
		* dispatched. Call ops_dequeue() to notify the BPF scheduler.
		*/
		ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
		dispatch_dequeue(rq, p);
		}

		@@ -2379,7 +2516,8 @@ static void put_prev_task_scx(struct rq rq, struct task_struct p)
		/*
		* If @p has slice left and balance_scx() didn't tag it for
		* keeping, @p is getting preempted by a higher priority
		* scheduler class. Leave it at the head of the local DSQ.
		* scheduler class or core-sched forcing a different task. Leave
		* it at the head of the local DSQ.
		*/
		if (p->scx.slice && !scx_ops_bypassing()) {
		dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
		@@ -2436,6 +2574,84 @@ static struct task_struct pick_next_task_scx(struct rq rq)
		return p;
		}

		#ifdef CONFIG_SCHED_CORE
		/**
		* scx_prio_less - Task ordering for core-sched
		* @a: task A
		* @b: task B
		*
		* Core-sched is implemented as an additional scheduling layer on top of the
		* usual sched_class'es and needs to find out the expected task ordering. For
		* SCX, core-sched calls this function to interrogate the task ordering.
		*
		* Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
		* to implement the default task ordering. The older the timestamp, the higher
		* prority the task - the global FIFO ordering matching the default scheduling
		* behavior.
		*
		* When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
		* implement FIFO ordering within each local DSQ. See pick_task_scx().
		*/
		bool scx_prio_less(const struct task_struct a, const struct task_struct b,
		bool in_fi)
		{
		/*
		* The const qualifiers are dropped from task_struct pointers when
		* calling ops.core_sched_before(). Accesses are controlled by the
		* verifier.
		*/
		if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing())
		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
		(struct task_struct *)a,
		(struct task_struct *)b);
		else
		return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
		}

		/**
		* pick_task_scx - Pick a candidate task for core-sched
		* @rq: rq to pick the candidate task from
		*
		* Core-sched calls this function on each SMT sibling to determine the next
		* tasks to run on the SMT siblings. balance_one() has been called on all
		* siblings and put_prev_task_scx() has been called only for the current CPU.
		*
		* As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look
		* at the first task in the local dsq. @rq->curr has to be considered explicitly
		* to mimic %SCX_TASK_BAL_KEEP.
		*/
		static struct task_struct pick_task_scx(struct rq rq)
		{
		struct task_struct *curr = rq->curr;
		struct task_struct *first = first_local_task(rq);

		if (curr->scx.flags & SCX_TASK_QUEUED) {
		/* is curr the only runnable task? */
		if (!first)
		return curr;

		/*
		* Does curr trump first? We can always go by core_sched_at for
		* this comparison as it represents global FIFO ordering when
		* the default core-sched ordering is used and local-DSQ FIFO
		* ordering otherwise.
		*
		* We can have a task with an earlier timestamp on the DSQ. For
		* example, when a current task is preempted by a sibling
		* picking a different cookie, the task would be requeued at the
		* head of the local DSQ with an earlier timestamp than the
		* core-sched picked next task. Besides, the BPF scheduler may
		* dispatch any tasks to the local DSQ anytime.
		*/
		if (curr->scx.slice && time_before64(curr->scx.core_sched_at,
		first->scx.core_sched_at))
		return curr;
		}

		return first; /* this may be %NULL */
		}
		#endif /* CONFIG_SCHED_CORE */

		static enum scx_cpu_preempt_reason
		preempt_reason_from_class(const struct sched_class *class)
		{
		@@ -2843,13 +3059,15 @@ static void task_tick_scx(struct rq rq, struct task_struct curr, int queued)
		update_curr_scx(rq);

		/*
		* While bypassing, always resched as we can't trust the slice
		* management.
		* While disabling, always resched and refresh core-sched timestamp as
		* we can't trust the slice management or ops.core_sched_before().
		*/
		if (scx_ops_bypassing())
		if (scx_ops_bypassing()) {
		curr->scx.slice = 0;
		else if (SCX_HAS_OP(tick))
		touch_core_sched(rq, curr);
		} else if (SCX_HAS_OP(tick)) {
		SCX_CALL_OP(SCX_KF_REST, tick, curr);
		}

		if (!curr->scx.slice)
		resched_curr(rq);
		@@ -3203,6 +3421,10 @@ DEFINE_SCHED_CLASS(ext) = {
		.rq_offline = rq_offline_scx,
		#endif

		#ifdef CONFIG_SCHED_CORE
		.pick_task = pick_task_scx,
		#endif

		.task_tick = task_tick_scx,

		.switching_to = switching_to_scx,
		@@ -3416,12 +3638,14 @@ bool task_should_scx(struct task_struct *p)
		*
		* c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be
		* trusted. Whenever a tick triggers, the running task is rotated to the tail
		* of the queue.
		* of the queue with core_sched_at touched.
		*
		* d. pick_next_task() suppresses zero slice warning.
		*
		* e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
		* operations.
		*
		* f. scx_prio_less() reverts to the default core_sched_at order.
		*/
		static void scx_ops_bypass(bool bypass)
		{
		@@ -4583,6 +4807,7 @@ static void running_stub(struct task_struct *p) {}
		static void stopping_stub(struct task_struct *p, bool runnable) {}
		static void quiescent_stub(struct task_struct *p, u64 deq_flags) {}
		/* Placeholder for ops.yield in __bpf_ops_sched_ext_ops; always returns false. */
		static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; }
		/* Placeholder for ops.core_sched_before in __bpf_ops_sched_ext_ops; always returns false. */
		static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; }
		static void set_weight_stub(struct task_struct *p, u32 weight) {}
		/* Placeholder for ops.set_cpumask in __bpf_ops_sched_ext_ops; does nothing. */
		static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
		static void update_idle_stub(s32 cpu, bool idle) {}
		@@ -4607,6 +4832,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
		.stopping = stopping_stub,
		.quiescent = quiescent_stub,
		.yield = yield_stub,
		.core_sched_before = core_sched_before_stub,
		.set_weight = set_weight_stub,
		.set_cpumask = set_cpumask_stub,
		.update_idle = update_idle_stub,

kernel/sched/ext.h

+5 −0

Original line number	Diff line number	Diff line
		@@ -70,6 +70,11 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
		for_active_class_range(class, (prev_class) > &ext_sched_class ? \
		&ext_sched_class : (prev_class), (end_class))

		#ifdef CONFIG_SCHED_CORE
		/* Core-sched task ordering for SCX tasks; called from prio_less(). */
		bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
				   bool in_fi);
		#endif

		#else /* CONFIG_SCHED_CLASS_EXT */

		#define scx_enabled() false