sched_ext: Implement sched_ext_ops.cpu_online/offline() (60c27fb5) · Commits · git / linux-nf

kernel/sched/core.c

+4 −0

Original line number	Diff line number	Diff line
		@@ -7984,6 +7984,8 @@ int sched_cpu_activate(unsigned int cpu)
		cpuset_cpu_active();
		}

		scx_rq_activate(rq);

		/*
		* Put the rq online, if not already. This happens:
		*
		@@ -8044,6 +8046,8 @@ int sched_cpu_deactivate(unsigned int cpu)
		}
		rq_unlock_irqrestore(rq, &rf);

		scx_rq_deactivate(rq);

		#ifdef CONFIG_SCHED_SMT
		/*
		* When going down, decrement the number of cores with SMT present.

kernel/sched/ext.c

+149 −7

Original line number	Diff line number	Diff line
		@@ -30,6 +30,29 @@ enum scx_exit_kind {
		SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
		};

		/*
		* An exit code can be specified when exiting with scx_bpf_exit() or
		* scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
		* respectively. The codes are 64bit of the format:
		*
		* Bits: [63 .. 48 47 .. 32 31 .. 0]
		* [ SYS ACT ] [ SYS RSN ] [ USR ]
		*
		* SYS ACT: System-defined exit actions
		* SYS RSN: System-defined exit reasons
		* USR : User-defined exit codes and reasons
		*
		* Using the above, users may communicate intention and context by ORing system
		* actions and/or system reasons with a user-defined exit code.
		*/
		enum scx_exit_code {
		/* Reasons */
		SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,

		/* Actions */
		SCX_ECODE_ACT_RESTART = 1LLU << 48,
		};

		/*
		* scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
		* being disabled.
		@@ -457,7 +480,29 @@ struct sched_ext_ops {
		void (dump_task)(struct scx_dump_ctx ctx, struct task_struct *p);

		/*
		* All online ops must come before ops.init().
		* All online ops must come before ops.cpu_online().
		*/

		/**
		* cpu_online - A CPU became online
		* @cpu: CPU which just came up
		*
		* @cpu just came online. @cpu will not call ops.enqueue() or
		* ops.dispatch(), nor run tasks associated with other CPUs beforehand.
		*/
		void (*cpu_online)(s32 cpu);

		/**
		* cpu_offline - A CPU is going offline
		* @cpu: CPU which is going offline
		*
		* @cpu is going offline. @cpu will not call ops.enqueue() or
		* ops.dispatch(), nor run tasks associated with other CPUs afterwards.
		*/
		void (*cpu_offline)(s32 cpu);

		/*
		* All CPU hotplug ops must come before ops.init().
		*/

		/**
		@@ -496,6 +541,15 @@ struct sched_ext_ops {
		*/
		u32 exit_dump_len;

		/**
		* hotplug_seq - A sequence number that may be set by the scheduler to
		* detect when a hotplug event has occurred during the loading process.
		* If 0, no detection occurs. Otherwise, the scheduler will fail to
		* load if the sequence number does not match @scx_hotplug_seq on the
		* enable path.
		*/
		u64 hotplug_seq;

		/**
		* name - BPF scheduler's name
		*
		@@ -509,7 +563,9 @@ struct sched_ext_ops {
		enum scx_opi {
		SCX_OPI_BEGIN = 0,
		SCX_OPI_NORMAL_BEGIN = 0,
		SCX_OPI_NORMAL_END = SCX_OP_IDX(init),
		SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
		SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
		SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
		SCX_OPI_END = SCX_OP_IDX(init),
		};

		@@ -694,6 +750,7 @@ static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
		static struct scx_exit_info *scx_exit_info;

		static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
		static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);

		/*
		* The maximum amount of time in jiffies that a task may be runnable without
		@@ -1419,11 +1476,7 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)

		static bool scx_rq_online(struct rq *rq)
		{
		#ifdef CONFIG_SMP
		return likely(rq->online);
		#else
		return true;
		#endif
		return likely(rq->scx.flags & SCX_RQ_ONLINE);
		}

		static void do_enqueue_task(struct rq rq, struct task_struct p, u64 enq_flags,
		@@ -1438,6 +1491,11 @@ static void do_enqueue_task(struct rq rq, struct task_struct p, u64 enq_flags,
		if (sticky_cpu == cpu_of(rq))
		goto local_norefill;

		/*
		* If !scx_rq_online(), we already told the BPF scheduler that the CPU
		* is offline and are just running the hotplug path. Don't bother the
		* BPF scheduler.
		*/
		if (!scx_rq_online(rq))
		goto local;

		@@ -2673,6 +2731,42 @@ void __scx_update_idle(struct rq *rq, bool idle)
		#endif
		}

		static void handle_hotplug(struct rq *rq, bool online)
		{
		int cpu = cpu_of(rq);

		atomic_long_inc(&scx_hotplug_seq);

		if (online && SCX_HAS_OP(cpu_online))
		SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu);
		else if (!online && SCX_HAS_OP(cpu_offline))
		SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu);
		else
		scx_ops_exit(SCX_ECODE_ACT_RESTART \| SCX_ECODE_RSN_HOTPLUG,
		"cpu %d going %s, exiting scheduler", cpu,
		online ? "online" : "offline");
		}

		void scx_rq_activate(struct rq *rq)
		{
		handle_hotplug(rq, true);
		}

		void scx_rq_deactivate(struct rq *rq)
		{
		handle_hotplug(rq, false);
		}

		static void rq_online_scx(struct rq *rq)
		{
		rq->scx.flags \|= SCX_RQ_ONLINE;
		}

		static void rq_offline_scx(struct rq *rq)
		{
		rq->scx.flags &= ~SCX_RQ_ONLINE;
		}

		#else /* CONFIG_SMP */

		static bool test_and_clear_cpu_idle(int cpu) { return false; }
		@@ -3104,6 +3198,9 @@ DEFINE_SCHED_CLASS(ext) = {
		.balance = balance_scx,
		.select_task_rq = select_task_rq_scx,
		.set_cpus_allowed = set_cpus_allowed_scx,

		.rq_online = rq_online_scx,
		.rq_offline = rq_offline_scx,
		#endif

		.task_tick = task_tick_scx,
		@@ -3235,10 +3332,18 @@ static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
		}
		SCX_ATTR(nr_rejected);

		static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
		struct kobj_attribute ka, char buf)
		{
		return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
		}
		SCX_ATTR(hotplug_seq);

		static struct attribute *scx_global_attrs[] = {
		&scx_attr_state.attr,
		&scx_attr_switch_all.attr,
		&scx_attr_nr_rejected.attr,
		&scx_attr_hotplug_seq.attr,
		NULL,
		};

		@@ -3941,6 +4046,25 @@ static struct kthread_worker scx_create_rt_helper(const char name)
		return helper;
		}

		static void check_hotplug_seq(const struct sched_ext_ops *ops)
		{
		unsigned long long global_hotplug_seq;

		/*
		* If a hotplug event has occurred between when a scheduler was
		* initialized, and when we were able to attach, exit and notify user
		* space about it.
		*/
		if (ops->hotplug_seq) {
		global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
		if (ops->hotplug_seq != global_hotplug_seq) {
		scx_ops_exit(SCX_ECODE_ACT_RESTART \| SCX_ECODE_RSN_HOTPLUG,
		"expected hotplug seq %llu did not match actual %llu",
		ops->hotplug_seq, global_hotplug_seq);
		}
		}
		}

		static int validate_ops(const struct sched_ext_ops *ops)
		{
		/*
		@@ -4023,6 +4147,10 @@ static int scx_ops_enable(struct sched_ext_ops ops, struct bpf_link link)
		}
		}

		for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
		if (((void (**)(void))ops)[i])
		static_branch_enable_cpuslocked(&scx_has_op[i]);

		cpus_read_unlock();

		ret = validate_ops(ops);
		@@ -4064,6 +4192,8 @@ static int scx_ops_enable(struct sched_ext_ops ops, struct bpf_link link)
		percpu_down_write(&scx_fork_rwsem);
		cpus_read_lock();

		check_hotplug_seq(ops);

		for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
		if (((void (**)(void))ops)[i])
		static_branch_enable_cpuslocked(&scx_has_op[i]);
		@@ -4374,6 +4504,9 @@ static int bpf_scx_init_member(const struct btf_type *t,
		ops->exit_dump_len =
		(u32 )(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
		return 1;
		case offsetof(struct sched_ext_ops, hotplug_seq):
		ops->hotplug_seq = (u64 )(udata + moff);
		return 1;
		}

		return 0;
		@@ -4387,6 +4520,8 @@ static int bpf_scx_check_member(const struct btf_type *t,

		switch (moff) {
		case offsetof(struct sched_ext_ops, init_task):
		case offsetof(struct sched_ext_ops, cpu_online):
		case offsetof(struct sched_ext_ops, cpu_offline):
		case offsetof(struct sched_ext_ops, init):
		case offsetof(struct sched_ext_ops, exit):
		break;
		@@ -4457,6 +4592,8 @@ static s32 init_task_stub(struct task_struct p, struct scx_init_task_args args
		static void exit_task_stub(struct task_struct p, struct scx_exit_task_args args) {}
		static void enable_stub(struct task_struct *p) {}
		static void disable_stub(struct task_struct *p) {}
		static void cpu_online_stub(s32 cpu) {}
		static void cpu_offline_stub(s32 cpu) {}
		static s32 init_stub(void) { return -EINVAL; }
		static void exit_stub(struct scx_exit_info *info) {}

		@@ -4479,6 +4616,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
		.exit_task = exit_task_stub,
		.enable = enable_stub,
		.disable = disable_stub,
		.cpu_online = cpu_online_stub,
		.cpu_offline = cpu_offline_stub,
		.init = init_stub,
		.exit = exit_stub,
		};
		@@ -4719,6 +4858,9 @@ void __init init_sched_ext_class(void)
		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);

		if (cpu_online(cpu))
		cpu_rq(cpu)->scx.flags \|= SCX_RQ_ONLINE;
		}

		register_sysrq_key('S', &sysrq_sched_ext_reset_op);

kernel/sched/ext.h

+4 −0

Original line number	Diff line number	Diff line
		@@ -40,6 +40,8 @@ int scx_fork(struct task_struct *p);
		void scx_post_fork(struct task_struct *p);
		void scx_cancel_fork(struct task_struct *p);
		bool scx_can_stop_tick(struct rq *rq);
		void scx_rq_activate(struct rq *rq);
		void scx_rq_deactivate(struct rq *rq);
		int scx_check_setscheduler(struct task_struct *p, int policy);
		bool task_should_scx(struct task_struct *p);
		void init_sched_ext_class(void);
		@@ -81,6 +83,8 @@ static inline int scx_fork(struct task_struct *p) { return 0; }
		static inline void scx_post_fork(struct task_struct *p) {}
		static inline void scx_cancel_fork(struct task_struct *p) {}
		static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
		static inline void scx_rq_activate(struct rq *rq) {}
		static inline void scx_rq_deactivate(struct rq *rq) {}
		static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
		static inline bool task_on_scx(const struct task_struct *p) { return false; }
		static inline void init_sched_ext_class(void) {}

kernel/sched/sched.h

+6 −0

Original line number	Diff line number	Diff line
		@@ -726,6 +726,12 @@ struct cfs_rq {
		#ifdef CONFIG_SCHED_CLASS_EXT
		/* scx_rq->flags, protected by the rq lock */
		enum scx_rq_flags {
		/*
		* A hotplugged CPU starts scheduling before rq_online_scx(). Track
		* ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called
		* only while the BPF scheduler considers the CPU to be online.
		*/
		SCX_RQ_ONLINE = 1 << 0,
		SCX_RQ_BALANCING = 1 << 1,
		SCX_RQ_CAN_STOP_TICK = 1 << 2,
		};

tools/sched_ext/include/scx/compat.h

+26 −0

Original line number	Diff line number	Diff line
		@@ -8,6 +8,9 @@
		#define __SCX_COMPAT_H

		#include <bpf/btf.h>
		#include <fcntl.h>
		#include <stdlib.h>
		#include <unistd.h>

		struct btf *__COMPAT_vmlinux_btf __attribute__((weak));

		@@ -106,6 +109,28 @@ static inline bool __COMPAT_struct_has_field(const char type, const char field
		#define SCX_OPS_SWITCH_PARTIAL \
		__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")

		static inline long scx_hotplug_seq(void)
		{
		int fd;
		char buf[32];
		ssize_t len;
		long val;

		fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
		if (fd < 0)
		return -ENOENT;

		len = read(fd, buf, sizeof(buf) - 1);
		SCX_BUG_ON(len <= 0, "read failed (%ld)", len);
		buf[len] = 0;
		close(fd);

		val = strtoul(buf, NULL, 10);
		SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);

		return val;
		}

		/*
		* struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
		* is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
		@@ -123,6 +148,7 @@ static inline bool __COMPAT_struct_has_field(const char type, const char field
		\
		__skel = __scx_name##__open(); \
		SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
		__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
		__skel; \
		})