sched_ext: Use dynamic allocation for scx_sched (17108735) · Commits · git / linux-net

kernel/sched/ext.c

+91 −66

Original line number	Diff line number	Diff line
		@@ -752,6 +752,9 @@ struct sched_ext_ops {
		* BPF scheduler is enabled.
		*/
		char name[SCX_OPS_NAME_LEN];

		/* internal use only, must be NULL */
		void *priv;
		};

		enum scx_opi {
		@@ -772,7 +775,8 @@ struct scx_sched {
		atomic_t exit_kind;
		struct scx_exit_info *exit_info;

		struct kobject *kobj;
		struct kobject kobj;
		struct rcu_work rcu_work;
		};

		enum scx_wake_flags {
		@@ -933,11 +937,7 @@ enum scx_ops_state {
		#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
		#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)

		static struct scx_sched __scx_root = {
		.exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
		};

		static struct scx_sched *scx_root = &__scx_root;
		static struct scx_sched __rcu *scx_root;

		/*
		* During exit, a task may schedule after losing its PIDs. When disabling the
		@@ -4417,9 +4417,23 @@ static const struct attribute_group scx_global_attr_group = {
		.attrs = scx_global_attrs,
		};

		static void free_exit_info(struct scx_exit_info *ei);

		static void scx_sched_free_rcu_work(struct work_struct *work)
		{
		struct rcu_work *rcu_work = to_rcu_work(work);
		struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);

		free_exit_info(sch->exit_info);
		kfree(sch);
		}

		static void scx_kobj_release(struct kobject *kobj)
		{
		kfree(kobj);
		struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);

		INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
		queue_rcu_work(system_unbound_wq, &sch->rcu_work);
		}

		static ssize_t scx_attr_ops_show(struct kobject *kobj,
		@@ -4709,14 +4723,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)

		static void scx_disable_workfn(struct kthread_work *work)
		{
		struct scx_exit_info *ei = scx_root->exit_info;
		struct scx_sched *sch = scx_root;
		struct scx_exit_info *ei = sch->exit_info;
		struct scx_task_iter sti;
		struct task_struct *p;
		struct rhashtable_iter rht_iter;
		struct scx_dispatch_q *dsq;
		int kind, cpu;

		kind = atomic_read(&scx_root->exit_kind);
		kind = atomic_read(&sch->exit_kind);
		while (true) {
		/*
		* NONE indicates that a new scx_ops has been registered since
		@@ -4725,7 +4740,7 @@ static void scx_disable_workfn(struct kthread_work *work)
		*/
		if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
		return;
		if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
		break;
		}
		ei->kind = kind;
		@@ -4740,7 +4755,7 @@ static void scx_disable_workfn(struct kthread_work *work)
		break;
		case SCX_DISABLED:
		pr_warn("sched_ext: ops error detected without ops (%s)\n",
		scx_root->exit_info->msg);
		sch->exit_info->msg);
		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
		goto done;
		default:
		@@ -4807,41 +4822,43 @@ static void scx_disable_workfn(struct kthread_work *work)

		/* no task is on scx, turn off all the switches and flush in-progress calls */
		static_branch_disable(&__scx_enabled);
		bitmap_zero(scx_root->has_op, SCX_OPI_END);
		bitmap_zero(sch->has_op, SCX_OPI_END);
		scx_idle_disable();
		synchronize_rcu();

		if (ei->kind >= SCX_EXIT_ERROR) {
		pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
		scx_root->ops.name, ei->reason);
		sch->ops.name, ei->reason);

		if (ei->msg[0] != '\0')
		pr_err("sched_ext: %s: %s\n",
		scx_root->ops.name, ei->msg);
		pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
		#ifdef CONFIG_STACKTRACE
		stack_trace_print(ei->bt, ei->bt_len, 2);
		#endif
		} else {
		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
		scx_root->ops.name, ei->reason);
		sch->ops.name, ei->reason);
		}

		if (scx_root->ops.exit)
		if (sch->ops.exit)
		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);

		cancel_delayed_work_sync(&scx_watchdog_work);

		/*
		* Delete the kobject from the hierarchy eagerly in addition to just
		* dropping a reference. Otherwise, if the object is deleted
		* asynchronously, sysfs could observe an object of the same name still
		* in the hierarchy when another scheduler is loaded.
		* scx_root clearing must be inside cpus_read_lock(). See
		* handle_hotplug().
		*/
		kobject_del(scx_root->kobj);
		kobject_put(scx_root->kobj);
		scx_root->kobj = NULL;
		cpus_read_lock();
		RCU_INIT_POINTER(scx_root, NULL);
		cpus_read_unlock();

		memset(&scx_root->ops, 0, sizeof(scx_root->ops));
		/*
		* Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
		* could observe an object of the same name still in the hierarchy when
		* the next scheduler is loaded.
		*/
		kobject_del(&sch->kobj);

		rhashtable_walk_enter(&dsq_hash, &rht_iter);
		do {
		@@ -4858,9 +4875,6 @@ static void scx_disable_workfn(struct kthread_work *work)
		scx_dsp_ctx = NULL;
		scx_dsp_max_batch = 0;

		free_exit_info(scx_root->exit_info);
		scx_root->exit_info = NULL;

		mutex_unlock(&scx_enable_mutex);

		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
		@@ -4885,14 +4899,19 @@ static void schedule_scx_disable_work(void)
		static void scx_disable(enum scx_exit_kind kind)
		{
		int none = SCX_EXIT_NONE;
		struct scx_sched *sch;

		if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
		kind = SCX_EXIT_ERROR;

		atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);

		rcu_read_lock();
		sch = rcu_dereference(scx_root);
		if (sch) {
		atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
		schedule_scx_disable_work();
		}
		rcu_read_unlock();
		}

		static void dump_newline(struct seq_buf *s)
		{
		@@ -5288,6 +5307,7 @@ static int validate_ops(const struct sched_ext_ops *ops)

		static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		{
		struct scx_sched *sch;
		struct scx_task_iter sti;
		struct task_struct *p;
		unsigned long timeout;
		@@ -5351,33 +5371,33 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		goto err_unlock;
		}

		scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
		if (!scx_root->kobj) {
		sch = kzalloc(sizeof(*sch), GFP_KERNEL);
		if (!sch) {
		ret = -ENOMEM;
		goto err_unlock;
		}

		scx_root->kobj->kset = scx_kset;
		ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
		if (ret < 0)
		goto err;

		scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
		if (!scx_root->exit_info) {
		sch->exit_info = alloc_exit_info(ops->exit_dump_len);
		if (!sch->exit_info) {
		ret = -ENOMEM;
		goto err_del;
		goto err_free;
		}

		sch->kobj.kset = scx_kset;
		ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
		if (ret < 0)
		goto err_free;

		atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
		sch->ops = *ops;
		ops->priv = sch;

		/*
		* Set scx_ops, transition to ENABLING and clear exit info to arm the
		* disable path. Failure triggers full disabling from here on.
		* Transition to ENABLING and clear exit info to arm the disable path.
		* Failure triggers full disabling from here on.
		*/
		scx_root->ops = *ops;

		WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);

		atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
		scx_root->warned_zero_slice = false;
		WARN_ON_ONCE(scx_root);

		atomic_long_set(&scx_nr_rejected, 0);

		@@ -5390,9 +5410,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		*/
		cpus_read_lock();

		/*
		* Make the scheduler instance visible. Must be inside cpus_read_lock().
		* See handle_hotplug().
		*/
		rcu_assign_pointer(scx_root, sch);

		scx_idle_enable(ops);

		if (scx_root->ops.init) {
		if (sch->ops.init) {
		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
		if (ret) {
		ret = ops_sanitize_err("init", ret);
		@@ -5404,7 +5430,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)

		for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
		if (((void (**)(void))ops)[i])
		set_bit(i, scx_root->has_op);
		set_bit(i, sch->has_op);

		check_hotplug_seq(ops);
		scx_idle_update_selcpu_topology(ops);
		@@ -5445,10 +5471,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)

		for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
		if (((void (**)(void))ops)[i])
		set_bit(i, scx_root->has_op);
		set_bit(i, sch->has_op);

		if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
		scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
		if (sch->ops.cpu_acquire || sch->ops.cpu_release)
		sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;

		/*
		* Lock out forks, cgroup on/offlining and moves before opening the
		@@ -5547,7 +5573,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		scx_bypass(false);

		if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
		WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
		WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
		goto err_disable;
		}

		@@ -5555,23 +5581,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		static_branch_enable(&__scx_switched_all);

		pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
		scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
		kobject_uevent(scx_root->kobj, KOBJ_ADD);
		sch->ops.name, scx_switched_all() ? "" : " (partial)");
		kobject_uevent(&sch->kobj, KOBJ_ADD);
		mutex_unlock(&scx_enable_mutex);

		atomic_long_inc(&scx_enable_seq);

		return 0;

		err_del:
		kobject_del(scx_root->kobj);
		err:
		kobject_put(scx_root->kobj);
		scx_root->kobj = NULL;
		if (scx_root->exit_info) {
		free_exit_info(scx_root->exit_info);
		scx_root->exit_info = NULL;
		}
		err_free:
		if (sch->exit_info)
		free_exit_info(sch->exit_info);
		kfree(sch);
		err_unlock:
		mutex_unlock(&scx_enable_mutex);
		return ret;
		@@ -5589,7 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		* is notified through ops.exit() with all the details.
		*
		* Flush scx_disable_work to ensure that error is reported before init
		* completion.
		* completion. sch's base reference will be put by bpf_scx_unreg().
		*/
		scx_error("scx_enable() failed (%d)", ret);
		kthread_flush_work(&scx_disable_work);
		@@ -5741,8 +5762,12 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)

		static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
		{
		struct sched_ext_ops *ops = kdata;
		struct scx_sched *sch = ops->priv;

		scx_disable(SCX_EXIT_UNREG);
		kthread_flush_work(&scx_disable_work);
		kobject_put(&sch->kobj);
		}

		static int bpf_scx_init(struct btf *btf)