Commit 6a8dab04 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched_ext-for-7.0-rc2-fixes' of...

Merge tag 'sched_ext-for-7.0-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Fix starvation of scx_enable() under fair-class saturation by
   offloading the enable path to an RT kthread

 - Fix out-of-bounds access in idle mask initialization on systems with
   non-contiguous NUMA node IDs

 - Fix a preemption window during scheduler exit and a refcount
   underflow in cgroup init error path

 - Fix SCX_EFLAG_INITIALIZED being a no-op flag

 - Add READ_ONCE() annotations for KCSAN-clean lockless accesses and
   replace naked scx_root dereferences with container_of() in kobject
   callbacks

 - Tooling and selftest fixes: compilation issues with clang 17,
   strtoul() misuse, unused options cleanup, and Kconfig sync

* tag 'sched_ext-for-7.0-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Fix starvation of scx_enable() under fair-class saturation
  sched_ext: Remove redundant css_put() in scx_cgroup_init()
  selftests/sched_ext: Fix peek_dsq.bpf.c compile error for clang 17
  selftests/sched_ext: Add -fms-extensions to bpf build flags
  tools/sched_ext: Add -fms-extensions to bpf build flags
  sched_ext: Use READ_ONCE() for plain reads of scx_watchdog_timeout
  sched_ext: Replace naked scx_root dereferences in kobject callbacks
  sched_ext: Use READ_ONCE() for the read side of dsq->nr update
  tools/sched_ext: fix strtoul() misuse in scx_hotplug_seq()
  sched_ext: Fix SCX_EFLAG_INITIALIZED being a no-op flag
  sched_ext: Fix out-of-bounds access in scx_idle_init_masks()
  sched_ext: Disable preemption between scx_claim_exit() and kicking helper work
  tools/sched_ext: Add Kconfig to sync with upstream
  tools/sched_ext: Sync README.md Kconfig with upstream scx
  selftests/sched_ext: Remove duplicated unistd.h include in rt_stall.c
  tools/sched_ext: scx_sdt: Remove unused '-f' option
  tools/sched_ext: scx_central: Remove unused '-p' option
  selftests/sched_ext: Fix unused-result warning for read()
  selftests/sched_ext: Abort test loop on signal
parents c44db6c8 b06ccbab
Loading
Loading
Loading
Loading
+83 −18
Original line number Diff line number Diff line
@@ -976,8 +976,12 @@ static bool scx_dsq_priq_less(struct rb_node *node_a,

static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
{
	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
	WRITE_ONCE(dsq->nr, dsq->nr + delta);
	/*
	 * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE()
	 * on the read side and WRITE_ONCE() on the write side to properly
	 * annotate the concurrent lockless access and avoid KCSAN warnings.
	 */
	WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta);
}

static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
@@ -2735,7 +2739,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
		unsigned long last_runnable = p->scx.runnable_at;

		if (unlikely(time_after(jiffies,
					last_runnable + scx_watchdog_timeout))) {
					last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
			u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);

			scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -2763,7 +2767,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
		cond_resched();
	}
	queue_delayed_work(system_unbound_wq, to_delayed_work(work),
			   scx_watchdog_timeout / 2);
			   READ_ONCE(scx_watchdog_timeout) / 2);
}

void scx_tick(struct rq *rq)
@@ -3585,7 +3589,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
				      css->cgroup, &args);
		if (ret) {
			css_put(css);
			scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
			return ret;
		}
@@ -3708,7 +3711,9 @@ static void scx_kobj_release(struct kobject *kobj)
static ssize_t scx_attr_ops_show(struct kobject *kobj,
				 struct kobj_attribute *ka, char *buf)
{
	return sysfs_emit(buf, "%s\n", scx_root->ops.name);
	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);

	return sysfs_emit(buf, "%s\n", sch->ops.name);
}
SCX_ATTR(ops);

@@ -3752,7 +3757,9 @@ static const struct kobj_type scx_ktype = {

static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
	return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
	const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);

	return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
}

static const struct kset_uevent_ops scx_uevent_ops = {
@@ -4423,10 +4430,19 @@ static void scx_disable_workfn(struct kthread_work *work)
	scx_bypass(false);
}

/*
 * Claim the exit on @sch. The caller must ensure that the helper kthread work
 * is kicked before the current task can be preempted. Once exit_kind is
 * claimed, scx_error() can no longer trigger, so if the current task gets
 * preempted and the BPF scheduler fails to schedule it back, the helper work
 * will never be kicked and the whole system can wedge.
 */
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
	int none = SCX_EXIT_NONE;

	lockdep_assert_preemption_disabled();

	if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
		return false;

@@ -4449,6 +4465,7 @@ static void scx_disable(enum scx_exit_kind kind)
	rcu_read_lock();
	sch = rcu_dereference(scx_root);
	if (sch) {
		guard(preempt)();
		scx_claim_exit(sch, kind);
		kthread_queue_work(sch->helper, &sch->disable_work);
	}
@@ -4771,6 +4788,8 @@ static bool scx_vexit(struct scx_sched *sch,
{
	struct scx_exit_info *ei = sch->exit_info;

	guard(preempt)();

	if (!scx_claim_exit(sch, kind))
		return false;

@@ -4955,20 +4974,30 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
	return 0;
}

static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
/*
 * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
 * starvation. During the READY -> ENABLED task switching loop, the calling
 * thread's sched_class gets switched from fair to ext. As fair has higher
 * priority than ext, the calling thread can be indefinitely starved under
 * fair-class saturation, leading to a system hang.
 */
/* Request handed from scx_enable() to scx_enable_workfn() via a kthread work item. */
struct scx_enable_cmd {
	/* Work item; scx_enable_workfn() recovers the command with container_of(). */
	struct kthread_work	work;
	/* The ops passed in by the scx_enable() caller. */
	struct sched_ext_ops	*ops;
	/* Result stored by scx_enable_workfn() and returned by scx_enable(). */
	int			ret;
};

static void scx_enable_workfn(struct kthread_work *work)
{
	struct scx_enable_cmd *cmd =
		container_of(work, struct scx_enable_cmd, work);
	struct sched_ext_ops *ops = cmd->ops;
	struct scx_sched *sch;
	struct scx_task_iter sti;
	struct task_struct *p;
	unsigned long timeout;
	int i, cpu, ret;

	if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
			   cpu_possible_mask)) {
		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
		return -EINVAL;
	}

	mutex_lock(&scx_enable_mutex);

	if (scx_enable_state() != SCX_DISABLED) {
@@ -5060,7 +5089,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
	WRITE_ONCE(scx_watchdog_timeout, timeout);
	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
			   scx_watchdog_timeout / 2);
			   READ_ONCE(scx_watchdog_timeout) / 2);

	/*
	 * Once __scx_enabled is set, %current can be switched to SCX anytime.
@@ -5185,13 +5214,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)

	atomic_long_inc(&scx_enable_seq);

	return 0;
	cmd->ret = 0;
	return;

err_free_ksyncs:
	free_kick_syncs();
err_unlock:
	mutex_unlock(&scx_enable_mutex);
	return ret;
	cmd->ret = ret;
	return;

err_disable_unlock_all:
	scx_cgroup_unlock();
@@ -5210,7 +5241,41 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
	 */
	scx_error(sch, "scx_enable() failed (%d)", ret);
	kthread_flush_work(&sch->disable_work);
	return 0;
	cmd->ret = 0;
}

/*
 * Run scx_enable_workfn() for @ops on a lazily created helper kthread worker
 * and wait for it to finish. @link is not used here.
 *
 * Returns -EINVAL when the HK_TYPE_DOMAIN housekeeping cpumask differs from
 * cpu_possible_mask, -ENOMEM when the helper worker cannot be created, and
 * otherwise the value scx_enable_workfn() stored in the command's ->ret.
 */
static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
	static struct kthread_worker *helper;
	static DEFINE_MUTEX(helper_mutex);
	struct kthread_worker *worker;
	struct scx_enable_cmd cmd;

	if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
			   cpu_possible_mask)) {
		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
		return -EINVAL;
	}

	/*
	 * Look up or create the helper entirely under helper_mutex. The worker
	 * is built in a local and stored in @helper only after it has been
	 * validated and switched to FIFO, so a concurrent caller can never
	 * observe an error pointer or a worker that is not yet FIFO. Enabling
	 * is a slow path, so always taking the mutex is fine.
	 */
	mutex_lock(&helper_mutex);
	worker = helper;
	if (!worker) {
		worker = kthread_run_worker(0, "scx_enable_helper");
		if (IS_ERR_OR_NULL(worker)) {
			mutex_unlock(&helper_mutex);
			return -ENOMEM;
		}
		sched_set_fifo(worker->task);
		helper = worker;
	}
	mutex_unlock(&helper_mutex);

	kthread_init_work(&cmd.work, scx_enable_workfn);
	cmd.ops = ops;

	/*
	 * @cmd lives on this stack frame, so the work must be flushed before
	 * returning; the flush also makes cmd.ret safe to read.
	 */
	kthread_queue_work(worker, &cmd.work);
	kthread_flush_work(&cmd.work);
	return cmd.ret;
}


+2 −3
Original line number Diff line number Diff line
@@ -663,9 +663,8 @@ void scx_idle_init_masks(void)
	BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
	BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));

	/* Allocate per-node idle cpumasks */
	scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks,
					   num_possible_nodes());
	/* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */
	scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids);
	BUG_ON(!scx_idle_node_masks);

	for_each_node(i) {
+1 −1
Original line number Diff line number Diff line
@@ -74,7 +74,7 @@ enum scx_exit_flags {
	 * info communication. The following flag indicates whether ops.init()
	 * finished successfully.
	 */
	SCX_EFLAG_INITIALIZED,
	SCX_EFLAG_INITIALIZED   = 1LLU << 0,
};

/*
+61 −0
Original line number Diff line number Diff line
# sched-ext mandatory options
#
CONFIG_BPF=y
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_DEBUG_INFO_BTF=y
CONFIG_BPF_JIT_ALWAYS_ON=y
CONFIG_BPF_JIT_DEFAULT_ON=y
CONFIG_SCHED_CLASS_EXT=y

# Required by some rust schedulers (e.g. scx_p2dq)
#
CONFIG_KALLSYMS_ALL=y

# Required on arm64
#
# CONFIG_DEBUG_INFO_REDUCED is not set

# LAVD tracks futexes to give an additional time slice to the futex holder
# (i.e., avoiding lock holder preemption) for better system-wide progress.
# LAVD first tries to use ftrace to trace futex function calls.
# If that is not available, it tries to use a tracepoint.
CONFIG_FUNCTION_TRACER=y

# Enable scheduling debugging
#
CONFIG_SCHED_DEBUG=y

# Enable extra scheduling features (for better code coverage while testing
# the schedulers)
#
CONFIG_SCHED_AUTOGROUP=y
CONFIG_SCHED_CORE=y
CONFIG_SCHED_MC=y

# Enable a fully preemptible kernel for better test coverage of the schedulers
#
# CONFIG_PREEMPT_NONE is not set
# CONFIG_PREEMPT_VOLUNTARY is not set
CONFIG_PREEMPT=y
CONFIG_PREEMPT_DYNAMIC=y

# Additional debugging information (useful to catch potential locking issues)
CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_PROVE_LOCKING=y

# Bpftrace headers (for additional debug info)
CONFIG_BPF_EVENTS=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_DYNAMIC_FTRACE=y
CONFIG_KPROBES=y
CONFIG_KPROBE_EVENTS=y
CONFIG_UPROBES=y
CONFIG_UPROBE_EVENTS=y
CONFIG_DEBUG_FS=y

# Enable access to kernel configuration and headers at runtime
CONFIG_IKHEADERS=y
CONFIG_IKCONFIG_PROC=y
CONFIG_IKCONFIG=y
+2 −0
Original line number Diff line number Diff line
@@ -122,6 +122,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
	     -I../../include							\
	     $(call get_sys_includes,$(CLANG))					\
	     -Wall -Wno-compare-distinct-pointer-types				\
	     -Wno-microsoft-anon-tag						\
	     -fms-extensions							\
	     -O2 -mcpu=v3

# sort removes libbpf duplicates when not cross-building
Loading