Commit 9147566d authored by Linus Torvalds
Browse files

Merge tag 'sched_ext-for-7.0-rc6-fixes' of...

Merge tag 'sched_ext-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Fix SCX_KICK_WAIT deadlock where multiple CPUs waiting for each other
   in hardirq context form a cycle. Move the wait to a balance callback
   which can drop the rq lock and process IPIs.

 - Fix inconsistent NUMA node lookup in scx_select_cpu_dfl() where
   the waker_node used cpu_to_node() while prev_cpu used
   scx_cpu_node_if_enabled(), leading to undefined behavior when
   per-node idle tracking is disabled.

* tag 'sched_ext-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  selftests/sched_ext: Add cyclic SCX_KICK_WAIT stress test
  sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback
  sched_ext: Fix inconsistent NUMA node lookup in scx_select_cpu_dfl()
parents 0958d657 090d34f0
Loading
Loading
Loading
Loading
+70 −25
Original line number Diff line number Diff line
@@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
{
	struct scx_sched *sch = scx_root;

	/* see kick_cpus_irq_workfn() */
	/* see kick_sync_wait_bal_cb() */
	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);

	update_curr_scx(rq);
@@ -2447,6 +2447,48 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
		switch_class(rq, next);
}

/*
 * Balance callback that performs the SCX_KICK_WAIT wait on behalf of @rq.
 *
 * Queued from do_pick_task_scx() when rq->scx.kick_sync_pending is set. For
 * every CPU in rq->scx.cpus_to_sync, wait until that CPU's kick_sync counter
 * differs from the value recorded in this CPU's ksyncs[] array (snapshotted by
 * kick_one_cpu() when the kick was issued).
 *
 * The function unlocks and relocks @rq's lock around each wait, so it expects
 * to be entered with the rq lock held and IRQs disabled, and returns in the
 * same state.
 */
static void kick_sync_wait_bal_cb(struct rq *rq)
{
	/* this CPU's array of per-target kick_sync snapshots */
	struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
	unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
	bool waited;
	s32 cpu;

	/*
	 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
	 * — a target CPU may be waiting for us to process an IPI (e.g. TLB
	 * flush) while we wait for its kick_sync to advance.
	 *
	 * Also, keep advancing our own kick_sync so that new kick_sync waits
	 * targeting us, which can start after we drop the lock, cannot form
	 * cyclic dependencies.
	 */
retry:
	waited = false;
	for_each_cpu(cpu, rq->scx.cpus_to_sync) {
		/*
		 * smp_load_acquire() pairs with smp_store_release() on
		 * kick_sync updates on the target CPUs.
		 *
		 * Nothing to wait for if the target is ourselves or its
		 * kick_sync has already moved past the snapshot; this is the
		 * only place a CPU is removed from cpus_to_sync.
		 */
		if (cpu == cpu_of(rq) ||
		    smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
			cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
			continue;
		}

		raw_spin_rq_unlock_irq(rq);
		/* spin until the target's counter changes, bumping our own each turn */
		while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
			smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
			cpu_relax();
		}
		raw_spin_rq_lock_irq(rq);
		/*
		 * @cpu is intentionally left in cpus_to_sync here; the retry
		 * pass re-checks it with smp_load_acquire() and clears it.
		 */
		waited = true;
	}

	/*
	 * The lock was dropped during this pass, so scan again. The rescan
	 * clears the CPUs just waited on.
	 * NOTE(review): presumably it also picks up CPUs added to cpus_to_sync
	 * while IRQs were enabled — confirm.
	 */
	if (waited)
		goto retry;
}

static struct task_struct *first_local_task(struct rq *rq)
{
	return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
	bool keep_prev;
	struct task_struct *p;

	/* see kick_cpus_irq_workfn() */
	/* see kick_sync_wait_bal_cb() */
	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);

	rq_modified_begin(rq, &ext_sched_class);
@@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
	rq_repin_lock(rq, rf);
	maybe_queue_balance_callback(rq);

	/*
	 * Defer to a balance callback which can drop rq lock and enable
	 * IRQs. Waiting directly in the pick path would deadlock against
	 * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
	 */
	if (unlikely(rq->scx.kick_sync_pending)) {
		rq->scx.kick_sync_pending = false;
		queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
				       kick_sync_wait_bal_cb);
	}

	/*
	 * If any higher-priority sched class enqueued a runnable task on
	 * this rq during balance_one(), abort and return RETRY_TASK, so
@@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
		if (!cpumask_empty(rq->scx.cpus_to_wait))
			dump_line(&ns, "  cpus_to_wait   : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_wait));
		if (!cpumask_empty(rq->scx.cpus_to_sync))
			dump_line(&ns, "  cpus_to_sync   : %*pb",
				  cpumask_pr_args(rq->scx.cpus_to_sync));

		used = seq_buf_used(&ns);
		if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)

		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
			if (cur_class == &ext_sched_class) {
				cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
				ksyncs[cpu] = rq->scx.kick_sync;
				should_wait = true;
			} else {
				cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
			}
			cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
		}

		resched_curr(rq);
@@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
	}

	if (!should_wait)
		return;

	for_each_cpu(cpu, this_scx->cpus_to_wait) {
		unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;

	/*
		 * Busy-wait until the task running at the time of kicking is no
		 * longer running. This can be used to implement e.g. core
		 * scheduling.
		 *
		 * smp_cond_load_acquire() pairs with store_releases in
		 * pick_task_scx() and put_prev_task_scx(). The former breaks
		 * the wait if SCX's scheduling path is entered even if the same
		 * task is picked subsequently. The latter is necessary to break
		 * the wait when $cpu is taken by a higher sched class.
	 * Can't wait in hardirq — kick_sync can't advance, deadlocking if
	 * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
	 */
		if (cpu != cpu_of(this_rq))
			smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);

		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
	if (should_wait) {
		raw_spin_rq_lock(this_rq);
		this_scx->kick_sync_pending = true;
		resched_curr(this_rq);
		raw_spin_rq_unlock(this_rq);
	}
}

@@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);

+1 −1
Original line number Diff line number Diff line
@@ -543,7 +543,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
		 * piled up on it even if there is an idle core elsewhere on
		 * the system.
		 */
		waker_node = cpu_to_node(cpu);
		waker_node = scx_cpu_node_if_enabled(cpu);
		if (!(current->flags & PF_EXITING) &&
		    cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
		    (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
+3 −0
Original line number Diff line number Diff line
@@ -805,9 +805,12 @@ struct scx_rq {
	cpumask_var_t		cpus_to_kick_if_idle;
	cpumask_var_t		cpus_to_preempt;
	cpumask_var_t		cpus_to_wait;
	cpumask_var_t		cpus_to_sync;
	bool			kick_sync_pending;
	unsigned long		kick_sync;
	local_t			reenq_local_deferred;
	struct balance_callback	deferred_bal_cb;
	struct balance_callback	kick_sync_bal_cb;
	struct irq_work		deferred_irq_work;
	struct irq_work		kick_cpus_irq_work;
	struct scx_dispatch_q	bypass_dsq;
+1 −0
Original line number Diff line number Diff line
@@ -188,6 +188,7 @@ auto-test-targets := \
	rt_stall			\
	test_example			\
	total_bw			\
	cyclic_kick_wait		\

testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))

+68 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Stress concurrent SCX_KICK_WAIT calls to reproduce wait-cycle deadlock.
 *
 * Three CPUs are designated from userspace. Every enqueue from one of the
 * three CPUs kicks the next CPU in the ring with SCX_KICK_WAIT, creating
 * persistent A -> B -> C -> A wait-cycle pressure.
 */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

/*
 * The three CPUs forming the kick ring (a -> b -> c -> a, see target_cpu()).
 * Per the file header these are designated from userspace.
 */
const volatile s32 test_cpu_a;
const volatile s32 test_cpu_b;
const volatile s32 test_cpu_c;

/* Incremented once per call to the enqueue op. */
u64 nr_enqueues;
/* Incremented once for every SCX_KICK_WAIT kick issued by the enqueue op. */
u64 nr_wait_kicks;

/* Declares the `uei` object that the exit op fills via UEI_RECORD(). */
UEI_DEFINE(uei);

/*
 * Map @cpu to its successor in the test_cpu_a -> test_cpu_b -> test_cpu_c ->
 * test_cpu_a ring. Returns -1 when @cpu is not one of the three test CPUs.
 * When several test CPUs share the same id, the first match in a, b, c order
 * wins.
 */
static s32 target_cpu(s32 cpu)
{
	s32 next = -1;

	if (cpu == test_cpu_a)
		next = test_cpu_b;
	else if (cpu == test_cpu_b)
		next = test_cpu_c;
	else if (cpu == test_cpu_c)
		next = test_cpu_a;

	return next;
}

/*
 * Enqueue op: dispatch @p and, when running on one of the three test CPUs,
 * kick the next CPU in the ring with SCX_KICK_WAIT.
 *
 * Kernel threads are inserted into the local DSQ with an infinite slice and
 * SCX_ENQ_PREEMPT and never trigger a kick. All other tasks go to the global
 * DSQ with the default slice.
 */
void BPF_STRUCT_OPS(cyclic_kick_wait_enqueue, struct task_struct *p,
		    u64 enq_flags)
{
	s32 cur_cpu = bpf_get_smp_processor_id();
	s32 next_cpu;

	/* count every enqueue, whichever path it takes below */
	__sync_fetch_and_add(&nr_enqueues, 1);

	if (p->flags & PF_KTHREAD) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
				   enq_flags | SCX_ENQ_PREEMPT);
		return;
	}

	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);

	/* only kick when we are in the ring and the successor is another CPU */
	next_cpu = target_cpu(cur_cpu);
	if (next_cpu >= 0 && next_cpu != cur_cpu) {
		__sync_fetch_and_add(&nr_wait_kicks, 1);
		scx_bpf_kick_cpu(next_cpu, SCX_KICK_WAIT);
	}
}

/*
 * Exit op: hand the exit info @ei to UEI_RECORD() for the `uei` object
 * declared with UEI_DEFINE(uei).
 * NOTE(review): presumably this is how userspace learns why the scheduler
 * exited — confirm against the UEI_RECORD definition.
 */
void BPF_STRUCT_OPS(cyclic_kick_wait_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

/*
 * struct_ops table for the "cyclic_kick_wait" test scheduler. Only the
 * enqueue and exit callbacks are provided.
 */
SEC(".struct_ops.link")
struct sched_ext_ops cyclic_kick_wait_ops = {
	.enqueue		= cyclic_kick_wait_enqueue,
	.exit			= cyclic_kick_wait_exit,
	.name			= "cyclic_kick_wait",
	/*
	 * NOTE(review): presumably the sched_ext stall timeout in
	 * milliseconds, kept short so a wait-cycle deadlock is reported
	 * quickly — confirm against struct sched_ext_ops.
	 */
	.timeout_ms		= 1000U,
};
Loading