Merge branch 'for-7.0-fixes' into for-7.1 (94555ca6) · Commits · git / linux-net

kernel/sched/ext.c

+70 −25

Original line number	Diff line number	Diff line
		@@ -3018,7 +3018,7 @@ static void put_prev_task_scx(struct rq rq, struct task_struct p,
		{
		struct scx_sched *sch = scx_task_sched(p);

		/* see kick_cpus_irq_workfn() */
		/* see kick_sync_wait_bal_cb() */
		smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);

		update_curr_scx(rq);
		@@ -3067,6 +3067,48 @@ static void put_prev_task_scx(struct rq rq, struct task_struct p,
		switch_class(rq, next);
		}

		/*
		 * kick_sync_wait_bal_cb - balance callback performing deferred kick_sync waits
		 * @rq: runqueue whose scx.cpus_to_sync mask lists the CPUs to wait for
		 *
		 * For every CPU in @rq->scx.cpus_to_sync, wait until that CPU's
		 * scx.kick_sync counter differs from the value stored in this CPU's
		 * scx_kick_syncs->syncs[] array, then drop the CPU from the mask.
		 *
		 * The function unlocks and re-locks @rq (with the _irq variants) around each
		 * busy-wait, so it expects to be entered with @rq's lock held and returns
		 * with it held again.
		 */
		static void kick_sync_wait_bal_cb(struct rq *rq)
		{
			struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
			unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
			bool waited;
			s32 cpu;

			/*
			 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
			 * — a target CPU may be waiting for us to process an IPI (e.g. TLB
			 * flush) while we wait for its kick_sync to advance.
			 *
			 * Also, keep advancing our own kick_sync so that new kick_sync waits
			 * targeting us, which can start after we drop the lock, cannot form
			 * cyclic dependencies.
			 */
		retry:
			waited = false;
			for_each_cpu(cpu, rq->scx.cpus_to_sync) {
				/*
				 * Nothing to wait for on ourselves or on a CPU whose
				 * kick_sync has already moved past the recorded value.
				 *
				 * smp_load_acquire() pairs with smp_store_release() on
				 * kick_sync updates on the target CPUs.
				 */
				if (cpu == cpu_of(rq) ||
				    smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
					cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
					continue;
				}

				raw_spin_rq_unlock_irq(rq);
				while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
					/* bump our own counter on every spin, see above */
					smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
					cpu_relax();
				}
				raw_spin_rq_lock_irq(rq);

				/*
				 * @cpu is intentionally left in the mask here; the rescan
				 * below clears it through the acquire-load check above.
				 */
				waited = true;
			}

			/*
			 * The lock was dropped at least once during this pass, so scan the
			 * mask again until a full pass completes without waiting.
			 */
			if (waited)
				goto retry;
		}

		static struct task_struct first_local_task(struct rq rq)
		{
		return list_first_entry_or_null(&rq->scx.local_dsq.list,
		@@ -3080,7 +3122,7 @@ do_pick_task_scx(struct rq rq, struct rq_flags rf, bool force_scx)
		bool keep_prev;
		struct task_struct *p;

		/* see kick_cpus_irq_workfn() */
		/* see kick_sync_wait_bal_cb() */
		smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);

		rq_modified_begin(rq, &ext_sched_class);
		@@ -3090,6 +3132,17 @@ do_pick_task_scx(struct rq rq, struct rq_flags rf, bool force_scx)
		rq_repin_lock(rq, rf);
		maybe_queue_balance_callback(rq);

		/*
		* Defer to a balance callback which can drop rq lock and enable
		* IRQs. Waiting directly in the pick path would deadlock against
		* CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
		*/
		if (unlikely(rq->scx.kick_sync_pending)) {
		rq->scx.kick_sync_pending = false;
		queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
		kick_sync_wait_bal_cb);
		}

		/*
		* If any higher-priority sched class enqueued a runnable task on
		* this rq during balance_one(), abort and return RETRY_TASK, so
		@@ -6219,6 +6272,9 @@ static void scx_dump_state(struct scx_sched sch, struct scx_exit_info ei,
		if (!cpumask_empty(rq->scx.cpus_to_wait))
		dump_line(&ns, " cpus_to_wait : %*pb",
		cpumask_pr_args(rq->scx.cpus_to_wait));
		if (!cpumask_empty(rq->scx.cpus_to_sync))
		dump_line(&ns, " cpus_to_sync : %*pb",
		cpumask_pr_args(rq->scx.cpus_to_sync));

		used = seq_buf_used(&ns);
		if (SCX_HAS_OP(sch, dump_cpu)) {
		@@ -7583,11 +7639,11 @@ static bool kick_one_cpu(s32 cpu, struct rq this_rq, unsigned long ksyncs)

		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
		if (cur_class == &ext_sched_class) {
		cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
		ksyncs[cpu] = rq->scx.kick_sync;
		should_wait = true;
		} else {
		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
		}
		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
		}

		resched_curr(rq);
		@@ -7642,27 +7698,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
		}

		if (!should_wait)
		return;

		for_each_cpu(cpu, this_scx->cpus_to_wait) {
		unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;

		/*
		* Busy-wait until the task running at the time of kicking is no
		* longer running. This can be used to implement e.g. core
		* scheduling.
		*
		* smp_cond_load_acquire() pairs with store_releases in
		* pick_task_scx() and put_prev_task_scx(). The former breaks
		* the wait if SCX's scheduling path is entered even if the same
		* task is picked subsequently. The latter is necessary to break
		* the wait when $cpu is taken by a higher sched class.
		* Can't wait in hardirq — kick_sync can't advance, deadlocking if
		* CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
		*/
		if (cpu != cpu_of(this_rq))
		smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);

		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
		if (should_wait) {
		raw_spin_rq_lock(this_rq);
		this_scx->kick_sync_pending = true;
		resched_curr(this_rq);
		raw_spin_rq_unlock(this_rq);
		}
		}

		@@ -7780,6 +7824,7 @@ void __init init_sched_ext_class(void)
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
		INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);

kernel/sched/ext_idle.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -549,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
		* piled up on it even if there is an idle core elsewhere on
		* the system.
		*/
		waker_node = cpu_to_node(cpu);
		waker_node = scx_cpu_node_if_enabled(cpu);
		if (!(current->flags & PF_EXITING) &&
		cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
		(!(flags & SCX_PICK_IDLE_IN_NODE) \|\| (waker_node == node)) &&

kernel/sched/sched.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -806,6 +806,8 @@ struct scx_rq {
		cpumask_var_t cpus_to_kick_if_idle;
		cpumask_var_t cpus_to_preempt;
		cpumask_var_t cpus_to_wait;
		cpumask_var_t cpus_to_sync;
		bool kick_sync_pending;
		unsigned long kick_sync;

		struct task_struct *sub_dispatch_prev;
		@@ -815,6 +817,7 @@ struct scx_rq {
		struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */
		struct list_head deferred_reenq_users; /* user DSQs requesting reenq */
		struct balance_callback deferred_bal_cb;
		struct balance_callback kick_sync_bal_cb;
		struct irq_work deferred_irq_work;
		struct irq_work kick_cpus_irq_work;
		};

tools/testing/selftests/sched_ext/Makefile

+1 −0

Original line number	Diff line number	Diff line
		@@ -189,6 +189,7 @@ auto-test-targets := \
		rt_stall \
		test_example \
		total_bw \
		cyclic_kick_wait \

		testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))

tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c

0 → 100644

+68 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0 */
		/*
		* Stress concurrent SCX_KICK_WAIT calls to reproduce wait-cycle deadlock.
		*
		* Three CPUs are designated from userspace. Every enqueue from one of the
		* three CPUs kicks the next CPU in the ring with SCX_KICK_WAIT, creating a
		* persistent A -> B -> C -> A wait cycle pressure.
		*/
		#include <scx/common.bpf.h>

		/* License string emitted into the "license" section. */
		char _license[] SEC("license") = "GPL";

		/*
		 * The three CPUs forming the kick ring (a -> b -> c -> a, see target_cpu()).
		 * NOTE(review): per the file header these are designated from userspace;
		 * presumably written by the loader before the program is loaded — confirm
		 * against the userspace half of the test.
		 */
		const volatile s32 test_cpu_a;
		const volatile s32 test_cpu_b;
		const volatile s32 test_cpu_c;

		/* Incremented once per enqueue callback invocation. */
		u64 nr_enqueues;
		/* Incremented once per SCX_KICK_WAIT kick issued by the enqueue callback. */
		u64 nr_wait_kicks;

		/* Defines the `uei` exit-info record that the exit callback fills in. */
		UEI_DEFINE(uei);

		/*
		 * Map @cpu to its successor in the test_cpu_a -> test_cpu_b -> test_cpu_c ->
		 * test_cpu_a ring. Returns -1 when @cpu is not one of the three test CPUs.
		 * The comparisons are made in a, b, c order and the first match wins.
		 */
		static s32 target_cpu(s32 cpu)
		{
			s32 next = -1;

			if (cpu == test_cpu_a)
				next = test_cpu_b;
			else if (cpu == test_cpu_b)
				next = test_cpu_c;
			else if (cpu == test_cpu_c)
				next = test_cpu_a;

			return next;
		}

		/*
		 * Enqueue callback of the cyclic kick-wait stress scheduler.
		 *
		 * @p:         task being enqueued
		 * @enq_flags: enqueue flags, forwarded to scx_bpf_dsq_insert()
		 *
		 * Every call bumps nr_enqueues. Kernel threads (PF_KTHREAD) are inserted
		 * into SCX_DSQ_LOCAL with SCX_SLICE_INF and SCX_ENQ_PREEMPT added to the
		 * flags, and never issue a kick. All other tasks go to SCX_DSQ_GLOBAL with
		 * the default slice; if the current CPU is one of the three test CPUs, the
		 * next CPU in the ring is then kicked with SCX_KICK_WAIT and nr_wait_kicks
		 * is bumped.
		 */
		void BPF_STRUCT_OPS(cyclic_kick_wait_enqueue, struct task_struct *p,
				    u64 enq_flags)
		{
			s32 this_cpu = bpf_get_smp_processor_id();
			s32 tgt;

			__sync_fetch_and_add(&nr_enqueues, 1);

			if (p->flags & PF_KTHREAD) {
				scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
						   enq_flags | SCX_ENQ_PREEMPT);
				return;
			}

			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);

			/*
			 * Skip the kick when this CPU is outside the ring (target_cpu()
			 * returned -1) or when the ring maps this CPU onto itself.
			 */
			tgt = target_cpu(this_cpu);
			if (tgt < 0 || tgt == this_cpu)
				return;

			__sync_fetch_and_add(&nr_wait_kicks, 1);
			scx_bpf_kick_cpu(tgt, SCX_KICK_WAIT);
		}

		/*
		 * Exit callback: hand the exit info @ei to UEI_RECORD() together with `uei`.
		 * NOTE(review): presumably this stores the exit reason so the userspace side
		 * of the test can inspect it — confirm against the UEI_RECORD definition.
		 */
		void BPF_STRUCT_OPS(cyclic_kick_wait_exit, struct scx_exit_info *ei)
		{
		UEI_RECORD(uei, ei);
		}

		/*
		 * Scheduler ops table for the test, placed in the ".struct_ops.link" section.
		 * Only the enqueue and exit callbacks are provided.
		 * NOTE(review): timeout_ms = 1000 presumably configures the stall watchdog so
		 * that a wait-cycle deadlock is reported within about a second — confirm.
		 */
		SEC(".struct_ops.link")
		struct sched_ext_ops cyclic_kick_wait_ops = {
		.enqueue = cyclic_kick_wait_enqueue,
		.exit = cyclic_kick_wait_exit,
		.name = "cyclic_kick_wait",
		.timeout_ms = 1000U,
		};