Commit 8c1b9453 authored by Tejun Heo's avatar Tejun Heo
Browse files

sched_ext: Convert deferred_reenq_locals from llist to regular list



The deferred reenqueue local mechanism uses an llist (lockless list) for
collecting schedulers that need their local DSQs re-enqueued. Convert to a
regular list protected by a raw_spinlock.

The llist was used for its lockless properties, but the upcoming changes to
support remote reenqueue require more complex list operations that are
difficult to implement correctly with lockless data structures. A spinlock-
protected regular list provides the necessary flexibility.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Reviewed-by: default avatarAndrea Righi <arighi@nvidia.com>
parent ea4593e9
Loading
Loading
Loading
Loading
+38 −19
Original line number Diff line number Diff line
@@ -3639,23 +3639,37 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
	return nr_enqueued;
}

static void run_deferred(struct rq *rq)
static void process_deferred_reenq_locals(struct rq *rq)
{
	process_ddsp_deferred_locals(rq);
	lockdep_assert_rq_held(rq);

	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
		struct llist_node *llist =
			llist_del_all(&rq->scx.deferred_reenq_locals);
		struct scx_sched_pcpu *pos, *next;
	while (true) {
		struct scx_sched *sch;

		llist_for_each_entry_safe(pos, next, llist,
					  deferred_reenq_locals_node) {
			init_llist_node(&pos->deferred_reenq_locals_node);
			reenq_local(pos->sch, rq);
		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
			struct scx_sched_pcpu *sch_pcpu =
				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
							 struct scx_sched_pcpu,
							 deferred_reenq_local_node);
			if (!sch_pcpu)
				return;

			sch = sch_pcpu->sch;
			list_del_init(&sch_pcpu->deferred_reenq_local_node);
		}

		reenq_local(sch, rq);
	}
}

static void run_deferred(struct rq *rq)
{
	process_ddsp_deferred_locals(rq);

	if (!list_empty(&rq->scx.deferred_reenq_locals))
		process_deferred_reenq_locals(rq);
}

#ifdef CONFIG_NO_HZ_FULL
bool scx_can_stop_tick(struct rq *rq)
{
@@ -4179,13 +4193,13 @@ static void scx_sched_free_rcu_work(struct work_struct *work)

	/*
	 * $sch would have entered bypass mode before the RCU grace period. As
	 * that blocks new deferrals, all deferred_reenq_locals_node's must be
	 * that blocks new deferrals, all deferred_reenq_local_node's must be
	 * off-list by now.
	 */
	for_each_possible_cpu(cpu) {
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local_node));
	}

	free_percpu(sch->pcpu);
@@ -5798,7 +5812,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);

		pcpu->sch = sch;
		init_llist_node(&pcpu->deferred_reenq_locals_node);
		INIT_LIST_HEAD(&pcpu->deferred_reenq_local_node);
	}

	sch->helper = kthread_run_worker(0, "sched_ext_helper");
@@ -7125,7 +7139,8 @@ void __init init_sched_ext_class(void)
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
		init_llist_head(&rq->scx.deferred_reenq_locals);
		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);

@@ -8357,7 +8372,6 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
	unsigned long flags;
	struct scx_sched *sch;
	struct rq *rq;
	struct llist_node *lnode;

	raw_local_irq_save(flags);

@@ -8373,9 +8387,14 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
		goto out_irq_restore;

	rq = this_rq();
	lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
	if (!llist_on_list(lnode))
		llist_add(lnode, &rq->scx.deferred_reenq_locals);
	scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
		struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);

		if (list_empty(&pcpu->deferred_reenq_local_node))
			list_move_tail(&pcpu->deferred_reenq_local_node,
				       &rq->scx.deferred_reenq_locals);
	}

	schedule_deferred(rq);
out_irq_restore:
	raw_local_irq_restore(flags);
+1 −1
Original line number Diff line number Diff line
@@ -965,7 +965,7 @@ struct scx_sched_pcpu {
	 */
	struct scx_event_stats	event_stats;

	struct llist_node	deferred_reenq_locals_node;
	struct list_head	deferred_reenq_local_node;
	struct scx_dispatch_q	bypass_dsq;
#ifdef CONFIG_EXT_SUB_SCHED
	u32			bypass_host_seq;
+2 −1
Original line number Diff line number Diff line
@@ -808,7 +808,8 @@ struct scx_rq {

	struct task_struct	*sub_dispatch_prev;

	struct llist_head	deferred_reenq_locals;
	raw_spinlock_t		deferred_reenq_lock;
	struct list_head	deferred_reenq_locals;	/* scheds requesting reenq of local DSQ */
	struct balance_callback	deferred_bal_cb;
	struct irq_work		deferred_irq_work;
	struct irq_work		kick_cpus_irq_work;