Commit 5b26f7b9 authored by Tejun Heo's avatar Tejun Heo
Browse files

sched_ext: Allow SCX_DSQ_LOCAL_ON for direct dispatches



In ops.dispatch(), SCX_DSQ_LOCAL_ON can be used to dispatch the task to the
local DSQ of any CPU. However, during direct dispatch from ops.select_cpu()
and ops.enqueue(), this isn't allowed. This is because dispatching to the
local DSQ of a remote CPU requires locking both the task's current and new
rq's and such double locking can't be done directly from ops.enqueue().

While waking up a task, as ops.select_cpu() can pick any CPU and both
ops.select_cpu() and ops.enqueue() can use SCX_DSQ_LOCAL as the dispatch
target to dispatch to the DSQ of the picked CPU, the BPF scheduler can still
do whatever it wants to do. However, while a task is being enqueued for a
different reason, e.g. after its slice expiration, only ops.enqueue() is
called and there's no way for the BPF scheduler to directly dispatch to the
local DSQ of a remote CPU. This gap in API forces schedulers into
work-arounds which are not straightforward or optimal such as skipping
direct dispatches in such cases.

Implement deferred enqueueing to allow directly dispatching to the local DSQ
of a remote CPU from ops.select_cpu() and ops.enqueue(). Such tasks are
temporarily queued on rq->scx.ddsp_deferred_locals. When the rq lock can be
safely released, the tasks are taken off the list and queued on the target
local DSQs using dispatch_to_local_dsq().

v2: - Add missing return after queue_balance_callback() in
      schedule_deferred(). (David).

    - dispatch_to_local_dsq() now assumes that @rq is locked but unpinned
      and thus no longer takes @rf. Updated accordingly.

    - UP build warning fix.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Tested-by: default avatarAndrea Righi <righi.andrea@gmail.com>
Acked-by: default avatarDavid Vernet <void@manifault.com>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
Cc: Changwoo Min <changwoo@igalia.com>
parent f47a8189
Loading
Loading
Loading
Loading
+150 −18
Original line number Diff line number Diff line
@@ -888,6 +888,7 @@ static struct kobject *scx_root_kobj;
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>

static void process_ddsp_deferred_locals(struct rq *rq);
static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
					     s64 exit_code,
@@ -1362,6 +1363,67 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
	return -EPROTO;
}

static void run_deferred(struct rq *rq)
{
	process_ddsp_deferred_locals(rq);
}

#ifdef CONFIG_SMP
static void deferred_bal_cb_workfn(struct rq *rq)
{
	run_deferred(rq);
}
#endif

static void deferred_irq_workfn(struct irq_work *irq_work)
{
	struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);

	raw_spin_rq_lock(rq);
	run_deferred(rq);
	raw_spin_rq_unlock(rq);
}

/**
 * schedule_deferred - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Must be called with @rq
 * locked. Deferred actions are executed with @rq locked but unpinned, and thus
 * can unlock @rq to e.g. migrate tasks to other rqs.
 */
static void schedule_deferred(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SMP
	/*
	 * If in the middle of waking up a task, task_woken_scx() will be called
	 * afterwards which will then run the deferred actions, no need to
	 * schedule anything.
	 */
	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
		return;

	/*
	 * If in balance, the balance callbacks will be called before rq lock is
	 * released. Schedule one.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
		queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
				       deferred_bal_cb_workfn);
		return;
	}
#endif
	/*
	 * No scheduler hooks available. Queue an irq work. They are executed on
	 * IRQ re-enable which may take a bit longer than the scheduler hooks.
	 * The above WAKEUP and BALANCE paths should cover most of the cases and
	 * the time to IRQ re-enable shouldn't be long.
	 */
	irq_work_queue(&rq->scx.deferred_irq_work);
}

/**
 * touch_core_sched - Update timestamp used for core-sched task ordering
 * @rq: rq to read clock from, must be locked
@@ -1577,7 +1639,13 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
	bool is_local = dsq == &rq->scx.local_dsq;

	if (!dsq) {
		WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
		/*
		 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
		 * Unlinking is all that's needed to cancel.
		 */
		if (unlikely(!list_empty(&p->scx.dsq_list.node)))
			list_del_init(&p->scx.dsq_list.node);

		/*
		 * When dispatching directly from the BPF scheduler to a local
		 * DSQ, the task isn't associated with any DSQ but
@@ -1586,6 +1654,7 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
		 */
		if (p->scx.holding_cpu >= 0)
			p->scx.holding_cpu = -1;

		return;
	}

@@ -1673,17 +1742,6 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
		return;
	}

	/*
	 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
	 * dispatching to the local DSQ of a different CPU requires unlocking
	 * the current rq which isn't allowed in the enqueue path. Use
	 * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
	 */
	if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
		scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
		return;
	}

	WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
	WARN_ON_ONCE(p->scx.ddsp_enq_flags);

@@ -1693,13 +1751,58 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,

static void direct_dispatch(struct task_struct *p, u64 enq_flags)
{
	struct rq *rq = task_rq(p);
	struct scx_dispatch_q *dsq;
	u64 dsq_id = p->scx.ddsp_dsq_id;

	touch_core_sched_dispatch(task_rq(p), p);
	touch_core_sched_dispatch(rq, p);

	p->scx.ddsp_enq_flags |= enq_flags;

	/*
	 * We are in the enqueue path with @rq locked and pinned, and thus can't
	 * double lock a remote rq and enqueue to its local DSQ. For
	 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
	 * the enqueue so that it's executed when @rq can be unlocked.
	 */
	if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
		unsigned long opss;

	enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
	dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
	dispatch_enqueue(dsq, p, enq_flags);
		if (cpu == cpu_of(rq)) {
			dsq_id = SCX_DSQ_LOCAL;
			goto dispatch;
		}

		opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;

		switch (opss & SCX_OPSS_STATE_MASK) {
		case SCX_OPSS_NONE:
			break;
		case SCX_OPSS_QUEUEING:
			/*
			 * As @p was never passed to the BPF side, _release is
			 * not strictly necessary. Still do it for consistency.
			 */
			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
			break;
		default:
			WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
				  p->comm, p->pid, opss);
			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
			break;
		}

		WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
		list_add_tail(&p->scx.dsq_list.node,
			      &rq->scx.ddsp_deferred_locals);
		schedule_deferred(rq);
		return;
	}

dispatch:
	dsq = find_dsq_for_dispatch(rq, dsq_id, p);
	dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}

static bool scx_rq_online(struct rq *rq)
@@ -2601,6 +2704,29 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
	}
}

static void process_ddsp_deferred_locals(struct rq *rq)
{
	struct task_struct *p, *tmp;

	lockdep_assert_rq_held(rq);

	/*
	 * Now that @rq can be unlocked, execute the deferred enqueueing of
	 * tasks directly dispatched to the local DSQs of other CPUs. See
	 * direct_dispatch().
	 */
	list_for_each_entry_safe(p, tmp, &rq->scx.ddsp_deferred_locals,
				 scx.dsq_list.node) {
		s32 ret;

		list_del_init(&p->scx.dsq_list.node);

		ret = dispatch_to_local_dsq(rq, p->scx.ddsp_dsq_id, p,
					    p->scx.ddsp_enq_flags);
		WARN_ON_ONCE(ret == DTL_NOT_LOCAL);
	}
}

static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
@@ -3022,6 +3148,11 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
	}
}

static void task_woken_scx(struct rq *rq, struct task_struct *p)
{
	run_deferred(rq);
}

static void set_cpus_allowed_scx(struct task_struct *p,
				 struct affinity_context *ac)
{
@@ -3538,8 +3669,6 @@ bool scx_can_stop_tick(struct rq *rq)
 *
 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
 *   their current sched_class. Call them directly from sched core instead.
 *
 * - task_woken: Unnecessary.
 */
DEFINE_SCHED_CLASS(ext) = {
	.enqueue_task		= enqueue_task_scx,
@@ -3559,6 +3688,7 @@ DEFINE_SCHED_CLASS(ext) = {
#ifdef CONFIG_SMP
	.balance		= balance_scx,
	.select_task_rq		= select_task_rq_scx,
	.task_woken		= task_woken_scx,
	.set_cpus_allowed	= set_cpus_allowed_scx,

	.rq_online		= rq_online_scx,
@@ -5263,11 +5393,13 @@ void __init init_sched_ext_class(void)

		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
		INIT_LIST_HEAD(&rq->scx.runnable_list);
		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);

		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
		init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);

		if (cpu_online(cpu))
+3 −0
Original line number Diff line number Diff line
@@ -746,6 +746,7 @@ enum scx_rq_flags {
struct scx_rq {
	struct scx_dispatch_q	local_dsq;
	struct list_head	runnable_list;		/* runnable tasks on this rq */
	struct list_head	ddsp_deferred_locals;	/* deferred ddsps from enq */
	unsigned long		ops_qseq;
	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
	u32			nr_running;
@@ -757,6 +758,8 @@ struct scx_rq {
	cpumask_var_t		cpus_to_preempt;
	cpumask_var_t		cpus_to_wait;
	unsigned long		pnt_seq;
	struct balance_callback	deferred_bal_cb;
	struct irq_work		deferred_irq_work;
	struct irq_work		kick_cpus_irq_work;
};
#endif /* CONFIG_SCHED_CLASS_EXT */