Commit 84b1a0ea authored by Tejun Heo
Browse files

sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs



scx_bpf_dsq_reenq() currently only supports local DSQs. Extend it to support
user-defined DSQs by adding a deferred re-enqueue mechanism similar to the
local DSQ handling.

Add per-cpu deferred_reenq_user_node/flags to scx_dsq_pcpu and
deferred_reenq_users list to scx_rq. When scx_bpf_dsq_reenq() is called on a
user DSQ, the DSQ's per-cpu node is added to the current rq's deferred list.
process_deferred_reenq_users() then iterates the DSQ using the cursor helpers
and re-enqueues each task.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
parent 35250720
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -62,8 +62,14 @@ enum scx_dsq_id_flags {
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};

/*
 * Per-CPU request to re-enqueue the tasks of a user DSQ.
 *
 * schedule_dsq_reenq() links @node onto the requesting rq's
 * scx.deferred_reenq_users list and ORs the requested flags into @flags, both
 * under rq->scx.deferred_reenq_lock. process_deferred_reenq_users() later
 * unlinks @node and consumes the accumulated @flags.
 */
struct scx_deferred_reenq_user {
	struct list_head	node;	/* entry on rq->scx.deferred_reenq_users */
	u64			flags;	/* OR of the reenq flags requested so far */
};

/*
 * Per-CPU portion of a dispatch queue. One instance exists per possible CPU
 * and is reached through per_cpu_ptr(dsq->pcpu, cpu).
 */
struct scx_dsq_pcpu {
	/* back-pointer to the owning DSQ, set by init_dsq() */
	struct scx_dispatch_q	*dsq;
	/* this CPU's pending re-enqueue request for the DSQ */
	struct scx_deferred_reenq_user deferred_reenq_user;
};

/*
+128 −0
Original line number Diff line number Diff line
@@ -1180,6 +1180,18 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
			drl->flags |= reenq_flags;
		}

		schedule_deferred(rq);
	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
		struct rq *rq = this_rq();
		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;

		scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
			if (list_empty(&dru->node))
				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
			dru->flags |= reenq_flags;
		}

		schedule_deferred(rq);
	} else {
		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
@@ -3784,12 +3796,108 @@ static void process_deferred_reenq_locals(struct rq *rq)
	}
}

/*
 * Walk user DSQ @dsq with a cursor and re-enqueue every task that
 * task_should_reenq() selects for @reenq_flags. Each selected task is taken
 * off @dsq and handed to do_enqueue_task() with SCX_ENQ_REENQ.
 *
 * Must be called with @rq locked. While iterating, @rq's lock may be dropped
 * and the locks of other rqs taken; @rq is locked again by the time the
 * function returns.
 */
static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
{
	/* the rq whose lock we currently hold, NULL when none is held */
	struct rq *locked_rq = rq;
	struct scx_sched *sch = dsq->sched;
	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
	struct task_struct *p;
	s32 nr_enqueued = 0;

	lockdep_assert_rq_held(rq);

	raw_spin_lock(&dsq->lock);

	/* keep iterating only while @sch->bypass_depth reads zero */
	while (likely(!READ_ONCE(sch->bypass_depth))) {
		struct rq *task_rq;

		p = nldsq_cursor_next_task(&cursor, dsq);
		if (!p)
			break;

		if (!task_should_reenq(p, reenq_flags))
			continue;

		task_rq = task_rq(p);

		/*
		 * @p belongs to an rq other than the one we hold (or we hold
		 * none). Release the current rq lock and take @p's. If the
		 * trylock fails, drop @dsq->lock before blocking on the rq lock
		 * and retake it afterwards.
		 */
		if (locked_rq != task_rq) {
			if (locked_rq)
				raw_spin_rq_unlock(locked_rq);
			if (unlikely(!raw_spin_rq_trylock(task_rq))) {
				raw_spin_unlock(&dsq->lock);
				raw_spin_rq_lock(task_rq);
				raw_spin_lock(&dsq->lock);
			}
			locked_rq = task_rq;

			/* did we lose @p while switching locks? */
			if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
				continue;
		}

		/* @p is on @dsq, its rq and @dsq are locked */
		dispatch_dequeue_locked(p, dsq);
		/* do_enqueue_task() is called without @dsq->lock held */
		raw_spin_unlock(&dsq->lock);
		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);

		/*
		 * Every SCX_TASK_ITER_BATCH re-enqueues, release the rq lock
		 * and relax. The next iteration takes whichever rq lock it
		 * needs. NOTE(review): presumably bounds rq lock hold time.
		 */
		if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
			raw_spin_rq_unlock(locked_rq);
			locked_rq = NULL;
			cpu_relax();
		}

		raw_spin_lock(&dsq->lock);
	}

	/* unlink the cursor node while @dsq->lock is still held */
	list_del_init(&cursor.node);
	raw_spin_unlock(&dsq->lock);

	/* hand @rq back to the caller locked, as it was on entry */
	if (locked_rq != rq) {
		if (locked_rq)
			raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}

/*
 * Drain @rq's list of user DSQs that asked to be re-enqueued.
 *
 * Entries are popped one at a time under rq->scx.deferred_reenq_lock. The
 * flags accumulated on the entry are taken and cleared together with the
 * unlink, and reenq_user() is then called for the owning DSQ with the lock
 * released. Returns once the list is found empty.
 *
 * Must be called with @rq locked.
 */
static void process_deferred_reenq_users(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	for (;;) {
		struct scx_dispatch_q *target;
		u64 pending;

		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
			struct scx_deferred_reenq_user *req;

			req = list_first_entry_or_null(&rq->scx.deferred_reenq_users,
						       struct scx_deferred_reenq_user,
						       node);
			/* nothing left to process, the guard drops the lock */
			if (!req)
				return;

			/* the request is embedded in the DSQ's per-CPU area */
			target = container_of(req, struct scx_dsq_pcpu,
					      deferred_reenq_user)->dsq;

			/* claim the accumulated flags and reset the request */
			pending = req->flags;
			req->flags = 0;
			list_del_init(&req->node);
		}

		/* only non-builtin DSQs are ever put on this list */
		BUG_ON(target->id & SCX_DSQ_FLAG_BUILTIN);
		reenq_user(rq, target, pending);
	}
}

/*
 * Run the deferred work queued on @rq: first the deferred direct-dispatch
 * locals, then pending local DSQ re-enqueues, then pending user DSQ
 * re-enqueues. Each re-enqueue list is peeked without taking
 * deferred_reenq_lock and only processed when it looks non-empty. The user
 * list is examined only after local processing has finished.
 */
static void run_deferred(struct rq *rq)
{
	struct list_head *reenq_locals = &rq->scx.deferred_reenq_locals;
	struct list_head *reenq_users = &rq->scx.deferred_reenq_users;

	process_ddsp_deferred_locals(rq);

	if (!list_empty(reenq_locals))
		process_deferred_reenq_locals(rq);

	if (!list_empty(reenq_users))
		process_deferred_reenq_users(rq);
}

#ifdef CONFIG_NO_HZ_FULL
@@ -4119,6 +4227,7 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);

		pcpu->dsq = dsq;
		INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
	}

	return 0;
@@ -4126,6 +4235,23 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,

/*
 * Tear down the per-CPU state of @dsq and free it.
 *
 * Every per-CPU re-enqueue request is expected to be unlinked already. One
 * that is still linked triggers a one-time warning and is removed under the
 * owning rq's deferred_reenq_lock before the per-CPU area is freed.
 */
static void exit_dsq(struct scx_dispatch_q *dsq)
{
	s32 cpu;

	for_each_possible_cpu(cpu) {
		struct scx_deferred_reenq_user *req =
			&per_cpu_ptr(dsq->pcpu, cpu)->deferred_reenq_user;

		/*
		 * An RCU grace period must have passed since the last
		 * insertion, so @dsq is expected to be off every deferred
		 * list at this point.
		 */
		if (!WARN_ON_ONCE(!list_empty(&req->node)))
			continue;

		scoped_guard (raw_spinlock_irqsave,
			      &cpu_rq(cpu)->scx.deferred_reenq_lock)
			list_del_init(&req->node);
	}

	free_percpu(dsq->pcpu);
}

@@ -7308,6 +7434,7 @@ void __init init_sched_ext_class(void)
		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
		INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);

@@ -8354,6 +8481,7 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id,
 * supported:
 *
 * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
 * - User DSQs
 *
 * Re-enqueues are performed asynchronously. Can be called from anywhere.
 */
+1 −0
Original line number Diff line number Diff line
@@ -810,6 +810,7 @@ struct scx_rq {

	raw_spinlock_t		deferred_reenq_lock;
	struct list_head	deferred_reenq_locals;	/* scheds requesting reenq of local DSQ */
	struct list_head	deferred_reenq_users;	/* user DSQs requesting reenq */
	struct balance_callback	deferred_bal_cb;
	struct irq_work		deferred_irq_work;
	struct irq_work		kick_cpus_irq_work;
+55 −2
Original line number Diff line number Diff line
@@ -26,8 +26,11 @@

/* Scheduler-wide constants: time units, DSQ IDs and the high-priority weight. */
enum consts {
	/* time units in nanoseconds */
	ONE_MSEC_IN_NS		= 1000 * 1000,
	ONE_SEC_IN_NS		= 1000 * ONE_MSEC_IN_NS,

	/* period of the timer that re-enqueues LOWPRI_DSQ */
	LOWPRI_INTV_NS		= 10 * ONE_MSEC_IN_NS,

	/* DSQ IDs */
	SHARED_DSQ		= 0,
	HIGHPRI_DSQ		= 1,
	LOWPRI_DSQ		= 2,

	/* the weight that nice -20 maps to */
	HIGHPRI_WEIGHT		= 8668,
};

@@ -172,6 +175,9 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
		return prev_cpu;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
@@ -242,6 +248,13 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
		return;
	}

	/* see lowpri_timerfn() */
	if (__COMPAT_has_generic_reenq() &&
	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
@@ -873,6 +886,28 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
	return 0;
}

/* Value type of the lowpri_timer map: wraps the BPF timer run by lowpri_timerfn(). */
struct lowpri_timer {
	struct bpf_timer timer;
};

/*
 * Single-entry array map holding the timer that periodically re-enqueues
 * LOWPRI_DSQ. qmap_init() looks up key 0 and arms the timer from it.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct lowpri_timer);
} lowpri_timer SEC(".maps");

/*
 * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and
 * the tasks are transferred to SHARED_DSQ.
 */
static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
	bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
@@ -894,14 +929,32 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
		return ret;
	}

	ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);
	ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	if (ret)
		return ret;

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	if (__COMPAT_has_generic_reenq()) {
		/* see lowpri_timerfn() */
		timer = bpf_map_lookup_elem(&lowpri_timer, &key);
		if (!timer)
			return -ESRCH;
		bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
		bpf_timer_set_callback(timer, lowpri_timerfn);
		ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
		if (ret)
			return ret;
	}

	return 0;
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)