Commit 22a92020 authored by Tejun Heo's avatar Tejun Heo
Browse files

sched_ext: Implement tickless support



Allow BPF schedulers to indicate tickless operation by setting p->scx.slice
to SCX_SLICE_INF. A CPU whose current task has infinte slice goes into
tickless operation.

scx_central is updated to use tickless operations for all tasks and
instead use a BPF timer to expire slices. This also uses the SCX_ENQ_PREEMPT
and task state tracking added by the previous patches.

Currently, there is no way to pin the timer on the central CPU, so it may
end up on one of the worker CPUs; however, outside of that, the worker CPUs
can go tickless both while running sched_ext tasks and idling.

With schbench running, scx_central shows:

  root@test ~# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts
  LOC:     142024        656        664        449   Local timer interrupts
  LOC:     161663        663        665        449   Local timer interrupts

Without it:

  root@test ~ [SIGINT]# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts
  LOC:     188778       3142       3793       3993   Local timer interrupts
  LOC:     198993       5314       6323       6438   Local timer interrupts

While scx_central itself is too barebone to be useful as a
production scheduler, a more featureful central scheduler can be built using
the same approach. Google's experience shows that such an approach can have
significant benefits for certain applications such as VM hosting.

v4: Allow operation even if BPF_F_TIMER_CPU_PIN is not available.

v3: Pin the central scheduler's timer on the central_cpu using
    BPF_F_TIMER_CPU_PIN.

v2: Convert to BPF inline iterators.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Reviewed-by: default avatarDavid Vernet <dvernet@meta.com>
Acked-by: default avatarJosh Don <joshdon@google.com>
Acked-by: default avatarHao Luo <haoluo@google.com>
Acked-by: default avatarBarret Rhoden <brho@google.com>
parent 1c29f854
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,

	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};

/*
+7 −4
Original line number Diff line number Diff line
@@ -1256,11 +1256,14 @@ bool sched_can_stop_tick(struct rq *rq)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
	 * left. For CFS, if there's more than one we need the tick for
	 * involuntary preemption. For SCX, ask.
	 */
	if (rq->nr_running > 1)
	if (!scx_switched_all() && rq->nr_running > 1)
		return false;

	if (scx_enabled() && !scx_can_stop_tick(rq))
		return false;

	/*
+50 −2
Original line number Diff line number Diff line
@@ -1086,6 +1086,7 @@ static void update_curr_scx(struct rq *rq)
	account_group_exec_runtime(curr, delta_exec);
	cgroup_account_cputime(curr, delta_exec);

	if (curr->scx.slice != SCX_SLICE_INF)
		curr->scx.slice -= min(curr->scx.slice, delta_exec);
}

@@ -2093,6 +2094,28 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
		SCX_CALL_OP(SCX_KF_REST, running, p);

	clr_task_runnable(p, true);

	/*
	 * @p is getting newly scheduled or got kicked after someone updated its
	 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
	 */
	if ((p->scx.slice == SCX_SLICE_INF) !=
	    (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
		if (p->scx.slice == SCX_SLICE_INF)
			rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
		else
			rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;

		sched_update_tick_dependency(rq);

		/*
		 * For now, let's refresh the load_avgs just when transitioning
		 * in and out of nohz. In the future, we might want to add a
		 * mechanism which calls the following periodically on
		 * tick-stopped CPUs.
		 */
		update_other_load_avgs(rq);
	}
}

static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
@@ -2818,6 +2841,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
	return 0;
}

#ifdef CONFIG_NO_HZ_FULL
bool scx_can_stop_tick(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	if (scx_ops_bypassing())
		return false;

	if (p->sched_class != &ext_sched_class)
		return true;

	/*
	 * @rq can dispatch from different DSQs, so we can't tell whether it
	 * needs the tick or not by looking at nr_running. Allow stopping ticks
	 * iff the BPF scheduler indicated so. See set_next_task_scx().
	 */
	return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
}
#endif

/*
 * Omitted operations:
 *
@@ -3120,6 +3163,9 @@ static void scx_ops_bypass(bool bypass)
		}

		rq_unlock_irqrestore(rq, &rf);

		/* kick to restore ticks */
		resched_cpu(cpu);
	}
}

@@ -4576,7 +4622,9 @@ __bpf_kfunc_start_defs();
 * BPF locks (in the future when BPF introduces more flexible locking).
 *
 * @p is allowed to run for @slice. The scheduling path is triggered on slice
 * exhaustion. If zero, the current residual slice is maintained.
 * exhaustion. If zero, the current residual slice is maintained. If
 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
 * scx_bpf_kick_cpu() to trigger scheduling.
 */
__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
				  u64 enq_flags)
+2 −0
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@ void scx_pre_fork(struct task_struct *p);
int scx_fork(struct task_struct *p);
void scx_post_fork(struct task_struct *p);
void scx_cancel_fork(struct task_struct *p);
bool scx_can_stop_tick(struct rq *rq);
int scx_check_setscheduler(struct task_struct *p, int policy);
bool task_should_scx(struct task_struct *p);
void init_sched_ext_class(void);
@@ -73,6 +74,7 @@ static inline void scx_pre_fork(struct task_struct *p) {}
static inline int scx_fork(struct task_struct *p) { return 0; }
static inline void scx_post_fork(struct task_struct *p) {}
static inline void scx_cancel_fork(struct task_struct *p) {}
static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
static inline bool task_on_scx(const struct task_struct *p) { return false; }
static inline void init_sched_ext_class(void) {}
+1 −0
Original line number Diff line number Diff line
@@ -727,6 +727,7 @@ struct cfs_rq {
/* scx_rq->flags, protected by the rq lock */
enum scx_rq_flags {
	SCX_RQ_BALANCING	= 1 << 1,
	SCX_RQ_CAN_STOP_TICK	= 1 << 2,
};

struct scx_rq {
Loading