Commit e9139f76 authored by Peter Zijlstra
Browse files

sched: Employ sched_change guards



As proposed a long while ago -- and half done by scx -- wrap the
scheduler's 'change' pattern in a guard helper.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
parent 82d6e01a
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -340,6 +340,11 @@ _label: \
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond)	\
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond

/*
 * DEFINE_CLASS_IS_UNCONDITIONAL(_name): mark class @_name as unconditional.
 *
 * Expands to two definitions:
 *  - class_<_name>_is_conditional, a const bool set to false (via
 *    __DEFINE_CLASS_IS_CONDITIONAL());
 *  - class_<_name>_lock_ptr(), which ignores its argument and always
 *    returns the non-NULL constant (void *)1.
 *
 * NOTE(review): presumably the constant non-NULL return lets code that
 * tests the lock pointer treat an unconditional class as always acquired;
 * confirm against the users of class_*_lock_ptr().
 */
#define DEFINE_CLASS_IS_UNCONDITIONAL(_name)		\
	__DEFINE_CLASS_IS_CONDITIONAL(_name, false);	\
	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
	{ return (void *)1; }

#define __GUARD_IS_ERR(_ptr)                                       \
	({                                                         \
		unsigned long _rc = (__force unsigned long)(_ptr); \
+61 −98
Original line number Diff line number Diff line
@@ -7326,7 +7326,7 @@ void rt_mutex_post_schedule(void)
 */
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
	int prio, oldprio, queued, running, queue_flag =
	int prio, oldprio, queue_flag =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	const struct sched_class *prev_class, *next_class;
	struct rq_flags rf;
@@ -7391,13 +7391,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
	if (prev_class != next_class && p->se.sched_delayed)
		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);

	queued = task_on_rq_queued(p);
	running = task_current_donor(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flag);
	if (running)
		put_prev_task(rq, p);

	scoped_guard (sched_change, p, queue_flag) {
		/*
		 * Boosting condition are:
		 * 1. -rt task is running and holds mutex A
@@ -7412,7 +7406,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
			    (pi_task && dl_prio(pi_task->prio) &&
			     dl_entity_preempt(&pi_task->dl, &p->dl))) {
				p->dl.pi_se = pi_task->dl.pi_se;
			queue_flag |= ENQUEUE_REPLENISH;
				scope->flags |= ENQUEUE_REPLENISH;
			} else {
				p->dl.pi_se = &p->dl;
			}
@@ -7420,7 +7414,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
			if (dl_prio(oldprio))
				p->dl.pi_se = &p->dl;
			if (oldprio < prio)
			queue_flag |= ENQUEUE_HEAD;
				scope->flags |= ENQUEUE_HEAD;
		} else {
			if (dl_prio(oldprio))
				p->dl.pi_se = &p->dl;
@@ -7432,11 +7426,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
		p->prio = prio;

		check_class_changing(rq, p, prev_class);

	if (queued)
		enqueue_task(rq, p, queue_flag);
	if (running)
		set_next_task(rq, p);
	}

	check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -8084,26 +8074,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 */
void sched_setnuma(struct task_struct *p, int nid)
{
	bool queued, running;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);
	queued = task_on_rq_queued(p);
	running = task_current_donor(rq, p);

	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE);
	if (running)
		put_prev_task(rq, p);

	guard(task_rq_lock)(p);
	scoped_guard (sched_change, p, DEQUEUE_SAVE)
		p->numa_preferred_nid = nid;

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_next_task(rq, p);
	task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */

@@ -9205,8 +9178,9 @@ static void sched_change_group(struct task_struct *tsk)
 */
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
	int queued, running, queue_flags =
	unsigned int queue_flags =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	bool resched = false;
	struct rq *rq;

	CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9214,30 +9188,17 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)

	update_rq_clock(rq);

	running = task_current_donor(rq, tsk);
	queued = task_on_rq_queued(tsk);

	if (queued)
		dequeue_task(rq, tsk, queue_flags);
	if (running)
		put_prev_task(rq, tsk);

	scoped_guard (sched_change, tsk, queue_flags) {
		sched_change_group(tsk);
		if (!for_autogroup)
			scx_cgroup_move_task(tsk);
		if (scope->running)
			resched = true;
	}

	if (queued)
		enqueue_task(rq, tsk, queue_flags);
	if (running) {
		set_next_task(rq, tsk);
		/*
		 * After changing group, the running task may have joined a
		 * throttled one but it's still the running task. Trigger a
		 * resched to make sure that task can still run.
		 */
	if (resched)
		resched_curr(rq);
}
}

static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
@@ -10892,37 +10853,39 @@ void sched_mm_cid_fork(struct task_struct *t)
}
#endif /* CONFIG_SCHED_MM_CID */

#ifdef CONFIG_SCHED_CLASS_EXT
void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
			    struct sched_enq_and_set_ctx *ctx)
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);

struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
	struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
	struct rq *rq = task_rq(p);

	lockdep_assert_rq_held(rq);

	*ctx = (struct sched_enq_and_set_ctx){
	*ctx = (struct sched_change_ctx){
		.p = p,
		.queue_flags = queue_flags,
		.flags = flags,
		.queued = task_on_rq_queued(p),
		.running = task_current(rq, p),
		.running = task_current_donor(rq, p),
	};

	update_rq_clock(rq);
	if (ctx->queued)
		dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
		dequeue_task(rq, p, flags);
	if (ctx->running)
		put_prev_task(rq, p);

	return ctx;
}

void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
void sched_change_end(struct sched_change_ctx *ctx)
{
	struct rq *rq = task_rq(ctx->p);
	struct task_struct *p = ctx->p;
	struct rq *rq = task_rq(p);

	lockdep_assert_rq_held(rq);

	if (ctx->queued)
		enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
		enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
	if (ctx->running)
		set_next_task(rq, ctx->p);
		set_next_task(rq, p);
}
#endif /* CONFIG_SCHED_CLASS_EXT */
+18 −21
Original line number Diff line number Diff line
@@ -3780,11 +3780,10 @@ static void scx_bypass(bool bypass)
		 */
		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
						 scx.runnable_node) {
			struct sched_enq_and_set_ctx ctx;

			/* cycling deq/enq is enough, see the function comment */
			sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
			sched_enq_and_set_task(&ctx);
			scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
				/* nothing */ ;
			}
		}

		/* resched to restore ticks and idle state */
@@ -3916,17 +3915,16 @@ static void scx_disable_workfn(struct kthread_work *work)
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class =
			__setscheduler_class(p->policy, p->prio);
		struct sched_enq_and_set_ctx ctx;

		if (old_class != new_class && p->se.sched_delayed)
			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
		update_rq_clock(task_rq(p));

		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
		if (old_class != new_class && p->se.sched_delayed)
			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);

		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
			p->sched_class = new_class;
			check_class_changing(task_rq(p), p, old_class);

		sched_enq_and_set_task(&ctx);
		}

		check_class_changed(task_rq(p), p, old_class, p->prio);
		scx_exit_task(p);
@@ -4660,21 +4658,20 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
		const struct sched_class *old_class = p->sched_class;
		const struct sched_class *new_class =
			__setscheduler_class(p->policy, p->prio);
		struct sched_enq_and_set_ctx ctx;

		if (!tryget_task_struct(p))
			continue;

		if (old_class != new_class && p->se.sched_delayed)
			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
		update_rq_clock(task_rq(p));

		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
		if (old_class != new_class && p->se.sched_delayed)
			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);

		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
			p->scx.slice = SCX_SLICE_DFL;
			p->sched_class = new_class;
			check_class_changing(task_rq(p), p, old_class);

		sched_enq_and_set_task(&ctx);
		}

		check_class_changed(task_rq(p), p, old_class, p->prio);
		put_task_struct(p);
+24 −9
Original line number Diff line number Diff line
@@ -3885,23 +3885,38 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p,
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);

#ifdef CONFIG_SCHED_CLASS_EXT
/*
 * Used by SCX in the enable/disable paths to move tasks between sched_classes
 * and establish invariants.
 * The 'sched_change' pattern is the safe, easy and slow way of changing a
 * task's scheduling properties. It dequeues a task, such that the scheduler
 * is fully unaware of it; at which point its properties can be modified;
 * after which it is enqueued again.
 *
 * Typically this must be called while holding task_rq_lock, since most/all
 * properties are serialized under those locks. There is currently one
 * exception to this rule in sched/ext which only holds rq->lock.
 */

/*
 * This structure is a temporary, used to preserve/convey the queueing state
 * of the task between sched_change_begin() and sched_change_end(). Ensuring
 * the task's queueing state is idempotent across the operation.
 */
struct sched_enq_and_set_ctx {
struct sched_change_ctx {
	struct task_struct	*p;
	int			queue_flags;
	int			flags;
	bool			queued;
	bool			running;
};

void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
			    struct sched_enq_and_set_ctx *ctx);
void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
void sched_change_end(struct sched_change_ctx *ctx);

#endif /* CONFIG_SCHED_CLASS_EXT */
/*
 * Declare the 'sched_change' class, used as
 * scoped_guard (sched_change, p, flags) { ... }.
 *
 * The guarded value is the struct sched_change_ctx * returned by
 * sched_change_begin(p, flags); sched_change_end() is handed that same
 * pointer (_T).
 *
 * NOTE(review): the argument roles (name, type, exit expression, init
 * expression, init arguments) are those of DEFINE_CLASS() in
 * <linux/cleanup.h>, which is not visible here; confirm there.
 */
DEFINE_CLASS(sched_change, struct sched_change_ctx *,
	     sched_change_end(_T),
	     sched_change_begin(p, flags),
	     struct task_struct *p, unsigned int flags)

DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)

#include "ext.h"

+23 −42
Original line number Diff line number Diff line
@@ -64,7 +64,6 @@ static int effective_prio(struct task_struct *p)

void set_user_nice(struct task_struct *p, long nice)
{
	bool queued, running;
	struct rq *rq;
	int old_prio;

@@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p, long nice)
		return;
	}

	queued = task_on_rq_queued(p);
	running = task_current_donor(rq, p);
	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	if (running)
		put_prev_task(rq, p);

	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
		p->static_prio = NICE_TO_PRIO(nice);
		set_load_weight(p, true);
		old_prio = p->prio;
		p->prio = effective_prio(p);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_next_task(rq, p);
	}

	/*
	 * If the task increased its priority or is running and
@@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_struct *p,
			 bool user, bool pi)
{
	int oldpolicy = -1, policy = attr->sched_policy;
	int retval, oldprio, newprio, queued, running;
	int retval, oldprio, newprio;
	const struct sched_class *prev_class, *next_class;
	struct balance_callback *head;
	struct rq_flags rf;
@@ -698,12 +687,7 @@ int __sched_setscheduler(struct task_struct *p,
	if (prev_class != next_class && p->se.sched_delayed)
		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);

	queued = task_on_rq_queued(p);
	running = task_current_donor(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flags);
	if (running)
		put_prev_task(rq, p);
	scoped_guard (sched_change, p, queue_flags) {

		if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
			__setscheduler_params(p, attr);
@@ -713,18 +697,15 @@ int __sched_setscheduler(struct task_struct *p,
		__setscheduler_uclamp(p, attr);
		check_class_changing(rq, p, prev_class);

	if (queued) {
		if (scope->queued) {
			/*
			 * We enqueue to tail when the priority of a task is
			 * increased (user space view).
			 */
			if (oldprio < p->prio)
			queue_flags |= ENQUEUE_HEAD;

		enqueue_task(rq, p, queue_flags);
				scope->flags |= ENQUEUE_HEAD;
		}
	}
	if (running)
		set_next_task(rq, p);

	check_class_changed(rq, p, prev_class, oldprio);