Commit 4ae8d9aa authored by Peter Zijlstra's avatar Peter Zijlstra
Browse files

sched/deadline: Fix dl_server getting stuck



John found it was easy to hit lockup warnings when running locktorture
on a 2 CPU VM, which he bisected down to: commit cccb45d7
("sched/deadline: Less agressive dl_server handling").

While debugging it seems there is a chance where we end up with the
dl_server dequeued, with dl_se->dl_server_active. This causes
dl_server_start() to return without enqueueing the dl_server, thus it
fails to run when RT tasks starve the cpu.

When this happens, dl_server_timer() catches the
'!dl_se->server_has_tasks(dl_se)' case, which then calls
replenish_dl_entity() and dl_server_stopped() and finally return
HRTIMER_NO_RESTART.

This ends in no new timer and also no enqueue, leaving the dl_server
'dead', allowing starvation.

What should have happened is for the bandwidth timer to start the
zero-laxity timer, which in turn would enqueue the dl_server and cause
dl_se->server_pick_task() to be called -- which will stop the
dl_server if no fair tasks are observed for a whole period.

IOW, it is totally irrelevant if there are fair tasks at the moment of
bandwidth refresh.

This removes all dl_se->server_has_tasks() users, so remove the whole
thing.

Fixes: cccb45d7 ("sched/deadline: Less agressive dl_server handling")
Reported-by: default avatarJohn Stultz <jstultz@google.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: default avatarJohn Stultz <jstultz@google.com>
parent f83ec76b
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -733,7 +733,6 @@ struct sched_dl_entity {
	 * runnable task.
	 */
	struct rq			*rq;
	dl_server_has_tasks_f		server_has_tasks;
	dl_server_pick_f		server_pick_task;

#ifdef CONFIG_RT_MUTEXES
+1 −11
Original line number Diff line number Diff line
@@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
	 */
	if (dl_se->dl_defer && !dl_se->dl_defer_running &&
	    dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
		if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) {
		if (!is_dl_boosted(dl_se)) {

			/*
			 * Set dl_se->dl_defer_armed and dl_throttled variables to
@@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;

static bool dl_server_stopped(struct sched_dl_entity *dl_se);

static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
{
	struct rq *rq = rq_of_dl_se(dl_se);
@@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
		if (!dl_se->dl_runtime)
			return HRTIMER_NORESTART;

		if (!dl_se->server_has_tasks(dl_se)) {
			replenish_dl_entity(dl_se);
			dl_server_stopped(dl_se);
			return HRTIMER_NORESTART;
		}

		if (dl_se->dl_defer_armed) {
			/*
			 * First check if the server could consume runtime in background.
@@ -1625,11 +1617,9 @@ static bool dl_server_stopped(struct sched_dl_entity *dl_se)
}

void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
		    dl_server_has_tasks_f has_tasks,
		    dl_server_pick_f pick_task)
{
	dl_se->rq = rq;
	dl_se->server_has_tasks = has_tasks;
	dl_se->server_pick_task = pick_task;
}

+1 −6
Original line number Diff line number Diff line
@@ -8859,11 +8859,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
	return pick_next_task_fair(rq, prev, NULL);
}

static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
{
	return !!dl_se->rq->cfs.nr_queued;
}

static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
{
	return pick_task_fair(dl_se->rq);
@@ -8875,7 +8870,7 @@ void fair_server_init(struct rq *rq)

	init_dl_entity(dl_se);

	dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
	dl_server_init(dl_se, rq, fair_server_pick_task);
}

/*
+0 −4
Original line number Diff line number Diff line
@@ -365,9 +365,6 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
 *
 *   dl_se::rq -- runqueue we belong to.
 *
 *   dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
 *                                server when it runs out of tasks to run.
 *
 *   dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
 *                           returns NULL.
 *
@@ -383,7 +380,6 @@ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
		    dl_server_has_tasks_f has_tasks,
		    dl_server_pick_f pick_task);
extern void sched_init_dl_servers(void);