Commit 2a0adc49 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched_urgent_for_v6.7_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

 - Fix virtual runtime calculation when recomputing a sched entity's
   weights

 - Fix wrongly rejected unprivileged poll requests to the cgroup psi
   pressure files

 - Make sure the load balancing is done by only one CPU

* tag 'sched_urgent_for_v6.7_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix the decision for load balance
  sched: psi: fix unprivileged polling against cgroups
  sched/eevdf: Fix vruntime adjustment on reweight
parents 2f84f823 6d7e4782
Loading
Loading
Loading
Loading
+0 −12
Original line number Diff line number Diff line
@@ -3885,14 +3885,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
	return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}

static int cgroup_pressure_open(struct kernfs_open_file *of)
{
	if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	return 0;
}

static void cgroup_pressure_release(struct kernfs_open_file *of)
{
	struct cgroup_file_ctx *ctx = of->priv;
@@ -5299,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = {
	{
		.name = "io.pressure",
		.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
		.open = cgroup_pressure_open,
		.seq_show = cgroup_io_pressure_show,
		.write = cgroup_io_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -5308,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = {
	{
		.name = "memory.pressure",
		.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
		.open = cgroup_pressure_open,
		.seq_show = cgroup_memory_pressure_show,
		.write = cgroup_memory_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -5317,7 +5307,6 @@ static struct cftype cgroup_psi_files[] = {
	{
		.name = "cpu.pressure",
		.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
		.open = cgroup_pressure_open,
		.seq_show = cgroup_cpu_pressure_show,
		.write = cgroup_cpu_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -5327,7 +5316,6 @@ static struct cftype cgroup_psi_files[] = {
	{
		.name = "irq.pressure",
		.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
		.open = cgroup_pressure_open,
		.seq_show = cgroup_irq_pressure_show,
		.write = cgroup_irq_pressure_write,
		.poll = cgroup_pressure_poll,
+135 −26
Original line number Diff line number Diff line
@@ -3666,41 +3666,140 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif

static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
			   unsigned long weight)
{
	unsigned long old_weight = se->load.weight;
	u64 avruntime = avg_vruntime(cfs_rq);
	s64 vlag, vslice;

	/*
	 * VRUNTIME
	 * ========
	 *
	 * COROLLARY #1: The virtual runtime of the entity needs to be
	 * adjusted if re-weight at !0-lag point.
	 *
	 * Proof: For contradiction assume this is not true, so we can
	 * re-weight without changing vruntime at !0-lag point.
	 *
	 *             Weight	VRuntime   Avg-VRuntime
	 *     before    w          v            V
	 *      after    w'         v'           V'
	 *
	 * Since lag needs to be preserved through re-weight:
	 *
	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
	 *	==>	V' = (V - v)*w/w' + v		(1)
	 *
	 * Let W be the total weight of the entities before reweight,
	 * since V' is the new weighted average of entities:
	 *
	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
	 *
	 * by using (1) & (2) we obtain:
	 *
	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
	 *
	 * Since we are doing at !0-lag point which means V != v, we
	 * can simplify (3):
	 *
	 *	==>	W / (W + w' - w) = w / w'
	 *	==>	Ww' = Ww + ww' - ww
	 *	==>	W * (w' - w) = w * (w' - w)
	 *	==>	W = w	(re-weight indicates w' != w)
	 *
	 * So the cfs_rq contains only one entity, hence vruntime of
	 * the entity @v should always equal to the cfs_rq's weighted
	 * average vruntime @V, which means we will always re-weight
	 * at 0-lag point, thus breach assumption. Proof completed.
	 *
	 *
	 * COROLLARY #2: Re-weight does NOT affect weighted average
	 * vruntime of all the entities.
	 *
	 * Proof: According to corollary #1, Eq. (1) should be:
	 *
	 *	(V - v)*w = (V' - v')*w'
	 *	==>    v' = V' - (V - v)*w/w'		(4)
	 *
	 * According to the weighted average formula, we have:
	 *
	 *	V' = (WV - wv + w'v') / (W - w + w')
	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
	 *	   = (WV + w'V' - Vw) / (W - w + w')
	 *
	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
	 *	==>	V' * (W - w) = (W - w) * V	(5)
	 *
	 * If the entity is the only one in the cfs_rq, then reweight
	 * always occurs at 0-lag point, so V won't change. Or else
	 * there are other entities, hence W != w, then Eq. (5) turns
	 * into V' = V. So V won't change in either case, proof done.
	 *
	 *
	 * So according to corollary #1 & #2, the effect of re-weight
	 * on vruntime should be:
	 *
	 *	v' = V' - (V - v) * w / w'		(4)
	 *	   = V  - (V - v) * w / w'
	 *	   = V  - vl * w / w'
	 *	   = V  - vl'
	 */
	if (avruntime != se->vruntime) {
		vlag = (s64)(avruntime - se->vruntime);
		vlag = div_s64(vlag * old_weight, weight);
		se->vruntime = avruntime - vlag;
	}

	/*
	 * DEADLINE
	 * ========
	 *
	 * When the weight changes, the virtual time slope changes and
	 * we should adjust the relative virtual deadline accordingly.
	 *
	 *	d' = v' + (d - v)*w/w'
	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
	 *	   = V  + (d - V)*w/w'
	 */
	vslice = (s64)(se->deadline - avruntime);
	vslice = div_s64(vslice * old_weight, weight);
	se->deadline = avruntime + vslice;
}

static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	bool curr = cfs_rq->curr == se;

	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
		if (curr)
			update_curr(cfs_rq);
		else
			avg_vruntime_sub(cfs_rq, se);
			__dequeue_entity(cfs_rq, se);
		update_load_sub(&cfs_rq->load, se->load.weight);
	}
	dequeue_load_avg(cfs_rq, se);

	update_load_set(&se->load, weight);

	if (!se->on_rq) {
		/*
		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
		 * we need to scale se->vlag when w_i changes.
		 */
		se->vlag = div_s64(se->vlag * old_weight, weight);
		se->vlag = div_s64(se->vlag * se->load.weight, weight);
	} else {
		s64 deadline = se->deadline - se->vruntime;
		/*
		 * When the weight changes, the virtual time slope changes and
		 * we should adjust the relative virtual deadline accordingly.
		 */
		deadline = div_s64(deadline * old_weight, weight);
		se->deadline = se->vruntime + deadline;
		if (se != cfs_rq->curr)
			min_deadline_cb_propagate(&se->run_node, NULL);
		reweight_eevdf(cfs_rq, se, weight);
	}

	update_load_set(&se->load, weight);

#ifdef CONFIG_SMP
	do {
		u32 divider = get_pelt_divider(&se->avg);
@@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
	enqueue_load_avg(cfs_rq, se);
	if (se->on_rq) {
		update_load_add(&cfs_rq->load, se->load.weight);
		if (cfs_rq->curr != se)
			avg_vruntime_add(cfs_rq, se);
		if (!curr) {
			/*
			 * The entity's vruntime has been adjusted, so let's check
			 * whether the rq-wide min_vruntime needs updated too. Since
			 * the calculations above require stable min_vruntime rather
			 * than up-to-date one, we do the update at the end of the
			 * reweight process.
			 */
			__enqueue_entity(cfs_rq, se);
			update_min_vruntime(cfs_rq);
		}
	}
}

@@ -3857,13 +3965,10 @@ static void update_cfs_group(struct sched_entity *se)

#ifndef CONFIG_SMP
	shares = READ_ONCE(gcfs_rq->tg->shares);

	if (likely(se->load.weight == shares))
		return;
#else
	shares = calc_group_shares(gcfs_rq);
#endif

	if (unlikely(se->load.weight != shares))
		reweight_entity(cfs_rq_of(se), se, shares);
}

@@ -11079,12 +11184,16 @@ static int should_we_balance(struct lb_env *env)
			continue;
		}

		/* Are we the first idle CPU? */
		/*
		 * Are we the first idle core in a non-SMT domain or higher,
		 * or the first idle CPU in a SMT domain?
		 */
		return cpu == env->dst_cpu;
	}

	if (idle_smt == env->dst_cpu)
		return true;
	/* Are we the first idle CPU with busy siblings? */
	if (idle_smt != -1)
		return idle_smt == env->dst_cpu;

	/* Are we the first CPU of this group ? */
	return group_balance_cpu(sg) == env->dst_cpu;