Commit 3b3bea6d authored by Linus Torvalds
Browse files

Merge tag 'cgroup-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - Fix UAF race in psi pressure_write() against cgroup file release by
   extending cgroup_mutex coverage and ordering of->priv access after
   cgroup_kn_lock_live()

 - Fix integer overflow in rdmacg_try_charge() when usage equals INT_MAX
   by performing the increment in s64

 - Fix asymmetric DL bandwidth accounting on cpuset attach rollback by
   recording the CPU used by dl_bw_alloc() so cancel_attach() returns
   the reservation to the same root domain

 - Fix nr_dying_subsys_* race that briefly showed 0 in cgroup.stat after
   rmdir by incrementing from kill_css() instead of offline_css()

 - Typo fix in cgroup-v2 documentation

* tag 'cgroup-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup: fix typo 'protetion' -> 'protection'
  cgroup: Increment nr_dying_subsys_* from rmdir context
  cgroup/cpuset: record DL BW alloc CPU for attach rollback
  cgroup/rdma: fix integer overflow in rdmacg_try_charge()
  sched/psi: fix race between file release and pressure write
parents a1a67109 981cd338
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -220,7 +220,7 @@ cgroup v2 currently supports the following mount options.
  memory_hugetlb_accounting
        Count HugeTLB memory usage towards the cgroup's overall
        memory usage for the memory controller (for the purpose of
        statistics reporting and memory protetion). This is a new
        statistics reporting and memory protection). This is a new
        behavior that could regress existing setups, so it must be
        explicitly opted in with this mount option.

+28 −18
Original line number Diff line number Diff line
@@ -3934,33 +3934,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, enum psi_res res)
{
	struct cgroup_file_ctx *ctx = of->priv;
	struct cgroup_file_ctx *ctx;
	struct psi_trigger *new;
	struct cgroup *cgrp;
	struct psi_group *psi;
	ssize_t ret = 0;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	cgroup_get(cgrp);
	cgroup_kn_unlock(of->kn);
	ctx = of->priv;
	if (!ctx) {
		ret = -ENODEV;
		goto out_unlock;
	}

	/* Allow only one trigger per file descriptor */
	if (ctx->psi.trigger) {
		cgroup_put(cgrp);
		return -EBUSY;
		ret = -EBUSY;
		goto out_unlock;
	}

	psi = cgroup_psi(cgrp);
	new = psi_trigger_create(psi, buf, res, of->file, of);
	if (IS_ERR(new)) {
		cgroup_put(cgrp);
		return PTR_ERR(new);
		ret = PTR_ERR(new);
		goto out_unlock;
	}

	smp_store_release(&ctx->psi.trigger, new);
	cgroup_put(cgrp);

out_unlock:
	cgroup_kn_unlock(of->kn);
	if (ret)
		return ret;

	return nbytes;
}
@@ -5716,16 +5724,6 @@ static void offline_css(struct cgroup_subsys_state *css)
	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);

	wake_up_all(&css->cgroup->offline_waitq);

	css->cgroup->nr_dying_subsys[ss->id]++;
	/*
	 * Parent css and cgroup cannot be freed until after the freeing
	 * of child css, see css_free_rwork_fn().
	 */
	while ((css = css->parent)) {
		css->nr_descendants--;
		css->cgroup->nr_dying_subsys[ss->id]++;
	}
}

/**
@@ -6038,6 +6036,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 */
static void kill_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;

	lockdep_assert_held(&cgroup_mutex);

	if (css->flags & CSS_DYING)
@@ -6074,6 +6074,16 @@ static void kill_css(struct cgroup_subsys_state *css)
	 * css is confirmed to be seen as killed on all CPUs.
	 */
	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);

	css->cgroup->nr_dying_subsys[ss->id]++;
	/*
	 * Parent css and cgroup cannot be freed until after the freeing
	 * of child css, see css_free_rwork_fn().
	 */
	while ((css = css->parent)) {
		css->nr_descendants--;
		css->cgroup->nr_dying_subsys[ss->id]++;
	}
}

/**
+5 −0
Original line number Diff line number Diff line
@@ -168,6 +168,11 @@ struct cpuset {
	int nr_deadline_tasks;
	int nr_migrate_dl_tasks;
	u64 sum_migrate_dl_bw;
	/*
	 * CPU used for temporary DL bandwidth allocation during attach;
	 * -1 if no DL bandwidth was allocated in the current attach.
	 */
	int dl_bw_cpu;

	/* Invalid partition error code, not lock protected */
	enum prs_errcode prs_err;
+9 −4
Original line number Diff line number Diff line
@@ -288,6 +288,7 @@ struct cpuset top_cpuset = {
	.flags = BIT(CS_CPU_EXCLUSIVE) |
		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
	.partition_root_state = PRS_ROOT,
	.dl_bw_cpu = -1,
};

/**
@@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
	if (!trial)
		return NULL;

	trial->dl_bw_cpu = -1;

	/* Setup cpumask pointer array */
	cpumask_var_t *pmask[4] = {
		&trial->cpus_allowed,
@@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs)
{
	cs->nr_migrate_dl_tasks = 0;
	cs->sum_migrate_dl_bw = 0;
	cs->dl_bw_cpu = -1;
}

/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
@@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
			reset_migrate_dl_data(cs);
			goto out_unlock;
		}

		cs->dl_bw_cpu = cpu;
	}

out_success:
@@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
	mutex_lock(&cpuset_mutex);
	dec_attach_in_progress_locked(cs);

	if (cs->nr_migrate_dl_tasks) {
		int cpu = cpumask_any(cs->effective_cpus);
	if (cs->dl_bw_cpu >= 0)
		dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);

		dl_bw_free(cpu, cs->sum_migrate_dl_bw);
	if (cs->nr_migrate_dl_tasks)
		reset_migrate_dl_data(cs);
	}

	mutex_unlock(&cpuset_mutex);
}
+1 −1
Original line number Diff line number Diff line
@@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
			ret = PTR_ERR(rpool);
			goto err;
		} else {
			new = rpool->resources[index].usage + 1;
			new = (s64)rpool->resources[index].usage + 1;
			if (new > rpool->resources[index].max) {
				ret = -EAGAIN;
				goto err;