Commit 34f26a15, authored by Chengming Zhou, committed by Peter Zijlstra
Browse files

sched/psi: Per-cgroup PSI accounting disable/re-enable interface



PSI accounts stalls for each cgroup separately and aggregates them
at each level of the hierarchy. This may cause non-negligible overhead
for some workloads that run deep in the hierarchy.

Commit 3958e2d0 ("cgroup: make per-cgroup pressure stall tracking configurable")
lets PSI skip per-cgroup stall accounting and only account system-wide,
to avoid this per-level overhead.

But for our use case, we also want PSI stats accounted for leaf cgroups, so
that userspace can make adjustments per cgroup, not only system-wide.

So this patch introduces a per-cgroup PSI accounting disable/re-enable
interface, "cgroup.pressure". It is a read-write single value file whose
allowed values are "0" and "1". The default is "1", so per-cgroup
PSI stats are enabled by default.

Implementation details:

It should be relatively straight-forward to disable and re-enable
state aggregation, time tracking, averaging on a per-cgroup level,
if we can live with losing history from while it was disabled.
I.e. the avgs will restart from 0, total= will have gaps.

But it's hard or complex to stop/restart groupc->tasks[] updates,
so that is not implemented in this patch. We therefore always update
groupc->tasks[] and the PSI_ONCPU bit in psi_group_change(), even when
the cgroup's PSI stats are disabled.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20220907090332.2078-1-zhouchengming@bytedance.com
parent dc86aba7
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -976,6 +976,23 @@ All cgroup core files are prefixed with "cgroup."
	killing cgroups is a process directed operation, i.e. it affects
	the whole thread-group.

  cgroup.pressure
	A read-write single value file. Allowed values are "0" and "1".
	The default is "1".

	Writing "0" to the file will disable the cgroup PSI accounting.
	Writing "1" to the file will re-enable the cgroup PSI accounting.

	This control attribute is not hierarchical, so disabling or enabling PSI
	accounting in a cgroup does not affect PSI accounting in descendants
	and does not require enablement to be passed down from the root
	through ancestors.

	The reason this control attribute exists is that PSI accounts stalls for
	each cgroup separately and aggregates them at each level of the hierarchy.
	This may cause non-negligible overhead for some workloads that run deep
	in the hierarchy, in which case this control attribute can be used to
	disable PSI accounting in the non-leaf cgroups.

  irq.pressure
	A read-write nested-keyed file.

+3 −0
Original line number Diff line number Diff line
@@ -428,6 +428,9 @@ struct cgroup {
	struct cgroup_file procs_file;	/* handle for "cgroup.procs" */
	struct cgroup_file events_file;	/* handle for "cgroup.events" */

	/* handles for "{cpu,memory,io,irq}.pressure" */
	struct cgroup_file psi_files[NR_PSI_RESOURCES];

	/*
	 * The bitmask of subsystems enabled on the child cgroups.
	 * ->subtree_control is the one configured through
+2 −0
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
void psi_cgroup_restart(struct psi_group *group);
#endif

#else /* CONFIG_PSI */
@@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
	rcu_assign_pointer(p->cgroups, to);
}
static inline void psi_cgroup_restart(struct psi_group *group) {}
#endif

#endif /* CONFIG_PSI */
+3 −0
Original line number Diff line number Diff line
@@ -152,6 +152,7 @@ struct psi_trigger {

struct psi_group {
	struct psi_group *parent;
	bool enabled;

	/* Protects data used by the aggregator */
	struct mutex avgs_lock;
@@ -194,6 +195,8 @@ struct psi_group {

#else /* CONFIG_PSI */

#define NR_PSI_RESOURCES	0

struct psi_group { };

#endif /* CONFIG_PSI */
+64 −6
Original line number Diff line number Diff line
@@ -3708,7 +3708,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
	return psi_show(seq, psi, PSI_CPU);
}

static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, enum psi_res res)
{
	struct cgroup_file_ctx *ctx = of->priv;
@@ -3746,21 +3746,21 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes,
					  loff_t off)
{
	return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
	return pressure_write(of, buf, nbytes, PSI_IO);
}

static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes,
					  loff_t off)
{
	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
	return pressure_write(of, buf, nbytes, PSI_MEM);
}

static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes,
					  loff_t off)
{
	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
	return pressure_write(of, buf, nbytes, PSI_CPU);
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3776,10 +3776,58 @@ static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
					 char *buf, size_t nbytes,
					 loff_t off)
{
	return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
	return pressure_write(of, buf, nbytes, PSI_IRQ);
}
#endif

/*
 * seq_file show handler for "cgroup.pressure".
 *
 * Looks up the psi_group of the cgroup behind @seq and prints its
 * ->enabled flag as a decimal integer followed by a newline.
 * Always returns 0.
 */
static int cgroup_pressure_show(struct seq_file *seq, void *v)
{
	struct psi_group *group = cgroup_psi(seq_css(seq)->cgroup);

	seq_printf(seq, "%d\n", group->enabled);
	return 0;
}

/*
 * Write handler for "cgroup.pressure".
 *
 * @buf is passed through strstrip() and parsed with kstrtoint() (base 0).
 * A parse failure returns the kstrtoint() error; any value other than
 * 0 or 1 returns -ERANGE.  If the cgroup cannot be locked live the
 * write fails with -ENOENT.
 *
 * When the requested state differs from the current psi->enabled, the
 * per-resource pressure files are shown or hidden first, then the flag
 * is updated, and on enable psi_cgroup_restart() is called.
 *
 * Returns @nbytes on success.
 */
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
				     char *buf, size_t nbytes,
				     loff_t off)
{
	struct psi_group *psi;
	struct cgroup *cgrp;
	ssize_t ret;
	int enable;
	int res;

	ret = kstrtoint(strstrip(buf), 0, &enable);
	if (ret)
		return ret;

	/* Only "0" (disable) and "1" (enable) are valid. */
	if (enable != 0 && enable != 1)
		return -ERANGE;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENOENT;

	psi = cgroup_psi(cgrp);
	if (psi->enabled == enable)
		goto out_unlock;	/* already in the requested state */

	/* show or hide {cpu,memory,io,irq}.pressure files */
	for (res = 0; res < NR_PSI_RESOURCES; res++)
		cgroup_file_show(&cgrp->psi_files[res], enable);

	psi->enabled = enable;
	if (enable)
		psi_cgroup_restart(psi);

out_unlock:
	cgroup_kn_unlock(of->kn);

	return nbytes;
}

static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
					  poll_table *pt)
{
@@ -5175,6 +5223,7 @@ static struct cftype cgroup_base_files[] = {
	{
		.name = "io.pressure",
		.flags = CFTYPE_PRESSURE,
		.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
		.seq_show = cgroup_io_pressure_show,
		.write = cgroup_io_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -5183,6 +5232,7 @@ static struct cftype cgroup_base_files[] = {
	{
		.name = "memory.pressure",
		.flags = CFTYPE_PRESSURE,
		.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
		.seq_show = cgroup_memory_pressure_show,
		.write = cgroup_memory_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -5191,6 +5241,7 @@ static struct cftype cgroup_base_files[] = {
	{
		.name = "cpu.pressure",
		.flags = CFTYPE_PRESSURE,
		.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
		.seq_show = cgroup_cpu_pressure_show,
		.write = cgroup_cpu_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -5200,12 +5251,19 @@ static struct cftype cgroup_base_files[] = {
	{
		.name = "irq.pressure",
		.flags = CFTYPE_PRESSURE,
		.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
		.seq_show = cgroup_irq_pressure_show,
		.write = cgroup_irq_pressure_write,
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
#endif
	{
		.name = "cgroup.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_pressure_show,
		.write = cgroup_pressure_write,
	},
#endif /* CONFIG_PSI */
	{ }	/* terminate */
};
Loading