Commit bd9a3dba authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull PSI updates from Ingo Molnar:

 - Various performance optimizations, resulting in a 4%-9% speedup in
   the mmtests/config-scheduler-perfpipe micro-benchmark.

 - New interface to turn PSI on/off on a per cgroup level.

* tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/psi: Per-cgroup PSI accounting disable/re-enable interface
  sched/psi: Cache parent psi_group to speed up group iteration
  sched/psi: Consolidate cgroup_psi()
  sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
  sched/psi: Remove NR_ONCPU task accounting
  sched/psi: Optimize task switch inside shared cgroups again
  sched/psi: Move private helpers to sched/stats.h
  sched/psi: Save percpu memory when !psi_cgroups_enabled
  sched/psi: Don't create cgroup PSI files when psi_disabled
  sched/psi: Fix periodic aggregation shut off
parents 1df046ab 34f26a15
Loading
Loading
Loading
Loading
+23 −0
Original line number Diff line number Diff line
@@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
	killing cgroups is a process directed operation, i.e. it affects
	the whole thread-group.

  cgroup.pressure
	A read-write single value file whose allowed values are "0" and "1".
	The default is "1".

	Writing "0" to the file will disable the cgroup PSI accounting.
	Writing "1" to the file will re-enable the cgroup PSI accounting.

	This control attribute is not hierarchical, so disabling or enabling PSI
	accounting in a cgroup does not affect PSI accounting in its descendants,
	and enabling it does not require it to be enabled in every ancestor up
	to the root.

	The reason this control attribute exists is that PSI accounts stalls for
	each cgroup separately and aggregates them at each level of the hierarchy.
	This may cause non-negligible overhead for some workloads that run deep
	in the hierarchy, in which case this control attribute can be used to
	disable PSI accounting in the non-leaf cgroups.

  irq.pressure
	A read-write nested-keyed file.

	Shows pressure stall information for IRQ/SOFTIRQ. See
	:ref:`Documentation/accounting/psi.rst <psi>` for details.

Controllers
===========

+3 −0
Original line number Diff line number Diff line
@@ -428,6 +428,9 @@ struct cgroup {
	struct cgroup_file procs_file;	/* handle for "cgroup.procs" */
	struct cgroup_file events_file;	/* handle for "cgroup.events" */

	/* handles for "{cpu,memory,io,irq}.pressure" */
	struct cgroup_file psi_files[NR_PSI_RESOURCES];

	/*
	 * The bitmask of subsystems enabled on the child cgroups.
	 * ->subtree_control is the one configured through
+0 −5
Original line number Diff line number Diff line
@@ -682,11 +682,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
	pr_cont_kernfs_path(cgrp->kn);
}

/* Return the psi_group pointer stored in @cgrp->psi. */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
	struct psi_group *group = cgrp->psi;

	return group;
}

bool cgroup_psi_enabled(void);

static inline void cgroup_init_kthreadd(void)
+8 −4
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/cgroup-defs.h>
#include <linux/cgroup.h>

struct seq_file;
struct css_set;
@@ -18,10 +19,6 @@ extern struct psi_group psi_system;

void psi_init(void);

void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		     bool sleep);

void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);

@@ -34,9 +31,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
			poll_table *wait);

#ifdef CONFIG_CGROUPS
/*
 * Map @cgrp to the psi_group that accounts for it.
 *
 * The cgroup whose inode number is 1 (presumably the root cgroup -- confirm
 * against cgroup_ino()) is served by the global psi_system group; every
 * other cgroup uses the group referenced by its own ->psi pointer.
 */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
	if (cgroup_ino(cgrp) == 1)
		return &psi_system;

	return cgrp->psi;
}

int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
void psi_cgroup_restart(struct psi_group *group);
#endif

#else /* CONFIG_PSI */
@@ -58,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
	rcu_assign_pointer(p->cgroups, to);
}
/*
 * Empty stub on the !CONFIG_PSI side of this header, so callers can invoke
 * psi_cgroup_restart() unconditionally; @group is ignored.
 */
static inline void psi_cgroup_restart(struct psi_group *group) {}
#endif

#endif /* CONFIG_PSI */
+20 −11
Original line number Diff line number Diff line
@@ -15,13 +15,6 @@ enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
	/*
	 * This can't have values other than 0 or 1 and could be
	 * implemented as a bit flag. But for now we still have room
	 * in the first cacheline of psi_group_cpu, and this way we
	 * don't have to special case any state tracking for it.
	 */
	NR_ONCPU,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
@@ -32,22 +25,27 @@ enum psi_task_count {
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
	NR_PSI_TASK_COUNTS = 5,
	NR_PSI_TASK_COUNTS = 4,
};

/* Task state bitmasks */
#define TSK_IOWAIT	(1 << NR_IOWAIT)
#define TSK_MEMSTALL	(1 << NR_MEMSTALL)
#define TSK_RUNNING	(1 << NR_RUNNING)
#define TSK_ONCPU	(1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)

/* Only one task can be scheduled, no corresponding task count */
#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)

/* Resources that workloads could be stalled on */
enum psi_res {
	PSI_IO,
	PSI_MEM,
	PSI_CPU,
	NR_PSI_RESOURCES = 3,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	PSI_IRQ,
#endif
	NR_PSI_RESOURCES,
};

/*
@@ -63,11 +61,17 @@ enum psi_states {
	PSI_MEM_FULL,
	PSI_CPU_SOME,
	PSI_CPU_FULL,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	PSI_IRQ_FULL,
#endif
	/* Only per-CPU, to weigh the CPU in the global average: */
	PSI_NONIDLE,
	NR_PSI_STATES = 7,
	NR_PSI_STATES,
};

/* Use one bit in the state mask to track TSK_ONCPU */
#define PSI_ONCPU	(1 << NR_PSI_STATES)

enum psi_aggregators {
	PSI_AVGS = 0,
	PSI_POLL,
@@ -147,6 +151,9 @@ struct psi_trigger {
};

struct psi_group {
	struct psi_group *parent;
	bool enabled;

	/* Protects data used by the aggregator */
	struct mutex avgs_lock;

@@ -188,6 +195,8 @@ struct psi_group {

#else /* CONFIG_PSI */

/*
 * Fallback definitions for builds without CONFIG_PSI.
 *
 * NR_PSI_RESOURCES is still needed because struct cgroup sizes its
 * psi_files[] array with it; with a value of 0 that array has no elements.
 */
#define NR_PSI_RESOURCES	0

/* Empty placeholder so struct psi_group remains a complete type. */
struct psi_group { };

#endif /* CONFIG_PSI */
Loading