Merge tag 'cgroup-for-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup (3b66e6b3) · Commits · git / linux-nf

Documentation/admin-guide/cgroup-v2.rst

+57 −22

Original line number	Diff line number	Diff line
		@@ -1076,7 +1076,7 @@ cpufreq governor about the minimum desired frequency which should always be
		provided by a CPU, as well as the maximum desired frequency, which should not
		be exceeded by a CPU.

		WARNING: cgroup2 cpu controller doesn't yet fully support the control of
		WARNING: cgroup2 cpu controller doesn't yet support the (bandwidth) control of
		realtime processes. For a kernel built with the CONFIG_RT_GROUP_SCHED option
		enabled for group scheduling of realtime processes, the cpu controller can only
		be enabled when all RT processes are in the root cgroup. Be aware that system
		@@ -1095,19 +1095,34 @@ realtime processes irrespective of CONFIG_RT_GROUP_SCHED.
		CPU Interface Files
		~~~~~~~~~~~~~~~~~~~

		All time durations are in microseconds.
		The interaction of a process with the cpu controller depends on its scheduling
		policy and the underlying scheduler. From the point of view of the cpu controller,
		processes can be categorized as follows:

		* Processes under the fair-class scheduler
		* Processes under a BPF scheduler with the ``cgroup_set_weight`` callback
		* Everything else: ``SCHED_{FIFO,RR,DEADLINE}`` and processes under a BPF scheduler
		without the ``cgroup_set_weight`` callback

		For details on when a process is under the fair-class scheduler or a BPF scheduler,
		check out :ref:`Documentation/scheduler/sched-ext.rst <sched-ext>`.

		For each of the following interface files, the above categories
		will be referred to. All time durations are in microseconds.

		cpu.stat
		A read-only flat-keyed file.
		This file exists whether the controller is enabled or not.

		It always reports the following three stats:
		It always reports the following three stats, which account for all the
		processes in the cgroup:

		- usage_usec
		- user_usec
		- system_usec

		and the following five when the controller is enabled:
		and the following five when the controller is enabled, which account for
		only the processes under the fair-class scheduler:

		- nr_periods
		- nr_throttled
		@@ -1125,6 +1140,10 @@ All time durations are in microseconds.
		If the cgroup has been configured to be SCHED_IDLE (cpu.idle = 1),
		then the weight will show as a 0.

		This file affects only processes under the fair-class scheduler and a BPF
		scheduler with the ``cgroup_set_weight`` callback depending on what the
		callback actually does.

		cpu.weight.nice
		A read-write single value file which exists on non-root
		cgroups. The default is "0".
		@@ -1137,6 +1156,10 @@ All time durations are in microseconds.
		granularity is coarser for the nice values, the read value is
		the closest approximation of the current weight.

		This file affects only processes under the fair-class scheduler and a BPF
		scheduler with the ``cgroup_set_weight`` callback depending on what the
		callback actually does.

		cpu.max
		A read-write two value file which exists on non-root cgroups.
		The default is "max 100000".
		@@ -1149,18 +1172,24 @@ All time durations are in microseconds.
		$PERIOD duration. "max" for $MAX indicates no limit. If only
		one number is written, $MAX is updated.

		This file affects only processes under the fair-class scheduler.

		cpu.max.burst
		A read-write single value file which exists on non-root
		cgroups. The default is "0".

		The burst in the range [0, $MAX].

		This file affects only processes under the fair-class scheduler.

		cpu.pressure
		A read-write nested-keyed file.

		Shows pressure stall information for CPU. See
		:ref:`Documentation/accounting/psi.rst <psi>` for details.

		This file accounts for all the processes in the cgroup.

		cpu.uclamp.min
		A read-write single value file which exists on non-root cgroups.
		The default is "0", i.e. no utilization boosting.
		@@ -1170,12 +1199,15 @@ All time durations are in microseconds.

		This interface allows reading and setting minimum utilization clamp
		values similar to the sched_setattr(2). This minimum utilization
		value is used to clamp the task specific minimum utilization clamp.
		value is used to clamp the task specific minimum utilization clamp,
		including those of realtime processes.

		The requested minimum utilization (protection) is always capped by
		the current value for the maximum utilization (limit), i.e.
		`cpu.uclamp.max`.

		This file affects all the processes in the cgroup.

		cpu.uclamp.max
		A read-write single value file which exists on non-root cgroups.
		The default is "max", i.e. no utilization capping.
		@@ -1185,7 +1217,10 @@ All time durations are in microseconds.

		This interface allows reading and setting maximum utilization clamp
		values similar to the sched_setattr(2). This maximum utilization
		value is used to clamp the task specific maximum utilization clamp.
		value is used to clamp the task specific maximum utilization clamp,
		including those of realtime processes.

		This file affects all the processes in the cgroup.

		cpu.idle
		A read-write single value file which exists on non-root cgroups.
		@@ -1197,7 +1232,7 @@ All time durations are in microseconds.
		own relative priorities, but the cgroup itself will be treated as
		very low priority relative to its peers.


		This file affects only processes under the fair-class scheduler.

		Memory
		------

block/blk-cgroup.c

+5 −5

Original line number	Diff line number	Diff line
		@@ -1074,8 +1074,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
		/*
		* For covering concurrent parent blkg update from blkg_release().
		*
		* When flushing from cgroup, cgroup_rstat_lock is always held, so
		* this lock won't cause contention most of time.
		* When flushing from cgroup, the subsystem rstat lock is always held,
		* so this lock won't cause contention most of time.
		*/
		raw_spin_lock_irqsave(&blkg_stat_lock, flags);

		@@ -1144,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
		/*
		* We source root cgroup stats from the system-wide stats to avoid
		* tracking the same information twice and incurring overhead when no
		* cgroups are defined. For that reason, cgroup_rstat_flush in
		* cgroups are defined. For that reason, css_rstat_flush in
		* blkcg_print_stat does not actually fill out the iostat in the root
		* cgroup's blkcg_gq.
		*
		@@ -1253,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
		if (!seq_css(sf)->parent)
		blkcg_fill_root_iostats();
		else
		cgroup_rstat_flush(blkcg->css.cgroup);
		css_rstat_flush(&blkcg->css);

		rcu_read_lock();
		hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		@@ -2243,7 +2243,7 @@ void blk_cgroup_bio_start(struct bio *bio)
		}

		u64_stats_update_end_irqrestore(&bis->sync, flags);
		cgroup_rstat_updated(blkcg->css.cgroup, cpu);
		css_rstat_updated(&blkcg->css, cpu);
		put_cpu();
		}

include/linux/bpf-cgroup.h

+5 −4

Original line number	Diff line number	Diff line
		@@ -114,8 +114,7 @@ struct bpf_prog_list {
		u32 flags;
		};

		int cgroup_bpf_inherit(struct cgroup *cgrp);
		void cgroup_bpf_offline(struct cgroup *cgrp);
		void __init cgroup_bpf_lifetime_notifier_init(void);

		int __cgroup_bpf_run_filter_skb(struct sock *sk,
		struct sk_buff *skb,
		@@ -431,8 +430,10 @@ const struct bpf_func_proto *
		cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
		#else

		static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
		static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
		/*
		 * No-op stand-in used on the #else side of the enclosing conditional
		 * (presumably when cgroup BPF support is compiled out - confirm against
		 * the matching #if), so callers need no guards of their own.
		 */
		static inline void cgroup_bpf_lifetime_notifier_init(void) {}

		static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
		enum bpf_prog_type ptype,

include/linux/cgroup-defs.h

+66 −34

Original line number	Diff line number	Diff line
		@@ -169,6 +169,23 @@ struct cgroup_subsys_state {
		/* reference count - access via css_[try]get() and css_put() */
		struct percpu_ref refcnt;

		/*
		* Depending on the context, this field is initialized
		* via css_rstat_init() at different places:
		*
		* when css is associated with cgroup::self
		* when css->cgroup is the root cgroup
		* performed in cgroup_init()
		* when css->cgroup is not the root cgroup
		* performed in cgroup_create()
		* when css is associated with a subsystem
		* when css->cgroup is the root cgroup
		* performed in cgroup_init_subsys() in the non-early path
		* when css->cgroup is not the root cgroup
		* performed in css_create()
		*/
		struct css_rstat_cpu __percpu *rstat_cpu;

		/*
		* siblings list anchored at the parent's ->children
		*
		@@ -177,9 +194,6 @@ struct cgroup_subsys_state {
		struct list_head sibling;
		struct list_head children;

		/* flush target list anchored at cgrp->rstat_css_list */
		struct list_head rstat_css_node;

		/*
		* PI: Subsys-unique ID. 0 is unused and root is always 1. The
		* matching css can be looked up using css_from_id().
		@@ -219,6 +233,16 @@ struct cgroup_subsys_state {
		* Protected by cgroup_mutex.
		*/
		int nr_descendants;

		/*
		* A singly-linked list of css structures to be rstat flushed.
		* This is a scratch field to be used exclusively by
		* css_rstat_flush().
		*
		* Protected by rstat_base_lock when css is cgroup::self.
		* Protected by css->ss->rstat_ss_lock otherwise.
		*/
		struct cgroup_subsys_state *rstat_flush_next;
		};

		/*
		@@ -329,10 +353,10 @@ struct cgroup_base_stat {

		/*
		* rstat - cgroup scalable recursive statistics. Accounting is done
		* per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
		* per-cpu in css_rstat_cpu which is then lazily propagated up the
		* hierarchy on reads.
		*
		* When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
		* When a stat gets updated, the css_rstat_cpu and its ancestors are
		* linked into the updated tree. On the following read, propagation only
		* considers and consumes the updated tree. This makes reading O(the
		* number of descendants which have been active since last read) instead of
		@@ -344,10 +368,29 @@ struct cgroup_base_stat {
		* frequency decreases the cost of each read.
		*
		* This struct hosts both the fields which implement the above -
		* updated_children and updated_next - and the fields which track basic
		* resource statistics on top of it - bsync, bstat and last_bstat.
		* updated_children and updated_next.
		*/
		struct cgroup_rstat_cpu {
		struct css_rstat_cpu {
			/*
			 * Child cgroups with stat updates on this cpu since the last read
			 * are linked on the parent's ->updated_children through
			 * ->updated_next. updated_children is terminated by its container css.
			 *
			 * In addition to being more compact, singly-linked list pointing to
			 * the css makes it unnecessary for each per-cpu struct to point back
			 * to the associated css.
			 *
			 * Protected by per-cpu css->ss->rstat_ss_cpu_lock.
			 */
			struct cgroup_subsys_state *updated_children;
			struct cgroup_subsys_state *updated_next;	/* NULL if not on the list */
		};

		/*
		* This struct hosts the fields which track basic resource statistics on
		* top of it - bsync, bstat and last_bstat.
		*/
		struct cgroup_rstat_base_cpu {
		/*
		* ->bsync protects ->bstat. These are the only fields which get
		* updated in the hot path.
		@@ -374,20 +417,6 @@ struct cgroup_rstat_cpu {
		* deltas to propagate to the per-cpu subtree_bstat.
		*/
		struct cgroup_base_stat last_subtree_bstat;

		/*
		* Child cgroups with stat updates on this cpu since the last read
		* are linked on the parent's ->updated_children through
		* ->updated_next.
		*
		* In addition to being more compact, singly-linked list pointing
		* to the cgroup makes it unnecessary for each per-cpu struct to
		* point back to the associated cgroup.
		*
		* Protected by per-cpu cgroup_rstat_cpu_lock.
		*/
		struct cgroup *updated_children;	/* terminated by self cgroup */
		struct cgroup *updated_next;		/* NULL iff not on the list */
		};

		struct cgroup_freezer_state {
		@@ -516,23 +545,23 @@ struct cgroup {
		struct cgroup *dom_cgrp;
		struct cgroup *old_dom_cgrp;		/* used while enabling threaded */

		/* per-cpu recursive resource statistics */
		struct cgroup_rstat_cpu __percpu *rstat_cpu;
		struct list_head rstat_css_list;

		/*
		* Add padding to separate the read mostly rstat_cpu and
		* rstat_css_list into a different cacheline from the following
		* rstat_flush_next and *bstat fields which can have frequent updates.
		* Depending on the context, this field is initialized via
		* css_rstat_init() at different places:
		*
		* when cgroup is the root cgroup
		* performed in cgroup_setup_root()
		* otherwise
		* performed in cgroup_create()
		*/
		CACHELINE_PADDING(_pad_);
		struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu;

		/*
		* A singly-linked list of cgroup structures to be rstat flushed.
		* This is a scratch field to be used exclusively by
		* cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock.
		* Add padding to keep the read mostly rstat per-cpu pointer on a
		* different cacheline than the following *bstat fields which can have
		* frequent updates.
		*/
		struct cgroup *rstat_flush_next;
		CACHELINE_PADDING(_pad_);

		/* cgroup basic resource statistics */
		struct cgroup_base_stat last_bstat;
		@@ -790,6 +819,9 @@ struct cgroup_subsys {
		* specifies the mask of subsystems that this one depends on.
		*/
		unsigned int depends_on;

		spinlock_t rstat_ss_lock;
		raw_spinlock_t __percpu *rstat_ss_cpu_lock;
		};

		extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

include/linux/cgroup.h

+21 −3

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@
		#include <linux/kernfs.h>
		#include <linux/jump_label.h>
		#include <linux/types.h>
		#include <linux/notifier.h>
		#include <linux/ns_common.h>
		#include <linux/nsproxy.h>
		#include <linux/user_namespace.h>
		@@ -40,7 +41,7 @@ struct kernel_clone_args;

		#ifdef CONFIG_CGROUPS

		enum {
		enum css_task_iter_flags {
		CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */
		CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */
		CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */
		@@ -66,10 +67,16 @@ struct css_task_iter {
		struct list_head iters_node; /* css_set->task_iters */
		};

		/*
		 * Event codes describing cgroup lifetime transitions. Presumably these are
		 * the action values delivered through cgroup_lifetime_notifier - confirm
		 * against the notifier call sites.
		 */
		enum cgroup_lifetime_events {
			CGROUP_LIFETIME_ONLINE	= 0,
			CGROUP_LIFETIME_OFFLINE	= 1,
		};

		extern struct file_system_type cgroup_fs_type;
		extern struct cgroup_root cgrp_dfl_root;
		extern struct css_set init_css_set;
		extern spinlock_t css_set_lock;
		extern struct blocking_notifier_head cgroup_lifetime_notifier;

		#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
		#include <linux/cgroup_subsys.h>
		@@ -347,6 +354,17 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css)
		return css->flags & CSS_DYING;
		}

		/*
		 * css_is_self - test whether @css is the css embedded in its own cgroup
		 * @css: css to test
		 *
		 * Returns true when @css is the ->self member of the cgroup it points to,
		 * false otherwise. Warns if such a css has a subsystem set.
		 */
		static inline bool css_is_self(struct cgroup_subsys_state *css)
		{
			if (css != &css->cgroup->self)
				return false;

			/* cgroup::self should not have subsystem association */
			WARN_ON(css->ss != NULL);
			return true;
		}

		static inline void cgroup_get(struct cgroup *cgrp)
		{
		css_get(&cgrp->self);
		@@ -688,8 +706,8 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
		/*
		* cgroup scalable recursive statistics.
		*/
		void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
		void cgroup_rstat_flush(struct cgroup *cgrp);
		void css_rstat_updated(struct cgroup_subsys_state *css, int cpu);
		void css_rstat_flush(struct cgroup_subsys_state *css);

		/*
		* Basic resource stats.