cgroup: Changes for v6.18

Merge tag 'cgroup-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - Extensive cpuset code cleanup and refactoring work with no functional
   changes: CPU mask computation logic refactoring, introducing new
   helpers, removing redundant code paths, and improving error handling
   for better maintainability.

 - A few cpuset bug fixes: partition creation failures when isolcpus is in
   use, a missing error return in update_cpumask(), and prevention of a
   NULL pointer access in free_tmpmasks().

 - Core cgroup changes include replacing the global percpu_rwsem with a
   per-threadgroup rwsem when writing to cgroup.procs for better
   scalability, workqueue conversions to use WQ_PERCPU and
   system_percpu_wq to prepare for workqueue default switching from
   percpu to unbound, and removal of unused code including the
   post_attach callback.

 - New cgroup.stat.local time accounting feature that tracks how long a
   cgroup has spent frozen, reported as frozen_usec (see the example
   sketch after this list).

 - Misc changes including selftests updates (new freezer time tests and
   backward compatibility fixes), documentation sync, string function
   safety improvements, and 64-bit division fixes.
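
   As a rough illustration (not part of this series), reading the new
   frozen_usec key from userspace could look like the sketch below; the
   /sys/fs/cgroup/test path is only a placeholder for any non-root v2
   cgroup on a kernel with this feature:

	/* Print frozen_usec from a cgroup's cgroup.stat.local file. */
	#include <stdio.h>
	#include <string.h>

	static long long read_frozen_usec(const char *cgroup_path)
	{
		char path[4096], key[64];
		long long val;
		FILE *f;

		snprintf(path, sizeof(path), "%s/cgroup.stat.local", cgroup_path);
		f = fopen(path, "r");
		if (!f)
			return -1;
		while (fscanf(f, "%63s %lld", key, &val) == 2) {
			if (!strcmp(key, "frozen_usec")) {
				fclose(f);
				return val;
			}
		}
		fclose(f);
		return -1;
	}

	int main(void)
	{
		printf("frozen_usec = %lld\n",
		       read_frozen_usec("/sys/fs/cgroup/test"));
		return 0;
	}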

* tag 'cgroup-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (39 commits)
  cpuset: remove is_prs_invalid helper
  cpuset: remove impossible warning in update_parent_effective_cpumask
  cpuset: remove redundant special case for null input in node mask update
  cpuset: fix missing error return in update_cpumask
  cpuset: Use new excpus for nocpu error check when enabling root partition
  cpuset: fix failure to enable isolated partition when containing isolcpus
  Documentation: cgroup-v2: Sync manual toctree
  cpuset: use partition_cpus_change for setting exclusive cpus
  cpuset: use parse_cpulist for setting cpus.exclusive
  cpuset: introduce partition_cpus_change
  cpuset: refactor cpus_allowed_validate_change
  cpuset: refactor out validate_partition
  cpuset: introduce cpus_excl_conflict and mems_excl_conflict helpers
  cpuset: refactor CPU mask buffer parsing logic
  cpuset: Refactor exclusive CPU mask computation logic
  cpuset: change return type of is_partition_[in]valid to bool
  cpuset: remove unused assignment to trialcs->partition_root_state
  cpuset: move the root cpuset write check earlier
  cgroup/cpuset: Remove redundant rcu_read_lock/unlock() in spin_lock
  cgroup: Remove redundant rcu_read_lock/unlock() in spin_lock
  ...
Commit 755fa5b4fb by Linus Torvalds, 2025-09-30 09:55:41 -07:00
18 changed files with 1360 additions and 439 deletions


@@ -15,6 +15,9 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
 .. CONTENTS
 
+[Whenever any new section is added to this document, please also add
+ an entry here.]
+
 1. Introduction
 1-1. Terminology
 1-2. What is cgroup?
@@ -25,9 +28,10 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
 2-2-2. Threads
 2-3. [Un]populated Notification
 2-4. Controlling Controllers
-2-4-1. Enabling and Disabling
-2-4-2. Top-down Constraint
-2-4-3. No Internal Process Constraint
+2-4-1. Availability
+2-4-2. Enabling and Disabling
+2-4-3. Top-down Constraint
+2-4-4. No Internal Process Constraint
 2-5. Delegation
 2-5-1. Model of Delegation
 2-5-2. Delegation Containment
@@ -61,14 +65,15 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
 5-4-1. PID Interface Files
 5-5. Cpuset
 5.5-1. Cpuset Interface Files
-5-6. Device
+5-6. Device controller
 5-7. RDMA
 5-7-1. RDMA Interface Files
 5-8. DMEM
+5-8-1. DMEM Interface Files
 5-9. HugeTLB
 5.9-1. HugeTLB Interface Files
 5-10. Misc
-5.10-1 Miscellaneous cgroup Interface Files
+5.10-1 Misc Interface Files
 5.10-2 Migration and Ownership
 5-11. Others
 5-11-1. perf_event
@@ -1001,6 +1006,24 @@ All cgroup core files are prefixed with "cgroup."
 	Total number of dying cgroup subsystems (e.g. memory
 	cgroup) at and beneath the current cgroup.
 
+  cgroup.stat.local
+	A read-only flat-keyed file which exists in non-root cgroups.
+	The following entry is defined:
+
+	  frozen_usec
+		Cumulative time that this cgroup has spent between freezing and
+		thawing, regardless of whether by self or ancestor groups.
+		NB: (not) reaching "frozen" state is not accounted here.
+
+		Using the following ASCII representation of a cgroup's freezer
+		state, ::
+
+			1 _____
+			frozen 0 __/ \__
+			ab cd
+
+		the duration being measured is the span between a and c.
+
   cgroup.freeze
 	A read-write single value file which exists on non-root cgroups.
 	Allowed values are "0" and "1". The default is "0".
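
A quick, non-authoritative userspace check of the semantics described above
(frozen_usec grows from the freeze request "a" until the thaw request "c");
the cgroup path is a placeholder and the caller needs write access to its
cgroup.freeze file:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static void write_str(const char *path, const char *s)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return;
		if (write(fd, s, strlen(s)) < 0)
			perror("write");
		close(fd);
	}

	int main(void)
	{
		char path[4096];

		snprintf(path, sizeof(path), "%s/cgroup.freeze",
			 "/sys/fs/cgroup/test");
		write_str(path, "1");	/* point "a": freeze requested */
		usleep(2000);
		write_str(path, "0");	/* point "c": thaw requested */
		/* frozen_usec in cgroup.stat.local should now be >= ~2000 */
		return 0;
	}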


@@ -91,6 +91,12 @@ enum {
	 * cgroup_threadgroup_rwsem. This makes hot path operations such as
	 * forks and exits into the slow path and more expensive.
	 *
+	 * Alleviate the contention between fork, exec, exit operations and
+	 * writing to cgroup.procs by taking a per threadgroup rwsem instead of
+	 * the global cgroup_threadgroup_rwsem. Fork and other operations
+	 * from threads in different thread groups no longer contend with
+	 * writing to cgroup.procs.
+	 *
	 * The static usage pattern of creating a cgroup, enabling controllers,
	 * and then seeding it with CLONE_INTO_CGROUP doesn't require write
	 * locking cgroup_threadgroup_rwsem and thus doesn't benefit from
@@ -140,6 +146,17 @@ enum {
 	__CFTYPE_ADDED = (1 << 18),
 };
 
+enum cgroup_attach_lock_mode {
+	/* Default */
+	CGRP_ATTACH_LOCK_GLOBAL,
+
+	/* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */
+	CGRP_ATTACH_LOCK_NONE,
+
+	/* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */
+	CGRP_ATTACH_LOCK_PER_THREADGROUP,
+};
+
 /*
  * cgroup_file is the handle for a file instance created in a cgroup which
  * is used, for example, to generate file changed notifications. This can
@@ -433,6 +450,23 @@ struct cgroup_freezer_state {
	 * frozen, SIGSTOPped, and PTRACEd.
	 */
 	int nr_frozen_tasks;
+
+	/* Freeze time data consistency protection */
+	seqcount_t freeze_seq;
+
+	/*
+	 * Most recent time the cgroup was requested to freeze.
+	 * Accesses guarded by freeze_seq counter. Writes serialized
+	 * by css_set_lock.
+	 */
+	u64 freeze_start_nsec;
+
+	/*
+	 * Total duration the cgroup has spent freezing.
+	 * Accesses guarded by freeze_seq counter. Writes serialized
+	 * by css_set_lock.
+	 */
+	u64 frozen_nsec;
 };
 
 struct cgroup {
@@ -746,7 +780,6 @@ struct cgroup_subsys {
 	int (*can_attach)(struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_taskset *tset);
-	void (*post_attach)(void);
 	int (*can_fork)(struct task_struct *task,
 			struct css_set *cset);
 	void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
@@ -822,6 +855,7 @@ struct cgroup_subsys {
 };
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+extern bool cgroup_enable_per_threadgroup_rwsem;
 
 struct cgroup_of_peak {
 	unsigned long value;
@@ -833,11 +867,14 @@ struct cgroup_of_peak {
  * @tsk: target task
  *
  * Allows cgroup operations to synchronize against threadgroup changes
- * using a percpu_rw_semaphore.
+ * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when
+ * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
 */
 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
 {
 	percpu_down_read(&cgroup_threadgroup_rwsem);
+	if (cgroup_enable_per_threadgroup_rwsem)
+		down_read(&tsk->signal->cgroup_threadgroup_rwsem);
 }
 
 /**
@@ -848,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
 */
 static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 {
+	if (cgroup_enable_per_threadgroup_rwsem)
+		up_read(&tsk->signal->cgroup_threadgroup_rwsem);
 	percpu_up_read(&cgroup_threadgroup_rwsem);
 }
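
For orientation only (not code from this patch): the writer side that this
locking serializes against fork/exec/exit is a plain write of a PID to
cgroup.procs, e.g. the self-migration sketch below, where the destination
path is a placeholder:

	/* Move the calling process into a destination v2 cgroup. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *procs = "/sys/fs/cgroup/test/cgroup.procs";
		char pid[32];
		int fd, len;

		fd = open(procs, O_WRONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		len = snprintf(pid, sizeof(pid), "%d\n", getpid());
		if (write(fd, pid, len) != len)
			perror("write");
		close(fd);
		return 0;
	}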


@@ -355,6 +355,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css)
 	return css->flags & CSS_DYING;
 }
 
+static inline bool css_is_online(struct cgroup_subsys_state *css)
+{
+	return css->flags & CSS_ONLINE;
+}
+
 static inline bool css_is_self(struct cgroup_subsys_state *css)
 {
 	if (css == &css->cgroup->self) {


@@ -226,6 +226,10 @@ struct signal_struct {
 	struct tty_audit_buf *tty_audit_buf;
 #endif
 
+#ifdef CONFIG_CGROUPS
+	struct rw_semaphore cgroup_threadgroup_rwsem;
+#endif
+
	/*
	 * Thread is the potential origin of an oom condition; kill first on
	 * oom


@@ -27,6 +27,9 @@ static struct signal_struct init_signals = {
 	},
 	.multiprocess = HLIST_HEAD_INIT,
 	.rlim = INIT_RLIMITS,
+#ifdef CONFIG_CGROUPS
+	.cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem),
+#endif
 	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
 	.exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
 #ifdef CONFIG_POSIX_TIMERS


@@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 		       bool threadgroup);
-void cgroup_attach_lock(bool lock_threadgroup);
-void cgroup_attach_unlock(bool lock_threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+			struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+			  struct task_struct *tsk);
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-					     bool *locked)
+					     enum cgroup_attach_lock_mode *lock_mode)
 	__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+			       enum cgroup_attach_lock_mode lock_mode)
 	__releases(&cgroup_threadgroup_rwsem);
 void cgroup_lock_and_drain_offline(struct cgroup *cgrp);


@@ -10,6 +10,7 @@
 #include <linux/sched/task.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <linux/delayacct.h>
 #include <linux/pid_namespace.h>
@@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 	int retval = 0;
 
 	cgroup_lock();
-	cgroup_attach_lock(true);
+	cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
 	for_each_root(root) {
 		struct cgroup *from_cgrp;
 
@@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (retval)
 			break;
 	}
-	cgroup_attach_unlock(true);
+	cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
 	cgroup_unlock();
 
 	return retval;
@@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 
 	cgroup_lock();
 
-	cgroup_attach_lock(true);
+	cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
 
 	/* all tasks in @from are being moved, all csets are source */
 	spin_lock_irq(&css_set_lock);
@@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	} while (task && !ret);
 out_err:
 	cgroup_migrate_finish(&mgctx);
-	cgroup_attach_unlock(true);
+	cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
 	cgroup_unlock();
 	return ret;
 }
@@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
 	struct task_struct *task;
 	const struct cred *cred, *tcred;
 	ssize_t ret;
-	bool locked;
+	enum cgroup_attach_lock_mode lock_mode;
 
 	cgrp = cgroup_kn_lock_live(of->kn, false);
 	if (!cgrp)
 		return -ENODEV;
 
-	task = cgroup_procs_write_start(buf, threadgroup, &locked);
+	task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
 	ret = PTR_ERR_OR_ZERO(task);
 	if (ret)
 		goto out_unlock;
@@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
 	ret = cgroup_attach_task(cgrp, task, threadgroup);
 
 out_finish:
-	cgroup_procs_write_finish(task, locked);
+	cgroup_procs_write_finish(task, lock_mode);
 out_unlock:
 	cgroup_kn_unlock(of->kn);
 
@@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
 
 	if (ctx->release_agent) {
 		spin_lock(&release_agent_path_lock);
-		strcpy(root->release_agent_path, ctx->release_agent);
+		strscpy(root->release_agent_path, ctx->release_agent);
 		spin_unlock(&release_agent_path_lock);
 	}
 
@@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void)
	 * Cap @max_active to 1 too.
	 */
 	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
-						    0, 1);
+						    WQ_PERCPU, 1);
 	BUG_ON(!cgroup_pidlist_destroy_wq);
 	return 0;
 }


@@ -125,7 +125,7 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
 /*
  * cgroup destruction makes heavy use of work items and there can be a lot
  * of concurrent destructions.  Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
  * which may lead to deadlock.
  *
  * A cgroup destruction should enqueue work sequentially to:
@@ -240,6 +240,14 @@ static u16 have_canfork_callback __read_mostly;
 
 static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
 
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
 	.ns.__ns_ref = REFCOUNT_INIT(2),
@@ -1327,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
 {
 	bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
 
-	/* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+	/*
+	 * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+	 * favordynmods can flip while task is between
+	 * cgroup_threadgroup_change_begin() and end(), so down_write global
+	 * cgroup_threadgroup_rwsem to synchronize them.
+	 *
+	 * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+	 * cgroup_threadgroup_rwsem doesn't exlude tasks between
+	 * cgroup_thread_group_change_begin() and end() and thus it's unsafe to
+	 * turn off. As the scenario is unlikely, simply disallow disabling once
+	 * enabled and print out a warning.
+	 */
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	if (favor && !favoring) {
+		cgroup_enable_per_threadgroup_rwsem = true;
 		rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
 		root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
 	} else if (!favor && favoring) {
+		if (cgroup_enable_per_threadgroup_rwsem)
+			pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
 		rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
 		root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
 	}
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 }
 
 static int cgroup_init_root_id(struct cgroup_root *root)
@@ -2484,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
 /**
  * cgroup_attach_lock - Lock for ->attach()
- * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether acquire and acquire which rwsem
+ * @tsk: thread group to lock
  *
  * cgroup migration sometimes needs to stabilize threadgroups against forks and
  * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
@@ -2504,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
  * Resolve the situation by always acquiring cpus_read_lock() before optionally
  * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
  * CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead
+ * on dynamic cgroup modifications. see the comment above
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
  */
-void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+			struct task_struct *tsk)
 {
 	cpus_read_lock();
-	if (lock_threadgroup)
+
+	switch (lock_mode) {
+	case CGRP_ATTACH_LOCK_NONE:
+		break;
+	case CGRP_ATTACH_LOCK_GLOBAL:
 		percpu_down_write(&cgroup_threadgroup_rwsem);
+		break;
+	case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+		down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+		break;
+	default:
+		pr_warn("cgroup: Unexpected attach lock mode.");
+		break;
+	}
 }
 
 /**
  * cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether release and release which rwsem
+ * @tsk: thread group to lock
  */
-void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+			  struct task_struct *tsk)
 {
-	if (lock_threadgroup)
+	switch (lock_mode) {
+	case CGRP_ATTACH_LOCK_NONE:
+		break;
+	case CGRP_ATTACH_LOCK_GLOBAL:
 		percpu_up_write(&cgroup_threadgroup_rwsem);
+		break;
+	case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+		up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+		break;
+	default:
+		pr_warn("cgroup: Unexpected attach lock mode.");
+		break;
+	}
+
 	cpus_read_unlock();
 }
 
@@ -2969,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 
 	/* look up all src csets */
 	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
 	task = leader;
 	do {
 		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
-	rcu_read_unlock();
 	spin_unlock_irq(&css_set_lock);
 
 	/* prepare dst csets and commit */
@@ -2993,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 }
 
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-					     bool *threadgroup_locked)
+					     enum cgroup_attach_lock_mode *lock_mode)
 {
 	struct task_struct *tsk;
 	pid_t pid;
@@ -3001,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 		return ERR_PTR(-EINVAL);
 
-	/*
-	 * If we migrate a single thread, we don't care about threadgroup
-	 * stability. If the thread is `current`, it won't exit(2) under our
-	 * hands or change PID through exec(2). We exclude
-	 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
-	 * callers by cgroup_mutex.
-	 * Therefore, we can skip the global lock.
-	 */
-	lockdep_assert_held(&cgroup_mutex);
-	*threadgroup_locked = pid || threadgroup;
-	cgroup_attach_lock(*threadgroup_locked);
-
+retry_find_task:
 	rcu_read_lock();
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
 			tsk = ERR_PTR(-ESRCH);
-			goto out_unlock_threadgroup;
+			goto out_unlock_rcu;
 		}
 	} else {
 		tsk = current;
@@ -3035,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
	 */
 	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
 		tsk = ERR_PTR(-EINVAL);
-		goto out_unlock_threadgroup;
+		goto out_unlock_rcu;
 	}
 
 	get_task_struct(tsk);
-	goto out_unlock_rcu;
+	rcu_read_unlock();
+
+	/*
+	 * If we migrate a single thread, we don't care about threadgroup
+	 * stability. If the thread is `current`, it won't exit(2) under our
+	 * hands or change PID through exec(2). We exclude
+	 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+	 * by cgroup_mutex. Therefore, we can skip the global lock.
+	 */
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (pid || threadgroup) {
+		if (cgroup_enable_per_threadgroup_rwsem)
+			*lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+		else
+			*lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+	} else {
+		*lock_mode = CGRP_ATTACH_LOCK_NONE;
+	}
+
+	cgroup_attach_lock(*lock_mode, tsk);
+
+	if (threadgroup) {
+		if (!thread_group_leader(tsk)) {
+			/*
+			 * A race with de_thread from another thread's exec()
+			 * may strip us of our leadership. If this happens,
+			 * throw this task away and try again.
+			 */
+			cgroup_attach_unlock(*lock_mode, tsk);
+			put_task_struct(tsk);
+			goto retry_find_task;
+		}
+	}
+
+	return tsk;
 
-out_unlock_threadgroup:
-	cgroup_attach_unlock(*threadgroup_locked);
-	*threadgroup_locked = false;
 out_unlock_rcu:
 	rcu_read_unlock();
 	return tsk;
 }
 
-void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+			       enum cgroup_attach_lock_mode lock_mode)
 {
-	struct cgroup_subsys *ss;
-	int ssid;
+	cgroup_attach_unlock(lock_mode, task);
 
 	/* release reference from cgroup_procs_write_start() */
 	put_task_struct(task);
-
-	cgroup_attach_unlock(threadgroup_locked);
-	for_each_subsys(ss, ssid)
-		if (ss->post_attach)
-			ss->post_attach();
 }
 
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -3113,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	struct cgroup_subsys_state *d_css;
 	struct cgroup *dsct;
 	struct css_set *src_cset;
+	enum cgroup_attach_lock_mode lock_mode;
 	bool has_tasks;
 	int ret;
 
@@ -3144,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
	 * write-locking can be skipped safely.
	 */
 	has_tasks = !list_empty(&mgctx.preloaded_src_csets);
-	cgroup_attach_lock(has_tasks);
+
+	if (has_tasks)
+		lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+	else
+		lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+	cgroup_attach_lock(lock_mode, NULL);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3165,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
 	cgroup_migrate_finish(&mgctx);
-	cgroup_attach_unlock(has_tasks);
+	cgroup_attach_unlock(lock_mode, NULL);
 	return ret;
 }
 
@@ -3788,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	unsigned int sequence;
+	u64 freeze_time;
+
+	do {
+		sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+		freeze_time = cgrp->freezer.frozen_nsec;
+		/* Add in current freezer interval if the cgroup is freezing. */
+		if (test_bit(CGRP_FREEZE, &cgrp->flags))
+			freeze_time += (ktime_get_ns() -
+					cgrp->freezer.freeze_start_nsec);
+	} while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+	do_div(freeze_time, NSEC_PER_USEC);
+	seq_printf(seq, "frozen_usec %llu\n", freeze_time);
+
+	return 0;
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 /**
  * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
@@ -5267,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	struct task_struct *task;
 	const struct cred *saved_cred;
 	ssize_t ret;
-	bool threadgroup_locked;
+	enum cgroup_attach_lock_mode lock_mode;
 
 	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
 	if (!dst_cgrp)
 		return -ENODEV;
 
-	task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+	task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
 	ret = PTR_ERR_OR_ZERO(task);
 	if (ret)
 		goto out_unlock;
@@ -5299,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
 
 out_finish:
-	cgroup_procs_write_finish(task, threadgroup_locked);
+	cgroup_procs_write_finish(task, lock_mode);
 out_unlock:
 	cgroup_kn_unlock(of->kn);
 
@@ -5380,6 +5478,11 @@ static struct cftype cgroup_base_files[] = {
 		.name = "cgroup.stat",
 		.seq_show = cgroup_stat_show,
 	},
+	{
+		.name = "cgroup.stat.local",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cgroup_core_local_stat_show,
+	},
 	{
 		.name = "cgroup.freeze",
 		.flags = CFTYPE_NOT_ON_ROOT,
@@ -5789,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
	 * if the parent has to be frozen, the child has too.
	 */
 	cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+	seqcount_init(&cgrp->freezer.freeze_seq);
 	if (cgrp->freezer.e_freeze) {
 		/*
 		 * Set the CGRP_FREEZE flag, so when a process will be
@@ -5797,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
 		 * consider it frozen immediately.
 		 */
 		set_bit(CGRP_FREEZE, &cgrp->flags);
+		cgrp->freezer.freeze_start_nsec = ktime_get_ns();
 		set_bit(CGRP_FROZEN, &cgrp->flags);
 	}
 
@@ -6352,13 +6457,13 @@ static int __init cgroup_wq_init(void)
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
-	cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
+	cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
 	BUG_ON(!cgroup_offline_wq);
 
-	cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
+	cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
 	BUG_ON(!cgroup_release_wq);
 
-	cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
+	cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
 	BUG_ON(!cgroup_free_wq);
 	return 0;
 }
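
cgroup_favor_dynmods() above is driven by the existing favordynmods mount
option, so the per-threadgroup locking path is opted into from userspace
roughly as in this sketch (mount point and privileges are assumptions, not
part of the patch):

	/* Remount the cgroup2 hierarchy with favordynmods enabled. */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Assumes cgroup2 at /sys/fs/cgroup and CAP_SYS_ADMIN. */
		if (mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT,
			  "favordynmods")) {
			perror("mount");
			return 1;
		}
		return 0;
	}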


@@ -38,7 +38,6 @@ enum prs_errcode {
 
 /* bits in struct cpuset flags field */
 typedef enum {
-	CS_ONLINE,
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
 	CS_MEM_HARDWALL,
@@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
 /* convenient tests for these bits */
 static inline bool is_cpuset_online(struct cpuset *cs)
 {
-	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
+	return css_is_online(&cs->css) && !css_is_dying(&cs->css);
 }
 
 static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on)
 ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 			     char *buf, size_t nbytes, loff_t off);
 int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);
 
 /*
  * cpuset-v1.c


@@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
-	cpus_read_lock();
-	cpuset_lock();
+	cpuset_full_lock();
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	cpuset_unlock();
-	cpus_read_unlock();
+	cpuset_full_unlock();
 	return retval;
 }
 
@@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
-	cpus_read_lock();
-	cpuset_lock();
+	cpuset_full_lock();
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
 		goto out_unlock;
 
@@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	cpuset_unlock();
-	cpus_read_unlock();
+	cpuset_full_unlock();
 	return retval;
 }

File diff suppressed because it is too large.


@@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
 		return -ENODEV;
 
 	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
 	cset = task_css_set(current);
 	refcnt = refcount_read(&cset->refcount);
 	seq_printf(seq, "css_set %pK %d", cset, refcnt);
@@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
 		seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
 			   css, css->id);
 	}
-	rcu_read_unlock();
 	spin_unlock_irq(&css_set_lock);
 	cgroup_kn_unlock(of->kn);
 	return 0;
@@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 		return -ENOMEM;
 
 	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
 	cset = task_css_set(current);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
@@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 		seq_printf(seq, "Root %d group %s\n",
 			   c->root->hierarchy_id, name_buf);
 	}
-	rcu_read_unlock();
 	spin_unlock_irq(&css_set_lock);
 	kfree(name_buf);
 	return 0;


@@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze)
 /*
  * Freeze or unfreeze all tasks in the given cgroup.
  */
-static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec)
 {
 	struct css_task_iter it;
 	struct task_struct *task;
@@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
-	if (freeze)
+	write_seqcount_begin(&cgrp->freezer.freeze_seq);
+	if (freeze) {
 		set_bit(CGRP_FREEZE, &cgrp->flags);
-	else
+		cgrp->freezer.freeze_start_nsec = ts_nsec;
+	} else {
 		clear_bit(CGRP_FREEZE, &cgrp->flags);
+		cgrp->freezer.frozen_nsec += (ts_nsec -
+					      cgrp->freezer.freeze_start_nsec);
+	}
+	write_seqcount_end(&cgrp->freezer.freeze_seq);
 	spin_unlock_irq(&css_set_lock);
 
 	if (freeze)
@@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
 	struct cgroup *parent;
 	struct cgroup *dsct;
 	bool applied = false;
+	u64 ts_nsec;
 	bool old_e;
 
 	lockdep_assert_held(&cgroup_mutex);
@@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
 		return;
 
 	cgrp->freezer.freeze = freeze;
+	ts_nsec = ktime_get_ns();
 
	/*
	 * Propagate changes downwards the cgroup tree.
@@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
 		/*
 		 * Do change actual state: freeze or unfreeze.
 		 */
-		cgroup_do_freeze(dsct, freeze);
+		cgroup_do_freeze(dsct, freeze, ts_nsec);
 		applied = true;
 	}


@@ -1688,6 +1688,10 @@ static int copy_signal(u64 clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->cgroup_threadgroup_rwsem);
+#endif
+
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;


@@ -522,6 +522,18 @@ int proc_mount_contains(const char *option)
 	return strstr(buf, option) != NULL;
 }
 
+int cgroup_feature(const char *feature)
+{
+	char buf[PAGE_SIZE];
+	ssize_t read;
+
+	read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf));
+	if (read < 0)
+		return read;
+
+	return strstr(buf, feature) != NULL;
+}
+
 ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
 {
 	char path[PATH_MAX];


@@ -60,6 +60,7 @@ extern int cg_run_nowait(const char *cgroup,
 extern int cg_wait_for_proc_count(const char *cgroup, int count);
 extern int cg_killall(const char *cgroup);
 int proc_mount_contains(const char *option);
+int cgroup_feature(const char *feature);
 extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
 extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
 extern pid_t clone_into_cgroup(int cgroup_fd);
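
A stand-alone approximation of what the new cgroup_feature() helper checks
(same features file, simple substring match); the feature name below is only
an example, not a claim about available features:

	#include <stdio.h>
	#include <string.h>

	int main(int argc, char **argv)
	{
		const char *feature = argc > 1 ? argv[1] : "pids_localevents";
		char buf[4096];
		size_t n;
		FILE *f = fopen("/sys/kernel/cgroup/features", "r");

		if (!f) {
			perror("fopen");
			return 2;
		}
		n = fread(buf, 1, sizeof(buf) - 1, f);
		fclose(f);
		buf[n] = '\0';
		printf("%s: %s\n", feature,
		       strstr(buf, feature) ? "present" : "absent");
		return 0;
	}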


@@ -804,6 +804,662 @@ cleanup:
 	return ret;
 }
/*
* Get the current frozen_usec for the cgroup.
*/
static long cg_check_freezetime(const char *cgroup)
{
return cg_read_key_long(cgroup, "cgroup.stat.local",
"frozen_usec ");
}
/*
* Test that the freeze time will behave as expected for an empty cgroup.
*/
static int test_cgfreezer_time_empty(const char *root)
{
int ret = KSFT_FAIL;
char *cgroup = NULL;
long prev, curr;
cgroup = cg_name(root, "cg_time_test_empty");
if (!cgroup)
goto cleanup;
/*
* 1) Create an empty cgroup and check that its freeze time
* is 0.
*/
if (cg_create(cgroup))
goto cleanup;
curr = cg_check_freezetime(cgroup);
if (curr < 0) {
ret = KSFT_SKIP;
goto cleanup;
}
if (curr > 0) {
debug("Expect time (%ld) to be 0\n", curr);
goto cleanup;
}
if (cg_freeze_nowait(cgroup, true))
goto cleanup;
/*
* 2) Sleep for 1000 us. Check that the freeze time is at
* least 1000 us.
*/
usleep(1000);
curr = cg_check_freezetime(cgroup);
if (curr < 1000) {
debug("Expect time (%ld) to be at least 1000 us\n",
curr);
goto cleanup;
}
/*
* 3) Unfreeze the cgroup. Check that the freeze time is
* larger than at 2).
*/
if (cg_freeze_nowait(cgroup, false))
goto cleanup;
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr <= prev) {
debug("Expect time (%ld) to be more than previous check (%ld)\n",
curr, prev);
goto cleanup;
}
/*
* 4) Check the freeze time again to ensure that it has not
* changed.
*/
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr != prev) {
debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
curr, prev);
goto cleanup;
}
ret = KSFT_PASS;
cleanup:
if (cgroup)
cg_destroy(cgroup);
free(cgroup);
return ret;
}
/*
* A simple test for cgroup freezer time accounting. This test follows
* the same flow as test_cgfreezer_time_empty, but with a single process
* in the cgroup.
*/
static int test_cgfreezer_time_simple(const char *root)
{
int ret = KSFT_FAIL;
char *cgroup = NULL;
long prev, curr;
cgroup = cg_name(root, "cg_time_test_simple");
if (!cgroup)
goto cleanup;
/*
* 1) Create a cgroup and check that its freeze time is 0.
*/
if (cg_create(cgroup))
goto cleanup;
curr = cg_check_freezetime(cgroup);
if (curr < 0) {
ret = KSFT_SKIP;
goto cleanup;
}
if (curr > 0) {
debug("Expect time (%ld) to be 0\n", curr);
goto cleanup;
}
/*
* 2) Populate the cgroup with one child and check that the
* freeze time is still 0.
*/
cg_run_nowait(cgroup, child_fn, NULL);
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr > prev) {
debug("Expect time (%ld) to be 0\n", curr);
goto cleanup;
}
if (cg_freeze_nowait(cgroup, true))
goto cleanup;
/*
* 3) Sleep for 1000 us. Check that the freeze time is at
* least 1000 us.
*/
usleep(1000);
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr < 1000) {
debug("Expect time (%ld) to be at least 1000 us\n",
curr);
goto cleanup;
}
/*
* 4) Unfreeze the cgroup. Check that the freeze time is
* larger than at 3).
*/
if (cg_freeze_nowait(cgroup, false))
goto cleanup;
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr <= prev) {
debug("Expect time (%ld) to be more than previous check (%ld)\n",
curr, prev);
goto cleanup;
}
/*
* 5) Sleep for 1000 us. Check that the freeze time is the
* same as at 4).
*/
usleep(1000);
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr != prev) {
debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
curr, prev);
goto cleanup;
}
ret = KSFT_PASS;
cleanup:
if (cgroup)
cg_destroy(cgroup);
free(cgroup);
return ret;
}
/*
* Test that freezer time accounting works as expected, even while we're
* populating a cgroup with processes.
*/
static int test_cgfreezer_time_populate(const char *root)
{
int ret = KSFT_FAIL;
char *cgroup = NULL;
long prev, curr;
int i;
cgroup = cg_name(root, "cg_time_test_populate");
if (!cgroup)
goto cleanup;
if (cg_create(cgroup))
goto cleanup;
curr = cg_check_freezetime(cgroup);
if (curr < 0) {
ret = KSFT_SKIP;
goto cleanup;
}
if (curr > 0) {
debug("Expect time (%ld) to be 0\n", curr);
goto cleanup;
}
/*
* 1) Populate the cgroup with 100 processes. Check that
* the freeze time is 0.
*/
for (i = 0; i < 100; i++)
cg_run_nowait(cgroup, child_fn, NULL);
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr != prev) {
debug("Expect time (%ld) to be 0\n", curr);
goto cleanup;
}
/*
* 2) Wait for the group to become fully populated. Check
* that the freeze time is 0.
*/
if (cg_wait_for_proc_count(cgroup, 100))
goto cleanup;
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr != prev) {
debug("Expect time (%ld) to be 0\n", curr);
goto cleanup;
}
/*
* 3) Freeze the cgroup and then populate it with 100 more
* processes. Check that the freeze time continues to grow.
*/
if (cg_freeze_nowait(cgroup, true))
goto cleanup;
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr <= prev) {
debug("Expect time (%ld) to be more than previous check (%ld)\n",
curr, prev);
goto cleanup;
}
for (i = 0; i < 100; i++)
cg_run_nowait(cgroup, child_fn, NULL);
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr <= prev) {
debug("Expect time (%ld) to be more than previous check (%ld)\n",
curr, prev);
goto cleanup;
}
/*
* 4) Wait for the group to become fully populated. Check
* that the freeze time is larger than at 3).
*/
if (cg_wait_for_proc_count(cgroup, 200))
goto cleanup;
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr <= prev) {
debug("Expect time (%ld) to be more than previous check (%ld)\n",
curr, prev);
goto cleanup;
}
/*
* 5) Unfreeze the cgroup. Check that the freeze time is
* larger than at 4).
*/
if (cg_freeze_nowait(cgroup, false))
goto cleanup;
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr <= prev) {
debug("Expect time (%ld) to be more than previous check (%ld)\n",
curr, prev);
goto cleanup;
}
/*
* 6) Kill the processes. Check that the freeze time is the
* same as it was at 5).
*/
if (cg_killall(cgroup))
goto cleanup;
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr != prev) {
debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
curr, prev);
goto cleanup;
}
/*
* 7) Freeze and unfreeze the cgroup. Check that the freeze
* time is larger than it was at 6).
*/
if (cg_freeze_nowait(cgroup, true))
goto cleanup;
if (cg_freeze_nowait(cgroup, false))
goto cleanup;
prev = curr;
curr = cg_check_freezetime(cgroup);
if (curr <= prev) {
debug("Expect time (%ld) to be more than previous check (%ld)\n",
curr, prev);
goto cleanup;
}
ret = KSFT_PASS;
cleanup:
if (cgroup)
cg_destroy(cgroup);
free(cgroup);
return ret;
}
/*
* Test that frozen time for a cgroup continues to work as expected,
* even as processes are migrated. Frozen cgroup A's freeze time should
* continue to increase and running cgroup B's should stay 0.
*/
static int test_cgfreezer_time_migrate(const char *root)
{
long prev_A, curr_A, curr_B;
char *cgroup[2] = {0};
int ret = KSFT_FAIL;
int pid;
cgroup[0] = cg_name(root, "cg_time_test_migrate_A");
if (!cgroup[0])
goto cleanup;
cgroup[1] = cg_name(root, "cg_time_test_migrate_B");
if (!cgroup[1])
goto cleanup;
if (cg_create(cgroup[0]))
goto cleanup;
if (cg_check_freezetime(cgroup[0]) < 0) {
ret = KSFT_SKIP;
goto cleanup;
}
if (cg_create(cgroup[1]))
goto cleanup;
pid = cg_run_nowait(cgroup[0], child_fn, NULL);
if (pid < 0)
goto cleanup;
if (cg_wait_for_proc_count(cgroup[0], 1))
goto cleanup;
curr_A = cg_check_freezetime(cgroup[0]);
if (curr_A) {
debug("Expect time (%ld) to be 0\n", curr_A);
goto cleanup;
}
curr_B = cg_check_freezetime(cgroup[1]);
if (curr_B) {
debug("Expect time (%ld) to be 0\n", curr_B);
goto cleanup;
}
/*
* Freeze cgroup A.
*/
if (cg_freeze_wait(cgroup[0], true))
goto cleanup;
prev_A = curr_A;
curr_A = cg_check_freezetime(cgroup[0]);
if (curr_A <= prev_A) {
debug("Expect time (%ld) to be > 0\n", curr_A);
goto cleanup;
}
/*
* Migrate from A (frozen) to B (running).
*/
if (cg_enter(cgroup[1], pid))
goto cleanup;
usleep(1000);
curr_B = cg_check_freezetime(cgroup[1]);
if (curr_B) {
debug("Expect time (%ld) to be 0\n", curr_B);
goto cleanup;
}
prev_A = curr_A;
curr_A = cg_check_freezetime(cgroup[0]);
if (curr_A <= prev_A) {
debug("Expect time (%ld) to be more than previous check (%ld)\n",
curr_A, prev_A);
goto cleanup;
}
ret = KSFT_PASS;
cleanup:
if (cgroup[0])
cg_destroy(cgroup[0]);
free(cgroup[0]);
if (cgroup[1])
cg_destroy(cgroup[1]);
free(cgroup[1]);
return ret;
}
/*
* The test creates a cgroup and freezes it. Then it creates a child cgroup.
* After that it checks that the child cgroup has a non-zero freeze time
* that is less than the parent's. Next, it freezes the child, unfreezes
* the parent, and sleeps. Finally, it checks that the child's freeze
* time has grown larger than the parent's.
*/
static int test_cgfreezer_time_parent(const char *root)
{
char *parent, *child = NULL;
int ret = KSFT_FAIL;
long ptime, ctime;
parent = cg_name(root, "cg_test_parent_A");
if (!parent)
goto cleanup;
child = cg_name(parent, "cg_test_parent_B");
if (!child)
goto cleanup;
if (cg_create(parent))
goto cleanup;
if (cg_check_freezetime(parent) < 0) {
ret = KSFT_SKIP;
goto cleanup;
}
if (cg_freeze_wait(parent, true))
goto cleanup;
usleep(1000);
if (cg_create(child))
goto cleanup;
if (cg_check_frozen(child, true))
goto cleanup;
/*
* Since the parent was frozen the entire time the child cgroup
* was being created, we expect the parent's freeze time to be
* larger than the child's.
*
* Ideally, we would be able to check both times simultaneously,
* but here we get the child's after we get the parent's.
*/
ptime = cg_check_freezetime(parent);
ctime = cg_check_freezetime(child);
if (ptime <= ctime) {
debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime);
goto cleanup;
}
if (cg_freeze_nowait(child, true))
goto cleanup;
if (cg_freeze_wait(parent, false))
goto cleanup;
if (cg_check_frozen(child, true))
goto cleanup;
usleep(100000);
ctime = cg_check_freezetime(child);
ptime = cg_check_freezetime(parent);
if (ctime <= ptime) {
debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime);
goto cleanup;
}
ret = KSFT_PASS;
cleanup:
if (child)
cg_destroy(child);
free(child);
if (parent)
cg_destroy(parent);
free(parent);
return ret;
}
/*
* The test creates a parent cgroup and a child cgroup. Then, it freezes
* the child and checks that the child's freeze time is greater than the
* parent's, which should be zero.
*/
static int test_cgfreezer_time_child(const char *root)
{
char *parent, *child = NULL;
int ret = KSFT_FAIL;
long ptime, ctime;
parent = cg_name(root, "cg_test_child_A");
if (!parent)
goto cleanup;
child = cg_name(parent, "cg_test_child_B");
if (!child)
goto cleanup;
if (cg_create(parent))
goto cleanup;
if (cg_check_freezetime(parent) < 0) {
ret = KSFT_SKIP;
goto cleanup;
}
if (cg_create(child))
goto cleanup;
if (cg_freeze_wait(child, true))
goto cleanup;
ctime = cg_check_freezetime(child);
ptime = cg_check_freezetime(parent);
if (ptime != 0) {
debug("Expect ptime (%ld) to be 0\n", ptime);
goto cleanup;
}
if (ctime <= ptime) {
debug("Expect ctime (%ld) <= ptime (%ld)\n", ctime, ptime);
goto cleanup;
}
ret = KSFT_PASS;
cleanup:
if (child)
cg_destroy(child);
free(child);
if (parent)
cg_destroy(parent);
free(parent);
return ret;
}
/*
* The test creates the following hierarchy:
* A
* |
* B
* |
* C
*
* Then it freezes the cgroups in the order C, B, A.
* Then it unfreezes the cgroups in the order A, B, C.
* Then it checks that C's freeze time is larger than B's and
* that B's is larger than A's.
*/
static int test_cgfreezer_time_nested(const char *root)
{
char *cgroup[3] = {0};
int ret = KSFT_FAIL;
long time[3] = {0};
int i;
cgroup[0] = cg_name(root, "cg_test_time_A");
if (!cgroup[0])
goto cleanup;
cgroup[1] = cg_name(cgroup[0], "B");
if (!cgroup[1])
goto cleanup;
cgroup[2] = cg_name(cgroup[1], "C");
if (!cgroup[2])
goto cleanup;
if (cg_create(cgroup[0]))
goto cleanup;
if (cg_check_freezetime(cgroup[0]) < 0) {
ret = KSFT_SKIP;
goto cleanup;
}
if (cg_create(cgroup[1]))
goto cleanup;
if (cg_create(cgroup[2]))
goto cleanup;
if (cg_freeze_nowait(cgroup[2], true))
goto cleanup;
if (cg_freeze_nowait(cgroup[1], true))
goto cleanup;
if (cg_freeze_nowait(cgroup[0], true))
goto cleanup;
usleep(1000);
if (cg_freeze_nowait(cgroup[0], false))
goto cleanup;
if (cg_freeze_nowait(cgroup[1], false))
goto cleanup;
if (cg_freeze_nowait(cgroup[2], false))
goto cleanup;
time[2] = cg_check_freezetime(cgroup[2]);
time[1] = cg_check_freezetime(cgroup[1]);
time[0] = cg_check_freezetime(cgroup[0]);
if (time[2] <= time[1]) {
debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]);
goto cleanup;
}
if (time[1] <= time[0]) {
debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]);
goto cleanup;
}
ret = KSFT_PASS;
cleanup:
for (i = 2; i >= 0 && cgroup[i]; i--) {
cg_destroy(cgroup[i]);
free(cgroup[i]);
}
return ret;
}
 #define T(x) { x, #x }
 struct cgfreezer_test {
 	int (*fn)(const char *root);
@@ -819,6 +1475,13 @@ struct cgfreezer_test {
 	T(test_cgfreezer_stopped),
 	T(test_cgfreezer_ptraced),
 	T(test_cgfreezer_vfork),
+	T(test_cgfreezer_time_empty),
+	T(test_cgfreezer_time_simple),
+	T(test_cgfreezer_time_populate),
+	T(test_cgfreezer_time_migrate),
+	T(test_cgfreezer_time_parent),
+	T(test_cgfreezer_time_child),
+	T(test_cgfreezer_time_nested),
 };
 
 #undef T


@@ -77,6 +77,9 @@ static int test_pids_events(const char *root)
 	char *cg_parent = NULL, *cg_child = NULL;
 	int pid;
 
+	if (cgroup_feature("pids_localevents") <= 0)
+		return KSFT_SKIP;
+
 	cg_parent = cg_name(root, "pids_parent");
 	cg_child = cg_name(cg_parent, "pids_child");
 	if (!cg_parent || !cg_child)