cgroup: Changes for v6.18
Merge tag 'cgroup-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - Extensive cpuset code cleanup and refactoring work with no functional
   changes: CPU mask computation logic refactoring, introducing new
   helpers, removing redundant code paths, and improving error handling
   for better maintainability

 - A few bug fixes to cpuset, including fixes for partition creation
   failures when isolcpus is in use, missing error returns, and NULL
   pointer access prevention in free_tmpmasks()

 - Core cgroup changes include replacing the global percpu_rwsem with a
   per-threadgroup rwsem when writing to cgroup.procs for better
   scalability, workqueue conversions to use WQ_PERCPU and
   system_percpu_wq to prepare for switching the workqueue default from
   percpu to unbound, and removal of unused code including the
   post_attach callback

 - New cgroup.stat.local time accounting feature that tracks frozen time
   duration

 - Misc changes including selftests updates (new freezer time tests and
   backward compatibility fixes), documentation sync, string function
   safety improvements, and 64-bit division fixes
* tag 'cgroup-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (39 commits)
  cpuset: remove is_prs_invalid helper
  cpuset: remove impossible warning in update_parent_effective_cpumask
  cpuset: remove redundant special case for null input in node mask update
  cpuset: fix missing error return in update_cpumask
  cpuset: Use new excpus for nocpu error check when enabling root partition
  cpuset: fix failure to enable isolated partition when containing isolcpus
  Documentation: cgroup-v2: Sync manual toctree
  cpuset: use partition_cpus_change for setting exclusive cpus
  cpuset: use parse_cpulist for setting cpus.exclusive
  cpuset: introduce partition_cpus_change
  cpuset: refactor cpus_allowed_validate_change
  cpuset: refactor out validate_partition
  cpuset: introduce cpus_excl_conflict and mems_excl_conflict helpers
  cpuset: refactor CPU mask buffer parsing logic
  cpuset: Refactor exclusive CPU mask computation logic
  cpuset: change return type of is_partition_[in]valid to bool
  cpuset: remove unused assignment to trialcs->partition_root_state
  cpuset: move the root cpuset write check earlier
  cgroup/cpuset: Remove redundant rcu_read_lock/unlock() in spin_lock
  cgroup: Remove redundant rcu_read_lock/unlock() in spin_lock
  ...
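The most user-visible addition in this pull is the cgroup.stat.local file with its frozen_usec key (documented in the cgroup-v2.rst hunk below). As a quick orientation, here is a minimal sketch of a userspace reader for it. The sketch assumes a cgroup v2 hierarchy mounted at /sys/fs/cgroup; the /sys/fs/cgroup/test path and the read_frozen_usec() helper name are hypothetical and not part of this series.

```c
#include <stdio.h>
#include <string.h>

/* Read "frozen_usec <value>" from a cgroup's cgroup.stat.local file. */
static long long read_frozen_usec(const char *cgroup_path)
{
	char path[4096], key[64];
	long long val;
	FILE *f;

	snprintf(path, sizeof(path), "%s/cgroup.stat.local", cgroup_path);
	f = fopen(path, "r");
	if (!f)
		return -1;

	/* Flat-keyed file: one "key value" pair per line. */
	while (fscanf(f, "%63s %lld", key, &val) == 2) {
		if (!strcmp(key, "frozen_usec")) {
			fclose(f);
			return val;
		}
	}
	fclose(f);
	return -1;
}

int main(void)
{
	long long t = read_frozen_usec("/sys/fs/cgroup/test");

	if (t >= 0)
		printf("cgroup has spent %lld us freezing\n", t);
	return 0;
}
```

This is also essentially what the new selftest helper cg_check_freezetime() does further down, via cg_read_key_long().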
@@ -15,6 +15,9 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgroup-v1>`.
 .. CONTENTS

+   [Whenever any new section is added to this document, please also add
+   an entry here.]
+
    1. Introduction
      1-1. Terminology
      1-2. What is cgroup?

@@ -25,9 +28,10 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgroup-v1>`.
       2-2-2. Threads
     2-3. [Un]populated Notification
     2-4. Controlling Controllers
-      2-4-1. Enabling and Disabling
-      2-4-2. Top-down Constraint
-      2-4-3. No Internal Process Constraint
+      2-4-1. Availability
+      2-4-2. Enabling and Disabling
+      2-4-3. Top-down Constraint
+      2-4-4. No Internal Process Constraint
     2-5. Delegation
       2-5-1. Model of Delegation
       2-5-2. Delegation Containment

@@ -61,14 +65,15 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgroup-v1>`.
       5-4-1. PID Interface Files
     5-5. Cpuset
       5.5-1. Cpuset Interface Files
-    5-6. Device
+    5-6. Device controller
     5-7. RDMA
       5-7-1. RDMA Interface Files
     5-8. DMEM
       5-8-1. DMEM Interface Files
     5-9. HugeTLB
       5.9-1. HugeTLB Interface Files
     5-10. Misc
-      5.10-1 Miscellaneous cgroup Interface Files
+      5.10-1 Misc Interface Files
       5.10-2 Migration and Ownership
     5-11. Others
       5-11-1. perf_event

@@ -1001,6 +1006,24 @@ All cgroup core files are prefixed with "cgroup."
	  Total number of dying cgroup subsystems (e.g. memory
	  cgroup) at and beneath the current cgroup.

+  cgroup.stat.local
+	A read-only flat-keyed file which exists in non-root cgroups.
+	The following entry is defined:
+
+	  frozen_usec
+		Cumulative time that this cgroup has spent between freezing and
+		thawing, regardless of whether by self or ancestor groups.
+		NB: (not) reaching "frozen" state is not accounted here.
+
+		Using the following ASCII representation of a cgroup's freezer
+		state, ::
+
+		            1    _____
+		    frozen  0 __/     \__
+		                ab    cd
+
+		the duration being measured is the span between a and c.
+
   cgroup.freeze
	A read-write single value file which exists on non-root cgroups.
	Allowed values are "0" and "1". The default is "0".
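To make the a-to-c rule concrete, here is a tiny worked example. The timestamps are hypothetical; only the accounting rule itself comes from the documentation hunk above.

```c
#include <stdio.h>

int main(void)
{
	/* Hypothetical timestamps (microseconds) matching the diagram. */
	unsigned long long a = 100;	/* freeze requested: "1" written   */
	unsigned long long b = 150;	/* "frozen" state actually reached */
	unsigned long long c = 2100;	/* thaw requested: "0" written     */
	unsigned long long d = 2140;	/* all tasks running again         */

	(void)b;
	(void)d;	/* b and d do not affect the accounting */

	/* frozen_usec accumulates the a..c span only. */
	printf("frozen_usec grows by %llu us\n", c - a);	/* 2000 */
	return 0;
}
```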
@@ -91,6 +91,12 @@ enum {
	 * cgroup_threadgroup_rwsem. This makes hot path operations such as
	 * forks and exits into the slow path and more expensive.
	 *
+	 * Alleviate the contention between fork, exec, exit operations and
+	 * writing to cgroup.procs by taking a per threadgroup rwsem instead of
+	 * the global cgroup_threadgroup_rwsem. Fork and other operations
+	 * from threads in different thread groups no longer contend with
+	 * writing to cgroup.procs.
+	 *
	 * The static usage pattern of creating a cgroup, enabling controllers,
	 * and then seeding it with CLONE_INTO_CGROUP doesn't require write
	 * locking cgroup_threadgroup_rwsem and thus doesn't benefit from

@@ -140,6 +146,17 @@ enum {
	__CFTYPE_ADDED = (1 << 18),
 };

+enum cgroup_attach_lock_mode {
+	/* Default */
+	CGRP_ATTACH_LOCK_GLOBAL,
+
+	/* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */
+	CGRP_ATTACH_LOCK_NONE,
+
+	/* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */
+	CGRP_ATTACH_LOCK_PER_THREADGROUP,
+};
+
 /*
  * cgroup_file is the handle for a file instance created in a cgroup which
  * is used, for example, to generate file changed notifications. This can

@@ -433,6 +450,23 @@ struct cgroup_freezer_state {
	 * frozen, SIGSTOPped, and PTRACEd.
	 */
	int nr_frozen_tasks;
+
+	/* Freeze time data consistency protection */
+	seqcount_t freeze_seq;
+
+	/*
+	 * Most recent time the cgroup was requested to freeze.
+	 * Accesses guarded by freeze_seq counter. Writes serialized
+	 * by css_set_lock.
+	 */
+	u64 freeze_start_nsec;
+
+	/*
+	 * Total duration the cgroup has spent freezing.
+	 * Accesses guarded by freeze_seq counter. Writes serialized
+	 * by css_set_lock.
+	 */
+	u64 frozen_nsec;
 };

 struct cgroup {

@@ -746,7 +780,6 @@ struct cgroup_subsys {
	int (*can_attach)(struct cgroup_taskset *tset);
	void (*cancel_attach)(struct cgroup_taskset *tset);
	void (*attach)(struct cgroup_taskset *tset);
-	void (*post_attach)(void);
	int (*can_fork)(struct task_struct *task,
			struct css_set *cset);
	void (*cancel_fork)(struct task_struct *task, struct css_set *cset);

@@ -822,6 +855,7 @@ struct cgroup_subsys {
 };

 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+extern bool cgroup_enable_per_threadgroup_rwsem;

 struct cgroup_of_peak {
	unsigned long value;

@@ -833,11 +867,14 @@ struct cgroup_of_peak {
  * @tsk: target task
  *
  * Allows cgroup operations to synchronize against threadgroup changes
- * using a percpu_rw_semaphore.
+ * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when
+ * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
  */
 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
 {
	percpu_down_read(&cgroup_threadgroup_rwsem);
+	if (cgroup_enable_per_threadgroup_rwsem)
+		down_read(&tsk->signal->cgroup_threadgroup_rwsem);
 }

 /**

@@ -848,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
  */
 static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 {
+	if (cgroup_enable_per_threadgroup_rwsem)
+		up_read(&tsk->signal->cgroup_threadgroup_rwsem);
	percpu_up_read(&cgroup_threadgroup_rwsem);
 }
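The helpers above nest a per-threadgroup rwsem inside the global percpu rwsem on the read side, so a writer can take either lock and still exclude the readers it cares about. A userspace analogue of that pairing, using pthread rwlocks in place of the kernel primitives, may help orient the reader; all names here are illustrative, and the kernel additionally flips per_group_enabled only while holding the global lock for writing (see cgroup_favor_dynmods() below), a transition this sketch glosses over.

```c
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t global_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static bool per_group_enabled;	/* stands in for cgroup_enable_per_threadgroup_rwsem */

struct group {
	pthread_rwlock_t rwsem;	/* stands in for signal_struct's per-threadgroup rwsem */
};

/* Read side, as on fork/exec/exit-like paths: global first, then per-group. */
static void change_begin(struct group *g)
{
	pthread_rwlock_rdlock(&global_rwsem);
	if (per_group_enabled)
		pthread_rwlock_rdlock(&g->rwsem);
}

static void change_end(struct group *g)
{
	if (per_group_enabled)
		pthread_rwlock_unlock(&g->rwsem);
	pthread_rwlock_unlock(&global_rwsem);
}

/* A cgroup.procs-style writer only needs to stop one thread group... */
static void attach_lock_per_group(struct group *g)
{
	pthread_rwlock_wrlock(&g->rwsem);
}

/* ...while hierarchy-wide writers still stop everyone. */
static void attach_lock_global(void)
{
	pthread_rwlock_wrlock(&global_rwsem);
}
```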
@@ -355,6 +355,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css)
	return css->flags & CSS_DYING;
 }

+static inline bool css_is_online(struct cgroup_subsys_state *css)
+{
+	return css->flags & CSS_ONLINE;
+}
+
 static inline bool css_is_self(struct cgroup_subsys_state *css)
 {
	if (css == &css->cgroup->self) {
@@ -226,6 +226,10 @@ struct signal_struct {
	struct tty_audit_buf *tty_audit_buf;
 #endif

+#ifdef CONFIG_CGROUPS
+	struct rw_semaphore cgroup_threadgroup_rwsem;
+#endif
+
	/*
	 * Thread is the potential origin of an oom condition; kill first on
	 * oom
@@ -27,6 +27,9 @@ static struct signal_struct init_signals = {
	},
	.multiprocess	= HLIST_HEAD_INIT,
	.rlim		= INIT_RLIMITS,
+#ifdef CONFIG_CGROUPS
+	.cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem),
+#endif
	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
	.exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
 #ifdef CONFIG_POSIX_TIMERS
@@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,

 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
		       bool threadgroup);
-void cgroup_attach_lock(bool lock_threadgroup);
-void cgroup_attach_unlock(bool lock_threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+			struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+			  struct task_struct *tsk);
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-					     bool *locked)
+					     enum cgroup_attach_lock_mode *lock_mode)
	__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+			       enum cgroup_attach_lock_mode lock_mode)
	__releases(&cgroup_threadgroup_rwsem);

 void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
@@ -10,6 +10,7 @@
 #include <linux/sched/task.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <linux/delayacct.h>
 #include <linux/pid_namespace.h>

@@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
	int retval = 0;

	cgroup_lock();
-	cgroup_attach_lock(true);
+	cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
	for_each_root(root) {
		struct cgroup *from_cgrp;

@@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
		if (retval)
			break;
	}
-	cgroup_attach_unlock(true);
+	cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
	cgroup_unlock();

	return retval;

@@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)

	cgroup_lock();

-	cgroup_attach_lock(true);
+	cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);

@@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
	} while (task && !ret);
 out_err:
	cgroup_migrate_finish(&mgctx);
-	cgroup_attach_unlock(true);
+	cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
	cgroup_unlock();
	return ret;
 }

@@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
	struct task_struct *task;
	const struct cred *cred, *tcred;
	ssize_t ret;
-	bool locked;
+	enum cgroup_attach_lock_mode lock_mode;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

-	task = cgroup_procs_write_start(buf, threadgroup, &locked);
+	task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
	ret = PTR_ERR_OR_ZERO(task);
	if (ret)
		goto out_unlock;

@@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
	ret = cgroup_attach_task(cgrp, task, threadgroup);

 out_finish:
-	cgroup_procs_write_finish(task, locked);
+	cgroup_procs_write_finish(task, lock_mode);
 out_unlock:
	cgroup_kn_unlock(of->kn);

@@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc)

	if (ctx->release_agent) {
		spin_lock(&release_agent_path_lock);
-		strcpy(root->release_agent_path, ctx->release_agent);
+		strscpy(root->release_agent_path, ctx->release_agent);
		spin_unlock(&release_agent_path_lock);
	}

@@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void)
	 * Cap @max_active to 1 too.
	 */
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
-						    0, 1);
+						    WQ_PERCPU, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);
	return 0;
 }
@@ -125,7 +125,7 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
 /*
  * cgroup destruction makes heavy use of work items and there can be a lot
  * of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
  * which may lead to deadlock.
  *
  * A cgroup destruction should enqueue work sequentially to:

@@ -240,6 +240,14 @@ static u16 have_canfork_callback __read_mostly;

 static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);

+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
	.ns.__ns_ref		= REFCOUNT_INIT(2),

@@ -1327,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
 {
	bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;

-	/* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+	/*
+	 * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+	 * favordynmods can flip while task is between
+	 * cgroup_threadgroup_change_begin() and end(), so down_write global
+	 * cgroup_threadgroup_rwsem to synchronize them.
+	 *
+	 * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+	 * cgroup_threadgroup_rwsem doesn't exclude tasks between
+	 * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to
+	 * turn off. As the scenario is unlikely, simply disallow disabling once
+	 * enabled and print out a warning.
+	 */
+	percpu_down_write(&cgroup_threadgroup_rwsem);
	if (favor && !favoring) {
+		cgroup_enable_per_threadgroup_rwsem = true;
		rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
		root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
	} else if (!favor && favoring) {
+		if (cgroup_enable_per_threadgroup_rwsem)
+			pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
		rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
		root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
	}
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 }

 static int cgroup_init_root_id(struct cgroup_root *root)

@@ -2484,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);

 /**
  * cgroup_attach_lock - Lock for ->attach()
- * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether to acquire and which rwsem to acquire
+ * @tsk: thread group to lock
  *
  * cgroup migration sometimes needs to stabilize threadgroups against forks and
  * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()

@@ -2504,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
  * Resolve the situation by always acquiring cpus_read_lock() before optionally
  * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
  * CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead
+ * on dynamic cgroup modifications. see the comment above
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
  */
-void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+			struct task_struct *tsk)
 {
	cpus_read_lock();
-	if (lock_threadgroup)
-		percpu_down_write(&cgroup_threadgroup_rwsem);
+
+	switch (lock_mode) {
+	case CGRP_ATTACH_LOCK_NONE:
+		break;
+	case CGRP_ATTACH_LOCK_GLOBAL:
+		percpu_down_write(&cgroup_threadgroup_rwsem);
+		break;
+	case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+		down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+		break;
+	default:
+		pr_warn("cgroup: Unexpected attach lock mode.");
+		break;
+	}
 }

 /**
  * cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether to release and which rwsem to release
+ * @tsk: thread group to unlock
  */
-void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+			  struct task_struct *tsk)
 {
-	if (lock_threadgroup)
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+	switch (lock_mode) {
+	case CGRP_ATTACH_LOCK_NONE:
+		break;
+	case CGRP_ATTACH_LOCK_GLOBAL:
+		percpu_up_write(&cgroup_threadgroup_rwsem);
+		break;
+	case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+		up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+		break;
+	default:
+		pr_warn("cgroup: Unexpected attach lock mode.");
+		break;
+	}
+
	cpus_read_unlock();
 }

@@ -2969,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,

	/* look up all src csets */
	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
-	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	/* prepare dst csets and commit */

@@ -2993,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 }

 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-					     bool *threadgroup_locked)
+					     enum cgroup_attach_lock_mode *lock_mode)
 {
	struct task_struct *tsk;
	pid_t pid;

@@ -3001,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return ERR_PTR(-EINVAL);

-	/*
-	 * If we migrate a single thread, we don't care about threadgroup
-	 * stability. If the thread is `current`, it won't exit(2) under our
-	 * hands or change PID through exec(2). We exclude
-	 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
-	 * callers by cgroup_mutex.
-	 * Therefore, we can skip the global lock.
-	 */
-	lockdep_assert_held(&cgroup_mutex);
-	*threadgroup_locked = pid || threadgroup;
-	cgroup_attach_lock(*threadgroup_locked);
-
 retry_find_task:
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			tsk = ERR_PTR(-ESRCH);
-			goto out_unlock_threadgroup;
+			goto out_unlock_rcu;
		}
	} else {
		tsk = current;

@@ -3035,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
	 */
	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
		tsk = ERR_PTR(-EINVAL);
-		goto out_unlock_threadgroup;
+		goto out_unlock_rcu;
	}
+
+	get_task_struct(tsk);
+	rcu_read_unlock();
+
+	/*
+	 * If we migrate a single thread, we don't care about threadgroup
+	 * stability. If the thread is `current`, it won't exit(2) under our
+	 * hands or change PID through exec(2). We exclude
+	 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+	 * by cgroup_mutex. Therefore, we can skip the global lock.
+	 */
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (pid || threadgroup) {
+		if (cgroup_enable_per_threadgroup_rwsem)
+			*lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+		else
+			*lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+	} else {
+		*lock_mode = CGRP_ATTACH_LOCK_NONE;
+	}

-	get_task_struct(tsk);
-	goto out_unlock_rcu;
+	cgroup_attach_lock(*lock_mode, tsk);
+
+	if (threadgroup) {
+		if (!thread_group_leader(tsk)) {
+			/*
+			 * A race with de_thread from another thread's exec()
+			 * may strip us of our leadership. If this happens,
+			 * throw this task away and try again.
+			 */
+			cgroup_attach_unlock(*lock_mode, tsk);
+			put_task_struct(tsk);
+			goto retry_find_task;
+		}
+	}
+
+	return tsk;

-out_unlock_threadgroup:
-	cgroup_attach_unlock(*threadgroup_locked);
-	*threadgroup_locked = false;
 out_unlock_rcu:
	rcu_read_unlock();
	return tsk;
 }

-void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+			       enum cgroup_attach_lock_mode lock_mode)
 {
-	struct cgroup_subsys *ss;
-	int ssid;
+	cgroup_attach_unlock(lock_mode, task);

	/* release reference from cgroup_procs_write_start() */
	put_task_struct(task);
-
-	cgroup_attach_unlock(threadgroup_locked);
-
-	for_each_subsys(ss, ssid)
-		if (ss->post_attach)
-			ss->post_attach();
 }

 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)

@@ -3113,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;
	struct css_set *src_cset;
+	enum cgroup_attach_lock_mode lock_mode;
	bool has_tasks;
	int ret;

@@ -3144,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
	 * write-locking can be skipped safely.
	 */
	has_tasks = !list_empty(&mgctx.preloaded_src_csets);
-	cgroup_attach_lock(has_tasks);
+
+	if (has_tasks)
+		lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+	else
+		lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+	cgroup_attach_lock(lock_mode, NULL);

	/* NULL dst indicates self on default hierarchy */
	ret = cgroup_migrate_prepare_dst(&mgctx);

@@ -3165,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
	cgroup_migrate_finish(&mgctx);
-	cgroup_attach_unlock(has_tasks);
+	cgroup_attach_unlock(lock_mode, NULL);
	return ret;
 }

@@ -3788,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
	return 0;
 }

+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	unsigned int sequence;
+	u64 freeze_time;
+
+	do {
+		sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+		freeze_time = cgrp->freezer.frozen_nsec;
+		/* Add in current freezer interval if the cgroup is freezing. */
+		if (test_bit(CGRP_FREEZE, &cgrp->flags))
+			freeze_time += (ktime_get_ns() -
+					cgrp->freezer.freeze_start_nsec);
+	} while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+	do_div(freeze_time, NSEC_PER_USEC);
+	seq_printf(seq, "frozen_usec %llu\n", freeze_time);
+
+	return 0;
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 /**
  * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
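cgroup_core_local_stat_show() above is a standard seqcount reader: it retries until a full read happens between two identical, even sequence values, which is why the file can be read without taking css_set_lock. A stripped-down userspace analogue of the idea is sketched below. It is illustrative only: C11 atomics stand in for the kernel's seqcount_t, the writer is assumed to be serialized externally (css_set_lock in the patch), and the plain field access in the reader is a formal data race that the real seqcount API handles properly.

```c
#include <stdatomic.h>
#include <stdint.h>

struct freeze_stats {
	atomic_uint seq;	/* even = stable, odd = write in progress */
	uint64_t frozen_nsec;
};

/* Writer: callers serialize externally, like css_set_lock in the patch. */
static void stats_add(struct freeze_stats *s, uint64_t delta)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel); /* odd */
	s->frozen_nsec += delta;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel); /* even */
}

/* Reader: loop until a consistent snapshot is observed. */
static uint64_t stats_read(struct freeze_stats *s)
{
	unsigned int begin;
	uint64_t v;

	do {
		begin = atomic_load_explicit(&s->seq, memory_order_acquire);
		v = s->frozen_nsec;	/* simplified: torn reads are retried */
	} while ((begin & 1) ||
		 atomic_load_explicit(&s->seq, memory_order_acquire) != begin);

	return v;
}

int main(void)
{
	struct freeze_stats s = { .seq = 0, .frozen_nsec = 0 };

	stats_add(&s, 2500000);
	return stats_read(&s) == 2500000 ? 0 : 1;
}
```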
@@ -5267,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
	struct task_struct *task;
	const struct cred *saved_cred;
	ssize_t ret;
-	bool threadgroup_locked;
+	enum cgroup_attach_lock_mode lock_mode;

	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!dst_cgrp)
		return -ENODEV;

-	task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+	task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
	ret = PTR_ERR_OR_ZERO(task);
	if (ret)
		goto out_unlock;

@@ -5299,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
	ret = cgroup_attach_task(dst_cgrp, task, threadgroup);

 out_finish:
-	cgroup_procs_write_finish(task, threadgroup_locked);
+	cgroup_procs_write_finish(task, lock_mode);
 out_unlock:
	cgroup_kn_unlock(of->kn);

@@ -5380,6 +5478,11 @@ static struct cftype cgroup_base_files[] = {
		.name = "cgroup.stat",
		.seq_show = cgroup_stat_show,
	},
+	{
+		.name = "cgroup.stat.local",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cgroup_core_local_stat_show,
+	},
	{
		.name = "cgroup.freeze",
		.flags = CFTYPE_NOT_ON_ROOT,

@@ -5789,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
	 * if the parent has to be frozen, the child has too.
	 */
	cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+	seqcount_init(&cgrp->freezer.freeze_seq);
	if (cgrp->freezer.e_freeze) {
		/*
		 * Set the CGRP_FREEZE flag, so when a process will be

@@ -5797,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
		 * consider it frozen immediately.
		 */
		set_bit(CGRP_FREEZE, &cgrp->flags);
+		cgrp->freezer.freeze_start_nsec = ktime_get_ns();
		set_bit(CGRP_FROZEN, &cgrp->flags);
	}

@@ -6352,13 +6457,13 @@ static int __init cgroup_wq_init(void)
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
-	cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
+	cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
	BUG_ON(!cgroup_offline_wq);

-	cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
+	cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
	BUG_ON(!cgroup_release_wq);

-	cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
+	cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
	BUG_ON(!cgroup_free_wq);
	return 0;
 }
@@ -38,7 +38,6 @@ enum prs_errcode {

 /* bits in struct cpuset flags field */
 typedef enum {
-	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,

@@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
 /* convenient tests for these bits */
 static inline bool is_cpuset_online(struct cpuset *cs)
 {
-	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
+	return css_is_online(&cs->css) && !css_is_dying(&cs->css);
 }

 static inline int is_cpu_exclusive(const struct cpuset *cs)

@@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on)
 ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off);
 int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);

 /*
  * cpuset-v1.c
@@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

-	cpus_read_lock();
-	cpuset_lock();
+	cpuset_full_lock();
	if (!is_cpuset_online(cs))
		goto out_unlock;

@@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
		break;
	}
 out_unlock:
-	cpuset_unlock();
-	cpus_read_unlock();
+	cpuset_full_unlock();
	return retval;
 }

@@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
	cpuset_filetype_t type = cft->private;
	int retval = 0;

-	cpus_read_lock();
-	cpuset_lock();
+	cpuset_full_lock();
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;

@@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
		break;
	}
 out_unlock:
-	cpuset_unlock();
-	cpus_read_unlock();
+	cpuset_full_unlock();
	return retval;
 }
(File diff suppressed because it is too large.)
@@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
		return -ENODEV;

	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
	cset = task_css_set(current);
	refcnt = refcount_read(&cset->refcount);
	seq_printf(seq, "css_set %pK %d", cset, refcnt);

@@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
		seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
			   css, css->id);
	}
-	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);
	cgroup_kn_unlock(of->kn);
	return 0;

@@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
	cset = task_css_set(current);
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

@@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name_buf);
	}
-	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);
	kfree(name_buf);
	return 0;
@@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze)
 /*
  * Freeze or unfreeze all tasks in the given cgroup.
  */
-static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec)
 {
	struct css_task_iter it;
	struct task_struct *task;

@@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);
-	if (freeze)
+	write_seqcount_begin(&cgrp->freezer.freeze_seq);
+	if (freeze) {
		set_bit(CGRP_FREEZE, &cgrp->flags);
-	else
+		cgrp->freezer.freeze_start_nsec = ts_nsec;
+	} else {
		clear_bit(CGRP_FREEZE, &cgrp->flags);
+		cgrp->freezer.frozen_nsec += (ts_nsec -
+					      cgrp->freezer.freeze_start_nsec);
+	}
+	write_seqcount_end(&cgrp->freezer.freeze_seq);
	spin_unlock_irq(&css_set_lock);

	if (freeze)

@@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
	struct cgroup *parent;
	struct cgroup *dsct;
	bool applied = false;
+	u64 ts_nsec;
	bool old_e;

	lockdep_assert_held(&cgroup_mutex);

@@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
		return;

	cgrp->freezer.freeze = freeze;
+	ts_nsec = ktime_get_ns();

	/*
	 * Propagate changes downwards the cgroup tree.

@@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
		/*
		 * Do change actual state: freeze or unfreeze.
		 */
-		cgroup_do_freeze(dsct, freeze);
+		cgroup_do_freeze(dsct, freeze, ts_nsec);
		applied = true;
	}
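The bookkeeping in cgroup_do_freeze() is a start/stop stopwatch: freezing records a start timestamp, thawing folds the elapsed span into frozen_nsec. A tiny self-contained model of that arithmetic, with hypothetical timestamps (note the real code samples ktime_get_ns() once in cgroup_freeze() so every descendant uses the same ts_nsec):

```c
#include <assert.h>
#include <stdint.h>

struct stopwatch {
	uint64_t start_nsec;	/* freeze_start_nsec in the patch */
	uint64_t total_nsec;	/* frozen_nsec in the patch */
};

static void do_freeze(struct stopwatch *w, int freeze, uint64_t ts_nsec)
{
	if (freeze)
		w->start_nsec = ts_nsec;		  /* stopwatch starts */
	else
		w->total_nsec += ts_nsec - w->start_nsec; /* span folds in */
}

int main(void)
{
	struct stopwatch w = {0};

	do_freeze(&w, 1, 5000000);	/* freeze at t = 5 ms   */
	do_freeze(&w, 0, 7500000);	/* thaw at t = 7.5 ms   */
	assert(w.total_nsec == 2500000);
	return 0;
}
```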
@@ -1688,6 +1688,10 @@ static int copy_signal(u64 clone_flags, struct task_struct *tsk)
	tty_audit_fork(sig);
	sched_autogroup_fork(sig);

+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->cgroup_threadgroup_rwsem);
+#endif
+
	sig->oom_score_adj = current->signal->oom_score_adj;
	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -522,6 +522,18 @@ int proc_mount_contains(const char *option)
	return strstr(buf, option) != NULL;
 }

+int cgroup_feature(const char *feature)
+{
+	char buf[PAGE_SIZE];
+	ssize_t read;
+
+	read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf));
+	if (read < 0)
+		return read;
+
+	return strstr(buf, feature) != NULL;
+}
+
 ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
 {
	char path[PATH_MAX];
@@ -60,6 +60,7 @@ extern int cg_run_nowait(const char *cgroup,
 extern int cg_wait_for_proc_count(const char *cgroup, int count);
 extern int cg_killall(const char *cgroup);
 int proc_mount_contains(const char *option);
+int cgroup_feature(const char *feature);
 extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
 extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
 extern pid_t clone_into_cgroup(int cgroup_fd);
@@ -804,6 +804,662 @@ cleanup:
	return ret;
 }

+/*
+ * Get the current frozen_usec for the cgroup.
+ */
+static long cg_check_freezetime(const char *cgroup)
+{
+	return cg_read_key_long(cgroup, "cgroup.stat.local",
+				"frozen_usec ");
+}
+
+/*
+ * Test that the freeze time will behave as expected for an empty cgroup.
+ */
+static int test_cgfreezer_time_empty(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	long prev, curr;
+
+	cgroup = cg_name(root, "cg_time_test_empty");
+	if (!cgroup)
+		goto cleanup;
+
+	/*
+	 * 1) Create an empty cgroup and check that its freeze time
+	 * is 0.
+	 */
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	curr = cg_check_freezetime(cgroup);
+	if (curr < 0) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+	if (curr > 0) {
+		debug("Expect time (%ld) to be 0\n", curr);
+		goto cleanup;
+	}
+
+	if (cg_freeze_nowait(cgroup, true))
+		goto cleanup;
+
+	/*
+	 * 2) Sleep for 1000 us. Check that the freeze time is at
+	 * least 1000 us.
+	 */
+	usleep(1000);
+	curr = cg_check_freezetime(cgroup);
+	if (curr < 1000) {
+		debug("Expect time (%ld) to be at least 1000 us\n",
+		      curr);
+		goto cleanup;
+	}
+
+	/*
+	 * 3) Unfreeze the cgroup. Check that the freeze time is
+	 * larger than at 2).
+	 */
+	if (cg_freeze_nowait(cgroup, false))
+		goto cleanup;
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr <= prev) {
+		debug("Expect time (%ld) to be more than previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	/*
+	 * 4) Check the freeze time again to ensure that it has not
+	 * changed.
+	 */
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr != prev) {
+		debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * A simple test for cgroup freezer time accounting. This test follows
+ * the same flow as test_cgfreezer_time_empty, but with a single process
+ * in the cgroup.
+ */
+static int test_cgfreezer_time_simple(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	long prev, curr;
+
+	cgroup = cg_name(root, "cg_time_test_simple");
+	if (!cgroup)
+		goto cleanup;
+
+	/*
+	 * 1) Create a cgroup and check that its freeze time is 0.
+	 */
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	curr = cg_check_freezetime(cgroup);
+	if (curr < 0) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+	if (curr > 0) {
+		debug("Expect time (%ld) to be 0\n", curr);
+		goto cleanup;
+	}
+
+	/*
+	 * 2) Populate the cgroup with one child and check that the
+	 * freeze time is still 0.
+	 */
+	cg_run_nowait(cgroup, child_fn, NULL);
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr > prev) {
+		debug("Expect time (%ld) to be 0\n", curr);
+		goto cleanup;
+	}
+
+	if (cg_freeze_nowait(cgroup, true))
+		goto cleanup;
+
+	/*
+	 * 3) Sleep for 1000 us. Check that the freeze time is at
+	 * least 1000 us.
+	 */
+	usleep(1000);
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr < 1000) {
+		debug("Expect time (%ld) to be at least 1000 us\n",
+		      curr);
+		goto cleanup;
+	}
+
+	/*
+	 * 4) Unfreeze the cgroup. Check that the freeze time is
+	 * larger than at 3).
+	 */
+	if (cg_freeze_nowait(cgroup, false))
+		goto cleanup;
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr <= prev) {
+		debug("Expect time (%ld) to be more than previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	/*
+	 * 5) Sleep for 1000 us. Check that the freeze time is the
+	 * same as at 4).
+	 */
+	usleep(1000);
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr != prev) {
+		debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * Test that freezer time accounting works as expected, even while we're
+ * populating a cgroup with processes.
+ */
+static int test_cgfreezer_time_populate(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	long prev, curr;
+	int i;
+
+	cgroup = cg_name(root, "cg_time_test_populate");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	curr = cg_check_freezetime(cgroup);
+	if (curr < 0) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+	if (curr > 0) {
+		debug("Expect time (%ld) to be 0\n", curr);
+		goto cleanup;
+	}
+
+	/*
+	 * 1) Populate the cgroup with 100 processes. Check that
+	 * the freeze time is 0.
+	 */
+	for (i = 0; i < 100; i++)
+		cg_run_nowait(cgroup, child_fn, NULL);
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr != prev) {
+		debug("Expect time (%ld) to be 0\n", curr);
+		goto cleanup;
+	}
+
+	/*
+	 * 2) Wait for the group to become fully populated. Check
+	 * that the freeze time is 0.
+	 */
+	if (cg_wait_for_proc_count(cgroup, 100))
+		goto cleanup;
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr != prev) {
+		debug("Expect time (%ld) to be 0\n", curr);
+		goto cleanup;
+	}
+
+	/*
+	 * 3) Freeze the cgroup and then populate it with 100 more
+	 * processes. Check that the freeze time continues to grow.
+	 */
+	if (cg_freeze_nowait(cgroup, true))
+		goto cleanup;
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr <= prev) {
+		debug("Expect time (%ld) to be more than previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	for (i = 0; i < 100; i++)
+		cg_run_nowait(cgroup, child_fn, NULL);
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr <= prev) {
+		debug("Expect time (%ld) to be more than previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	/*
+	 * 4) Wait for the group to become fully populated. Check
+	 * that the freeze time is larger than at 3).
+	 */
+	if (cg_wait_for_proc_count(cgroup, 200))
+		goto cleanup;
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr <= prev) {
+		debug("Expect time (%ld) to be more than previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	/*
+	 * 5) Unfreeze the cgroup. Check that the freeze time is
+	 * larger than at 4).
+	 */
+	if (cg_freeze_nowait(cgroup, false))
+		goto cleanup;
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr <= prev) {
+		debug("Expect time (%ld) to be more than previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	/*
+	 * 6) Kill the processes. Check that the freeze time is the
+	 * same as it was at 5).
+	 */
+	if (cg_killall(cgroup))
+		goto cleanup;
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr != prev) {
+		debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	/*
+	 * 7) Freeze and unfreeze the cgroup. Check that the freeze
+	 * time is larger than it was at 6).
+	 */
+	if (cg_freeze_nowait(cgroup, true))
+		goto cleanup;
+	if (cg_freeze_nowait(cgroup, false))
+		goto cleanup;
+	prev = curr;
+	curr = cg_check_freezetime(cgroup);
+	if (curr <= prev) {
+		debug("Expect time (%ld) to be more than previous check (%ld)\n",
+		      curr, prev);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * Test that frozen time for a cgroup continues to work as expected,
+ * even as processes are migrated. Frozen cgroup A's freeze time should
+ * continue to increase and running cgroup B's should stay 0.
+ */
+static int test_cgfreezer_time_migrate(const char *root)
+{
+	long prev_A, curr_A, curr_B;
+	char *cgroup[2] = {0};
+	int ret = KSFT_FAIL;
+	int pid;
+
+	cgroup[0] = cg_name(root, "cg_time_test_migrate_A");
+	if (!cgroup[0])
+		goto cleanup;
+
+	cgroup[1] = cg_name(root, "cg_time_test_migrate_B");
+	if (!cgroup[1])
+		goto cleanup;
+
+	if (cg_create(cgroup[0]))
+		goto cleanup;
+
+	if (cg_check_freezetime(cgroup[0]) < 0) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	if (cg_create(cgroup[1]))
+		goto cleanup;
+
+	pid = cg_run_nowait(cgroup[0], child_fn, NULL);
+	if (pid < 0)
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(cgroup[0], 1))
+		goto cleanup;
+
+	curr_A = cg_check_freezetime(cgroup[0]);
+	if (curr_A) {
+		debug("Expect time (%ld) to be 0\n", curr_A);
+		goto cleanup;
+	}
+	curr_B = cg_check_freezetime(cgroup[1]);
+	if (curr_B) {
+		debug("Expect time (%ld) to be 0\n", curr_B);
+		goto cleanup;
+	}
+
+	/*
+	 * Freeze cgroup A.
+	 */
+	if (cg_freeze_wait(cgroup[0], true))
+		goto cleanup;
+	prev_A = curr_A;
+	curr_A = cg_check_freezetime(cgroup[0]);
+	if (curr_A <= prev_A) {
+		debug("Expect time (%ld) to be > 0\n", curr_A);
+		goto cleanup;
+	}
+
+	/*
+	 * Migrate from A (frozen) to B (running).
+	 */
+	if (cg_enter(cgroup[1], pid))
+		goto cleanup;
+
+	usleep(1000);
+	curr_B = cg_check_freezetime(cgroup[1]);
+	if (curr_B) {
+		debug("Expect time (%ld) to be 0\n", curr_B);
+		goto cleanup;
+	}
+
+	prev_A = curr_A;
+	curr_A = cg_check_freezetime(cgroup[0]);
+	if (curr_A <= prev_A) {
+		debug("Expect time (%ld) to be more than previous check (%ld)\n",
+		      curr_A, prev_A);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup[0])
+		cg_destroy(cgroup[0]);
+	free(cgroup[0]);
+	if (cgroup[1])
+		cg_destroy(cgroup[1]);
+	free(cgroup[1]);
+	return ret;
+}
+
+/*
+ * The test creates a cgroup and freezes it. Then it creates a child cgroup.
+ * After that it checks that the child cgroup has a non-zero freeze time
+ * that is less than the parent's. Next, it freezes the child, unfreezes
+ * the parent, and sleeps. Finally, it checks that the child's freeze
+ * time has grown larger than the parent's.
+ */
+static int test_cgfreezer_time_parent(const char *root)
+{
+	char *parent, *child = NULL;
+	int ret = KSFT_FAIL;
+	long ptime, ctime;
+
+	parent = cg_name(root, "cg_test_parent_A");
+	if (!parent)
+		goto cleanup;
+
+	child = cg_name(parent, "cg_test_parent_B");
+	if (!child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_check_freezetime(parent) < 0) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	if (cg_freeze_wait(parent, true))
+		goto cleanup;
+
+	usleep(1000);
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_check_frozen(child, true))
+		goto cleanup;
+
+	/*
+	 * Since the parent was frozen the entire time the child cgroup
+	 * was being created, we expect the parent's freeze time to be
+	 * larger than the child's.
+	 *
+	 * Ideally, we would be able to check both times simultaneously,
+	 * but here we get the child's after we get the parent's.
+	 */
+	ptime = cg_check_freezetime(parent);
+	ctime = cg_check_freezetime(child);
+	if (ptime <= ctime) {
+		debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime);
+		goto cleanup;
+	}
+
+	if (cg_freeze_nowait(child, true))
+		goto cleanup;
+
+	if (cg_freeze_wait(parent, false))
+		goto cleanup;
+
+	if (cg_check_frozen(child, true))
+		goto cleanup;
+
+	usleep(100000);
+
+	ctime = cg_check_freezetime(child);
+	ptime = cg_check_freezetime(parent);
+
+	if (ctime <= ptime) {
+		debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	free(child);
+	if (parent)
+		cg_destroy(parent);
+	free(parent);
+	return ret;
+}
+
+/*
+ * The test creates a parent cgroup and a child cgroup. Then, it freezes
+ * the child and checks that the child's freeze time is greater than the
+ * parent's, which should be zero.
+ */
+static int test_cgfreezer_time_child(const char *root)
+{
+	char *parent, *child = NULL;
+	int ret = KSFT_FAIL;
+	long ptime, ctime;
+
+	parent = cg_name(root, "cg_test_child_A");
+	if (!parent)
+		goto cleanup;
+
+	child = cg_name(parent, "cg_test_child_B");
+	if (!child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_check_freezetime(parent) < 0) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_freeze_wait(child, true))
+		goto cleanup;
+
+	ctime = cg_check_freezetime(child);
+	ptime = cg_check_freezetime(parent);
+	if (ptime != 0) {
+		debug("Expect ptime (%ld) to be 0\n", ptime);
+		goto cleanup;
+	}
+
+	if (ctime <= ptime) {
+		debug("Expect ctime (%ld) <= ptime (%ld)\n", ctime, ptime);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	free(child);
+	if (parent)
+		cg_destroy(parent);
+	free(parent);
+	return ret;
+}
+
+/*
+ * The test creates the following hierarchy:
+ *       A
+ *       |
+ *       B
+ *       |
+ *       C
+ *
+ * Then it freezes the cgroups in the order C, B, A.
+ * Then it unfreezes the cgroups in the order A, B, C.
+ * Then it checks that C's freeze time is larger than B's and
+ * that B's is larger than A's.
+ */
+static int test_cgfreezer_time_nested(const char *root)
+{
+	char *cgroup[3] = {0};
+	int ret = KSFT_FAIL;
+	long time[3] = {0};
+	int i;
+
+	cgroup[0] = cg_name(root, "cg_test_time_A");
+	if (!cgroup[0])
+		goto cleanup;
+
+	cgroup[1] = cg_name(cgroup[0], "B");
+	if (!cgroup[1])
+		goto cleanup;
+
+	cgroup[2] = cg_name(cgroup[1], "C");
+	if (!cgroup[2])
+		goto cleanup;
+
+	if (cg_create(cgroup[0]))
+		goto cleanup;
+
+	if (cg_check_freezetime(cgroup[0]) < 0) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	if (cg_create(cgroup[1]))
+		goto cleanup;
+
+	if (cg_create(cgroup[2]))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[2], true))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[1], true))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[0], true))
+		goto cleanup;
+
+	usleep(1000);
+
+	if (cg_freeze_nowait(cgroup[0], false))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[1], false))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[2], false))
+		goto cleanup;
+
+	time[2] = cg_check_freezetime(cgroup[2]);
+	time[1] = cg_check_freezetime(cgroup[1]);
+	time[0] = cg_check_freezetime(cgroup[0]);
+
+	if (time[2] <= time[1]) {
+		debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]);
+		goto cleanup;
+	}
+
+	if (time[1] <= time[0]) {
+		debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	for (i = 2; i >= 0 && cgroup[i]; i--) {
+		cg_destroy(cgroup[i]);
+		free(cgroup[i]);
+	}
+
+	return ret;
+}
+
 #define T(x) { x, #x }
 struct cgfreezer_test {
	int (*fn)(const char *root);

@@ -819,6 +1475,13 @@ struct cgfreezer_test {
	T(test_cgfreezer_stopped),
	T(test_cgfreezer_ptraced),
	T(test_cgfreezer_vfork),
+	T(test_cgfreezer_time_empty),
+	T(test_cgfreezer_time_simple),
+	T(test_cgfreezer_time_populate),
+	T(test_cgfreezer_time_migrate),
+	T(test_cgfreezer_time_parent),
+	T(test_cgfreezer_time_child),
+	T(test_cgfreezer_time_nested),
 };
 #undef T
@@ -77,6 +77,9 @@ static int test_pids_events(const char *root)
	char *cg_parent = NULL, *cg_child = NULL;
	int pid;

+	if (cgroup_feature("pids_localevents") <= 0)
+		return KSFT_SKIP;
+
	cg_parent = cg_name(root, "pids_parent");
	cg_child = cg_name(cg_parent, "pids_child");
	if (!cg_parent || !cg_child)