Commit 17b18600 authored by Waiman Long, committed by Tejun Heo
Browse files

cgroup/cpuset: Clarify exclusion rules for cpuset internal variables



Clarify the locking rules associated with file level internal variables
inside the cpuset code. There is no functional change.

Reviewed-by: Chen Ridong <chenridong@huaweicloud.com>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
parent 68230aac
Loading
Loading
Loading
Loading
+61 −44
Original line number Diff line number Diff line
@@ -61,6 +61,58 @@ static const char * const perr_strings[] = {
	[PERR_REMOTE]    = "Have remote partition underneath",
};

/*
 * CPUSET Locking Convention
 * -------------------------
 *
 * Below are the three global locks guarding cpuset structures in lock
 * acquisition order:
 *  - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock)
 *  - cpuset_mutex
 *  - callback_lock (raw spinlock)
 *
 * A task must hold all three locks to modify externally visible or
 * used fields of cpusets, though some of the internally used cpuset fields
 * and internal variables can be modified without holding callback_lock. If only
 * reliable read access to the externally used fields is needed, a task can
 * hold either cpuset_mutex or callback_lock, both of which are exposed to
 * other external subsystems.
 *
 * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others,
 * ensuring that it is the only task able to also acquire callback_lock and
 * be able to modify cpusets.  It can perform various checks on the cpuset
 * structure first, knowing nothing will change. It can also allocate memory
 * without holding callback_lock. While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.  Once
 * it is ready to make the changes, it takes callback_lock, blocking everyone
 * else.
 *
 * Calls to the kernel memory allocator cannot be made while holding
 * callback_lock which is a spinlock, as the memory allocator may sleep or
 * call back into cpuset code and acquire callback_lock.
 *
 * The task_struct fields mems_allowed and mempolicy may be changed by
 * another task, so alloc_lock in the task_struct is used to protect
 * them.
 *
 * The cpuset_common_seq_show() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 */

static DEFINE_MUTEX(cpuset_mutex);

/*
 * File level internal variables below follow one of the following exclusion
 * rules.
 *
 * RWCS: Read/write-able by holding either cpus_write_lock (and optionally
 *	 cpuset_mutex) or both cpus_read_lock and cpuset_mutex.
 *
 * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable
 *	 by holding both cpuset_mutex and callback_lock.
 */

/*
 * For local partitions, update to subpartitions_cpus & isolated_cpus is done
 * in update_parent_effective_cpumask(). For remote partitions, it is done in
@@ -70,19 +122,18 @@ static const char * const perr_strings[] = {
 * Exclusive CPUs distributed out to local or remote sub-partitions of
 * top_cpuset
 */
static cpumask_var_t	subpartitions_cpus;
static cpumask_var_t	subpartitions_cpus;	/* RWCS */

/*
 * Exclusive CPUs in isolated partitions
 * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated)
 */
static cpumask_var_t	isolated_cpus;
static cpumask_var_t	isolated_cpus;		/* CSCB */

/*
 * isolated_cpus updating flag (protected by cpuset_mutex)
 * Set if isolated_cpus is going to be updated in the current
 * cpuset_mutex crtical section.
 * Set if isolated_cpus is being updated in the current cpuset_mutex
 * critical section.
 */
static bool isolated_cpus_updating;
static bool		isolated_cpus_updating;	/* RWCS */

/*
 * A flag to force sched domain rebuild at the end of an operation.
@@ -98,7 +149,7 @@ static bool isolated_cpus_updating;
 * Note that update_relax_domain_level() in cpuset-v1.c can still call
 * rebuild_sched_domains_locked() directly without using this flag.
 */
static bool force_sd_rebuild;
static bool force_sd_rebuild;			/* RWCS */

/*
 * Partition root states:
@@ -218,42 +269,6 @@ struct cpuset top_cpuset = {
	.partition_root_state = PRS_ROOT,
};

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
 * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
 * correctness.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
 * also acquire callback_lock and be able to modify cpusets.  It can perform
 * various checks on the cpuset structure first, knowing nothing will change.
 * It can also allocate memory while just holding cpuset_mutex.  While it is
 * performing these checks, various callback routines can briefly acquire
 * callback_lock to query cpusets.  Once it is ready to make the changes, it
 * takes callback_lock, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by other task, we use alloc_lock in the task_struct fields to protect
 * them.
 *
 * The cpuset_common_seq_show() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 */

static DEFINE_MUTEX(cpuset_mutex);

/**
 * cpuset_lock - Acquire the global cpuset mutex
 *
@@ -1163,6 +1178,8 @@ static void reset_partition_data(struct cpuset *cs)
static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
{
	WARN_ON_ONCE(old_prs == new_prs);
	lockdep_assert_held(&callback_lock);
	lockdep_assert_held(&cpuset_mutex);
	if (new_prs == PRS_ISOLATED)
		cpumask_or(isolated_cpus, isolated_cpus, xcpus);
	else