Commit f62a5d39 authored by Waiman Long's avatar Waiman Long Committed by Tejun Heo
Browse files

cgroup/cpuset: Remove remote_partition_check() & make update_cpumasks_hier()...


cgroup/cpuset: Remove remote_partition_check() & make update_cpumasks_hier() handle remote partition

Currently, changes in exclusive CPUs are being handled in
remote_partition_check() by disabling conflicting remote partitions.
However, that may lead to results unexpected by the users. Fix
this problem by removing remote_partition_check() and making
update_cpumasks_hier() handle changes in descendant remote partitions
properly.

The compute_effective_exclusive_cpumask() function is enhanced to check
the exclusive_cpus and effective_xcpus from siblings and excluded them
in its effective exclusive CPUs computation and return a value to show if
there is any sibling conflicts.  This is somewhat like the cpu_exclusive
flag check in validate_change(). This is the initial step to enable us
to retire the use of cpu_exclusive flag in cgroup v2 in the future.

One of the tests in the TEST_MATRIX of the test_cpuset_prs.sh
script has to be updated due to changes in the way a child remote
partition root is being handled (updated instead of invalidation)
in update_cpumasks_hier().

Signed-off-by: default avatarWaiman Long <longman@redhat.com>
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
parent 8bf450f3
Loading
Loading
Loading
Loading
+141 −117
Original line number Diff line number Diff line
@@ -86,7 +86,6 @@ static struct list_head remote_children;
 * A flag to force sched domain rebuild at the end of an operation.
 * It can be set in
 *  - update_partition_sd_lb()
 *  - remote_partition_check()
 *  - update_cpumasks_hier()
 *  - cpuset_update_flag()
 *  - cpuset_hotplug_update_tasks()
@@ -1340,20 +1339,57 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
 * @cs: cpuset
 * @xcpus: effective exclusive CPUs value to be set
 * Return: true if xcpus is not empty, false otherwise.
 * @real_cs: the real cpuset (can be NULL)
 * Return: 0 if there is no sibling conflict, > 0 otherwise
 *
 * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
 * it must be a subset of parent's effective_xcpus.
 */
static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
						struct cpumask *xcpus)
 * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to
 * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus
 * as well. The provision of real_cs means that a cpumask is being changed and
 * the given cs is a trial one.
 */
static int compute_effective_exclusive_cpumask(struct cpuset *cs,
					       struct cpumask *xcpus,
					       struct cpuset *real_cs)
{
	struct cgroup_subsys_state *css;
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *sibling;
	int retval = 0;

	if (!xcpus)
		xcpus = cs->effective_xcpus;

	return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
	cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);

	if (!real_cs) {
		if (!cpumask_empty(cs->exclusive_cpus))
			return 0;
	} else {
		cs = real_cs;
	}

	/*
	 * Exclude exclusive CPUs from siblings
	 */
	rcu_read_lock();
	cpuset_for_each_child(sibling, css, parent) {
		if (sibling == cs)
			continue;

		if (!cpumask_empty(sibling->exclusive_cpus) &&
		    cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
			cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
			retval++;
			continue;
		}
		if (!cpumask_empty(sibling->effective_xcpus) &&
		    cpumask_intersects(xcpus, sibling->effective_xcpus)) {
			cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
			retval++;
		}
	}
	rcu_read_unlock();
	return retval;
}

static inline bool is_remote_partition(struct cpuset *cs)
@@ -1395,7 +1431,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
	 * remote partition root underneath it, its exclusive_cpus must
	 * have overlapped with subpartitions_cpus.
	 */
	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
	compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
	if (cpumask_empty(tmp->new_cpus) ||
	    cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
@@ -1404,8 +1440,10 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
	spin_lock_irq(&callback_lock);
	isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
	list_add(&cs->remote_sibling, &remote_children);
	cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
	spin_unlock_irq(&callback_lock);
	update_unbound_workqueue_cpumask(isolcpus_updated);
	cpuset_force_rebuild();
	cs->prs_err = 0;

	/*
@@ -1429,22 +1467,24 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
	bool isolcpus_updated;

	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
	WARN_ON_ONCE(!is_remote_partition(cs));
	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
	WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));

	spin_lock_irq(&callback_lock);
	list_del_init(&cs->remote_sibling);
	isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
					       NULL, tmp->new_cpus);
					       NULL, cs->effective_xcpus);
	if (cs->prs_err)
		cs->partition_root_state = -cs->partition_root_state;
	else
		cs->partition_root_state = PRS_MEMBER;

	/* effective_xcpus may need to be changed */
	compute_effective_exclusive_cpumask(cs, NULL, NULL);
	reset_partition_data(cs);
	spin_unlock_irq(&callback_lock);
	update_unbound_workqueue_cpumask(isolcpus_updated);
	cpuset_force_rebuild();

	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1456,14 +1496,15 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
/*
 * remote_cpus_update - cpus_exclusive change of remote partition
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @xcpus: the new exclusive_cpus mask, if non-NULL
 * @excpus: the new effective_xcpus mask
 * @tmp: temporary masks
 *
 * top_cpuset and subpartitions_cpus will be updated or partition can be
 * invalidated.
 */
static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
			       struct tmpmasks *tmp)
static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
			       struct cpumask *excpus, struct tmpmasks *tmp)
{
	bool adding, deleting;
	int prs = cs->partition_root_state;
@@ -1474,13 +1515,13 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,

	WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));

	if (cpumask_empty(newmask)) {
	if (cpumask_empty(excpus)) {
		cs->prs_err = PERR_CPUSEMPTY;
		goto invalidate;
	}

	adding   = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
	adding   = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);
	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);

	/*
	 * Additions of remote CPUs is only allowed if those CPUs are
@@ -1502,8 +1543,17 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
		isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
	if (deleting)
		isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
	/*
	 * Need to update effective_xcpus and exclusive_cpus now as
	 * update_sibling_cpumasks() below may iterate back to the same cs.
	 */
	cpumask_copy(cs->effective_xcpus, excpus);
	if (xcpus)
		cpumask_copy(cs->exclusive_cpus, xcpus);
	spin_unlock_irq(&callback_lock);
	update_unbound_workqueue_cpumask(isolcpus_updated);
	if (adding || deleting)
		cpuset_force_rebuild();

	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1516,47 +1566,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
	remote_partition_disable(cs, tmp);
}

/*
 * remote_partition_check - check if a child remote partition needs update
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @delmask: temporary mask for deletion (not in tmp)
 * @tmp: temporary masks
 *
 * This should be called before the given cs has updated its cpus_allowed
 * and/or effective_xcpus.
 */
static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
				   struct cpumask *delmask, struct tmpmasks *tmp)
{
	struct cpuset *child, *next;
	int disable_cnt = 0;

	/*
	 * Compute the effective exclusive CPUs that will be deleted.
	 */
	if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
	    !cpumask_intersects(delmask, subpartitions_cpus))
		return;	/* No deletion of exclusive CPUs in partitions */

	/*
	 * Searching the remote children list to look for those that will
	 * be impacted by the deletion of exclusive CPUs.
	 *
	 * Since a cpuset must be removed from the remote children list
	 * before it can go offline and holding cpuset_mutex will prevent
	 * any change in cpuset status. RCU read lock isn't needed.
	 */
	lockdep_assert_held(&cpuset_mutex);
	list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
		if (cpumask_intersects(child->effective_cpus, delmask)) {
			remote_partition_disable(child, tmp);
			disable_cnt++;
		}
	if (disable_cnt)
		cpuset_force_rebuild();
}

/*
 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
 * @prstate: partition root state to be checked
@@ -1629,6 +1638,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
	bool nocpu;

	lockdep_assert_held(&cpuset_mutex);
	WARN_ON_ONCE(is_remote_partition(cs));

	/*
	 * new_prs will only be changed for the partcmd_update and
@@ -1670,13 +1680,20 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
	nocpu = tasks_nocpu_error(parent, cs, xcpus);

	if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
		/*
		 * Need to call compute_effective_exclusive_cpumask() in case
		 * exclusive_cpus not set. Sibling conflict should only happen
		 * if exclusive_cpus isn't set.
		 */
		xcpus = tmp->new_cpus;
		if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
			WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));

		/*
		 * Enabling partition root is not allowed if its
		 * effective_xcpus is empty or doesn't overlap with
		 * parent's effective_xcpus.
		 * effective_xcpus is empty.
		 */
		if (cpumask_empty(xcpus) ||
		    !cpumask_intersects(xcpus, parent->effective_xcpus))
		if (cpumask_empty(xcpus))
			return PERR_INVCPUS;

		if (prstate_housekeeping_conflict(new_prs, xcpus))
@@ -1695,13 +1712,16 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
		new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
	} else if (cmd == partcmd_disable) {
		/*
		 * May need to add cpus to parent's effective_cpus for
		 * valid partition root.
		 * May need to add cpus back to parent's effective_cpus
		 * (and maybe removed from subpartitions_cpus/isolated_cpus)
		 * for valid partition root. xcpus may contain CPUs that
		 * shouldn't be removed from the two global cpumasks.
		 */
		adding = !is_prs_invalid(old_prs) &&
			  cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
		if (adding)
		if (is_partition_valid(cs)) {
			cpumask_copy(tmp->addmask, cs->effective_xcpus);
			adding = true;
			subparts_delta--;
		}
		new_prs = PRS_MEMBER;
	} else if (newmask) {
		/*
@@ -1711,6 +1731,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
			part_error = PERR_CPUSEMPTY;
			goto write_error;
		}

		/* Check newmask again, whether cpus are available for parent/cs */
		nocpu |= tasks_nocpu_error(parent, cs, newmask);

@@ -1927,7 +1948,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
	 *  2) All the effective_cpus will be used up and cp
	 *     has tasks
	 */
	compute_effective_exclusive_cpumask(cs, new_ecpus);
	compute_effective_exclusive_cpumask(cs, new_ecpus, NULL);
	cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);

	rcu_read_lock();
@@ -1935,6 +1956,11 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
		if (!is_partition_valid(child))
			continue;

		/*
		 * There shouldn't be a remote partition underneath another
		 * partition root.
		 */
		WARN_ON_ONCE(is_remote_partition(child));
		child->prs_err = 0;
		if (!cpumask_subset(child->effective_xcpus,
				    cs->effective_xcpus))
@@ -1990,32 +2016,39 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
		bool remote = is_remote_partition(cp);
		bool update_parent = false;

		old_prs = new_prs = cp->partition_root_state;

		/*
		 * Skip descendent remote partition that acquires CPUs
		 * directly from top cpuset unless it is cs.
		 * For child remote partition root (!= cs), we need to call
		 * remote_cpus_update() if effective_xcpus will be changed.
		 * Otherwise, we can skip the whole subtree.
		 *
		 * remote_cpus_update() will reuse tmp->new_cpus only after
		 * its value is being processed.
		 */
		if (remote && (cp != cs)) {
			compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL);
			if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			rcu_read_unlock();
			remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);
			rcu_read_lock();

		/*
		 * Update effective_xcpus if exclusive_cpus set.
		 * The case when exclusive_cpus isn't set is handled later.
		 */
		if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
			spin_lock_irq(&callback_lock);
			compute_effective_exclusive_cpumask(cp, NULL);
			spin_unlock_irq(&callback_lock);
			/* Remote partition may be invalidated */
			new_prs = cp->partition_root_state;
			remote = (new_prs == old_prs);
		}

		old_prs = new_prs = cp->partition_root_state;
		if (remote || (is_partition_valid(parent) &&
			       is_partition_valid(cp)))
		if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))
			compute_partition_effective_cpumask(cp, tmp->new_cpus);
		else
			compute_effective_cpumask(tmp->new_cpus, cp, parent);

		if (remote)
			goto get_css;	/* Ready to update cpuset data */

		/*
		 * A partition with no effective_cpus is allowed as long as
		 * there is no task associated with it. Call
@@ -2035,9 +2068,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
			cpumask_copy(tmp->new_cpus, parent->effective_cpus);

		if (remote)
			goto get_css;

		/*
		 * Skip the whole subtree if
		 * 1) the cpumask remains the same,
@@ -2098,6 +2128,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
		cp->partition_root_state = new_prs;
		if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
			compute_effective_exclusive_cpumask(cp, NULL, NULL);

		/*
		 * Make sure effective_xcpus is properly set for a valid
		 * partition root.
@@ -2184,7 +2217,14 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
						  parent);
			if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
				continue;
		} else if (is_remote_partition(sibling)) {
			/*
			 * Change in a sibling cpuset won't affect a remote
			 * partition root.
			 */
			continue;
		}

		if (!css_tryget_online(&sibling->css))
			continue;

@@ -2241,8 +2281,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
		 * trialcs->effective_xcpus is used as a temporary cpumask
		 * for checking validity of the partition root.
		 */
		trialcs->partition_root_state = PRS_MEMBER;
		if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
			compute_effective_exclusive_cpumask(trialcs, NULL);
			compute_effective_exclusive_cpumask(trialcs, NULL, cs);
	}

	/* Nothing to do if the cpus didn't change */
@@ -2315,19 +2356,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
		 * Call remote_cpus_update() to handle valid remote partition
		 */
		if (is_remote_partition(cs))
			remote_cpus_update(cs, xcpus, &tmp);
			remote_cpus_update(cs, NULL, xcpus, &tmp);
		else if (invalidate)
			update_parent_effective_cpumask(cs, partcmd_invalidate,
							NULL, &tmp);
		else
			update_parent_effective_cpumask(cs, partcmd_update,
							xcpus, &tmp);
	} else if (!cpumask_empty(cs->exclusive_cpus)) {
		/*
		 * Use trialcs->effective_cpus as a temp cpumask
		 */
		remote_partition_check(cs, trialcs->effective_xcpus,
				       trialcs->effective_cpus, &tmp);
	}

	spin_lock_irq(&callback_lock);
@@ -2379,8 +2414,15 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
	if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
		return 0;

	if (*buf)
		compute_effective_exclusive_cpumask(trialcs, NULL);
	if (*buf) {
		trialcs->partition_root_state = PRS_MEMBER;
		/*
		 * Reject the change if there is exclusive CPUs conflict with
		 * the siblings.
		 */
		if (compute_effective_exclusive_cpumask(trialcs, NULL, cs))
			return -EINVAL;
	}

	/*
	 * Check all the descendants in update_cpumasks_hier() if
@@ -2411,8 +2453,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			if (invalidate)
				remote_partition_disable(cs, &tmp);
			else
				remote_cpus_update(cs, trialcs->effective_xcpus,
						   &tmp);
				remote_cpus_update(cs, trialcs->exclusive_cpus,
						   trialcs->effective_xcpus, &tmp);
		} else if (invalidate) {
			update_parent_effective_cpumask(cs, partcmd_invalidate,
							NULL, &tmp);
@@ -2420,12 +2462,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			update_parent_effective_cpumask(cs, partcmd_update,
						trialcs->effective_xcpus, &tmp);
		}
	} else if (!cpumask_empty(trialcs->exclusive_cpus)) {
		/*
		 * Use trialcs->effective_cpus as a temp cpumask
		 */
		remote_partition_check(cs, trialcs->effective_xcpus,
				       trialcs->effective_cpus, &tmp);
	}
	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
@@ -2806,17 +2842,6 @@ static int update_prstate(struct cpuset *cs, int new_prs)
	if (alloc_cpumasks(NULL, &tmpmask))
		return -ENOMEM;

	/*
	 * Setup effective_xcpus if not properly set yet, it will be cleared
	 * later if partition becomes invalid.
	 */
	if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
		spin_lock_irq(&callback_lock);
		cpumask_and(cs->effective_xcpus,
			    cs->cpus_allowed, parent->effective_xcpus);
		spin_unlock_irq(&callback_lock);
	}

	err = update_partition_exclusive(cs, new_prs);
	if (err)
		goto out;
@@ -3753,7 +3778,6 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
		remote_partition_disable(cs, tmp);
		compute_effective_cpumask(&new_cpus, cs, parent);
		remote = false;
		cpuset_force_rebuild();
	}

	/*
+2 −2
Original line number Diff line number Diff line
@@ -326,8 +326,8 @@ TEST_MATRIX=(
				   .      .     X3      P2     .     0 A1:0-2,A2:1-2,XA2:3,XA3:3,A3:3 \
								       A1:P0,A3:P2 3"
	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
				   .      .     X3      .      .     0 A1:0-3,A2:1-3,XA2:3,XA3:3,A3:2-3 \
								       A1:P0,A3:P-2"
				   .      .     X3      .      .     0 A1:0-2,A2:1-2,XA2:3,XA3:3,A3:3,XA3:3 \
								       A1:P0,A3:P2 3"
	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
				   .     X4      .      .      .     0 A1:0-3,A2:1-3,A3:2-3,XA1:4,XA2:,XA3 \
								       A1:P0,A3:P-2"