Commit 53d85a20 authored by Linus Torvalds
Browse files

Merge tag 'cgroup-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - Fix cgroup rmdir racing with dying tasks.

   Deferred task cgroup unlink introduced a window where cgroup.procs
   is empty but the cgroup is still populated, causing rmdir to fail
   with -EBUSY and selftest failures.

   Make rmdir wait for dying tasks to fully leave and fix selftests to
   not depend on synchronous populated updates.

 - Fix cpuset v1 task migration failure from empty cpusets under strict
   security policies.

   When CPU hotplug removes the last CPU from a v1 cpuset, tasks must be
   migrated to an ancestor without a security_task_setscheduler() check
   that would block the migration.

* tag 'cgroup-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup/cpuset: Skip security check for hotplug induced v1 task migration
  cgroup/cpuset: Simplify setsched decision check in task iteration loop of cpuset_can_attach()
  cgroup: Fix cgroup_drain_dying() testing the wrong condition
  selftests/cgroup: Don't require synchronous populated update on task exit
  cgroup: Wait for dying tasks to leave on rmdir
parents dbf00d8d 089f3fcd
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -609,6 +609,9 @@ struct cgroup {
	/* used to wait for offlining of csses */
	wait_queue_head_t offline_waitq;

	/* used by cgroup_rmdir() to wait for dying tasks to leave */
	wait_queue_head_t dying_populated_waitq;

	/* used to schedule release agent */
	struct work_struct release_agent_work;

+85 −3
Original line number Diff line number Diff line
@@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif

	init_waitqueue_head(&cgrp->offline_waitq);
	init_waitqueue_head(&cgrp->dying_populated_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

@@ -6224,6 +6225,78 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
	return 0;
};

/**
 * cgroup_drain_dying - wait for dying tasks to leave before rmdir
 * @cgrp: the cgroup being removed
 *
 * cgroup.procs and cgroup.threads use css_task_iter which filters out
 * PF_EXITING tasks so that userspace doesn't see tasks that have already been
 * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
 * cgroup has non-empty css_sets - is only updated when dying tasks pass through
 * cgroup_task_dead() in finish_task_switch(). This creates a window where
 * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
 * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
 * tasks.
 *
 * This function aligns cgroup_has_tasks() with what userspace can observe. If
 * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
 * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
 * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
 *
 * This function only concerns itself with this cgroup's own dying tasks.
 * Whether the cgroup has children is cgroup_destroy_locked()'s problem.
 *
 * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
 * retry the full check from scratch.
 *
 * Must be called with cgroup_mutex held. The mutex is released while this
 * function sleeps and is held again by the time it returns, so callers must
 * not assume that state protected by cgroup_mutex is unchanged across the
 * call.
 *
 * Return: 0 once cgroup_has_tasks() reports that @cgrp has no tasks left,
 * -EBUSY if the task iterator still finds a task in @cgrp.
 */
static int cgroup_drain_dying(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
	struct css_task_iter it;
	struct task_struct *task;
	DEFINE_WAIT(wait);

	lockdep_assert_held(&cgroup_mutex);
retry:
	/* Nothing left to wait for - the caller may proceed. */
	if (!cgroup_has_tasks(cgrp))
		return 0;

	/* Same iterator as cgroup.threads - if any task is visible, it's busy */
	css_task_iter_start(&cgrp->self, 0, &it);
	task = css_task_iter_next(&it);
	css_task_iter_end(&it);

	/* Only whether a task was found matters; the task itself is not used. */
	if (task)
		return -EBUSY;

	/*
	 * All remaining tasks are PF_EXITING and will pass through
	 * cgroup_task_dead() shortly. Wait for a kick and retry.
	 *
	 * cgroup_has_tasks() can't transition from false to true while we're
	 * holding cgroup_mutex, but the true to false transition happens
	 * under css_set_lock (via cgroup_task_dead()). We must retest and
	 * prepare_to_wait() under css_set_lock. Otherwise, the transition
	 * can happen between our first test and prepare_to_wait(), and we
	 * sleep with no one to wake us.
	 */
	spin_lock_irq(&css_set_lock);
	if (!cgroup_has_tasks(cgrp)) {
		spin_unlock_irq(&css_set_lock);
		return 0;
	}
	prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
			TASK_UNINTERRUPTIBLE);
	spin_unlock_irq(&css_set_lock);
	/* Release cgroup_mutex before sleeping; it is re-taken before the retry. */
	mutex_unlock(&cgroup_mutex);
	schedule();
	finish_wait(&cgrp->dying_populated_waitq, &wait);
	mutex_lock(&cgroup_mutex);
	goto retry;
}

int cgroup_rmdir(struct kernfs_node *kn)
{
	struct cgroup *cgrp;
@@ -6233,9 +6306,12 @@ int cgroup_rmdir(struct kernfs_node *kn)
	if (!cgrp)
		return 0;

	ret = cgroup_drain_dying(cgrp);
	if (!ret) {
		ret = cgroup_destroy_locked(cgrp);
		if (!ret)
			TRACE_CGROUP_PATH(rmdir, cgrp);
	}

	cgroup_kn_unlock(kn);
	return ret;
@@ -6995,6 +7071,7 @@ void cgroup_task_exit(struct task_struct *tsk)

static void do_cgroup_task_dead(struct task_struct *tsk)
{
	struct cgrp_cset_link *link;
	struct css_set *cset;
	unsigned long flags;

@@ -7008,6 +7085,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
		list_add_tail(&tsk->cg_list, &cset->dying_tasks);

	/* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		if (waitqueue_active(&link->cgrp->dying_populated_waitq))
			wake_up(&link->cgrp->dying_populated_waitq);

	if (dl_task(tsk))
		dec_dl_tasks_cs(tsk);

+20 −9
Original line number Diff line number Diff line
@@ -2988,7 +2988,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
	struct cgroup_subsys_state *css;
	struct cpuset *cs, *oldcs;
	struct task_struct *task;
	bool cpus_updated, mems_updated;
	bool setsched_check;
	int ret;

	/* used later by cpuset_attach() */
@@ -3003,20 +3003,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
	if (ret)
		goto out_unlock;

	cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
	/*
	 * Skip the security_task_setscheduler() check in v2 when neither the
	 * effective CPUs nor the effective memory nodes change; migration
	 * permission then derives from hierarchy ownership in
	 * cgroup_procs_write_permission().
	 */
	setsched_check = !cpuset_v2() ||
		!cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) ||
		!nodes_equal(cs->effective_mems, oldcs->effective_mems);

	/*
	 * A v1 cpuset with tasks will have no CPU left only when CPU hotplug
	 * brings the last online CPU offline as users are not allowed to empty
	 * cpuset.cpus when there are active tasks inside. When that happens,
	 * we should allow tasks to migrate out without security check to make
	 * sure they will be able to run after migration.
	 */
	if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus))
		setsched_check = false;

	cgroup_taskset_for_each(task, css, tset) {
		ret = task_can_attach(task);
		if (ret)
			goto out_unlock;

		/*
		 * Skip rights over task check in v2 when nothing changes,
		 * migration permission derives from hierarchy ownership in
		 * cgroup_procs_write_permission()).
		 */
		if (!cpuset_v2() || (cpus_updated || mems_updated)) {
		if (setsched_check) {
			ret = security_task_setscheduler(task);
			if (ret)
				goto out_unlock;
+15 −0
Original line number Diff line number Diff line
@@ -123,6 +123,21 @@ int cg_read_strcmp(const char *cgroup, const char *control,
	return ret;
}

/*
 * Poll a cgroup control file until cg_read_strcmp() reports a match with
 * @expected or the retry budget is exhausted.
 *
 * Up to 100 attempts are made, sleeping 10000 microseconds after every
 * attempt that does not match. Returns 0 as soon as cg_read_strcmp()
 * returns 0, otherwise the result of the last cg_read_strcmp() call.
 */
int cg_read_strcmp_wait(const char *cgroup, const char *control,
			    const char *expected)
{
	int attempts_left = 100;
	int ret;

	do {
		ret = cg_read_strcmp(cgroup, control, expected);
		if (ret == 0)
			break;
		/* No match yet: give the kernel time to update the file. */
		usleep(10000);
	} while (--attempts_left > 0);

	return ret;
}

int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
{
	char buf[PAGE_SIZE];
+2 −0
Original line number Diff line number Diff line
@@ -61,6 +61,8 @@ extern int cg_read(const char *cgroup, const char *control,
		   char *buf, size_t len);
extern int cg_read_strcmp(const char *cgroup, const char *control,
			  const char *expected);
extern int cg_read_strcmp_wait(const char *cgroup, const char *control,
				   const char *expected);
extern int cg_read_strstr(const char *cgroup, const char *control,
			  const char *needle);
extern long cg_read_long(const char *cgroup, const char *control);
Loading