cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated (93618edf) · Commits · git / linux-net

include/linux/cgroup-defs.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -611,8 +611,8 @@ struct cgroup {
		/* used to wait for offlining of csses */
		wait_queue_head_t offline_waitq;

		/* used by cgroup_rmdir() to wait for dying tasks to leave */
		wait_queue_head_t dying_populated_waitq;
		/* defers killing csses after removal until cgroup is depopulated */
		struct work_struct finish_destroy_work;

		/* used to schedule release agent */
		struct work_struct release_agent_work;

kernel/cgroup/cgroup.c

+117 −133

Original line number	Diff line number	Diff line
		@@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
		static void css_task_iter_skip(struct css_task_iter *it,
		struct task_struct *task);
		static int cgroup_destroy_locked(struct cgroup *cgrp);
		static void cgroup_finish_destroy(struct cgroup *cgrp);
		static void kill_css_sync(struct cgroup_subsys_state *css);
		static void kill_css_finish(struct cgroup_subsys_state *css);
		static struct cgroup_subsys_state css_create(struct cgroup cgrp,
		struct cgroup_subsys *ss);
		static void css_release(struct percpu_ref *ref);
		static void kill_css(struct cgroup_subsys_state *css);
		static int cgroup_addrm_files(struct cgroup_subsys_state *css,
		struct cgroup *cgrp, struct cftype cfts[],
		bool is_add);
		@@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
		if (was_populated == cgroup_is_populated(cgrp))
		break;

		/*
		* Subtree just emptied below an offlined cgrp. Fire deferred
		* destroy. The transition is one-shot.
		*/
		if (was_populated && !css_is_online(&cgrp->self)) {
		cgroup_get(cgrp);
		WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
		&cgrp->finish_destroy_work));
		}

		cgroup1_check_for_release(cgrp);
		TRACE_CGROUP_PATH(notify_populated, cgrp,
		cgroup_is_populated(cgrp));
		@@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc)
		return 0;
		}

		static void cgroup_finish_destroy_work_fn(struct work_struct *work)
		{
		struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);

		cgroup_lock();
		cgroup_finish_destroy(cgrp);
		cgroup_unlock();
		cgroup_put(cgrp);
		}

		static void init_cgroup_housekeeping(struct cgroup *cgrp)
		{
		struct cgroup_subsys *ss;
		@@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
		#endif

		init_waitqueue_head(&cgrp->offline_waitq);
		init_waitqueue_head(&cgrp->dying_populated_waitq);
		INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
		INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
		}

		@@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)

		if (css->parent &&
		!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
		kill_css(css);
		kill_css_sync(css);
		kill_css_finish(css);
		} else if (!css_visible(css)) {
		css_clear_dir(css);
		if (ss->css_reset)
		@@ -5514,7 +5537,7 @@ static struct cftype cgroup_psi_files[] = {
		* css destruction is four-stage process.
		*
		* 1. Destruction starts. Killing of the percpu_ref is initiated.
		* Implemented in kill_css().
		* Implemented in kill_css_finish().
		*
		* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
		* and thus css_tryget_online() is guaranteed to fail, the css can be
		@@ -5993,7 +6016,7 @@ int cgroup_mkdir(struct kernfs_node parent_kn, const char name, umode_t mode)
		/*
		* This is called when the refcnt of a css is confirmed to be killed.
		* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
		* initiate destruction and put the css ref from kill_css().
		* initiate destruction and put the css ref from kill_css_finish().
		*/
		static void css_killed_work_fn(struct work_struct *work)
		{
		@@ -6025,15 +6048,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
		}

		/**
		* kill_css - destroy a css
		* @css: css to destroy
		* kill_css_sync - synchronous half of css teardown
		* @css: css being killed
		*
		* This function initiates destruction of @css by removing cgroup interface
		* files and putting its base reference. ->css_offline() will be invoked
		* asynchronously once css_tryget_online() is guaranteed to fail and when
		* the reference count reaches zero, @css will be released.
		* See cgroup_destroy_locked().
		*/
		static void kill_css(struct cgroup_subsys_state *css)
		static void kill_css_sync(struct cgroup_subsys_state *css)
		{
		struct cgroup_subsys *ss = css->ss;

		@@ -6056,24 +6076,6 @@ static void kill_css(struct cgroup_subsys_state *css)
		*/
		css_clear_dir(css);

		/*
		* Killing would put the base ref, but we need to keep it alive
		* until after ->css_offline().
		*/
		css_get(css);

		/*
		* cgroup core guarantees that, by the time ->css_offline() is
		* invoked, no new css reference will be given out via
		* css_tryget_online(). We can't simply call percpu_ref_kill() and
		* proceed to offlining css's because percpu_ref_kill() doesn't
		* guarantee that the ref is seen as killed on all CPUs on return.
		*
		* Use percpu_ref_kill_and_confirm() to get notifications as each
		* css is confirmed to be seen as killed on all CPUs.
		*/
		percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);

		css->cgroup->nr_dying_subsys[ss->id]++;
		/*
		* Parent css and cgroup cannot be freed until after the freeing
		@@ -6086,44 +6088,88 @@ static void kill_css(struct cgroup_subsys_state *css)
		}

		/**
		* cgroup_destroy_locked - the first stage of cgroup destruction
		* kill_css_finish - deferred half of css teardown
		* @css: css being killed
		*
		* See cgroup_destroy_locked().
		*/
		static void kill_css_finish(struct cgroup_subsys_state *css)
		{
		lockdep_assert_held(&cgroup_mutex);

		/*
		* Skip on re-entry: cgroup_apply_control_disable() may have killed @css
		* earlier. cgroup_destroy_locked() can still walk it because
		* offline_css() (which NULLs cgrp->subsys[ssid]) runs async.
		*/
		if (percpu_ref_is_dying(&css->refcnt))
		return;

		/*
		* Killing would put the base ref, but we need to keep it alive until
		* after ->css_offline().
		*/
		css_get(css);

		/*
		* cgroup core guarantees that, by the time ->css_offline() is invoked,
		* no new css reference will be given out via css_tryget_online(). We
		* can't simply call percpu_ref_kill() and proceed to offlining css's
		* because percpu_ref_kill() doesn't guarantee that the ref is seen as
		* killed on all CPUs on return.
		*
		* Use percpu_ref_kill_and_confirm() to get notifications as each css is
		* confirmed to be seen as killed on all CPUs.
		*/
		percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
		}

		/**
		* cgroup_destroy_locked - destroy @cgrp (called on rmdir)
		* @cgrp: cgroup to be destroyed
		*
		* css's make use of percpu refcnts whose killing latency shouldn't be
		* exposed to userland and are RCU protected. Also, cgroup core needs to
		* guarantee that css_tryget_online() won't succeed by the time
		* ->css_offline() is invoked. To satisfy all the requirements,
		* destruction is implemented in the following two steps.
		* Tear down @cgrp on behalf of rmdir. Constraints:
		*
		* - Userspace: rmdir must succeed when cgroup.procs and friends are empty.
		*
		* - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's
		* subtree is still doing kernel work. A task hidden from cgroup.procs (past
		* exit_signals() with signal->live cleared) can still schedule, allocate, and
		* consume resources until its final context switch. Dying descendants in the
		* subtree can host such tasks too.
		*
		* - Kernel: css_tryget_online() must fail by the time ->css_offline() runs.
		*
		* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
		* userland visible parts and start killing the percpu refcnts of
		* css's. Set up so that the next stage will be kicked off once all
		* the percpu refcnts are confirmed to be killed.
		* The destruction runs in three parts:
		*
		* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
		* rest of destruction. Once all cgroup references are gone, the
		* cgroup is RCU-freed.
		* - This function: synchronous user-visible state teardown plus kill_css_sync()
		* on each subsystem css.
		*
		* This function implements s1. After this step, @cgrp is gone as far as
		* the userland is concerned and a new cgroup with the same name may be
		* created. As cgroup doesn't care about the names internally, this
		* doesn't cause any problem.
		* - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
		* each subsystem css. Fires once @cgrp's subtree is fully drained, either
		* inline here or from cgroup_update_populated().
		*
		* - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
		* ->css_offline() -> release/free.
		*
		* Return 0 on success, -EBUSY if a userspace-visible task or an online child
		* remains.
		*/
		static int cgroup_destroy_locked(struct cgroup *cgrp)
		__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
		{
		struct cgroup tcgrp, parent = cgroup_parent(cgrp);
		struct cgroup_subsys_state *css;
		struct cgrp_cset_link *link;
		struct css_task_iter it;
		struct task_struct *task;
		int ssid, ret;

		lockdep_assert_held(&cgroup_mutex);

		/*
		* Only migration can raise populated from zero and we're already
		* holding cgroup_mutex.
		*/
		if (cgroup_is_populated(cgrp))
		css_task_iter_start(&cgrp->self, 0, &it);
		task = css_task_iter_next(&it);
		css_task_iter_end(&it);
		if (task)
		return -EBUSY;

		/*
		@@ -6147,9 +6193,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
		link->cset->dead = true;
		spin_unlock_irq(&css_set_lock);

		/* initiate massacre of all css's */
		for_each_css(css, ssid, cgrp)
		kill_css(css);
		kill_css_sync(css);

		/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
		css_clear_dir(&cgrp->self);
		@@ -6180,79 +6225,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
		/* put the base reference */
		percpu_ref_kill(&cgrp->self.refcnt);

		if (!cgroup_is_populated(cgrp))
		cgroup_finish_destroy(cgrp);

		return 0;
		};

		/**
		* cgroup_drain_dying - wait for dying tasks to leave before rmdir
		* @cgrp: the cgroup being removed
		*
		* cgroup.procs and cgroup.threads use css_task_iter which filters out
		* PF_EXITING tasks so that userspace doesn't see tasks that have already been
		* reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
		* cgroup has non-empty css_sets - is only updated when dying tasks pass through
		* cgroup_task_dead() in finish_task_switch(). This creates a window where
		* cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
		* fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
		* tasks.
		*
		* This function aligns cgroup_has_tasks() with what userspace can observe. If
		* cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
		* PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
		* window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
		*
		* This function only concerns itself with this cgroup's own dying tasks.
		* Whether the cgroup has children is cgroup_destroy_locked()'s problem.
		* cgroup_finish_destroy - deferred half of @cgrp destruction
		* @cgrp: cgroup whose subtree just became empty
		*
		* Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
		* retry the full check from scratch.
		*
		* Must be called with cgroup_mutex held.
		* See cgroup_destroy_locked() for the rationale.
		*/
		static int cgroup_drain_dying(struct cgroup *cgrp)
		__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
		static void cgroup_finish_destroy(struct cgroup *cgrp)
		{
		struct css_task_iter it;
		struct task_struct *task;
		DEFINE_WAIT(wait);
		struct cgroup_subsys_state *css;
		int ssid;

		lockdep_assert_held(&cgroup_mutex);
		retry:
		if (!cgroup_has_tasks(cgrp))
		return 0;

		/* Same iterator as cgroup.threads - if any task is visible, it's busy */
		css_task_iter_start(&cgrp->self, 0, &it);
		task = css_task_iter_next(&it);
		css_task_iter_end(&it);

		if (task)
		return -EBUSY;

		/*
		* All remaining tasks are PF_EXITING and will pass through
		* cgroup_task_dead() shortly. Wait for a kick and retry.
		*
		* cgroup_has_tasks() can't transition from false to true while we're
		* holding cgroup_mutex, but the true to false transition happens
		* under css_set_lock (via cgroup_task_dead()). We must retest and
		* prepare_to_wait() under css_set_lock. Otherwise, the transition
		* can happen between our first test and prepare_to_wait(), and we
		* sleep with no one to wake us.
		*/
		spin_lock_irq(&css_set_lock);
		if (!cgroup_has_tasks(cgrp)) {
		spin_unlock_irq(&css_set_lock);
		return 0;
		}
		prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
		TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);
		schedule();
		finish_wait(&cgrp->dying_populated_waitq, &wait);
		mutex_lock(&cgroup_mutex);
		goto retry;
		for_each_css(css, ssid, cgrp)
		kill_css_finish(css);
		}

		int cgroup_rmdir(struct kernfs_node *kn)
		@@ -6264,12 +6257,9 @@ int cgroup_rmdir(struct kernfs_node *kn)
		if (!cgrp)
		return 0;

		ret = cgroup_drain_dying(cgrp);
		if (!ret) {
		ret = cgroup_destroy_locked(cgrp);
		if (!ret)
		TRACE_CGROUP_PATH(rmdir, cgrp);
		}

		cgroup_kn_unlock(kn);
		return ret;
		@@ -7029,7 +7019,6 @@ void cgroup_task_exit(struct task_struct *tsk)

		static void do_cgroup_task_dead(struct task_struct *tsk)
		{
		struct cgrp_cset_link *link;
		struct css_set *cset;
		unsigned long flags;

		@@ -7043,11 +7032,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
		if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
		list_add_tail(&tsk->cg_list, &cset->dying_tasks);

		/* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
		list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		if (waitqueue_active(&link->cgrp->dying_populated_waitq))
		wake_up(&link->cgrp->dying_populated_waitq);

		if (dl_task(tsk))
		dec_dl_tasks_cs(tsk);