Commit 8449d325 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull cgroup updates from Tejun Heo:

 - Defer task cgroup unlink until after the dying task's final context
   switch so that controllers see the cgroup properly populated until
   the task is truly gone

 - cpuset cleanups and simplifications.

   Enforce that domain isolated CPUs stay in root or isolated partitions
   and fail if isolated+nohz_full would leave no housekeeping CPU. Fix
   sched/deadline root domain handling during CPU hot-unplug and a race
   affecting tasks in cpusets that are in the middle of an attach

 - Misc fixes including memory reclaim protection documentation and
   selftest KTAP conformance

* tag 'cgroup-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits)
  cpuset: Treat cpusets in attaching as populated
  sched/deadline: Walk up cpuset hierarchy to decide root domain when hot-unplug
  cgroup/cpuset: Introduce cpuset_cpus_allowed_locked()
  docs: cgroup: No special handling of unpopulated memcgs
  docs: cgroup: Note about sibling relative reclaim protection
  docs: cgroup: Explain reclaim protection target
  selftests/cgroup: conform test to KTAP format output
  cpuset: remove need_rebuild_sched_domains
  cpuset: remove global remote_children list
  cpuset: simplify node setting on error
  cgroup: include missing header for struct irq_work
  cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
  cgroup/cpuset: Globally track isolated_cpus update
  cgroup/cpuset: Ensure domain isolated CPUs stay in root or isolated partition
  cgroup/cpuset: Move up prstate_housekeeping_conflict() helper
  cgroup/cpuset: Fail if isolated and nohz_full don't leave any housekeeping
  cgroup/cpuset: Rename update_unbound_workqueue_cpumask() to update_isolation_cpumasks()
  cgroup: Defer task cgroup unlink until after the task is done switching out
  cgroup: Move dying_tasks cleanup from cgroup_task_release() to cgroup_task_free()
  cgroup: Rename cgroup lifecycle hooks to cgroup_task_*()
  ...
parents 2b601457 b1bcaed1
Loading
Loading
Loading
Loading
+25 −6
Original line number Diff line number Diff line
@@ -53,7 +53,8 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
     5-2. Memory
       5-2-1. Memory Interface Files
       5-2-2. Usage Guidelines
       5-2-3. Memory Ownership
       5-2-3. Reclaim Protection
       5-2-4. Memory Ownership
     5-3. IO
       5-3-1. IO Interface Files
       5-3-2. Writeback
@@ -1317,7 +1318,7 @@ PAGE_SIZE multiple when read back.
	smaller overages.

	Effective min boundary is limited by memory.min values of
	all ancestor cgroups. If there is memory.min overcommitment
	ancestor cgroups. If there is memory.min overcommitment
	(child cgroup or cgroups are requiring more protected memory
	than parent will allow), then each child cgroup will get
	the part of parent's protection proportional to its
@@ -1326,9 +1327,6 @@ PAGE_SIZE multiple when read back.
	Putting more memory than generally available under this
	protection is discouraged and may lead to constant OOMs.

	If a memory cgroup is not populated with processes,
	its memory.min is ignored.

  memory.low
	A read-write single value file which exists on non-root
	cgroups.  The default is "0".
@@ -1343,7 +1341,7 @@ PAGE_SIZE multiple when read back.
	smaller overages.

	Effective low boundary is limited by memory.low values of
	all ancestor cgroups. If there is memory.low overcommitment
	ancestor cgroups. If there is memory.low overcommitment
	(child cgroup or cgroups are requiring more protected memory
	than parent will allow), then each child cgroup will get
	the part of parent's protection proportional to its
@@ -1934,6 +1932,27 @@ memory - is necessary to determine whether a workload needs more
memory; unfortunately, memory pressure monitoring mechanism isn't
implemented yet.

Reclaim Protection
~~~~~~~~~~~~~~~~~~

The protection configured with "memory.low" or "memory.min" applies relative
to the target of the reclaim (i.e. any of the memory cgroup limits, proactive
memory.reclaim, or global reclaim, which is notionally located in the root
cgroup). In the hierarchy below, the protection value configured for B applies
unchanged to reclaim targeting A (i.e. caused by competition with the
sibling E)::

		root - ... - A - B - C
		              \    ` D
		               ` E

When the reclaim targets ancestors of A, the effective protection of B is
capped by the protection value configured for A (and any other intermediate
ancestors between A and the target).

To express indifference about relative sibling protection, it is suggested to
use the memory_recursiveprot mount option. Configuring all descendants of a
parent with finite protection to "max" also works, but it may unnecessarily
skew the "low" field of memory.events.

Memory Ownership
~~~~~~~~~~~~~~~~
+8 −6
Original line number Diff line number Diff line
@@ -137,9 +137,10 @@ extern void cgroup_cancel_fork(struct task_struct *p,
			       struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
			     struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);
void cgroup_task_exit(struct task_struct *p);
void cgroup_task_dead(struct task_struct *p);
void cgroup_task_release(struct task_struct *p);
void cgroup_task_free(struct task_struct *p);

int cgroup_init_early(void);
int cgroup_init(void);
@@ -680,9 +681,10 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
				      struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
				    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}
static inline void cgroup_task_exit(struct task_struct *p) {}
static inline void cgroup_task_dead(struct task_struct *p) {}
static inline void cgroup_task_release(struct task_struct *p) {}
static inline void cgroup_task_free(struct task_struct *p) {}

static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
+8 −1
Original line number Diff line number Diff line
@@ -74,6 +74,7 @@ extern void inc_dl_tasks_cs(struct task_struct *task);
extern void dec_dl_tasks_cs(struct task_struct *task);
extern void cpuset_lock(void);
extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
extern bool cpuset_cpu_is_isolated(int cpu);
@@ -195,12 +196,18 @@ static inline void dec_dl_tasks_cs(struct task_struct *task) { }
static inline void cpuset_lock(void) { }
static inline void cpuset_unlock(void) { }

static inline void cpuset_cpus_allowed(struct task_struct *p,
/*
 * Stub variant of cpuset_cpus_allowed_locked(): fills @mask with
 * task_cpu_possible_mask(@p), i.e. no cpuset restriction is applied.
 * NOTE(review): presumably the variant compiled when cpusets are configured
 * out - the enclosing #ifdef is not visible in this hunk.
 */
static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
					struct cpumask *mask)
{
	cpumask_copy(mask, task_cpu_possible_mask(p));
}

/*
 * Stub variant of cpuset_cpus_allowed(): simply forwards to the stub
 * cpuset_cpus_allowed_locked(), so @mask ends up as a copy of
 * task_cpu_possible_mask(@p).
 */
static inline void cpuset_cpus_allowed(struct task_struct *p,
				       struct cpumask *mask)
{
	cpuset_cpus_allowed_locked(p, mask);
}

static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
{
	return false;
+4 −1
Original line number Diff line number Diff line
@@ -1324,7 +1324,10 @@ struct task_struct {
	struct css_set __rcu		*cgroups;
	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
	struct list_head		cg_list;
#endif
#ifdef CONFIG_PREEMPT_RT
	struct llist_node		cg_dead_lnode;
#endif	/* CONFIG_PREEMPT_RT */
#endif	/* CONFIG_CGROUPS */
#ifdef CONFIG_X86_CPU_RESCTRL
	u32				closid;
	u32				rmid;
+76 −15
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@
#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <linux/nstree.h>
#include <linux/irq_work.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
@@ -287,6 +288,7 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
static void cgroup_rt_init(void);

#ifdef CONFIG_DEBUG_CGROUP_REF
#define CGROUP_REF_FN_ATTRS	noinline
@@ -941,7 +943,8 @@ static void css_set_move_task(struct task_struct *task,
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit()/cgroup_free() dropping the css_set.
		 * against cgroup_task_dead()/cgroup_task_free() dropping
		 * the css_set.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

@@ -6354,6 +6357,7 @@ int __init cgroup_init(void)
	BUG_ON(ss_rstat_init(NULL));

	get_user_ns(init_cgroup_ns.user_ns);
	cgroup_rt_init();

	cgroup_lock();

@@ -6967,19 +6971,29 @@ void cgroup_post_fork(struct task_struct *child,
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * cgroup_task_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 *
 */
void cgroup_exit(struct task_struct *tsk)
void cgroup_task_exit(struct task_struct *tsk)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	spin_lock_irq(&css_set_lock);
	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
}

static void do_cgroup_task_dead(struct task_struct *tsk)
{
	struct css_set *cset;
	unsigned long flags;

	spin_lock_irqsave(&css_set_lock, flags);

	WARN_ON_ONCE(list_empty(&tsk->cg_list));
	cset = task_css_set(tsk);
@@ -6997,15 +7011,61 @@ void cgroup_exit(struct task_struct *tsk)
		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
		cgroup_update_frozen(task_dfl_cgroup(tsk));

	spin_unlock_irq(&css_set_lock);
	spin_unlock_irqrestore(&css_set_lock, flags);
}

	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
#ifdef CONFIG_PREEMPT_RT
/*
 * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
 * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
 * this leads to a sleeping-in-invalid-context warning. css_set_lock is too
 * big to become a raw_spinlock. The task_dead path doesn't need to run
 * synchronously but can't be delayed indefinitely either, as the dead task
 * pins the cgroup and its own task_struct until the path has run. Bounce
 * through lazy irq_work to allow batching while ensuring timely completion.
 */
static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);

/*
 * irq_work handler that finishes the deferred task_dead processing.
 *
 * Detaches the whole list of dead tasks queued on the CPU it runs on and, for
 * each entry, performs do_cgroup_task_dead() and then drops the task
 * reference that cgroup_task_dead() took when queueing it.
 *
 * @iwork is not used; the per-CPU list is looked up directly.
 */
static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
{
	struct llist_node *lnode;
	struct task_struct *task, *next;

	/* Grab every pending entry in one go; later additions start a new batch. */
	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
	/*
	 * Use the _safe iterator: @next is fetched before the body runs, and
	 * put_task_struct() may presumably release @task (and the llist_node
	 * embedded in it).
	 */
	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
		do_cgroup_task_dead(task);
		put_task_struct(task);
	}
}

/*
 * Boot-time setup for the PREEMPT_RT deferred task_dead machinery.
 *
 * For every possible CPU, initialise the per-CPU dead-task list head and the
 * per-CPU lazy irq_work whose handler is cgrp_dead_tasks_iwork_fn().
 * Called from cgroup_init().
 */
static void __init cgroup_rt_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
		/* Lazy irq_work: allows batching of several dead tasks per run. */
		per_cpu(cgrp_dead_tasks_iwork, cpu) =
			IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
	}
}

/*
 * PREEMPT_RT variant of cgroup_task_dead(): defer the real work instead of
 * taking css_set_lock in the caller's context.
 *
 * Takes a reference on @task so it stays valid while queued, pushes it onto
 * the current CPU's dead-task list and queues that CPU's irq_work.
 * cgrp_dead_tasks_iwork_fn() later runs do_cgroup_task_dead() on it and drops
 * the reference.
 */
void cgroup_task_dead(struct task_struct *task)
{
	get_task_struct(task);
	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
}
#else	/* CONFIG_PREEMPT_RT */
/* !PREEMPT_RT: task_dead work is done synchronously, nothing to set up. */
static void __init cgroup_rt_init(void) {}

void cgroup_release(struct task_struct *task)
/*
 * !PREEMPT_RT variant of cgroup_task_dead(): no deferral is needed, so run
 * do_cgroup_task_dead() on @task directly in the caller's context.
 */
void cgroup_task_dead(struct task_struct *task)
{
	do_cgroup_task_dead(task);
}
#endif	/* CONFIG_PREEMPT_RT */

void cgroup_task_release(struct task_struct *task)
{
	struct cgroup_subsys *ss;
	int ssid;
@@ -7013,6 +7073,11 @@ void cgroup_release(struct task_struct *task)
	do_each_subsys_mask(ss, ssid, have_release_callback) {
		ss->release(task);
	} while_each_subsys_mask();
}

void cgroup_task_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);

	if (!list_empty(&task->cg_list)) {
		spin_lock_irq(&css_set_lock);
@@ -7020,11 +7085,7 @@ void cgroup_release(struct task_struct *task)
		list_del_init(&task->cg_list);
		spin_unlock_irq(&css_set_lock);
	}
}

void cgroup_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);
	put_css_set(cset);
}

Loading