Commit 8285917d authored by Qi Zheng, committed by Andrew Morton
Browse files

mm: memcontrol: prepare for reparenting non-hierarchical stats

To resolve the dying memcg issue, we need to reparent LRU folios of child
memcg to its parent memcg.  This could cause problems for non-hierarchical
stats.

As Yosry Ahmed pointed out:

In short, if memory is charged to a dying cgroup at the time of
reparenting, when the memory gets uncharged the stats updates will occur
at the parent. This will update both hierarchical and non-hierarchical
stats of the parent, which would corrupt the parent's non-hierarchical
stats (because those counters were never incremented when the memory was
charged).

Now we have the following two types of non-hierarchical stats, and they
are only used in CONFIG_MEMCG_V1:

a. memcg->vmstats->state_local[i]
b. pn->lruvec_stats->state_local[i]

To ensure that these non-hierarchical stats work properly, we need to
reparent these non-hierarchical stats after reparenting LRU folios. To
this end, this commit makes the following preparations:

1. implement reparent_state_local() to reparent non-hierarchical stats
2. make css_killed_work_fn() run from RCU work, and implement
   get_non_dying_memcg_start() and get_non_dying_memcg_end() to avoid a race
   between mod_memcg_state()/mod_memcg_lruvec_state()
   and reparent_state_local()

Link: https://lore.kernel.org/e862995c45a7101a541284b6ebee5e5c32c89066.1772711148.git.zhengqi.arch@bytedance.com


Co-developed-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Allen Pais <apais@linux.microsoft.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 5371e350
Loading
Loading
Loading
Loading
+5 −4
Original line number Diff line number Diff line
@@ -6050,8 +6050,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 */
static void css_killed_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup_subsys_state *css;

	css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);

	cgroup_lock();

@@ -6072,8 +6073,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
		container_of(ref, struct cgroup_subsys_state, refcnt);

	if (atomic_dec_and_test(&css->online_cnt)) {
		INIT_WORK(&css->destroy_work, css_killed_work_fn);
		queue_work(cgroup_offline_wq, &css->destroy_work);
		INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
		queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
	}
}

+16 −0
Original line number Diff line number Diff line
@@ -1884,6 +1884,22 @@ static const unsigned int memcg1_events[] = {
	PGMAJFAULT,
};

/*
 * Hand every counter listed in memcg1_stats[] over from @memcg to @parent.
 *
 * Each entry is transferred individually through
 * reparent_memcg_state_local(), which subtracts the child's local value
 * from @memcg and adds it to @parent.
 */
void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	unsigned int item;

	for (item = 0; item < ARRAY_SIZE(memcg1_stats); item++) {
		const int stat_idx = memcg1_stats[item];

		reparent_memcg_state_local(memcg, parent, stat_idx);
	}
}

/*
 * Hand the per-LRU-list local lruvec counters over from @memcg to @parent,
 * calling reparent_memcg_lruvec_state_local() once for every index in
 * [0, NR_LRU_LISTS).
 *
 * NOTE(review): the LRU list index is passed straight through as the stat
 * index; this presumably relies on the LRU stat items starting at 0 in the
 * node stat enumeration -- confirm.
 */
void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	int lru;

	for (lru = 0; lru < NR_LRU_LISTS; lru++)
		reparent_memcg_lruvec_state_local(memcg, parent, lru);
}

void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	unsigned long memory, memsw;
+7 −0
Original line number Diff line number Diff line
@@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
			   unsigned long nr_memory, int nid);

void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);

void reparent_memcg_state_local(struct mem_cgroup *memcg,
				struct mem_cgroup *parent, int idx);
void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
				       struct mem_cgroup *parent, int idx);

void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
+97 −0
Original line number Diff line number Diff line
@@ -225,6 +225,34 @@ static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memc
	return objcg;
}

#ifdef CONFIG_MEMCG_V1
static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);

/*
 * Transfer the stats of @memcg that are exposed non-hierarchically to
 * @parent.
 *
 * This is a no-op when the memory controller is on the default hierarchy.
 */
static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		/* Flush @memcg first so that its stats are read accurately. */
		__mem_cgroup_flush_stats(memcg, true);

		/* All of the following counters are non-hierarchical. */
		reparent_memcg1_state_local(memcg, parent);
		reparent_memcg1_lruvec_state_local(memcg, parent);

		/*
		 * Conservatively flush @parent afterwards so that a
		 * potentially large stat update is not hidden (e.g. from
		 * callers of mem_cgroup_flush_stats_ratelimited()).
		 */
		__mem_cgroup_flush_stats(parent, true);
	}
}
#else
/*
 * !CONFIG_MEMCG_V1 stub: the non-hierarchical stats reparented by the V1
 * variant are not handled in this configuration, so there is nothing to do.
 */
static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
}
#endif

static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	spin_lock_irq(&objcg_lock);
@@ -472,6 +500,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
	return x;
}

#ifdef CONFIG_MEMCG_V1
static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
				     enum node_stat_item idx, int val);

/*
 * Move the local (non-hierarchical) lruvec counter @idx from @memcg to
 * @parent on every node.
 *
 * For each node the child's current local value is read once, then
 * subtracted from the child's per-node state and added to the parent's.
 *
 * __mod_memcg_lruvec_state() takes its delta as an int, whereas the value
 * read here is an unsigned long.  Negating the unsigned long and passing it
 * directly would silently truncate any value above INT_MAX, so the transfer
 * is split into pieces of at most INT_MAX each.
 */
void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
				       struct mem_cgroup *parent, int idx)
{
	int nid;

	for_each_node(nid) {
		struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
		struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
		unsigned long remaining = lruvec_page_state_local(child_lruvec, idx);
		struct mem_cgroup_per_node *child_pn, *parent_pn;

		child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
		parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);

		/*
		 * do/while: always issue one update pair, so the call
		 * sequence for values that fit in an int is unchanged.
		 */
		do {
			int delta = remaining > INT_MAX ? INT_MAX : (int)remaining;

			__mod_memcg_lruvec_state(child_pn, idx, -delta);
			__mod_memcg_lruvec_state(parent_pn, idx, delta);
			remaining -= delta;
		} while (remaining);
	}
}
#endif

/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
@@ -717,6 +769,42 @@ static int memcg_state_val_in_pages(int idx, int val)
		return max(val * unit / PAGE_SIZE, 1UL);
}

#ifdef CONFIG_MEMCG_V1
/*
 * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid racing
 * with the reparenting of non-hierarchical state_locals.
 *
 * On the default hierarchy @memcg is returned unchanged and no lock is
 * taken.  Otherwise the RCU read lock is acquired and the hierarchy is
 * walked upwards via parent_mem_cgroup() until a memcg that is not dying is
 * found.  In that case the RCU read lock is still held on return and is
 * released by get_non_dying_memcg_end().
 */
static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
{
	struct mem_cgroup *target = memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		rcu_read_lock();

		while (memcg_is_dying(target))
			target = parent_mem_cgroup(target);
	}

	return target;
}

/*
 * Counterpart of get_non_dying_memcg_start(): drops the RCU read lock that
 * the start helper takes when the memory controller is not on the default
 * hierarchy.  Does nothing on the default hierarchy.
 */
static inline void get_non_dying_memcg_end(void)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		rcu_read_unlock();
}
#else
/*
 * !CONFIG_MEMCG_V1 stub: returns @memcg unchanged and takes no lock.
 */
static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
{
	return memcg;
}

/*
 * !CONFIG_MEMCG_V1 stub: the matching start helper takes no lock in this
 * configuration, so there is nothing to release.
 */
static inline void get_non_dying_memcg_end(void)
{
}
#endif

static void __mod_memcg_state(struct mem_cgroup *memcg,
			      enum memcg_stat_item idx, int val)
{
@@ -768,6 +856,15 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
	return x;
}

/*
 * Move the local (non-hierarchical) memcg counter @idx from @memcg to
 * @parent.
 *
 * The child's current local value is read once, then subtracted from
 * @memcg and added to @parent.
 *
 * __mod_memcg_state() takes its delta as an int, whereas
 * memcg_page_state_local() returns an unsigned long.  Negating the unsigned
 * long and passing it directly would silently truncate any value above
 * INT_MAX, so the transfer is split into pieces of at most INT_MAX each.
 */
void reparent_memcg_state_local(struct mem_cgroup *memcg,
				struct mem_cgroup *parent, int idx)
{
	unsigned long remaining = memcg_page_state_local(memcg, idx);

	/*
	 * do/while: always issue one update pair, so the call sequence for
	 * values that fit in an int is unchanged.
	 */
	do {
		int delta = remaining > INT_MAX ? INT_MAX : (int)remaining;

		__mod_memcg_state(memcg, idx, -delta);
		__mod_memcg_state(parent, idx, delta);
		remaining -= delta;
	} while (remaining);
}
#endif

static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,