Commit 38448181, authored by Johannes Weiner, committed by Andrew Morton
Browse files

mm: vmscan: restore high-cpu watermark safety in kswapd

Vlastimil points out that commit a211c655 ("mm: page_alloc:
defrag_mode kswapd/kcompactd watermarks") switched kswapd from
zone_watermark_ok_safe() to the standard, percpu-cached version of reading
free pages, thus dropping the watermark safety precautions for systems
with high CPU counts (e.g. >212 CPUs on 64G).  Restore them.

Since zone_watermark_ok_safe() is no longer the right interface, and this
was the last caller of the function anyway, open-code the
zone_page_state_snapshot() conditional and delete the function.

Link: https://lkml.kernel.org/r/20250416135142.778933-2-hannes@cmpxchg.org


Fixes: a211c655 ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 2db93a89
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -1502,8 +1502,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
bool zone_watermark_ok(struct zone *z, unsigned int order,
		unsigned long mark, int highest_zoneidx,
		unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
		unsigned long mark, int highest_zoneidx);
/*
 * Memory initialization context, use to differentiate memory added by
 * the platform statically or via memory hotplug interface.
+0 −12
Original line number Diff line number Diff line
@@ -3470,18 +3470,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
	return false;
}

/*
 * Watermark check for zone @z that re-reads the free page count when the
 * first reading falls below the zone's percpu_drift_mark.
 *
 * @z:               zone to check
 * @order:           forwarded unchanged to __zone_watermark_ok()
 * @mark:            forwarded unchanged to __zone_watermark_ok()
 * @highest_zoneidx: forwarded unchanged to __zone_watermark_ok()
 *
 * The NR_FREE_PAGES value is first read with zone_page_state().  If
 * z->percpu_drift_mark is non-zero and the value read is below it, the
 * count is read again with zone_page_state_snapshot() and that second
 * value is used instead.
 *
 * Returns the result of __zone_watermark_ok() evaluated with alloc_flags
 * of 0 and the free page count chosen above.
 */
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
			unsigned long mark, int highest_zoneidx)
{
	long free_pages = zone_page_state(z, NR_FREE_PAGES);

	/*
	 * A zero percpu_drift_mark disables the re-read entirely.
	 * NOTE(review): presumably the snapshot variant folds in pending
	 * per-cpu deltas for an accurate value -- confirm against vmstat.
	 */
	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);

	return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
								free_pages);
}

#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;

+19 −2
Original line number Diff line number Diff line
@@ -6736,6 +6736,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
	 * meet watermarks.
	 */
	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
		enum zone_stat_item item;
		unsigned long free_pages;

		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
@@ -6748,9 +6749,25 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
		 * blocks to avoid polluting allocator fallbacks.
		 */
		if (defrag_mode)
			free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
			item = NR_FREE_PAGES_BLOCKS;
		else
			free_pages = zone_page_state(zone, NR_FREE_PAGES);
			item = NR_FREE_PAGES;

		/*
		 * When there is a high number of CPUs in the system,
		 * the cumulative error from the vmstat per-cpu cache
		 * can blur the line between the watermarks. In that
		 * case, be safe and get an accurate snapshot.
		 *
		 * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
		 * pageblock_nr_pages, while the vmstat pcp threshold
		 * is limited to 125. On many configurations that
		 * counter won't actually be per-cpu cached. But keep
		 * things simple for now; revisit when somebody cares.
		 */
		free_pages = zone_page_state(zone, item);
		if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
			free_pages = zone_page_state_snapshot(zone, item);

		if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
					0, free_pages))