Commit 661383c6 authored by Kairui Song's avatar Kairui Song Committed by Andrew Morton
Browse files

mm: swap: relaim the cached parts that got scanned

This commit implements reclaim during scan for cluster allocator.

Cluster scanning were unable to reuse SWAP_HAS_CACHE slots, which could
result in low allocation success rate or early OOM.

So to ensure maximum allocation success rate, integrate reclaiming with
scanning.  If found a range of suitable swap slots but fragmented due to
HAS_CACHE, just try to reclaim the slots.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org


Signed-off-by: default avatarKairui Song <kasong@tencent.com>
Reported-by: default avatarBarry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent 477cb7ba
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -301,6 +301,7 @@ struct swap_info_struct {
					/* list of cluster that contains at least one free slot */
	struct list_head frag_clusters[SWAP_NR_ORDERS];
					/* list of cluster that are fragmented or contented */
	unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
	unsigned int lowest_bit;	/* index of first free in swap_map */
	unsigned int highest_bit;	/* index of last free in swap_map */
	unsigned int pages;		/* total of usable pages of swap */
+109 −31
Original line number Diff line number Diff line
@@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
	VM_BUG_ON(ci->count != 0);
	lockdep_assert_held(&si->lock);
	lockdep_assert_held(&ci->lock);

	if (ci->flags & CLUSTER_FLAG_FRAG)
		si->frag_cluster_nr[ci->order]--;

	/*
	 * If the swap is discardable, prepare discard the cluster
	 * instead of free it immediately. The cluster will be freed
@@ -572,29 +576,82 @@ static void dec_cluster_info_page(struct swap_info_struct *p,

	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
		VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
		if (ci->flags & CLUSTER_FLAG_FRAG)
		if (ci->flags & CLUSTER_FLAG_FRAG) {
			p->frag_cluster_nr[ci->order]--;
			list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]);
		else
		} else {
			list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
		}
		ci->flags = CLUSTER_FLAG_NONFULL;
	}
}

static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
				      unsigned int nr_pages)
static bool cluster_reclaim_range(struct swap_info_struct *si,
				  struct swap_cluster_info *ci,
				  unsigned long start, unsigned long end)
{
	unsigned char *p = si->swap_map + start;
	unsigned char *end = p + nr_pages;
	unsigned char *map = si->swap_map;
	unsigned long offset;

	spin_unlock(&ci->lock);
	spin_unlock(&si->lock);

	for (offset = start; offset < end; offset++) {
		switch (READ_ONCE(map[offset])) {
		case 0:
			continue;
		case SWAP_HAS_CACHE:
			if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
				continue;
			goto out;
		default:
			goto out;
		}
	}
out:
	spin_lock(&si->lock);
	spin_lock(&ci->lock);

	while (p < end)
		if (*p++)
	/*
	 * Recheck the range no matter reclaim succeeded or not, the slot
	 * could have been be freed while we are not holding the lock.
	 */
	for (offset = start; offset < end; offset++)
		if (READ_ONCE(map[offset]))
			return false;

	return true;
}

static bool cluster_scan_range(struct swap_info_struct *si,
			       struct swap_cluster_info *ci,
			       unsigned long start, unsigned int nr_pages)
{
	unsigned long offset, end = start + nr_pages;
	unsigned char *map = si->swap_map;
	bool need_reclaim = false;

static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
	for (offset = start; offset < end; offset++) {
		switch (READ_ONCE(map[offset])) {
		case 0:
			continue;
		case SWAP_HAS_CACHE:
			if (!vm_swap_full())
				return false;
			need_reclaim = true;
			continue;
		default:
			return false;
		}
	}

	if (need_reclaim)
		return cluster_reclaim_range(si, ci, start, end);

	return true;
}

static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
				unsigned int start, unsigned char usage,
				unsigned int order)
{
@@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_
	if (ci->count == SWAPFILE_CLUSTER) {
		VM_BUG_ON(!(ci->flags &
			  (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
		if (ci->flags & CLUSTER_FLAG_FRAG)
			si->frag_cluster_nr[ci->order]--;
		list_del(&ci->list);
		ci->flags = 0;
	}
@@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
	}

	while (offset <= end) {
		if (cluster_scan_range(si, offset, nr_pages)) {
		if (cluster_scan_range(si, ci, offset, nr_pages)) {
			cluster_alloc_range(si, ci, offset, usage, order);
			*foundp = offset;
			if (ci->count == SWAPFILE_CLUSTER) {
@@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
					      unsigned char usage)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci, *n;
	struct swap_cluster_info *ci;
	unsigned int offset, found = 0;
	LIST_HEAD(fraged);

new_cluster:
	lockdep_assert_held(&si->lock);
@@ -690,25 +748,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
	}

	if (order < PMD_ORDER) {
		list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
			list_move_tail(&ci->list, &fraged);
		unsigned int frags = 0;

		while (!list_empty(&si->nonfull_clusters[order])) {
			ci = list_first_entry(&si->nonfull_clusters[order],
					      struct swap_cluster_info, list);
			list_move_tail(&ci->list, &si->frag_clusters[order]);
			ci->flags = CLUSTER_FLAG_FRAG;
			si->frag_cluster_nr[order]++;
			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
							 &found, order, usage);
			frags++;
			if (found)
				break;
		}

		if (!found) {
			list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) {
			/*
			 * Nonfull clusters are moved to frag tail if we reached
			 * here, count them too, don't over scan the frag list.
			 */
			while (frags < si->frag_cluster_nr[order]) {
				ci = list_first_entry(&si->frag_clusters[order],
						      struct swap_cluster_info, list);
				/*
				 * Rotate the frag list to iterate, they were all failing
				 * high order allocation or moved here due to per-CPU usage,
				 * this help keeping usable cluster ahead.
				 */
				list_move_tail(&ci->list, &si->frag_clusters[order]);
				offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
								 &found, order, usage);
				frags++;
				if (found)
					break;
			}
		}

		list_splice_tail(&fraged, &si->frag_clusters[order]);
	}

	if (found)
@@ -729,25 +804,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o

	/* Order 0 stealing from higher order */
	for (int o = 1; o < SWAP_NR_ORDERS; o++) {
		if (!list_empty(&si->frag_clusters[o])) {
		/*
		 * Clusters here have at least one usable slots and can't fail order 0
		 * allocation, but reclaim may drop si->lock and race with another user.
		 */
		while (!list_empty(&si->frag_clusters[o])) {
			ci = list_first_entry(&si->frag_clusters[o],
					      struct swap_cluster_info, list);
			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found,
							 0, usage);
			VM_BUG_ON(!found);
			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
							 &found, 0, usage);
			if (found)
				goto done;
		}

		if (!list_empty(&si->nonfull_clusters[o])) {
			ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
					      list);
		while (!list_empty(&si->nonfull_clusters[o])) {
			ci = list_first_entry(&si->nonfull_clusters[o],
					      struct swap_cluster_info, list);
			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
							 &found, 0, usage);
			VM_BUG_ON(!found);
			if (found)
				goto done;
		}
	}

done:
	cluster->next[order] = offset;
	return found;
@@ -3042,6 +3120,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
	for (i = 0; i < SWAP_NR_ORDERS; i++) {
		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
		INIT_LIST_HEAD(&p->frag_clusters[i]);
		p->frag_cluster_nr[i] = 0;
	}

	for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3085,7 +3164,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
	if (!cluster_info)
		return nr_extents;


	/*
	 * Reduce false cache line sharing between cluster_info and
	 * sharing same address space.