Commit b487a2da authored by Kairui Song, committed by Andrew Morton
Browse files

mm, swap: simplify folio swap allocation

With the slot cache gone, clean up the allocation helpers even more.
folio_alloc_swap() will be the only entry point for allocating swap space
and adding the folio to the swap cache (except for suspend), making it the
opposite of folio_free_swap().

Link: https://lkml.kernel.org/r/20250313165935.63303-8-ryncsn@gmail.com


Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 0ff67f99
Loading
Loading
Loading
Loading
+3 −5
Original line number Diff line number Diff line
@@ -478,7 +478,7 @@ static inline long get_nr_swap_pages(void)
}

extern void si_swapinfo(struct sysinfo *);
swp_entry_t folio_alloc_swap(struct folio *folio);
int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
@@ -586,11 +586,9 @@ static inline int swp_swapcount(swp_entry_t entry)
	return 0;
}

static inline swp_entry_t folio_alloc_swap(struct folio *folio)
static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask)
{
	swp_entry_t entry;
	entry.val = 0;
	return entry;
	return -EINVAL;
}

static inline bool folio_free_swap(struct folio *folio)
+6 −15
Original line number Diff line number Diff line
@@ -1533,7 +1533,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
	struct inode *inode = mapping->host;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	swp_entry_t swap;
	pgoff_t index;
	int nr_pages;
	bool split = false;
@@ -1615,14 +1614,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
		folio_mark_uptodate(folio);
	}

	swap = folio_alloc_swap(folio);
	if (!swap.val) {
		if (nr_pages > 1)
			goto try_split;

		goto redirty;
	}

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now before the folio is
@@ -1635,20 +1626,20 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
	if (list_empty(&info->swaplist))
		list_add(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(folio, swap,
			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
			NULL) == 0) {
	if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
		shmem_recalc_inode(inode, 0, nr_pages);
		swap_shmem_alloc(swap, nr_pages);
		shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
		swap_shmem_alloc(folio->swap, nr_pages);
		shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(folio_mapped(folio));
		return swap_writepage(&folio->page, wbc);
	}

	list_del_init(&info->swaplist);
	mutex_unlock(&shmem_swaplist_mutex);
	put_swap_folio(folio, swap);
	if (nr_pages > 1)
		goto try_split;
redirty:
	folio_mark_dirty(folio);
	if (wbc->for_reclaim)
+0 −6
Original line number Diff line number Diff line
@@ -50,7 +50,6 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry)
}

void show_swap_cache_info(void);
bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
		      gfp_t gfp, void **shadowp);
@@ -163,11 +162,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
	return filemap_get_folio(mapping, index);
}

static inline bool add_to_swap(struct folio *folio)
{
	return false;
}

static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	return NULL;
+0 −57
Original line number Diff line number Diff line
@@ -166,63 +166,6 @@ void __delete_from_swap_cache(struct folio *folio,
	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}

/**
 * add_to_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
bool add_to_swap(struct folio *folio)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	entry = folio_alloc_swap(folio);
	if (!entry.val)
		return false;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(folio, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		goto fail;
	/*
	 * Normally the folio will be dirtied in unmap because its
	 * pte should be dirty. A special case is MADV_FREE page. The
	 * page's pte could have dirty bit cleared but the folio's
	 * SwapBacked flag is still set because clearing the dirty bit
	 * and SwapBacked flag has no lock protected. For such folio,
	 * unmap will not set dirty bit for it, so folio reclaim will
	 * not write the folio out. This can cause data corruption when
	 * the folio is swapped in later. Always setting the dirty flag
	 * for the folio solves the problem.
	 */
	folio_mark_dirty(folio);

	return true;

fail:
	put_swap_folio(folio, entry);
	return false;
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache and locked.
+72 −39
Original line number Diff line number Diff line
@@ -1176,8 +1176,7 @@ static bool get_swap_device_info(struct swap_info_struct *si)
 * Fast path try to get swap entries with specified order from current
 * CPU's swap entry pool (a cluster).
 */
static int swap_alloc_fast(swp_entry_t *entry,
			   unsigned char usage,
static bool swap_alloc_fast(swp_entry_t *entry,
			    int order)
{
	struct swap_cluster_info *ci;
@@ -1197,7 +1196,7 @@ static int swap_alloc_fast(swp_entry_t *entry,
	if (cluster_is_usable(ci, order)) {
		if (cluster_is_empty(ci))
			offset = cluster_offset(si, ci);
		found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
		found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
		if (found)
			*entry = swp_entry(si->type, found);
	} else {
@@ -1208,47 +1207,30 @@ static int swap_alloc_fast(swp_entry_t *entry,
	return !!found;
}

swp_entry_t folio_alloc_swap(struct folio *folio)
/* Rotate the device and switch to a new cluster */
static bool swap_alloc_slow(swp_entry_t *entry,
			    int order)
{
	unsigned int order = folio_order(folio);
	unsigned int size = 1 << order;
	struct swap_info_struct *si, *next;
	swp_entry_t entry = {};
	unsigned long offset;
	int node;
	unsigned long offset;
	struct swap_info_struct *si, *next;

	if (order) {
		/*
		 * Should not even be attempting large allocations when huge
		 * page swap is disabled. Warn and fail the allocation.
		 */
		if (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER) {
			VM_WARN_ON_ONCE(1);
			return entry;
		}
	}

	/* Fast path using percpu cluster */
	local_lock(&percpu_swap_cluster.lock);
	if (swap_alloc_fast(&entry, SWAP_HAS_CACHE, order))
		goto out;

	/* Rotate the device and switch to a new cluster */
	node = numa_node_id();
	spin_lock(&swap_avail_lock);
start_over:
	node = numa_node_id();
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
		/* Rotate the device and switch to a new cluster */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		if (get_swap_device_info(si)) {
			offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
			put_swap_device(si);
			if (offset) {
				entry = swp_entry(si->type, offset);
				goto out;
				*entry = swp_entry(si->type, offset);
				return true;
			}
			if (order)
				goto out;
				return false;
		}

		spin_lock(&swap_avail_lock);
@@ -1267,16 +1249,67 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
			goto start_over;
	}
	spin_unlock(&swap_avail_lock);
out:
	return false;
}

/**
 * folio_alloc_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 * @gfp: gfp mask for shadow nodes
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: 0 if the folio was added to the swap cache, or a negative errno on failure.
 */
int folio_alloc_swap(struct folio *folio, gfp_t gfp)
{
	unsigned int order = folio_order(folio);
	unsigned int size = 1 << order;
	swp_entry_t entry = {};

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	/*
	 * Should not even be attempting large allocations when huge
	 * page swap is disabled. Warn and fail the allocation.
	 */
	if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
		VM_WARN_ON_ONCE(1);
		return -EINVAL;
	}

	local_lock(&percpu_swap_cluster.lock);
	if (!swap_alloc_fast(&entry, order))
		swap_alloc_slow(&entry, order);
	local_unlock(&percpu_swap_cluster.lock);

	/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
	if (mem_cgroup_try_charge_swap(folio, entry)) {
		put_swap_folio(folio, entry);
		entry.val = 0;
	}
	if (entry.val)
	if (mem_cgroup_try_charge_swap(folio, entry))
		goto out_free;

	if (!entry.val)
		return -ENOMEM;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
		goto out_free;

	atomic_long_sub(size, &nr_swap_pages);
	return entry;
	return 0;

out_free:
	put_swap_folio(folio, entry);
	return -ENOMEM;
}

static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
Loading