Commit 8578e0c0 authored by Kairui Song, committed by Andrew Morton
Browse files

mm, swap: use the swap table for the swap cache and switch API

Introduce basic swap table infrastructure, which for now is just a
fixed-size flat array inside each swap cluster, with access wrappers.

Each cluster contains a swap table of 512 entries.  Each table entry is an
opaque atomic long.  It can be one of 3 types: a shadow type (XA_VALUE), a
folio type (pointer), or NULL.

In this first step, it only supports storing a folio or shadow, and it is
a drop-in replacement for the current swap cache.  Convert all swap cache
users to use the new set of APIs.  Chris Li has been suggesting using a
new infrastructure for swap cache for better performance, and that idea
combines well with the swap table as the new backing structure.  Now the
lock contention range is reduced to 2M clusters, which is much smaller
than the 64M address_space.  And we can also drop the multiple
address_space design.

All the internal work is done with swap_cache_get_* helpers.  Swap cache
lookup is still lock-less like before, and the helpers' calling contexts are
the same as those of the original swap cache helpers.  They still require a
pin on the swap device to prevent the backing data from being freed.

Swap cache updates are now protected by the swap cluster lock instead of
the XArray lock.  This is mostly handled internally, but new
__swap_cache_* helpers require the caller to lock the cluster.  So, a few
new cluster access and locking helpers are also introduced.

A fully cluster-based unified swap table can be implemented on top of this
to take care of all count tracking and synchronization work, with dynamic
allocation.  It should reduce the memory usage while making the
performance even better.

Link: https://lkml.kernel.org/r/20250916160100.31545-12-ryncsn@gmail.com


Co-developed-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Suggested-by: Chris Li <chrisl@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: kernel test robot <oliver.sang@intel.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 094dc8b0
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -16232,6 +16232,7 @@ F: include/linux/swapops.h
F:	mm/page_io.c
F:	mm/swap.c
F:	mm/swap.h
F:	mm/swap_table.h
F:	mm/swap_state.c
F:	mm/swapfile.c
+0 −2
Original line number Diff line number Diff line
@@ -480,8 +480,6 @@ extern int __swap_count(swp_entry_t entry);
extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);

+6 −7
Original line number Diff line number Diff line
@@ -3720,7 +3720,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
	/* Prevent deferred_split_scan() touching ->_refcount */
	spin_lock(&ds_queue->split_queue_lock);
	if (folio_ref_freeze(folio, 1 + extra_pins)) {
		struct address_space *swap_cache = NULL;
		struct swap_cluster_info *ci = NULL;
		struct lruvec *lruvec;
		int expected_refs;

@@ -3764,8 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
				goto fail;
			}

			swap_cache = swap_address_space(folio->swap);
			xa_lock(&swap_cache->i_pages);
			ci = swap_cluster_get_and_lock(folio);
		}

		/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
@@ -3797,8 +3796,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
			 * Anonymous folio with swap cache.
			 * NOTE: shmem in swap cache is not supported yet.
			 */
			if (swap_cache) {
				__swap_cache_replace_folio(folio, new_folio);
			if (ci) {
				__swap_cache_replace_folio(ci, folio, new_folio);
				continue;
			}

@@ -3833,8 +3832,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,

		unlock_page_lruvec(lruvec);

		if (swap_cache)
			xa_unlock(&swap_cache->i_pages);
		if (ci)
			swap_cluster_unlock(ci);
	} else {
		spin_unlock(&ds_queue->split_queue_lock);
		ret = -EAGAIN;
+15 −4
Original line number Diff line number Diff line
@@ -563,6 +563,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
		struct folio *newfolio, struct folio *folio, int expected_count)
{
	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
	struct swap_cluster_info *ci = NULL;
	struct zone *oldzone, *newzone;
	int dirty;
	long nr = folio_nr_pages(folio);
@@ -591,8 +592,15 @@ static int __folio_migrate_mapping(struct address_space *mapping,
	oldzone = folio_zone(folio);
	newzone = folio_zone(newfolio);

	if (folio_test_swapcache(folio))
		ci = swap_cluster_get_and_lock_irq(folio);
	else
		xas_lock_irq(&xas);

	if (!folio_ref_freeze(folio, expected_count)) {
		if (ci)
			swap_cluster_unlock_irq(ci);
		else
			xas_unlock_irq(&xas);
		return -EAGAIN;
	}
@@ -624,7 +632,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
	}

	if (folio_test_swapcache(folio))
		__swap_cache_replace_folio(folio, newfolio);
		__swap_cache_replace_folio(ci, folio, newfolio);
	else
		xas_store(&xas, newfolio);

@@ -635,8 +643,11 @@ static int __folio_migrate_mapping(struct address_space *mapping,
	 */
	folio_ref_unfreeze(folio, expected_count - nr);

	xas_unlock(&xas);
	/* Leave irq disabled to prevent preemption while updating stats */
	if (ci)
		swap_cluster_unlock(ci);
	else
		xas_unlock(&xas);

	/*
	 * If moved to a different zone then also account
+4 −4
Original line number Diff line number Diff line
@@ -2083,9 +2083,9 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index,
				struct vm_area_struct *vma)
{
	struct swap_cluster_info *ci;
	struct folio *new, *old = *foliop;
	swp_entry_t entry = old->swap;
	struct address_space *swap_mapping = swap_address_space(entry);
	int nr_pages = folio_nr_pages(old);
	int error = 0;

@@ -2116,12 +2116,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
	new->swap = entry;
	folio_set_swapcache(new);

	xa_lock_irq(&swap_mapping->i_pages);
	__swap_cache_replace_folio(old, new);
	ci = swap_cluster_get_and_lock_irq(old);
	__swap_cache_replace_folio(ci, old, new);
	mem_cgroup_replace_folio(old, new);
	shmem_update_stats(new, nr_pages);
	shmem_update_stats(old, -nr_pages);
	xa_unlock_irq(&swap_mapping->i_pages);
	swap_cluster_unlock_irq(ci);

	folio_add_lru(new);
	*foliop = new;
Loading