Commit 4b34f1d8 authored by Kairui Song, committed by Andrew Morton
Browse files

mm, swap: free the swap cache after folio is mapped

Currently, we remove the folio from the swap cache and free the swap cache
before mapping the PTE.  To reduce repeated faults due to parallel swapins
of the same PTE, change it to remove the folio from the swap cache after
it is mapped.  So new faults from the swap PTE will be much more likely to
see the folio in the swap cache and wait on it.

This does not eliminate all swapin races: an ongoing swapin fault may
still see an empty swap cache.  That's harmless, as the PTE is changed
before the swap cache is cleared, so the fault will just return and not
trigger any repeated faults.  The change still reduces the chance of
repeated faults from parallel swapins.

Link: https://lkml.kernel.org/r/20251220-swap-table-p2-v5-6-8862a265a033@tencent.com


Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Rafael J. Wysocki (Intel) <rafael@kernel.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Cc: Deepanshu Kartikey <kartikey406@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kairui Song <ryncsn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 6aeec9a1
Loading
Loading
Loading
Loading
+11 −10
Original line number Diff line number Diff line
@@ -4365,6 +4365,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
static inline bool should_try_to_free_swap(struct swap_info_struct *si,
					   struct folio *folio,
					   struct vm_area_struct *vma,
					   unsigned int extra_refs,
					   unsigned int fault_flags)
{
	if (!folio_test_swapcache(folio))
@@ -4387,7 +4388,7 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
	 * reference only in case it's likely that we'll be the exclusive user.
	 */
	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
		folio_ref_count(folio) == (1 + folio_nr_pages(folio));
		folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
}

static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
@@ -4939,15 +4940,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
	 */
	arch_swap_restore(folio_swap(entry, folio), folio);

	/*
	 * Remove the swap entry and conditionally try to free up the swapcache.
	 * We're already holding a reference on the page but haven't mapped it
	 * yet.
	 */
	swap_free_nr(entry, nr_pages);
	if (should_try_to_free_swap(si, folio, vma, vmf->flags))
		folio_free_swap(folio);

	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
	pte = mk_pte(page, vma->vm_page_prot);
@@ -5001,6 +4993,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
	arch_do_swap_page_nr(vma->vm_mm, vma, address,
			pte, pte, nr_pages);

	/*
	 * Remove the swap entry and conditionally try to free up the swapcache.
	 * Do it after mapping, so raced page faults will likely see the folio
	 * in swap cache and wait on the folio lock.
	 */
	swap_free_nr(entry, nr_pages);
	if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags))
		folio_free_swap(folio);

	folio_unlock(folio);
	if (unlikely(folio != swapcache)) {
		/*