Commit f3cd4ab0 authored by Hugh Dickins, committed by Andrew Morton
Browse files

mm/madvise: clean up pte_offset_map_lock() scans

Came here to make madvise's several pte_offset_map_lock() scans advance to
next extent on failure, and remove superfluous pmd_trans_unstable() and
pmd_none_or_trans_huge_or_clear_bad() calls.  But also did some nearby
cleanup.

swapin_walk_pmd_entry(): don't name an address "index"; don't drop the
lock after every pte, only when calling out to read_swap_cache_async().

madvise_cold_or_pageout_pte_range() and madvise_free_pte_range(): prefer
"start_pte" for pointer, orig_pte usually denotes a saved pte value; leave
lazy MMU mode before unlocking; merge the success and failure paths after
split_folio().

Link: https://lkml.kernel.org/r/cc4d9a88-9da6-362-50d9-6735c2b125c6@google.com


Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zack Rusin <zackr@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent a5be621e
Loading
Loading
Loading
Loading
+68 −54
Original line number Diff line number Diff line
@@ -191,34 +191,40 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	unsigned long index;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;
		pte_t *ptep;

		ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
		pte = *ptep;
		pte_unmap_unlock(ptep, ptl);
		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = *ptep;
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false, &splug);
					     vma, addr, false, &splug);
		if (page)
			put_page(page);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

@@ -340,7 +346,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
@@ -422,11 +428,11 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
	}

regular_folio:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
@@ -447,25 +453,28 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
		 * are sure it's worth. Split it if we are only owner.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_mapcount(folio) != 1)
				break;
			if (pageout_anon_only_filter && !folio_test_anon(folio))
				break;
			folio_get(folio);
			if (!folio_trylock(folio)) {
				folio_put(folio);
			if (!folio_trylock(folio))
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_folio(folio)) {
			folio_get(folio);
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (err)
				break;
			}
			folio_unlock(folio);
			folio_put(folio);
			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			pte--;
			addr -= PAGE_SIZE;
			continue;
@@ -510,8 +519,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
			folio_deactivate(folio);
	}

	if (start_pte) {
		arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();
@@ -612,7 +623,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;
@@ -620,13 +631,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -664,23 +674,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
		 * deactivate all pages.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_mapcount(folio) != 1)
				goto out;
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			if (!folio_trylock(folio)) {
				folio_put(folio);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_folio(folio)) {
				folio_unlock(folio);
				folio_put(folio);
				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			pte--;
			addr -= PAGE_SIZE;
			continue;
@@ -725,17 +738,18 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
		}
		folio_mark_lazyfree(folio);
	}
out:

	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();
next:

	return 0;
}