Commit e244d82d authored by Lorenzo Stoakes's avatar Lorenzo Stoakes Committed by Andrew Morton
Browse files

mm/huge_memory: refactor copy_huge_pmd() non-present logic

Right now we are inconsistent in our use of thp_migration_supported():

static inline bool thp_migration_supported(void)
{
	return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}

And simply having arbitrary and ugly #ifdef
CONFIG_ARCH_ENABLE_THP_MIGRATION blocks in code.

This is exhibited in copy_huge_pmd(), which inserts a large #ifdef
CONFIG_ARCH_ENABLE_THP_MIGRATION block and an if-branch which is difficult
to follow

It's difficult to follow the logic of such a large function and the
non-present PMD logic is clearly separate as it sits in a giant if-branch.

Therefore this patch both separates out the logic and utilises
thp_migration_supported().

No functional change intended.

Link: https://lkml.kernel.org/r/6eaadc23ed512d370ede65561e34e96241c54b9d.1762812360.git.lorenzo.stoakes@oracle.com


Signed-off-by: default avatarLorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: default avatarVlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent aa62204c
Loading
Loading
Loading
Loading
+59 −50
Original line number Diff line number Diff line
@@ -1699,54 +1699,14 @@ void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		update_mmu_cache_pmd(vma, addr, pmd);
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
static void copy_huge_non_present_pmd(
		struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		pmd_t pmd, pgtable_t pgtable)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	struct folio *src_folio;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	pmd = pmdp_get_lockless(src_pmd);
	if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
		     !is_huge_zero_pmd(pmd))) {
		dst_ptl = pmd_lock(dst_mm, dst_pmd);
		src_ptl = pmd_lockptr(src_mm, src_pmd);
		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
		/*
		 * No need to recheck the pmd, it can't change with write
		 * mmap lock held here.
		 *
		 * Meanwhile, making sure it's not a CoW VMA with writable
		 * mapping, otherwise it means either the anon page wrongly
		 * applied special bit, or we made the PRIVATE mapping be
		 * able to wrongly write to the backend MMIO.
		 */
		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
		goto set_pmd;
	}

	/* Skip if can be re-fill on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
	swp_entry_t entry = pmd_to_swp_entry(pmd);
	struct folio *src_folio;

	VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd));

@@ -1793,10 +1753,59 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_swp_clear_uffd_wp(pmd);
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	struct folio *src_folio;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	pmd = pmdp_get_lockless(src_pmd);
	if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
		     !is_huge_zero_pmd(pmd))) {
		dst_ptl = pmd_lock(dst_mm, dst_pmd);
		src_ptl = pmd_lockptr(src_mm, src_pmd);
		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
		/*
		 * No need to recheck the pmd, it can't change with write
		 * mmap lock held here.
		 *
		 * Meanwhile, making sure it's not a CoW VMA with writable
		 * mapping, otherwise it means either the anon page wrongly
		 * applied special bit, or we made the PRIVATE mapping be
		 * able to wrongly write to the backend MMIO.
		 */
		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
		goto set_pmd;
	}

	/* Skip if can be re-fill on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

	if (unlikely(thp_migration_supported() && is_swap_pmd(pmd))) {
		copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
					  dst_vma, src_vma, pmd, pgtable);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);