Commit eb1521da authored by Suren Baghdasaryan's avatar Suren Baghdasaryan Committed by Andrew Morton
Browse files

userfaultfd: handle zeropage moves by UFFDIO_MOVE

Current implementation of UFFDIO_MOVE fails to move zeropages and returns
EBUSY when it encounters one.  We can handle them by mapping a zeropage at
the destination and clearing the mapping at the source.  This is done both
for ordinary and for huge zeropages.

Link: https://lkml.kernel.org/r/20240131175618.2417291-1-surenb@google.com


Signed-off-by: default avatarSuren Baghdasaryan <surenb@google.com>
Reported-by: default avatarkernel test robot <lkp@intel.com>
Reported-by: default avatarDan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/r/202401300107.U8iMAkTl-lkp@intel.com/


Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: ZhangPeng <zhangpeng362@huawei.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent e777ae44
Loading
Loading
Loading
Loading
+61 −44
Original line number Diff line number Diff line
@@ -2200,6 +2200,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
	}

	src_page = pmd_page(src_pmdval);

	if (!is_huge_zero_pmd(src_pmdval)) {
		if (unlikely(!PageAnonExclusive(src_page))) {
			spin_unlock(src_ptl);
			return -EBUSY;
@@ -2207,6 +2209,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm

		src_folio = page_folio(src_page);
		folio_get(src_folio);
	} else
		src_folio = NULL;

	spin_unlock(src_ptl);

	flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
@@ -2214,6 +2219,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
				src_addr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (src_folio) {
		folio_lock(src_folio);

		/*
@@ -2227,6 +2233,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
			goto unlock_folio;
		}
		anon_vma_lock_write(src_anon_vma);
	} else
		src_anon_vma = NULL;

	dst_ptl = pmd_lockptr(mm, dst_pmd);
	double_pt_lock(src_ptl, dst_ptl);
@@ -2235,6 +2243,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
		err = -EAGAIN;
		goto unlock_ptls;
	}
	if (src_folio) {
		if (folio_maybe_dma_pinned(src_folio) ||
		    !PageAnonExclusive(&src_folio->page)) {
			err = -EBUSY;
@@ -2261,18 +2270,26 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
		_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
		/* Follow mremap() behavior and treat the entry dirty after the move */
		_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
	} else {
		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
		_dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
	}
	set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);

	src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
	pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
	double_pt_unlock(src_ptl, dst_ptl);
	if (src_anon_vma) {
		anon_vma_unlock_write(src_anon_vma);
		put_anon_vma(src_anon_vma);
	}
unlock_folio:
	/* unblock rmap walks */
	if (src_folio)
		folio_unlock(src_folio);
	mmu_notifier_invalidate_range_end(&range);
	if (src_folio)
		folio_put(src_folio);
	return err;
}
+37 −7
Original line number Diff line number Diff line
@@ -959,6 +959,33 @@ static int move_swap_pte(struct mm_struct *mm,
	return 0;
}

static int move_zeropage_pte(struct mm_struct *mm,
			     struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma,
			     unsigned long dst_addr, unsigned long src_addr,
			     pte_t *dst_pte, pte_t *src_pte,
			     pte_t orig_dst_pte, pte_t orig_src_pte,
			     spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
	pte_t zero_pte;

	double_pt_lock(dst_ptl, src_ptl);
	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ptep_clear_flush(src_vma, src_addr, src_pte);
	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return 0;
}


/*
 * The mmap_lock for reading is held by the caller. Just move the page
 * from src_pmd to dst_pmd if possible, and return true if succeeded
@@ -1041,6 +1068,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
	}

	if (pte_present(orig_src_pte)) {
		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
			err = move_zeropage_pte(mm, dst_vma, src_vma,
					       dst_addr, src_addr, dst_pte, src_pte,
					       orig_dst_pte, orig_src_pte,
					       dst_ptl, src_ptl);
			goto out;
		}

		/*
		 * Pin and lock both source folio and anon_vma. Since we are in
		 * RCU read section, we can't block, so on contention have to
@@ -1404,19 +1439,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
				err = -ENOENT;
				break;
			}
			/* Avoid moving zeropages for now */
			if (is_huge_zero_pmd(*src_pmd)) {
				spin_unlock(ptl);
				err = -EBUSY;
				break;
			}

			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));

				if (!folio || !PageAnonExclusive(&folio->page)) {
				if (!folio || (!is_huge_zero_page(&folio->page) &&
					       !PageAnonExclusive(&folio->page))) {
					spin_unlock(ptl);
					err = -EBUSY;
					break;