Commit 52526ca7, authored by Muhammad Usama Anjum, committed by Andrew Morton
Browse files

fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs

The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally
clear the info about page table entries. The following operations are
supported in this IOCTL:
- Scan the address range and get the memory ranges matching the provided
  criteria. This is performed when the output buffer is specified.
- Write-protect the pages. The PM_SCAN_WP_MATCHING flag is used to
  write-protect the pages of interest. The PM_SCAN_CHECK_WPASYNC flag aborts
  the operation if non-Async Write Protected pages are found.
  PM_SCAN_WP_MATCHING can be used with or without PM_SCAN_CHECK_WPASYNC.
- Both of those operations can be combined into one atomic operation where
  we can get and write protect the pages as well.

Following flags about pages are currently supported:
- PAGE_IS_WPALLOWED - Page has async-write-protection enabled
- PAGE_IS_WRITTEN - Page has been written to from the time it was write protected
- PAGE_IS_FILE - Page is file backed
- PAGE_IS_PRESENT - Page is present in the memory
- PAGE_IS_SWAPPED - Page is swapped out
- PAGE_IS_PFNZERO - Page has zero PFN
- PAGE_IS_HUGE - Page is THP or Hugetlb backed

This IOCTL can be extended to get information about more PTE bits. The
entire address range passed by user [start, end) is scanned until either
the user provided buffer is full or max_pages have been found.

[akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"]
[akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning]
[arnd@arndb.de: hide unused pagemap_scan_backout_range() function]
  Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org
[sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"]
  Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au
Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com


Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent d61ea1cb
Loading
Loading
Loading
Loading
+692 −0
Original line number Diff line number Diff line
@@ -20,6 +20,8 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
#include <linux/minmax.h>
#include <linux/overflow.h>

#include <asm/elf.h>
#include <asm/tlb.h>
@@ -1761,11 +1763,701 @@ static int pagemap_release(struct inode *inode, struct file *file)
	return 0;
}

/*
 * Every PAGE_IS_* category bit that may be set in the category and return
 * masks of a PAGEMAP_SCAN request; pagemap_scan_get_args() rejects any
 * other bit.
 */
#define PM_SCAN_CATEGORIES	(PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN |	\
				 PAGE_IS_FILE |	PAGE_IS_PRESENT |	\
				 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |	\
				 PAGE_IS_HUGE)
/* Every PM_SCAN_* flag accepted in pm_scan_arg.flags. */
#define PM_SCAN_FLAGS		(PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)

/*
 * State of one PAGEMAP_SCAN request, handed to the page-walk callbacks
 * through mm_walk->private.
 */
struct pagemap_scan_private {
	/* Kernel copy of the user's request; walk_end is updated during the scan. */
	struct pm_scan_arg arg;
	/*
	 * masks_of_interest: union of category_mask, category_anyof_mask and
	 * return_mask, used to skip category lookups nobody asked for.
	 * cur_vma_category: categories of the VMA currently being walked,
	 * set by pagemap_scan_test_walk().
	 */
	unsigned long masks_of_interest, cur_vma_category;
	/* Kernel bounce buffer for output regions; NULL when vec_len is 0. */
	struct page_region *vec_buf;
	/*
	 * vec_buf_len: number of usable entries in vec_buf.
	 * vec_buf_index: index of the region currently being filled.
	 * found_pages: pages recorded so far, checked against arg.max_pages.
	 */
	unsigned long vec_buf_len, vec_buf_index, found_pages;
	/* Next user-space slot that a flush of vec_buf will write to. */
	struct page_region __user *vec_out;
};

/*
 * Derive the PAGE_IS_* categories described by a single PTE.
 *
 * @p:    scan state; only masks_of_interest is consulted, to skip the
 *        file-backed lookup when no mask asks for PAGE_IS_FILE
 * @vma:  VMA the PTE belongs to
 * @addr: virtual address mapped by the PTE
 * @pte:  snapshot of the PTE value
 *
 * Returns 0 for an entry that is neither present nor a swap entry.
 */
static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
					   struct vm_area_struct *vma,
					   unsigned long addr, pte_t pte)
{
	const bool want_file = p->masks_of_interest & PAGE_IS_FILE;
	unsigned long cat;

	if (pte_present(pte)) {
		cat = PAGE_IS_PRESENT;

		/* A cleared uffd-wp bit is reported as "written". */
		if (!pte_uffd_wp(pte))
			cat |= PAGE_IS_WRITTEN;

		if (want_file) {
			struct page *pg = vm_normal_page(vma, addr, pte);

			if (pg && !PageAnon(pg))
				cat |= PAGE_IS_FILE;
		}

		if (is_zero_pfn(pte_pfn(pte)))
			cat |= PAGE_IS_PFNZERO;

		return cat;
	}

	if (!is_swap_pte(pte))
		return 0;

	cat = PAGE_IS_SWAPPED;
	if (!pte_swp_uffd_wp_any(pte))
		cat |= PAGE_IS_WRITTEN;

	if (want_file) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (is_pfn_swap_entry(entry) &&
		    !PageAnon(pfn_swap_entry_to_page(entry)))
			cat |= PAGE_IS_FILE;
	}

	return cat;
}

/*
 * Mark one PTE as userfaultfd write-protected.
 *
 * @vma:  VMA the PTE belongs to
 * @addr: virtual address mapped by the PTE
 * @pte:  pointer to the PTE
 *
 * Present entries get the uffd-wp bit, swap entries get the swap variant
 * of the bit, and any other entry is replaced by a uffd-wp PTE marker.
 */
static void make_uffd_wp_pte(struct vm_area_struct *vma,
			     unsigned long addr, pte_t *pte)
{
	pte_t ptent = ptep_get(pte);

	if (pte_present(ptent)) {
		pte_t old_pte;

		/*
		 * Build the new entry from the value returned by
		 * ptep_modify_prot_start(), not from the earlier ptep_get()
		 * snapshot: the entry may have gained young/dirty bits in
		 * between, and those would be lost otherwise.
		 */
		old_pte = ptep_modify_prot_start(vma, addr, pte);
		ptent = pte_mkuffd_wp(old_pte);
		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_mkuffd_wp(ptent);
		set_pte_at(vma->vm_mm, addr, pte, ptent);
	} else {
		set_pte_at(vma->vm_mm, addr, pte,
			   make_pte_marker(PTE_MARKER_UFFD_WP));
	}
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Derive the PAGE_IS_* categories described by a huge PMD.
 *
 * @p:    scan state; only masks_of_interest is consulted, to skip the
 *        file-backed lookup when no mask asks for PAGE_IS_FILE
 * @vma:  VMA the PMD belongs to
 * @addr: virtual address mapped by the PMD
 * @pmd:  snapshot of the PMD value
 *
 * PAGE_IS_HUGE is always set in the result.
 */
static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
					  struct vm_area_struct *vma,
					  unsigned long addr, pmd_t pmd)
{
	unsigned long categories = PAGE_IS_HUGE;

	if (pmd_present(pmd)) {
		struct page *page;

		categories |= PAGE_IS_PRESENT;
		if (!pmd_uffd_wp(pmd))
			categories |= PAGE_IS_WRITTEN;

		if (p->masks_of_interest & PAGE_IS_FILE) {
			page = vm_normal_page_pmd(vma, addr, pmd);
			if (page && !PageAnon(page))
				categories |= PAGE_IS_FILE;
		}

		/*
		 * The huge zero page has its own PFN, which is_zero_pfn()
		 * does not recognise; ask the THP helper instead.
		 */
		if (is_huge_zero_pmd(pmd))
			categories |= PAGE_IS_PFNZERO;
	} else if (is_swap_pmd(pmd)) {
		swp_entry_t swp;

		categories |= PAGE_IS_SWAPPED;
		if (!pmd_swp_uffd_wp(pmd))
			categories |= PAGE_IS_WRITTEN;

		if (p->masks_of_interest & PAGE_IS_FILE) {
			swp = pmd_to_swp_entry(pmd);
			if (is_pfn_swap_entry(swp) &&
			    !PageAnon(pfn_swap_entry_to_page(swp)))
				categories |= PAGE_IS_FILE;
		}
	}

	return categories;
}

/*
 * Mark a huge PMD as userfaultfd write-protected.
 *
 * A present PMD is invalidated first and rewritten with the uffd-wp bit;
 * a migration entry gets the swap variant of the bit. Any other entry is
 * left untouched.
 */
static void make_uffd_wp_pmd(struct vm_area_struct *vma,
			     unsigned long addr, pmd_t *pmdp)
{
	pmd_t entry = *pmdp;

	if (pmd_present(entry)) {
		pmd_t prev = pmdp_invalidate_ad(vma, addr, pmdp);

		set_pmd_at(vma->vm_mm, addr, pmdp, pmd_mkuffd_wp(prev));
		return;
	}

	if (!is_migration_entry(pmd_to_swp_entry(entry)))
		return;

	set_pmd_at(vma->vm_mm, addr, pmdp, pmd_swp_mkuffd_wp(entry));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Derive the PAGE_IS_* categories described by a HugeTLB entry.
 * PAGE_IS_HUGE is always set in the result.
 */
static unsigned long pagemap_hugetlb_category(pte_t pte)
{
	unsigned long cat = PAGE_IS_HUGE;

	if (pte_present(pte)) {
		cat |= PAGE_IS_PRESENT;

		if (!huge_pte_uffd_wp(pte))
			cat |= PAGE_IS_WRITTEN;
		if (!PageAnon(pte_page(pte)))
			cat |= PAGE_IS_FILE;
		if (is_zero_pfn(pte_pfn(pte)))
			cat |= PAGE_IS_PFNZERO;

		return cat;
	}

	/*
	 * According to pagemap_hugetlb_range(), a file-backed HugeTLB page
	 * cannot be swapped, so PAGE_IS_FILE is not checked for swap
	 * entries.
	 */
	if (is_swap_pte(pte)) {
		cat |= PAGE_IS_SWAPPED;
		if (!pte_swp_uffd_wp_any(pte))
			cat |= PAGE_IS_WRITTEN;
	}

	return cat;
}

/*
 * Mark a HugeTLB entry as userfaultfd write-protected.
 *
 * @vma:   HugeTLB VMA the entry belongs to
 * @addr:  virtual address mapped by the entry
 * @ptep:  pointer to the entry
 * @ptent: value of the entry as read by the caller
 *
 * Hwpoisoned entries and PTE markers are left untouched. Migration entries
 * get the swap variant of the uffd-wp bit, empty entries are replaced by a
 * uffd-wp PTE marker, and every other entry gets the uffd-wp bit.
 */
static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
				  unsigned long addr, pte_t *ptep,
				  pte_t ptent)
{
	unsigned long psize;
	pte_t old_pte;

	if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
		return;

	psize = huge_page_size(hstate_vma(vma));

	if (is_hugetlb_entry_migration(ptent)) {
		set_huge_pte_at(vma->vm_mm, addr, ptep,
				pte_swp_mkuffd_wp(ptent), psize);
		return;
	}

	if (huge_pte_none(ptent)) {
		set_huge_pte_at(vma->vm_mm, addr, ptep,
				make_pte_marker(PTE_MARKER_UFFD_WP), psize);
		return;
	}

	/*
	 * Pair the commit with a start call and build the new entry from
	 * the value start returns, so that young/dirty bits set after the
	 * caller read @ptent are not lost.
	 */
	old_pte = huge_ptep_modify_prot_start(vma, addr, ptep);
	huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte,
				     huge_pte_mkuffd_wp(old_pte));
}
#endif /* CONFIG_HUGETLB_PAGE */

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
/*
 * Undo the most recent pagemap_scan_output() for [addr, end).
 *
 * @p:    scan state
 * @addr: start of the range to remove from the output
 * @end:  end (exclusive) of the range to remove
 *
 * The current output region is cut back to end at @addr, or emptied when
 * it starts at @addr, and found_pages is reduced accordingly.
 */
static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
				       unsigned long addr, unsigned long end)
{
	struct page_region *cur_buf;

	/*
	 * Without an output buffer pagemap_scan_output() recorded nothing,
	 * so there is nothing to undo; touching vec_buf would dereference
	 * NULL and the subtraction below would underflow found_pages.
	 */
	if (!p->vec_buf)
		return;

	cur_buf = &p->vec_buf[p->vec_buf_index];

	if (cur_buf->start != addr)
		cur_buf->end = addr;
	else
		cur_buf->start = cur_buf->end = 0;

	p->found_pages -= (end - addr) / PAGE_SIZE;
}
#endif

/*
 * Decide whether a page with the given categories matches the request.
 *
 * Categories listed in category_inverted are flipped first. The page then
 * matches when every bit of category_mask is set and, if
 * category_anyof_mask is non-zero, at least one of its bits is set.
 */
static bool pagemap_scan_is_interesting_page(unsigned long categories,
					     const struct pagemap_scan_private *p)
{
	const unsigned long seen = categories ^ p->arg.category_inverted;

	if ((seen & p->arg.category_mask) != p->arg.category_mask)
		return false;

	return !p->arg.category_anyof_mask ||
	       (seen & p->arg.category_anyof_mask);
}

/*
 * Decide whether a VMA can contain matching pages at all.
 *
 * Only PAGE_IS_WPALLOWED is checked here: when category_mask requires it
 * (after applying category_inverted), the VMA must provide it.
 */
static bool pagemap_scan_is_interesting_vma(unsigned long categories,
					    const struct pagemap_scan_private *p)
{
	const unsigned long needed = p->arg.category_mask & PAGE_IS_WPALLOWED;

	categories ^= p->arg.category_inverted;

	return (categories & needed) == needed;
}

static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
				  struct mm_walk *walk)
{
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned long vma_category = 0;

	if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma))
		vma_category |= PAGE_IS_WPALLOWED;
	else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
		return -EPERM;

	if (vma->vm_flags & VM_PFNMAP)
		return 1;

	if (!pagemap_scan_is_interesting_vma(vma_category, p))
		return 1;

	p->cur_vma_category = vma_category;

	return 0;
}

/*
 * Append [addr, end) with the given categories to the bounce buffer.
 *
 * The range is merged into the current region when it starts where that
 * region ends and has the same categories; otherwise it goes into the next
 * free slot. Returns false when a new slot is needed but the buffer is
 * full, true otherwise.
 */
static bool pagemap_scan_push_range(unsigned long categories,
				    struct pagemap_scan_private *p,
				    unsigned long addr, unsigned long end)
{
	struct page_region *slot = &p->vec_buf[p->vec_buf_index];

	/*
	 * An unused slot has end == 0, so it can only look contiguous with
	 * a range that starts at address 0; a non-zero end always means
	 * the slot holds a region.
	 */
	if (slot->end == addr && slot->categories == categories) {
		slot->end = end;
		return true;
	}

	/* The current slot is occupied: move on to the next one, if any. */
	if (slot->end) {
		if (p->vec_buf_index >= p->vec_buf_len - 1)
			return false;

		p->vec_buf_index++;
		slot = &p->vec_buf[p->vec_buf_index];
	}

	slot->categories = categories;
	slot->start = addr;
	slot->end = end;

	return true;
}

/*
 * Record [addr, *end) with the given categories in the output buffer.
 *
 * @categories: categories of the range; masked with arg.return_mask
 * @p:          scan state
 * @addr:       start of the range
 * @end:        in: end (exclusive) of the range; out: end of the part that
 *              was actually recorded (equal to @addr when nothing was)
 *
 * Does nothing and returns 0 when no output buffer was set up. Returns 0
 * when the whole range was recorded, or -ENOSPC when it had to be trimmed
 * to honour arg.max_pages or the buffer had no free slot; in the -ENOSPC
 * case arg.walk_end is set to the updated *end.
 */
static int pagemap_scan_output(unsigned long categories,
			       struct pagemap_scan_private *p,
			       unsigned long addr, unsigned long *end)
{
	unsigned long n_pages, total_pages;
	int ret = 0;

	if (!p->vec_buf)
		return 0;

	/* Only the categories the caller asked to see are reported. */
	categories &= p->arg.return_mask;

	n_pages = (*end - addr) / PAGE_SIZE;
	if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
	    total_pages > p->arg.max_pages) {
		/* Drop the tail that would push the total past max_pages. */
		size_t n_too_much = total_pages - p->arg.max_pages;
		*end -= n_too_much * PAGE_SIZE;
		n_pages -= n_too_much;
		ret = -ENOSPC;
	}

	if (!pagemap_scan_push_range(categories, p, addr, *end)) {
		/* No free slot: report that nothing of this range was stored. */
		*end = addr;
		n_pages = 0;
		ret = -ENOSPC;
	}

	p->found_pages += n_pages;
	if (ret)
		p->arg.walk_end = *end;

	return ret;
}

/*
 * Handle a PMD that maps a transparent huge page.
 *
 * Returns -ENOENT when the PMD is not handled here, so that the caller
 * falls back to the PTE walk: pmd_trans_huge_lock() did not take the lock,
 * the huge page was split because only part of it is to be
 * write-protected, or THP support is compiled out. Otherwise returns 0
 * for an uninteresting entry or the result of pagemap_scan_output().
 */
static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
				  unsigned long end, struct mm_walk *walk)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned long categories;
	spinlock_t *ptl;
	int ret = 0;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return -ENOENT;

	categories = p->cur_vma_category |
		     pagemap_thp_category(p, vma, start, *pmd);

	if (!pagemap_scan_is_interesting_page(categories, p))
		goto out_unlock;

	/* May move end back when the output limits are hit. */
	ret = pagemap_scan_output(categories, p, start, &end);
	if (start == end)
		goto out_unlock;

	/* Write-protect only when asked to, and only pages seen as written. */
	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
		goto out_unlock;
	if (~categories & PAGE_IS_WRITTEN)
		goto out_unlock;

	/*
	 * Break huge page into small pages if the WP operation
	 * needs to be performed on a portion of the huge page.
	 */
	if (end != start + HPAGE_SIZE) {
		spin_unlock(ptl);
		split_huge_pmd(vma, pmd, start);
		/* Drop what was just recorded; the PTE walk records it again. */
		pagemap_scan_backout_range(p, start, end);
		/* Report as if there was no THP */
		return -ENOENT;
	}

	make_uffd_wp_pmd(vma, start, pmd);
	flush_tlb_range(vma, start, end);
out_unlock:
	spin_unlock(ptl);
	return ret;
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
	return -ENOENT;
#endif
}

/*
 * pmd_entry callback of the PAGEMAP_SCAN walk.
 *
 * The PMD is first offered to pagemap_scan_thp_entry(); when that returns
 * -ENOENT the PTEs below the PMD are scanned one by one. Matching pages
 * are recorded through pagemap_scan_output() and, when
 * PM_SCAN_WP_MATCHING is set and the page counts as written,
 * write-protected. Returns the last pagemap_scan_output() result (0 or
 * -ENOSPC), or whatever the THP handler returned.
 */
static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
				  unsigned long end, struct mm_walk *walk)
{
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned long addr, flush_end = 0;
	pte_t *pte, *start_pte;
	spinlock_t *ptl;
	int ret;

	arch_enter_lazy_mmu_mode();

	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
	if (ret != -ENOENT) {
		arch_leave_lazy_mmu_mode();
		return ret;
	}

	ret = 0;
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
	if (!pte) {
		/* The page table could not be mapped: have the walker retry. */
		arch_leave_lazy_mmu_mode();
		walk->action = ACTION_AGAIN;
		return 0;
	}

	for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
		unsigned long categories = p->cur_vma_category |
					   pagemap_page_category(p, vma, addr, ptep_get(pte));
		unsigned long next = addr + PAGE_SIZE;

		if (!pagemap_scan_is_interesting_page(categories, p))
			continue;

		/* next is moved back to addr when the page could not be stored. */
		ret = pagemap_scan_output(categories, p, addr, &next);
		if (next == addr)
			break;

		if (~p->arg.flags & PM_SCAN_WP_MATCHING)
			continue;
		if (~categories & PAGE_IS_WRITTEN)
			continue;

		make_uffd_wp_pte(vma, addr, pte);
		/* From here on, start is the first address needing a TLB flush. */
		if (!flush_end)
			start = addr;
		flush_end = next;
	}

	/* Flush from the first write-protected page up to where the loop stopped. */
	if (flush_end)
		flush_tlb_range(vma, start, addr);

	pte_unmap_unlock(start_pte, ptl);
	arch_leave_lazy_mmu_mode();

	cond_resched();
	return ret;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * hugetlb_entry callback of the PAGEMAP_SCAN walk.
 *
 * Without PM_SCAN_WP_MATCHING the entry is only read and reported, and
 * this function takes no lock of its own. With PM_SCAN_WP_MATCHING the
 * entry is examined and write-protected under the mapping's i_mmap lock
 * and the huge PTE lock. Returns 0 or the result of pagemap_scan_output().
 */
static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned long categories;
	spinlock_t *ptl;
	int ret = 0;
	pte_t pte;

	if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
		/* Go the short route when not write-protecting pages. */

		pte = huge_ptep_get(ptep);
		categories = p->cur_vma_category | pagemap_hugetlb_category(pte);

		if (!pagemap_scan_is_interesting_page(categories, p))
			return 0;

		return pagemap_scan_output(categories, p, start, &end);
	}

	i_mmap_lock_write(vma->vm_file->f_mapping);
	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);

	/* Read the entry under the lock. */
	pte = huge_ptep_get(ptep);
	categories = p->cur_vma_category | pagemap_hugetlb_category(pte);

	if (!pagemap_scan_is_interesting_page(categories, p))
		goto out_unlock;

	/* May move end back when the output limits are hit. */
	ret = pagemap_scan_output(categories, p, start, &end);
	if (start == end)
		goto out_unlock;

	if (~categories & PAGE_IS_WRITTEN)
		goto out_unlock;

	/*
	 * NOTE(review): the comparison uses HPAGE_SIZE rather than this
	 * VMA's huge page size - confirm this is intended for HugeTLB sizes
	 * other than HPAGE_SIZE.
	 */
	if (end != start + HPAGE_SIZE) {
		/* Partial HugeTLB page WP isn't possible. */
		pagemap_scan_backout_range(p, start, end);
		p->arg.walk_end = start;
		ret = 0;
		goto out_unlock;
	}

	make_uffd_wp_huge_pte(vma, start, ptep, pte);
	flush_hugetlb_tlb_range(vma, start, end);

out_unlock:
	spin_unlock(ptl);
	i_mmap_unlock_write(vma->vm_file->f_mapping);

	return ret;
}
#else
#define pagemap_scan_hugetlb_entry NULL
#endif

static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
				 int depth, struct mm_walk *walk)
{
	struct pagemap_scan_private *p = walk->private;
	struct vm_area_struct *vma = walk->vma;
	int ret, err;

	if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
		return 0;

	ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
	if (addr == end)
		return ret;

	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
		return ret;

	err = uffd_wp_range(vma, addr, end - addr, true);
	if (err < 0)
		ret = err;

	return ret;
}

/* Page-walk callbacks that do_pagemap_scan() passes to walk_page_range(). */
static const struct mm_walk_ops pagemap_scan_ops = {
	.test_walk = pagemap_scan_test_walk,
	.pmd_entry = pagemap_scan_pmd_entry,
	.pte_hole = pagemap_scan_pte_hole,
	/* Defined as NULL when CONFIG_HUGETLB_PAGE is not set. */
	.hugetlb_entry = pagemap_scan_hugetlb_entry,
};

static int pagemap_scan_get_args(struct pm_scan_arg *arg,
				 unsigned long uarg)
{
	if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
		return -EFAULT;

	if (arg->size != sizeof(struct pm_scan_arg))
		return -EINVAL;

	/* Validate requested features */
	if (arg->flags & ~PM_SCAN_FLAGS)
		return -EINVAL;
	if ((arg->category_inverted | arg->category_mask |
	     arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
		return -EINVAL;

	arg->start = untagged_addr((unsigned long)arg->start);
	arg->end = untagged_addr((unsigned long)arg->end);
	arg->vec = untagged_addr((unsigned long)arg->vec);

	/* Validate memory pointers */
	if (!IS_ALIGNED(arg->start, PAGE_SIZE))
		return -EINVAL;
	if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
		return -EFAULT;
	if (!arg->vec && arg->vec_len)
		return -EINVAL;
	if (arg->vec && !access_ok((void __user *)(long)arg->vec,
			      arg->vec_len * sizeof(struct page_region)))
		return -EFAULT;

	/* Fixup default values */
	arg->end = ALIGN(arg->end, PAGE_SIZE);
	arg->walk_end = 0;
	if (!arg->max_pages)
		arg->max_pages = ULONG_MAX;

	return 0;
}

static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
				       unsigned long uargl)
{
	struct pm_scan_arg __user *uarg	= (void __user *)uargl;

	if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
		return -EFAULT;

	return 0;
}

static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
{
	if (!p->arg.vec_len)
		return 0;

	p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
			       p->arg.vec_len);
	p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
				   GFP_KERNEL);
	if (!p->vec_buf)
		return -ENOMEM;

	p->vec_buf->start = p->vec_buf->end = 0;
	p->vec_out = (struct page_region __user *)(long)p->arg.vec;

	return 0;
}

/*
 * Copy the regions collected in the bounce buffer to user space and reset
 * the buffer for the next walk.
 *
 * Returns the number of regions written (0 when there is no buffer or
 * nothing was collected), or -EFAULT when the copy to user space fails.
 */
static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
{
	const struct page_region *regions = p->vec_buf;
	long count;

	if (!regions)
		return 0;

	/* The slot at vec_buf_index counts only when it holds a region. */
	count = p->vec_buf_index;
	if (regions[count].start != regions[count].end)
		count++;

	if (count == 0)
		return 0;

	if (copy_to_user(p->vec_out, regions, count * sizeof(*regions)))
		return -EFAULT;

	/* Advance the user cursor and shrink the remaining capacity. */
	p->vec_out += count;
	p->arg.vec_len -= count;

	p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
	p->vec_buf_index = 0;
	p->vec_buf->start = p->vec_buf->end = 0;

	return count;
}

/*
 * Implementation of the PAGEMAP_SCAN ioctl.
 *
 * @mm:   address space to scan
 * @uarg: user-space address of the struct pm_scan_arg
 *
 * The range is walked in rounds: each round walks under the mmap read
 * lock until the walk finishes or the bounce buffer fills up (-ENOSPC),
 * then flushes the buffer to user space. The scan ends when the walk
 * completes, an error occurs, the user's vector is full or max_pages have
 * been found. walk_end is written back to the user's argument before
 * returning.
 *
 * Returns the number of regions written to the user's vector, or a
 * negative error code.
 */
static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
{
	struct pagemap_scan_private p = {0};
	unsigned long walk_start;
	size_t n_ranges_out = 0;
	int ret;

	ret = pagemap_scan_get_args(&p.arg, uarg);
	if (ret)
		return ret;

	p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
			      p.arg.return_mask;
	ret = pagemap_scan_init_bounce_buffer(&p);
	if (ret)
		return ret;

	for (walk_start = p.arg.start; walk_start < p.arg.end;
			walk_start = p.arg.walk_end) {
		struct mmu_notifier_range range;
		long n_out;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		ret = mmap_read_lock_killable(mm);
		if (ret)
			break;

		/*
		 * Protection change for the range is going to happen. The
		 * notification is issued under the mmap lock and covers
		 * only what this round can touch, so that notifier users
		 * do not see an invalidation that races with changes to
		 * the address space.
		 */
		if (p.arg.flags & PM_SCAN_WP_MATCHING) {
			mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
						mm, walk_start, p.arg.end);
			mmu_notifier_invalidate_range_start(&range);
		}

		ret = walk_page_range(mm, walk_start, p.arg.end,
				      &pagemap_scan_ops, &p);

		if (p.arg.flags & PM_SCAN_WP_MATCHING)
			mmu_notifier_invalidate_range_end(&range);

		mmap_read_unlock(mm);

		n_out = pagemap_scan_flush_buffer(&p);
		if (n_out < 0)
			ret = n_out;
		else
			n_ranges_out += n_out;

		if (ret != -ENOSPC)
			break;

		if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
			break;
	}

	/* ENOSPC signifies early stop (buffer full) from the walk. */
	if (!ret || ret == -ENOSPC)
		ret = n_ranges_out;

	/* The walk_end isn't set when ret is zero */
	if (!p.arg.walk_end)
		p.arg.walk_end = p.arg.end;
	if (pagemap_scan_writeback_args(&p.arg, uarg))
		ret = -EFAULT;

	kfree(p.vec_buf);
	return ret;
}

/*
 * ioctl entry point of the pagemap file. PAGEMAP_SCAN is the only command
 * handled; every other command yields -EINVAL.
 */
static long do_pagemap_cmd(struct file *file, unsigned int cmd,
			   unsigned long arg)
{
	if (cmd != PAGEMAP_SCAN)
		return -EINVAL;

	/* The file's private_data is the mm_struct to scan. */
	return do_pagemap_scan(file->private_data, arg);
}

/* File operations of the pagemap file, including the PAGEMAP_SCAN ioctl. */
const struct file_operations proc_pagemap_operations = {
	.llseek		= mem_lseek, /* borrow this */
	.read		= pagemap_read,
	.open		= pagemap_open,
	.release	= pagemap_release,
	/* The same handler serves native and compat ioctl calls. */
	.unlocked_ioctl = do_pagemap_cmd,
	.compat_ioctl	= do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */

+1 −0
Original line number Diff line number Diff line
@@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long cp_flags);

bool is_hugetlb_entry_migration(pte_t pte);
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);

#else /* !CONFIG_HUGETLB_PAGE */
+7 −0
Original line number Diff line number Diff line
@@ -221,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
	return VM_FAULT_SIGBUS;
}

/*
 * Stub variant of uffd_wp_range(): does nothing and returns 0. The value
 * is a plain 0 rather than the bool literal "false" because the function
 * returns long.
 */
static inline long uffd_wp_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long len,
				 bool enable_wp)
{
	return 0;
}

static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
					struct vm_userfaultfd_ctx vm_ctx)
{
+59 −0
Original line number Diff line number Diff line
@@ -305,4 +305,63 @@ typedef int __bitwise __kernel_rwf_t;
#define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
			 RWF_APPEND)

/* Pagemap ioctl: scan a range of page table entries, see struct pm_scan_arg. */
#define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)

/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */
#define PAGE_IS_WPALLOWED	(1 << 0)	/* Page has async-write-protection enabled */
#define PAGE_IS_WRITTEN		(1 << 1)	/* Page has been written to since it was write protected */
#define PAGE_IS_FILE		(1 << 2)	/* Page is file backed */
#define PAGE_IS_PRESENT		(1 << 3)	/* Page is present in memory */
#define PAGE_IS_SWAPPED		(1 << 4)	/* Page is swapped out */
#define PAGE_IS_PFNZERO		(1 << 5)	/* Page has zero PFN */
#define PAGE_IS_HUGE		(1 << 6)	/* Page is THP or Hugetlb backed */

/*
 * struct page_region - Page region with flags
 * @start:	Start address of the region
 * @end:	End address of the region (exclusive)
 * @categories:	PAGE_IS_* category bitmask for the region
 *
 * One element of the output array that pm_scan_arg.vec points to.
 */
struct page_region {
	__u64 start;
	__u64 end;
	__u64 categories;
};

/* Flags for PAGEMAP_SCAN ioctl, passed in pm_scan_arg.flags */
#define PM_SCAN_WP_MATCHING	(1 << 0)	/* Write protect the pages matched. */
#define PM_SCAN_CHECK_WPASYNC	(1 << 1)	/* Abort the scan with an error when a non-WP-enabled page is found. */

/*
 * struct pm_scan_arg - Pagemap ioctl argument
 * @size:		Size of the structure
 * @flags:		Flags for the IOCTL
 * @start:		Starting address of the region
 * @end:		Ending address of the region
 * @walk_end:		Address where the scan stopped (written by kernel).
 *			walk_end == end (address tags cleared) informs that the scan completed on entire range.
 * @vec:		Address of page_region struct array for output
 * @vec_len:		Length of the page_region struct array
 * @max_pages:		Optional limit for number of returned pages (0 = disabled)
 * @category_inverted:	PAGE_IS_* categories which values match if 0 instead of 1
 * @category_mask:	Skip pages for which any category doesn't match
 * @category_anyof_mask: Skip pages for which no category matches
 * @return_mask:	PAGE_IS_* categories that are to be reported in `page_region`s returned
 */
struct pm_scan_arg {
	__u64 size;
	__u64 flags;
	__u64 start;
	__u64 end;
	__u64 walk_end;
	__u64 vec;
	__u64 vec_len;
	__u64 max_pages;
	__u64 category_inverted;
	__u64 category_mask;
	__u64 category_anyof_mask;
	__u64 return_mask;
};

#endif /* _UAPI_LINUX_FS_H */
+3 −2
Original line number Diff line number Diff line
@@ -5044,7 +5044,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
		return false;
}

static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
	swp_entry_t swp;

@@ -6266,7 +6266,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		}

		entry = huge_pte_clear_uffd_wp(entry);
		set_huge_pte_at(mm, haddr, ptep, entry);
		set_huge_pte_at(mm, haddr, ptep, entry,
				huge_page_size(hstate_vma(vma)));
		/* Fallthrough to CoW */
	}