Commit 6da8e963 authored by Peter Xu's avatar Peter Xu Committed by Andrew Morton
Browse files

mm: new follow_pfnmap API

Introduce a pair of APIs to follow pfn mappings to get entry information. 
It's very similar to what follow_pte() does before, but different in that
it recognizes huge pfn mappings.

Link: https://lkml.kernel.org/r/20240826204353.2228736-10-peterx@redhat.com


Signed-off-by: default avatarPeter Xu <peterx@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent 0515e022
Loading
Loading
Loading
Loading
+31 −0
Original line number Diff line number Diff line
@@ -2373,6 +2373,37 @@ int follow_pte(struct vm_area_struct *vma, unsigned long address,
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write);

struct follow_pfnmap_args {
	/**
	 * Inputs:
	 * @vma: Pointer to @vm_area_struct struct
	 * @address: the virtual address to walk
	 */
	struct vm_area_struct *vma;
	unsigned long address;
	/**
	 * Internals:
	 *
	 * The caller shouldn't touch any of these.
	 */
	spinlock_t *lock;
	pte_t *ptep;
	/**
	 * Outputs:
	 *
	 * @pfn: the PFN of the address
	 * @pgprot: the pgprot_t of the mapping
	 * @writable: whether the mapping is writable
	 * @special: whether the mapping is a special mapping (real PFN maps)
	 */
	unsigned long pfn;
	pgprot_t pgprot;
	bool writable;
	bool special;
};
int follow_pfnmap_start(struct follow_pfnmap_args *args);
void follow_pfnmap_end(struct follow_pfnmap_args *args);

extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
+150 −0
Original line number Diff line number Diff line
@@ -6172,6 +6172,156 @@ int follow_pte(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(follow_pte);

static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
				     spinlock_t *lock, pte_t *ptep,
				     pgprot_t pgprot, unsigned long pfn_base,
				     unsigned long addr_mask, bool writable,
				     bool special)
{
	args->lock = lock;
	args->ptep = ptep;
	args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
	args->pgprot = pgprot;
	args->writable = writable;
	args->special = special;
}

static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
{
#ifdef CONFIG_LOCKDEP
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (mapping)
		lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) ||
			       lockdep_is_held(&vma->vm_mm->mmap_lock));
	else
		lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
#endif
}

/**
 * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
 * @args: Pointer to struct @follow_pfnmap_args
 *
 * The caller needs to setup args->vma and args->address to point to the
 * virtual address as the target of such lookup.  On a successful return,
 * the results will be put into other output fields.
 *
 * After the caller finished using the fields, the caller must invoke
 * another follow_pfnmap_end() to proper releases the locks and resources
 * of such look up request.
 *
 * During the start() and end() calls, the results in @args will be valid
 * as proper locks will be held.  After the end() is called, all the fields
 * in @follow_pfnmap_args will be invalid to be further accessed.  Further
 * use of such information after end() may require proper synchronizations
 * by the caller with page table updates, otherwise it can create a
 * security bug.
 *
 * If the PTE maps a refcounted page, callers are responsible to protect
 * against invalidation with MMU notifiers; otherwise access to the PFN at
 * a later point in time can trigger use-after-free.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read, and the mmap semaphore cannot be released
 * before the end() is invoked.
 *
 * This function must not be used to modify PTE content.
 *
 * Return: zero on success, negative otherwise.
 */
int follow_pfnmap_start(struct follow_pfnmap_args *args)
{
	struct vm_area_struct *vma = args->vma;
	unsigned long address = args->address;
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *lock;
	pgd_t *pgdp;
	p4d_t *p4dp, p4d;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep, pte;

	pfnmap_lockdep_assert(vma);

	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		goto out;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;
retry:
	pgdp = pgd_offset(mm, address);
	if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
		goto out;

	p4dp = p4d_offset(pgdp, address);
	p4d = READ_ONCE(*p4dp);
	if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
		goto out;

	pudp = pud_offset(p4dp, address);
	pud = READ_ONCE(*pudp);
	if (pud_none(pud))
		goto out;
	if (pud_leaf(pud)) {
		lock = pud_lock(mm, pudp);
		if (!unlikely(pud_leaf(pud))) {
			spin_unlock(lock);
			goto retry;
		}
		pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
				  pud_pfn(pud), PUD_MASK, pud_write(pud),
				  pud_special(pud));
		return 0;
	}

	pmdp = pmd_offset(pudp, address);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_leaf(pmd)) {
		lock = pmd_lock(mm, pmdp);
		if (!unlikely(pmd_leaf(pmd))) {
			spin_unlock(lock);
			goto retry;
		}
		pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
				  pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
				  pmd_special(pmd));
		return 0;
	}

	ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
	if (!ptep)
		goto out;
	pte = ptep_get(ptep);
	if (!pte_present(pte))
		goto unlock;
	pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
			  pte_pfn(pte), PAGE_MASK, pte_write(pte),
			  pte_special(pte));
	return 0;
unlock:
	pte_unmap_unlock(ptep, lock);
out:
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(follow_pfnmap_start);

/**
 * follow_pfnmap_end(): End a follow_pfnmap_start() process
 * @args: Pointer to struct @follow_pfnmap_args
 *
 * Must be used in pair of follow_pfnmap_start().  See the start() function
 * above for more information.
 */
void follow_pfnmap_end(struct follow_pfnmap_args *args)
{
	if (args->lock)
		spin_unlock(args->lock);
	if (args->ptep)
		pte_unmap(args->ptep);
}
EXPORT_SYMBOL_GPL(follow_pfnmap_end);

#ifdef CONFIG_HAVE_IOREMAP_PROT
/**
 * generic_access_phys - generic implementation for iomem mmap access