Commit a5a5ce57 authored by Will Deacon
Browse files

Merge branch 'for-next/mm' into for-next/core

* for-next/mm:
  arm64/mm: Fix pud_user_accessible_page() for PGTABLE_LEVELS <= 2
  arm64/mm: Add uffd write-protect support
  arm64/mm: Move PTE_PRESENT_INVALID to overlay PTE_NG
  arm64/mm: Remove PTE_PROT_NONE bit
  arm64/mm: generalize PMD_PRESENT_INVALID for all levels
  arm64: mm: Don't remap pgtables for allocate vs populate
  arm64: mm: Batch dsb and isb when populating pgtables
  arm64: mm: Don't remap pgtables per-cont(pte|pmd) block
parents 7a7f6045 cb67ea12
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -255,6 +255,7 @@ config ARM64
	select SYSCTL_EXCEPTION_TRACE
	select THREAD_INFO_IN_TASK
	select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
	select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD
	select TRACE_IRQFLAGS_SUPPORT
	select TRACE_IRQFLAGS_NMI_SUPPORT
	select HAVE_SOFTIRQ_ON_OWN_STACK
+13 −6
Original line number Diff line number Diff line
@@ -18,14 +18,21 @@
#define PTE_DIRTY		(_AT(pteval_t, 1) << 55)
#define PTE_SPECIAL		(_AT(pteval_t, 1) << 56)
#define PTE_DEVMAP		(_AT(pteval_t, 1) << 57)
#define PTE_PROT_NONE		(_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */

/*
 * This bit indicates that the entry is present i.e. pmd_page()
 * still points to a valid huge page in memory even if the pmd
 * has been invalidated.
 * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be
 * interpreted according to the HW layout by SW but any attempted HW access to
 * the address will result in a fault. pte_present() returns true.
 */
#define PMD_PRESENT_INVALID	(_AT(pteval_t, 1) << 59) /* only when !PMD_SECT_VALID */
#define PTE_PRESENT_INVALID	(PTE_NG)		 /* only when !PTE_VALID */

/*
 * Bit masks used for userfaultfd write-protect tracking. When
 * CONFIG_HAVE_ARCH_USERFAULTFD_WP is not set, both masks are defined as 0,
 * so AND-ing a pte value with either of them always yields 0.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define PTE_UFFD_WP		(_AT(pteval_t, 1) << 58) /* uffd-wp tracking */
#define PTE_SWP_UFFD_WP		(_AT(pteval_t, 1) << 3)	 /* only for swp ptes */
#else
#define PTE_UFFD_WP		(_AT(pteval_t, 0))
#define PTE_SWP_UFFD_WP		(_AT(pteval_t, 0))
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

#define _PROT_DEFAULT		(PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
#define _PROT_SECT_DEFAULT	(PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
@@ -103,7 +110,7 @@ static inline bool __pure lpa2_is_enabled(void)
		__val;							\
	 })

#define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
#define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
/* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
#define PAGE_SHARED		__pgprot(_PAGE_SHARED)
#define PAGE_SHARED_EXEC	__pgprot(_PAGE_SHARED_EXEC)
+86 −28
Original line number Diff line number Diff line
@@ -105,7 +105,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
/*
 * The following only work if pte_present(). Undefined behaviour otherwise.
 */
#define pte_present(pte)	(!!(pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)))
#define pte_present(pte)	(pte_valid(pte) || pte_present_invalid(pte))
#define pte_young(pte)		(!!(pte_val(pte) & PTE_AF))
#define pte_special(pte)	(!!(pte_val(pte) & PTE_SPECIAL))
#define pte_write(pte)		(!!(pte_val(pte) & PTE_WRITE))
@@ -132,6 +132,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
#define pte_dirty(pte)		(pte_sw_dirty(pte) || pte_hw_dirty(pte))

#define pte_valid(pte)		(!!(pte_val(pte) & PTE_VALID))
/*
 * True only when PTE_PRESENT_INVALID is set and PTE_VALID is clear; an entry
 * with PTE_VALID set never matches, whatever the other bit holds.
 */
#define pte_present_invalid(pte) \
	((pte_val(pte) & (PTE_VALID | PTE_PRESENT_INVALID)) == PTE_PRESENT_INVALID)
/*
 * Execute-only user mappings do not have the PTE_USER bit set. All valid
 * kernel mappings have the PTE_UXN bit set.
@@ -261,6 +263,13 @@ static inline pte_t pte_mkpresent(pte_t pte)
	return set_pte_bit(pte, __pgprot(PTE_VALID));
}

/*
 * Build the "present but invalid" form of @pte: PTE_PRESENT_INVALID is set
 * first, then PTE_VALID is cleared, and the resulting entry is returned.
 */
static inline pte_t pte_mkinvalid(pte_t pte)
{
	return clear_pte_bit(set_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID)),
			     __pgprot(PTE_VALID));
}

static inline pmd_t pmd_mkcont(pmd_t pmd)
{
	return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
@@ -271,9 +280,31 @@ static inline pte_t pte_mkdevmap(pte_t pte)
	return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL));
}

static inline void __set_pte(pte_t *ptep, pte_t pte)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Report whether the PTE_UFFD_WP bit is set in @pte: returns 1 if so, else 0. */
static inline int pte_uffd_wp(pte_t pte)
{
	if (pte_val(pte) & PTE_UFFD_WP)
		return 1;

	return 0;
}

/*
 * Mark @pte as uffd write-protected: set PTE_UFFD_WP, then hand the updated
 * entry to pte_wrprotect() and return what it produces.
 */
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
	pte = set_pte_bit(pte, __pgprot(PTE_UFFD_WP));

	return pte_wrprotect(pte);
}

/* Return @pte after clear_pte_bit() has been applied with PTE_UFFD_WP. */
static inline pte_t pte_clear_uffd_wp(pte_t pte)
{
	pte = clear_pte_bit(pte, __pgprot(PTE_UFFD_WP));

	return pte;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/*
 * Store @pte into the page-table slot at @ptep using WRITE_ONCE(). This
 * helper performs only the store; it issues no barrier itself, so any
 * ordering the caller needs must be provided by the caller.
 */
static inline void __set_pte_nosync(pte_t *ptep, pte_t pte)
{
	WRITE_ONCE(*ptep, pte);
}

static inline void __set_pte(pte_t *ptep, pte_t pte)
{
	__set_pte_nosync(ptep, pte);

	/*
	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
@@ -463,13 +494,39 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
	return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return the swap pte @pte after set_pte_bit() has been applied with PTE_SWP_UFFD_WP. */
static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
{
	pte = set_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP));

	return pte;
}

/* Report whether PTE_SWP_UFFD_WP is set in the swap pte @pte: 1 if so, else 0. */
static inline int pte_swp_uffd_wp(pte_t pte)
{
	if (pte_val(pte) & PTE_SWP_UFFD_WP)
		return 1;

	return 0;
}

/* Return the swap pte @pte after clear_pte_bit() has been applied with PTE_SWP_UFFD_WP. */
static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
{
	pte = clear_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP));

	return pte;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

#ifdef CONFIG_NUMA_BALANCING
/*
 * See the comment in include/linux/pgtable.h
 */
static inline int pte_protnone(pte_t pte)
{
	return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE;
	/*
	 * pte_present_invalid() tells us that the pte is invalid from HW
	 * perspective but present from SW perspective, so the fields are to be
	 * interpreted as per the HW layout. The second 2 checks are the unique
	 * encoding that we use for PROT_NONE. It is insufficient to only use
	 * the first check because we share the same encoding scheme with pmds
	 * which support pmd_mkinvalid(), so can be present-invalid without
	 * being PROT_NONE.
	 */
	return pte_present_invalid(pte) && !pte_user(pte) && !pte_user_exec(pte);
}

static inline int pmd_protnone(pmd_t pmd)
@@ -478,12 +535,7 @@ static inline int pmd_protnone(pmd_t pmd)
}
#endif

#define pmd_present_invalid(pmd)     (!!(pmd_val(pmd) & PMD_PRESENT_INVALID))

static inline int pmd_present(pmd_t pmd)
{
	return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd);
}
#define pmd_present(pmd)	pte_present(pmd_pte(pmd))

/*
 * THP definitions.
@@ -508,14 +560,16 @@ static inline int pmd_trans_huge(pmd_t pmd)
#define pmd_mkclean(pmd)	pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))

static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
	pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID));
	pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID));

	return pmd;
}
#define pmd_mkinvalid(pmd)	pte_pmd(pte_mkinvalid(pmd_pte(pmd)))
/*
 * PMD-level uffd-wp helpers. Each one converts the pmd with pmd_pte() and
 * calls the matching pte helper; the variants that produce an entry convert
 * the result back with pte_pmd().
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define pmd_uffd_wp(pmd)	pte_uffd_wp(pmd_pte(pmd))
#define pmd_mkuffd_wp(pmd)	pte_pmd(pte_mkuffd_wp(pmd_pte(pmd)))
#define pmd_clear_uffd_wp(pmd)	pte_pmd(pte_clear_uffd_wp(pmd_pte(pmd)))
#define pmd_swp_uffd_wp(pmd)	pte_swp_uffd_wp(pmd_pte(pmd))
#define pmd_swp_mkuffd_wp(pmd)	pte_pmd(pte_swp_mkuffd_wp(pmd_pte(pmd)))
#define pmd_swp_clear_uffd_wp(pmd) \
				pte_pmd(pte_swp_clear_uffd_wp(pmd_pte(pmd)))
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

#define pmd_thp_or_huge(pmd)	(pmd_huge(pmd) || pmd_trans_huge(pmd))

@@ -760,6 +814,7 @@ static inline pmd_t *pud_pgtable(pud_t pud)

#else

#define pud_valid(pud)		false
#define pud_page_paddr(pud)	({ BUILD_BUG(); 0; })
#define pud_user_exec(pud)	pud_user(pud) /* Always 0 with folding */

@@ -1005,6 +1060,8 @@ static inline p4d_t *p4d_offset_kimg(pgd_t *pgdp, u64 addr)

static inline bool pgtable_l5_enabled(void) { return false; }

/*
 * Index of the p4d entry for @addr: shift right by P4D_SHIFT, then mask with
 * PTRS_PER_P4D - 1 (presumably PTRS_PER_P4D is a power of two - TODO confirm).
 */
#define p4d_index(addr)		(((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1))

/* Match p4d_offset folding in <asm/generic/pgtable-nop4d.h> */
#define p4d_set_fixmap(addr)		NULL
#define p4d_set_fixmap_offset(p4dp, addr)	((p4d_t *)p4dp)
@@ -1027,8 +1084,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
	 * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK.
	 */
	const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
			      PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP |
			      PTE_ATTRINDX_MASK;
			      PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE |
			      PTE_GP | PTE_ATTRINDX_MASK;
	/* preserve the hardware dirty information */
	if (pte_hw_dirty(pte))
		pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
@@ -1076,17 +1133,17 @@ static inline int pgd_devmap(pgd_t pgd)
#ifdef CONFIG_PAGE_TABLE_CHECK
static inline bool pte_user_accessible_page(pte_t pte)
{
	return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
	return pte_valid(pte) && (pte_user(pte) || pte_user_exec(pte));
}

static inline bool pmd_user_accessible_page(pmd_t pmd)
{
	return pmd_leaf(pmd) && !pmd_present_invalid(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
	return pmd_valid(pmd) && !pmd_table(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
}

static inline bool pud_user_accessible_page(pud_t pud)
{
	return pud_leaf(pud) && (pud_user(pud) || pud_user_exec(pud));
	return pud_valid(pud) && !pud_table(pud) && (pud_user(pud) || pud_user_exec(pud));
}
#endif

@@ -1248,15 +1305,16 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
 * Encode and decode a swap entry:
 *	bits 0-1:	present (must be zero)
 *	bit  2:		remember PG_anon_exclusive
 *	bits 3-7:	swap type
 *	bits 8-57:	swap offset
 *	bit  58:	PTE_PROT_NONE (must be zero)
 *	bit  3:		remember uffd-wp state
 *	bits 6-10:	swap type
 *	bit  11:	PTE_PRESENT_INVALID (must be zero)
 *	bits 12-61:	swap offset
 */
#define __SWP_TYPE_SHIFT	3
#define __SWP_TYPE_SHIFT	6
#define __SWP_TYPE_BITS		5
#define __SWP_OFFSET_BITS	50
#define __SWP_TYPE_MASK		((1 << __SWP_TYPE_BITS) - 1)
#define __SWP_OFFSET_SHIFT	(__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
#define __SWP_OFFSET_SHIFT	12
#define __SWP_OFFSET_BITS	50
#define __SWP_OFFSET_MASK	((1UL << __SWP_OFFSET_BITS) - 1)

#define __swp_type(x)		(((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK)
+57 −44
Original line number Diff line number Diff line
@@ -109,28 +109,12 @@ EXPORT_SYMBOL(phys_mem_access_prot);
static phys_addr_t __init early_pgtable_alloc(int shift)
{
	phys_addr_t phys;
	void *ptr;

	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
					 MEMBLOCK_ALLOC_NOLEAKTRACE);
	if (!phys)
		panic("Failed to allocate page table page\n");

	/*
	 * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
	 * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
	 * any level of table.
	 */
	ptr = pte_set_fixmap(phys);

	memset(ptr, 0, PAGE_SIZE);

	/*
	 * Implicit barriers also ensure the zeroed page is visible to the page
	 * table walker
	 */
	pte_clear_fixmap();

	return phys;
}

@@ -172,16 +156,25 @@ bool pgattr_change_is_safe(u64 old, u64 new)
	return ((old ^ new) & ~mask) == 0;
}

static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot)
static void init_clear_pgtable(void *table)
{
	pte_t *ptep;
	clear_page(table);

	ptep = pte_set_fixmap_offset(pmdp, addr);
	/* Ensure the zeroing is observed by page table walks. */
	dsb(ishst);
}

static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot)
{
	do {
		pte_t old_pte = __ptep_get(ptep);

		__set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
		/*
		 * Required barriers to make this visible to the table walker
		 * are deferred to the end of alloc_init_cont_pte().
		 */
		__set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot));

		/*
		 * After the PTE entry has been populated once, we
@@ -192,8 +185,6 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,

		phys += PAGE_SIZE;
	} while (ptep++, addr += PAGE_SIZE, addr != end);

	pte_clear_fixmap();
}

static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
@@ -204,6 +195,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
{
	unsigned long next;
	pmd_t pmd = READ_ONCE(*pmdp);
	pte_t *ptep;

	BUG_ON(pmd_sect(pmd));
	if (pmd_none(pmd)) {
@@ -214,10 +206,14 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
			pmdval |= PMD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pte_phys = pgtable_alloc(PAGE_SHIFT);
		ptep = pte_set_fixmap(pte_phys);
		init_clear_pgtable(ptep);
		ptep += pte_index(addr);
		__pmd_populate(pmdp, pte_phys, pmdval);
		pmd = READ_ONCE(*pmdp);
	}
	} else {
		BUG_ON(pmd_bad(pmd));
		ptep = pte_set_fixmap_offset(pmdp, addr);
	}

	do {
		pgprot_t __prot = prot;
@@ -229,20 +225,26 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
		    (flags & NO_CONT_MAPPINGS) == 0)
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pte(pmdp, addr, next, phys, __prot);
		init_pte(ptep, addr, next, phys, __prot);

		ptep += pte_index(next) - pte_index(addr);
		phys += next - addr;
	} while (addr = next, addr != end);

	/*
	 * Note: barriers and maintenance necessary to clear the fixmap slot
	 * ensure that all previous pgtable writes are visible to the table
	 * walker.
	 */
	pte_clear_fixmap();
}

static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot,
		     phys_addr_t (*pgtable_alloc)(int), int flags)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_set_fixmap_offset(pudp, addr);
	do {
		pmd_t old_pmd = READ_ONCE(*pmdp);

@@ -268,8 +270,6 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
		}
		phys += next - addr;
	} while (pmdp++, addr = next, addr != end);

	pmd_clear_fixmap();
}

static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
@@ -279,6 +279,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
{
	unsigned long next;
	pud_t pud = READ_ONCE(*pudp);
	pmd_t *pmdp;

	/*
	 * Check for initial section mappings in the pgd/pud.
@@ -292,10 +293,14 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
			pudval |= PUD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pmd_phys = pgtable_alloc(PMD_SHIFT);
		pmdp = pmd_set_fixmap(pmd_phys);
		init_clear_pgtable(pmdp);
		pmdp += pmd_index(addr);
		__pud_populate(pudp, pmd_phys, pudval);
		pud = READ_ONCE(*pudp);
	}
	} else {
		BUG_ON(pud_bad(pud));
		pmdp = pmd_set_fixmap_offset(pudp, addr);
	}

	do {
		pgprot_t __prot = prot;
@@ -307,10 +312,13 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
		    (flags & NO_CONT_MAPPINGS) == 0)
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
		init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);

		pmdp += pmd_index(next) - pmd_index(addr);
		phys += next - addr;
	} while (addr = next, addr != end);

	pmd_clear_fixmap();
}

static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
@@ -330,12 +338,15 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
			p4dval |= P4D_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pud_phys = pgtable_alloc(PUD_SHIFT);
		pudp = pud_set_fixmap(pud_phys);
		init_clear_pgtable(pudp);
		pudp += pud_index(addr);
		__p4d_populate(p4dp, pud_phys, p4dval);
		p4d = READ_ONCE(*p4dp);
	}
	} else {
		BUG_ON(p4d_bad(p4d));

		pudp = pud_set_fixmap_offset(p4dp, addr);
	}

	do {
		pud_t old_pud = READ_ONCE(*pudp);

@@ -385,12 +396,15 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
			pgdval |= PGD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		p4d_phys = pgtable_alloc(P4D_SHIFT);
		p4dp = p4d_set_fixmap(p4d_phys);
		init_clear_pgtable(p4dp);
		p4dp += p4d_index(addr);
		__pgd_populate(pgdp, p4d_phys, pgdval);
		pgd = READ_ONCE(*pgdp);
	}
	} else {
		BUG_ON(pgd_bad(pgd));

		p4dp = p4d_set_fixmap_offset(pgdp, addr);
	}

	do {
		p4d_t old_p4d = READ_ONCE(*p4dp);

@@ -457,11 +471,10 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,

static phys_addr_t __pgd_pgtable_alloc(int shift)
{
	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL);
	BUG_ON(!ptr);
	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);

	/* Ensure the zeroed page is visible to the page table walker */
	dsb(ishst);
	BUG_ON(!ptr);
	return __pa(ptr);
}