Commit fa93b45f authored by Dev Jain, committed by Will Deacon
Browse files

arm64: Enable vmalloc-huge with ptdump

Our goal is to move towards enabling vmalloc-huge by default on arm64 so
as to reduce TLB pressure. Therefore, we need a way to analyze the portion
of block mappings in vmalloc space we can get on a production system; this
can be done through ptdump, but currently we disable vmalloc-huge if
CONFIG_PTDUMP_DEBUGFS is on. The reason is that lazy freeing of kernel
pagetables via vmap_try_huge_pxd() may race with ptdump, so ptdump
may dereference a bogus address.

To solve this, we need to synchronize ptdump_walk() and ptdump_check_wx()
with pud_free_pmd_page() and pmd_free_pte_page().

Since this race is very unlikely to happen in practice, we do not want to
penalize the vmalloc pagetable tearing path by taking the init_mm
mmap_lock. Therefore, we use static keys. ptdump_walk() and
ptdump_check_wx() are the pagetable walkers; they will enable the static
key - upon observing that, the vmalloc pagetable tearing path will get
patched in with an mmap_read_lock/unlock sequence. A combination of the
patched-in mmap_read_lock/unlock, the acquire semantics of
static_branch_inc(), and the barriers in __flush_tlb_kernel_pgtable()
ensures that ptdump will never get a hold on the address of a freed PMD
or PTE table.

We can verify the correctness of the algorithm via the following litmus
test (thanks to James Houghton and Will Deacon):

AArch64 ptdump
Variant=Ifetch
{
uint64_t pud=0xa110c;
uint64_t pmd;

0:X0=label:"P1:L0"; 0:X1=instr:"NOP"; 0:X2=lock; 0:X3=pud; 0:X4=pmd;
                    1:X1=0xdead;      1:X2=lock; 1:X3=pud; 1:X4=pmd;
}
 P0				| P1				;
 (* static_key_enable *)	| (* pud_free_pmd_page *)	;
 STR	W1, [X0]		| LDR	X9, [X3]		;
 DC	CVAU,X0			| STR	XZR, [X3]		;
 DSB	ISH			| DSB	ISH			;
 IC	IVAU,X0			| ISB				;
 DSB	ISH			|				;
 ISB				| (* static key *)		;
				| L0:				;
 (* mmap_lock *)		| B	out1			;
 Lwlock:			|				;
 MOV	W7, #1			| (* mmap_lock *)		;
 SWPA	W7, W8, [X2]		| Lrlock:			;
				| MOV	W7, #1			;
				| SWPA	W7, W8, [X2]		;
 (* walk pgtable *)		|				;
 LDR	X9, [X3]		| (* mmap_unlock *)		;
 CBZ	X9, out0		| STLR	WZR, [X2]		;
 EOR	X10, X9, X9		|				;
 LDR	X11, [X4, X10]		| out1:				;
				| EOR	X10, X9, X9		;
 out0:				| STR	X1, [X4, X10]		;

exists (0:X8=0 /\ 1:X8=0 /\	(* Lock acquisitions succeed *)
	0:X9=0xa110c /\		(* P0 sees the valid PUD ...*)
	0:X11=0xdead)		(* ... but the freed PMD *)

For an approximate written proof of why this algorithm works, please read
the code comment in [1], which is now removed for the sake of simplicity.

mm-selftests pass. No issues were observed while running test_vmalloc.sh
(which stresses the vmalloc subsystem) in parallel with
cat /sys/kernel/debug/{kernel_page_tables, check_wx_pages} in a loop.

Link: https://lore.kernel.org/all/20250723161827.15802-1-dev.jain@arm.com/ [1]
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
parent 3df6979d
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -7,6 +7,8 @@

#include <linux/ptdump.h>

/*
 * Static key (initially false) used to synchronize ptdump with the kernel
 * page-table freeing paths. The declaration sits ahead of the CONFIG_PTDUMP
 * guard below, so it is visible whether or not ptdump is built.
 */
DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);

#ifdef CONFIG_PTDUMP

#include <linux/mm_types.h>
+2 −7
Original line number Diff line number Diff line
@@ -9,18 +9,13 @@
#define arch_vmap_pud_supported arch_vmap_pud_supported
/*
 * Report whether vmalloc may install PUD-level block mappings.
 *
 * The answer depends only on pud_sect_supported(). The earlier
 * CONFIG_PTDUMP_DEBUGFS restriction no longer applies: ptdump now
 * synchronizes with the page-table freeing paths through
 * arm64_ptdump_lock_key, so it cannot walk into a freed table.
 *
 * @prot is not consulted.
 */
static inline bool arch_vmap_pud_supported(pgprot_t prot)
{
	return pud_sect_supported();
}

#define arch_vmap_pmd_supported arch_vmap_pmd_supported
/*
 * Report whether vmalloc may install PMD-level block mappings.
 *
 * Always true. The earlier CONFIG_PTDUMP_DEBUGFS restriction no longer
 * applies: ptdump now synchronizes with the page-table freeing paths through
 * arm64_ptdump_lock_key, so it cannot walk into a freed table.
 *
 * @prot is not consulted.
 */
static inline bool arch_vmap_pmd_supported(pgprot_t prot)
{
	return true;
}

#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
+39 −4
Original line number Diff line number Diff line
@@ -56,6 +56,8 @@ enum pgtable_type {
	TABLE_P4D,
};

/*
 * Raised (static_branch_inc) for the duration of a ptdump walk and dropped
 * afterwards. While it is enabled, the page-table freeing paths take and
 * release init_mm's mmap read lock after clearing an entry and flushing the
 * TLB, and before freeing the table that entry pointed to.
 */
DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);

u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);

@@ -1665,7 +1667,8 @@ int pmd_clear_huge(pmd_t *pmdp)
	return 1;
}

int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr,
			       bool acquire_mmap_lock)
{
	pte_t *table;
	pmd_t pmd;
@@ -1677,13 +1680,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
		return 1;
	}

	/* See comment in pud_free_pmd_page for static key logic */
	table = pte_offset_kernel(pmdp, addr);
	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(addr);
	if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) {
		mmap_read_lock(&init_mm);
		mmap_read_unlock(&init_mm);
	}

	pte_free_kernel(NULL, table);
	return 1;
}

/*
 * Entry point for freeing the PTE table behind @pmdp.
 *
 * Delegates to __pmd_free_pte_page() and asks it to synchronize with ptdump:
 * when arm64_ptdump_lock_key is enabled, the helper takes and releases
 * init_mm's mmap read lock before the table is freed.
 *
 * Returns whatever __pmd_free_pte_page() returns.
 */
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
	const bool acquire_mmap_lock = true;

	return __pmd_free_pte_page(pmdp, addr, acquire_mmap_lock);
}

int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
	pmd_t *table;
@@ -1699,16 +1714,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
	}

	table = pmd_offset(pudp, addr);

	/*
	 * Our objective is to prevent ptdump from reading a PMD table which has
	 * been freed. In this race, if pud_free_pmd_page observes the key on
	 * (which got flipped by ptdump) then the mmap lock sequence here will,
	 * as a result of the mmap write lock/unlock sequence in ptdump, give
	 * us the correct synchronization. If not, this means that ptdump has
	 * not yet started walking the pagetables - the sequence of barriers
	 * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will
	 * observe an empty PUD.
	 */
	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);
	if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
		mmap_read_lock(&init_mm);
		mmap_read_unlock(&init_mm);
	}

	pmdp = table;
	next = addr;
	end = addr + PUD_SIZE;
	do {
		if (pmd_present(pmdp_get(pmdp)))
			pmd_free_pte_page(pmdp, next);
			/*
			 * PMD has been isolated, so ptdump won't see it. No
			 * need to acquire init_mm.mmap_lock.
			 */
			__pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false);
	} while (pmdp++, next += PMD_SIZE, next != end);

	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);
	pmd_free(NULL, table);
	return 1;
}
+9 −2
Original line number Diff line number Diff line
@@ -283,6 +283,13 @@ void note_page_flush(struct ptdump_state *pt_st)
	note_page(pt_st, 0, -1, pte_val(pte_zero));
}

/*
 * Walk the page tables of @mm for ptdump with arm64_ptdump_lock_key enabled.
 *
 * @st: ptdump walker state handed straight to ptdump_walk_pgd().
 * @mm: address space whose page tables are walked.
 *
 * The key is raised before the walk and dropped after it, so that page-table
 * freeing paths which observe the key take init_mm's mmap read lock and
 * thereby serialize against the walk.
 * NOTE(review): this relies on ptdump_walk_pgd() taking the mmap write lock
 * on @mm, which is not visible here - confirm against its definition.
 */
static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm)
{
	/* Must happen before the walk starts; do not reorder. */
	static_branch_inc(&arm64_ptdump_lock_key);
	ptdump_walk_pgd(st, mm, NULL);
	/* Walk finished: drop our reference on the key. */
	static_branch_dec(&arm64_ptdump_lock_key);
}

void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
{
	unsigned long end = ~0UL;
@@ -311,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
		}
	};

	ptdump_walk_pgd(&st.ptdump, info->mm, NULL);
	arm64_ptdump_walk_pgd(&st.ptdump, info->mm);
}

static void __init ptdump_initialize(void)
@@ -353,7 +360,7 @@ bool ptdump_check_wx(void)
		}
	};

	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
	arm64_ptdump_walk_pgd(&st.ptdump, &init_mm);

	if (st.wx_pages || st.uxn_pages) {
		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",