Commit 9eb6207b authored by Frank van der Linden, committed by Andrew Morton
Browse files

mm/sparse: add vmemmap_*_hvo functions

Add a few functions to enable early HVO:

vmemmap_populate_hvo
vmemmap_undo_hvo
vmemmap_wrprotect_hvo

The populate and undo functions are expected to be used in early init,
from the sparse_init_nid_early() function.  The wrprotect function is to
be used, potentially, later.

To implement these functions, mostly re-use the existing compound pages
vmemmap logic used by DAX.  vmemmap_populate_address has its arguments
changed a bit in this commit: the page structure passed in to be reused in
the mapping is replaced by a PFN and a flag.  The flag indicates whether
an extra ref should be taken on the vmemmap page containing the head page
structure.  Taking the ref is appropriate for DAX / ZONE_DEVICE, but
not for HugeTLB HVO.

The HugeTLB vmemmap optimization maps tail page structure pages read-only.
The vmemmap_wrprotect_hvo function that does this is implemented
separately, because it cannot be guaranteed that reserved page structures
will not be write accessed during memory initialization.  Even with
CONFIG_DEFERRED_STRUCT_PAGE_INIT, they might still be written to (if they
are at the bottom of a zone).  So, vmemmap_populate_hvo leaves the tail
page structure pages RW initially, and then later during initialization,
after memmap init is fully done, vmemmap_wrprotect_hvo must be called to
finish the job.

Subsequent commits will use these functions for early HugeTLB HVO.

Link: https://lkml.kernel.org/r/20250228182928.2645936-15-fvdl@google.com


Signed-off-by: Frank van der Linden <fvdl@google.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 14ed3a59
Loading
Loading
Loading
Loading
+8 −1
Original line number Diff line number Diff line
@@ -3937,7 +3937,8 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
			    struct vmem_altmap *altmap, struct page *reuse);
			    struct vmem_altmap *altmap, unsigned long ptpfn,
			    unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
@@ -3953,6 +3954,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
			       int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
		struct vmem_altmap *altmap);
int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
			 unsigned long headsize);
int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node,
		     unsigned long headsize);
void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
			  unsigned long headsize);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
+127 −14
Original line number Diff line number Diff line
@@ -30,6 +30,13 @@

#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

/*
 * Flags for vmemmap_populate_range and friends.
 */
/* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */
#define VMEMMAP_POPULATE_PAGEREF	0x0001

#include "internal.h"

@@ -144,17 +151,18 @@ void __meminit vmemmap_verify(pte_t *pte, int node,

pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
				       struct vmem_altmap *altmap,
				       struct page *reuse)
				       unsigned long ptpfn, unsigned long flags)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);
	if (pte_none(ptep_get(pte))) {
		pte_t entry;
		void *p;

		if (!reuse) {
		if (ptpfn == (unsigned long)-1) {
			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p)
				return NULL;
			ptpfn = PHYS_PFN(__pa(p));
		} else {
			/*
			 * When a PTE/PMD entry is freed from the init_mm
@@ -165,10 +173,10 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
			 * and through vmemmap_populate_compound_pages() when
			 * slab is available.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
			if (flags & VMEMMAP_POPULATE_PAGEREF)
				get_page(pfn_to_page(ptpfn));
		}
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		entry = pfn_pte(ptpfn, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
	}
	return pte;
@@ -238,7 +246,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)

static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
					      struct vmem_altmap *altmap,
					      struct page *reuse)
					      unsigned long ptpfn,
					      unsigned long flags)
{
	pgd_t *pgd;
	p4d_t *p4d;
@@ -258,7 +267,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
	pmd = vmemmap_pmd_populate(pud, addr, node);
	if (!pmd)
		return NULL;
	pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
	pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags);
	if (!pte)
		return NULL;
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -269,13 +278,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
static int __meminit vmemmap_populate_range(unsigned long start,
					    unsigned long end, int node,
					    struct vmem_altmap *altmap,
					    struct page *reuse)
					    unsigned long ptpfn,
					    unsigned long flags)
{
	unsigned long addr = start;
	pte_t *pte;

	for (; addr < end; addr += PAGE_SIZE) {
		pte = vmemmap_populate_address(addr, node, altmap, reuse);
		pte = vmemmap_populate_address(addr, node, altmap,
					       ptpfn, flags);
		if (!pte)
			return -ENOMEM;
	}
@@ -286,7 +297,107 @@ static int __meminit vmemmap_populate_range(unsigned long start,
int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
					 int node, struct vmem_altmap *altmap)
{
	return vmemmap_populate_range(start, end, node, altmap, NULL);
	return vmemmap_populate_range(start, end, node, altmap, -1, 0);
}

/*
 * Tear down an HVO-style vmemmap mapping of [@addr, @end) and map the
 * range again through vmemmap_populate().
 *
 * Needed during memory init when an HVO mapping that was set up earlier
 * has to be reverted, e.g. because a memblock allocated hugetlb page is
 * found to span multiple zones. That can only be checked once the zones
 * have been initialized.
 *
 * Layout that is being undone:
 * 1) The first @headsize / PAGE_SIZE vmemmap pages were each allocated
 *    through memblock and mapped individually.
 *
 * 2) All remaining vmemmap pages are mirrors of the last head page.
 *
 * Returns whatever vmemmap_populate() returns for the range.
 */
int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end,
				      int node, unsigned long headsize)
{
	int nr_head = headsize >> PAGE_SHIFT;
	unsigned long va;
	int i;

	/*
	 * Only meant for early boot, when nothing can be accessing
	 * these page structures yet.
	 */
	WARN_ON(!early_boot_irqs_disabled);

	/*
	 * Tail page structs: these PTEs only mirror a head page, so
	 * dropping the mapping is enough.
	 */
	va = addr + headsize;
	while (va < end) {
		pte_t *ptep = virt_to_kpte(va);

		pte_clear(&init_mm, va, ptep);
		va += PAGE_SIZE;
	}

	/*
	 * Head page and first tail page structs: remember the backing
	 * PFN before clearing the PTE, then hand the page back to
	 * memblock.
	 */
	for (i = 0, va = addr; i < nr_head; i++, va += PAGE_SIZE) {
		pte_t *ptep = virt_to_kpte(va);
		unsigned long head_pfn = pte_pfn(ptep_get(ptep));

		pte_clear(&init_mm, va, ptep);
		memblock_phys_free(PFN_PHYS(head_pfn), PAGE_SIZE);
	}

	/* No stale translations may survive before the range is remapped. */
	flush_tlb_kernel_range(addr, end);

	return vmemmap_populate(addr, end, node, NULL);
}

/*
 * Make the mirrored tail page struct mappings of an HVO range read-only.
 *
 * Walks [@addr + @headsize, @end) one page at a time and write protects
 * each kernel PTE. The leading @headsize bytes (the head vmemmap pages)
 * are left untouched. @node is not used here.
 *
 * This is called from the hugetlb code when it gathers and initializes
 * the memblock allocated gigantic pages. It cannot be done any earlier:
 * there is no guarantee that the reserved page structures are not
 * written to during initialization, even with
 * CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled.
 *
 * The PTEs are known to exist, and nothing else should be touching
 * these pages. No TLB flush is done here; that is up to the caller.
 */
void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
				    int node, unsigned long headsize)
{
	unsigned long va = addr + headsize;

	while (va < end) {
		pte_t *ptep = virt_to_kpte(va);

		ptep_set_wrprotect(&init_mm, va, ptep);
		va += PAGE_SIZE;
	}
}

/*
 * Populate vmemmap pages HVO-style for [@addr, @end).
 *
 * The first @headsize bytes of the range get freshly allocated vmemmap
 * pages (ptpfn == -1 asks vmemmap_populate_address() to allocate). They
 * hold the head page struct and the tail page structs that are needed.
 * Every following vmemmap page is then mapped to the same physical page
 * as the last of those head pages, i.e. it is a mirror of it.
 *
 * No extra page reference is taken on the mirrored page (flags == 0);
 * that is only wanted for ZONE_DEVICE compound pages. The mirrors are
 * not write protected here; see vmemmap_wrprotect_hvo() for that step.
 *
 * Returns 0 on success, -EINVAL if @headsize is 0 (there would be no
 * head page to mirror), or -ENOMEM if a mapping could not be set up.
 */
int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
				       int node, unsigned long headsize)
{
	pte_t *pte;
	unsigned long maddr;

	/*
	 * With no head pages the loop below would not run and 'pte'
	 * would be used uninitialized when picking the page to mirror.
	 */
	if (!headsize)
		return -EINVAL;

	for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
		pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
		if (!pte)
			return -ENOMEM;
	}

	/*
	 * Reuse the last page struct page mapped above for the rest.
	 */
	return vmemmap_populate_range(maddr, end, node, NULL,
					pte_pfn(ptep_get(pte)), 0);
}

void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
@@ -409,7 +520,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
		 * with just tail struct pages.
		 */
		return vmemmap_populate_range(start, end, node, NULL,
					      pte_page(ptep_get(pte)));
					      pte_pfn(ptep_get(pte)),
					      VMEMMAP_POPULATE_PAGEREF);
	}

	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -417,13 +529,13 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
		unsigned long next, last = addr + size;

		/* Populate the head page vmemmap page */
		pte = vmemmap_populate_address(addr, node, NULL, NULL);
		pte = vmemmap_populate_address(addr, node, NULL, -1, 0);
		if (!pte)
			return -ENOMEM;

		/* Populate the tail pages vmemmap page */
		next = addr + PAGE_SIZE;
		pte = vmemmap_populate_address(next, node, NULL, NULL);
		pte = vmemmap_populate_address(next, node, NULL, -1, 0);
		if (!pte)
			return -ENOMEM;

@@ -433,7 +545,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
		 */
		next += PAGE_SIZE;
		rc = vmemmap_populate_range(next, last, node, NULL,
					    pte_page(ptep_get(pte)));
					    pte_pfn(ptep_get(pte)),
					    VMEMMAP_POPULATE_PAGEREF);
		if (rc)
			return -ENOMEM;
	}