mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/
synced 2026-04-05 00:07:48 -04:00
Patch series "mm: folio_zero_user: clear page ranges", v11.
This series adds clearing of contiguous page ranges for hugepages.
The series improves on the current discontiguous clearing approach in two
ways:
- clear pages in a contiguous fashion.
- use batched clearing via clear_pages() wherever exposed.
The first is useful because it allows us to make much better use of
hardware prefetchers.
The second enables advertising the real extent to the processor. Where
specific instructions support it (ex. string instructions on x86; "mops"
on arm64 etc), a processor can optimize based on this because, instead of
seeing a sequence of 8-byte stores, or a sequence of 4KB pages, it sees a
larger unit being operated on.
For instance, AMD Zen uarchs (for extents larger than LLC-size) switch to
a mode where they start eliding cacheline allocation. This is helpful not
just because it results in higher bandwidth, but also because now the
cache is not evicting useful cachelines and replacing them with zeroes.
Demand faulting a 64GB region shows performance improvement:
$ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5
baseline +series
(GBps +- %stdev) (GBps +- %stdev)
pg-sz=2MB 11.76 +- 1.10% 25.34 +- 1.18% [*] +115.47% preempt=*
pg-sz=1GB 24.85 +- 2.41% 39.22 +- 2.32% + 57.82% preempt=none|voluntary
pg-sz=1GB (similar) 52.73 +- 0.20% [#] +112.19% preempt=full|lazy
[*] This improvement is because switching to sequential clearing
allows the hardware prefetchers to do a much better job.
[#] For pg-sz=1GB a large part of the improvement is because of the
cacheline elision mentioned above. preempt=full|lazy improves upon
that because, not needing explicit invocations of cond_resched() to
ensure reasonable preemption latency, it can clear the full extent
as a single unit. In comparison the maximum extent used for
preempt=none|voluntary is PROCESS_PAGES_NON_PREEMPT_BATCH (32MB).
When provided the full extent the processor forgoes allocating
cachelines on this path almost entirely.
(The hope is that eventually, in the fullness of time, the lazy
preemption model will be able to do the same job that none or
voluntary models are used for, allowing us to do away with
cond_resched().)
Raghavendra also tested previous version of the series on AMD Genoa and
sees similar improvement [1] with preempt=lazy.
$ perf bench mem mmap -p $page-size -f populate -s 64GB -l 10
base patched change
pg-sz=2MB 12.731939 GB/sec 26.304263 GB/sec 106.6%
pg-sz=1GB 26.232423 GB/sec 61.174836 GB/sec 133.2%
This patch (of 8):
Let's drop all variants that effectively map to clear_page() and provide
it in a generic variant instead.
We'll use the macro clear_user_page to indicate whether an architecture
provides its own variant.
Also, clear_user_page() is only called from the generic variant of
clear_user_highpage(), so define it only if the architecture does not
provide a clear_user_highpage(). And, for simplicity define it in
linux/highmem.h.
Note that for parisc, clear_page() and clear_user_page() map to
clear_page_asm(), so we can just get rid of the custom clear_user_page()
implementation. There is a clear_user_page_asm() function on parisc, that
seems to be unused. Not sure what's up with that.
Link: https://lkml.kernel.org/r/20260107072009.1615991-1-ankur.a.arora@oracle.com
Link: https://lkml.kernel.org/r/20260107072009.1615991-2-ankur.a.arora@oracle.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Ankur Arora <ankur.a.arora@oracle.com>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ankur Arora <ankur.a.arora@oracle.com>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Raghavendra K T <raghavendra.kt@amd.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
180 lines
5.0 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _PARISC_PAGE_H
#define _PARISC_PAGE_H

#include <linux/const.h>
#include <vdso/page.h>

#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA

#ifndef __ASSEMBLER__

#include <asm/types.h>
#include <asm/cache.h>

/* Whole-page clear/copy are provided by assembly routines. */
#define clear_page(page)	clear_page_asm((void *)(page))
#define copy_page(to, from)	copy_page_asm((void *)(to), (void *)(from))

struct page;
struct vm_area_struct;

void clear_page_asm(void *page);
void copy_page_asm(void *to, void *from);
void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr,
			struct vm_area_struct *vma);
#define __HAVE_ARCH_COPY_USER_HIGHPAGE
/*
|
|
* These are used to make use of C type-checking..
|
|
*/
|
|
#define STRICT_MM_TYPECHECKS
|
|
#ifdef STRICT_MM_TYPECHECKS
|
|
typedef struct { unsigned long pte; } pte_t; /* either 32 or 64bit */
|
|
|
|
/* NOTE: even on 64 bits, these entries are __u32 because we allocate
|
|
* the pmd and pgd in ZONE_DMA (i.e. under 4GB) */
|
|
typedef struct { __u32 pgd; } pgd_t;
|
|
typedef struct { unsigned long pgprot; } pgprot_t;
|
|
|
|
#if CONFIG_PGTABLE_LEVELS == 3
|
|
typedef struct { __u32 pmd; } pmd_t;
|
|
#define __pmd(x) ((pmd_t) { (x) } )
|
|
/* pXd_val() do not work as lvalues, so make sure we don't use them as such. */
|
|
#define pmd_val(x) ((x).pmd + 0)
|
|
#endif
|
|
|
|
#define pte_val(x) ((x).pte)
|
|
#define pgd_val(x) ((x).pgd + 0)
|
|
#define pgprot_val(x) ((x).pgprot)
|
|
|
|
#define __pte(x) ((pte_t) { (x) } )
|
|
#define __pgd(x) ((pgd_t) { (x) } )
|
|
#define __pgprot(x) ((pgprot_t) { (x) } )
|
|
|
|
#else
|
|
/*
|
|
* .. while these make it easier on the compiler
|
|
*/
|
|
typedef unsigned long pte_t;
|
|
|
|
#if CONFIG_PGTABLE_LEVELS == 3
|
|
typedef __u32 pmd_t;
|
|
#define pmd_val(x) (x)
|
|
#define __pmd(x) (x)
|
|
#endif
|
|
|
|
typedef __u32 pgd_t;
|
|
typedef unsigned long pgprot_t;
|
|
|
|
#define pte_val(x) (x)
|
|
#define pgd_val(x) (x)
|
|
#define pgprot_val(x) (x)
|
|
|
|
#define __pte(x) (x)
|
|
#define __pgd(x) (x)
|
|
#define __pgprot(x) (x)
|
|
|
|
#endif /* STRICT_MM_TYPECHECKS */
|
|
|
|
#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
#if CONFIG_PGTABLE_LEVELS == 3
#define set_pud(pudptr, pudval) (*(pudptr) = (pudval))
#endif

typedef struct page *pgtable_t;

/* A contiguous range of physical memory, as discovered at boot. */
typedef struct __physmem_range {
	unsigned long start_pfn;
	unsigned long pages;	/* PAGE_SIZE pages */
} physmem_range_t;

extern physmem_range_t pmem_ranges[];
extern int npmem_ranges;

#endif /* !__ASSEMBLER__ */
/* WARNING: The definitions below must match exactly to sizeof(pte_t)
 * etc
 */
#ifdef CONFIG_64BIT
#define BITS_PER_PTE_ENTRY	3
#define BITS_PER_PMD_ENTRY	2
#define BITS_PER_PGD_ENTRY	2
#else
#define BITS_PER_PTE_ENTRY	2
#define BITS_PER_PMD_ENTRY	2
#define BITS_PER_PGD_ENTRY	2
#endif
#define PGD_ENTRY_SIZE	(1UL << BITS_PER_PGD_ENTRY)
#define PMD_ENTRY_SIZE	(1UL << BITS_PER_PMD_ENTRY)
#define PTE_ENTRY_SIZE	(1UL << BITS_PER_PTE_ENTRY)

#define LINUX_GATEWAY_SPACE	0

/* This governs the relationship between virtual and physical addresses.
 * If you alter it, make sure to take care of our various fixed mapping
 * segments in fixmap.h */
#ifdef CONFIG_64BIT
#define __PAGE_OFFSET_DEFAULT	(0x40000000)	/* 1GB */
#else
#define __PAGE_OFFSET_DEFAULT	(0x10000000)	/* 256MB */
#endif

#if defined(BOOTLOADER)
#define __PAGE_OFFSET	(0)	/* bootloader uses physical addresses */
#else
#define __PAGE_OFFSET	__PAGE_OFFSET_DEFAULT
#endif /* BOOTLOADER */

#define PAGE_OFFSET	((unsigned long)__PAGE_OFFSET)

/* The size of the gateway page (we leave lots of room for expansion) */
#define GATEWAY_PAGE_SIZE	0x4000

/* The start of the actual kernel binary---used in vmlinux.lds.S
 * Leave some space after __PAGE_OFFSET for detecting kernel null
 * ptr derefs */
#define KERNEL_BINARY_TEXT_START	(__PAGE_OFFSET + 0x100000)

/* These macros don't work for 64-bit C code -- don't allow in C at all */
#ifdef __ASSEMBLER__
#	define PA(x)	((x)-__PAGE_OFFSET)
#	define VA(x)	((x)+__PAGE_OFFSET)
#endif
#define __pa(x)	((unsigned long)(x)-PAGE_OFFSET)
#define __va(x)	((void *)((unsigned long)(x)+PAGE_OFFSET))
#ifdef CONFIG_HUGETLB_PAGE
#define HPAGE_SHIFT		PMD_SHIFT /* fixed for transparent huge pages */
#define HPAGE_SIZE		((1UL) << HPAGE_SHIFT)
#define HPAGE_MASK		(~(HPAGE_SIZE - 1))
#define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)

#if defined(CONFIG_64BIT) && defined(CONFIG_PARISC_PAGE_SIZE_4KB)
# define REAL_HPAGE_SHIFT	20 /* 20 = 1MB */
# define _HUGE_PAGE_SIZE_ENCODING_DEFAULT _PAGE_SIZE_ENCODING_1M
#elif !defined(CONFIG_64BIT) && defined(CONFIG_PARISC_PAGE_SIZE_4KB)
# define REAL_HPAGE_SHIFT	22 /* 22 = 4MB */
# define _HUGE_PAGE_SIZE_ENCODING_DEFAULT _PAGE_SIZE_ENCODING_4M
#else
# define REAL_HPAGE_SHIFT	24 /* 24 = 16MB */
# define _HUGE_PAGE_SIZE_ENCODING_DEFAULT _PAGE_SIZE_ENCODING_16M
#endif
#endif /* CONFIG_HUGETLB_PAGE */
#define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)

#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)

#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
#include <asm/pdc.h>

#define PAGE0	((struct zeropage *)absolute_pointer(__PAGE_OFFSET))

/* DEFINITION OF THE ZERO-PAGE (PAG0) */
/* based on work by Jason Eckhardt (jason@equator.com) */

#endif /* _PARISC_PAGE_H */