mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
synced 2026-04-04 20:57:45 -04:00
Patch series "mm: folio_zero_user: clear page ranges", v11.
This series adds clearing of contiguous page ranges for hugepages.
The series improves on the current discontiguous clearing approach in two
ways:
- clear pages in a contiguous fashion.
- use batched clearing via clear_pages() wherever exposed.
The first is useful because it allows us to make much better use of
hardware prefetchers.
The second enables advertising the real extent to the processor. Where
specific instructions support it (ex. string instructions on x86; "mops"
on arm64 etc), a processor can optimize based on this because, instead of
seeing a sequence of 8-byte stores, or a sequence of 4KB pages, it sees a
larger unit being operated on.
For instance, AMD Zen uarchs (for extents larger than LLC-size) switch to
a mode where they start eliding cacheline allocation. This is helpful not
just because it results in higher bandwidth, but also because now the
cache is not evicting useful cachelines and replacing them with zeroes.
Demand faulting a 64GB region shows performance improvement:
$ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5
baseline +series
(GBps +- %stdev) (GBps +- %stdev)
pg-sz=2MB 11.76 +- 1.10% 25.34 +- 1.18% [*] +115.47% preempt=*
pg-sz=1GB 24.85 +- 2.41% 39.22 +- 2.32% + 57.82% preempt=none|voluntary
pg-sz=1GB (similar) 52.73 +- 0.20% [#] +112.19% preempt=full|lazy
[*] This improvement is because switching to sequential clearing
allows the hardware prefetchers to do a much better job.
[#] For pg-sz=1GB a large part of the improvement is because of the
cacheline elision mentioned above. preempt=full|lazy improves upon
that because, not needing explicit invocations of cond_resched() to
ensure reasonable preemption latency, it can clear the full extent
as a single unit. In comparison the maximum extent used for
preempt=none|voluntary is PROCESS_PAGES_NON_PREEMPT_BATCH (32MB).
When provided the full extent the processor forgoes allocating
cachelines on this path almost entirely.
(The hope is that eventually, in the fullness of time, the lazy
preemption model will be able to do the same job that none or
voluntary models are used for, allowing us to do away with
cond_resched().)
Raghavendra also tested a previous version of the series on AMD Genoa and
sees similar improvement [1] with preempt=lazy.
$ perf bench mem mmap -p $page-size -f populate -s 64GB -l 10
base patched change
pg-sz=2MB 12.731939 GB/sec 26.304263 GB/sec 106.6%
pg-sz=1GB 26.232423 GB/sec 61.174836 GB/sec 133.2%
This patch (of 8):
Let's drop all variants that effectively map to clear_page() and provide
it in a generic variant instead.
We'll use the macro clear_user_page to indicate whether an architecture
provides its own variant.
Also, clear_user_page() is only called from the generic variant of
clear_user_highpage(), so define it only if the architecture does not
provide a clear_user_highpage(). And, for simplicity define it in
linux/highmem.h.
Note that for parisc, clear_page() and clear_user_page() map to
clear_page_asm(), so we can just get rid of the custom clear_user_page()
implementation. There is a clear_user_page_asm() function on parisc, that
seems to be unused. Not sure what's up with that.
Link: https://lkml.kernel.org/r/20260107072009.1615991-1-ankur.a.arora@oracle.com
Link: https://lkml.kernel.org/r/20260107072009.1615991-2-ankur.a.arora@oracle.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Ankur Arora <ankur.a.arora@oracle.com>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ankur Arora <ankur.a.arora@oracle.com>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konrad Rzessutek Wilk <konrad.wilk@oracle.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Raghavendra K T <raghavendra.kt@amd.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
214 lines
5.9 KiB
C
214 lines
5.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* Copyright (C) 2009 Chen Liqin <liqin.chen@sunplusct.com>
|
|
* Copyright (C) 2012 Regents of the University of California
|
|
* Copyright (C) 2017 SiFive
|
|
* Copyright (C) 2017 XiaojingZhu <zhuxiaoj@ict.ac.cn>
|
|
*/
|
|
|
|
#ifndef _ASM_RISCV_PAGE_H
|
|
#define _ASM_RISCV_PAGE_H
|
|
|
|
#include <linux/pfn.h>
|
|
#include <linux/const.h>
|
|
|
|
#include <vdso/page.h>
|
|
|
|
/*
 * Huge page geometry.  HPAGE_SHIFT is taken from PMD_SHIFT; the size, the
 * mask and the order (relative to PAGE_SHIFT) are all derived from it.
 */
#define HPAGE_SHIFT		PMD_SHIFT
#define HPAGE_SIZE		(_AC(1, UL) << HPAGE_SHIFT)
#define HPAGE_MASK		(~(HPAGE_SIZE - 1))
#define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
|
|
|
|
/*
 * PAGE_OFFSET -- the first address of the first page of memory.
 *
 *  - no MMU:       the first free page in physical memory (aligned on a
 *                  page boundary), i.e. phys_ram_base.
 *  - MMU, 32-bit:  a fixed constant.
 *  - MMU, 64-bit:  PAGE_OFFSET_L3 for XIP kernels, otherwise the value held
 *                  in kernel_map.page_offset.  PAGE_OFFSET_L3/L4/L5 are the
 *                  candidate constants (presumably one per page-table
 *                  depth -- confirm against the code that picks one).
 */
#if !defined(CONFIG_MMU)
#define PAGE_OFFSET		((unsigned long)phys_ram_base)
#elif !defined(CONFIG_64BIT)
#define PAGE_OFFSET		_AC(0xc0000000, UL)
#else /* CONFIG_MMU && CONFIG_64BIT */
#define PAGE_OFFSET_L5		_AC(0xff60000000000000, UL)
#define PAGE_OFFSET_L4		_AC(0xffffaf8000000000, UL)
#define PAGE_OFFSET_L3		_AC(0xffffffd600000000, UL)
#ifdef CONFIG_XIP_KERNEL
#define PAGE_OFFSET		PAGE_OFFSET_L3
#else
#define PAGE_OFFSET		kernel_map.page_offset
#endif /* CONFIG_XIP_KERNEL */
#endif /* CONFIG_MMU / CONFIG_64BIT */
|
|
|
|
#ifndef __ASSEMBLER__
|
|
|
|
/*
 * clear_page(): without CONFIG_RISCV_ISA_ZICBOZ this is a memset() of
 * PAGE_SIZE zero bytes at @pgaddr; with it, an out-of-line implementation
 * (defined outside this header) is declared instead.
 */
#ifndef CONFIG_RISCV_ISA_ZICBOZ
#define clear_page(pgaddr)	memset((pgaddr), 0, PAGE_SIZE)
#else
void clear_page(void *page);
#endif
|
|
/* Copy PAGE_SIZE bytes from @from to @to. */
#define copy_page(to, from)	memcpy((to), (from), PAGE_SIZE)

/*
 * User-page variant: also a plain PAGE_SIZE memcpy().  @vaddr and @topg are
 * accepted for interface compatibility but are not used.
 */
#define copy_user_page(vto, vfrom, vaddr, topg)	\
	memcpy((vto), (vfrom), PAGE_SIZE)
|
|
|
|
/*
 * Each kind of page-table value is wrapped in its own single-member struct
 * so that C type checking keeps them from being mixed up.
 */

/* Page Global Directory entry */
typedef struct { unsigned long pgd; } pgd_t;

/* Page Table entry */
typedef struct { unsigned long pte; } pte_t;

/* Page protection value */
typedef struct { unsigned long pgprot; } pgprot_t;

typedef struct page *pgtable_t;
|
|
|
|
/* Unwrap: read the raw unsigned long out of a typed page-table value. */
#define pte_val(x)	((x).pte)
#define pgd_val(x)	((x).pgd)
#define pgprot_val(x)	((x).pgprot)

/* Wrap: build the typed value from a raw one via a compound literal. */
#define __pte(x)	((pte_t) { (x) })
#define __pgd(x)	((pgd_t) { (x) })
#define __pgprot(x)	((pgprot_t) { (x) })
|
|
|
|
/*
 * printf-style conversion for a page-table entry: zero-padded hex, 16
 * digits on 64-bit builds and 8 digits otherwise.
 */
#ifndef CONFIG_64BIT
#define PTE_FMT		"%08lx"
#else
#define PTE_FMT		"%016lx"
#endif
|
|
|
|
#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
/*
 * Override the generic MIN_MEMBLOCK_ADDR: that definition relies on __pa(),
 * which is used too early in boot -- before kernel_map.va_pa_offset has
 * been set.
 */
#define MIN_MEMBLOCK_ADDR	0
#endif
|
|
|
|
/* phys_ram_base converted to a page frame number with PFN_DOWN(). */
#define ARCH_PFN_OFFSET (PFN_DOWN((unsigned long)phys_ram_base))
|
|
|
|
/*
 * Layout record consulted by the address-translation macros in this header.
 * The set of trailing members depends on CONFIG_XIP_KERNEL.
 */
struct kernel_mapping {
	/* is_kernel_mapping() tests [virt_addr, virt_addr + size) */
	unsigned long virt_addr;
	unsigned long virt_offset;
	uintptr_t phys_addr;
	uintptr_t size;
	/* Offset between linear mapping virtual address and kernel load address */
	unsigned long va_pa_offset;
	/* Offset between kernel mapping virtual address and kernel load address */
#ifndef CONFIG_XIP_KERNEL
	unsigned long page_offset;	/* value of PAGE_OFFSET on 64-bit MMU builds */
	unsigned long va_kernel_pa_offset;
#else
	/* XIP keeps separate offsets for the text and data parts */
	unsigned long va_kernel_xip_text_pa_offset;
	unsigned long va_kernel_xip_data_pa_offset;
	uintptr_t xiprom;
	uintptr_t xiprom_sz;
#endif
};
|
|
|
|
extern struct kernel_mapping kernel_map;
|
|
extern phys_addr_t phys_ram_base;
|
|
extern unsigned long vmemmap_start_pfn;
|
|
|
|
/*
 * True when @x lies in [kernel_map.virt_addr, kernel_map.virt_addr +
 * kernel_map.size).  @x is expanded twice, so pass a side-effect-free
 * expression.
 */
#define is_kernel_mapping(x)					\
	((x) >= kernel_map.virt_addr &&				\
	 (x) < (kernel_map.virt_addr + kernel_map.size))
|
|
|
|
/*
 * True when @x is at or above PAGE_OFFSET and -- on 64-bit builds only --
 * also below PAGE_OFFSET + KERN_VIRT_SIZE.  @x may be expanded twice.
 */
#define is_linear_mapping(x)					\
	((x) >= PAGE_OFFSET &&					\
	 (!IS_ENABLED(CONFIG_64BIT) ||				\
	  (x) < PAGE_OFFSET + KERN_VIRT_SIZE))
|
|
|
|
/*
 * Physical -> linear-mapping virtual address: adds kernel_map.va_pa_offset.
 * CONFIG_DEBUG_VIRTUAL builds use an out-of-line function in place of the
 * macro.
 */
#ifdef CONFIG_DEBUG_VIRTUAL
void *linear_mapping_pa_to_va(unsigned long x);
#else
#define linear_mapping_pa_to_va(x)	\
	((void *)((unsigned long)(x) + kernel_map.va_pa_offset))
#endif
|
|
|
|
/*
 * Physical -> kernel-mapping virtual address.
 *
 * Normally a single offset (kernel_map.va_kernel_pa_offset) is added.  XIP
 * kernels pick the offset by address: below phys_ram_base the text offset
 * applies, otherwise the data offset.
 */
#ifndef CONFIG_XIP_KERNEL
#define kernel_mapping_pa_to_va(y)	\
	((void *)((unsigned long)(y) + kernel_map.va_kernel_pa_offset))
#else
#define kernel_mapping_pa_to_va(y)	({					\
	unsigned long _y = (unsigned long)(y);					\
	(_y < phys_ram_base) ?							\
		(void *)(_y + kernel_map.va_kernel_xip_text_pa_offset) :	\
		(void *)(_y + kernel_map.va_kernel_xip_data_pa_offset);		\
})
#endif
|
|
|
|
/* Physical-to-virtual helper used by __va(): just the linear-mapping conversion. */
#define __pa_to_va_nodebug(x) linear_mapping_pa_to_va(x)
|
|
|
|
/*
 * Linear-mapping virtual -> physical address: subtracts
 * kernel_map.va_pa_offset.  CONFIG_DEBUG_VIRTUAL builds use an out-of-line
 * function in place of the macro.
 */
#ifdef CONFIG_DEBUG_VIRTUAL
phys_addr_t linear_mapping_va_to_pa(unsigned long x);
#else
#define linear_mapping_va_to_pa(x)	\
	((unsigned long)(x) - kernel_map.va_pa_offset)
#endif
|
|
|
|
/*
 * Kernel-mapping virtual -> physical address.
 *
 * Normally a single offset (kernel_map.va_kernel_pa_offset) is subtracted.
 * XIP kernels pick the offset by address: below kernel_map.virt_addr +
 * kernel_map.xiprom_sz the text offset applies, otherwise the data offset.
 */
#ifndef CONFIG_XIP_KERNEL
#define kernel_mapping_va_to_pa(y)	\
	((unsigned long)(y) - kernel_map.va_kernel_pa_offset)
#else
#define kernel_mapping_va_to_pa(y)	({					\
	unsigned long _y = (unsigned long)(y);					\
	(_y < kernel_map.virt_addr + kernel_map.xiprom_sz) ?			\
		(_y - kernel_map.va_kernel_xip_text_pa_offset) :		\
		(_y - kernel_map.va_kernel_xip_data_pa_offset);			\
})
#endif
|
|
|
|
/*
 * Virtual -> physical translation.  @x is captured once in a local; an
 * address accepted by is_linear_mapping() is converted with the linear
 * offset, anything else is treated as a kernel-mapping address.
 */
#define __va_to_pa_nodebug(x)	({						\
	unsigned long _x = (x);							\
	is_linear_mapping(_x) ?							\
		linear_mapping_va_to_pa(_x) :					\
		kernel_mapping_va_to_pa(_x);					\
})
|
|
|
|
/*
 * __virt_to_phys() / __phys_addr_symbol(): without CONFIG_DEBUG_VIRTUAL
 * both are aliases of __va_to_pa_nodebug(); with it they are out-of-line
 * functions defined elsewhere.
 */
#ifndef CONFIG_DEBUG_VIRTUAL
#define __virt_to_phys(x)	__va_to_pa_nodebug(x)
#define __phys_addr_symbol(x)	__va_to_pa_nodebug(x)
#else
extern phys_addr_t __virt_to_phys(unsigned long x);
extern phys_addr_t __phys_addr_symbol(unsigned long x);
#endif /* !CONFIG_DEBUG_VIRTUAL */
|
|
|
|
/*
 * Address-conversion entry points:
 *   __pa_symbol(x) - physical address of a symbol; @x goes through
 *                    RELOC_HIDE() before translation
 *   __pa(x)        - physical address of a virtual address
 *   __va(x)        - virtual address (void *) of a physical address
 */
#define __pa_symbol(x)	__phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0))
#define __pa(x)		__virt_to_phys((unsigned long)(x))
#define __va(x)		((void *)__pa_to_va_nodebug((phys_addr_t)(x)))
|
|
|
|
/* Physical address <-> page frame number. */
#define phys_to_pfn(phys)	(PFN_DOWN(phys))
#define pfn_to_phys(pfn)	(PFN_PHYS(pfn))

/* Virtual address <-> page frame number, by way of the physical address. */
#define virt_to_pfn(vaddr)	(phys_to_pfn(__pa(vaddr)))
#define pfn_to_virt(pfn)	(__va(pfn_to_phys(pfn)))

/* Virtual address <-> struct page, by way of the page frame number. */
#define virt_to_page(vaddr)	(pfn_to_page(virt_to_pfn(vaddr)))
#define page_to_virt(page)	(pfn_to_virt(page_to_pfn(page)))

/* Page frame number of a symbol's physical address. */
#define sym_to_pfn(x)		__phys_to_pfn(__pa_symbol(x))
|
|
|
|
/*
 * Declared here, defined elsewhere.  Presumably reports the KASLR
 * displacement applied to the kernel -- confirm against the definition.
 */
unsigned long kaslr_offset(void);
|
|
|
|
static __always_inline void *pfn_to_kaddr(unsigned long pfn)
|
|
{
|
|
return __va(pfn << PAGE_SHIFT);
|
|
}
|
|
|
|
#endif /* __ASSEMBLER__ */
|
|
|
|
/*
 * virt_addr_valid - check a virtual address against PAGE_OFFSET and
 * pfn_valid().
 * @vaddr: address to test; a pointer or an integer expression.
 *
 * Evaluates to non-zero when @vaddr is at or above PAGE_OFFSET and the
 * page frame number obtained via virt_to_pfn() satisfies pfn_valid().
 * @vaddr is evaluated exactly once.
 *
 * The argument is parenthesised before it is cast so that an expression
 * such as "ptr + n" is converted as a whole; without the parentheses the
 * cast would bind to "ptr" alone and the addition would be done on the
 * integer, yielding a different address for non-char pointers.
 */
#define virt_addr_valid(vaddr)	({						\
	unsigned long _addr = (unsigned long)(vaddr);				\
	_addr >= PAGE_OFFSET && pfn_valid(virt_to_pfn(_addr));			\
})
|
|
|
|
/* Default data-mapping flags: VM_DATA_FLAGS_NON_EXEC (defined outside this header). */
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
|
|
|
|
#include <asm-generic/memory_model.h>
|
|
#include <asm-generic/getorder.h>
|
|
|
|
#endif /* _ASM_RISCV_PAGE_H */
|