Commit 5ba2f0a1 authored by Dave Hansen, committed by Andrew Morton
Browse files

mm: introduce deferred freeing for kernel page tables

This introduces a conditional asynchronous mechanism, enabled by
CONFIG_ASYNC_KERNEL_PGTABLE_FREE.  When enabled, this mechanism defers the
freeing of pages that are used as page tables for kernel address mappings.
These pages are now queued to a work struct instead of being freed
immediately.

This deferred freeing allows for batch-freeing of page tables, providing a
safe context for performing a single expensive operation (TLB flush) for a
batch of kernel page tables instead of performing that expensive operation
for each page table.

Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com


Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent bf9e4e30
Loading
Loading
Loading
Loading
+13 −3
Original line number Diff line number Diff line
@@ -3053,6 +3053,14 @@ static inline void __pagetable_free(struct ptdesc *pt)
	__free_pages(page, compound_order(page));
}

/*
 * pagetable_free_kernel() releases a page table that was used for kernel
 * address mappings.
 *
 * Without CONFIG_ASYNC_KERNEL_PGTABLE_FREE it is a thin inline wrapper that
 * frees the page table right away via __pagetable_free(). With the option
 * set, only the prototype is provided here and the implementation lives
 * out of line.
 */
#ifndef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
static inline void pagetable_free_kernel(struct ptdesc *pt)
{
	__pagetable_free(pt);
}
#else /* CONFIG_ASYNC_KERNEL_PGTABLE_FREE */
void pagetable_free_kernel(struct ptdesc *pt);
#endif /* !CONFIG_ASYNC_KERNEL_PGTABLE_FREE */
/**
 * pagetable_free - Free pagetables
 * @pt:	The page table descriptor
@@ -3062,11 +3070,13 @@ static inline void __pagetable_free(struct ptdesc *pt)
 */
static inline void pagetable_free(struct ptdesc *pt)
{
	/*
	 * Page tables marked as kernel page tables have the marker cleared
	 * and are handed to pagetable_free_kernel(); every other page table
	 * is freed directly.
	 *
	 * The condition must appear exactly once: a second, brace-less copy
	 * of the test in front of this one would capture the else branch and
	 * non-kernel page tables would never be freed.
	 */
	if (ptdesc_test_kernel(pt)) {
		ptdesc_clear_kernel(pt);
		pagetable_free_kernel(pt);
	} else {
		__pagetable_free(pt);
	}
}

#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
#if ALLOC_SPLIT_PTLOCKS
+3 −0
Original line number Diff line number Diff line
@@ -906,6 +906,9 @@ config HAVE_GIGANTIC_FOLIOS
	def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
		 (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)

# Defer the freeing of pages used as kernel page tables: they are queued to
# a work item instead of being freed immediately. Off by default.
config ASYNC_KERNEL_PGTABLE_FREE
	def_bool n

# TODO: Allow to be enabled without THP
config ARCH_SUPPORTS_HUGE_PFNMAP
	def_bool n
+37 −0
Original line number Diff line number Diff line
@@ -406,3 +406,40 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
	pte_unmap_unlock(pte, ptl);
	goto again;
}

#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
/* Work handler that frees the queued page tables; defined below. */
static void kernel_pgtable_work_func(struct work_struct *work);

/*
 * Kernel page tables waiting to be freed.
 *
 * pagetable_free_kernel() adds entries to @list and schedules @work;
 * kernel_pgtable_work_func() then drains the list and frees each entry.
 */
static struct {
	/* page tables queued for freeing, linked through their pt_list member */
	struct list_head list;
	/* protects the list above */
	spinlock_t lock;
	/* work item whose handler is kernel_pgtable_work_func() */
	struct work_struct work;
} kernel_pgtable_work = {
	.list = LIST_HEAD_INIT(kernel_pgtable_work.list),
	.lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
	.work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
};

static void kernel_pgtable_work_func(struct work_struct *work)
{
	struct ptdesc *pt, *next;
	LIST_HEAD(page_list);

	spin_lock(&kernel_pgtable_work.lock);
	list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
	spin_unlock(&kernel_pgtable_work.lock);

	list_for_each_entry_safe(pt, next, &page_list, pt_list)
		__pagetable_free(pt);
}

/**
 * pagetable_free_kernel - Queue a kernel page table for deferred freeing
 * @pt:	The page table descriptor
 *
 * Adds @pt to the pending list under the list lock and schedules the work
 * item whose handler frees the queued page tables.
 */
void pagetable_free_kernel(struct ptdesc *pt)
{
	struct list_head *entry = &pt->pt_list;

	spin_lock(&kernel_pgtable_work.lock);
	list_add(entry, &kernel_pgtable_work.list);
	spin_unlock(&kernel_pgtable_work.lock);

	/* Schedule the work item that drains the pending list. */
	schedule_work(&kernel_pgtable_work.work);
}
#endif