Commit ebcfc63d authored by Dev Jain's avatar Dev Jain Committed by Andrew Morton
Browse files

mm: abstract THP allocation

Patch series "Do not shatter hugezeropage on wp-fault", v7.

It was observed at [1] and [2] that the current kernel behaviour of
shattering a hugezeropage is inconsistent and suboptimal.  For a VMA with
a THP allowable order, when we write-fault on it, the kernel installs a
PMD-mapped THP.  On the other hand, if we first get a read fault, we get a
PMD pointing to the hugezeropage; subsequent write will trigger a
write-protection fault, shattering the hugezeropage into one writable
page, and all the other PTEs write-protected.  The conclusion being, as
compared to the case of a single write-fault, applications have to suffer
512 extra page faults if they were to use the VMA as such, plus we get the
overhead of khugepaged trying to replace that area with a THP anyway.

Instead, replace the hugezeropage with a THP on wp-fault.

[1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/
[2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/


This patch (of 2):

In preparation for the second patch, abstract away the THP allocation
logic present in the create_huge_pmd() path, which corresponds to the
faulting case when no page is present.

There should be no functional change as a result of applying this patch,
except that, as David notes at [1], a PMD-aligned address should be passed
to update_mmu_cache_pmd().

[1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170-bcaa-2fe66e093f43@redhat.com/

Link: https://lkml.kernel.org/r/20241008061746.285961-1-dev.jain@arm.com
Link: https://lkml.kernel.org/r/20241008061746.285961-2-dev.jain@arm.com


Signed-off-by: default avatarDev Jain <dev.jain@arm.com>
Acked-by: default avatarDavid Hildenbrand <david@redhat.com>
Reviewed-by: default avatarKefeng Wang <wangkefeng.wang@huawei.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <yang@os.amperecomputing.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent 150e0fb8
Loading
Loading
Loading
Loading
+57 −41
Original line number Diff line number Diff line
@@ -1136,47 +1136,81 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
		unsigned long addr)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(page);
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;
	gfp_t gfp = vma_thp_gfp_mask(vma);
	const int order = HPAGE_PMD_ORDER;
	struct folio *folio;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true);

	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		return NULL;
	}

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return NULL;
	}
	folio_throttle_swaprate(folio, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	folio_zero_user(folio, vmf->address);
	folio_zero_user(folio, addr);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
	return folio;
}

static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
		struct vm_area_struct *vma, unsigned long haddr)
{
	pmd_t entry;

	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
	folio_add_lru_vma(folio, vma);
	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
	update_mmu_cache_pmd(vma, haddr, pmd);
	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
	count_vm_event(THP_FAULT_ALLOC);
	count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	pgtable_t pgtable;
	vm_fault_t ret = 0;

	folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
	if (unlikely(!folio))
		return VM_FAULT_FALLBACK;

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;
@@ -1190,21 +1224,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
		mm_inc_nr_ptes(vma->vm_mm);
		deferred_split_folio(folio, false);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
@@ -1271,8 +1295,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret;

@@ -1323,14 +1345,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
		}
		return ret;
	}
	gfp = vma_thp_gfp_mask(vma);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);

	return __do_huge_pmd_anonymous_page(vmf);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,