Commit 9b42fa16 authored by Vishal Moola (Oracle)'s avatar Vishal Moola (Oracle) Committed by Andrew Morton
Browse files

hugetlb: convert hugetlb_fault() to use struct vm_fault

Patch series "Hugetlb fault path to use struct vm_fault", v2.

This patchset converts the hugetlb fault path to use struct vm_fault. 
This helps make the code more readable, and alleviates the stack by
allowing us to consolidate many fault-related variables into an individual
pointer.


This patch (of 3):

Now that hugetlb_fault() has a vm_fault available for fault tracking, use
it throughout.  This cleans up the code by removing 2 variables, and
prepares hugetlb_fault() to take in a struct vm_fault argument.

Link: https://lkml.kernel.org/r/20240401202651.31440-1-vishal.moola@gmail.com
Link: https://lkml.kernel.org/r/20240401202651.31440-2-vishal.moola@gmail.com


Signed-off-by: default avatarVishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: default avatarOscar Salvador <osalvador@suse.de>
Reviewed-by: default avatarMuchun Song <muchun.song@linux.dev>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent 7edea4c6
Loading
Loading
Loading
Loading
+41 −43
Original line number Diff line number Diff line
@@ -6427,8 +6427,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, unsigned int flags)
{
	pte_t *ptep, entry;
	spinlock_t *ptl;
	vm_fault_t ret;
	u32 hash;
	struct folio *folio = NULL;
@@ -6436,13 +6434,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
	struct hstate *h = hstate_vma(vma);
	struct address_space *mapping;
	int need_wait_lock = 0;
	unsigned long haddr = address & huge_page_mask(h);
	struct vm_fault vmf = {
		.vma = vma,
		.address = haddr,
		.address = address & huge_page_mask(h),
		.real_address = address,
		.flags = flags,
		.pgoff = vma_hugecache_offset(h, vma, haddr),
		.pgoff = vma_hugecache_offset(h, vma,
				address & huge_page_mask(h)),
		/* TODO: Track hugetlb faults using vm_fault */

		/*
@@ -6462,22 +6460,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,

	/*
	 * Acquire vma lock before calling huge_pte_alloc and hold
	 * until finished with ptep.  This prevents huge_pmd_unshare from
	 * being called elsewhere and making the ptep no longer valid.
	 * until finished with vmf.pte.  This prevents huge_pmd_unshare from
	 * being called elsewhere and making the vmf.pte no longer valid.
	 */
	hugetlb_vma_lock_read(vma);
	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
	if (!ptep) {
	vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
	if (!vmf.pte) {
		hugetlb_vma_unlock_read(vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		return VM_FAULT_OOM;
	}

	entry = huge_ptep_get(ptep);
	if (huge_pte_none_mostly(entry)) {
		if (is_pte_marker(entry)) {
	vmf.orig_pte = huge_ptep_get(vmf.pte);
	if (huge_pte_none_mostly(vmf.orig_pte)) {
		if (is_pte_marker(vmf.orig_pte)) {
			pte_marker marker =
				pte_marker_get(pte_to_swp_entry(entry));
				pte_marker_get(pte_to_swp_entry(vmf.orig_pte));

			if (marker & PTE_MARKER_POISONED) {
				ret = VM_FAULT_HWPOISON_LARGE;
@@ -6492,20 +6490,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		 * mutex internally, which make us return immediately.
		 */
		return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
					ptep, entry, flags, &vmf);
					vmf.pte, vmf.orig_pte, flags, &vmf);
	}

	ret = 0;

	/*
	 * entry could be a migration/hwpoison entry at this point, so this
	 * check prevents the kernel from going below assuming that we have
	 * an active hugepage in pagecache. This goto expects the 2nd page
	 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
	 * properly handle it.
	 * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
	 * point, so this check prevents the kernel from going below assuming
	 * that we have an active hugepage in pagecache. This goto expects
	 * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
	 * check will properly handle it.
	 */
	if (!pte_present(entry)) {
		if (unlikely(is_hugetlb_entry_migration(entry))) {
	if (!pte_present(vmf.orig_pte)) {
		if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
			/*
			 * Release the hugetlb fault lock now, but retain
			 * the vma lock, because it is needed to guard the
@@ -6514,9 +6512,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			 * be released there.
			 */
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			migration_entry_wait_huge(vma, ptep);
			migration_entry_wait_huge(vma, vmf.pte);
			return 0;
		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
		} else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
			ret = VM_FAULT_HWPOISON_LARGE |
			    VM_FAULT_SET_HINDEX(hstate_index(h));
		goto out_mutex;
@@ -6530,13 +6528,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
	 * determine if a reservation has been consumed.
	 */
	if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
	    !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
		if (vma_needs_reservation(h, vma, haddr) < 0) {
	    !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
		if (vma_needs_reservation(h, vma, vmf.address) < 0) {
			ret = VM_FAULT_OOM;
			goto out_mutex;
		}
		/* Just decrements count, does not deallocate */
		vma_end_reservation(h, vma, haddr);
		vma_end_reservation(h, vma, vmf.address);

		pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
							     vmf.pgoff);
@@ -6544,17 +6542,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			pagecache_folio = NULL;
	}

	ptl = huge_pte_lock(h, mm, ptep);
	vmf.ptl = huge_pte_lock(h, mm, vmf.pte);

	/* Check for a racing update before calling hugetlb_wp() */
	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
	if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))
		goto out_ptl;

	/* Handle userfault-wp first, before trying to lock more pages */
	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) &&
	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
		if (!userfaultfd_wp_async(vma)) {
			spin_unlock(ptl);
			spin_unlock(vmf.ptl);
			if (pagecache_folio) {
				folio_unlock(pagecache_folio);
				folio_put(pagecache_folio);
@@ -6564,18 +6562,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			return handle_userfault(&vmf, VM_UFFD_WP);
		}

		entry = huge_pte_clear_uffd_wp(entry);
		set_huge_pte_at(mm, haddr, ptep, entry,
		vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
		set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
				huge_page_size(hstate_vma(vma)));
		/* Fallthrough to CoW */
	}

	/*
	 * hugetlb_wp() requires page locks of pte_page(entry) and
	 * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
	 * pagecache_folio, so here we need take the former one
	 * when folio != pagecache_folio or !pagecache_folio.
	 */
	folio = page_folio(pte_page(entry));
	folio = page_folio(pte_page(vmf.orig_pte));
	if (folio != pagecache_folio)
		if (!folio_trylock(folio)) {
			need_wait_lock = 1;
@@ -6585,24 +6583,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
	folio_get(folio);

	if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
		if (!huge_pte_write(entry)) {
			ret = hugetlb_wp(mm, vma, address, ptep, flags,
					 pagecache_folio, ptl, &vmf);
		if (!huge_pte_write(vmf.orig_pte)) {
			ret = hugetlb_wp(mm, vma, address, vmf.pte, flags,
					 pagecache_folio, vmf.ptl, &vmf);
			goto out_put_page;
		} else if (likely(flags & FAULT_FLAG_WRITE)) {
			entry = huge_pte_mkdirty(entry);
			vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
		}
	}
	entry = pte_mkyoung(entry);
	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
	vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
	if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
						flags & FAULT_FLAG_WRITE))
		update_mmu_cache(vma, haddr, ptep);
		update_mmu_cache(vma, vmf.address, vmf.pte);
out_put_page:
	if (folio != pagecache_folio)
		folio_unlock(folio);
	folio_put(folio);
out_ptl:
	spin_unlock(ptl);
	spin_unlock(vmf.ptl);

	if (pagecache_folio) {
		folio_unlock(pagecache_folio);