Commit 9b42fa16 authored by Vishal Moola (Oracle), committed by Andrew Morton

hugetlb: convert hugetlb_fault() to use struct vm_fault

Patch series "Hugetlb fault path to use struct vm_fault", v2.

This patchset converts the hugetlb fault path to use struct vm_fault.
This makes the code more readable and reduces stack usage by
consolidating many fault-related variables behind a single pointer.


This patch (of 3):

Now that hugetlb_fault() has a struct vm_fault available for fault tracking,
use it throughout.  This cleans up the code by removing two local variables,
and prepares hugetlb_fault() to take a struct vm_fault argument.

Link: https://lkml.kernel.org/r/20240401202651.31440-1-vishal.moola@gmail.com
Link: https://lkml.kernel.org/r/20240401202651.31440-2-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 7edea4c6
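
For reference, the conversion relies on members that already exist in struct vm_fault. The sketch below is an abridged, illustrative excerpt only (the real definition in include/linux/mm.h carries additional fields and const qualifiers); the comments note which hugetlb_fault() local each field replaces in the diff that follows:

        /* Abridged sketch of struct vm_fault -- illustration only.
         * The real definition in include/linux/mm.h has more members
         * (pmd, pud, cow_page, prealloc_pte, ...) and const qualifiers.
         */
        struct vm_fault {
                struct vm_area_struct *vma;     /* target VMA */
                pgoff_t pgoff;                  /* offset in the hugetlb page cache */
                unsigned long address;          /* huge-page-aligned address; replaces 'haddr' */
                unsigned long real_address;     /* exact faulting address */
                enum fault_flag flags;          /* FAULT_FLAG_* from the caller */
                pte_t orig_pte;                 /* sampled huge PTE value; replaces 'entry' */
                pte_t *pte;                     /* huge PTE pointer; replaces 'ptep' */
                spinlock_t *ptl;                /* lock from huge_pte_lock(); replaces 'ptl' */
        };

Keeping this state in one on-stack vmf is what lets later patches in the series pass a single pointer to the hugetlb fault helpers instead of a growing argument list.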
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6427,8 +6427,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags)
 {
-       pte_t *ptep, entry;
-       spinlock_t *ptl;
        vm_fault_t ret;
        u32 hash;
        struct folio *folio = NULL;
@@ -6436,13 +6434,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping;
        int need_wait_lock = 0;
-       unsigned long haddr = address & huge_page_mask(h);
        struct vm_fault vmf = {
                .vma = vma,
-               .address = haddr,
+               .address = address & huge_page_mask(h),
                .real_address = address,
                .flags = flags,
-               .pgoff = vma_hugecache_offset(h, vma, haddr),
+               .pgoff = vma_hugecache_offset(h, vma,
+                               address & huge_page_mask(h)),
                /* TODO: Track hugetlb faults using vm_fault */

                /*
@@ -6462,22 +6460,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        /*
         * Acquire vma lock before calling huge_pte_alloc and hold
-        * until finished with ptep. This prevents huge_pmd_unshare from
-        * being called elsewhere and making the ptep no longer valid.
+        * until finished with vmf.pte. This prevents huge_pmd_unshare from
+        * being called elsewhere and making the vmf.pte no longer valid.
         */
        hugetlb_vma_lock_read(vma);
-       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
-       if (!ptep) {
+       vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
+       if (!vmf.pte) {
                hugetlb_vma_unlock_read(vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                return VM_FAULT_OOM;
        }

-       entry = huge_ptep_get(ptep);
-       if (huge_pte_none_mostly(entry)) {
-               if (is_pte_marker(entry)) {
+       vmf.orig_pte = huge_ptep_get(vmf.pte);
+       if (huge_pte_none_mostly(vmf.orig_pte)) {
+               if (is_pte_marker(vmf.orig_pte)) {
                        pte_marker marker =
-                               pte_marker_get(pte_to_swp_entry(entry));
+                               pte_marker_get(pte_to_swp_entry(vmf.orig_pte));

                        if (marker & PTE_MARKER_POISONED) {
                                ret = VM_FAULT_HWPOISON_LARGE;
@@ -6492,20 +6490,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 * mutex internally, which make us return immediately.
                 */
                return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
-                                      ptep, entry, flags, &vmf);
+                                      vmf.pte, vmf.orig_pte, flags, &vmf);
        }

        ret = 0;

        /*
-        * entry could be a migration/hwpoison entry at this point, so this
-        * check prevents the kernel from going below assuming that we have
-        * an active hugepage in pagecache. This goto expects the 2nd page
-        * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
-        * properly handle it.
+        * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
+        * point, so this check prevents the kernel from going below assuming
+        * that we have an active hugepage in pagecache. This goto expects
+        * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
+        * check will properly handle it.
         */
-       if (!pte_present(entry)) {
-               if (unlikely(is_hugetlb_entry_migration(entry))) {
+       if (!pte_present(vmf.orig_pte)) {
+               if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
                        /*
                         * Release the hugetlb fault lock now, but retain
                         * the vma lock, because it is needed to guard the
@@ -6514,9 +6512,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         * be released there.
                         */
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       migration_entry_wait_huge(vma, ptep);
+                       migration_entry_wait_huge(vma, vmf.pte);
                        return 0;
-               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+               } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
                        ret = VM_FAULT_HWPOISON_LARGE |
                                VM_FAULT_SET_HINDEX(hstate_index(h));
                goto out_mutex;
@@ -6530,13 +6528,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * determine if a reservation has been consumed.
         */
        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
-           !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
-               if (vma_needs_reservation(h, vma, haddr) < 0) {
+           !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
+               if (vma_needs_reservation(h, vma, vmf.address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
                }
                /* Just decrements count, does not deallocate */
-               vma_end_reservation(h, vma, haddr);
+               vma_end_reservation(h, vma, vmf.address);

                pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
                                                             vmf.pgoff);
@@ -6544,17 +6542,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                pagecache_folio = NULL;
        }

-       ptl = huge_pte_lock(h, mm, ptep);
+       vmf.ptl = huge_pte_lock(h, mm, vmf.pte);

        /* Check for a racing update before calling hugetlb_wp() */
-       if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+       if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))
                goto out_ptl;

        /* Handle userfault-wp first, before trying to lock more pages */
-       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
-           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) &&
+           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
                if (!userfaultfd_wp_async(vma)) {
-                       spin_unlock(ptl);
+                       spin_unlock(vmf.ptl);
                        if (pagecache_folio) {
                                folio_unlock(pagecache_folio);
                                folio_put(pagecache_folio);
@@ -6564,18 +6562,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        return handle_userfault(&vmf, VM_UFFD_WP);
                }

-               entry = huge_pte_clear_uffd_wp(entry);
-               set_huge_pte_at(mm, haddr, ptep, entry,
+               vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
+               set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
                                huge_page_size(hstate_vma(vma)));
                /* Fallthrough to CoW */
        }

        /*
-        * hugetlb_wp() requires page locks of pte_page(entry) and
+        * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
         * pagecache_folio, so here we need take the former one
         * when folio != pagecache_folio or !pagecache_folio.
         */
-       folio = page_folio(pte_page(entry));
+       folio = page_folio(pte_page(vmf.orig_pte));
        if (folio != pagecache_folio)
                if (!folio_trylock(folio)) {
                        need_wait_lock = 1;
@@ -6585,24 +6583,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        folio_get(folio);

        if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
-               if (!huge_pte_write(entry)) {
-                       ret = hugetlb_wp(mm, vma, address, ptep, flags,
-                                        pagecache_folio, ptl, &vmf);
+               if (!huge_pte_write(vmf.orig_pte)) {
+                       ret = hugetlb_wp(mm, vma, address, vmf.pte, flags,
+                                        pagecache_folio, vmf.ptl, &vmf);
                        goto out_put_page;
                } else if (likely(flags & FAULT_FLAG_WRITE)) {
-                       entry = huge_pte_mkdirty(entry);
+                       vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
                }
        }
-       entry = pte_mkyoung(entry);
-       if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+       vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
+       if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
                                                flags & FAULT_FLAG_WRITE))
-               update_mmu_cache(vma, haddr, ptep);
+               update_mmu_cache(vma, vmf.address, vmf.pte);
 out_put_page:
        if (folio != pagecache_folio)
                folio_unlock(folio);
        folio_put(folio);
 out_ptl:
-       spin_unlock(ptl);
+       spin_unlock(vmf.ptl);

        if (pagecache_folio) {
                folio_unlock(pagecache_folio);