Commit 9b42fa16, authored by Vishal Moola (Oracle), committed by Andrew Morton

hugetlb: convert hugetlb_fault() to use struct vm_fault

Patch series "Hugetlb fault path to use struct vm_fault", v2.

This patchset converts the hugetlb fault path to use struct vm_fault.
This makes the code more readable and relieves stack pressure by
allowing us to consolidate many fault-related variables behind a single
pointer.


This patch (of 3):

Now that hugetlb_fault() has a vm_fault available for fault tracking, use
it throughout.  This cleans up the code by removing 2 variables, and
prepares hugetlb_fault() to take in a struct vm_fault argument.

Link: https://lkml.kernel.org/r/20240401202651.31440-1-vishal.moola@gmail.com
Link: https://lkml.kernel.org/r/20240401202651.31440-2-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 7edea4c6
......@@ -6427,8 +6427,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t *ptep, entry;
spinlock_t *ptl;
vm_fault_t ret;
u32 hash;
struct folio *folio = NULL;
......@@ -6436,13 +6434,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
struct vm_fault vmf = {
.vma = vma,
.address = haddr,
.address = address & huge_page_mask(h),
.real_address = address,
.flags = flags,
.pgoff = vma_hugecache_offset(h, vma, haddr),
.pgoff = vma_hugecache_offset(h, vma,
address & huge_page_mask(h)),
/* TODO: Track hugetlb faults using vm_fault */
/*
......@@ -6462,22 +6460,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Acquire vma lock before calling huge_pte_alloc and hold
* until finished with ptep. This prevents huge_pmd_unshare from
* being called elsewhere and making the ptep no longer valid.
* until finished with vmf.pte. This prevents huge_pmd_unshare from
* being called elsewhere and making the vmf.pte no longer valid.
*/
hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
if (!ptep) {
vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
if (!vmf.pte) {
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return VM_FAULT_OOM;
}
entry = huge_ptep_get(ptep);
if (huge_pte_none_mostly(entry)) {
if (is_pte_marker(entry)) {
vmf.orig_pte = huge_ptep_get(vmf.pte);
if (huge_pte_none_mostly(vmf.orig_pte)) {
if (is_pte_marker(vmf.orig_pte)) {
pte_marker marker =
pte_marker_get(pte_to_swp_entry(entry));
pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
if (marker & PTE_MARKER_POISONED) {
ret = VM_FAULT_HWPOISON_LARGE;
......@@ -6492,20 +6490,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* mutex internally, which make us return immediately.
*/
return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
ptep, entry, flags, &vmf);
vmf.pte, vmf.orig_pte, flags, &vmf);
}
ret = 0;
/*
* entry could be a migration/hwpoison entry at this point, so this
* check prevents the kernel from going below assuming that we have
* an active hugepage in pagecache. This goto expects the 2nd page
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
* vmf.orig_pte could be a migration/hwpoison entry at this point, so
* this check prevents the kernel from going below assuming that we
* have an active hugepage in pagecache. This goto expects the 2nd page
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
if (!pte_present(entry)) {
if (unlikely(is_hugetlb_entry_migration(entry))) {
if (!pte_present(vmf.orig_pte)) {
if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
......@@ -6514,9 +6512,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* be released there.
*/
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
migration_entry_wait_huge(vma, ptep);
migration_entry_wait_huge(vma, vmf.pte);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
} else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
......@@ -6530,13 +6528,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* determine if a reservation has been consumed.
*/
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
if (vma_needs_reservation(h, vma, haddr) < 0) {
!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
if (vma_needs_reservation(h, vma, vmf.address) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
vma_end_reservation(h, vma, vmf.address);
pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
vmf.pgoff);
......@@ -6544,17 +6542,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pagecache_folio = NULL;
}
ptl = huge_pte_lock(h, mm, ptep);
vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
/* Check for a racing update before calling hugetlb_wp() */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))
goto out_ptl;
/* Handle userfault-wp first, before trying to lock more pages */
if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) &&
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
if (!userfaultfd_wp_async(vma)) {
spin_unlock(ptl);
spin_unlock(vmf.ptl);
if (pagecache_folio) {
folio_unlock(pagecache_folio);
folio_put(pagecache_folio);
......@@ -6564,18 +6562,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return handle_userfault(&vmf, VM_UFFD_WP);
}
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(mm, haddr, ptep, entry,
vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
huge_page_size(hstate_vma(vma)));
/* Fallthrough to CoW */
}
/*
* hugetlb_wp() requires page locks of pte_page(entry) and
* hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
* pagecache_folio, so here we need take the former one
* when folio != pagecache_folio or !pagecache_folio.
*/
folio = page_folio(pte_page(entry));
folio = page_folio(pte_page(vmf.orig_pte));
if (folio != pagecache_folio)
if (!folio_trylock(folio)) {
need_wait_lock = 1;
......@@ -6585,24 +6583,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
folio_get(folio);
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
ret = hugetlb_wp(mm, vma, address, ptep, flags,
pagecache_folio, ptl, &vmf);
if (!huge_pte_write(vmf.orig_pte)) {
ret = hugetlb_wp(mm, vma, address, vmf.pte, flags,
pagecache_folio, vmf.ptl, &vmf);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
entry = huge_pte_mkdirty(entry);
vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
}
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, haddr, ptep);
update_mmu_cache(vma, vmf.address, vmf.pte);
out_put_page:
if (folio != pagecache_folio)
folio_unlock(folio);
folio_put(folio);
out_ptl:
spin_unlock(ptl);
spin_unlock(vmf.ptl);
if (pagecache_folio) {
folio_unlock(pagecache_folio);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment