Commit fcd48540 authored by Peter Xu, committed by Andrew Morton

mm/hugetlb: move swap entry handling into vma lock when faulted

In hugetlb_fault(), there used to be a special path to handle swap entries
at the entrance using huge_pte_offset().  That's unsafe because
huge_pte_offset() for a pmd sharable range can access freed pgtables if
there is no lock to protect the pgtable from being freed after pmd
unshare.

Here the simplest solution to make it safe is to move the swap handling to
be after the vma lock is taken.  We may need to take the fault mutex on
either migration or hwpoison entries now (also the vma lock, but that's
really needed), however neither of them is a hot path.

Note that the vma lock cannot be released in hugetlb_fault() when the
migration entry is detected, because in migration_entry_wait_huge() the
pgtable page will be used again (by taking the pgtable lock), so that also
needs to be protected by the vma lock.  Modify migration_entry_wait_huge()
so that it must be called with vma read lock held, and properly release
the lock in __migration_entry_wait_huge().

Link: https://lkml.kernel.org/r/20221216155100.2043537-5-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent fe7d4c6d
...@@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, ...@@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address); unsigned long address);
#ifdef CONFIG_HUGETLB_PAGE #ifdef CONFIG_HUGETLB_PAGE
extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); extern void __migration_entry_wait_huge(struct vm_area_struct *vma,
pte_t *ptep, spinlock_t *ptl);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
#endif /* CONFIG_HUGETLB_PAGE */ #endif /* CONFIG_HUGETLB_PAGE */
#else /* CONFIG_MIGRATION */ #else /* CONFIG_MIGRATION */
...@@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, ...@@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { } unsigned long address) { }
#ifdef CONFIG_HUGETLB_PAGE #ifdef CONFIG_HUGETLB_PAGE
static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } static inline void __migration_entry_wait_huge(struct vm_area_struct *vma,
pte_t *ptep, spinlock_t *ptl) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
#endif /* CONFIG_HUGETLB_PAGE */ #endif /* CONFIG_HUGETLB_PAGE */
static inline int is_writable_migration_entry(swp_entry_t entry) static inline int is_writable_migration_entry(swp_entry_t entry)
......
...@@ -5993,22 +5993,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -5993,22 +5993,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int need_wait_lock = 0; int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h); unsigned long haddr = address & huge_page_mask(h);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
/*
* Since we hold no locks, ptep could be stale. That is
* OK as we are only making decisions based on content and
* not actually modifying content here.
*/
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
}
/* /*
* Serialize hugepage allocation and instantiation, so that we don't * Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate * get spurious allocation failures if two CPUs race to instantiate
...@@ -6023,10 +6007,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -6023,10 +6007,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Acquire vma lock before calling huge_pte_alloc and hold * Acquire vma lock before calling huge_pte_alloc and hold
* until finished with ptep. This prevents huge_pmd_unshare from * until finished with ptep. This prevents huge_pmd_unshare from
* being called elsewhere and making the ptep no longer valid. * being called elsewhere and making the ptep no longer valid.
*
* ptep could have already be assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless
* something has changed.
*/ */
hugetlb_vma_lock_read(vma); hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
...@@ -6055,8 +6035,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -6055,8 +6035,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it. * properly handle it.
*/ */
if (!pte_present(entry)) if (!pte_present(entry)) {
if (unlikely(is_hugetlb_entry_migration(entry))) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
* huge_pte_lockptr() later in
* migration_entry_wait_huge(). The vma lock will
* be released there.
*/
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
migration_entry_wait_huge(vma, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex; goto out_mutex;
}
/* /*
* If we are going to COW/unshare the mapping later, we examine the * If we are going to COW/unshare the mapping later, we examine the
......
...@@ -329,24 +329,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, ...@@ -329,24 +329,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
} }
#ifdef CONFIG_HUGETLB_PAGE #ifdef CONFIG_HUGETLB_PAGE
void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) /*
* The vma read lock must be held upon entry. Holding that lock prevents either
* the pte or the ptl from being freed.
*
* This function will release the vma lock before returning.
*/
void __migration_entry_wait_huge(struct vm_area_struct *vma,
pte_t *ptep, spinlock_t *ptl)
{ {
pte_t pte; pte_t pte;
hugetlb_vma_assert_locked(vma);
spin_lock(ptl); spin_lock(ptl);
pte = huge_ptep_get(ptep); pte = huge_ptep_get(ptep);
if (unlikely(!is_hugetlb_entry_migration(pte))) if (unlikely(!is_hugetlb_entry_migration(pte))) {
spin_unlock(ptl); spin_unlock(ptl);
else hugetlb_vma_unlock_read(vma);
} else {
/*
* If migration entry existed, safe to release vma lock
* here because the pgtable page won't be freed without the
* pgtable lock released. See comment right above pgtable
* lock release in migration_entry_wait_on_locked().
*/
hugetlb_vma_unlock_read(vma);
migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
}
} }
void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{ {
spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
__migration_entry_wait_huge(pte, ptl); __migration_entry_wait_huge(vma, pte, ptl);
} }
#endif #endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment