Commit 3db82b93 authored by Hugh Dickins, committed by Andrew Morton

mm/memory: allow pte_offset_map[_lock]() to fail

copy_pte_range(): use pte_offset_map_nolock(), and allow for it to fail;
but with a comment on some further assumptions that are being made there.
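
For illustration only, a compilable userspace sketch of the error-handling shape this gives copy_pte_range() (struct pte_table and the mock_* helpers are invented stand-ins for pte_alloc_map_lock() and pte_offset_map_nolock(), not kernel APIs): failing to allocate and lock the destination table is -ENOMEM, while failing to map the source table just means it was cleared under us, so the destination lock is dropped and the range is treated as already copied:

#include <stdio.h>

struct pte_table { int locked; };

static struct pte_table dst_table, src_table;
static int src_pmd_cleared = 1;	/* pretend the source pmd vanished */

/* Stand-in for pte_alloc_map_lock(): may return NULL (-ENOMEM). */
static struct pte_table *mock_alloc_map_lock_dst(void)
{
	dst_table.locked = 1;
	return &dst_table;
}

/* Stand-in for pte_offset_map_nolock(): NULL if the pmd was cleared. */
static struct pte_table *mock_map_src_nolock(void)
{
	return src_pmd_cleared ? NULL : &src_table;
}

static void mock_unlock_unmap(struct pte_table *t)
{
	t->locked = 0;
}

static int copy_one_pte_range(void)
{
	struct pte_table *dst, *src;

	dst = mock_alloc_map_lock_dst();
	if (!dst)
		return -1;		/* -ENOMEM in the kernel */
	src = mock_map_src_nolock();
	if (!src) {
		mock_unlock_unmap(dst);
		return 0;		/* nothing left to copy: ret == 0 */
	}
	/* ... take the src lock nested, copy the ptes, drop both ... */
	mock_unlock_unmap(dst);
	return 0;
}

int main(void)
{
	printf("copy_one_pte_range() -> %d\n", copy_one_pte_range());
	return 0;
}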

zap_pte_range() and zap_pmd_range(): adjust their interaction so that a
pte_offset_map_lock() failure in zap_pte_range() leads to a retry in
zap_pmd_range(); remove call to pmd_none_or_trans_huge_or_clear_bad().
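
For illustration only, a small userspace model of that retry (zap_one_pmd(), the toy PMD_SIZE and the counters are invented, not kernel APIs): the pte-level helper returns the address it actually reached, and whenever that falls short of the pmd boundary the caller steps back and revisits the same pmd:

#include <stdio.h>

#define PMD_SIZE	8UL	/* toy granularity: 8 "pages" per pmd */

/* Mock zap_pte_range(): stop early once, on the third pmd. */
static unsigned long zap_one_pmd(unsigned long addr, unsigned long next)
{
	static int stopped_early;

	if (!stopped_early && addr == 2 * PMD_SIZE) {
		stopped_early = 1;
		return addr + 3;	/* did not finish: caller must retry */
	}
	return next;			/* zapped everything up to next */
}

int main(void)
{
	unsigned long addr = 0, end = 4 * PMD_SIZE, next;
	int pmd = 0, passes = 0;

	do {
		next = (addr + PMD_SIZE) & ~(PMD_SIZE - 1);
		if (next > end)
			next = end;
		passes++;
		addr = zap_one_pmd(addr, next);
		if (addr != next)
			pmd--;		/* come back to this pmd */
	} while (pmd++, addr != end);	/* kernel also cond_resched()s here */

	printf("covered %d pmds (%lu addresses) in %d pte-level passes\n",
	       pmd, end, passes);
	return 0;
}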

Allow pte_offset_map_lock() to fail in many functions.  Update comment on
calling pte_alloc() in do_anonymous_page().  Remove redundant calls to
pmd_trans_unstable(), pmd_devmap_trans_unstable(), pmd_none() and
pmd_bad(); but leave pmd_none_or_clear_bad() calls in free_pmd_range() and
copy_pmd_range(), those do simplify the next level down.
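
The recurring pattern in those functions is to treat a NULL return from pte_offset_map_lock() just like a pte that changed under us: back out and let the fault be retried. A minimal userspace sketch of that pattern (map_and_lock_pte() and friends are invented mocks, not kernel APIs):

#include <stdio.h>

struct pte { unsigned long val; };

/* Mock pte_offset_map_lock(): fail the first attempt, succeed after. */
static struct pte *map_and_lock_pte(void)
{
	static struct pte pte = { .val = 42 };
	static int calls;

	return calls++ ? &pte : NULL;
}

/* Mock pte_unmap_unlock(): nothing to undo in this sketch. */
static void unmap_and_unlock_pte(struct pte *pte)
{
	(void)pte;
}

static int handle_fault(unsigned long orig_val)
{
	struct pte *pte = map_and_lock_pte();

	if (!pte)
		return 0;	/* like !vmf->pte: back out, fault is retried */
	if (pte->val != orig_val) {
		unmap_and_unlock_pte(pte);
		return 0;	/* raced with another fault: also retried */
	}
	/* ... install or update the pte here ... */
	unmap_and_unlock_pte(pte);
	return 1;		/* handled */
}

int main(void)
{
	int retries = 0;

	while (!handle_fault(42))
		retries++;
	printf("fault handled after %d retries\n", retries);
	return 0;
}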

Link: https://lkml.kernel.org/r/bb548d50-e99a-f29e-eab1-a43bef2a1287@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zack Rusin <zackr@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 895f5ee4
mm/memory.c

@@ -1012,13 +1012,25 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	progress = 0;
 	init_rss_vec(rss);
 
+	/*
+	 * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
+	 * error handling here, assume that exclusive mmap_lock on dst and src
+	 * protects anon from unexpected THP transitions; with shmem and file
+	 * protected by mmap_lock-less collapse skipping areas with anon_vma
+	 * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
+	 * can remove such assumptions later, but this is good enough for now.
+	 */
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte) {
 		ret = -ENOMEM;
 		goto out;
 	}
-	src_pte = pte_offset_map(src_pmd, addr);
-	src_ptl = pte_lockptr(src_mm, src_pmd);
+	src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+	if (!src_pte) {
+		pte_unmap_unlock(dst_pte, dst_ptl);
+		/* ret == 0 */
+		goto out;
+	}
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 	orig_src_pte = src_pte;
 	orig_dst_pte = dst_pte;
@@ -1083,8 +1095,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
-	spin_unlock(src_ptl);
-	pte_unmap(orig_src_pte);
+	pte_unmap_unlock(orig_src_pte, src_ptl);
 	add_mm_rss_vec(dst_mm, rss);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
@@ -1388,10 +1399,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	swp_entry_t entry;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
-again:
 	init_rss_vec(rss);
-	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-	pte = start_pte;
+	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!pte)
+		return addr;
+
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	do {
@@ -1507,17 +1519,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	 * If we forced a TLB flush (either due to running out of
 	 * batch buffers or because we needed to flush dirty TLB
 	 * entries before releasing the ptl), free the batched
-	 * memory too. Restart if we didn't do everything.
+	 * memory too. Come back again if we didn't do everything.
 	 */
-	if (force_flush) {
-		force_flush = 0;
+	if (force_flush)
 		tlb_flush_mmu(tlb);
-	}
-
-	if (addr != end) {
-		cond_resched();
-		goto again;
-	}
 
 	return addr;
 }
@@ -1536,8 +1541,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
 		if (next - addr != HPAGE_PMD_SIZE)
 			__split_huge_pmd(vma, pmd, addr, false, NULL);
-		else if (zap_huge_pmd(tlb, vma, pmd, addr))
-			goto next;
+		else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
+			addr = next;
+			continue;
+		}
 		/* fall through */
 	} else if (details && details->single_folio &&
 		   folio_test_pmd_mappable(details->single_folio) &&
@@ -1550,20 +1557,14 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 			 */
 			spin_unlock(ptl);
 		}
-
-		/*
-		 * Here there can be other concurrent MADV_DONTNEED or
-		 * trans huge page faults running, and if the pmd is
-		 * none or trans huge it can change under us. This is
-		 * because MADV_DONTNEED holds the mmap_lock in read
-		 * mode.
-		 */
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-			goto next;
-		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
-next:
-		cond_resched();
-	} while (pmd++, addr = next, addr != end);
+		if (pmd_none(*pmd)) {
+			addr = next;
+			continue;
+		}
+		addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
+		if (addr != next)
+			pmd--;
+	} while (pmd++, cond_resched(), addr != end);
 
 	return addr;
 }
@@ -1905,6 +1906,10 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
 		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
 
 		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+		if (!start_pte) {
+			ret = -EFAULT;
+			goto out;
+		}
 		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
 			int err = insert_page_in_batch_locked(vma, pte,
				addr, pages[curr_page_idx], prot);
@@ -2572,10 +2577,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		mapped_pte = pte = (mm == &init_mm) ?
 			pte_offset_kernel(pmd, addr) :
 			pte_offset_map_lock(mm, pmd, addr, &ptl);
+		if (!pte)
+			return -EINVAL;
 	}
 
-	BUG_ON(pmd_huge(*pmd));
-
 	arch_enter_lazy_mmu_mode();
 
 	if (fn) {
@@ -2804,7 +2809,6 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 	int ret;
 	void *kaddr;
 	void __user *uaddr;
-	bool locked = false;
 	struct vm_area_struct *vma = vmf->vma;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr = vmf->address;
@@ -2830,12 +2834,12 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 	 * On architectures with software "accessed" bits, we would
 	 * take a double page fault, so mark it accessed here.
 	 */
+	vmf->pte = NULL;
 	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
 		pte_t entry;
 
 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-		locked = true;
-		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+		if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) {
			/*
			 * Other thread has already handled the fault
			 * and update local tlb only
@@ -2857,13 +2861,12 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 	 * zeroes.
 	 */
 	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
-		if (locked)
+		if (vmf->pte)
 			goto warn;
 
 		/* Re-validate under PTL if the page is still mapped */
 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-		locked = true;
-		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+		if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) {
 			/* The PTE changed under us, update local tlb */
 			update_mmu_tlb(vma, addr, vmf->pte);
 			ret = -EAGAIN;
@@ -2888,7 +2891,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 	ret = 0;
 
 pte_unlock:
-	if (locked)
+	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 	kunmap_atomic(kaddr);
 	flush_dcache_page(dst);
@@ -3110,7 +3113,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	 * Re-check the pte - we dropped the lock
 	 */
 	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
-	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+	if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) {
 		if (old_folio) {
 			if (!folio_test_anon(old_folio)) {
 				dec_mm_counter(mm, mm_counter_file(&old_folio->page));
@@ -3178,19 +3181,20 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		/* Free the old page.. */
 		new_folio = old_folio;
 		page_copied = 1;
-	} else {
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+	} else if (vmf->pte) {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
 	}
-
-	if (new_folio)
-		folio_put(new_folio);
-
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	/*
 	 * No need to double call mmu_notifier->invalidate_range() callback as
 	 * the above ptep_clear_flush_notify() did already call it.
 	 */
 	mmu_notifier_invalidate_range_only_end(&range);
+
+	if (new_folio)
+		folio_put(new_folio);
+
 	if (old_folio) {
 		if (page_copied)
 			free_swap_cache(&old_folio->page);
@@ -3230,6 +3234,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
 	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
 	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
 				       &vmf->ptl);
+	if (!vmf->pte)
+		return VM_FAULT_NOPAGE;
 	/*
 	 * We might have raced with another page fault while we released the
 	 * pte_offset_map_lock.
@@ -3591,9 +3597,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 				&vmf->ptl);
-	if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+	if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte)))
 		restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
 
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	if (vmf->pte)
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
 	folio_unlock(folio);
 	folio_put(folio);
@@ -3625,6 +3632,8 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
 {
 	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
 				       vmf->address, &vmf->ptl);
+	if (!vmf->pte)
+		return 0;
 	/*
 	 * Be careful so that we will only recover a special uffd-wp pte into a
 	 * none pte. Otherwise it means the pte could have changed, so retry.
@@ -3728,7 +3737,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			vmf->page = pfn_swap_entry_to_page(entry);
 			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 					vmf->address, &vmf->ptl);
-			if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+			if (unlikely(!vmf->pte ||
+				     !pte_same(*vmf->pte, vmf->orig_pte)))
 				goto unlock;
 
 			/*
@@ -3805,7 +3815,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		 */
 		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 				vmf->address, &vmf->ptl);
-		if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+		if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte)))
 			ret = VM_FAULT_OOM;
 		goto unlock;
 	}
@@ -3875,7 +3885,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 */
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 			&vmf->ptl);
-	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+	if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte)))
 		goto out_nomap;
 
 	if (unlikely(!folio_test_uptodate(folio))) {
@@ -4001,12 +4011,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	if (vmf->pte)
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
 	if (si)
 		put_swap_device(si);
 	return ret;
 out_nomap:
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	if (vmf->pte)
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
 out_page:
 	folio_unlock(folio);
@@ -4039,22 +4051,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
-	 * pte_offset_map() on pmds where a huge pmd might be created
-	 * from a different thread.
-	 *
-	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
-	 * parallel threads are excluded by other means.
-	 *
-	 * Here we only have mmap_read_lock(mm).
+	 * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
+	 * be distinguished from a transient failure of pte_offset_map().
 	 */
 	if (pte_alloc(vma->vm_mm, vmf->pmd))
 		return VM_FAULT_OOM;
 
-	/* See comment in handle_pte_fault() */
-	if (unlikely(pmd_trans_unstable(vmf->pmd)))
-		return 0;
-
 	/* Use the zero-page for reads */
 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm)) {
@@ -4062,6 +4064,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 						vma->vm_page_prot));
 		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 				vmf->address, &vmf->ptl);
+		if (!vmf->pte)
+			goto unlock;
 		if (vmf_pte_changed(vmf)) {
 			update_mmu_tlb(vma, vmf->address, vmf->pte);
 			goto unlock;
@@ -4102,6 +4106,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 			&vmf->ptl);
+	if (!vmf->pte)
+		goto release;
 	if (vmf_pte_changed(vmf)) {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);
 		goto release;
@@ -4129,6 +4135,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	if (vmf->pte)
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
 release:
@@ -4378,15 +4385,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 			return VM_FAULT_OOM;
 	}
 
-	/*
-	 * See comment in handle_pte_fault() for how this scenario happens, we
-	 * need to return NOPAGE so that we drop this page.
-	 */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
-		return VM_FAULT_NOPAGE;
-
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 				      vmf->address, &vmf->ptl);
+	if (!vmf->pte)
+		return VM_FAULT_NOPAGE;
 
 	/* Re-check under ptl */
 	if (likely(!vmf_pte_changed(vmf))) {
@@ -4628,17 +4630,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
 	 */
 	if (!vma->vm_ops->fault) {
-		/*
-		 * If we find a migration pmd entry or a none pmd entry, which
-		 * should never happen, return SIGBUS
-		 */
-		if (unlikely(!pmd_present(*vmf->pmd)))
+		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+					       vmf->address, &vmf->ptl);
+		if (unlikely(!vmf->pte))
 			ret = VM_FAULT_SIGBUS;
 		else {
-			vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
-						       vmf->pmd,
-						       vmf->address,
-						       &vmf->ptl);
 			/*
 			 * Make sure this is not a temporary clearing of pte
 			 * by holding ptl and checking again. A R/M/W update
@@ -5427,10 +5423,9 @@ int follow_pte(struct mm_struct *mm, unsigned long address,
 	pmd = pmd_offset(pud, address);
 	VM_BUG_ON(pmd_trans_huge(*pmd));
 
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		goto out;
-
 	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+	if (!ptep)
+		goto out;
 	if (!pte_present(*ptep))
 		goto unlock;
 	*ptepp = ptep;