Commit 6b76c3ae authored by Will Deacon

Merge branch 'for-next/faultaround' into for-next/core

Initialise prefaulted PTEs as 'old' for arm64 when hardware access-flag
updates are supported, which drastically improves vmscan performance.

* for-next/faultaround:
  mm: filemap: Fix microblaze build failure with 'mmu_defconfig'
  mm/nommu: Fix return type of filemap_map_pages()
  mm: Mark anonymous struct field of 'struct vm_fault' as 'const'
  mm: Use static initialisers for immutable fields of 'struct vm_fault'
  mm: Avoid modifying vmf.address in __collapse_huge_page_swapin()
  mm: Pass 'address' to map to do_set_pte() and drop FAULT_FLAG_PREFAULT
  mm: Move immutable fields of 'struct vm_fault' into anonymous struct
  arm64: mm: Implement arch_wants_old_prefaulted_pte()
  mm: Allow architectures to request 'old' entries when prefaulting
  mm: Cleanup faultaround and finish_fault() codepaths
parents 90eb8c9d de591a82
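
For orientation before the hunks: the series lets an architecture declare that fault-around mappings should be created 'old', so only the address that actually faulted gets a 'young' PTE and page aging in vmscan is not polluted by speculatively mapped pages. The snippet below is a stand-alone userspace model of that decision, not kernel code; it borrows the arch_wants_old_prefaulted_pte() name from the patches, while PTE_YOUNG, hw_updates_access_flag and make_entry() are invented here purely for illustration.

/*
 * Stand-alone model of the prefault policy added by this series (not
 * kernel code). Compile and run it to see which addresses end up 'old'.
 */
#include <stdbool.h>
#include <stdio.h>

#define PTE_YOUNG 0x1u

static bool hw_updates_access_flag = true;	/* stands in for !arch_faults_on_old_pte() */

static bool arch_wants_old_prefaulted_pte(void)
{
	/* Generic default is false; arm64 returns true when hardware AF updates are cheap. */
	return hw_updates_access_flag;
}

/* Models the decision in do_set_pte(): only the faulting address stays 'young'. */
static unsigned int make_entry(unsigned long fault_addr, unsigned long map_addr)
{
	bool prefault = map_addr != fault_addr;

	if (prefault && arch_wants_old_prefaulted_pte())
		return 0;		/* pte_mkold() */
	return PTE_YOUNG;		/* pte_sw_mkyoung() */
}

int main(void)
{
	/* Fault at 0x2000 while fault-around also maps 0x1000 and 0x3000. */
	unsigned long addr;

	for (addr = 0x1000; addr <= 0x3000; addr += 0x1000)
		printf("%#lx -> %s\n", addr,
		       make_entry(0x2000, addr) & PTE_YOUNG ? "young" : "old");
	return 0;
}

On arm64 the override returns true whenever the CPU can set the access flag in hardware, which is the arch_faults_on_old_pte() test inverted (see the first hunk below).
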
@@ -982,6 +982,16 @@ static inline bool arch_faults_on_old_pte(void)
 }
 #define arch_faults_on_old_pte arch_faults_on_old_pte
+
+/*
+ * Experimentally, it's cheap to set the access flag in hardware and we
+ * benefit from prefaulting mappings as 'old' to start with.
+ */
+static inline bool arch_wants_old_prefaulted_pte(void)
+{
+	return !arch_faults_on_old_pte();
+}
+#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
 #endif /* !__ASSEMBLY__ */
 #endif /* __ASM_PGTABLE_H */
@@ -1319,17 +1319,19 @@ xfs_filemap_pfn_mkwrite(
 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
-static void
+static vm_fault_t
 xfs_filemap_map_pages(
 	struct vm_fault *vmf,
 	pgoff_t start_pgoff,
 	pgoff_t end_pgoff)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
+	vm_fault_t ret;
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	filemap_map_pages(vmf, start_pgoff, end_pgoff);
+	ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	return ret;
 }
 static const struct vm_operations_struct xfs_file_vm_ops = {
...
@@ -514,11 +514,14 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags)
  * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
-	struct vm_area_struct *vma;	/* Target VMA */
-	unsigned int flags;		/* FAULT_FLAG_xxx flags */
-	gfp_t gfp_mask;			/* gfp mask to be used for allocations */
-	pgoff_t pgoff;			/* Logical page offset based on vma */
-	unsigned long address;		/* Faulting virtual address */
+	const struct {
+		struct vm_area_struct *vma;	/* Target VMA */
+		gfp_t gfp_mask;			/* gfp mask to be used for allocations */
+		pgoff_t pgoff;			/* Logical page offset based on vma */
+		unsigned long address;		/* Faulting virtual address */
+	};
+	unsigned int flags;		/* FAULT_FLAG_xxx flags
+					 * XXX: should really be 'const' */
 	pmd_t *pmd;			/* Pointer to pmd entry matching
 					 * the 'address' */
 	pud_t *pud;			/* Pointer to pud entry matching
@@ -542,8 +545,8 @@ struct vm_fault {
 	 * is not NULL, otherwise pmd.
 	 */
 	pgtable_t prealloc_pte;		/* Pre-allocated pte page table.
-					 * vm_ops->map_pages() calls
-					 * alloc_set_pte() from atomic context.
+					 * vm_ops->map_pages() sets up a page
+					 * table from atomic context.
					 * do_fault_around() pre-allocates
					 * page table to avoid allocation from
					 * atomic context.
@@ -578,7 +581,7 @@ struct vm_operations_struct {
 	vm_fault_t (*fault)(struct vm_fault *vmf);
 	vm_fault_t (*huge_fault)(struct vm_fault *vmf,
 			enum page_entry_size pe_size);
-	void (*map_pages)(struct vm_fault *vmf,
+	vm_fault_t (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 	unsigned long (*pagesize)(struct vm_area_struct * area);
@@ -988,7 +991,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
@@ -2622,7 +2627,7 @@ extern void truncate_inode_pages_final(struct address_space *);
 /* generic vm_area_ops exported for stackable file systems */
 extern vm_fault_t filemap_fault(struct vm_fault *vmf);
-extern void filemap_map_pages(struct vm_fault *vmf,
+extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
...
@@ -1314,6 +1314,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
 #endif
 }
+
+/*
+ * the ordering of these checks is important for pmds with _page_devmap set.
+ * if we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
 #ifndef CONFIG_NUMA_BALANCING
 /*
  * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
...
@@ -42,6 +42,8 @@
 #include <linux/psi.h>
 #include <linux/ramfs.h>
 #include <linux/page_idle.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
 #include "internal.h"
 #define CREATE_TRACE_POINTS
@@ -2911,74 +2913,163 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_fault *vmf,
+static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
+{
+	struct mm_struct *mm = vmf->vma->vm_mm;
+	/* Huge page is mapped? No need to proceed. */
+	if (pmd_trans_huge(*vmf->pmd)) {
+		unlock_page(page);
+		put_page(page);
+		return true;
+	}
+	if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
+		vm_fault_t ret = do_set_pmd(vmf, page);
+		if (!ret) {
+			/* The page is mapped successfully, reference consumed. */
+			unlock_page(page);
+			return true;
+		}
+	}
+	if (pmd_none(*vmf->pmd)) {
+		vmf->ptl = pmd_lock(mm, vmf->pmd);
+		if (likely(pmd_none(*vmf->pmd))) {
+			mm_inc_nr_ptes(mm);
+			pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
+			vmf->prealloc_pte = NULL;
+		}
+		spin_unlock(vmf->ptl);
+	}
+	/* See comment in handle_pte_fault() */
+	if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		unlock_page(page);
+		put_page(page);
+		return true;
+	}
+	return false;
+}
+static struct page *next_uptodate_page(struct page *page,
+				       struct address_space *mapping,
+				       struct xa_state *xas, pgoff_t end_pgoff)
+{
+	unsigned long max_idx;
+	do {
+		if (!page)
+			return NULL;
+		if (xas_retry(xas, page))
+			continue;
+		if (xa_is_value(page))
+			continue;
+		if (PageLocked(page))
+			continue;
+		if (!page_cache_get_speculative(page))
+			continue;
+		/* Has the page moved or been split? */
+		if (unlikely(page != xas_reload(xas)))
+			goto skip;
+		if (!PageUptodate(page) || PageReadahead(page))
+			goto skip;
+		if (PageHWPoison(page))
+			goto skip;
+		if (!trylock_page(page))
+			goto skip;
+		if (page->mapping != mapping)
+			goto unlock;
+		if (!PageUptodate(page))
+			goto unlock;
+		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+		if (xas->xa_index >= max_idx)
+			goto unlock;
+		return page;
+unlock:
+		unlock_page(page);
+skip:
+		put_page(page);
+	} while ((page = xas_next_entry(xas, end_pgoff)) != NULL);
+	return NULL;
+}
+static inline struct page *first_map_page(struct address_space *mapping,
+					  struct xa_state *xas,
+					  pgoff_t end_pgoff)
+{
+	return next_uptodate_page(xas_find(xas, end_pgoff),
+				  mapping, xas, end_pgoff);
+}
+static inline struct page *next_map_page(struct address_space *mapping,
+					 struct xa_state *xas,
+					 pgoff_t end_pgoff)
+{
+	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
+				  mapping, xas, end_pgoff);
+}
+vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
-	struct file *file = vmf->vma->vm_file;
+	struct vm_area_struct *vma = vmf->vma;
+	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	pgoff_t last_pgoff = start_pgoff;
-	unsigned long max_idx;
+	unsigned long addr;
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
 	struct page *head, *page;
 	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+	vm_fault_t ret = 0;
 	rcu_read_lock();
-	xas_for_each(&xas, head, end_pgoff) {
-		if (xas_retry(&xas, head))
-			continue;
-		if (xa_is_value(head))
-			goto next;
-		/*
-		 * Check for a locked page first, as a speculative
-		 * reference may adversely influence page migration.
-		 */
-		if (PageLocked(head))
-			goto next;
-		if (!page_cache_get_speculative(head))
-			goto next;
-		/* Has the page moved or been split? */
-		if (unlikely(head != xas_reload(&xas)))
-			goto skip;
+	head = first_map_page(mapping, &xas, end_pgoff);
+	if (!head)
+		goto out;
+	if (filemap_map_pmd(vmf, head)) {
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+	do {
 		page = find_subpage(head, xas.xa_index);
-		if (!PageUptodate(head) ||
-				PageReadahead(page) ||
-				PageHWPoison(page))
-			goto skip;
-		if (!trylock_page(head))
-			goto skip;
-		if (head->mapping != mapping || !PageUptodate(head))
-			goto unlock;
-		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
-		if (xas.xa_index >= max_idx)
+		if (PageHWPoison(page))
 			goto unlock;
 		if (mmap_miss > 0)
 			mmap_miss--;
-		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
-		if (vmf->pte)
-			vmf->pte += xas.xa_index - last_pgoff;
+		addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
+		vmf->pte += xas.xa_index - last_pgoff;
 		last_pgoff = xas.xa_index;
-		if (alloc_set_pte(vmf, page))
+		if (!pte_none(*vmf->pte))
 			goto unlock;
+		/* We're about to handle the fault */
+		if (vmf->address == addr)
+			ret = VM_FAULT_NOPAGE;
+		do_set_pte(vmf, page, addr);
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, addr, vmf->pte);
 		unlock_page(head);
-		goto next;
+		continue;
unlock:
 		unlock_page(head);
-skip:
 		put_page(head);
-next:
-		/* Huge page is mapped? No need to proceed. */
-		if (pmd_trans_huge(*vmf->pmd))
-			break;
-	}
+	} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+out:
 	rcu_read_unlock();
 	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
+	return ret;
 }
 EXPORT_SYMBOL(filemap_map_pages);
...
@@ -991,38 +991,41 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 					struct vm_area_struct *vma,
-					unsigned long address, pmd_t *pmd,
+					unsigned long haddr, pmd_t *pmd,
 					int referenced)
 {
 	int swapped_in = 0;
 	vm_fault_t ret = 0;
+	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+	for (address = haddr; address < end; address += PAGE_SIZE) {
 	struct vm_fault vmf = {
 		.vma = vma,
 		.address = address,
+		.pgoff = linear_page_index(vma, haddr),
 		.flags = FAULT_FLAG_ALLOW_RETRY,
 		.pmd = pmd,
-		.pgoff = linear_page_index(vma, address),
 	};
 	vmf.pte = pte_offset_map(pmd, address);
-	for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
-			vmf.pte++, vmf.address += PAGE_SIZE) {
 	vmf.orig_pte = *vmf.pte;
-	if (!is_swap_pte(vmf.orig_pte))
+	if (!is_swap_pte(vmf.orig_pte)) {
+		pte_unmap(vmf.pte);
 		continue;
+	}
 	swapped_in++;
 	ret = do_swap_page(&vmf);
 	/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
 	if (ret & VM_FAULT_RETRY) {
 		mmap_read_lock(mm);
-		if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
+		if (hugepage_vma_revalidate(mm, haddr, &vma)) {
 			/* vma is no longer available, don't continue to swapin */
 			trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 			return false;
 		}
 		/* check if the pmd is still valid */
-		if (mm_find_pmd(mm, address) != pmd) {
+		if (mm_find_pmd(mm, haddr) != pmd) {
 			trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 			return false;
 		}
@@ -1031,11 +1034,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 		return false;
 	}
-	/* pte is unmapped now, we need to map it */
-	vmf.pte = pte_offset_map(pmd, vmf.address);
 	}
-	vmf.pte--;
-	pte_unmap(vmf.pte);
 	/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
 	if (swapped_in)
...
@@ -134,6 +134,18 @@ static inline bool arch_faults_on_old_pte(void)
 }
 #endif
+
+#ifndef arch_wants_old_prefaulted_pte
+static inline bool arch_wants_old_prefaulted_pte(void)
+{
+	/*
+	 * Transitioning a PTE from 'old' to 'young' can be expensive on
+	 * some architectures, even if it's performed in hardware. By
+	 * default, "false" means prefaulted entries will be 'young'.
+	 */
+	return false;
+}
+#endif
 static int __init disable_randmaps(char *s)
 {
 	randomize_va_space = 0;
@@ -3503,7 +3515,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	if (pte_alloc(vma->vm_mm, vmf->pmd))
 		return VM_FAULT_OOM;
-	/* See the comment in pte_alloc_one_map() */
+	/* See comment in handle_pte_fault() */
 	if (unlikely(pmd_trans_unstable(vmf->pmd)))
 		return 0;
@@ -3643,66 +3655,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 	return ret;
 }
-/*
- * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
- * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
- * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
- * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
- */
-static int pmd_devmap_trans_unstable(pmd_t *pmd)
-{
-	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
-}
-static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	if (!pmd_none(*vmf->pmd))
-		goto map_pte;
-	if (vmf->prealloc_pte) {
-		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-		if (unlikely(!pmd_none(*vmf->pmd))) {
-			spin_unlock(vmf->ptl);
-			goto map_pte;
-		}
-		mm_inc_nr_ptes(vma->vm_mm);
-		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
-		spin_unlock(vmf->ptl);
-		vmf->prealloc_pte = NULL;
-	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
-		return VM_FAULT_OOM;
-	}
-map_pte:
-	/*
-	 * If a huge pmd materialized under us just retry later. Use
-	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
-	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
-	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
-	 * running immediately after a huge pmd fault in a different thread of
-	 * this mm, in turn leading to a misleading pmd_trans_huge() retval.
-	 * All we have to ensure is that it is a regular pmd that we can walk
-	 * with pte_offset_map() and we can do that through an atomic read in
-	 * C, which is what pmd_trans_unstable() provides.
-	 */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
-		return VM_FAULT_NOPAGE;
-	/*
-	 * At this point we know that our vmf->pmd points to a page of ptes
-	 * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
-	 * for the duration of the fault. If a racing MADV_DONTNEED runs and
-	 * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
-	 * be valid and we will re-check to make sure the vmf->pte isn't
-	 * pte_none() under vmf->ptl protection when we return to
-	 * alloc_set_pte().
-	 */
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
-			&vmf->ptl);
-	return 0;
-}
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void deposit_prealloc_pte(struct vm_fault *vmf)
 {
@@ -3717,7 +3669,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
 	vmf->prealloc_pte = NULL;
 }
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -3775,76 +3727,41 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	return ret;
 }
 #else
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 {
-	BUILD_BUG();
-	return 0;
+	return VM_FAULT_FALLBACK;
 }
 #endif
-/**
- * alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the function allocates page table or use pre-allocated.
- *
- * @vmf: fault environment
- * @page: page to map
- *
- * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
- * return.
- *
- * Target users are page handler itself and implementations of
- * vm_ops->map_pages.
- *
- * Return: %0 on success, %VM_FAULT_ code in case of error.
- */
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool prefault = vmf->address != addr;
 	pte_t entry;
-	vm_fault_t ret;
-	if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
-		ret = do_set_pmd(vmf, page);
-		if (ret != VM_FAULT_FALLBACK)
-			return ret;
-	}
-	if (!vmf->pte) {
-		ret = pte_alloc_one_map(vmf);
-		if (ret)
-			return ret;
-	}
-	/* Re-check under ptl */
-	if (unlikely(!pte_none(*vmf->pte))) {
-		update_mmu_tlb(vma, vmf->address, vmf->pte);
-		return VM_FAULT_NOPAGE;
-	}
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
+	if (prefault && arch_wants_old_prefaulted_pte())
+		entry = pte_mkold(entry);
+	else
 		entry = pte_sw_mkyoung(entry);
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-		page_add_new_anon_rmap(page, vma, vmf->address, false);
+		page_add_new_anon_rmap(page, vma, addr, false);
 		lru_cache_add_inactive_or_unevictable(page, vma);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page, false);
 	}
-	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
-	/* no need to invalidate: a not-present page won't be cached */
-	update_mmu_cache(vma, vmf->address, vmf->pte);
-	return 0;
+	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
 }
 /**
  * finish_fault - finish page fault once we have prepared the page to fault
  *
@@ -3862,12 +3779,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
  */
 vm_fault_t finish_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
-	vm_fault_t ret = 0;
+	vm_fault_t ret;
 	/* Did we COW the page? */
-	if ((vmf->flags & FAULT_FLAG_WRITE) &&
-	    !(vmf->vma->vm_flags & VM_SHARED))
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
 		page = vmf->cow_page;
 	else
 		page = vmf->page;
@@ -3876,11 +3793,37 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	 * check even for read faults because we might have lost our CoWed
 	 * page
 	 */
-	if (!(vmf->vma->vm_flags & VM_SHARED))
-		ret = check_stable_address_space(vmf->vma->vm_mm);
-	if (!ret)
-		ret = alloc_set_pte(vmf, page);
-	if (vmf->pte)
+	if (!(vma->vm_flags & VM_SHARED)) {
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			return ret;
+	}
+	if (pmd_none(*vmf->pmd)) {
+		if (PageTransCompound(page)) {
+			ret = do_set_pmd(vmf, page);
+			if (ret != VM_FAULT_FALLBACK)
+				return ret;
+		}
+		if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
+			return VM_FAULT_OOM;
+	}
+	/* See comment in handle_pte_fault() */
+	if (pmd_devmap_trans_unstable(vmf->pmd))
+		return 0;
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+				       vmf->address, &vmf->ptl);
+	ret = 0;
+	/* Re-check under ptl */
+	if (likely(pte_none(*vmf->pte)))
+		do_set_pte(vmf, page, vmf->address);
+	else
+		ret = VM_FAULT_NOPAGE;
+	update_mmu_tlb(vma, vmf->address, vmf->pte);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
 }
@@ -3951,13 +3894,12 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
 	pgoff_t start_pgoff = vmf->pgoff;
 	pgoff_t end_pgoff;
 	int off;
-	vm_fault_t ret = 0;
 	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
 	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
-	vmf->address = max(address & mask, vmf->vma->vm_start);
-	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+	address = max(address & mask, vmf->vma->vm_start);
+	off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
 	start_pgoff -= off;
 	/*
@@ -3965,7 +3907,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
 	 * the vma or nr_pages from start_pgoff, depending what is nearest.
 	 */
 	end_pgoff = start_pgoff -
-		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
 		PTRS_PER_PTE - 1;
 	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
 			start_pgoff + nr_pages - 1);
@@ -3973,31 +3915,11 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
 	if (pmd_none(*vmf->pmd)) {
 		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
 		if (!vmf->prealloc_pte)
-			goto out;
+			return VM_FAULT_OOM;
 		smp_wmb(); /* See comment in __pte_alloc() */
 	}
-	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
-	/* Huge page is mapped? Page fault is solved */
-	if (pmd_trans_huge(*vmf->pmd)) {
-		ret = VM_FAULT_NOPAGE;
-		goto out;
-	}
-	/* ->map_pages() haven't done anything useful. Cold page cache? */
-	if (!vmf->pte)
-		goto out;
-	/* check if the page fault is solved */
-	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
-	if (!pte_none(*vmf->pte))
-		ret = VM_FAULT_NOPAGE;
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
-out:
-	vmf->address = address;
-	vmf->pte = NULL;
-	return ret;
+	return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
 }
 static vm_fault_t do_read_fault(struct vm_fault *vmf)
@@ -4353,7 +4275,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		 */
 		vmf->pte = NULL;
 	} else {
-		/* See comment in pte_alloc_one_map() */
+		/*
+		 * If a huge pmd materialized under us just retry later. Use
+		 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
+		 * of pmd_trans_huge() to ensure the pmd didn't become
+		 * pmd_trans_huge under us and then back to pmd_none, as a
+		 * result of MADV_DONTNEED running immediately after a huge pmd
+		 * fault in a different thread of this mm, in turn leading to a
+		 * misleading pmd_trans_huge() retval. All we have to ensure is
+		 * that it is a regular pmd that we can walk with
+		 * pte_offset_map() and we can do that through an atomic read
+		 * in C, which is what pmd_trans_unstable() provides.
+		 */
 		if (pmd_devmap_trans_unstable(vmf->pmd))
 			return 0;
 		/*
...
@@ -1668,10 +1668,11 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_fault *vmf,
+vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
 	BUG();
+	return 0;
 }
 EXPORT_SYMBOL(filemap_map_pages);
...
@@ -1520,11 +1520,11 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 {
 	struct vm_area_struct pvma;
 	struct page *page;
-	struct vm_fault vmf;
+	struct vm_fault vmf = {
+		.vma = &pvma,
+	};
 	shmem_pseudo_vma_init(&pvma, info, index);
-	vmf.vma = &pvma;
-	vmf.address = 0;
 	page = swap_cluster_readahead(swap, gfp, &vmf);
 	shmem_pseudo_vma_destroy(&pvma);
...
@@ -1951,8 +1951,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	si = swap_info[type];
 	pte = pte_offset_map(pmd, addr);
 	do {
-		struct vm_fault vmf;
 		if (!is_swap_pte(*pte))
 			continue;
@@ -1968,9 +1966,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		swap_map = &si->swap_map[offset];
 		page = lookup_swap_cache(entry, vma, addr);
 		if (!page) {
-			vmf.vma = vma;
-			vmf.address = addr;
-			vmf.pmd = pmd;
+			struct vm_fault vmf = {
+				.vma = vma,
+				.address = addr,
+				.pmd = pmd,
+			};
 			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
 						&vmf);
 		}
...