Commit bae473a4 authored by Kirill A. Shutemov, committed by Linus Torvalds

mm: introduce fault_env

The idea is borrowed from Peter's patch from the patchset on speculative page
faults [1]:

Instead of passing around the endless list of function arguments,
replace the lot with a single structure so we can change context without
endless function signature changes.

The changes are mostly mechanical, with the exception of the faultaround code:
filemap_map_pages() got reworked a bit.

This patch is preparation for the next one.

[1] http://lkml.kernel.org/r/20141020222841.302891540@infradead.org
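
For illustration only (not part of the patch): a minimal sketch of the kind of
conversion this series makes, using a hypothetical handler do_foo_fault(). The
fault_env fields referenced (vma, address, flags, pmd, pte, ptl) are the ones
added to include/linux/mm.h below.

	/* before: every helper carries the full argument list */
	static int do_foo_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *pte, pmd_t *pmd,
			unsigned int flags);

	/* after: the fault context travels in a single structure */
	static int do_foo_fault(struct fault_env *fe);	/* uses fe->vma, fe->address, fe->flags, ... */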

Link: http://lkml.kernel.org/r/1466021202-61880-9-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent dcddffd4
......@@ -548,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.
->map_pages() is called when VM asks to map easy accessible pages.
Filesystem should find and map pages associated with offsets from "pgoff"
till "max_pgoff". ->map_pages() is called with page table locked and must
Filesystem should find and map pages associated with offsets from "start_pgoff"
till "end_pgoff". ->map_pages() is called with page table locked and must
not block. If it's not possible to reach a page without blocking,
filesystem should skip it. Filesystem should use do_set_pte() to setup
page table entry. Pointer to entry associated with offset "pgoff" is
passed in "pte" field in vm_fault structure. Pointers to entries for other
offsets should be calculated relative to "pte".
page table entry. Pointer to entry associated with the page is passed in
"pte" field in fault_env structure. Pointers to entries for other offsets
should be calculated relative to "pte".
->page_mkwrite() is called when a previously read-only pte is
about to become writeable. The filesystem again must ensure that there are
......
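To make the ->map_pages() contract above concrete, here is a rough sketch of a
filesystem implementation against the new interface. This is not code from the
patch: foofs_map_pages() and foofs_find_page_nonblocking() are made-up names,
and the page locking, refcounting and uptodate checks a real implementation
needs are omitted.

	static void foofs_map_pages(struct fault_env *fe,
			pgoff_t start_pgoff, pgoff_t end_pgoff)
	{
		pgoff_t pgoff = start_pgoff;

		/* fe->pte and fe->address correspond to start_pgoff on entry */
		for (; pgoff <= end_pgoff; pgoff++, fe->pte++, fe->address += PAGE_SIZE) {
			/* must not block: skip pages that are not immediately available */
			struct page *page = foofs_find_page_nonblocking(fe->vma->vm_file, pgoff);

			if (!page)
				continue;
			if (pte_none(*fe->pte))
				do_set_pte(fe, page);	/* installs the entry for fe->address */
		}
	}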
......@@ -257,10 +257,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
* fatal_signal_pending()s, and the mmap_sem must be released before
* returning it.
*/
int handle_userfault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, unsigned long reason)
int handle_userfault(struct fault_env *fe, unsigned long reason)
{
struct mm_struct *mm = vma->vm_mm;
struct mm_struct *mm = fe->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
......@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
ret = VM_FAULT_SIGBUS;
ctx = vma->vm_userfaultfd_ctx.ctx;
ctx = fe->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
......@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now.
*/
if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
/*
* Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
printk(KERN_WARNING
"FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
"FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
dump_stack();
}
#endif
......@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* and wait.
*/
ret = VM_FAULT_RETRY;
if (flags & FAULT_FLAG_RETRY_NOWAIT)
if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
/* take the reference before dropping the mmap_sem */
......@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
uwq.msg = userfault_msg(address, flags, reason);
uwq.msg = userfault_msg(fe->address, fe->flags, reason);
uwq.ctx = ctx;
return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
return_to_userland =
(fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
spin_lock(&ctx->fault_pending_wqh.lock);
......@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock);
must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
......
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H
extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags);
extern int do_huge_pmd_anonymous_page(struct fault_env *fe);
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma);
extern void huge_pmd_set_accessed(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pmd_t orig_pmd, int dirty);
extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pmd_t orig_pmd);
extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd);
extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd);
extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
......@@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page)
return 1;
}
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp);
extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd);
extern struct page *huge_zero_page;
......@@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
return NULL;
}
static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd)
{
return 0;
}
......
......@@ -309,10 +309,27 @@ struct vm_fault {
* VM_FAULT_DAX_LOCKED and fill in
* entry here.
*/
/* for ->map_pages() only */
pgoff_t max_pgoff; /* map pages for offset from pgoff till
* max_pgoff inclusive */
pte_t *pte; /* pte entry associated with ->pgoff */
};
/*
* Page fault context: passes through page fault handler instead of endless list
* of function arguments.
*/
struct fault_env {
struct vm_area_struct *vma; /* Target VMA */
unsigned long address; /* Faulting virtual address */
unsigned int flags; /* FAULT_FLAG_xxx flags */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address'
*/
pte_t *pte; /* Pointer to pte entry matching
* the 'address'. NULL if the page
* table hasn't been allocated.
*/
spinlock_t *ptl; /* Page table lock.
* Protects pte page table if 'pte'
* is not NULL, otherwise pmd.
*/
};
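Callers outside the core fault path can build this context on the stack;
__collapse_huge_page_swapin() later in this patch does exactly that. A sketch
with placeholder variables:

	struct fault_env fe = {
		.vma = vma,
		.address = address,
		.flags = FAULT_FLAG_ALLOW_RETRY,
		.pmd = pmd,
	};

	fe.pte = pte_offset_map(pmd, address);
	ret = do_swap_page(&fe, orig_pte);	/* helpers fill in fe.ptl when they take the lock */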
/*
......@@ -327,7 +344,8 @@ struct vm_operations_struct {
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
pmd_t *, unsigned int flags);
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
void (*map_pages)(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff);
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
......@@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
void do_set_pte(struct vm_area_struct *vma, unsigned long address,
struct page *page, pte_t *pte, bool write, bool anon);
void do_set_pte(struct fault_env *fe, struct page *page);
#endif
/*
......@@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *);
/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf);
extern void filemap_map_pages(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
/* mm/page-writeback.c */
......
......@@ -27,8 +27,7 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, unsigned long reason);
extern int handle_userfault(struct fault_env *fe, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len);
......@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
#else /* CONFIG_USERFAULTFD */
/* mm helpers */
static inline int handle_userfault(struct vm_area_struct *vma,
unsigned long address,
unsigned int flags,
unsigned long reason)
static inline int handle_userfault(struct fault_env *fe, unsigned long reason)
{
return VM_FAULT_SIGBUS;
}
......
......@@ -2128,22 +2128,27 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
void filemap_map_pages(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
struct file *file = vma->vm_file;
struct file *file = fe->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
struct page *page;
unsigned long address = (unsigned long) vmf->virtual_address;
unsigned long addr;
pte_t *pte;
rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
if (iter.index > vmf->max_pgoff)
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
start_pgoff) {
if (iter.index > end_pgoff)
break;
fe->pte += iter.index - last_pgoff;
fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
last_pgoff = iter.index;
if (!pte_none(*fe->pte))
goto next;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
......@@ -2179,14 +2184,9 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
if (page->index >= size >> PAGE_SHIFT)
goto unlock;
pte = vmf->pte + page->index - vmf->pgoff;
if (!pte_none(*pte))
goto unlock;
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
do_set_pte(vma, addr, page, pte, false, false);
do_set_pte(fe, page);
unlock_page(page);
goto next;
unlock:
......@@ -2194,7 +2194,7 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
skip:
put_page(page);
next:
if (iter.index == vmf->max_pgoff)
if (iter.index == end_pgoff)
break;
}
rcu_read_unlock();
......
......@@ -821,26 +821,23 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
struct page *page, gfp_t gfp,
unsigned int flags)
static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
gfp_t gfp)
{
struct vm_area_struct *vma = fe->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
spinlock_t *ptl;
unsigned long haddr = address & HPAGE_PMD_MASK;
unsigned long haddr = fe->address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
pgtable = pte_alloc_one(mm, haddr);
pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable)) {
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
......@@ -855,12 +852,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
spin_unlock(ptl);
fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
if (unlikely(!pmd_none(*fe->pmd))) {
spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
pte_free(vma->vm_mm, pgtable);
} else {
pmd_t entry;
......@@ -868,12 +865,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
if (userfaultfd_missing(vma)) {
int ret;
spin_unlock(ptl);
spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
ret = handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
pte_free(vma->vm_mm, pgtable);
ret = handle_userfault(fe, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
......@@ -883,11 +879,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&mm->nr_ptes);
spin_unlock(ptl);
pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&vma->vm_mm->nr_ptes);
spin_unlock(fe->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
......@@ -937,13 +933,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
int do_huge_pmd_anonymous_page(struct fault_env *fe)
{
struct vm_area_struct *vma = fe->vma;
gfp_t gfp;
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
unsigned long haddr = fe->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
......@@ -951,42 +946,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
if (!(fe->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
spinlock_t *ptl;
pgtable_t pgtable;
struct page *zero_page;
bool set;
int ret;
pgtable = pte_alloc_one(mm, haddr);
pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
zero_page = get_huge_zero_page();
if (unlikely(!zero_page)) {
pte_free(mm, pgtable);
pte_free(vma->vm_mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
ptl = pmd_lock(mm, pmd);
fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
ret = 0;
set = false;
if (pmd_none(*pmd)) {
if (pmd_none(*fe->pmd)) {
if (userfaultfd_missing(vma)) {
spin_unlock(ptl);
ret = handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
spin_unlock(fe->ptl);
ret = handle_userfault(fe, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
set_huge_zero_page(pgtable, mm, vma,
haddr, pmd,
zero_page);
spin_unlock(ptl);
set_huge_zero_page(pgtable, vma->vm_mm, vma,
haddr, fe->pmd, zero_page);
spin_unlock(fe->ptl);
set = true;
}
} else
spin_unlock(ptl);
spin_unlock(fe->ptl);
if (!set) {
pte_free(mm, pgtable);
pte_free(vma->vm_mm, pgtable);
put_huge_zero_page();
}
return ret;
......@@ -998,8 +991,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
}
prep_transhuge_page(page);
return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
flags);
return __do_huge_pmd_anonymous_page(fe, page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
......@@ -1172,38 +1164,31 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return ret;
}
void huge_pmd_set_accessed(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmd, pmd_t orig_pmd,
int dirty)
void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
{
spinlock_t *ptl;
pmd_t entry;
unsigned long haddr;
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
haddr = address & HPAGE_PMD_MASK;
if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
update_mmu_cache_pmd(vma, address, pmd);
haddr = fe->address & HPAGE_PMD_MASK;
if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
fe->flags & FAULT_FLAG_WRITE))
update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
unlock:
spin_unlock(ptl);
spin_unlock(fe->ptl);
}
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmd, pmd_t orig_pmd,
struct page *page,
unsigned long haddr)
static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
struct page *page)
{
struct vm_area_struct *vma = fe->vma;
unsigned long haddr = fe->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
int ret = 0, i;
......@@ -1220,11 +1205,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
__GFP_OTHER_NODE,
vma, address, page_to_nid(page));
__GFP_OTHER_NODE, vma,
fe->address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
&memcg, false))) {
mem_cgroup_try_charge(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
......@@ -1250,41 +1235,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
pte_t entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vma, haddr, false);
page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
pte_unmap(pte);
fe->pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*fe->pte));
set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
pte_unmap(fe->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
pmd_populate(vma->vm_mm, fe->pmd, pgtable);
page_remove_rmap(page, true);
spin_unlock(ptl);
spin_unlock(fe->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
ret |= VM_FAULT_WRITE;
put_page(page);
......@@ -1293,8 +1278,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
return ret;
out_free_pages:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
spin_unlock(fe->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
......@@ -1305,25 +1290,23 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out;
}
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
{
spinlock_t *ptl;
int ret = 0;
struct vm_area_struct *vma = fe->vma;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
unsigned long haddr;
unsigned long haddr = fe->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
int ret = 0;
ptl = pmd_lockptr(mm, pmd);
fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
spin_lock(fe->ptl);
if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
......@@ -1336,13 +1319,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
update_mmu_cache_pmd(vma, address, pmd);
if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
update_mmu_cache_pmd(vma, fe->address, fe->pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
spin_unlock(ptl);
spin_unlock(fe->ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
......@@ -1355,13 +1338,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
prep_transhuge_page(new_page);
} else {
if (!page) {
split_huge_pmd(vma, pmd, address);
split_huge_pmd(vma, fe->pmd, fe->address);
ret |= VM_FAULT_FALLBACK;
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
if (ret & VM_FAULT_OOM) {
split_huge_pmd(vma, pmd, address);
split_huge_pmd(vma, fe->pmd, fe->address);
ret |= VM_FAULT_FALLBACK;
}
put_page(page);
......@@ -1370,14 +1352,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
}
if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
true))) {
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
if (page) {
split_huge_pmd(vma, pmd, address);
split_huge_pmd(vma, fe->pmd, fe->address);
if (page)
put_page(page);
} else
split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
......@@ -1393,13 +1373,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
spin_lock(ptl);
spin_lock(fe->ptl);
if (page)
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(ptl);
if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
......@@ -1407,14 +1387,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
update_mmu_cache_pmd(vma, fe->address, fe->pmd);
if (!page) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
......@@ -1423,13 +1403,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret |= VM_FAULT_WRITE;
}
spin_unlock(ptl);
spin_unlock(fe->ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
spin_unlock(ptl);
spin_unlock(fe->ptl);
return ret;
}
......@@ -1489,13 +1469,12 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
}
/* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
{
spinlock_t *ptl;
struct vm_area_struct *vma = fe->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
unsigned long haddr = fe->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
......@@ -1506,8 +1485,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* A PROT_NONE fault should not end up here */
BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
if (unlikely(!pmd_same(pmd, *fe->pmd)))
goto out_unlock;
/*
......@@ -1515,9 +1494,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
if (unlikely(pmd_trans_migrating(*pmdp))) {
page = pmd_page(*pmdp);
spin_unlock(ptl);
if (unlikely(pmd_trans_migrating(*fe->pmd))) {
page = pmd_page(*fe->pmd);
spin_unlock(fe->ptl);
wait_on_page_locked(page);
goto out;
}
......@@ -1550,7 +1529,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
spin_unlock(ptl);
spin_unlock(fe->ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
......@@ -1561,12 +1540,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* to serialises splits
*/
get_page(page);
spin_unlock(ptl);
spin_unlock(fe->ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp))) {
spin_lock(fe->ptl);
if (unlikely(!pmd_same(pmd, *fe->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
......@@ -1584,9 +1563,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
spin_unlock(ptl);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
spin_unlock(fe->ptl);
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
fe->pmd, pmd, fe->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
......@@ -1601,18 +1580,18 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
update_mmu_cache_pmd(vma, addr, pmdp);
set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
update_mmu_cache_pmd(vma, fe->address, fe->pmd);
unlock_page(page);
out_unlock:
spin_unlock(ptl);
spin_unlock(fe->ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
return 0;
}
......@@ -2413,20 +2392,23 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd)
{
unsigned long _address;
pte_t *pte, pteval;
pte_t pteval;
int swapped_in = 0, ret = 0;
pte = pte_offset_map(pmd, address);
for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
pte++, _address += PAGE_SIZE) {
pteval = *pte;
struct fault_env fe = {
.vma = vma,
.address = address,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
};
fe.pte = pte_offset_map(pmd, address);
for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
fe.pte++, fe.address += PAGE_SIZE) {
pteval = *fe.pte;
if (!is_swap_pte(pteval))
continue;
swapped_in++;
ret = do_swap_page(mm, vma, _address, pte, pmd,
FAULT_FLAG_ALLOW_RETRY,
pteval);
ret = do_swap_page(&fe, pteval);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
......@@ -2442,10 +2424,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false;
}
/* pte is unmapped now, we need to map it */
pte = pte_offset_map(pmd, _address);
fe.pte = pte_offset_map(pmd, fe.address);
}
pte--;
pte_unmap(pte);
fe.pte--;
pte_unmap(fe.pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
return true;
}
......
......@@ -36,9 +36,7 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte);
int do_swap_page(struct fault_env *fe, pte_t orig_pte);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
......
......@@ -2070,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
static inline int wp_page_reuse(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
struct page *page, int page_mkwrite,
int dirty_shared)
__releases(ptl)
static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
struct page *page, int page_mkwrite, int dirty_shared)
__releases(fe->ptl)
{
struct vm_area_struct *vma = fe->vma;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
......@@ -2086,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
flush_cache_page(vma, address, pte_pfn(orig_pte));
flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry, 1))
update_mmu_cache(vma, address, page_table);
pte_unmap_unlock(page_table, ptl);
if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
update_mmu_cache(vma, fe->address, fe->pte);
pte_unmap_unlock(fe->pte, fe->ptl);
if (dirty_shared) {
struct address_space *mapping;
......@@ -2137,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm,
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
pte_t orig_pte, struct page *old_page)
static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
struct page *old_page)
{
struct vm_area_struct *vma = fe->vma;
struct mm_struct *mm = vma->vm_mm;
struct page *new_page = NULL;
spinlock_t *ptl = NULL;
pte_t entry;
int page_copied = 0;
const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
const unsigned long mmun_start = fe->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma, address);
new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
fe->address);
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);
cow_user_page(new_page, old_page, fe->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
......@@ -2173,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Re-check the pte - we dropped the lock
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
if (likely(pte_same(*fe->pte, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
......@@ -2184,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, address, pte_pfn(orig_pte));
flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
......@@ -2193,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address, false);
ptep_clear_flush_notify(vma, fe->address, fe->pte);
page_add_new_anon_rmap(new_page, vma, fe->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
......@@ -2202,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, page_table);
set_pte_at_notify(mm, fe->address, fe->pte, entry);
update_mmu_cache(vma, fe->address, fe->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
......@@ -2240,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
if (new_page)
put_page(new_page);
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
......@@ -2268,44 +2267,43 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
static int wp_pfn_shared(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
pmd_t *pmd)
static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
{
struct vm_area_struct *vma = fe->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
struct vm_fault vmf = {
.page = NULL,
.pgoff = linear_page_index(vma, address),
.virtual_address = (void __user *)(address & PAGE_MASK),
.pgoff = linear_page_index(vma, fe->address),
.virtual_address =
(void __user *)(fe->address & PAGE_MASK),
.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
};
int ret;
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
if (ret & VM_FAULT_ERROR)
return ret;
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
/*
* We might have raced with another page fault while we
* released the pte_offset_map_lock.
*/
if (!pte_same(*page_table, orig_pte)) {
pte_unmap_unlock(page_table, ptl);
if (!pte_same(*fe->pte, orig_pte)) {
pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
}
}
return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
NULL, 0, 0);
return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
}
static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table,
pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
struct page *old_page)
__releases(ptl)
static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
struct page *old_page)
__releases(fe->ptl)
{
struct vm_area_struct *vma = fe->vma;
int page_mkwrite = 0;
get_page(old_page);
......@@ -2313,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
pte_unmap_unlock(page_table, ptl);
tmp = do_page_mkwrite(vma, old_page, address);
pte_unmap_unlock(fe->pte, fe->ptl);
tmp = do_page_mkwrite(vma, old_page, fe->address);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(old_page);
......@@ -2326,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
if (!pte_same(*fe->pte, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
put_page(old_page);
return 0;
}
page_mkwrite = 1;
}
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, page_mkwrite, 1);
return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
}
/*
......@@ -2359,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
__releases(ptl)
static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
__releases(fe->ptl)
{
struct vm_area_struct *vma = fe->vma;
struct page *old_page;
old_page = vm_normal_page(vma, address, orig_pte);
old_page = vm_normal_page(vma, fe->address, orig_pte);
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
......@@ -2377,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
return wp_pfn_shared(mm, vma, address, page_table, ptl,
orig_pte, pmd);
return wp_pfn_shared(fe, orig_pte);
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
orig_pte, old_page);
pte_unmap_unlock(fe->pte, fe->ptl);
return wp_page_copy(fe, orig_pte, old_page);
}
/*
......@@ -2393,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
int total_mapcount;
if (!trylock_page(old_page)) {
get_page(old_page);
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
fe->address, &fe->ptl);
if (!pte_same(*fe->pte, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
put_page(old_page);
return 0;
}
......@@ -2417,14 +2411,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_move_anon_rmap(old_page, vma);
}
unlock_page(old_page);
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, 0, 0);
return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
}
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(mm, vma, address, page_table, pmd,
ptl, orig_pte, old_page);
return wp_page_shared(fe, orig_pte, old_page);
}
/*
......@@ -2432,9 +2424,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
get_page(old_page);
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
orig_pte, old_page);
pte_unmap_unlock(fe->pte, fe->ptl);
return wp_page_copy(fe, orig_pte, old_page);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
......@@ -2522,11 +2513,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
int do_swap_page(struct fault_env *fe, pte_t orig_pte)
{
spinlock_t *ptl;
struct vm_area_struct *vma = fe->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
......@@ -2535,17 +2524,17 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
int exclusive = 0;
int ret = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
goto out;
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
migration_entry_wait(mm, pmd, address);
migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
print_bad_pte(vma, address, orig_pte, NULL);
print_bad_pte(vma, fe->address, orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
......@@ -2554,14 +2543,15 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = lookup_swap_cache(entry);
if (!page) {
page = swapin_readahead(entry,
GFP_HIGHUSER_MOVABLE, vma, address);
GFP_HIGHUSER_MOVABLE, vma, fe->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
fe->address, &fe->ptl);
if (likely(pte_same(*fe->pte, orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
......@@ -2570,7 +2560,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(mm, PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
......@@ -2583,7 +2573,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
swapcache = page;
locked = lock_page_or_retry(page, mm, flags);
locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
......@@ -2600,14 +2590,15 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
page = ksm_might_need_to_copy(page, vma, address);
page = ksm_might_need_to_copy(page, vma, fe->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
goto out_page;
}
if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
&memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
......@@ -2615,8 +2606,9 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Back out if somebody else already faulted in this pte.
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*page_table, orig_pte)))
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
if (unlikely(!pte_same(*fe->pte, orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
......@@ -2634,24 +2626,24 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
* must be called after the swap_free(), or it will never succeed.
*/
inc_mm_counter_fast(mm, MM_ANONPAGES);
dec_mm_counter_fast(mm, MM_SWAPENTS);
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
fe->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
set_pte_at(mm, address, page_table, pte);
set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
if (page == swapcache) {
do_page_add_anon_rmap(page, vma, address, exclusive);
do_page_add_anon_rmap(page, vma, fe->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, address, false);
page_add_new_anon_rmap(page, vma, fe->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
......@@ -2674,22 +2666,22 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
put_page(swapcache);
}
if (flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
if (fe->flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(fe, pte);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, page_table);
update_mmu_cache(vma, fe->address, fe->pte);
unlock:
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
out:
return ret;
out_nomap:
mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
out_page:
unlock_page(page);
out_release:
......@@ -2740,37 +2732,36 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags)
static int do_anonymous_page(struct fault_env *fe)
{
struct vm_area_struct *vma = fe->vma;
struct mem_cgroup *memcg;
struct page *page;
spinlock_t *ptl;
pte_t entry;
pte_unmap(page_table);
pte_unmap(fe->pte);
/* File mapping without ->vm_ops ? */
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
if (check_stack_guard_page(vma, fe->address) < 0)
return VM_FAULT_SIGSEGV;
/* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
if (!(fe->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
if (!pte_none(*fe->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
pte_unmap_unlock(fe->pte, fe->ptl);
return handle_userfault(fe, VM_UFFD_MISSING);
}
goto setpte;
}
......@@ -2778,11 +2769,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, address);
page = alloc_zeroed_user_highpage_movable(vma, fe->address);
if (!page)
goto oom;
if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
goto oom_free_page;
/*
......@@ -2796,30 +2787,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
if (!pte_none(*fe->pte))
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
return handle_userfault(fe, VM_UFFD_MISSING);
}
inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address, false);
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, fe->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(mm, address, page_table, entry);
set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, page_table);
update_mmu_cache(vma, fe->address, fe->pte);
unlock:
pte_unmap_unlock(page_table, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
......@@ -2836,17 +2827,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
pgoff_t pgoff, unsigned int flags,
struct page *cow_page, struct page **page,
void **entry)
static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
struct page *cow_page, struct page **page, void **entry)
{
struct vm_area_struct *vma = fe->vma;
struct vm_fault vmf;
int ret;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.flags = fe->flags;
vmf.page = NULL;
vmf.gfp_mask = __get_fault_gfp_mask(vma);
vmf.cow_page = cow_page;
......@@ -2878,38 +2868,36 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
/**
* do_set_pte - setup new PTE entry for given page and add reverse page mapping.
*
* @vma: virtual memory area
* @address: user virtual address
* @fe: fault environment
* @page: page to map
* @pte: pointer to target page table entry
* @write: true, if new entry is writable
* @anon: true, if it's anonymous page
*
* Caller must hold page table lock relevant for @pte.
* Caller must hold page table lock relevant for @fe->pte.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
void do_set_pte(struct vm_area_struct *vma, unsigned long address,
struct page *page, pte_t *pte, bool write, bool anon)
void do_set_pte(struct fault_env *fe, struct page *page)
{
struct vm_area_struct *vma = fe->vma;
bool write = fe->flags & FAULT_FLAG_WRITE;
pte_t entry;
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address, false);
page_add_new_anon_rmap(page, vma, fe->address, false);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page);
}
set_pte_at(vma->vm_mm, address, pte, entry);
set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, pte);
update_mmu_cache(vma, fe->address, fe->pte);
}
static unsigned long fault_around_bytes __read_mostly =
......@@ -2976,57 +2964,53 @@ late_initcall(fault_around_debugfs);
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pgoff_t pgoff, unsigned int flags)
static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
{
unsigned long start_addr, nr_pages, mask;
pgoff_t max_pgoff;
struct vm_fault vmf;
unsigned long address = fe->address, start_addr, nr_pages, mask;
pte_t *pte = fe->pte;
pgoff_t end_pgoff;
int off;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
start_addr = max(address & mask, vma->vm_start);
off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
pte -= off;
pgoff -= off;
start_addr = max(fe->address & mask, fe->vma->vm_start);
off = ((fe->address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
fe->pte -= off;
start_pgoff -= off;
/*
* max_pgoff is either end of page table or end of vma
* or fault_around_pages() from pgoff, depending what is nearest.
* end_pgoff is either end of page table or end of vma
* or fault_around_pages() from start_pgoff, depending what is nearest.
*/
max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
end_pgoff = start_pgoff -
((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
pgoff + nr_pages - 1);
end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
/* Check if it makes any sense to call ->map_pages */
while (!pte_none(*pte)) {
if (++pgoff > max_pgoff)
return;
start_addr += PAGE_SIZE;
if (start_addr >= vma->vm_end)
return;
pte++;
fe->address = start_addr;
while (!pte_none(*fe->pte)) {
if (++start_pgoff > end_pgoff)
goto out;
fe->address += PAGE_SIZE;
if (fe->address >= fe->vma->vm_end)
goto out;
fe->pte++;
}
vmf.virtual_address = (void __user *) start_addr;
vmf.pte = pte;
vmf.pgoff = pgoff;
vmf.max_pgoff = max_pgoff;
vmf.flags = flags;
vmf.gfp_mask = __get_fault_gfp_mask(vma);
vma->vm_ops->map_pages(vma, &vmf);
fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
out:
/* restore fault_env */
fe->pte = pte;
fe->address = address;
}
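A worked example of the window computation above (illustrative numbers only;
assuming 4K pages, PTRS_PER_PTE == 512, the default fault_around_bytes of 64KB
so nr_pages == 16, and a VMA that starts below the aligned address):

	address    = 0x12345000, mask = ~0xffff  ->  start_addr = 0x12340000
	off        = ((0x12345000 - 0x12340000) >> 12) & 511 = 5
	             (fe->pte and start_pgoff are moved back by 5 entries)
	end_pgoff  = start_pgoff - ((0x12340000 >> 12) & 511) + 511
	           = start_pgoff - 320 + 511 = start_pgoff + 191	/* last pte in this page table */
	end_pgoff  = min3(start_pgoff + 191, <end of vma>, start_pgoff + 15)
	           = start_pgoff + 15

The window therefore spans at most 16 pages from a 64KB-aligned start and can
never cross a page-table (or VMA) boundary.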
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
{
struct vm_area_struct *vma = fe->vma;
struct page *fault_page;
spinlock_t *ptl;
pte_t *pte;
int ret = 0;
/*
......@@ -3035,66 +3019,68 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
do_fault_around(vma, address, pte, pgoff, flags);
if (!pte_same(*pte, orig_pte))
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
if (!pte_same(*fe->pte, orig_pte))
goto unlock_out;
do_fault_around(fe, pgoff);
/* Check if the fault is handled by faultaround */
if (!pte_same(*fe->pte, orig_pte))
goto unlock_out;
pte_unmap_unlock(pte, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
}
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, &fe->ptl);
if (unlikely(!pte_same(*fe->pte, orig_pte))) {
pte_unmap_unlock(fe->pte, fe->ptl);
unlock_page(fault_page);
put_page(fault_page);
return ret;
}
do_set_pte(vma, address, fault_page, pte, false, false);
do_set_pte(fe, fault_page);
unlock_page(fault_page);
unlock_out:
pte_unmap_unlock(pte, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
return ret;
}
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
{
struct vm_area_struct *vma = fe->vma;
struct page *fault_page, *new_page;
void *fault_entry;
struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
if (!new_page)
return VM_FAULT_OOM;
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
&memcg, false)) {
put_page(new_page);
return VM_FAULT_OOM;
}
ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
&fault_entry);
ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
if (!(ret & VM_FAULT_DAX_LOCKED))
copy_user_highpage(new_page, fault_page, address, vma);
copy_user_highpage(new_page, fault_page, fe->address, vma);
__SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
if (unlikely(!pte_same(*fe->pte, orig_pte))) {
pte_unmap_unlock(fe->pte, fe->ptl);
if (!(ret & VM_FAULT_DAX_LOCKED)) {
unlock_page(fault_page);
put_page(fault_page);
......@@ -3104,10 +3090,10 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
do_set_pte(fe, new_page);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
if (!(ret & VM_FAULT_DAX_LOCKED)) {
unlock_page(fault_page);
put_page(fault_page);
......@@ -3121,18 +3107,15 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
}
static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
{
struct vm_area_struct *vma = fe->vma;
struct page *fault_page;
struct address_space *mapping;
spinlock_t *ptl;
pte_t *pte;
int dirtied = 0;
int ret, tmp;
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
......@@ -3142,7 +3125,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (vma->vm_ops->page_mkwrite) {
unlock_page(fault_page);
tmp = do_page_mkwrite(vma, fault_page, address);
tmp = do_page_mkwrite(vma, fault_page, fe->address);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(fault_page);
......@@ -3150,15 +3133,16 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
}
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
if (unlikely(!pte_same(*fe->pte, orig_pte))) {
pte_unmap_unlock(fe->pte, fe->ptl);
unlock_page(fault_page);
put_page(fault_page);
return ret;
}
do_set_pte(vma, address, fault_page, pte, true, false);
pte_unmap_unlock(pte, ptl);
do_set_pte(fe, fault_page);
pte_unmap_unlock(fe->pte, fe->ptl);
if (set_page_dirty(fault_page))
dirtied = 1;
......@@ -3190,23 +3174,20 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
static int do_fault(struct fault_env *fe, pte_t orig_pte)
{
pgoff_t pgoff = linear_page_index(vma, address);
struct vm_area_struct *vma = fe->vma;
pgoff_t pgoff = linear_page_index(vma, fe->address);
pte_unmap(page_table);
pte_unmap(fe->pte);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
if (!(fe->flags & FAULT_FLAG_WRITE))
return do_read_fault(fe, pgoff, orig_pte);
if (!(vma->vm_flags & VM_SHARED))
return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
return do_cow_fault(fe, pgoff, orig_pte);
return do_shared_fault(fe, pgoff, orig_pte);
}
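/*
 * Illustrative recap (not part of the patch) of the dispatch above, now
 * driven entirely by state carried in fault_env:
 *
 *   FAULT_FLAG_WRITE clear                   -> do_read_fault(fe, ...)
 *   FAULT_FLAG_WRITE set, private mapping    -> do_cow_fault(fe, ...)
 *   FAULT_FLAG_WRITE set, VM_SHARED mapping  -> do_shared_fault(fe, ...)
 */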
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
......@@ -3224,11 +3205,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
static int do_numa_page(struct fault_env *fe, pte_t pte)
{
struct vm_area_struct *vma = fe->vma;
struct page *page = NULL;
spinlock_t *ptl;
int page_nid = -1;
int last_cpupid;
int target_nid;
......@@ -3248,10 +3228,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*ptep, pte))) {
pte_unmap_unlock(ptep, ptl);
fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
spin_lock(fe->ptl);
if (unlikely(!pte_same(*fe->pte, pte))) {
pte_unmap_unlock(fe->pte, fe->ptl);
goto out;
}
......@@ -3260,18 +3240,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);
set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
update_mmu_cache(vma, fe->address, fe->pte);
page = vm_normal_page(vma, addr, pte);
page = vm_normal_page(vma, fe->address, pte);
if (!page) {
pte_unmap_unlock(ptep, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
pte_unmap_unlock(ptep, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
}
......@@ -3295,8 +3275,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
pte_unmap_unlock(ptep, ptl);
target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
&flags);
pte_unmap_unlock(fe->pte, fe->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
......@@ -3316,24 +3297,24 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags)
static int create_huge_pmd(struct fault_env *fe)
{
struct vm_area_struct *vma = fe->vma;
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
return do_huge_pmd_anonymous_page(fe);
if (vma->vm_ops->pmd_fault)
return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
fe->flags);
return VM_FAULT_FALLBACK;
}
static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
unsigned int flags)
static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
{
if (vma_is_anonymous(vma))
return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
if (vma->vm_ops->pmd_fault)
return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
if (vma_is_anonymous(fe->vma))
return do_huge_pmd_wp_page(fe, orig_pmd);
if (fe->vma->vm_ops->pmd_fault)
return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
fe->flags);
return VM_FAULT_FALLBACK;
}
......@@ -3353,12 +3334,9 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
static int handle_pte_fault(struct fault_env *fe)
{
pte_t entry;
spinlock_t *ptl;
/*
* some architectures can have larger ptes than wordsize,
......@@ -3368,37 +3346,34 @@ static int handle_pte_fault(struct mm_struct *mm,
* we later double check anyway with the ptl lock held. So here
* a barrier will do.
*/
entry = *pte;
entry = *fe->pte;
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma_is_anonymous(vma))
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
if (vma_is_anonymous(fe->vma))
return do_anonymous_page(fe);
else
return do_fault(mm, vma, address, pte, pmd,
flags, entry);
return do_fault(fe, entry);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
return do_swap_page(fe, entry);
}
if (pte_protnone(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
return do_numa_page(fe, entry);
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
spin_lock(fe->ptl);
if (unlikely(!pte_same(*fe->pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
if (fe->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
return do_wp_page(fe, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte);
if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
fe->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(fe->vma, fe->address, fe->pte);
} else {
/*
* This is needed only for protection faults but the arch code
......@@ -3406,11 +3381,11 @@ static int handle_pte_fault(struct mm_struct *mm,
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
if (fe->flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(fe->vma, fe->address);
}
unlock:
pte_unmap_unlock(pte, ptl);
pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
}
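/*
 * Illustrative recap (not part of the patch): with fault_env in place,
 * handle_pte_fault() above dispatches purely on the pte value it read:
 *
 *   pte_none(entry), anonymous VMA    -> do_anonymous_page(fe)
 *   pte_none(entry), file-backed VMA  -> do_fault(fe, entry)
 *   !pte_present(entry)               -> do_swap_page(fe, entry)
 *   pte_protnone(entry)               -> do_numa_page(fe, entry)
 *   write fault on read-only pte      -> do_wp_page(fe, entry)
 *   otherwise                         -> mark the pte young (and dirty on
 *                                        write) and return 0
 */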
......@@ -3423,51 +3398,42 @@ static int handle_pte_fault(struct mm_struct *mm,
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
struct fault_env fe = {
.vma = vma,
.address = address,
.flags = flags,
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
fe.pmd = pmd_alloc(mm, pud, address);
if (!fe.pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
int ret = create_huge_pmd(mm, vma, address, pmd, flags);
if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
int ret = create_huge_pmd(&fe);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pmd_t orig_pmd = *pmd;
pmd_t orig_pmd = *fe.pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
return do_huge_pmd_numa_page(&fe, orig_pmd);
if (dirty && !pmd_write(orig_pmd)) {
ret = wp_huge_pmd(mm, vma, address, pmd,
orig_pmd, flags);
if ((fe.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
ret = wp_huge_pmd(&fe, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
huge_pmd_set_accessed(&fe, orig_pmd);
return 0;
}
}
......@@ -3478,7 +3444,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
if (unlikely(pte_alloc(mm, pmd, address)))
if (unlikely(pte_alloc(fe.vma->vm_mm, fe.pmd, fe.address)))
return VM_FAULT_OOM;
/*
* If a huge pmd materialized under us just retry later. Use
......@@ -3491,7 +3457,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
if (unlikely(pmd_trans_unstable(fe.pmd) || pmd_devmap(*fe.pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
......@@ -3499,9 +3465,9 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address);
fe.pte = pte_offset_map(fe.pmd, fe.address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
return handle_pte_fault(&fe);
}
/*
......@@ -3530,7 +3496,15 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
ret = __handle_mm_fault(vma, address, flags);
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable();
......
......@@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
void filemap_map_pages(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
}
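/*
 * Implied by the call sites and the signature change above (illustrative,
 * not a hunk from this patch): the ->map_pages() method in
 * struct vm_operations_struct changes from
 *
 *	void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 *
 * to
 *
 *	void (*map_pages)(struct fault_env *fe,
 *			pgoff_t start_pgoff, pgoff_t end_pgoff);
 *
 * so filesystems now receive the shared fault_env plus an explicit
 * [start_pgoff, end_pgoff] window instead of a vm_fault carrying max_pgoff.
 */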
......