Commit b25d1dc9 authored by Linus Torvalds

Merge branch 'simplify-do_wp_page'

Merge emailed patches from Peter Xu:
 "This is a small series that I picked up from Linus's suggestion to
  simplify cow handling (and also make it more strict) by checking
  against page refcounts rather than mapcounts.

  This makes uffd-wp work again (verified by running upmapsort)"

Note: this is horrendously bad timing, and making this kind of
fundamental vm change after -rc3 is not at all how things should work.
The saving grace is that it really is a nice simplification:

 8 files changed, 29 insertions(+), 120 deletions(-)

The reason for the bad timing is that it turns out that commit
17839856 ("gup: document and work around 'COW can break either way'
issue") broke not just UFFD functionality (as Peter noticed): Mikulas
Patocka also reports that it caused issues for strace when running in a
DAX environment with ext4 on a persistent memory setup.

And we can't just revert that commit without re-introducing the original
issue, which is a potential security hole, so making COW stricter (and in
the process much simpler) is a step toward undoing the forced COW that
broke other uses.

Link: https://lore.kernel.org/lkml/alpine.LRH.2.02.2009031328040.6929@file01.intranet.prod.int.rdu2.redhat.com/

* emailed patches from Peter Xu <peterx@redhat.com>:
  mm: Add PGREUSE counter
  mm/gup: Remove enfornced COW mechanism
  mm/ksm: Remove reuse_ksm_page()
  mm: do_wp_page() simplification
parents cfc905f1 798a6b87
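
For readers skimming the diff below: the heart of the change is that a write
fault now reuses an anonymous page in place only when the faulting task
provably holds the sole reference to it, and copies otherwise. A condensed,
non-authoritative sketch of that check, paraphrased from the do_wp_page()
hunk further down (the helper name is made up for illustration; the calls are
the ones used in the hunk):

	/* Reuse in place only if we are provably the sole owner; else COW. */
	static bool can_reuse_anon_page(struct page *page)	/* hypothetical helper */
	{
		/* PageKsm() doesn't necessarily raise the page refcount */
		if (PageKsm(page) || page_count(page) != 1)
			return false;
		if (!trylock_page(page))
			return false;			/* lock contended: just copy */
		if (page_mapcount(page) != 1 || page_count(page) != 1) {
			unlock_page(page);
			return false;
		}
		/* page stays locked; caller does wp_page_reuse() then unlocks */
		return true;
	}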
@@ -596,14 +596,6 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
 				      GFP_KERNEL |
 				      __GFP_NORETRY |
 				      __GFP_NOWARN);
-		/*
-		 * Using __get_user_pages_fast() with a read-only
-		 * access is questionable. A read-only page may be
-		 * COW-broken, and then this might end up giving
-		 * the wrong side of the COW..
-		 *
-		 * We may or may not care.
-		 */
 		if (pvec) {
 			/* defer to worker if malloc fails */
 			if (!i915_gem_object_is_readonly(obj))
@@ -53,8 +53,6 @@ struct page *ksm_might_need_to_copy(struct page *page,
 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
-bool reuse_ksm_page(struct page *page,
-			struct vm_area_struct *vma, unsigned long address);
 
 #else  /* !CONFIG_KSM */
@@ -88,11 +86,6 @@ static inline void rmap_walk_ksm(struct page *page,
 static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 {
 }
-static inline bool reuse_ksm_page(struct page *page,
-			struct vm_area_struct *vma, unsigned long address)
-{
-	return false;
-}
 #endif /* CONFIG_MMU */
 #endif /* !CONFIG_KSM */
@@ -30,6 +30,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
 		PGREFILL,
+		PGREUSE,
 		PGSTEAL_KSWAPD,
 		PGSTEAL_DIRECT,
 		PGSCAN_KSWAPD,
@@ -381,22 +381,13 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 }
 
 /*
- * FOLL_FORCE or a forced COW break can write even to unwritable pte's,
- * but only after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE can write to even unwritable pte's, but only
+ * after we've gone through a COW cycle and they are dirty.
  */
 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 {
-	return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte));
-}
-
-/*
- * A (separate) COW fault might break the page the other way and
- * get_user_pages() would return the page from what is now the wrong
- * VM. So we need to force a COW break at GUP time even for reads.
- */
-static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags)
-{
-	return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN));
+	return pte_write(pte) ||
+		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 }
 
 static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -1067,11 +1058,9 @@ static long __get_user_pages(struct mm_struct *mm,
 				goto out;
 			}
 			if (is_vm_hugetlb_page(vma)) {
-				if (should_force_cow_break(vma, foll_flags))
-					foll_flags |= FOLL_WRITE;
 				i = follow_hugetlb_page(mm, vma, pages, vmas,
 						&start, &nr_pages, i,
-						foll_flags, locked);
+						gup_flags, locked);
 				if (locked && *locked == 0) {
 					/*
 					 * We've got a VM_FAULT_RETRY
@@ -1085,10 +1074,6 @@ static long __get_user_pages(struct mm_struct *mm,
 				continue;
 			}
 		}
-
-		if (should_force_cow_break(vma, foll_flags))
-			foll_flags |= FOLL_WRITE;
-
 retry:
 		/*
 		 * If we have a pending SIGKILL, don't keep faulting pages and
@@ -2689,19 +2674,6 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
 		return -EFAULT;
 
 	/*
-	 * The FAST_GUP case requires FOLL_WRITE even for pure reads,
-	 * because get_user_pages() may need to cause an early COW in
-	 * order to avoid confusing the normal COW routines. So only
-	 * targets that are already writable are safe to do by just
-	 * looking at the page tables.
-	 *
-	 * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here,
-	 * because there is no slow path to fall back on. But you'd
-	 * better be careful about possible COW pages - you'll get _a_
-	 * COW page, but not necessarily the one you intended to get
-	 * depending on what COW event happens after this. COW may break
-	 * the page copy in a random direction.
-	 *
 	 * Disable interrupts. The nested form is used, in order to allow
 	 * full, general purpose use of this routine.
 	 *
@@ -2714,8 +2686,6 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
 	 */
 	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) {
 		unsigned long fast_flags = gup_flags;
-		if (!(gup_flags & FOLL_FAST_ONLY))
-			fast_flags |= FOLL_WRITE;
-
 		local_irq_save(flags);
 		gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned);
@@ -1291,12 +1291,13 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 }
 
 /*
- * FOLL_FORCE or a forced COW break can write even to unwritable pmd's,
- * but only after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
  */
 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
 {
-	return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd));
+	return pmd_write(pmd) ||
+		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
 }
 
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -2661,31 +2661,6 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 		goto again;
 }
 
-bool reuse_ksm_page(struct page *page,
-		    struct vm_area_struct *vma,
-		    unsigned long address)
-{
-#ifdef CONFIG_DEBUG_VM
-	if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
-			WARN_ON(!page_mapped(page)) ||
-			WARN_ON(!PageLocked(page))) {
-		dump_page(page, "reuse_ksm_page");
-		return false;
-	}
-#endif
-
-	if (PageSwapCache(page) || !page_stable_node(page))
-		return false;
-	/* Prohibit parallel get_ksm_page() */
-	if (!page_ref_freeze(page, 1))
-		return false;
-
-	page_move_anon_rmap(page, vma);
-	page->index = linear_page_index(vma, address);
-	page_ref_unfreeze(page, 1);
-
-	return true;
-}
-
 #ifdef CONFIG_MIGRATION
 void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 {
@@ -2622,6 +2622,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
 		update_mmu_cache(vma, vmf->address, vmf->pte);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	count_vm_event(PGREUSE);
 }
 
 /*
@@ -2927,50 +2928,25 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	 * not dirty accountable.
 	 */
 	if (PageAnon(vmf->page)) {
-		int total_map_swapcount;
-		if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
-					   page_count(vmf->page) != 1))
+		struct page *page = vmf->page;
+
+		/* PageKsm() doesn't necessarily raise the page refcount */
+		if (PageKsm(page) || page_count(page) != 1)
 			goto copy;
-		if (!trylock_page(vmf->page)) {
-			get_page(vmf->page);
-			pte_unmap_unlock(vmf->pte, vmf->ptl);
-			lock_page(vmf->page);
-			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-					vmf->address, &vmf->ptl);
-			if (!pte_same(*vmf->pte, vmf->orig_pte)) {
-				update_mmu_tlb(vma, vmf->address, vmf->pte);
-				unlock_page(vmf->page);
-				pte_unmap_unlock(vmf->pte, vmf->ptl);
-				put_page(vmf->page);
-				return 0;
-			}
-			put_page(vmf->page);
-		}
-		if (PageKsm(vmf->page)) {
-			bool reused = reuse_ksm_page(vmf->page, vmf->vma,
-						     vmf->address);
-			unlock_page(vmf->page);
-			if (!reused)
-				goto copy;
-			wp_page_reuse(vmf);
-			return VM_FAULT_WRITE;
+		if (!trylock_page(page))
+			goto copy;
+		if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
+			unlock_page(page);
+			goto copy;
 		}
-		if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
-			if (total_map_swapcount == 1) {
-				/*
-				 * The page is all ours. Move it to
-				 * our anon_vma so the rmap code will
-				 * not search our parent or siblings.
-				 * Protected against the rmap code by
-				 * the page lock.
-				 */
-				page_move_anon_rmap(vmf->page, vma);
-			}
-			unlock_page(vmf->page);
-			wp_page_reuse(vmf);
-			return VM_FAULT_WRITE;
-		}
-		unlock_page(vmf->page);
+		/*
+		 * Ok, we've got the only map reference, and the only
+		 * page count reference, and the page is locked,
+		 * it's dark out, and we're wearing sunglasses. Hit it.
+		 */
+		wp_page_reuse(vmf);
+		unlock_page(page);
+		return VM_FAULT_WRITE;
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
 		return wp_page_shared(vmf);
@@ -1241,6 +1241,7 @@ const char * const vmstat_text[] = {
 	"pglazyfreed",
 	"pgrefill",
+	"pgreuse",
 	"pgsteal_kswapd",
 	"pgsteal_direct",
 	"pgscan_kswapd",