Commit f45ec5ff authored by Peter Xu, committed by Linus Torvalds

userfaultfd: wp: support swap and page migration

For both swap and page migration, we use bit 2 of the entry to identify
whether the entry is uffd write-protected.  It plays a similar role to the
existing soft-dirty bit in swap entries, but is used only to keep the
uffd-wp tracking for a specific PTE/PMD.
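
As a reference point, here is a minimal sketch of what the swap-PTE helpers
look like on x86, where bit 2 (_PAGE_USER) is reused because it carries no
meaning in a non-present entry.  The real definitions were added earlier in
this series; treat this as an illustration, not the patch itself:

	#define _PAGE_SWP_UFFD_WP	_PAGE_USER

	/* Mark a swap/migration entry as uffd write-protected. */
	static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
	{
		return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
	}

	/* Test whether the swap entry carries the uffd-wp bit. */
	static inline int pte_swp_uffd_wp(pte_t pte)
	{
		return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
	}

	/* Clear it, e.g. before converting back to a plain swp_entry_t. */
	static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
	{
		return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
	}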

Something special here is that when we recover the uffd-wp bit from a
swap/migration entry back into the PTE, we also need to clear the _PAGE_RW
bit; otherwise, even with the _PAGE_UFFD_WP bit set, the write cannot be
trapped at all.
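
Concretely, the recovery step on swap-in (do_swap_page(), see the hunk
below) boils down to the following sketch:

	/* Carry the uffd-wp bit over from the swap entry to the present
	 * PTE, and drop write permission so the next write still faults. */
	if (pte_swp_uffd_wp(vmf->orig_pte)) {
		pte = pte_mkuffd_wp(pte);
		pte = pte_wrprotect(pte);	/* clears _PAGE_RW */
	}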

Previously, change_pte_range() did nothing for uffd when the PTE was a swap
entry.  That can lead to data mismatch if the page we are about to write
protect is swapped out while the UFFDIO_WRITEPROTECT is being sent.  This
patch applies/removes the uffd-wp bit on swap entries as well.
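
From userspace nothing changes; the same UFFDIO_WRITEPROTECT request now
also lands on pages that are currently swapped out.  A hedged sketch (the
wp_range() helper name and error handling are illustrative only, and uffd
is assumed to be registered with UFFDIO_REGISTER_MODE_WP):

	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	static int wp_range(int uffd, unsigned long start, unsigned long len)
	{
		struct uffdio_writeprotect wp = {
			.range = { .start = start, .len = len },
			.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
		};

		/* Returns 0 on success, -1 with errno set on failure. */
		return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
	}
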
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Bobby Powers <bobbypowers@gmail.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Denis Plotnikov <dplotnikov@virtuozzo.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Martin Cracauer <cracauer@cons.org>
Cc: Marty McFadden <mcfadden8@llnl.gov>
Cc: Maya Gokhale <gokhale2@llnl.gov>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@fb.com>
Link: http://lkml.kernel.org/r/20200220163112.11409-11-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 2e3d5dc5
@@ -68,6 +68,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 	if (pte_swp_soft_dirty(pte))
 		pte = pte_swp_clear_soft_dirty(pte);
+	if (pte_swp_uffd_wp(pte))
+		pte = pte_swp_clear_uffd_wp(pte);
 	arch_entry = __pte_to_swp_entry(pte);
 	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
@@ -2297,6 +2297,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		write = is_write_migration_entry(entry);
 		young = false;
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
+		uffd_wp = pmd_swp_uffd_wp(old_pmd);
 	} else {
 		page = pmd_page(old_pmd);
 		if (pmd_dirty(old_pmd))
@@ -2329,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			entry = swp_entry_to_pte(swp_entry);
 			if (soft_dirty)
 				entry = pte_swp_mksoft_dirty(entry);
+			if (uffd_wp)
+				entry = pte_swp_mkuffd_wp(entry);
 		} else {
 			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
 			entry = maybe_mkwrite(entry, vma);
@@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			pte = swp_entry_to_pte(entry);
 			if (pte_swp_soft_dirty(*src_pte))
 				pte = pte_swp_mksoft_dirty(pte);
+			if (pte_swp_uffd_wp(*src_pte))
+				pte = pte_swp_mkuffd_wp(pte);
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
 	} else if (is_device_private_entry(entry)) {
@@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		    is_cow_mapping(vm_flags)) {
 			make_device_private_entry_read(&entry);
 			pte = swp_entry_to_pte(entry);
+			if (pte_swp_uffd_wp(*src_pte))
+				pte = pte_swp_mkuffd_wp(pte);
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
 	}
@@ -3098,6 +3102,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	flush_icache_page(vma, page);
 	if (pte_swp_soft_dirty(vmf->orig_pte))
 		pte = pte_mksoft_dirty(pte);
+	if (pte_swp_uffd_wp(vmf->orig_pte)) {
+		pte = pte_mkuffd_wp(pte);
+		pte = pte_wrprotect(pte);
+	}
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
 	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
 	vmf->orig_pte = pte;
@@ -243,11 +243,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 		entry = pte_to_swp_entry(*pvmw.pte);
 		if (is_write_migration_entry(entry))
 			pte = maybe_mkwrite(pte, vma);
+		else if (pte_swp_uffd_wp(*pvmw.pte))
+			pte = pte_mkuffd_wp(pte);
 
 		if (unlikely(is_zone_device_page(new))) {
 			if (is_device_private_page(new)) {
 				entry = make_device_private_entry(new, pte_write(pte));
 				pte = swp_entry_to_pte(entry);
+				if (pte_swp_uffd_wp(*pvmw.pte))
+					pte = pte_mkuffd_wp(pte);
 			}
 		}
@@ -2338,6 +2342,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pte))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pte))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, addr, ptep, swp_pte);
 
 			/*
@@ -139,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			}
 			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
 			pages++;
-		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
+		} else if (is_swap_pte(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
+			pte_t newpte;
 
 			if (is_write_migration_entry(entry)) {
-				pte_t newpte;
 				/*
 				 * A protection check is difficult so
 				 * just be safe and disable write
@@ -152,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_soft_dirty(oldpte))
 					newpte = pte_swp_mksoft_dirty(newpte);
-				set_pte_at(vma->vm_mm, addr, pte, newpte);
-
-				pages++;
-			}
-
-			if (is_write_device_private_entry(entry)) {
-				pte_t newpte;
-
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else if (is_write_device_private_entry(entry)) {
 				/*
 				 * We do not preserve soft-dirtiness. See
 				 * copy_one_pte() for explanation.
 				 */
 				make_device_private_entry_read(&entry);
 				newpte = swp_entry_to_pte(entry);
-				set_pte_at(vma->vm_mm, addr, pte, newpte);
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else {
+				newpte = oldpte;
+			}
 
+			if (uffd_wp)
+				newpte = pte_swp_mkuffd_wp(newpte);
+			else if (uffd_wp_resolve)
+				newpte = pte_swp_clear_uffd_wp(newpte);
+
+			if (!pte_same(oldpte, newpte)) {
+				set_pte_at(vma->vm_mm, addr, pte, newpte);
 				pages++;
 			}
 		}
@@ -1502,6 +1502,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
 			/*
 			 * No need to invalidate here it will synchronize on
@@ -1601,6 +1603,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
 			/*
 			 * No need to invalidate here it will synchronize on
@@ -1667,6 +1671,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
 			/* Invalidate as we cleared the pte */
 			mmu_notifier_invalidate_range(mm, address,