Commit a98a2f0c authored by Alistair Popple, committed by Linus Torvalds

mm/rmap: split migration into its own function

Migration is currently implemented as a mode of operation for
try_to_unmap_one() generally specified by passing the TTU_MIGRATION flag
or in the case of splitting a huge anonymous page TTU_SPLIT_FREEZE.

However it does not have much in common with the rest of the unmap
functionality of try_to_unmap_one() and thus splitting it into a separate
function reduces the complexity of try_to_unmap_one() making it more
readable.

Several simplifications can also be made in try_to_migrate_one() based on
the following observations:

 - All users of TTU_MIGRATION also set TTU_IGNORE_MLOCK.
 - No users of TTU_MIGRATION ever set TTU_IGNORE_HWPOISON.
 - No users of TTU_MIGRATION ever set TTU_BATCH_FLUSH.

TTU_SPLIT_FREEZE is a special case of migration used when splitting an
anonymous page.  This is most easily dealt with by calling the correct
function from unmap_page() in mm/huge_memory.c - either try_to_migrate()
for PageAnon or try_to_unmap().

Link: https://lkml.kernel.org/r/20210616105937.23201-5-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent cd62734c
......@@ -86,8 +86,6 @@ struct anon_vma_chain {
};
enum ttu_flags {
TTU_MIGRATION = 0x1, /* migration mode */
TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */
......@@ -97,7 +95,6 @@ enum ttu_flags {
* do a final flush if necessary */
TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock:
* caller holds it */
TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */
};
#ifdef CONFIG_MMU
......@@ -194,6 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags);
void try_to_migrate(struct page *page, enum ttu_flags flags);
void try_to_unmap(struct page *, enum ttu_flags flags);
/* Avoid racy checks */
......
......@@ -2309,16 +2309,20 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC;
VM_BUG_ON_PAGE(!PageHead(page), page);
/* If TTU_SPLIT_FREEZE is ever extended to file, update remap_page() */
/*
* Anon pages need migration entries to preserve them, but file
* pages can simply be left unmapped, then faulted back on demand.
* If that is ever changed (perhaps for mlock), update remap_page().
*/
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
try_to_unmap(page, ttu_flags);
try_to_migrate(page, ttu_flags);
else
try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
......
......@@ -1109,7 +1109,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes */
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
try_to_migrate(page, 0);
page_was_mapped = 1;
}
......@@ -1311,7 +1311,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_mapped(hpage)) {
bool mapping_locked = false;
enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
enum ttu_flags ttu = 0;
if (!PageAnon(hpage)) {
/*
......@@ -1328,7 +1328,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
ttu |= TTU_RMAP_LOCKED;
}
try_to_unmap(hpage, ttu);
try_to_migrate(hpage, ttu);
page_was_mapped = 1;
if (mapping_locked)
......@@ -2602,7 +2602,6 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
unsigned long addr, i, restore = 0;
......@@ -2614,7 +2613,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
continue;
if (page_mapped(page)) {
try_to_unmap(page, flags);
try_to_migrate(page, 0);
if (page_mapped(page))
goto restore;
}
......
......@@ -1411,14 +1411,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
is_zone_device_page(page) && !is_device_private_page(page))
return true;
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_address(vma, address,
flags & TTU_SPLIT_FREEZE, page);
}
if (flags & TTU_SPLIT_HUGE_PMD)
split_huge_pmd_address(vma, address, false, page);
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
......@@ -1443,16 +1437,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte && (flags & TTU_MIGRATION)) {
VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
set_pmd_migration_entry(&pvmw, page);
continue;
}
#endif
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
......@@ -1514,46 +1498,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
}
if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION) &&
is_zone_device_page(page)) {
swp_entry_t entry;
pte_t swp_pte;
pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
entry = make_readable_migration_entry(page_to_pfn(page));
swp_pte = swp_entry_to_pte(entry);
/*
* pteval maps a zone device page and is therefore
* a swap pte.
*/
if (pte_swp_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
*
* The assignment to subpage above was computed from a
* swap PTE which results in an invalid pointer.
* Since only PAGE_SIZE pages can currently be
* migrated, just set it to page. This will need to be
* changed when hugepage migrations to device private
* memory are supported.
*/
subpage = page;
goto discard;
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
if (should_defer_flush(mm, flags)) {
......@@ -1606,39 +1550,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
swp_entry_t entry;
pte_t swp_pte;
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
if (pte_write(pteval))
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
*/
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
......@@ -1766,6 +1677,277 @@ void try_to_unmap(struct page *page, enum ttu_flags flags)
.anon_lock = page_lock_anon_vma_read,
};
if (flags & TTU_RMAP_LOCKED)
rmap_walk_locked(page, &rwc);
else
rmap_walk(page, &rwc);
}
/*
* try_to_migrate_one - replace the mappings of @page in one VMA with
* migration entries.
* @page: the page whose mappings are being replaced
* @vma: the VMA that is walked for mappings of @page
* @address: the address handed to page_vma_mapped_walk() via pvmw.address
* @arg: enum ttu_flags will be passed to this argument.
*
* If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
* containing migration entries. TTU_SYNC selects PVMW_SYNC for the page table
* walk. These two and TTU_RMAP_LOCKED are the only supported flags.
*
* Returns false only if arch_unmap_one() fails, in which case the original
* PTE is put back and the walk is abandoned; returns true otherwise.
*/
static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
.page = page,
.vma = vma,
.address = address,
};
pte_t pteval;
struct page *subpage;
bool ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
/*
* Zone device pages that are not device private are not handled here:
* return without touching their mappings.
*/
if (is_zone_device_page(page) && !is_device_private_page(page))
return true;
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and page_remove_rmap(),
* try_to_migrate() may return before page_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
/*
* unmap_page() in mm/huge_memory.c is the only user of migration with
* TTU_SPLIT_HUGE_PMD and it wants to freeze.
*/
if (flags & TTU_SPLIT_HUGE_PMD)
split_huge_pmd_address(vma, address, true, page);
/*
* For THP, we have to assume the worst case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
*
* Note that the page can not be freed in this function as the caller of
* try_to_migrate() must hold a reference on the page.
*/
range.end = PageKsm(page) ?
address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
}
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
VM_BUG_ON_PAGE(PageHuge(page) ||
!PageTransCompound(page), page);
set_pmd_migration_entry(&pvmw, page);
continue;
}
#endif
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_PAGE(!pvmw.pte, page);
/* Find the subpage of @page that this PTE maps, via its pfn offset. */
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
if (PageHuge(page) && !PageAnon(page)) {
/*
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
*/
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
* page. There is no way of knowing exactly
* which PMDs may be cached for this mm, so
* we must flush them all. start/end were
* already adjusted above to cover this range.
*/
flush_cache_range(vma, range.start, range.end);
flush_tlb_range(vma, range.start, range.end);
mmu_notifier_invalidate_range(mm, range.start,
range.end);
/*
* The ref count of the PMD page was dropped
* which is part of the way map counting
* is done for shared PMDs. Return 'true'
* here. When there is no other sharing,
* huge_pmd_unshare returns false and we will
* unmap the actual page and drop map count
* to zero.
*/
page_vma_mapped_walk_done(&pvmw);
break;
}
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
pteval = ptep_clear_flush(vma, address, pvmw.pte);
/* Move the dirty bit to the page. Now the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
if (is_zone_device_page(page)) {
swp_entry_t entry;
pte_t swp_pte;
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
entry = make_readable_migration_entry(
page_to_pfn(page));
swp_pte = swp_entry_to_pte(entry);
/*
* pteval maps a zone device page and is therefore
* a swap pte.
*/
if (pte_swp_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
/*
* No need to invalidate here, it will synchronize
* against the special swap migration pte.
*
* The assignment to subpage above was computed from a
* swap PTE which results in an invalid pointer.
* Since only PAGE_SIZE pages can currently be
* migrated, just set it to page. This will need to be
* changed when hugepage migrations to device private
* memory are supported.
*/
subpage = page;
} else if (PageHWPoison(page)) {
/*
* Replace the mapping with a hwpoison entry and lower
* the matching mm counter (hugetlb or regular).
*/
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
hugetlb_count_sub(compound_nr(page), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
} else {
dec_mm_counter(mm, mm_counter(page));
set_pte_at(mm, address, pvmw.pte, pteval);
}
} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
* will take care of the rest.
* A future reference will then fault in a new zero
* page. When userfaultfd is active, we must not drop
* this page though, as its main user (postcopy
* migration) will not expect userfaults on already
* copied pages.
*/
dec_mm_counter(mm, mm_counter(page));
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
} else {
swp_entry_t entry;
pte_t swp_pte;
/*
* If the architecture hook fails, put the original PTE
* back and give up on this walk, reporting failure.
*/
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
* A writable PTE gets a writable migration entry,
* any other PTE a readable one.
*/
if (pte_write(pteval))
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
swp_pte = swp_entry_to_pte(entry);
/* Carry the soft-dirty and uffd-wp bits over to the new entry. */
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/*
* No need to invalidate here, it will synchronize
* against the special swap migration pte.
*/
}
/*
* No need to call mmu_notifier_invalidate_range() it has been
* done above for all cases requiring it to happen under page
* table lock before mmu_notifier_invalidate_range_end()
*
* See Documentation/vm/mmu_notifier.rst
*/
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
}
mmu_notifier_invalidate_range_end(&range);
return ret;
}
/**
* try_to_migrate - try to replace all page table mappings with swap entries
* @page: the page to replace page table entries for
* @flags: action and flags
*
* Tries to remove all the page table entries which are mapping this page and
* replace them with special swap entries. Caller must hold the page lock.
*
* Returns nothing: callers check page_mapped() afterwards to find out whether
* any mappings of the page remain.
*/
void try_to_migrate(struct page *page, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_migrate_one,
.arg = (void *)flags,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
/*
* Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
* TTU_SPLIT_HUGE_PMD and TTU_SYNC flags.
*/
if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC)))
return;
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
......@@ -1774,8 +1956,7 @@ void try_to_unmap(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
&& !PageKsm(page) && PageAnon(page))
if (!PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment