Commit b56a2d8a authored by Vineeth Remanan Pillai, committed by Linus Torvalds

mm: rid swapoff of quadratic complexity

This patch was initially posted by Kelley Nielsen.  Reposting the patch
with all review comments addressed and with minor modifications and
optimizations.  Also, folding in the fixes offered by Hugh Dickins and
Huang Ying.  Tests were rerun and commit message updated with new
results.

try_to_unuse() is of quadratic complexity, with a lot of wasted effort.
It unuses swap entries one by one, potentially iterating over all the
page tables for all the processes in the system for each one.

This new proposed implementation of try_to_unuse simplifies its
complexity to linear.  It iterates over the system's mms once, unusing
all the affected entries as it walks each set of page tables.  It also
makes similar changes to shmem_unuse.

Improvement

swapoff was called on a swap partition containing about 6G of data, in a
VM(8cpu, 16G RAM), and calls to unuse_pte_range() were counted.

Present implementation....about 1200M calls(8min, avg 80% cpu util).
Prototype.................about  9.0K calls(3min, avg 5% cpu util).

Details

In shmem_unuse(), iterate over the shmem_swaplist and, for each
shmem_inode_info that contains a swap entry, pass it to
shmem_unuse_inode(), along with the swap type.  In shmem_unuse_inode(),
iterate over its associated xarray, and store the index and value of
each swap entry in an array for passing to shmem_swapin_page() outside
of the RCU critical section.

In try_to_unuse(), instead of iterating over the entries in the type and
unusing them one by one, perhaps walking all the page tables for all the
processes for each one, iterate over the mmlist, making one pass.  Pass
each mm to unuse_mm() to begin its page table walk, and during the walk,
unuse all the ptes that have backing store in the swap type received by
try_to_unuse().  After the walk, check the type for orphaned swap
entries with find_next_to_unuse(), and remove them from the swap cache.
If find_next_to_unuse() starts over at the beginning of the type, repeat
the check of the shmem_swaplist and the walk a maximum of three times.

Change unuse_mm() and the intervening walk functions down to
unuse_pte_range() to take the type as a parameter, and to iterate over
their entire range, calling the next function down on every iteration.
In unuse_pte_range(), make a swap entry from each pte in the range using
the passed in type.  If it has backing store in the type, call
swapin_readahead() to retrieve the page and pass it to unuse_pte().

Pass the count of pages_to_unuse down the page table walks in
try_to_unuse(), and return from the walk when the desired number of
pages has been swapped back in.

Link: http://lkml.kernel.org/r/20190114153129.4852-2-vpillai@digitalocean.com
Signed-off-by: Vineeth Remanan Pillai <vpillai@digitalocean.com>
Signed-off-by: Kelley Nielsen <kelleynnn@gmail.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent c5bf121e
......@@ -7,6 +7,13 @@
#include <linux/bitops.h>
#include <linux/jump_label.h>
/*
* Return code to denote that requested number of
* frontswap pages are unused (moved to page cache).
* Used in shmem_unuse and try_to_unuse.
*/
#define FRONTSWAP_PAGES_UNUSED 2
struct frontswap_ops {
void (*init)(unsigned); /* this swap type was just swapon'ed */
int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
......
......@@ -72,7 +72,8 @@ extern void shmem_unlock_mapping(struct address_space *mapping);
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
extern int shmem_unuse(swp_entry_t entry, struct page *page);
extern int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse);
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
......
......@@ -36,6 +36,7 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
#include <linux/frontswap.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
......@@ -1093,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
static unsigned long find_swap_entry(struct xarray *xa, void *item)
extern struct swap_info_struct *swap_info[];
static int shmem_find_swap_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
struct page **entries, pgoff_t *indices,
bool frontswap)
{
XA_STATE(xas, xa, 0);
unsigned int checked = 0;
void *entry;
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned int ret = 0;
if (!nr_entries)
return 0;
rcu_read_lock();
xas_for_each(&xas, entry, ULONG_MAX) {
if (xas_retry(&xas, entry))
xas_for_each(&xas, page, ULONG_MAX) {
if (xas_retry(&xas, page))
continue;
if (entry == item)
break;
checked++;
if ((checked % XA_CHECK_SCHED) != 0)
if (!xa_is_value(page))
continue;
xas_pause(&xas);
cond_resched_rcu();
if (frontswap) {
swp_entry_t entry = radix_to_swp_entry(page);
if (!frontswap_test(swap_info[swp_type(entry)],
swp_offset(entry)))
continue;
}
indices[ret] = xas.xa_index;
entries[ret] = page;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
}
if (++ret == nr_entries)
break;
}
rcu_read_unlock();
return entry ? xas.xa_index : -1;
return ret;
}
/*
* If swap found in inode, free it and move page from swapcache to filecache.
* Move the swapped pages for an inode to page cache. Returns the count
* of pages swapped in, or the error in case of failure.
*/
static int shmem_unuse_inode(struct shmem_inode_info *info,
swp_entry_t swap, struct page **pagep)
static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
pgoff_t *indices)
{
struct address_space *mapping = info->vfs_inode.i_mapping;
void *radswap;
pgoff_t index;
gfp_t gfp;
int i = 0;
int ret = 0;
int error = 0;
struct address_space *mapping = inode->i_mapping;
radswap = swp_to_radix_entry(swap);
index = find_swap_entry(&mapping->i_pages, radswap);
if (index == -1)
return -EAGAIN; /* tell shmem_unuse we found nothing */
/*
* Move _head_ to start search for next from here.
* But be careful: shmem_evict_inode checks list_empty without taking
* mutex, and there's an instant in list_move_tail when info->swaplist
* would appear empty, if it were the only one on shmem_swaplist.
*/
if (shmem_swaplist.next != &info->swaplist)
list_move_tail(&shmem_swaplist, &info->swaplist);
for (i = 0; i < pvec.nr; i++) {
struct page *page = pvec.pages[i];
gfp = mapping_gfp_mask(mapping);
if (shmem_should_replace_page(*pagep, gfp)) {
mutex_unlock(&shmem_swaplist_mutex);
error = shmem_replace_page(pagep, gfp, info, index);
mutex_lock(&shmem_swaplist_mutex);
/*
* We needed to drop mutex to make that restrictive page
* allocation, but the inode might have been freed while we
* dropped it: although a racing shmem_evict_inode() cannot
* complete without emptying the page cache, our page lock
* on this swapcache page is not enough to prevent that -
* free_swap_and_cache() of our swap entry will only
* trylock_page(), removing swap from page cache whatever.
*
* We must not proceed to shmem_add_to_page_cache() if the
* inode has been freed, but of course we cannot rely on
* inode or mapping or info to check that. However, we can
* safely check if our swap entry is still in use (and here
* it can't have got reused for another page): if it's still
* in use, then the inode cannot have been freed yet, and we
* can safely proceed (if it's no longer in use, that tells
* nothing about the inode, but we don't need to unuse swap).
*/
if (!page_swapcount(*pagep))
error = -ENOENT;
if (!xa_is_value(page))
continue;
error = shmem_swapin_page(inode, indices[i],
&page, SGP_CACHE,
mapping_gfp_mask(mapping),
NULL, NULL);
if (error == 0) {
unlock_page(page);
put_page(page);
ret++;
}
if (error == -ENOMEM)
break;
error = 0;
}
return error ? error : ret;
}
/*
* We rely on shmem_swaplist_mutex, not only to protect the swaplist,
* but also to hold up shmem_evict_inode(): so inode cannot be freed
* beneath us (pagelock doesn't help until the page is in pagecache).
*/
if (!error)
error = shmem_add_to_page_cache(*pagep, mapping, index,
radswap, gfp);
if (error != -ENOMEM) {
/*
* Truncation and eviction use free_swap_and_cache(), which
* only does trylock page: if we raced, best clean up here.
*/
delete_from_swap_cache(*pagep);
set_page_dirty(*pagep);
if (!error) {
spin_lock_irq(&info->lock);
info->swapped--;
spin_unlock_irq(&info->lock);
swap_free(swap);
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
static int shmem_unuse_inode(struct inode *inode, unsigned int type,
bool frontswap, unsigned long *fs_pages_to_unuse)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0;
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
int ret = 0;
pagevec_init(&pvec);
do {
unsigned int nr_entries = PAGEVEC_SIZE;
if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
nr_entries = *fs_pages_to_unuse;
pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
pvec.pages, indices,
frontswap);
if (pvec.nr == 0) {
ret = 0;
break;
}
}
return error;
ret = shmem_unuse_swap_entries(inode, pvec, indices);
if (ret < 0)
break;
if (frontswap_partial) {
*fs_pages_to_unuse -= ret;
if (*fs_pages_to_unuse == 0) {
ret = FRONTSWAP_PAGES_UNUSED;
break;
}
}
start = indices[pvec.nr - 1];
} while (true);
return ret;
}
/*
* Search through swapped inodes to find and replace swap by page.
* Read all the shared memory data that resides in the swap
* device 'type' back into memory, so the swap device can be
* unused.
*/
int shmem_unuse(swp_entry_t swap, struct page *page)
int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
{
struct list_head *this, *next;
struct shmem_inode_info *info;
struct mem_cgroup *memcg;
struct shmem_inode_info *info, *next;
struct inode *inode;
struct inode *prev_inode = NULL;
int error = 0;
/*
* There's a faint possibility that swap page was replaced before
* caller locked it: caller will come back later with the right page.
*/
if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
goto out;
if (list_empty(&shmem_swaplist))
return 0;
mutex_lock(&shmem_swaplist_mutex);
/*
* Charge page using GFP_KERNEL while we can wait, before taking
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
* The extra refcount on the inode is necessary to safely dereference
* p->next after re-acquiring the lock. New shmem inodes with swap
* get added to the end of the list and we will scan them all.
*/
error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
&memcg, false);
if (error)
goto out;
/* No memory allocation: swap entry occupies the slot for the page */
error = -EAGAIN;
mutex_lock(&shmem_swaplist_mutex);
list_for_each_safe(this, next, &shmem_swaplist) {
info = list_entry(this, struct shmem_inode_info, swaplist);
if (info->swapped)
error = shmem_unuse_inode(info, swap, &page);
else
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
continue;
}
inode = igrab(&info->vfs_inode);
if (!inode)
continue;
mutex_unlock(&shmem_swaplist_mutex);
if (prev_inode)
iput(prev_inode);
prev_inode = inode;
error = shmem_unuse_inode(inode, type, frontswap,
fs_pages_to_unuse);
cond_resched();
if (error != -EAGAIN)
mutex_lock(&shmem_swaplist_mutex);
next = list_next_entry(info, swaplist);
if (!info->swapped)
list_del_init(&info->swaplist);
if (error)
break;
/* found nothing in this: move on to search the next */
}
mutex_unlock(&shmem_swaplist_mutex);
if (error) {
if (error != -ENOMEM)
error = 0;
mem_cgroup_cancel_charge(page, memcg, false);
} else
mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
put_page(page);
if (prev_inode)
iput(prev_inode);
return error;
}
......@@ -1329,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
*/
mutex_lock(&shmem_swaplist_mutex);
if (list_empty(&info->swaplist))
list_add_tail(&info->swaplist, &shmem_swaplist);
list_add(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
spin_lock_irq(&info->lock);
......@@ -3886,7 +3912,8 @@ int __init shmem_init(void)
return 0;
}
int shmem_unuse(swp_entry_t swap, struct page *page)
int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
{
return 0;
}
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment