Commit 1899ad18 authored by Johannes Weiner, committed by Linus Torvalds

mm: workingset: tell cache transitions from workingset thrashing

Refaults happen during transitions between workingsets as well as in-place
thrashing.  Knowing the difference between the two has a range of
applications, including measuring the impact of memory shortage on the
system performance, as well as the ability to balance pressure more intelligently
between the filesystem cache and the swap-backed workingset.

During workingset transitions, inactive cache refaults and pushes out
established active cache.  When that active cache isn't stale, however,
and also ends up refaulting, that's bona fide thrashing.

Introduce a new page flag that tells on eviction whether the page has been
active or not in its lifetime.  This bit is then stored in the shadow
entry, to classify refaults as transitioning or thrashing.

How many page->flags does this leave us with on 32-bit?

	20 bits are always page flags

	21 if you have an MMU

	23 with the zone bits for DMA, Normal, HighMem, Movable

	29 with the sparsemem section bits

	30 if PAE is enabled

	31 with this patch.

So on 32-bit PAE, that leaves 1 bit for distinguishing two NUMA nodes.  If
that's not enough, the system can switch to discontigmem and re-gain the 6
or 7 sparsemem section bits.

Link: http://lkml.kernel.org/r/20180828172258.3185-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Daniel Drake <drake@endlessm.com>
Tested-by: Suren Baghdasaryan <surenb@google.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Weiner <jweiner@fb.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Enderborg <peter.enderborg@sony.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 95f9ab2d
...@@ -163,6 +163,7 @@ enum node_stat_item { ...@@ -163,6 +163,7 @@ enum node_stat_item {
NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
WORKINGSET_REFAULT, WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE, WORKINGSET_ACTIVATE,
WORKINGSET_RESTORE,
WORKINGSET_NODERECLAIM, WORKINGSET_NODERECLAIM,
NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_ANON_MAPPED, /* Mapped anonymous pages */
NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
......
...@@ -69,13 +69,14 @@ ...@@ -69,13 +69,14 @@
*/ */
enum pageflags { enum pageflags {
PG_locked, /* Page is locked. Don't touch. */ PG_locked, /* Page is locked. Don't touch. */
PG_error,
PG_referenced, PG_referenced,
PG_uptodate, PG_uptodate,
PG_dirty, PG_dirty,
PG_lru, PG_lru,
PG_active, PG_active,
PG_workingset,
PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
PG_error,
PG_slab, PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1, PG_arch_1,
...@@ -280,6 +281,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) ...@@ -280,6 +281,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
TESTCLEARFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
......
...@@ -296,7 +296,7 @@ struct vma_swap_readahead { ...@@ -296,7 +296,7 @@ struct vma_swap_readahead {
/* linux/mm/workingset.c */ /* linux/mm/workingset.c */
void *workingset_eviction(struct address_space *mapping, struct page *page); void *workingset_eviction(struct address_space *mapping, struct page *page);
bool workingset_refault(void *shadow); void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page); void workingset_activation(struct page *page);
/* Do not use directly, use workingset_lookup_update */ /* Do not use directly, use workingset_lookup_update */
......
...@@ -88,6 +88,7 @@ ...@@ -88,6 +88,7 @@
{1UL << PG_dirty, "dirty" }, \ {1UL << PG_dirty, "dirty" }, \
{1UL << PG_lru, "lru" }, \ {1UL << PG_lru, "lru" }, \
{1UL << PG_active, "active" }, \ {1UL << PG_active, "active" }, \
{1UL << PG_workingset, "workingset" }, \
{1UL << PG_slab, "slab" }, \ {1UL << PG_slab, "slab" }, \
{1UL << PG_owner_priv_1, "owner_priv_1" }, \ {1UL << PG_owner_priv_1, "owner_priv_1" }, \
{1UL << PG_arch_1, "arch_1" }, \ {1UL << PG_arch_1, "arch_1" }, \
......
...@@ -915,12 +915,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, ...@@ -915,12 +915,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
* data from the working set, only to cache data that will * data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory. * get overwritten with something else, is a waste of memory.
*/ */
if (!(gfp_mask & __GFP_WRITE) && WARN_ON_ONCE(PageActive(page));
shadow && workingset_refault(shadow)) { if (!(gfp_mask & __GFP_WRITE) && shadow)
SetPageActive(page); workingset_refault(page, shadow);
workingset_activation(page);
} else
ClearPageActive(page);
lru_cache_add(page); lru_cache_add(page);
} }
return ret; return ret;
......
...@@ -2369,6 +2369,7 @@ static void __split_huge_page_tail(struct page *head, int tail, ...@@ -2369,6 +2369,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_mlocked) | (1L << PG_mlocked) |
(1L << PG_uptodate) | (1L << PG_uptodate) |
(1L << PG_active) | (1L << PG_active) |
(1L << PG_workingset) |
(1L << PG_locked) | (1L << PG_locked) |
(1L << PG_unevictable) | (1L << PG_unevictable) |
(1L << PG_dirty))); (1L << PG_dirty)));
......
...@@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page) ...@@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page)
SetPageActive(newpage); SetPageActive(newpage);
} else if (TestClearPageUnevictable(page)) } else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage); SetPageUnevictable(newpage);
if (PageWorkingset(page))
SetPageWorkingset(newpage);
if (PageChecked(page)) if (PageChecked(page))
SetPageChecked(newpage); SetPageChecked(newpage);
if (PageMappedToDisk(page)) if (PageMappedToDisk(page))
......
...@@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, ...@@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* /*
* Initiate read into locked page and return. * Initiate read into locked page and return.
*/ */
SetPageWorkingset(new_page);
lru_cache_add_anon(new_page); lru_cache_add_anon(new_page);
*new_page_allocated = true; *new_page_allocated = true;
return new_page; return new_page;
......
...@@ -2145,6 +2145,7 @@ static void shrink_active_list(unsigned long nr_to_scan, ...@@ -2145,6 +2145,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
} }
ClearPageActive(page); /* we are de-activating */ ClearPageActive(page); /* we are de-activating */
SetPageWorkingset(page);
list_add(&page->lru, &l_inactive); list_add(&page->lru, &l_inactive);
} }
......
...@@ -1145,6 +1145,7 @@ const char * const vmstat_text[] = { ...@@ -1145,6 +1145,7 @@ const char * const vmstat_text[] = {
"nr_isolated_file", "nr_isolated_file",
"workingset_refault", "workingset_refault",
"workingset_activate", "workingset_activate",
"workingset_restore",
"workingset_nodereclaim", "workingset_nodereclaim",
"nr_anon_pages", "nr_anon_pages",
"nr_mapped", "nr_mapped",
......
...@@ -121,7 +121,7 @@ ...@@ -121,7 +121,7 @@
* the only thing eating into inactive list space is active pages. * the only thing eating into inactive list space is active pages.
* *
* *
* Activating refaulting pages * Refaulting inactive pages
* *
* All that is known about the active list is that the pages have been * All that is known about the active list is that the pages have been
* accessed more than once in the past. This means that at any given * accessed more than once in the past. This means that at any given
...@@ -134,6 +134,10 @@ ...@@ -134,6 +134,10 @@
* used less frequently than the refaulting page - or even not used at * used less frequently than the refaulting page - or even not used at
* all anymore. * all anymore.
* *
* That means if inactive cache is refaulting with a suitable refault
* distance, we assume the cache workingset is transitioning and put
* pressure on the current active list.
*
* If this is wrong and demotion kicks in, the pages which are truly * If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently * used more frequently will be reactivated while the less frequently
* used once will be evicted from memory. * used once will be evicted from memory.
...@@ -141,6 +145,14 @@ ...@@ -141,6 +145,14 @@
* But if this is right, the stale pages will be pushed out of memory * But if this is right, the stale pages will be pushed out of memory
* and the used pages get to stay in cache. * and the used pages get to stay in cache.
* *
* Refaulting active pages
*
* If on the other hand the refaulting pages have recently been
* deactivated, it means that the active list is no longer protecting
* actively used cache from reclaim. The cache is NOT transitioning to
* a different workingset; the existing workingset is thrashing in the
* space allocated to the page cache.
*
* *
* Implementation * Implementation
* *
...@@ -156,8 +168,7 @@ ...@@ -156,8 +168,7 @@
*/ */
#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
NODES_SHIFT + \ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/* /*
...@@ -170,23 +181,28 @@ ...@@ -170,23 +181,28 @@
*/ */
static unsigned int bucket_order __read_mostly; static unsigned int bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{ {
eviction >>= bucket_order; eviction >>= bucket_order;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << 1) | workingset;
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
} }
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
unsigned long *evictionp) unsigned long *evictionp, bool *workingsetp)
{ {
unsigned long entry = (unsigned long)shadow; unsigned long entry = (unsigned long)shadow;
int memcgid, nid; int memcgid, nid;
bool workingset;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
workingset = entry & 1;
entry >>= 1;
nid = entry & ((1UL << NODES_SHIFT) - 1); nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT; entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
...@@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, ...@@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid; *memcgidp = memcgid;
*pgdat = NODE_DATA(nid); *pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order; *evictionp = entry << bucket_order;
*workingsetp = workingset;
} }
/** /**
...@@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, ...@@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*/ */
void *workingset_eviction(struct address_space *mapping, struct page *page) void *workingset_eviction(struct address_space *mapping, struct page *page)
{ {
struct mem_cgroup *memcg = page_memcg(page);
struct pglist_data *pgdat = page_pgdat(page); struct pglist_data *pgdat = page_pgdat(page);
struct mem_cgroup *memcg = page_memcg(page);
int memcgid = mem_cgroup_id(memcg); int memcgid = mem_cgroup_id(memcg);
unsigned long eviction; unsigned long eviction;
struct lruvec *lruvec; struct lruvec *lruvec;
...@@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) ...@@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
lruvec = mem_cgroup_lruvec(pgdat, memcg); lruvec = mem_cgroup_lruvec(pgdat, memcg);
eviction = atomic_long_inc_return(&lruvec->inactive_age); eviction = atomic_long_inc_return(&lruvec->inactive_age);
return pack_shadow(memcgid, pgdat, eviction); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
} }
/** /**
* workingset_refault - evaluate the refault of a previously evicted page * workingset_refault - evaluate the refault of a previously evicted page
* @page: the freshly allocated replacement page
* @shadow: shadow entry of the evicted page * @shadow: shadow entry of the evicted page
* *
* Calculates and evaluates the refault distance of the previously * Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node it was allocated in. * evicted page in the context of the node it was allocated in.
*
* Returns %true if the page should be activated, %false otherwise.
*/ */
bool workingset_refault(void *shadow) void workingset_refault(struct page *page, void *shadow)
{ {
unsigned long refault_distance; unsigned long refault_distance;
struct pglist_data *pgdat;
unsigned long active_file; unsigned long active_file;
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
unsigned long eviction; unsigned long eviction;
struct lruvec *lruvec; struct lruvec *lruvec;
unsigned long refault; unsigned long refault;
struct pglist_data *pgdat; bool workingset;
int memcgid; int memcgid;
unpack_shadow(shadow, &memcgid, &pgdat, &eviction); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
rcu_read_lock(); rcu_read_lock();
/* /*
...@@ -263,41 +280,51 @@ bool workingset_refault(void *shadow) ...@@ -263,41 +280,51 @@ bool workingset_refault(void *shadow)
* configurations instead. * configurations instead.
*/ */
memcg = mem_cgroup_from_id(memcgid); memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !memcg) { if (!mem_cgroup_disabled() && !memcg)
rcu_read_unlock(); goto out;
return false;
}
lruvec = mem_cgroup_lruvec(pgdat, memcg); lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age); refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
/* /*
* The unsigned subtraction here gives an accurate distance * Calculate the refault distance
* across inactive_age overflows in most cases.
* *
* There is a special case: usually, shadow entries have a * The unsigned subtraction here gives an accurate distance
* short lifetime and are either refaulted or reclaimed along * across inactive_age overflows in most cases. There is a
* with the inode before they get too old. But it is not * special case: usually, shadow entries have a short lifetime
* impossible for the inactive_age to lap a shadow entry in * and are either refaulted or reclaimed along with the inode
* the field, which can then can result in a false small * before they get too old. But it is not impossible for the
* refault distance, leading to a false activation should this * inactive_age to lap a shadow entry in the field, which can
* old entry actually refault again. However, earlier kernels * then result in a false small refault distance, leading to a
* used to deactivate unconditionally with *every* reclaim * false activation should this old entry actually refault
* invocation for the longest time, so the occasional * again. However, earlier kernels used to deactivate
* inappropriate activation leading to pressure on the active * unconditionally with *every* reclaim invocation for the
* list is not a problem. * longest time, so the occasional inappropriate activation
* leading to pressure on the active list is not a problem.
*/ */
refault_distance = (refault - eviction) & EVICTION_MASK; refault_distance = (refault - eviction) & EVICTION_MASK;
inc_lruvec_state(lruvec, WORKINGSET_REFAULT); inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
if (refault_distance <= active_file) { /*
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); * Compare the distance to the existing workingset size. We
rcu_read_unlock(); * don't act on pages that couldn't stay resident even if all
return true; * the memory was available to the page cache.
*/
if (refault_distance > active_file)
goto out;
SetPageActive(page);
atomic_long_inc(&lruvec->inactive_age);
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
/* Page was active prior to eviction */
if (workingset) {
SetPageWorkingset(page);
inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
} }
out:
rcu_read_unlock(); rcu_read_unlock();
return false;
} }
/** /**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment