Commit e4dde56c authored by Yu Zhao, committed by Andrew Morton

mm: multi-gen LRU: per-node lru_gen_folio lists

For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.

An onlining memcg is added to the tail of a random bin in the old
generation. The eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose remainder (mod
2) indexes the old generation, is incremented when all its bins become
empty.
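
As a point of reference, the indexing works out as follows. This is a
condensed sketch of the get_memcg_gen()/get_memcg_bin() helpers and the
lru_gen_rotate_memcg()/lru_gen_online_memcg() code added below, not a
complete excerpt:

    /* MEMCG_NR_GENS == 2, MEMCG_NR_BINS == 8 */
    #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
    #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)

    int old = get_memcg_gen(pgdat->memcg_lru.seq);       /* the old generation */
    int young = get_memcg_gen(pgdat->memcg_lru.seq + 1); /* the young generation */
    int bin = get_random_u32_below(MEMCG_NR_BINS);       /* randomly chosen bin */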

There are four operations:
1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
   its current generation (old or young) and updates its "seg" to
   "head";
2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
   its current generation (old or young) and updates its "seg" to
   "tail";
3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
   the old generation, updates its "gen" to "old" and resets its "seg"
   to "default";
4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
   in the young generation, updates its "gen" to "young" and resets
   its "seg" to "default".
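
In list terms, these operations reduce to the following, a condensed
sketch of lru_gen_rotate_memcg() from the vmscan.c hunk below (the lock,
the nr_memcgs accounting and the seq update are omitted):

    if (op == MEMCG_LRU_HEAD)
            seg = MEMCG_LRU_HEAD;                          /* stay in its generation */
    else if (op == MEMCG_LRU_TAIL)
            seg = MEMCG_LRU_TAIL;                          /* stay in its generation */
    else if (op == MEMCG_LRU_OLD)
            new = get_memcg_gen(pgdat->memcg_lru.seq);     /* move to the old generation */
    else if (op == MEMCG_LRU_YOUNG)
            new = get_memcg_gen(pgdat->memcg_lru.seq + 1); /* move to the young generation */

    hlist_nulls_del_rcu(&lruvec->lrugen.list);
    if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
            hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
    else
            hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);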

The events that trigger the above operations are:
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
2. The first attempt to reclaim an memcg below low, which triggers
   MEMCG_LRU_TAIL;
3. The first attempt to reclaim an memcg below reclaimable size
   threshold, which triggers MEMCG_LRU_TAIL;
4. The second attempt to reclaim an memcg below reclaimable size
   threshold, which triggers MEMCG_LRU_YOUNG;
5. Attempting to reclaim an memcg below min, which triggers
   MEMCG_LRU_YOUNG;
6. Finishing the aging on the eviction path, which triggers
   MEMCG_LRU_YOUNG;
7. Offlining an memcg, which triggers MEMCG_LRU_OLD.

Note that memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures the
eventual fairness to all eligible memcgs. For memcg reclaim, it still
relies on mem_cgroup_iter().
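
Concretely, the two entry points split as in the sketch below, condensed
from the shrink_lruvec()/shrink_node() hunks at the end of this patch:

    /* in shrink_node(): global reclaim walks the per-node memcg LRU */
    if (lru_gen_enabled() && global_reclaim(sc)) {
            lru_gen_shrink_node(pgdat, sc);        /* shrink_many() over pgdat->memcg_lru */
            return;
    }

    /* in shrink_lruvec(): memcg reclaim keeps the per-lruvec path via mem_cgroup_iter() */
    if (lru_gen_enabled() && !global_reclaim(sc)) {
            lru_gen_shrink_lruvec(lruvec, sc);     /* try_to_shrink_lruvec() */
            return;
    }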

Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 77d4459a
@@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
percpu_ref_put(&objcg->refcnt);
}
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
return !memcg || css_tryget(&memcg->css);
}
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
return true;
}
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
...
@@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void)
return current->in_lru_fault;
}
#ifdef CONFIG_MEMCG
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
return READ_ONCE(lruvec->lrugen.seg);
}
#else
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
return 0;
}
#endif
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
@@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void)
return false;
}
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
return 0;
}
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
return false;
...
@@ -7,6 +7,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -367,6 +368,15 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
/* see the comment on MEMCG_NR_GENS */
enum {
MEMCG_LRU_NOP,
MEMCG_LRU_HEAD,
MEMCG_LRU_TAIL,
MEMCG_LRU_OLD,
MEMCG_LRU_YOUNG,
};
#ifdef CONFIG_LRU_GEN
enum {
@@ -426,6 +436,14 @@ struct lru_gen_folio {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
#ifdef CONFIG_MEMCG
/* the memcg generation this lru_gen_folio belongs to */
u8 gen;
/* the list segment this lru_gen_folio belongs to */
u8 seg;
/* per-node lru_gen_folio list for global reclaim */
struct hlist_nulls_node list;
#endif
};
enum {
@@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
/*
* For each node, memcgs are divided into two generations: the old and the
* young. For each generation, memcgs are randomly sharded into multiple bins
* to improve scalability. For each bin, the hlist_nulls is virtually divided
* into three segments: the head, the tail and the default.
*
* An onlining memcg is added to the tail of a random bin in the old generation.
* The eviction starts at the head of a random bin in the old generation. The
* per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
* the old generation, is incremented when all its bins become empty.
*
* There are four operations:
* 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
* current generation (old or young) and updates its "seg" to "head";
* 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
* current generation (old or young) and updates its "seg" to "tail";
* 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
* generation, updates its "gen" to "old" and resets its "seg" to "default";
* 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
* young generation, updates its "gen" to "young" and resets its "seg" to
* "default".
*
* The events that trigger the above operations are:
* 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
* 2. The first attempt to reclaim an memcg below low, which triggers
* MEMCG_LRU_TAIL;
* 3. The first attempt to reclaim an memcg below reclaimable size threshold,
* which triggers MEMCG_LRU_TAIL;
* 4. The second attempt to reclaim an memcg below reclaimable size threshold,
* which triggers MEMCG_LRU_YOUNG;
* 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
* 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
* 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
*
* Note that memcg LRU only applies to global reclaim, and the round-robin
* incrementing of their max_seq counters ensures the eventual fairness to all
* eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
*/
#define MEMCG_NR_GENS 2
#define MEMCG_NR_BINS 8
struct lru_gen_memcg {
/* the per-node memcg generation counter */
unsigned long seq;
/* each memcg has one lru_gen_folio per node */
unsigned long nr_memcgs[MEMCG_NR_GENS];
/* per-node lru_gen_folio list for global reclaim */
struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
/* protects the above */
spinlock_t lock;
};
void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
#endif
void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
#else /* !CONFIG_MEMCG */
#define MEMCG_NR_GENS 1
struct lru_gen_memcg {
};
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
}
#endif /* CONFIG_MEMCG */
#else /* !CONFIG_LRU_GEN */
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
}
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -494,6 +587,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}
#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
#endif
static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
}
static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
{
}
static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
}
static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
}
#endif /* CONFIG_MEMCG */
#endif /* CONFIG_LRU_GEN */
@@ -1243,6 +1354,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
/* lru_gen_folio list */
struct lru_gen_memcg memcg_lru;
#endif
CACHELINE_PADDING(_pad2_);
...
@@ -478,6 +478,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
if (lru_gen_enabled()) {
struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
/* see the comment on MEMCG_NR_GENS */
if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
return;
}
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
@@ -3530,6 +3540,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
if (lru_gen_enabled())
return 0;
if (order > 0)
return 0;
@@ -5391,6 +5404,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
lru_gen_online_memcg(memcg);
return 0;
offline_kmem:
memcg_offline_kmem(memcg);
@@ -5422,6 +5436,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
lru_gen_offline_memcg(memcg);
drain_all_stock(memcg);
@@ -5433,6 +5448,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
invalidate_reclaim_iterators(memcg);
lru_gen_release_memcg(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
...
@@ -7941,6 +7941,7 @@ static void __init free_area_init_node(int nid)
pgdat_set_deferred_range(pgdat);
free_area_init_core(pgdat);
lru_gen_init_pgdat(pgdat);
}
static void __init free_area_init_memoryless_node(int nid)
...
@@ -55,6 +55,8 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -135,11 +137,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
#ifdef CONFIG_LRU_GEN
/* help kswapd make better choices among multiple memcgs */
unsigned long last_reclaimed;
#endif
/* Allocation order */
s8 order;
@@ -3185,6 +3182,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -4453,8 +4453,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
return false;
return max_seq < READ_ONCE(lrugen->max_seq);
}
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4527,8 +4526,6 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
VM_WARN_ON_ONCE(!current_is_kswapd());
sc->last_reclaimed = sc->nr_reclaimed;
/* check the order to exclude compaction-induced reclaim */
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
@@ -5117,8 +5114,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
bool can_swap)
static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -5136,10 +5132,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
if (sc->priority == DEF_PRIORITY)
return nr_to_scan;
try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
/* skip this lruvec as it's low on cold folios */
return 0;
return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}
static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -5148,29 +5142,18 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc)
if (!global_reclaim(sc))
return -1;
/* discount the previous progress for kswapd */
if (current_is_kswapd())
return sc->nr_to_reclaim + sc->last_reclaimed;
return max(sc->nr_to_reclaim, compact_gap(sc->order));
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
long nr_to_scan;
unsigned long scanned = 0;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
lru_add_drain();
blk_start_plug(&plug);
set_mm_walk(lruvec_pgdat(lruvec));
while (true) {
int delta;
int swappiness;
unsigned long nr_to_scan;
if (sc->may_swap)
swappiness = get_swappiness(lruvec, sc);
@@ -5180,7 +5163,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
swappiness = 0;
nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
if (!nr_to_scan)
if (nr_to_scan <= 0)
break;
delta = evict_folios(lruvec, sc, swappiness);
@@ -5197,11 +5180,252 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
cond_resched();
}
/* whether try_to_inc_max_seq() was successful */
return nr_to_scan < 0;
}
static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
{
bool success;
unsigned long scanned = sc->nr_scanned;
unsigned long reclaimed = sc->nr_reclaimed;
int seg = lru_gen_memcg_seg(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
/* see the comment on MEMCG_NR_GENS */
if (!lruvec_is_sizable(lruvec, sc))
return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
mem_cgroup_calculate_protection(NULL, memcg);
if (mem_cgroup_below_min(NULL, memcg))
return MEMCG_LRU_YOUNG;
if (mem_cgroup_below_low(NULL, memcg)) {
/* see the comment on MEMCG_NR_GENS */
if (seg != MEMCG_LRU_TAIL)
return MEMCG_LRU_TAIL;
memcg_memory_event(memcg, MEMCG_LOW);
}
success = try_to_shrink_lruvec(lruvec, sc);
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
if (!sc->proactive)
vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
current->reclaim_state->reclaimed_slab = 0;
return success ? MEMCG_LRU_YOUNG : 0;
}
#ifdef CONFIG_MEMCG
static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
int gen;
int bin;
int first_bin;
struct lruvec *lruvec;
struct lru_gen_folio *lrugen;
const struct hlist_nulls_node *pos;
int op = 0;
struct mem_cgroup *memcg = NULL;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
restart:
gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
rcu_read_lock();
hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
if (op)
lru_gen_rotate_memcg(lruvec, op);
mem_cgroup_put(memcg);
lruvec = container_of(lrugen, struct lruvec, lrugen);
memcg = lruvec_memcg(lruvec);
if (!mem_cgroup_tryget(memcg)) {
op = 0;
memcg = NULL;
continue;
}
rcu_read_unlock();
op = shrink_one(lruvec, sc);
if (sc->nr_reclaimed >= nr_to_reclaim)
goto success;
rcu_read_lock();
}
rcu_read_unlock();
/* restart if raced with lru_gen_rotate_memcg() */
if (gen != get_nulls_value(pos))
goto restart;
/* try the rest of the bins of the current generation */
bin = get_memcg_bin(bin + 1);
if (bin != first_bin)
goto restart;
success:
if (op)
lru_gen_rotate_memcg(lruvec, op);
mem_cgroup_put(memcg);
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
VM_WARN_ON_ONCE(global_reclaim(sc));
lru_add_drain();
blk_start_plug(&plug);
set_mm_walk(lruvec_pgdat(lruvec));
if (try_to_shrink_lruvec(lruvec, sc))
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
clear_mm_walk();
blk_finish_plug(&plug);
}
#else /* !CONFIG_MEMCG */
static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
BUILD_BUG();
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
BUILD_BUG();
}
#endif
static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
{
int priority;
unsigned long reclaimable;
struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
return;
/*
* Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
* priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
* estimated reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
if (get_swappiness(lruvec, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
reclaimable /= MEMCG_NR_GENS;
/* round down reclaimable and round up sc->nr_to_reclaim */
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
sc->priority = clamp(priority, 0, DEF_PRIORITY);
}
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct blk_plug plug;
unsigned long reclaimed = sc->nr_reclaimed;
VM_WARN_ON_ONCE(!global_reclaim(sc));
lru_add_drain();
blk_start_plug(&plug);
set_mm_walk(pgdat);
set_initial_priority(pgdat, sc);
if (current_is_kswapd())
sc->nr_reclaimed = 0;
if (mem_cgroup_disabled())
shrink_one(&pgdat->__lruvec, sc);
else
shrink_many(pgdat, sc);
if (current_is_kswapd())
sc->nr_reclaimed += reclaimed;
clear_mm_walk();
blk_finish_plug(&plug);
/* kswapd should never fail */
pgdat->kswapd_failures = 0;
}
#ifdef CONFIG_MEMCG
void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
int seg;
int old, new;
int bin = get_random_u32_below(MEMCG_NR_BINS);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
spin_lock(&pgdat->memcg_lru.lock);
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
seg = 0;
new = old = lruvec->lrugen.gen;
/* see the comment on MEMCG_NR_GENS */
if (op == MEMCG_LRU_HEAD)
seg = MEMCG_LRU_HEAD;
else if (op == MEMCG_LRU_TAIL)
seg = MEMCG_LRU_TAIL;
else if (op == MEMCG_LRU_OLD)
new = get_memcg_gen(pgdat->memcg_lru.seq);
else if (op == MEMCG_LRU_YOUNG)
new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
else
VM_WARN_ON_ONCE(true);
hlist_nulls_del_rcu(&lruvec->lrugen.list);
if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
else
hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
pgdat->memcg_lru.nr_memcgs[old]--;
pgdat->memcg_lru.nr_memcgs[new]++;
lruvec->lrugen.gen = new;
WRITE_ONCE(lruvec->lrugen.seg, seg);
if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
spin_unlock(&pgdat->memcg_lru.lock);
}
#endif
/******************************************************************************
* state change
******************************************************************************/
@@ -5655,11 +5879,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (!mem_cgroup_disabled()) {
rcu_read_lock();
memcg = mem_cgroup_from_id(memcg_id);
#ifdef CONFIG_MEMCG
if (!mem_cgroup_tryget(memcg))
if (memcg && !css_tryget(&memcg->css))
memcg = NULL;
#endif
rcu_read_unlock();
if (!memcg)
@@ -5807,6 +6031,19 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
}
#ifdef CONFIG_MEMCG
void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
int i, j;
spin_lock_init(&pgdat->memcg_lru.lock);
for (i = 0; i < MEMCG_NR_GENS; i++) {
for (j = 0; j < MEMCG_NR_BINS; j++)
INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
}
}
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5830,7 +6067,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
}
}
}
#endif
void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
int gen;
int nid;
int bin = get_random_u32_below(MEMCG_NR_BINS);
for_each_node(nid) {
struct pglist_data *pgdat = NODE_DATA(nid);
struct lruvec *lruvec = get_lruvec(memcg, nid);
spin_lock(&pgdat->memcg_lru.lock);
VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
gen = get_memcg_gen(pgdat->memcg_lru.seq);
hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
pgdat->memcg_lru.nr_memcgs[gen]++;
lruvec->lrugen.gen = gen;
spin_unlock(&pgdat->memcg_lru.lock);
}
}
void lru_gen_offline_memcg(struct mem_cgroup *memcg)
{
int nid;
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
}
}
void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
int gen;
int nid;
for_each_node(nid) {
struct pglist_data *pgdat = NODE_DATA(nid);
struct lruvec *lruvec = get_lruvec(memcg, nid);
spin_lock(&pgdat->memcg_lru.lock);
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
gen = lruvec->lrugen.gen;
hlist_nulls_del_rcu(&lruvec->lrugen.list);
pgdat->memcg_lru.nr_memcgs[gen]--;
if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
spin_unlock(&pgdat->memcg_lru.lock);
}
}
#endif /* CONFIG_MEMCG */
static int __init init_lru_gen(void)
{
@@ -5857,6 +6156,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
{
}
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
}
#endif /* CONFIG_LRU_GEN */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5870,7 +6173,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
bool proportional_reclaim;
struct blk_plug plug;
if (lru_gen_enabled()) {
if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -6113,6 +6416,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
struct lruvec *target_lruvec;
bool reclaimable = false;
if (lru_gen_enabled() && global_reclaim(sc)) {
lru_gen_shrink_node(pgdat, sc);
return;
}
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
...