Commit 7348cc91 authored by Yu Zhao, committed by Andrew Morton

mm: multi-gen LRU: remove aging fairness safeguard

Recall that the aging produces the youngest generation: first it scans
for accessed folios and updates their gen counters; then it increments
lrugen->max_seq.
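
For intuition: generations form a sliding window of sequence numbers [min_seq, max_seq], and a sequence number maps onto one of MAX_NR_GENS ring slots. The standalone sketch below is illustrative only: MIN_NR_GENS, MAX_NR_GENS and lru_gen_from_seq() mirror the kernel's definitions, while out_of_cold_folios() is a hypothetical helper naming the check this patch moves to the top of should_run_aging(). It shows why min_seq + MIN_NR_GENS > max_seq means a lruvec has no cold folios left and the aging must run:

    #include <stdio.h>
    #include <stdbool.h>

    #define MIN_NR_GENS 2   /* the two youngest generations are considered hot */
    #define MAX_NR_GENS 4

    /* mirrors the kernel's lru_gen_from_seq(): a seq maps onto a ring slot */
    static int lru_gen_from_seq(unsigned long seq)
    {
        return seq % MAX_NR_GENS;
    }

    /* hypothetical helper: true when no generation is cold enough to evict */
    static bool out_of_cold_folios(unsigned long min_seq, unsigned long max_seq)
    {
        return min_seq + MIN_NR_GENS > max_seq;
    }

    int main(void)
    {
        unsigned long min_seq = 5, max_seq = 6;   /* only two generations exist */

        printf("before aging: out of cold folios? %d\n",
               out_of_cold_folios(min_seq, max_seq));   /* prints 1 */

        max_seq++;   /* the aging produces the youngest generation */

        printf("after aging:  out of cold folios? %d (youngest in slot %d)\n",
               out_of_cold_folios(min_seq, max_seq),    /* prints 0 */
               lru_gen_from_seq(max_seq));              /* 7 % 4 == 3 */
        return 0;
    }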

The current aging fairness safeguard for kswapd uses two passes to ensure
fairness among multiple eligible memcgs. On the first pass, which is
shared with the eviction, it checks whether all eligible memcgs are low
on cold folios. If so, it requires a second pass, during which it ages
all those memcgs at the same time.
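
Condensed from the removed lines in the hunks below (not a drop-in; the surrounding logic is elided), the safeguard's flag protocol was:

    /* first pass, in lru_gen_shrink_lruvec(), shared with the eviction: */
    if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
        sc->memcgs_need_aging = false;  /* this memcg still had cold folios */

    /* second pass, in lru_gen_age_node(), kswapd only: */
    if (!sc->memcgs_need_aging) {
        sc->memcgs_need_aging = true;   /* re-arm for the next kswapd cycle */
        return;                         /* optimistically skip the aging */
    }
    /* ... otherwise age all eligible memcgs at the same time */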

With the memcg LRU, the aging, while ensuring eventual fairness, runs
only when necessary. Therefore the current aging fairness safeguard for
kswapd is no longer needed.

Note that memcg LRU only applies to global reclaim. For memcg reclaim,
the aging can be unfair to different memcgs, i.e., their
lrugen->max_seq can be incremented at different paces.
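
With the safeguard gone, should_run_aging() reports an lruvec that is completely out of cold folios by returning true with *nr_to_scan set to 0, and every reclaimer ages synchronously when needed. Condensed from the patched get_nr_to_scan() below:

    if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
        return nr_to_scan;          /* enough cold folios: go evict them */

    /* skip the aging path at the default priority */
    if (sc->priority == DEF_PRIORITY)
        return nr_to_scan;

    try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);

    /* skip this lruvec as it's low on cold folios */
    return 0;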

Link: https://lkml.kernel.org/r/20221222041905.2431096-5-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent a579086c
diff --git a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -137,7 +137,6 @@ struct scan_control {
 #ifdef CONFIG_LRU_GEN
 	/* help kswapd make better choices among multiple memcgs */
-	unsigned int memcgs_need_aging:1;
 	unsigned long last_reclaimed;
 #endif
@@ -4468,7 +4467,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	return true;
 }
 
-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
 			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
 {
 	int gen, type, zone;
@@ -4477,6 +4476,13 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
 	unsigned long total = 0;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	DEFINE_MIN_SEQ(lruvec);
+
+	/* whether this lruvec is completely out of cold folios */
+	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+		*nr_to_scan = 0;
+		return true;
+	}
 
 	for (type = !can_swap; type < ANON_AND_FILE; type++) {
 		unsigned long seq;
@@ -4505,8 +4511,6 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
 	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
 	 * ideal number of generations is MIN_NR_GENS+1.
 	 */
-	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
-		return true;
 	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
 		return false;
 
@@ -4525,40 +4529,54 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
 	return false;
 }
 
-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
 {
-	bool need_aging;
-	unsigned long nr_to_scan;
-	int swappiness = get_swappiness(lruvec, sc);
+	int gen, type, zone;
+	unsigned long total = 0;
+	bool can_swap = get_swappiness(lruvec, sc);
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
 	DEFINE_MIN_SEQ(lruvec);
 
-	VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+	for (type = !can_swap; type < ANON_AND_FILE; type++) {
+		unsigned long seq;
 
-	mem_cgroup_calculate_protection(NULL, memcg);
+		for (seq = min_seq[type]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
 
-	if (mem_cgroup_below_min(NULL, memcg))
-		return false;
+			for (zone = 0; zone < MAX_NR_ZONES; zone++)
+				total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+		}
+	}
 
-	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+	/* whether the size is big enough to be helpful */
+	return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+}
 
-	if (min_ttl) {
-		int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
-		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
+				  unsigned long min_ttl)
+{
+	int gen;
+	unsigned long birth;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	DEFINE_MIN_SEQ(lruvec);
 
-		if (time_is_after_jiffies(birth + min_ttl))
-			return false;
+	VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
 
-		/* the size is likely too small to be helpful */
-		if (!nr_to_scan && sc->priority != DEF_PRIORITY)
-			return false;
-	}
+	/* see the comment on lru_gen_folio */
+	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
 
-	if (need_aging)
-		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+	if (time_is_after_jiffies(birth + min_ttl))
+		return false;
 
-	return true;
+	if (!lruvec_is_sizable(lruvec, sc))
+		return false;
+
+	mem_cgroup_calculate_protection(NULL, memcg);
+
+	return !mem_cgroup_below_min(NULL, memcg);
 }
 
 /* to protect the working set of the last N jiffies */
@@ -4567,46 +4585,32 @@ static unsigned long lru_gen_min_ttl __read_mostly;
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
-	bool success = false;
 	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
 
 	VM_WARN_ON_ONCE(!current_is_kswapd());
 
 	sc->last_reclaimed = sc->nr_reclaimed;
 
-	/*
-	 * To reduce the chance of going into the aging path, which can be
-	 * costly, optimistically skip it if the flag below was cleared in the
-	 * eviction path. This improves the overall performance when multiple
-	 * memcgs are available.
-	 */
-	if (!sc->memcgs_need_aging) {
-		sc->memcgs_need_aging = true;
+	/* check the order to exclude compaction-induced reclaim */
+	if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
 		return;
-	}
-
-	set_mm_walk(pgdat);
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-		if (age_lruvec(lruvec, sc, min_ttl))
-			success = true;
+		if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+			mem_cgroup_iter_break(NULL, memcg);
+			return;
+		}
 
 		cond_resched();
 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 
-	clear_mm_walk();
-
-	/* check the order to exclude compaction-induced reclaim */
-	if (success || !min_ttl || sc->order)
-		return;
-
 	/*
 	 * The main goal is to OOM kill if every generation from all memcgs is
 	 * younger than min_ttl. However, another possibility is all memcgs are
-	 * either below min or empty.
+	 * either too small or below min.
 	 */
 	if (mutex_trylock(&oom_lock)) {
 		struct oom_control oc = {
@@ -5114,34 +5118,28 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
  * reclaim.
  */
 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
-				    bool can_swap, bool *need_aging)
+				    bool can_swap)
 {
 	unsigned long nr_to_scan;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
-	DEFINE_MIN_SEQ(lruvec);
 
 	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
 	    (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
 	     !sc->memcg_low_reclaim))
 		return 0;
 
-	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
-	if (!*need_aging)
+	if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
 		return nr_to_scan;
 
 	/* skip the aging path at the default priority */
 	if (sc->priority == DEF_PRIORITY)
-		goto done;
+		return nr_to_scan;
 
-	/* leave the work to lru_gen_age_node() */
-	if (current_is_kswapd())
-		return 0;
+	try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
 
-	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
-		return nr_to_scan;
-done:
-	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+	/* skip this lruvec as it's low on cold folios */
+	return 0;
 }
 
 static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -5160,9 +5158,7 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc)
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	struct blk_plug plug;
-	bool need_aging = false;
 	unsigned long scanned = 0;
-	unsigned long reclaimed = sc->nr_reclaimed;
 	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
 
 	lru_add_drain();
@@ -5183,13 +5179,13 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		else
 			swappiness = 0;
 
-		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
 		if (!nr_to_scan)
-			goto done;
+			break;
 
 		delta = evict_folios(lruvec, sc, swappiness);
 		if (!delta)
-			goto done;
+			break;
 
 		scanned += delta;
 		if (scanned >= nr_to_scan)
@@ -5201,10 +5197,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		cond_resched();
 	}
 
-	/* see the comment in lru_gen_age_node() */
-	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
-		sc->memcgs_need_aging = false;
-done:
 	clear_mm_walk();
 	blk_finish_plug(&plug);