Commit 91cdcd8d authored by Johannes Weiner, committed by Andrew Morton

mm: zswap: optimize zswap pool size tracking

Profiling the munmap() of a zswapped memory region shows 60% of the total
cycles currently going into updating the zswap_pool_total_size.

There are three consumers of this counter:
- store, to enforce the globally configured pool limit
- meminfo & debugfs, to report the size to the user
- shrink, to determine the batch size for each cycle

Instead of aggregating every time an entry enters or exits the zswap
pool, aggregate the value from the zpools on-demand:

- Stores aggregate the counter anyway upon success. Aggregating to
  check the limit instead is the same amount of work.

- Meminfo & debugfs might benefit somewhat from a pre-aggregated
  counter, but aren't exactly hotpaths.

- Shrinking can aggregate once for every cycle instead of doing it for
  every freed entry. As the shrinker might work on tens or hundreds of
  objects per scan cycle, this is a large reduction in aggregations.

The paths that benefit dramatically are swapin, swapoff, and unmaps. 
There could be millions of pages being processed until somebody asks for
the pool size again.  This eliminates the pool size updates from those
paths entirely.

Top profile entries for a 24G range munmap(), before:

    38.54%  zswap-unmap  [kernel.kallsyms]  [k] zs_zpool_total_size
    12.51%  zswap-unmap  [kernel.kallsyms]  [k] zpool_get_total_size
     9.10%  zswap-unmap  [kernel.kallsyms]  [k] zswap_update_total_size
     2.95%  zswap-unmap  [kernel.kallsyms]  [k] obj_cgroup_uncharge_zswap
     2.88%  zswap-unmap  [kernel.kallsyms]  [k] __slab_free
     2.86%  zswap-unmap  [kernel.kallsyms]  [k] xas_store

and after:

     7.70%  zswap-unmap  [kernel.kallsyms]  [k] __slab_free
     7.16%  zswap-unmap  [kernel.kallsyms]  [k] obj_cgroup_uncharge_zswap
     6.74%  zswap-unmap  [kernel.kallsyms]  [k] xas_store

It was also briefly considered to move to a single atomic in zswap
that is updated by the backends, since zswap only cares about the sum
of all pools anyway. However, zram directly needs per-pool information
out of zsmalloc. To keep the backend from having to update two atomics
every time, I opted for the lazy aggregation instead for now.

Link: https://lkml.kernel.org/r/20240312153901.3441-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 64078b3d
...@@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) ...@@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap); show_val_kb(m, "SwapFree: ", i.freeswap);
#ifdef CONFIG_ZSWAP #ifdef CONFIG_ZSWAP
seq_printf(m, "Zswap: %8lu kB\n", show_val_kb(m, "Zswap: ", zswap_total_pages());
(unsigned long)(zswap_pool_total_size >> 10));
seq_printf(m, "Zswapped: %8lu kB\n", seq_printf(m, "Zswapped: %8lu kB\n",
(unsigned long)atomic_read(&zswap_stored_pages) << (unsigned long)atomic_read(&zswap_stored_pages) <<
(PAGE_SHIFT - 10)); (PAGE_SHIFT - 10));
......
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
struct lruvec; struct lruvec;
extern u64 zswap_pool_total_size;
extern atomic_t zswap_stored_pages; extern atomic_t zswap_stored_pages;
#ifdef CONFIG_ZSWAP #ifdef CONFIG_ZSWAP
...@@ -27,6 +26,7 @@ struct zswap_lruvec_state { ...@@ -27,6 +26,7 @@ struct zswap_lruvec_state {
atomic_long_t nr_zswap_protected; atomic_long_t nr_zswap_protected;
}; };
unsigned long zswap_total_pages(void);
bool zswap_store(struct folio *folio); bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio); bool zswap_load(struct folio *folio);
void zswap_invalidate(swp_entry_t swp); void zswap_invalidate(swp_entry_t swp);
......
...@@ -43,8 +43,6 @@ ...@@ -43,8 +43,6 @@
/********************************* /*********************************
* statistics * statistics
**********************************/ **********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */ /* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0); atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */ /* The number of same-value filled pages currently stored in zswap */
...@@ -265,45 +263,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) ...@@ -265,45 +263,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0])) zpool_get_type((p)->zpools[0]))
/*
 * Report whether the compressed pool has outgrown its configured limit.
 *
 * The limit is zswap_max_pool_percent percent of totalram_pages(); usage
 * is zswap_pool_total_size rounded up to whole pages.
 */
static bool zswap_is_full(void)
{
	unsigned long limit_pages;

	limit_pages = totalram_pages() * zswap_max_pool_percent / 100;

	return DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE) > limit_pages;
}
/*
 * Report whether pool usage is below the acceptance threshold.
 *
 * The threshold is derived from totalram_pages() scaled first by
 * zswap_accept_thr_percent and then by zswap_max_pool_percent, each with
 * its own integer division by 100. Usage is zswap_pool_total_size rounded
 * up to whole pages.
 */
static bool zswap_can_accept(void)
{
	unsigned long thr_pages;

	/* Keep the two scaling steps in this order: integer division truncates. */
	thr_pages = totalram_pages() * zswap_accept_thr_percent / 100;
	thr_pages = thr_pages * zswap_max_pool_percent / 100;

	return DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE) < thr_pages;
}
/*
 * Sum zpool_get_total_size() over all ZSWAP_NR_ZPOOLS zpools of @pool
 * and return the result.
 */
static u64 get_zswap_pool_size(struct zswap_pool *pool)
{
	u64 bytes = 0;
	int idx = 0;

	while (idx < ZSWAP_NR_ZPOOLS) {
		bytes += zpool_get_total_size(pool->zpools[idx]);
		idx++;
	}

	return bytes;
}
static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
u64 total = 0;
rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list)
total += get_zswap_pool_size(pool);
rcu_read_unlock();
zswap_pool_total_size = total;
}
/********************************* /*********************************
* pool functions * pool functions
**********************************/ **********************************/
...@@ -541,6 +500,33 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) ...@@ -541,6 +500,33 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL; return NULL;
} }
/*
 * Pool size limit in pages: zswap_max_pool_percent percent of
 * totalram_pages(), using truncating integer division.
 */
static unsigned long zswap_max_pages(void)
{
	unsigned long ram_pages = totalram_pages();

	return ram_pages * zswap_max_pool_percent / 100;
}
/*
 * Acceptance threshold in pages: zswap_accept_thr_percent percent of
 * zswap_max_pages(), using truncating integer division.
 */
static unsigned long zswap_accept_thr_pages(void)
{
	unsigned long limit_pages = zswap_max_pages();

	return limit_pages * zswap_accept_thr_percent / 100;
}
/*
 * Aggregate the current size of the compressed pool on demand.
 *
 * Walks the zswap_pools list inside an RCU read-side critical section and
 * adds up zpool_get_total_size() for each of the ZSWAP_NR_ZPOOLS zpools
 * of every pool.
 *
 * Returns the byte total shifted right by PAGE_SHIFT, i.e. whole pages
 * with any partial page truncated.
 */
unsigned long zswap_total_pages(void)
{
	struct zswap_pool *cur;
	u64 bytes = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(cur, &zswap_pools, list) {
		int idx = 0;

		while (idx < ZSWAP_NR_ZPOOLS) {
			bytes += zpool_get_total_size(cur->zpools[idx]);
			idx++;
		}
	}
	rcu_read_unlock();

	return bytes >> PAGE_SHIFT;
}
/********************************* /*********************************
* param callbacks * param callbacks
**********************************/ **********************************/
...@@ -913,7 +899,6 @@ static void zswap_entry_free(struct zswap_entry *entry) ...@@ -913,7 +899,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
} }
zswap_entry_cache_free(entry); zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages); atomic_dec(&zswap_stored_pages);
zswap_update_total_size();
} }
/* /*
...@@ -1344,7 +1329,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, ...@@ -1344,7 +1329,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
} else { } else {
nr_backing = zswap_pool_total_size >> PAGE_SHIFT; nr_backing = zswap_total_pages();
nr_stored = atomic_read(&zswap_nr_stored); nr_stored = atomic_read(&zswap_nr_stored);
} }
...@@ -1412,6 +1397,10 @@ static void shrink_worker(struct work_struct *w) ...@@ -1412,6 +1397,10 @@ static void shrink_worker(struct work_struct *w)
{ {
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
int ret, failures = 0; int ret, failures = 0;
unsigned long thr;
/* Reclaim down to the accept threshold */
thr = zswap_accept_thr_pages();
/* global reclaim will select cgroup in a round-robin fashion. */ /* global reclaim will select cgroup in a round-robin fashion. */
do { do {
...@@ -1459,10 +1448,9 @@ static void shrink_worker(struct work_struct *w) ...@@ -1459,10 +1448,9 @@ static void shrink_worker(struct work_struct *w)
break; break;
if (ret && ++failures == MAX_RECLAIM_RETRIES) if (ret && ++failures == MAX_RECLAIM_RETRIES)
break; break;
resched: resched:
cond_resched(); cond_resched();
} while (!zswap_can_accept()); } while (zswap_total_pages() > thr);
} }
static int zswap_is_page_same_filled(void *ptr, unsigned long *value) static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
...@@ -1503,6 +1491,7 @@ bool zswap_store(struct folio *folio) ...@@ -1503,6 +1491,7 @@ bool zswap_store(struct folio *folio)
struct zswap_entry *entry, *dupentry; struct zswap_entry *entry, *dupentry;
struct obj_cgroup *objcg = NULL; struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL; struct mem_cgroup *memcg = NULL;
unsigned long max_pages, cur_pages;
VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
...@@ -1514,6 +1503,7 @@ bool zswap_store(struct folio *folio) ...@@ -1514,6 +1503,7 @@ bool zswap_store(struct folio *folio)
if (!zswap_enabled) if (!zswap_enabled)
goto check_old; goto check_old;
/* Check cgroup limits */
objcg = get_obj_cgroup_from_folio(folio); objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) { if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg); memcg = get_mem_cgroup_from_objcg(objcg);
...@@ -1524,15 +1514,18 @@ bool zswap_store(struct folio *folio) ...@@ -1524,15 +1514,18 @@ bool zswap_store(struct folio *folio)
mem_cgroup_put(memcg); mem_cgroup_put(memcg);
} }
/* reclaim space if needed */ /* Check global limits */
if (zswap_is_full()) { cur_pages = zswap_total_pages();
max_pages = zswap_max_pages();
if (cur_pages >= max_pages) {
zswap_pool_limit_hit++; zswap_pool_limit_hit++;
zswap_pool_reached_full = true; zswap_pool_reached_full = true;
goto shrink; goto shrink;
} }
if (zswap_pool_reached_full) { if (zswap_pool_reached_full) {
if (!zswap_can_accept()) if (cur_pages > zswap_accept_thr_pages())
goto shrink; goto shrink;
else else
zswap_pool_reached_full = false; zswap_pool_reached_full = false;
...@@ -1608,7 +1601,6 @@ bool zswap_store(struct folio *folio) ...@@ -1608,7 +1601,6 @@ bool zswap_store(struct folio *folio)
/* update stats */ /* update stats */
atomic_inc(&zswap_stored_pages); atomic_inc(&zswap_stored_pages);
zswap_update_total_size();
count_vm_event(ZSWPOUT); count_vm_event(ZSWPOUT);
return true; return true;
...@@ -1752,6 +1744,13 @@ void zswap_swapoff(int type) ...@@ -1752,6 +1744,13 @@ void zswap_swapoff(int type)
static struct dentry *zswap_debugfs_root; static struct dentry *zswap_debugfs_root;
/*
 * debugfs read handler backing the "pool_total_size" file: reports the
 * current compressed pool size in bytes.
 *
 * @data: unused
 * @val:  receives the byte count
 *
 * Always returns 0.
 */
static int debugfs_get_total_size(void *data, u64 *val)
{
	/*
	 * zswap_total_pages() returns unsigned long. Widen to u64 before
	 * scaling by PAGE_SIZE so the byte count cannot wrap on builds where
	 * unsigned long is narrower than u64.
	 */
	*val = (u64)zswap_total_pages() * PAGE_SIZE;
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");
static int zswap_debugfs_init(void) static int zswap_debugfs_init(void)
{ {
if (!debugfs_initialized()) if (!debugfs_initialized())
...@@ -1773,8 +1772,8 @@ static int zswap_debugfs_init(void) ...@@ -1773,8 +1772,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_poor); zswap_debugfs_root, &zswap_reject_compress_poor);
debugfs_create_u64("written_back_pages", 0444, debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages); zswap_debugfs_root, &zswap_written_back_pages);
debugfs_create_u64("pool_total_size", 0444, debugfs_create_file("pool_total_size", 0444,
zswap_debugfs_root, &zswap_pool_total_size); zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_atomic_t("stored_pages", 0444, debugfs_create_atomic_t("stored_pages", 0444,
zswap_debugfs_root, &zswap_stored_pages); zswap_debugfs_root, &zswap_stored_pages);
debugfs_create_atomic_t("same_filled_pages", 0444, debugfs_create_atomic_t("same_filled_pages", 0444,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment