Commit f36f3010 authored by Chunhai Guo, committed by Gao Xiang

erofs: rename per-CPU buffers to global buffer pool and make it configurable

Allocating compressed buffers on demand costs extra time for low-latency
algorithms (such as lz4), so EROFS keeps per-CPU buffers to hold compressed
data whenever in-place decompression cannot be used.  However, this wastes
memory on devices with hundreds of CPUs, since only a small number of CPUs
usually decompress concurrently.

This patch renames the per-CPU buffers to a 'global buffer pool' and makes
the number of buffers configurable, so that two or more CPUs can share a
common buffer to reduce memory occupation.
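
Note that the pool size is exposed through module_param_named(global_buffers, ...)
in the last hunk below as a read-only (0444) parameter; assuming the usual
kernel module-parameter conventions, it can therefore only be chosen at load
time (e.g. modprobe erofs global_buffers=4, or erofs.global_buffers=4 on the
kernel command line when EROFS is built in).  z_erofs_gbuf_init() clamps the
value to the number of possible CPUs and keeps one buffer per possible CPU
when the parameter is left at 0.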
Suggested-by: Gao Xiang <xiang@kernel.org>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Signed-off-by: Chunhai Guo <guochunhai@vivo.com>
Link: https://lore.kernel.org/r/20240402100036.2673604-1-guochunhai@vivo.com
Signed-off-by: Sandeep Dhavale <dhavale@google.com>
Link: https://lore.kernel.org/r/20240408215231.3376659-1-dhavale@google.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
parent cacd5b04
@@ -3,7 +3,7 @@
 obj-$(CONFIG_EROFS_FS) += erofs.o
 erofs-objs := super.o inode.o data.o namei.o dir.o sysfs.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o zutil.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
 erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
 erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
 erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
@@ -54,7 +54,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
 	sbi->lz4.max_distance_pages = distance ?
 					DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
 					LZ4_MAX_DISTANCE_PAGES;
-	return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+	return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks);
 }
 
 /*
@@ -159,7 +159,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
 docopy:
 	/* Or copy compressed data which can be overlapped to per-CPU buffer */
 	in = rq->in;
-	src = erofs_get_pcpubuf(ctx->inpages);
+	src = z_erofs_get_gbuf(ctx->inpages);
 	if (!src) {
 		DBG_BUGON(1);
 		kunmap_local(inpage);
@@ -260,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
 	} else if (maptype == 1) {
 		vm_unmap_ram(src, ctx->inpages);
 	} else if (maptype == 2) {
-		erofs_put_pcpubuf(src);
+		z_erofs_put_gbuf(src);
 	} else if (maptype != 3) {
 		DBG_BUGON(1);
 		return -EFAULT;
...
@@ -463,11 +463,11 @@ int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
 					struct erofs_workgroup *egrp);
 int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
 			    int flags);
-void *erofs_get_pcpubuf(unsigned int requiredpages);
-void erofs_put_pcpubuf(void *ptr);
-int erofs_pcpubuf_growsize(unsigned int nrpages);
-void __init erofs_pcpubuf_init(void);
-void erofs_pcpubuf_exit(void);
+void *z_erofs_get_gbuf(unsigned int requiredpages);
+void z_erofs_put_gbuf(void *ptr);
+int z_erofs_gbuf_growsize(unsigned int nrpages);
+int __init z_erofs_gbuf_init(void);
+void z_erofs_gbuf_exit(void);
 int erofs_init_managed_cache(struct super_block *sb);
 int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
 #else
@@ -477,8 +477,8 @@ static inline int erofs_init_shrinker(void) { return 0; }
 static inline void erofs_exit_shrinker(void) {}
 static inline int z_erofs_init_zip_subsystem(void) { return 0; }
 static inline void z_erofs_exit_zip_subsystem(void) {}
-static inline void erofs_pcpubuf_init(void) {}
-static inline void erofs_pcpubuf_exit(void) {}
+static inline int z_erofs_gbuf_init(void) { return 0; }
+static inline void z_erofs_gbuf_exit(void) {}
 static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
 #endif /* !CONFIG_EROFS_FS_ZIP */
...
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Gao Xiang <xiang@kernel.org>
 *
 * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
 * per-CPU virtual memory (in pages) in advance to store such inplace I/O
 * data if in-place decompression fails (e.g. due to an unmet inplace margin).
 */
#include "internal.h"

struct erofs_pcpubuf {
	raw_spinlock_t lock;
	void *ptr;
	struct page **pages;
	unsigned int nrpages;
};

static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);

void *erofs_get_pcpubuf(unsigned int requiredpages)
	__acquires(pcb->lock)
{
	struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);

	raw_spin_lock(&pcb->lock);
	/* check if the per-CPU buffer is too small */
	if (requiredpages > pcb->nrpages) {
		raw_spin_unlock(&pcb->lock);
		put_cpu_var(erofs_pcb);
		/* (for sparse checker) pretend pcb->lock is still taken */
		__acquire(pcb->lock);
		return NULL;
	}
	return pcb->ptr;
}

void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
{
	struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());

	DBG_BUGON(pcb->ptr != ptr);
	raw_spin_unlock(&pcb->lock);
	put_cpu_var(erofs_pcb);
}

/* the next step: support per-CPU page buffers hotplug */
int erofs_pcpubuf_growsize(unsigned int nrpages)
{
	static DEFINE_MUTEX(pcb_resize_mutex);
	static unsigned int pcb_nrpages;
	struct page *pagepool = NULL;
	int delta, cpu, ret, i;

	mutex_lock(&pcb_resize_mutex);
	delta = nrpages - pcb_nrpages;
	ret = 0;
	/* avoid shrinking pcpubuf, since no idea how many fses rely on */
	if (delta <= 0)
		goto out;

	for_each_possible_cpu(cpu) {
		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
		struct page **pages, **oldpages;
		void *ptr, *old_ptr;

		pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
		if (!pages) {
			ret = -ENOMEM;
			break;
		}

		for (i = 0; i < nrpages; ++i) {
			pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
			if (!pages[i]) {
				ret = -ENOMEM;
				oldpages = pages;
				goto free_pagearray;
			}
		}
		ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
		if (!ptr) {
			ret = -ENOMEM;
			oldpages = pages;
			goto free_pagearray;
		}
		raw_spin_lock(&pcb->lock);
		old_ptr = pcb->ptr;
		pcb->ptr = ptr;
		oldpages = pcb->pages;
		pcb->pages = pages;
		i = pcb->nrpages;
		pcb->nrpages = nrpages;
		raw_spin_unlock(&pcb->lock);

		if (!oldpages) {
			DBG_BUGON(old_ptr);
			continue;
		}

		if (old_ptr)
			vunmap(old_ptr);
free_pagearray:
		while (i)
			erofs_pagepool_add(&pagepool, oldpages[--i]);
		kfree(oldpages);
		if (ret)
			break;
	}
	pcb_nrpages = nrpages;
	erofs_release_pages(&pagepool);
out:
	mutex_unlock(&pcb_resize_mutex);
	return ret;
}

void __init erofs_pcpubuf_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);

		raw_spin_lock_init(&pcb->lock);
	}
}

void erofs_pcpubuf_exit(void)
{
	int cpu, i;

	for_each_possible_cpu(cpu) {
		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);

		if (pcb->ptr) {
			vunmap(pcb->ptr);
			pcb->ptr = NULL;
		}
		if (!pcb->pages)
			continue;

		for (i = 0; i < pcb->nrpages; ++i)
			if (pcb->pages[i])
				put_page(pcb->pages[i]);
		kfree(pcb->pages);
		pcb->pages = NULL;
	}
}
@@ -859,7 +859,10 @@ static int __init erofs_module_init(void)
 	if (err)
 		goto deflate_err;
 
-	erofs_pcpubuf_init();
+	err = z_erofs_gbuf_init();
+	if (err)
+		goto gbuf_err;
+
 	err = z_erofs_init_zip_subsystem();
 	if (err)
 		goto zip_err;
@@ -879,6 +882,8 @@ static int __init erofs_module_init(void)
 sysfs_err:
 	z_erofs_exit_zip_subsystem();
 zip_err:
+	z_erofs_gbuf_exit();
+gbuf_err:
 	z_erofs_deflate_exit();
 deflate_err:
 	z_erofs_lzma_exit();
@@ -902,7 +907,7 @@ static void __exit erofs_module_exit(void)
 	z_erofs_lzma_exit();
 	erofs_exit_shrinker();
 	kmem_cache_destroy(erofs_inode_cachep);
-	erofs_pcpubuf_exit();
+	z_erofs_gbuf_exit();
 }
 
 static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
...
@@ -5,6 +5,18 @@
  */
 #include "internal.h"
 
+struct z_erofs_gbuf {
+	spinlock_t lock;
+	void *ptr;
+	struct page **pages;
+	unsigned int nrpages;
+};
+
+static struct z_erofs_gbuf *z_erofs_gbufpool;
+static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages;
+
+module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
+
 static atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
@@ -14,6 +26,142 @@ static DEFINE_SPINLOCK(erofs_sb_list_lock);
static LIST_HEAD(erofs_sb_list);
static struct shrinker *erofs_shrinker_info;

static unsigned int z_erofs_gbuf_id(void)
{
	return raw_smp_processor_id() % z_erofs_gbuf_count;
}
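
/*
 * Illustrative note (editorial addition, not part of the patch): with
 * z_erofs_gbuf_count == 4 on a 16-CPU machine, CPUs 0/4/8/12 all map to
 * buffer 0, CPUs 1/5/9/13 to buffer 1, and so on, so only four buffers
 * are kept instead of sixteen per-CPU ones.
 */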
void *z_erofs_get_gbuf(unsigned int requiredpages)
	__acquires(gbuf->lock)
{
	struct z_erofs_gbuf *gbuf;

	/*
	 * Pin the task to its current CPU, otherwise it may migrate after
	 * z_erofs_gbuf_id() is sampled and z_erofs_put_gbuf() would then
	 * recompute a different index and unlock the wrong buffer.
	 */
	migrate_disable();
	gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
	spin_lock(&gbuf->lock);
	/* check if the buffer is too small */
	if (requiredpages > gbuf->nrpages) {
		spin_unlock(&gbuf->lock);
		migrate_enable();
		/* (for sparse checker) pretend gbuf->lock is still taken */
		__acquire(gbuf->lock);
		return NULL;
	}
	return gbuf->ptr;
}

void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock)
{
	struct z_erofs_gbuf *gbuf;

	gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
	DBG_BUGON(gbuf->ptr != ptr);
	spin_unlock(&gbuf->lock);
	migrate_enable();
}
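
/*
 * Illustrative caller sketch (editorial addition, not part of the patch),
 * mirroring z_erofs_lz4_handle_overlap() in the decompressor.c hunk above.
 * The gbuf spinlock is held from z_erofs_get_gbuf() until
 * z_erofs_put_gbuf(), so the section in between must not sleep.
 */
static int z_erofs_gbuf_copy_sketch(struct page **in, unsigned int nrpages)
{
	unsigned int i;
	void *dst = z_erofs_get_gbuf(nrpages);

	if (!dst)	/* pool buffers too small; fall back (e.g. to vmap) */
		return -EOPNOTSUPP;
	for (i = 0; i < nrpages; ++i) {
		void *src = kmap_local_page(in[i]);

		memcpy(dst + i * PAGE_SIZE, src, PAGE_SIZE);
		kunmap_local(src);
	}
	/* ... decompress from the linear buffer 'dst' here ... */
	z_erofs_put_gbuf(dst);
	return 0;
}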
int z_erofs_gbuf_growsize(unsigned int nrpages)
{
	static DEFINE_MUTEX(gbuf_resize_mutex);
	struct page *pagepool = NULL;
	int delta, ret, i, j;

	mutex_lock(&gbuf_resize_mutex);
	delta = nrpages - z_erofs_gbuf_nrpages;
	ret = 0;
	/* avoid shrinking gbufs, since no idea how many fses rely on */
	if (delta <= 0)
		goto out;

	for (i = 0; i < z_erofs_gbuf_count; ++i) {
		struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
		struct page **pages, **tmp_pages;
		void *ptr, *old_ptr = NULL;

		ret = -ENOMEM;
		tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL);
		if (!tmp_pages)
			break;
		for (j = 0; j < nrpages; ++j) {
			tmp_pages[j] = erofs_allocpage(&pagepool, GFP_KERNEL);
			if (!tmp_pages[j])
				goto free_pagearray;
		}
		ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
		if (!ptr)
			goto free_pagearray;

		pages = tmp_pages;
		spin_lock(&gbuf->lock);
		old_ptr = gbuf->ptr;
		gbuf->ptr = ptr;
		tmp_pages = gbuf->pages;
		gbuf->pages = pages;
		j = gbuf->nrpages;
		gbuf->nrpages = nrpages;
		spin_unlock(&gbuf->lock);
		ret = 0;
		if (!tmp_pages) {
			DBG_BUGON(old_ptr);
			continue;
		}

		if (old_ptr)
			vunmap(old_ptr);
free_pagearray:
		while (j)
			erofs_pagepool_add(&pagepool, tmp_pages[--j]);
		kfree(tmp_pages);
		if (ret)
			break;
	}
	z_erofs_gbuf_nrpages = nrpages;
	erofs_release_pages(&pagepool);
out:
	mutex_unlock(&gbuf_resize_mutex);
	return ret;
}
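
/*
 * Illustrative note (editorial addition, not part of the patch): growth is
 * one-way by design.  Assuming 4KiB blocks and pages, mounting an image
 * with 16KiB lz4 pclusters grows every gbuf to 4 pages; a later mount with
 * 64KiB pclusters grows them to 16 pages, but unmounting never shrinks
 * them, since other mounted filesystems may still rely on the larger size.
 */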
int __init z_erofs_gbuf_init(void)
{
	unsigned int i = num_possible_cpus();

	if (!z_erofs_gbuf_count)
		z_erofs_gbuf_count = i;
	else
		z_erofs_gbuf_count = min(z_erofs_gbuf_count, i);

	z_erofs_gbufpool = kcalloc(z_erofs_gbuf_count,
				   sizeof(*z_erofs_gbufpool), GFP_KERNEL);
	if (!z_erofs_gbufpool)
		return -ENOMEM;

	for (i = 0; i < z_erofs_gbuf_count; ++i)
		spin_lock_init(&z_erofs_gbufpool[i].lock);
	return 0;
}

void z_erofs_gbuf_exit(void)
{
	int i, j;

	for (i = 0; i < z_erofs_gbuf_count; ++i) {
		struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];

		if (gbuf->ptr) {
			vunmap(gbuf->ptr);
			gbuf->ptr = NULL;
		}
		if (!gbuf->pages)
			continue;

		/*
		 * Use a separate index for the page loop: reusing 'i' here
		 * would corrupt the outer walk over the buffer pool.
		 */
		for (j = 0; j < gbuf->nrpages; ++j)
			if (gbuf->pages[j])
				put_page(gbuf->pages[j]);
		kfree(gbuf->pages);
		gbuf->pages = NULL;
	}
	kfree(z_erofs_gbufpool);
}
struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
{
	struct page *page = *pagepool;
...