Commit dd2c0198 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'erofs-for-6.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs

Pull erofs updates from Gao Xiang:
 "In this cycle, a xattr bloom filter feature is introduced to speed up
  negative xattr lookups, which was originally suggested by Alexander
  for Composefs use cases.

  Additionally, the DEFLATE algorithm is now supported, which can be
  used together with hardware accelerators for our cloud workloads. Each
  supported compression algorithm can be selected on a per-file basis
  for specific access patterns too.

  There are also some random fixes and cleanups as usual:

   - Support xattr bloom filter to optimize negative xattr lookups

   - Support DEFLATE compression algorithm as an alternative

   - Fix a regression that ztailpacking pclusters don't release properly

   - Avoid warning dedupe and fragments features anymore

   - Some folio conversions and cleanups"

* tag 'erofs-for-6.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
  erofs: release ztailpacking pclusters properly
  erofs: don't warn dedupe and fragments features anymore
  erofs: adapt folios for z_erofs_read_folio()
  erofs: adapt folios for z_erofs_readahead()
  erofs: get rid of fe->backmost for cache decompression
  erofs: drop z_erofs_page_mark_eio()
  erofs: tidy up z_erofs_do_read_page()
  erofs: move preparation logic into z_erofs_pcluster_begin()
  erofs: avoid obsolete {collector,collection} terms
  erofs: simplify z_erofs_read_fragment()
  erofs: remove redundant erofs_fs_type declaration in super.c
  erofs: add necessary kmem_cache_create flags for erofs inode cache
  erofs: clean up redundant comment and adjust code alignment
  erofs: refine warning messages for zdata I/Os
  erofs: boost negative xattr lookup with bloom filter
  erofs: update on-disk format for xattr name filter
  erofs: DEFLATE compression support
parents f20ae9cf 91b1ad08
......@@ -38,6 +38,7 @@ config EROFS_FS_DEBUG
config EROFS_FS_XATTR
bool "EROFS extended attributes"
depends on EROFS_FS
select XXHASH
default y
help
Extended attributes are name:value pairs associated with inodes by
......@@ -99,6 +100,21 @@ config EROFS_FS_ZIP_LZMA
If unsure, say N.
config EROFS_FS_ZIP_DEFLATE
bool "EROFS DEFLATE compressed data support"
depends on EROFS_FS_ZIP
select ZLIB_INFLATE
help
Saying Y here includes support for reading EROFS file systems
containing DEFLATE compressed data. It gives better compression
ratios than the default LZ4 format, while it costs more CPU
overhead.
DEFLATE support is an experimental feature for now and so most
file systems will be readable without selecting this option.
If unsure, say N.
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support"
depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
......
......@@ -5,4 +5,5 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
......@@ -94,4 +94,6 @@ extern const struct z_erofs_decompressor erofs_decompressors[];
/* prototypes for specific algorithms */
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool);
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool);
#endif
......@@ -379,4 +379,10 @@ const struct z_erofs_decompressor erofs_decompressors[] = {
.name = "lzma"
},
#endif
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
[Z_EROFS_COMPRESSION_DEFLATE] = {
.decompress = z_erofs_deflate_decompress,
.name = "deflate"
},
#endif
};
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/module.h>
#include <linux/zlib.h>
#include "compress.h"
struct z_erofs_deflate {
struct z_erofs_deflate *next;
struct z_stream_s z;
u8 bounce[PAGE_SIZE];
};
static DEFINE_SPINLOCK(z_erofs_deflate_lock);
static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms;
static struct z_erofs_deflate *z_erofs_deflate_head;
static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq);
module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444);
void z_erofs_deflate_exit(void)
{
/* there should be no running fs instance */
while (z_erofs_deflate_avail_strms) {
struct z_erofs_deflate *strm;
spin_lock(&z_erofs_deflate_lock);
strm = z_erofs_deflate_head;
if (!strm) {
spin_unlock(&z_erofs_deflate_lock);
continue;
}
z_erofs_deflate_head = NULL;
spin_unlock(&z_erofs_deflate_lock);
while (strm) {
struct z_erofs_deflate *n = strm->next;
vfree(strm->z.workspace);
kfree(strm);
--z_erofs_deflate_avail_strms;
strm = n;
}
}
}
int __init z_erofs_deflate_init(void)
{
/* by default, use # of possible CPUs instead */
if (!z_erofs_deflate_nstrms)
z_erofs_deflate_nstrms = num_possible_cpus();
for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
++z_erofs_deflate_avail_strms) {
struct z_erofs_deflate *strm;
strm = kzalloc(sizeof(*strm), GFP_KERNEL);
if (!strm)
goto out_failed;
/* XXX: in-kernel zlib cannot shrink windowbits currently */
strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
if (!strm->z.workspace) {
kfree(strm);
goto out_failed;
}
spin_lock(&z_erofs_deflate_lock);
strm->next = z_erofs_deflate_head;
z_erofs_deflate_head = strm;
spin_unlock(&z_erofs_deflate_lock);
}
return 0;
out_failed:
pr_err("failed to allocate zlib workspace\n");
z_erofs_deflate_exit();
return -ENOMEM;
}
int z_erofs_load_deflate_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_deflate_cfgs *dfl, int size)
{
if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
erofs_err(sb, "invalid deflate cfgs, size=%u", size);
return -EINVAL;
}
if (dfl->windowbits > MAX_WBITS) {
erofs_err(sb, "unsupported windowbits %u", dfl->windowbits);
return -EOPNOTSUPP;
}
erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!");
return 0;
}
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
const unsigned int nrpages_in =
PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
struct super_block *sb = rq->sb;
unsigned int insz, outsz, pofs;
struct z_erofs_deflate *strm;
u8 *kin, *kout = NULL;
bool bounced = false;
int no = -1, ni = 0, j = 0, zerr, err;
/* 1. get the exact DEFLATE compressed size */
kin = kmap_local_page(*rq->in);
err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
min_t(unsigned int, rq->inputsize,
sb->s_blocksize - rq->pageofs_in));
if (err) {
kunmap_local(kin);
return err;
}
/* 2. get an available DEFLATE context */
again:
spin_lock(&z_erofs_deflate_lock);
strm = z_erofs_deflate_head;
if (!strm) {
spin_unlock(&z_erofs_deflate_lock);
wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head));
goto again;
}
z_erofs_deflate_head = strm->next;
spin_unlock(&z_erofs_deflate_lock);
/* 3. multi-call decompress */
insz = rq->inputsize;
outsz = rq->outputsize;
zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
if (zerr != Z_OK) {
err = -EIO;
goto failed_zinit;
}
pofs = rq->pageofs_out;
strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in);
insz -= strm->z.avail_in;
strm->z.next_in = kin + rq->pageofs_in;
strm->z.avail_out = 0;
while (1) {
if (!strm->z.avail_out) {
if (++no >= nrpages_out || !outsz) {
erofs_err(sb, "insufficient space for decompressed data");
err = -EFSCORRUPTED;
break;
}
if (kout)
kunmap_local(kout);
strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
outsz -= strm->z.avail_out;
if (!rq->out[no]) {
rq->out[no] = erofs_allocpage(pagepool,
GFP_KERNEL | __GFP_NOFAIL);
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
kout = kmap_local_page(rq->out[no]);
strm->z.next_out = kout + pofs;
pofs = 0;
}
if (!strm->z.avail_in && insz) {
if (++ni >= nrpages_in) {
erofs_err(sb, "invalid compressed data");
err = -EFSCORRUPTED;
break;
}
if (kout) { /* unlike kmap(), take care of the orders */
j = strm->z.next_out - kout;
kunmap_local(kout);
}
kunmap_local(kin);
strm->z.avail_in = min_t(u32, insz, PAGE_SIZE);
insz -= strm->z.avail_in;
kin = kmap_local_page(rq->in[ni]);
strm->z.next_in = kin;
bounced = false;
if (kout) {
kout = kmap_local_page(rq->out[no]);
strm->z.next_out = kout + j;
}
}
/*
* Handle overlapping: Use bounced buffer if the compressed
* data is under processing; Or use short-lived pages from the
* on-stack pagepool where pages share among the same request
* and not _all_ inplace I/O pages are needed to be doubled.
*/
if (!bounced && rq->out[no] == rq->in[ni]) {
memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in);
strm->z.next_in = strm->bounce;
bounced = true;
}
for (j = ni + 1; j < nrpages_in; ++j) {
struct page *tmppage;
if (rq->out[no] != rq->in[j])
continue;
DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
rq->in[j]));
tmppage = erofs_allocpage(pagepool,
GFP_KERNEL | __GFP_NOFAIL);
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
}
zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH);
if (zerr != Z_OK || !(outsz + strm->z.avail_out)) {
if (zerr == Z_OK && rq->partial_decoding)
break;
if (zerr == Z_STREAM_END && !outsz)
break;
erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
zerr, rq->inputsize, rq->outputsize);
err = -EFSCORRUPTED;
break;
}
}
if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
err = -EIO;
if (kout)
kunmap_local(kout);
failed_zinit:
kunmap_local(kin);
/* 4. push back DEFLATE stream context to the global list */
spin_lock(&z_erofs_deflate_lock);
strm->next = z_erofs_deflate_head;
z_erofs_deflate_head = strm;
spin_unlock(&z_erofs_deflate_lock);
wake_up(&z_erofs_deflate_wq);
return err;
}
......@@ -13,6 +13,7 @@
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
......@@ -81,7 +82,8 @@ struct erofs_super_block {
__u8 xattr_prefix_count; /* # of long xattr name prefixes */
__le32 xattr_prefix_start; /* start of long xattr prefixes */
__le64 packed_nid; /* nid of the special packed inode */
__u8 reserved2[24];
__u8 xattr_filter_reserved; /* reserved for xattr name filter */
__u8 reserved2[23];
};
/*
......@@ -200,7 +202,7 @@ struct erofs_inode_extended {
* for read-only fs, no need to introduce h_refcount
*/
struct erofs_xattr_ibody_header {
__le32 h_reserved;
__le32 h_name_filter; /* bit value 1 indicates not-present */
__u8 h_shared_count;
__u8 h_reserved2[7];
__le32 h_shared_xattrs[]; /* shared xattr id array */
......@@ -221,6 +223,10 @@ struct erofs_xattr_ibody_header {
#define EROFS_XATTR_LONG_PREFIX 0x80
#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f
#define EROFS_XATTR_FILTER_BITS 32
#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX
#define EROFS_XATTR_FILTER_SEED 0x25BBE08F
/* xattr entry (for both inline & shared xattrs) */
struct erofs_xattr_entry {
__u8 e_name_len; /* length of name */
......@@ -289,6 +295,7 @@ struct erofs_dirent {
enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
Z_EROFS_COMPRESSION_LZMA = 1,
Z_EROFS_COMPRESSION_DEFLATE = 2,
Z_EROFS_COMPRESSION_MAX
};
#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
......@@ -309,6 +316,12 @@ struct z_erofs_lzma_cfgs {
#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
/* 6 bytes (+ length field = 8 bytes) */
struct z_erofs_deflate_cfgs {
u8 windowbits; /* 8..15 for DEFLATE */
u8 reserved[5];
} __packed;
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
......
......@@ -151,6 +151,7 @@ struct erofs_sb_info {
u32 xattr_prefix_start;
u8 xattr_prefix_count;
struct erofs_xattr_prefix_item *xattr_prefixes;
unsigned int xattr_filter_reserved;
#endif
u16 device_id_mask; /* valid bits of device id to be used */
......@@ -251,6 +252,7 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
......@@ -270,6 +272,7 @@ struct erofs_inode {
unsigned char inode_isize;
unsigned int xattr_isize;
unsigned int xattr_name_filter;
unsigned int xattr_shared_count;
unsigned int *xattr_shared_xattrs;
......@@ -519,6 +522,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
int __init z_erofs_deflate_init(void);
void z_erofs_deflate_exit(void);
int z_erofs_load_deflate_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_deflate_cfgs *dfl, int size);
#else
static inline int z_erofs_deflate_init(void) { return 0; }
static inline int z_erofs_deflate_exit(void) { return 0; }
static inline int z_erofs_load_deflate_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_deflate_cfgs *dfl, int size) {
if (dfl) {
erofs_err(sb, "deflate algorithm isn't enabled");
return -EINVAL;
}
return 0;
}
#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
......
......@@ -19,10 +19,8 @@
#include <trace/events/erofs.h>
static struct kmem_cache *erofs_inode_cachep __read_mostly;
struct file_system_type erofs_fs_type;
void _erofs_err(struct super_block *sb, const char *function,
const char *fmt, ...)
void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
......@@ -32,12 +30,11 @@ void _erofs_err(struct super_block *sb, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf);
pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
va_end(args);
}
void _erofs_info(struct super_block *sb, const char *function,
const char *fmt, ...)
void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
......@@ -102,11 +99,9 @@ static void erofs_free_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
/* be careful of RCU symlink path */
if (inode->i_op == &erofs_fast_symlink_iops)
kfree(inode->i_link);
kfree(vi->xattr_shared_xattrs);
kmem_cache_free(erofs_inode_cachep, vi);
}
......@@ -119,8 +114,7 @@ static bool check_layout_compatibility(struct super_block *sb,
/* check if current kernel meets all mandatory requirements */
if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
erofs_err(sb,
"unidentified incompatible feature %x, please upgrade kernel version",
erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
feature & ~EROFS_ALL_FEATURE_INCOMPAT);
return false;
}
......@@ -201,6 +195,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
case Z_EROFS_COMPRESSION_LZMA:
ret = z_erofs_load_lzma_config(sb, dsb, data, size);
break;
case Z_EROFS_COMPRESSION_DEFLATE:
ret = z_erofs_load_deflate_config(sb, dsb, data, size);
break;
default:
DBG_BUGON(1);
ret = -EFAULT;
......@@ -388,6 +385,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start);
sbi->xattr_prefix_count = dsb->xattr_prefix_count;
sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
sbi->root_nid = le16_to_cpu(dsb->root_nid);
......@@ -420,16 +418,11 @@ static int erofs_read_superblock(struct super_block *sb)
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
if (erofs_sb_has_fragments(sbi))
erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
if (erofs_sb_has_dedupe(sbi))
erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
}
/* set up default EROFS parameters */
static void erofs_default_options(struct erofs_fs_context *ctx)
{
#ifdef CONFIG_EROFS_FS_ZIP
......@@ -731,7 +724,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
xa_init(&sbi->managed_pslots);
#endif
/* get the root inode */
inode = erofs_iget(sb, ROOT_NID(sbi));
if (IS_ERR(inode))
return PTR_ERR(inode);
......@@ -748,7 +740,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
erofs_shrinker_register(sb);
/* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */
if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
sbi->packed_inode = erofs_iget(sb, sbi->packed_nid);
if (IS_ERR(sbi->packed_inode)) {
......@@ -881,10 +872,6 @@ static int erofs_init_fs_context(struct fs_context *fc)
return 0;
}
/*
* could be triggered after deactivate_locked_super()
* is called, thus including umount and failed to initialize.
*/
static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi;
......@@ -913,7 +900,6 @@ static void erofs_kill_sb(struct super_block *sb)
sb->s_fs_info = NULL;
}
/* called when ->s_root is non-NULL */
static void erofs_put_super(struct super_block *sb)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
......@@ -951,7 +937,7 @@ static int __init erofs_module_init(void)
erofs_inode_cachep = kmem_cache_create("erofs_inode",
sizeof(struct erofs_inode), 0,
SLAB_RECLAIM_ACCOUNT,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
erofs_inode_init_once);
if (!erofs_inode_cachep)
return -ENOMEM;
......@@ -964,6 +950,10 @@ static int __init erofs_module_init(void)
if (err)
goto lzma_err;
err = z_erofs_deflate_init();
if (err)
goto deflate_err;
erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
......@@ -984,6 +974,8 @@ static int __init erofs_module_init(void)
sysfs_err:
z_erofs_exit_zip_subsystem();
zip_err:
z_erofs_deflate_exit();
deflate_err:
z_erofs_lzma_exit();
lzma_err:
erofs_exit_shrinker();
......@@ -1001,13 +993,13 @@ static void __exit erofs_module_exit(void)
erofs_exit_sysfs();
z_erofs_exit_zip_subsystem();
z_erofs_deflate_exit();
z_erofs_lzma_exit();
erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
erofs_pcpubuf_exit();
}
/* get filesystem statistics */
static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
......
......@@ -5,6 +5,7 @@
* Copyright (C) 2021-2022, Alibaba Cloud
*/
#include <linux/security.h>
#include <linux/xxhash.h>
#include "xattr.h"
struct erofs_xattr_iter {
......@@ -87,6 +88,7 @@ static int erofs_init_inode_xattrs(struct inode *inode)
}
ih = it.kaddr + erofs_blkoff(sb, it.pos);
vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
sizeof(uint), GFP_KERNEL);
......@@ -392,7 +394,10 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
void *buffer, size_t buffer_size)
{
int ret;
unsigned int hashbit;
struct erofs_xattr_iter it;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
if (!name)
return -EINVAL;
......@@ -401,6 +406,15 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
if (ret)
return ret;
/* reserved flag is non-zero if there's any change of on-disk format */
if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) {
hashbit = xxh32(name, strlen(name),
EROFS_XATTR_FILTER_SEED + index);
hashbit &= EROFS_XATTR_FILTER_BITS - 1;
if (vi->xattr_name_filter & (1U << hashbit))
return -ENOATTR;
}
it.index = index;
it.name = (struct qstr)QSTR_INIT(name, strlen(name));
if (it.name.len > EROFS_NAME_LEN)
......
......@@ -143,22 +143,17 @@ static inline void z_erofs_onlinepage_split(struct page *page)
atomic_inc((atomic_t *)&page->private);
}
static inline void z_erofs_page_mark_eio(struct page *page)
static void z_erofs_onlinepage_endio(struct page *page, int err)
{
int orig;
int orig, v;
DBG_BUGON(!PagePrivate(page));
do {
orig = atomic_read((atomic_t *)&page->private);
} while (atomic_cmpxchg((atomic_t *)&page->private, orig,
orig | Z_EROFS_PAGE_EIO) != orig);
}
static inline void z_erofs_onlinepage_endio(struct page *page)
{
unsigned int v;
v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
} while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
DBG_BUGON(!PagePrivate(page));
v = atomic_dec_return((atomic_t *)&page->private);
if (!(v & ~Z_EROFS_PAGE_EIO)) {
set_page_private(page, 0);
ClearPagePrivate(page);
......@@ -507,19 +502,17 @@ enum z_erofs_pclustermode {
*/
Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
* The current collection has been linked with the owned chain, and
* could also be linked with the remaining collections, which means
* if the processing page is the tail page of the collection, thus
* the current collection can safely use the whole page (since
* the previous collection is under control) for in-place I/O, as
* illustrated below:
* ________________________________________________________________
* The pcluster was just linked to a decompression chain by us. It can
* also be linked with the remaining pclusters, which means if the
* processing page is the tail page of a pcluster, this pcluster can
* safely use the whole page (since the previous pcluster is within the
* same chain) for in-place I/O, as illustrated below:
* ___________________________________________________
* | tail (partial) page | head (partial) page |
* | (of the current cl) | (of the previous collection) |
* | | |
* |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
* | (of the current pcl) | (of the previous pcl) |
* |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
*
* [ (*) the above page can be used as inplace I/O. ]
* [ (*) the page above can be used as inplace I/O. ]
*/
Z_EROFS_PCLUSTER_FOLLOWED,
};
......@@ -535,8 +528,6 @@ struct z_erofs_decompress_frontend {
z_erofs_next_pcluster_t owned_head;
enum z_erofs_pclustermode mode;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
/* a pointer used to pick up inplace I/O pages */
......@@ -545,7 +536,7 @@ struct z_erofs_decompress_frontend {
#define DECOMPRESS_FRONTEND_INIT(__i) { \
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
.mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
.mode = Z_EROFS_PCLUSTER_FOLLOWED }
static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
{
......@@ -554,7 +545,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
return false;
if (fe->backmost)
if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
return true;
if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
......@@ -851,9 +842,11 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
return err;
}
static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
struct erofs_workgroup *grp = NULL;
int ret;
......@@ -863,8 +856,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
if (!(map->m_flags & EROFS_MAP_META)) {
grp = erofs_find_workgroup(fe->inode->i_sb,
map->m_pa >> PAGE_SHIFT);
grp = erofs_find_workgroup(sb, blknr);
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
......@@ -883,9 +875,26 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
} else if (ret) {
return ret;
}
z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
/* since file-backed online pages are traversed in reverse order */
if (!z_erofs_is_inline_pcluster(fe->pcl)) {
/* bind cache first when cached decompression is preferred */
z_erofs_bind_cache(fe);
} else {
void *mptr;
mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
if (IS_ERR(mptr)) {
ret = PTR_ERR(mptr);
erofs_err(sb, "failed to get inline data %d", ret);
return ret;
}
get_page(map->buf.page);
WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
/* file-backed inplace I/O pages are traversed in reverse order */
fe->icur = z_erofs_pclusterpages(fe->pcl);
return 0;
}
......@@ -908,12 +917,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
{
struct z_erofs_pcluster *pcl = fe->pcl;
if (!pcl)
return false;
return;
z_erofs_bvec_iter_end(&fe->biter);
mutex_unlock(&pcl->lock);
......@@ -929,37 +938,29 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
erofs_workgroup_put(&pcl->obj);
fe->pcl = NULL;
return true;
}
static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
struct page *page, unsigned int pageofs,
unsigned int len)
static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
unsigned int cur, unsigned int end, erofs_off_t pos)
{
struct super_block *sb = inode->i_sb;
struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
u8 *src, *dst;
unsigned int i, cnt;
unsigned int cnt;
u8 *src;
if (!packed_inode)
return -EFSCORRUPTED;
buf.inode = packed_inode;
pos += EROFS_I(inode)->z_fragmentoff;
for (i = 0; i < len; i += cnt) {
cnt = min_t(unsigned int, len - i,
for (; cur < end; cur += cnt, pos += cnt) {
cnt = min_t(unsigned int, end - cur,
sb->s_blocksize - erofs_blkoff(sb, pos));
src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
dst = kmap_local_page(page);
memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt);
kunmap_local(dst);
pos += cnt;
memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
}
erofs_put_metabuf(&buf);
return 0;
......@@ -972,94 +973,60 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
bool tight = true, exclusive;
unsigned int cur, end, spiltted;
unsigned int cur, end, len, split;
int err = 0;
/* register locked file pages as online pages in pack */
z_erofs_onlinepage_init(page);
spiltted = 0;
split = 0;
end = PAGE_SIZE;
repeat:
cur = end - 1;
if (offset + cur < map->m_la ||
offset + cur >= map->m_la + map->m_llen) {
if (z_erofs_collector_end(fe))
fe->backmost = false;
map->m_la = offset + cur;
if (offset + end - 1 < map->m_la ||
offset + end - 1 >= map->m_la + map->m_llen) {
z_erofs_pcluster_end(fe);
map->m_la = offset + end - 1;
map->m_llen = 0;
err = z_erofs_map_blocks_iter(inode, map, 0);
if (err)
goto out;
} else {
if (fe->pcl)
goto hitted;
/* didn't get a valid pcluster previously (very rare) */
}
if (!(map->m_flags & EROFS_MAP_MAPPED) ||
map->m_flags & EROFS_MAP_FRAGMENT)
goto hitted;
cur = offset > map->m_la ? 0 : map->m_la - offset;
/* bump split parts first to avoid several separate cases */
++split;
err = z_erofs_collector_begin(fe);
if (err)
goto out;
if (z_erofs_is_inline_pcluster(fe->pcl)) {
void *mp;
mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
erofs_blknr(inode->i_sb, map->m_pa),
EROFS_NO_KMAP);
if (IS_ERR(mp)) {
err = PTR_ERR(mp);
erofs_err(inode->i_sb,
"failed to get inline page, err %d", err);
goto out;
}
get_page(fe->map.buf.page);
WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
fe->map.buf.page);
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
z_erofs_bind_cache(fe);
}
hitted:
/*
* Ensure the current partial page belongs to this submit chain rather
* than other concurrent submit chains or the noio(bypass) chain since
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or bvpage (should be processed in a strict order.)
*/
tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
cur = end - min_t(erofs_off_t, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
tight = false;
goto next_part;
}
if (map->m_flags & EROFS_MAP_FRAGMENT) {
unsigned int pageofs, skip, len;
erofs_off_t fpos = offset + cur - map->m_la;
if (offset > map->m_la) {
pageofs = 0;
skip = offset - map->m_la;
} else {
pageofs = map->m_la & ~PAGE_MASK;
skip = 0;
}
len = min_t(unsigned int, map->m_llen - skip, end - cur);
err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
len = min_t(unsigned int, map->m_llen - fpos, end - cur);
err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
EROFS_I(inode)->z_fragmentoff + fpos);
if (err)
goto out;
++spiltted;
tight = false;
goto next_part;
}
exclusive = (!cur && (!spiltted || tight));
if (!fe->pcl) {
err = z_erofs_pcluster_begin(fe);
if (err)
goto out;
}
/*
* Ensure the current partial page belongs to this submit chain rather
* than other concurrent submit chains or the noio(bypass) chain since
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or bvpage (should be processed in a strict order.)
*/
tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
exclusive = (!cur && ((split <= 1) || tight));
if (cur)
tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
......@@ -1072,8 +1039,6 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
goto out;
z_erofs_onlinepage_split(page);
/* bump up the number of spiltted parts of a page */
++spiltted;
if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
fe->pcl->multibases = true;
if (fe->pcl->length < offset + end - map->m_la) {
......@@ -1094,9 +1059,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
goto repeat;
out:
if (err)
z_erofs_page_mark_eio(page);
z_erofs_onlinepage_endio(page);
z_erofs_onlinepage_endio(page, err);
return err;
}
......@@ -1199,9 +1162,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
if (err)
z_erofs_page_mark_eio(bvi->bvec.page);
z_erofs_onlinepage_endio(bvi->bvec.page);
z_erofs_onlinepage_endio(bvi->bvec.page, err);
list_del(p);
kfree(bvi);
}
......@@ -1372,9 +1333,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* recycle all individual short-lived pages */
if (z_erofs_put_shortlivedpage(be->pagepool, page))
continue;
if (err)
z_erofs_page_mark_eio(page);
z_erofs_onlinepage_endio(page);
z_erofs_onlinepage_endio(page, err);
}
if (be->decompressed_pages != be->onstack_pages)
......@@ -1410,6 +1369,9 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
owned = READ_ONCE(be.pcl->next);
z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
if (z_erofs_is_inline_pcluster(be.pcl))
z_erofs_free_pcluster(be.pcl);
else
erofs_workgroup_put(&be.pcl->obj);
}
}
......@@ -1848,15 +1810,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
if (page) {
if (PageUptodate(page)) {
if (PageUptodate(page))
unlock_page(page);
} else {
err = z_erofs_do_read_page(f, page);
if (err)
erofs_err(inode->i_sb,
"readmore error at page %lu @ nid %llu",
index, EROFS_I(inode)->nid);
}
else
(void)z_erofs_do_read_page(f, page);
put_page(page);
}
......@@ -1868,25 +1825,25 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
struct page *page = &folio->page;
struct inode *const inode = page->mapping->host;
struct inode *const inode = folio->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
int err;
trace_erofs_readpage(page, false);
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
trace_erofs_read_folio(folio, false);
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
err = z_erofs_do_read_page(&f, page);
err = z_erofs_do_read_page(&f, &folio->page);
z_erofs_pcluster_readmore(&f, NULL, false);
(void)z_erofs_collector_end(&f);
z_erofs_pcluster_end(&f);
/* if some compressed cluster ready, need submit them anyway */
z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
err, folio->index, EROFS_I(inode)->nid);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
......@@ -1898,38 +1855,35 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct page *head = NULL, *page;
unsigned int nr_pages;
struct folio *head = NULL, *folio;
unsigned int nr_folios;
int err;
f.headoffset = readahead_pos(rac);
z_erofs_pcluster_readmore(&f, rac, true);
nr_pages = readahead_count(rac);
trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
nr_folios = readahead_count(rac);
trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
while ((page = readahead_page(rac))) {
set_page_private(page, (unsigned long)head);
head = page;
while ((folio = readahead_folio(rac))) {
folio->private = head;
head = folio;
}
/* traverse in reverse order for best metadata I/O performance */
while (head) {
struct page *page = head;
int err;
/* traversal in reverse order */
head = (void *)page_private(page);
folio = head;
head = folio_get_private(folio);
err = z_erofs_do_read_page(&f, page);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
page->index, EROFS_I(inode)->nid);
put_page(page);
err = z_erofs_do_read_page(&f, &folio->page);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
folio->index, EROFS_I(inode)->nid);
}
z_erofs_pcluster_readmore(&f, rac, false);
(void)z_erofs_collector_end(&f);
z_erofs_pcluster_end(&f);
z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true);
z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
}
......
......@@ -561,7 +561,8 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
(map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA ||
map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) &&
map->m_llen >= i_blocksize(inode))) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
......
......@@ -80,11 +80,11 @@ TRACE_EVENT(erofs_fill_inode,
__entry->blkaddr, __entry->ofs)
);
TRACE_EVENT(erofs_readpage,
TRACE_EVENT(erofs_read_folio,
TP_PROTO(struct page *page, bool raw),
TP_PROTO(struct folio *folio, bool raw),
TP_ARGS(page, raw),
TP_ARGS(folio, raw),
TP_STRUCT__entry(
__field(dev_t, dev )
......@@ -96,11 +96,11 @@ TRACE_EVENT(erofs_readpage,
),
TP_fast_assign(
__entry->dev = page->mapping->host->i_sb->s_dev;
__entry->nid = EROFS_I(page->mapping->host)->nid;
__entry->dir = S_ISDIR(page->mapping->host->i_mode);
__entry->index = page->index;
__entry->uptodate = PageUptodate(page);
__entry->dev = folio->mapping->host->i_sb->s_dev;
__entry->nid = EROFS_I(folio->mapping->host)->nid;
__entry->dir = S_ISDIR(folio->mapping->host->i_mode);
__entry->index = folio->index;
__entry->uptodate = folio_test_uptodate(folio);
__entry->raw = raw;
),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment