Commit 63cf10be authored by Andreas Gruenbacher's avatar Andreas Gruenbacher Committed by Stefan Bader

mbcache: add reusable flag to cache entries

To reduce amount of damage caused by single bad block, we limit number
of inodes sharing an xattr block to 1024. Thus there can be more xattr
blocks with the same contents when there are lots of files with the same
extended attributes. These xattr blocks naturally result in hash
collisions and can form long hash chains and we unnecessarily check each
such block only to find out we cannot use it because it is already
shared by too many inodes.

Add a reusable flag to cache entries which is cleared when a cache entry
has reached its maximum refcount.  Cache entries which are not marked
reusable are skipped by mb_cache_entry_find_{first,next}. This
significantly speeds up mbcache when there are many same xattr blocks.
For example for xattr-bench with 5 values and each process handling
20000 files, the run for 64 processes is 25x faster with this patch.
Even for 8 processes the speedup is almost 3x. We have also verified
that for situations where there is only one xattr block of each kind,
the patch doesn't have a measurable cost.

[JK: Remove handling of setting the same value since it is not needed
anymore, check for races in e_reusable setting, improve changelog,
add measurements]
Signed-off-by: default avatarAndreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: default avatarJan Kara <jack@suse.cz>
Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
(cherry picked from commit 6048c64b)
CVE-2015-8952
Signed-off-by: default avatarThadeu Lima de Souza Cascardo <cascardo@canonical.com>
Acked-by: default avatarStefan Bader <stefan.bader@canonical.com>
Acked-by: default avatarKleber Sacilotto de Souza <kleber.souza@canonical.com>
Signed-off-by: default avatarThadeu Lima de Souza Cascardo <cascardo@canonical.com>
parent ad60cf6a
...@@ -820,7 +820,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) ...@@ -820,7 +820,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
__u32 hash = le32_to_cpu(HDR(bh)->h_hash); __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
int error; int error;
error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
if (error) { if (error) {
if (error == -EBUSY) { if (error == -EBUSY) {
ea_bdebug(bh, "already in cache (%d cache entries)", ea_bdebug(bh, "already in cache (%d cache entries)",
......
...@@ -559,6 +559,8 @@ static void ...@@ -559,6 +559,8 @@ static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_xattr_release_block(handle_t *handle, struct inode *inode,
struct buffer_head *bh) struct buffer_head *bh)
{ {
struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
u32 hash, ref;
int error = 0; int error = 0;
BUFFER_TRACE(bh, "get_write_access"); BUFFER_TRACE(bh, "get_write_access");
...@@ -567,23 +569,33 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ...@@ -567,23 +569,33 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
goto out; goto out;
lock_buffer(bh); lock_buffer(bh);
if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { hash = le32_to_cpu(BHDR(bh)->h_hash);
__u32 hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount);
if (ref == 1) {
ea_bdebug(bh, "refcount now=0; freeing"); ea_bdebug(bh, "refcount now=0; freeing");
/* /*
* This must happen under buffer lock for * This must happen under buffer lock for
* ext4_xattr_block_set() to reliably detect freed block * ext4_xattr_block_set() to reliably detect freed block
*/ */
mb_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
bh->b_blocknr);
get_bh(bh); get_bh(bh);
unlock_buffer(bh); unlock_buffer(bh);
ext4_free_blocks(handle, inode, bh, 0, 1, ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET); EXT4_FREE_BLOCKS_FORGET);
} else { } else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1); ref--;
BHDR(bh)->h_refcount = cpu_to_le32(ref);
if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
struct mb_cache_entry *ce;
ce = mb_cache_entry_get(ext4_mb_cache, hash,
bh->b_blocknr);
if (ce) {
ce->e_reusable = 1;
mb_cache_entry_put(ext4_mb_cache, ce);
}
}
ext4_xattr_block_csum_set(inode, bh); ext4_xattr_block_csum_set(inode, bh);
/* /*
...@@ -887,6 +899,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ...@@ -887,6 +899,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (new_bh == bs->bh) if (new_bh == bs->bh)
ea_bdebug(new_bh, "keeping"); ea_bdebug(new_bh, "keeping");
else { else {
u32 ref;
/* The old block is released after updating /* The old block is released after updating
the inode. */ the inode. */
error = dquot_alloc_block(inode, error = dquot_alloc_block(inode,
...@@ -901,15 +915,18 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ...@@ -901,15 +915,18 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
lock_buffer(new_bh); lock_buffer(new_bh);
/* /*
* We have to be careful about races with * We have to be careful about races with
* freeing or rehashing of xattr block. Once we * freeing, rehashing or adding references to
* hold buffer lock xattr block's state is * xattr block. Once we hold buffer lock xattr
* stable so we can check whether the block got * block's state is stable so we can check
* freed / rehashed or not. Since we unhash * whether the block got freed / rehashed or
* mbcache entry under buffer lock when freeing * not. Since we unhash mbcache entry under
* / rehashing xattr block, checking whether * buffer lock when freeing / rehashing xattr
* entry is still hashed is reliable. * block, checking whether entry is still
* hashed is reliable. Same rules hold for
* e_reusable handling.
*/ */
if (hlist_bl_unhashed(&ce->e_hash_list)) { if (hlist_bl_unhashed(&ce->e_hash_list) ||
!ce->e_reusable) {
/* /*
* Undo everything and check mbcache * Undo everything and check mbcache
* again. * again.
...@@ -924,9 +941,12 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ...@@ -924,9 +941,12 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
new_bh = NULL; new_bh = NULL;
goto inserted; goto inserted;
} }
le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
if (ref >= EXT4_XATTR_REFCOUNT_MAX)
ce->e_reusable = 0;
ea_bdebug(new_bh, "reusing; refcount now=%d", ea_bdebug(new_bh, "reusing; refcount now=%d",
le32_to_cpu(BHDR(new_bh)->h_refcount)); ref);
ext4_xattr_block_csum_set(inode, new_bh); ext4_xattr_block_csum_set(inode, new_bh);
unlock_buffer(new_bh); unlock_buffer(new_bh);
error = ext4_handle_dirty_metadata(handle, error = ext4_handle_dirty_metadata(handle,
...@@ -1595,11 +1615,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) ...@@ -1595,11 +1615,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
static void static void
ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
{ {
__u32 hash = le32_to_cpu(BHDR(bh)->h_hash); struct ext4_xattr_header *header = BHDR(bh);
__u32 hash = le32_to_cpu(header->h_hash);
int reusable = le32_to_cpu(header->h_refcount) <
EXT4_XATTR_REFCOUNT_MAX;
int error; int error;
error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
bh->b_blocknr); bh->b_blocknr, reusable);
if (error) { if (error) {
if (error == -EBUSY) if (error == -EBUSY)
ea_bdebug(bh, "already in cache"); ea_bdebug(bh, "already in cache");
...@@ -1674,12 +1697,6 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, ...@@ -1674,12 +1697,6 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
if (!bh) { if (!bh) {
EXT4_ERROR_INODE(inode, "block %lu read error", EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long) ce->e_block); (unsigned long) ce->e_block);
} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
EXT4_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %lu refcount %d>=%d",
(unsigned long) ce->e_block,
le32_to_cpu(BHDR(bh)->h_refcount),
EXT4_XATTR_REFCOUNT_MAX);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
*pce = ce; *pce = ce;
return bh; return bh;
......
...@@ -63,13 +63,14 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, ...@@ -63,13 +63,14 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
* @mask - gfp mask with which the entry should be allocated * @mask - gfp mask with which the entry should be allocated
* @key - key of the entry * @key - key of the entry
* @block - block that contains data * @block - block that contains data
* @reusable - is the block reusable by other inodes?
* *
* Creates entry in @cache with key @key and records that data is stored in * Creates entry in @cache with key @key and records that data is stored in
* block @block. The function returns -EBUSY if entry with the same key * block @block. The function returns -EBUSY if entry with the same key
* and for the same block already exists in cache. Otherwise 0 is returned. * and for the same block already exists in cache. Otherwise 0 is returned.
*/ */
int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
sector_t block) sector_t block, bool reusable)
{ {
struct mb_cache_entry *entry, *dup; struct mb_cache_entry *entry, *dup;
struct hlist_bl_node *dup_node; struct hlist_bl_node *dup_node;
...@@ -91,6 +92,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, ...@@ -91,6 +92,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
atomic_set(&entry->e_refcnt, 1); atomic_set(&entry->e_refcnt, 1);
entry->e_key = key; entry->e_key = key;
entry->e_block = block; entry->e_block = block;
entry->e_reusable = reusable;
head = mb_cache_entry_head(cache, key); head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head); hlist_bl_lock(head);
hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
...@@ -137,7 +139,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, ...@@ -137,7 +139,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
while (node) { while (node) {
entry = hlist_bl_entry(node, struct mb_cache_entry, entry = hlist_bl_entry(node, struct mb_cache_entry,
e_hash_list); e_hash_list);
if (entry->e_key == key) { if (entry->e_key == key && entry->e_reusable) {
atomic_inc(&entry->e_refcnt); atomic_inc(&entry->e_refcnt);
goto out; goto out;
} }
...@@ -184,10 +186,38 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, ...@@ -184,10 +186,38 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
} }
EXPORT_SYMBOL(mb_cache_entry_find_next); EXPORT_SYMBOL(mb_cache_entry_find_next);
/*
* mb_cache_entry_get - get a cache entry by block number (and key)
* @cache - cache we work with
* @key - key of block number @block
* @block - block number
*/
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
sector_t block)
{
struct hlist_bl_node *node;
struct hlist_bl_head *head;
struct mb_cache_entry *entry;
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
if (entry->e_key == key && entry->e_block == block) {
atomic_inc(&entry->e_refcnt);
goto out;
}
}
entry = NULL;
out:
hlist_bl_unlock(head);
return entry;
}
EXPORT_SYMBOL(mb_cache_entry_get);
/* mb_cache_entry_delete_block - remove information about block from cache /* mb_cache_entry_delete_block - remove information about block from cache
* @cache - cache we work with * @cache - cache we work with
* @key - key of the entry to remove * @key - key of block @block
* @block - block containing data for @key * @block - block number
* *
* Remove entry from cache @cache with key @key with data stored in @block. * Remove entry from cache @cache with key @key with data stored in @block.
*/ */
......
...@@ -18,6 +18,7 @@ struct mb_cache_entry { ...@@ -18,6 +18,7 @@ struct mb_cache_entry {
/* Key in hash - stable during lifetime of the entry */ /* Key in hash - stable during lifetime of the entry */
u32 e_key; u32 e_key;
u32 e_referenced:1; u32 e_referenced:1;
u32 e_reusable:1;
/* Block number of hashed block - stable during lifetime of the entry */ /* Block number of hashed block - stable during lifetime of the entry */
sector_t e_block; sector_t e_block;
}; };
...@@ -26,7 +27,7 @@ struct mb_cache *mb_cache_create(int bucket_bits); ...@@ -26,7 +27,7 @@ struct mb_cache *mb_cache_create(int bucket_bits);
void mb_cache_destroy(struct mb_cache *cache); void mb_cache_destroy(struct mb_cache *cache);
int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
sector_t block); sector_t block, bool reusable);
void __mb_cache_entry_free(struct mb_cache_entry *entry); void __mb_cache_entry_free(struct mb_cache_entry *entry);
static inline int mb_cache_entry_put(struct mb_cache *cache, static inline int mb_cache_entry_put(struct mb_cache *cache,
struct mb_cache_entry *entry) struct mb_cache_entry *entry)
...@@ -39,6 +40,8 @@ static inline int mb_cache_entry_put(struct mb_cache *cache, ...@@ -39,6 +40,8 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
sector_t block); sector_t block);
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
sector_t block);
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
u32 key); u32 key);
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment