Commit fd1ea9ab authored by Andreas Gruenbacher, committed by Linus Torvalds

[PATCH] ext3/EA: Race in ext[23] xattr sharing code

Andrew Tridgell and Stephen C. Tweedie have reported two different Oopses
caused by a race condition in the mbcache, which is responsible for
extended attribute sharing in ext2 and ext3.  Stephen tracked down the bug;
I did the fix.

Explanation:

The mbcache caches the locations and content hashes of xattr blocks.  There
are two access strategies: [1] xattr block disposal via
mb_cache_entry_get(), and [2] xattr block reuse (sharing) via
mb_cache_entry_find_{first,next}().  There is no locking between the two
methods, so between one mb_cache_entry_find_{first,next}() call and the
next, an mb_cache_entry_get() might come in, unhash the cache entry, and
change the journaling state of the xattr buffer.  Subsequently, two things
can go wrong: [a] the next mb_cache_entry_find_next() may try to follow the
mbcache hash chain starting from the entry that has been unhashed, which is
now a stale pointer, or [b] the block may have been deallocated, and we
then try to reuse it.

Fix this by converting the mbcache entry usage count (e_used) into a
readers-writer style lock, and protect all block accesses in ext2/ext3 with
the mbcache entry lock.  This ensures that destroying blocks is an exclusive
operation that may not overlap xattr block reuse, while allowing multiple
"re-users".  Write access to the xattr block's buffer is protected by the
buffer lock.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 401494b1
...@@ -95,7 +95,6 @@ static int ext2_xattr_set2(struct inode *, struct buffer_head *, ...@@ -95,7 +95,6 @@ static int ext2_xattr_set2(struct inode *, struct buffer_head *,
static int ext2_xattr_cache_insert(struct buffer_head *); static int ext2_xattr_cache_insert(struct buffer_head *);
static struct buffer_head *ext2_xattr_cache_find(struct inode *, static struct buffer_head *ext2_xattr_cache_find(struct inode *,
struct ext2_xattr_header *); struct ext2_xattr_header *);
static void ext2_xattr_cache_remove(struct buffer_head *);
static void ext2_xattr_rehash(struct ext2_xattr_header *, static void ext2_xattr_rehash(struct ext2_xattr_header *,
struct ext2_xattr_entry *); struct ext2_xattr_entry *);
...@@ -494,15 +493,22 @@ bad_block: ext2_error(sb, "ext2_xattr_set", ...@@ -494,15 +493,22 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
/* Here we know that we can set the new attribute. */ /* Here we know that we can set the new attribute. */
if (header) { if (header) {
struct mb_cache_entry *ce;
/* assert(header == HDR(bh)); */ /* assert(header == HDR(bh)); */
ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
bh->b_blocknr);
lock_buffer(bh); lock_buffer(bh);
if (header->h_refcount == cpu_to_le32(1)) { if (header->h_refcount == cpu_to_le32(1)) {
ea_bdebug(bh, "modifying in-place"); ea_bdebug(bh, "modifying in-place");
ext2_xattr_cache_remove(bh); if (ce)
mb_cache_entry_free(ce);
/* keep the buffer locked while modifying it. */ /* keep the buffer locked while modifying it. */
} else { } else {
int offset; int offset;
if (ce)
mb_cache_entry_release(ce);
unlock_buffer(bh); unlock_buffer(bh);
ea_bdebug(bh, "cloning"); ea_bdebug(bh, "cloning");
header = kmalloc(bh->b_size, GFP_KERNEL); header = kmalloc(bh->b_size, GFP_KERNEL);
...@@ -707,13 +713,19 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ...@@ -707,13 +713,19 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
error = 0; error = 0;
if (old_bh && old_bh != new_bh) { if (old_bh && old_bh != new_bh) {
struct mb_cache_entry *ce;
/* /*
* If there was an old block and we are no longer using it, * If there was an old block and we are no longer using it,
* release the old block. * release the old block.
*/ */
ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
old_bh->b_blocknr);
lock_buffer(old_bh); lock_buffer(old_bh);
if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
/* Free the old block. */ /* Free the old block. */
if (ce)
mb_cache_entry_free(ce);
ea_bdebug(old_bh, "freeing"); ea_bdebug(old_bh, "freeing");
ext2_free_blocks(inode, old_bh->b_blocknr, 1); ext2_free_blocks(inode, old_bh->b_blocknr, 1);
/* We let our caller release old_bh, so we /* We let our caller release old_bh, so we
...@@ -724,6 +736,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ...@@ -724,6 +736,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
/* Decrement the refcount only. */ /* Decrement the refcount only. */
HDR(old_bh)->h_refcount = cpu_to_le32( HDR(old_bh)->h_refcount = cpu_to_le32(
le32_to_cpu(HDR(old_bh)->h_refcount) - 1); le32_to_cpu(HDR(old_bh)->h_refcount) - 1);
if (ce)
mb_cache_entry_release(ce);
DQUOT_FREE_BLOCK(inode, 1); DQUOT_FREE_BLOCK(inode, 1);
mark_buffer_dirty(old_bh); mark_buffer_dirty(old_bh);
ea_bdebug(old_bh, "refcount now=%d", ea_bdebug(old_bh, "refcount now=%d",
...@@ -748,6 +762,7 @@ void ...@@ -748,6 +762,7 @@ void
ext2_xattr_delete_inode(struct inode *inode) ext2_xattr_delete_inode(struct inode *inode)
{ {
struct buffer_head *bh = NULL; struct buffer_head *bh = NULL;
struct mb_cache_entry *ce;
down_write(&EXT2_I(inode)->xattr_sem); down_write(&EXT2_I(inode)->xattr_sem);
if (!EXT2_I(inode)->i_file_acl) if (!EXT2_I(inode)->i_file_acl)
...@@ -767,15 +782,19 @@ ext2_xattr_delete_inode(struct inode *inode) ...@@ -767,15 +782,19 @@ ext2_xattr_delete_inode(struct inode *inode)
EXT2_I(inode)->i_file_acl); EXT2_I(inode)->i_file_acl);
goto cleanup; goto cleanup;
} }
ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
lock_buffer(bh); lock_buffer(bh);
if (HDR(bh)->h_refcount == cpu_to_le32(1)) { if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
ext2_xattr_cache_remove(bh); if (ce)
mb_cache_entry_free(ce);
ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
get_bh(bh); get_bh(bh);
bforget(bh); bforget(bh);
} else { } else {
HDR(bh)->h_refcount = cpu_to_le32( HDR(bh)->h_refcount = cpu_to_le32(
le32_to_cpu(HDR(bh)->h_refcount) - 1); le32_to_cpu(HDR(bh)->h_refcount) - 1);
if (ce)
mb_cache_entry_release(ce);
mark_buffer_dirty(bh); mark_buffer_dirty(bh);
if (IS_SYNC(inode)) if (IS_SYNC(inode))
sync_dirty_buffer(bh); sync_dirty_buffer(bh);
...@@ -892,11 +911,19 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) ...@@ -892,11 +911,19 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
if (!header->h_hash) if (!header->h_hash)
return NULL; /* never share */ return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, ce = mb_cache_entry_find_first(ext2_xattr_cache, 0,
inode->i_sb->s_bdev, hash); inode->i_sb->s_bdev, hash);
while (ce) { while (ce) {
struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); struct buffer_head *bh;
if (IS_ERR(ce)) {
if (PTR_ERR(ce) == -EAGAIN)
goto again;
break;
}
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) { if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_cache_find", ext2_error(inode->i_sb, "ext2_xattr_cache_find",
"inode %ld: block %ld read error", "inode %ld: block %ld read error",
...@@ -923,26 +950,6 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) ...@@ -923,26 +950,6 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
return NULL; return NULL;
} }
/*
* ext2_xattr_cache_remove()
*
* Remove the cache entry of a block from the cache. Called when a
* block becomes invalid.
*/
static void
ext2_xattr_cache_remove(struct buffer_head *bh)
{
struct mb_cache_entry *ce;
ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
if (ce) {
ea_bdebug(bh, "removing (%d cache entries remaining)",
atomic_read(&ext2_xattr_cache->c_entry_count)-1);
mb_cache_entry_free(ce);
} else
ea_bdebug(bh, "no cache entry");
}
#define NAME_HASH_SHIFT 5 #define NAME_HASH_SHIFT 5
#define VALUE_HASH_SHIFT 16 #define VALUE_HASH_SHIFT 16
......
...@@ -97,7 +97,6 @@ static int ext3_xattr_cache_insert(struct buffer_head *); ...@@ -97,7 +97,6 @@ static int ext3_xattr_cache_insert(struct buffer_head *);
static struct buffer_head *ext3_xattr_cache_find(handle_t *, struct inode *, static struct buffer_head *ext3_xattr_cache_find(handle_t *, struct inode *,
struct ext3_xattr_header *, struct ext3_xattr_header *,
int *); int *);
static void ext3_xattr_cache_remove(struct buffer_head *);
static void ext3_xattr_rehash(struct ext3_xattr_header *, static void ext3_xattr_rehash(struct ext3_xattr_header *,
struct ext3_xattr_entry *); struct ext3_xattr_entry *);
...@@ -500,6 +499,7 @@ bad_block: ext3_error(sb, "ext3_xattr_set", ...@@ -500,6 +499,7 @@ bad_block: ext3_error(sb, "ext3_xattr_set",
/* Here we know that we can set the new attribute. */ /* Here we know that we can set the new attribute. */
if (header) { if (header) {
struct mb_cache_entry *ce;
int credits = 0; int credits = 0;
/* assert(header == HDR(bh)); */ /* assert(header == HDR(bh)); */
...@@ -511,14 +511,19 @@ bad_block: ext3_error(sb, "ext3_xattr_set", ...@@ -511,14 +511,19 @@ bad_block: ext3_error(sb, "ext3_xattr_set",
&credits); &credits);
if (error) if (error)
goto cleanup; goto cleanup;
ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev,
bh->b_blocknr);
lock_buffer(bh); lock_buffer(bh);
if (header->h_refcount == cpu_to_le32(1)) { if (header->h_refcount == cpu_to_le32(1)) {
if (ce)
mb_cache_entry_free(ce);
ea_bdebug(bh, "modifying in-place"); ea_bdebug(bh, "modifying in-place");
ext3_xattr_cache_remove(bh);
/* keep the buffer locked while modifying it. */ /* keep the buffer locked while modifying it. */
} else { } else {
int offset; int offset;
if (ce)
mb_cache_entry_release(ce);
unlock_buffer(bh); unlock_buffer(bh);
journal_release_buffer(handle, bh, credits); journal_release_buffer(handle, bh, credits);
skip_get_write_access: skip_get_write_access:
...@@ -725,6 +730,8 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode, ...@@ -725,6 +730,8 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode,
error = 0; error = 0;
if (old_bh && old_bh != new_bh) { if (old_bh && old_bh != new_bh) {
struct mb_cache_entry *ce;
/* /*
* If there was an old block, and we are no longer using it, * If there was an old block, and we are no longer using it,
* release the old block. * release the old block.
...@@ -732,9 +739,13 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode, ...@@ -732,9 +739,13 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode,
error = ext3_journal_get_write_access(handle, old_bh); error = ext3_journal_get_write_access(handle, old_bh);
if (error) if (error)
goto cleanup; goto cleanup;
ce = mb_cache_entry_get(ext3_xattr_cache, old_bh->b_bdev,
old_bh->b_blocknr);
lock_buffer(old_bh); lock_buffer(old_bh);
if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
/* Free the old block. */ /* Free the old block. */
if (ce)
mb_cache_entry_free(ce);
ea_bdebug(old_bh, "freeing"); ea_bdebug(old_bh, "freeing");
ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1);
...@@ -747,6 +758,8 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode, ...@@ -747,6 +758,8 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode,
/* Decrement the refcount only. */ /* Decrement the refcount only. */
HDR(old_bh)->h_refcount = cpu_to_le32( HDR(old_bh)->h_refcount = cpu_to_le32(
le32_to_cpu(HDR(old_bh)->h_refcount) - 1); le32_to_cpu(HDR(old_bh)->h_refcount) - 1);
if (ce)
mb_cache_entry_release(ce);
DQUOT_FREE_BLOCK(inode, 1); DQUOT_FREE_BLOCK(inode, 1);
ext3_journal_dirty_metadata(handle, old_bh); ext3_journal_dirty_metadata(handle, old_bh);
ea_bdebug(old_bh, "refcount now=%d", ea_bdebug(old_bh, "refcount now=%d",
...@@ -806,6 +819,7 @@ void ...@@ -806,6 +819,7 @@ void
ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
{ {
struct buffer_head *bh = NULL; struct buffer_head *bh = NULL;
struct mb_cache_entry *ce;
down_write(&EXT3_I(inode)->xattr_sem); down_write(&EXT3_I(inode)->xattr_sem);
if (!EXT3_I(inode)->i_file_acl) if (!EXT3_I(inode)->i_file_acl)
...@@ -826,15 +840,19 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ...@@ -826,15 +840,19 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
} }
if (ext3_journal_get_write_access(handle, bh) != 0) if (ext3_journal_get_write_access(handle, bh) != 0)
goto cleanup; goto cleanup;
ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
lock_buffer(bh); lock_buffer(bh);
if (HDR(bh)->h_refcount == cpu_to_le32(1)) { if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
ext3_xattr_cache_remove(bh); if (ce)
mb_cache_entry_free(ce);
ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1);
get_bh(bh); get_bh(bh);
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
} else { } else {
HDR(bh)->h_refcount = cpu_to_le32( HDR(bh)->h_refcount = cpu_to_le32(
le32_to_cpu(HDR(bh)->h_refcount) - 1); le32_to_cpu(HDR(bh)->h_refcount) - 1);
if (ce)
mb_cache_entry_release(ce);
ext3_journal_dirty_metadata(handle, bh); ext3_journal_dirty_metadata(handle, bh);
if (IS_SYNC(inode)) if (IS_SYNC(inode))
handle->h_sync = 1; handle->h_sync = 1;
...@@ -951,11 +969,18 @@ ext3_xattr_cache_find(handle_t *handle, struct inode *inode, ...@@ -951,11 +969,18 @@ ext3_xattr_cache_find(handle_t *handle, struct inode *inode,
if (!header->h_hash) if (!header->h_hash)
return NULL; /* never share */ return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, ce = mb_cache_entry_find_first(ext3_xattr_cache, 0,
inode->i_sb->s_bdev, hash); inode->i_sb->s_bdev, hash);
while (ce) { while (ce) {
struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); struct buffer_head *bh;
if (IS_ERR(ce)) {
if (PTR_ERR(ce) == -EAGAIN)
goto again;
break;
}
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) { if (!bh) {
ext3_error(inode->i_sb, "ext3_xattr_cache_find", ext3_error(inode->i_sb, "ext3_xattr_cache_find",
"inode %ld: block %ld read error", "inode %ld: block %ld read error",
...@@ -986,27 +1011,6 @@ ext3_xattr_cache_find(handle_t *handle, struct inode *inode, ...@@ -986,27 +1011,6 @@ ext3_xattr_cache_find(handle_t *handle, struct inode *inode,
return NULL; return NULL;
} }
/*
* ext3_xattr_cache_remove()
*
* Remove the cache entry of a block from the cache. Called when a
* block becomes invalid.
*/
static void
ext3_xattr_cache_remove(struct buffer_head *bh)
{
struct mb_cache_entry *ce;
ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev,
bh->b_blocknr);
if (ce) {
ea_bdebug(bh, "removing (%d cache entries remaining)",
atomic_read(&ext3_xattr_cache->c_entry_count)-1);
mb_cache_entry_free(ce);
} else
ea_bdebug(bh, "no cache entry");
}
#define NAME_HASH_SHIFT 5 #define NAME_HASH_SHIFT 5
#define VALUE_HASH_SHIFT 16 #define VALUE_HASH_SHIFT 16
......
...@@ -55,6 +55,10 @@ ...@@ -55,6 +55,10 @@
printk("\n"); \ printk("\n"); \
} while(0) } while(0)
#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
...@@ -140,7 +144,7 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) ...@@ -140,7 +144,7 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
{ {
struct mb_cache *cache = ce->e_cache; struct mb_cache *cache = ce->e_cache;
mb_assert(atomic_read(&ce->e_used) == 0); mb_assert(!(ce->e_used || ce->e_queued));
if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) { if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
/* free failed -- put back on the lru list /* free failed -- put back on the lru list
for freeing later. */ for freeing later. */
...@@ -157,9 +161,16 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) ...@@ -157,9 +161,16 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
static inline void static inline void
__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
{ {
if (atomic_dec_and_test(&ce->e_used)) { /* Wake up all processes queuing for this cache entry. */
if (ce->e_queued)
wake_up_all(&mb_cache_queue);
if (ce->e_used >= MB_CACHE_WRITER)
ce->e_used -= MB_CACHE_WRITER;
ce->e_used--;
if (!(ce->e_used || ce->e_queued)) {
if (!__mb_cache_entry_is_hashed(ce)) if (!__mb_cache_entry_is_hashed(ce))
goto forget; goto forget;
mb_assert(list_empty(&ce->e_lru_list));
list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
} }
spin_unlock(&mb_cache_spinlock); spin_unlock(&mb_cache_spinlock);
...@@ -396,7 +407,8 @@ mb_cache_entry_alloc(struct mb_cache *cache) ...@@ -396,7 +407,8 @@ mb_cache_entry_alloc(struct mb_cache *cache)
INIT_LIST_HEAD(&ce->e_lru_list); INIT_LIST_HEAD(&ce->e_lru_list);
INIT_LIST_HEAD(&ce->e_block_list); INIT_LIST_HEAD(&ce->e_block_list);
ce->e_cache = cache; ce->e_cache = cache;
atomic_set(&ce->e_used, 1); ce->e_used = 1 + MB_CACHE_WRITER;
ce->e_queued = 0;
} }
return ce; return ce;
} }
...@@ -488,7 +500,8 @@ mb_cache_entry_free(struct mb_cache_entry *ce) ...@@ -488,7 +500,8 @@ mb_cache_entry_free(struct mb_cache_entry *ce)
* *
* Get a cache entry by device / block number. (There can only be one entry * Get a cache entry by device / block number. (There can only be one entry
* in the cache per device and block.) Returns NULL if no such cache entry * in the cache per device and block.) Returns NULL if no such cache entry
* exists. * exists. The returned cache entry is locked for exclusive access ("single
* writer").
*/ */
struct mb_cache_entry * struct mb_cache_entry *
mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
...@@ -504,9 +517,27 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, ...@@ -504,9 +517,27 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
list_for_each(l, &cache->c_block_hash[bucket]) { list_for_each(l, &cache->c_block_hash[bucket]) {
ce = list_entry(l, struct mb_cache_entry, e_block_list); ce = list_entry(l, struct mb_cache_entry, e_block_list);
if (ce->e_bdev == bdev && ce->e_block == block) { if (ce->e_bdev == bdev && ce->e_block == block) {
DEFINE_WAIT(wait);
if (!list_empty(&ce->e_lru_list)) if (!list_empty(&ce->e_lru_list))
list_del_init(&ce->e_lru_list); list_del_init(&ce->e_lru_list);
atomic_inc(&ce->e_used);
while (ce->e_used > 0) {
ce->e_queued++;
prepare_to_wait(&mb_cache_queue, &wait,
TASK_UNINTERRUPTIBLE);
spin_unlock(&mb_cache_spinlock);
schedule();
spin_lock(&mb_cache_spinlock);
ce->e_queued--;
}
finish_wait(&mb_cache_queue, &wait);
ce->e_used += 1 + MB_CACHE_WRITER;
if (!__mb_cache_entry_is_hashed(ce)) {
__mb_cache_entry_release_unlock(ce);
return NULL;
}
goto cleanup; goto cleanup;
} }
} }
...@@ -523,14 +554,37 @@ static struct mb_cache_entry * ...@@ -523,14 +554,37 @@ static struct mb_cache_entry *
__mb_cache_entry_find(struct list_head *l, struct list_head *head, __mb_cache_entry_find(struct list_head *l, struct list_head *head,
int index, struct block_device *bdev, unsigned int key) int index, struct block_device *bdev, unsigned int key)
{ {
DEFINE_WAIT(wait);
while (l != head) { while (l != head) {
struct mb_cache_entry *ce = struct mb_cache_entry *ce =
list_entry(l, struct mb_cache_entry, list_entry(l, struct mb_cache_entry,
e_indexes[index].o_list); e_indexes[index].o_list);
if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) { if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) {
DEFINE_WAIT(wait);
if (!list_empty(&ce->e_lru_list)) if (!list_empty(&ce->e_lru_list))
list_del_init(&ce->e_lru_list); list_del_init(&ce->e_lru_list);
atomic_inc(&ce->e_used);
/* Incrementing before holding the lock gives readers
priority over writers. */
ce->e_used++;
while (ce->e_used >= MB_CACHE_WRITER) {
ce->e_queued++;
prepare_to_wait(&mb_cache_queue, &wait,
TASK_UNINTERRUPTIBLE);
spin_unlock(&mb_cache_spinlock);
schedule();
spin_lock(&mb_cache_spinlock);
ce->e_queued--;
}
finish_wait(&mb_cache_queue, &wait);
if (!__mb_cache_entry_is_hashed(ce)) {
__mb_cache_entry_release_unlock(ce);
spin_lock(&mb_cache_spinlock);
return ERR_PTR(-EAGAIN);
}
return ce; return ce;
} }
l = l->next; l = l->next;
...@@ -544,7 +598,8 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head, ...@@ -544,7 +598,8 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
* *
* Find the first cache entry on a given device with a certain key in * Find the first cache entry on a given device with a certain key in
* an additional index. Additonal matches can be found with * an additional index. Additonal matches can be found with
* mb_cache_entry_find_next(). Returns NULL if no match was found. * mb_cache_entry_find_next(). Returns NULL if no match was found. The
* returned cache entry is locked for shared access ("multiple readers").
* *
* @cache: the cache to search * @cache: the cache to search
* @index: the number of the additonal index to search (0<=index<indexes_count) * @index: the number of the additonal index to search (0<=index<indexes_count)
......
...@@ -10,7 +10,8 @@ ...@@ -10,7 +10,8 @@
struct mb_cache_entry { struct mb_cache_entry {
struct list_head e_lru_list; struct list_head e_lru_list;
struct mb_cache *e_cache; struct mb_cache *e_cache;
atomic_t e_used; unsigned short e_used;
unsigned short e_queued;
struct block_device *e_bdev; struct block_device *e_bdev;
sector_t e_block; sector_t e_block;
struct list_head e_block_list; struct list_head e_block_list;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment