Commit 376cc685 authored by Miao Xie's avatar Miao Xie Committed by Chris Mason

Btrfs: fix the reserved space leak caused by the race between nonlock dio and buffered io

When we ran sysbench on the fs with compression, the following WARN_ONs were
triggered:
 fs/btrfs/inode.c:7829	WARN_ON(BTRFS_I(inode)->outstanding_extents);
 fs/btrfs/inode.c:7830	WARN_ON(BTRFS_I(inode)->reserved_extents);
 fs/btrfs/inode.c:7832	WARN_ON(BTRFS_I(inode)->csum_bytes);

Steps to reproduce:
 # mkfs.btrfs -f <dev>
 # mount -o compress <dev> <mnt>
 # cd <mnt>
 # sysbench --test=fileio --num-threads=8 --file-total-size=8G \
 > --file-block-size=32K --file-io-mode=rndwr --file-fsync-freq=0 \
 > --file-fsync-end=no --max-requests=300000 --file-extra-flags=direct \
 > --file-test-mode=sync prepare
 # cd -
 # umount <mnt>
 # mount -o compress <dev> <mnt>
 # cd <mnt>
 # sysbench --test=fileio --num-threads=8 --file-total-size=8G \
 > --file-block-size=32K --file-io-mode=rndwr --file-fsync-freq=0 \
 > --file-fsync-end=no --max-requests=300000 --file-extra-flags=direct \
 > --file-test-mode=sync run
 # cd -
 # umount <mnt>

The reason of this problem is:
Task0				Task1
btrfs_direct_IO
  unlock(&inode->i_mutex)
				lock(&inode->i_mutex)
				reserve_space()
				prepare_pages()
				  lock_extent()
				  clear_extent()
				  unlock_extent()
  lock_extent()
  test_extent(uptodate)
    return false
				copy_data()
				set_delalloc_extent()
  extent need compress
    go back to buffered write
  clear_extent(DELALLOC | DIRTY)
  unlock_extent()

Task 0 and 1 wrote the same place, and task0 cleared the delalloc flag which
was set by task1, it made the dirty pages in that extents couldn't be flushed
into the disk, so the reserved space for that extent was not released at
the end.

This patch fixes the above bug by unlocking the extent after the delalloc.
Signed-off-by: default avatarMiao Xie <miaox@cn.fujitsu.com>
Signed-off-by: default avatarJosef Bacik <jbacik@fb.com>
Signed-off-by: default avatarChris Mason <clm@fb.com>
parent b37392ea
...@@ -1235,27 +1235,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos, ...@@ -1235,27 +1235,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
} }
/* /*
* this gets pages into the page cache and locks them down, it also properly * this just gets pages into the page cache and locks them down.
* waits for data=ordered extents to finish before allowing the pages to be
* modified.
*/ */
static noinline int prepare_pages(struct inode *inode, struct page **pages, static noinline int prepare_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t num_pages, loff_t pos,
size_t write_bytes, bool force_uptodate) size_t write_bytes, bool force_uptodate)
{ {
struct extent_state *cached_state = NULL;
int i; int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT; unsigned long index = pos >> PAGE_CACHE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0; int err;
int faili = 0; int faili;
u64 start_pos;
u64 last_pos;
start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
again:
for (i = 0; i < num_pages; i++) { for (i = 0; i < num_pages; i++) {
pages[i] = find_or_create_page(inode->i_mapping, index + i, pages[i] = find_or_create_page(inode->i_mapping, index + i,
mask | __GFP_WRITE); mask | __GFP_WRITE);
...@@ -1278,57 +1269,85 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, ...@@ -1278,57 +1269,85 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
} }
wait_on_page_writeback(pages[i]); wait_on_page_writeback(pages[i]);
} }
faili = num_pages - 1;
err = 0; return 0;
fail:
while (faili >= 0) {
unlock_page(pages[faili]);
page_cache_release(pages[faili]);
faili--;
}
return err;
}
/*
* This function locks the extent and properly waits for data=ordered extents
* to finish before allowing the pages to be modified if need.
*
* The return value:
* 1 - the extent is locked
* 0 - the extent is not locked, and everything is OK
* -EAGAIN - need re-prepare the pages
* the other < 0 number - Something wrong happens
*/
static noinline int
lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
u64 *lockstart, u64 *lockend,
struct extent_state **cached_state)
{
u64 start_pos;
u64 last_pos;
int i;
int ret = 0;
start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
if (start_pos < inode->i_size) { if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_extent_bits(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, 0, &cached_state); start_pos, last_pos, 0, cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode, ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
last_pos - 1);
if (ordered && if (ordered &&
ordered->file_offset + ordered->len > start_pos && ordered->file_offset + ordered->len > start_pos &&
ordered->file_offset < last_pos) { ordered->file_offset <= last_pos) {
btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, unlock_extent_cached(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, start_pos, last_pos,
&cached_state, GFP_NOFS); cached_state, GFP_NOFS);
for (i = 0; i < num_pages; i++) { for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]); unlock_page(pages[i]);
page_cache_release(pages[i]); page_cache_release(pages[i]);
} }
err = btrfs_wait_ordered_range(inode, start_pos, ret = btrfs_wait_ordered_range(inode, start_pos,
last_pos - start_pos); last_pos - start_pos + 1);
if (err) if (ret)
goto fail; return ret;
goto again; else
return -EAGAIN;
} }
if (ordered) if (ordered)
btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered);
clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS); 0, 0, cached_state, GFP_NOFS);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, *lockstart = start_pos;
start_pos, last_pos - 1, &cached_state, *lockend = last_pos;
GFP_NOFS); ret = 1;
} }
for (i = 0; i < num_pages; i++) { for (i = 0; i < num_pages; i++) {
if (clear_page_dirty_for_io(pages[i])) if (clear_page_dirty_for_io(pages[i]))
account_page_redirty(pages[i]); account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]); set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i])); WARN_ON(!PageLocked(pages[i]));
} }
return 0;
fail:
while (faili >= 0) {
unlock_page(pages[faili]);
page_cache_release(pages[faili]);
faili--;
}
return err;
return ret;
} }
static noinline int check_can_nocow(struct inode *inode, loff_t pos, static noinline int check_can_nocow(struct inode *inode, loff_t pos,
...@@ -1379,13 +1398,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ...@@ -1379,13 +1398,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL; struct page **pages = NULL;
struct extent_state *cached_state = NULL;
u64 release_bytes = 0; u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
unsigned long first_index; unsigned long first_index;
size_t num_written = 0; size_t num_written = 0;
int nrptrs; int nrptrs;
int ret = 0; int ret = 0;
bool only_release_metadata = false; bool only_release_metadata = false;
bool force_page_uptodate = false; bool force_page_uptodate = false;
bool need_unlock;
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
...@@ -1454,7 +1477,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ...@@ -1454,7 +1477,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
} }
release_bytes = reserve_bytes; release_bytes = reserve_bytes;
need_unlock = false;
again:
/* /*
* This is going to setup the pages array with the number of * This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the * pages we want, so we don't really need to worry about the
...@@ -1466,6 +1490,18 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ...@@ -1466,6 +1490,18 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (ret) if (ret)
break; break;
ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
pos, &lockstart, &lockend,
&cached_state);
if (ret < 0) {
if (ret == -EAGAIN)
goto again;
break;
} else if (ret > 0) {
need_unlock = true;
ret = 0;
}
copied = btrfs_copy_from_user(pos, num_pages, copied = btrfs_copy_from_user(pos, num_pages,
write_bytes, pages, i); write_bytes, pages, i);
...@@ -1510,19 +1546,20 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ...@@ -1510,19 +1546,20 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
} }
release_bytes = dirty_pages << PAGE_CACHE_SHIFT; release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
if (copied > 0) {
if (copied > 0)
ret = btrfs_dirty_pages(root, inode, pages, ret = btrfs_dirty_pages(root, inode, pages,
dirty_pages, pos, copied, dirty_pages, pos, copied,
NULL); NULL);
if (ret) { if (need_unlock)
btrfs_drop_pages(pages, num_pages); unlock_extent_cached(&BTRFS_I(inode)->io_tree,
break; lockstart, lockend, &cached_state,
} GFP_NOFS);
}
release_bytes = 0;
btrfs_drop_pages(pages, num_pages); btrfs_drop_pages(pages, num_pages);
if (ret)
break;
release_bytes = 0;
if (only_release_metadata && copied > 0) { if (only_release_metadata && copied > 0) {
u64 lockstart = round_down(pos, root->sectorsize); u64 lockstart = round_down(pos, root->sectorsize);
u64 lockend = lockstart + u64 lockend = lockstart +
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment