Commit ad801428 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull more btrfs updates from David Sterba:
 "Fixes that arrived after the merge window freeze, mostly stable
  material.

   - fix race in tree-mod-log element tracking

   - fix bio flushing inside extent writepages

   - fix assertion when in-memory tracking of discarded extents finds an
     empty tree (eg. after adding a new device)

   - update logic of temporary read-only block groups to take into
     account overcommit

   - fix some fixup worker corner cases:
       - page could not go through proper COW cycle and the dirty status
         is lost due to page migration
       - deadlock if delayed allocation is performed under page lock

   - fix send emitting invalid clones within the same file

   - fix statfs reporting 0 free space when global block reserve size is
     larger than remaining free space but there is still space for new
     chunks"

* tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: do not zero f_bavail if we have available space
  Btrfs: send, fix emission of invalid clone operations within the same file
  btrfs: do not do delalloc reservation under page lock
  btrfs: drop the -EBUSY case in __extent_writepage_io
  Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
  btrfs: take overcommit into account in inc_block_group_ro
  btrfs: fix force usage in inc_block_group_ro
  btrfs: Correctly handle empty trees in find_first_clear_extent_bit
  btrfs: flush write bio if we loop in extent_write_cache_pages
  Btrfs: fix race between adding and putting tree mod seq elements and nodes
parents e17ac02b d55966c4
......@@ -1191,7 +1191,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 sinfo_used;
int ret = -ENOSPC;
spin_lock(&sinfo->lock);
......@@ -1205,19 +1204,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
num_bytes = cache->length - cache->reserved - cache->pinned -
cache->bytes_super - cache->used;
sinfo_used = btrfs_space_info_used(sinfo, true);
/*
* sinfo_used + num_bytes should always <= sinfo->total_bytes.
*
* Here we make sure if we mark this bg RO, we still have enough
* free space as buffer.
* Data never overcommits, even in mixed mode, so do just the straight
* check of left over space in how much we have allocated.
*/
if (sinfo_used + num_bytes <= sinfo->total_bytes) {
if (force) {
ret = 0;
} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
u64 sinfo_used = btrfs_space_info_used(sinfo, true);
/*
* Here we make sure if we mark this bg RO, we still have enough
* free space as buffer.
*/
if (sinfo_used + num_bytes <= sinfo->total_bytes)
ret = 0;
} else {
/*
* We overcommit metadata, so we need to do the
* btrfs_can_overcommit check here, and we need to pass in
* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
* leeway to allow us to mark this block group as read only.
*/
if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
BTRFS_RESERVE_NO_FLUSH))
ret = 0;
}
if (!ret) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
out:
spin_unlock(&cache->lock);
......@@ -1225,9 +1243,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
btrfs_info(cache->fs_info,
"sinfo_used=%llu bg_num_bytes=%llu",
sinfo_used, num_bytes);
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
}
return ret;
......@@ -2225,7 +2240,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
}
}
ret = inc_block_group_ro(cache, !do_chunk_alloc);
ret = inc_block_group_ro(cache, 0);
if (!do_chunk_alloc)
goto unlock_out;
if (!ret)
......
......@@ -326,12 +326,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
write_lock(&fs_info->tree_mod_log_lock);
spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
spin_unlock(&fs_info->tree_mod_seq_lock);
write_unlock(&fs_info->tree_mod_log_lock);
return elem->seq;
......@@ -351,7 +349,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
if (!seq_putting)
return;
spin_lock(&fs_info->tree_mod_seq_lock);
write_lock(&fs_info->tree_mod_log_lock);
list_del(&elem->list);
elem->seq = 0;
......@@ -362,19 +360,17 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* blocker with lower sequence number exists, we
* cannot remove anything from the log
*/
spin_unlock(&fs_info->tree_mod_seq_lock);
write_unlock(&fs_info->tree_mod_log_lock);
return;
}
min_seq = cur_elem->seq;
}
}
spin_unlock(&fs_info->tree_mod_seq_lock);
/*
* anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
write_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
......
......@@ -714,14 +714,12 @@ struct btrfs_fs_info {
atomic_t nr_delayed_iputs;
wait_queue_head_t delayed_iputs_wait;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
atomic64_t tree_mod_seq;
struct list_head tree_mod_seq_list;
/* this protects tree_mod_log */
/* this protects tree_mod_log and tree_mod_seq_list */
rwlock_t tree_mod_log_lock;
struct rb_root tree_mod_log;
struct list_head tree_mod_seq_list;
atomic_t async_delalloc_pages;
......
......@@ -492,7 +492,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
if (head->is_data)
return;
spin_lock(&fs_info->tree_mod_seq_lock);
read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
struct seq_list *elem;
......@@ -500,7 +500,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct seq_list, list);
seq = elem->seq;
}
spin_unlock(&fs_info->tree_mod_seq_lock);
read_unlock(&fs_info->tree_mod_log_lock);
again:
for (node = rb_first_cached(&head->ref_tree); node;
......@@ -518,7 +518,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
struct seq_list *elem;
int ret = 0;
spin_lock(&fs_info->tree_mod_seq_lock);
read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct seq_list, list);
......@@ -531,7 +531,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
}
}
spin_unlock(&fs_info->tree_mod_seq_lock);
read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}
......
......@@ -2697,7 +2697,6 @@ int __cold open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
......
......@@ -1593,21 +1593,25 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
/* Find first extent with bits cleared */
while (1) {
node = __etree_search(tree, start, &next, &prev, NULL, NULL);
if (!node) {
if (!node && !next && !prev) {
/*
* Tree is completely empty, send full range and let
* caller deal with it
*/
*start_ret = 0;
*end_ret = -1;
goto out;
} else if (!node && !next) {
/*
* We are past the last allocated chunk, set start at
* the end of the last extent.
*/
state = rb_entry(prev, struct extent_state, rb_node);
*start_ret = state->end + 1;
*end_ret = -1;
goto out;
} else if (!node) {
node = next;
if (!node) {
/*
* We are past the last allocated chunk,
* set start at the end of the last extent. The
* device alloc tree should never be empty so
* prev is always set.
*/
ASSERT(prev);
state = rb_entry(prev, struct extent_state, rb_node);
*start_ret = state->end + 1;
*end_ret = -1;
goto out;
}
}
/*
* At this point 'node' either contains 'start' or start is
......@@ -3438,11 +3442,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
ret = btrfs_writepage_cow_fixup(page, start, page_end);
if (ret) {
/* Fixup worker will requeue */
if (ret == -EBUSY)
wbc->pages_skipped++;
else
redirty_page_for_writepage(wbc, page);
redirty_page_for_writepage(wbc, page);
update_nr_written(wbc, nr_written);
unlock_page(page);
return 1;
......@@ -4166,7 +4166,16 @@ static int extent_write_cache_pages(struct address_space *mapping,
*/
scanned = 1;
index = 0;
goto retry;
/*
* If we're looping we could run into a page that is locked by a
* writer and that writer could be waiting on writeback for a
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
ret = flush_write_bio(epd);
if (!ret)
goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
......
......@@ -2189,6 +2189,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
struct page *page;
struct inode *inode;
struct btrfs_work work;
};
......@@ -2202,27 +2203,71 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct inode *inode;
u64 page_start;
u64 page_end;
int ret;
int ret = 0;
bool free_delalloc_space = true;
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
inode = fixup->inode;
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
/*
* This is similar to page_mkwrite, we need to reserve the space before
* we take the page lock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
PAGE_SIZE);
again:
lock_page(page);
/*
* Before we queued this fixup, we took a reference on the page.
* page->mapping may go NULL, but it shouldn't be moved to a different
* address space.
*/
if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
ClearPageChecked(page);
/*
* Unfortunately this is a little tricky, either
*
* 1) We got here and our page had already been dealt with and
* we reserved our space, thus ret == 0, so we need to just
* drop our space reservation and bail. This can happen the
* first time we come into the fixup worker, or could happen
* while waiting for the ordered extent.
* 2) Our page was already dealt with, but we happened to get an
* ENOSPC above from the btrfs_delalloc_reserve_space. In
* this case we obviously don't have anything to release, but
* because the page was already dealt with we don't want to
* mark the page with an error, so make sure we're resetting
* ret to 0. This is why we have this check _before_ the ret
* check, because we do not want to have a surprise ENOSPC
* when the page was already properly dealt with.
*/
if (!ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved,
page_start, PAGE_SIZE,
true);
}
ret = 0;
goto out_page;
}
inode = page->mapping->host;
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
/*
* We can't mess with the page state unless it is locked, so now that
* it is locked bail if we failed to make our space reservation.
*/
if (ret)
goto out_page;
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
goto out;
goto out_reserved;
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
PAGE_SIZE);
......@@ -2235,39 +2280,49 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
goto again;
}
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
PAGE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
ClearPageChecked(page);
goto out;
}
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
&cached_state);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
ClearPageChecked(page);
if (ret)
goto out_reserved;
}
ClearPageChecked(page);
set_page_dirty(page);
/*
* Everything went as planned, we're now the owner of a dirty page with
* delayed allocation bits set and space reserved for our COW
* destination.
*
* The page was dirty when we started, nothing should have cleaned it.
*/
BUG_ON(!PageDirty(page));
free_delalloc_space = false;
out_reserved:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
if (ret)
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
out_page:
if (ret) {
/*
* We hit ENOSPC or other errors. Update the mapping and page
* to reflect the errors and clean the page.
*/
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
clear_page_dirty_for_io(page);
SetPageError(page);
}
ClearPageChecked(page);
unlock_page(page);
put_page(page);
kfree(fixup);
extent_changeset_free(data_reserved);
/*
* As a precaution, do a delayed iput in case it would be the last iput
* that could need flushing space. Recursing back to fixup worker would
* deadlock.
*/
btrfs_add_delayed_iput(inode);
}
/*
......@@ -2291,6 +2346,13 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
if (TestClearPagePrivate2(page))
return 0;
/*
* PageChecked is set below when we create a fixup worker for this page,
* don't try to create another one if we're already PageChecked()
*
* The extent_io writepage code will redirty the page if we send back
* EAGAIN.
*/
if (PageChecked(page))
return -EAGAIN;
......@@ -2298,12 +2360,21 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
if (!fixup)
return -EAGAIN;
/*
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
* takes place outside of the page lock, and we can't trust
* page->mapping outside of the page lock.
*/
ihold(inode);
SetPageChecked(page);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
fixup->inode = inode;
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
return -EBUSY;
return -EAGAIN;
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
......
......@@ -1269,7 +1269,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
* destination of the stream.
*/
if (ino == bctx->cur_objectid &&
offset >= bctx->sctx->cur_inode_next_write_offset)
offset + bctx->extent_len >
bctx->sctx->cur_inode_next_write_offset)
return 0;
}
......
......@@ -159,9 +159,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
return (global->size << 1);
}
static int can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
......@@ -226,7 +226,8 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
/* Check and see if our ticket can be satisified now. */
if ((used + ticket->bytes <= space_info->total_bytes) ||
can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
......@@ -639,13 +640,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
return to_reclaim;
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
if (can_overcommit(fs_info, space_info, to_reclaim,
BTRFS_RESERVE_FLUSH_ALL))
if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
BTRFS_RESERVE_FLUSH_ALL))
return 0;
used = btrfs_space_info_used(space_info, true);
if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
......@@ -1004,7 +1006,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
*/
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
ret = 0;
......
......@@ -127,6 +127,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
struct btrfs_fs_info *fs_info,
......
......@@ -2135,7 +2135,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
thresh = SZ_4M;
if (!mixed && total_free_meta - thresh < block_rsv->size)
/*
* We only want to claim there's no available space if we can no longer
* allocate chunks for our metadata profile and our global reserve will
* not fit in the free metadata space. If we aren't ->full then we
* still can allocate chunks and thus are fine using the currently
* calculated f_bavail.
*/
if (!mixed && block_rsv->space_info->full &&
total_free_meta - thresh < block_rsv->size)
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
......
......@@ -142,7 +142,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
spin_lock_init(&fs_info->qgroup_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
mutex_init(&fs_info->qgroup_rescan_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
......
......@@ -441,8 +441,17 @@ static int test_find_first_clear_extent_bit(void)
int ret = -EINVAL;
test_msg("running find_first_clear_extent_bit test");
extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
/* Test correct handling of empty tree */
find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
if (start != 0 || end != -1) {
test_err(
"error getting a range from completely empty tree: start %llu end %llu",
start, end);
goto out;
}
/*
* Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
* 4M-32M
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment