Commit 3ee56a58 authored by Filipe Manana, committed by David Sterba

btrfs: reserve space for delayed refs on a per ref basis

Currently when reserving space for delayed refs we do it on a per ref head
basis. This is generally enough because most back refs for an extent end
up being inlined in the extent item - with the default leaf size of 16K we
can have at most 33 inline back refs (this is calculated by the macro
BTRFS_MAX_EXTENT_ITEM_SIZE()). The number of bytes reserved for each ref
head is given by btrfs_calc_delayed_ref_bytes(), which basically
corresponds to a single path for insertion into the extent tree plus
another path for insertion into the free space tree if it's enabled.
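
To make the per-ref-head reservation concrete, below is a minimal
user-space sketch of that math (not the kernel code), assuming the usual
worst case of one fully COWed path per inserted item, i.e.
nodesize * BTRFS_MAX_LEVEL * 2 bytes; the struct and helper names are
illustrative only, the real logic lives in btrfs_calc_delayed_ref_bytes()
and btrfs_calc_insert_metadata_size():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8	/* maximum height of a btrfs b-tree */

struct fs_info {
	uint32_t nodesize;	/* metadata node/leaf size, e.g. 16K */
	bool free_space_tree;	/* free space tree enabled? */
};

/* Worst case metadata needed to insert @num_items items: one fully COWed
 * path, with a possible split at every level, per item (assumption). */
static uint64_t calc_insert_metadata_size(const struct fs_info *fs,
					  unsigned int num_items)
{
	return (uint64_t)fs->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

/* One insertion path into the extent tree, plus another path for the
 * free space tree when it is enabled. */
static uint64_t calc_delayed_ref_bytes(const struct fs_info *fs,
				       unsigned int num_refs)
{
	uint64_t bytes = calc_insert_metadata_size(fs, num_refs);

	if (fs->free_space_tree)
		bytes *= 2;
	return bytes;
}

int main(void)
{
	const struct fs_info fs = { .nodesize = 16 * 1024,
				    .free_space_tree = true };

	/* Prints 524288 (512K) per reserved unit with these assumptions. */
	printf("bytes per unit: %llu\n",
	       (unsigned long long)calc_delayed_ref_bytes(&fs, 1));
	return 0;
}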

However, if we have reached the limit of inline refs or we have a mix of
inline and non-inline refs, then we will need to insert a non-inline ref
and update the existing extent item to bump the total number of
references for the extent. This implies we need reserved space for two
insertion paths in the extent tree, but we only reserved for one path.
The extent item and the non-inline ref item may be located in different
leaves, and even if they are located in the same leaf, the extent
buffers in the btree path may have been written (due to memory pressure,
for example) after updating the extent item and before inserting the
non-inline ref item, in which case we need to COW the entire path again.
Since we have not reserved enough space for that in the delayed refs
block reserve, we end up using the global block reserve.
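
To put a rough number on the shortfall (under the same illustrative
assumptions as the sketch above): one worst-case insertion path is
16K * 8 * 2 = 256K, so a delayed ref that ends up needing a second
extent tree path can require up to ~256K more metadata space than what
was reserved for its ref head, and that difference has to come out of
the global block reserve.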

If the fs has no unallocated space left to allocate a new metadata block
group, and the available space in the existing metadata block groups is
close to the maximum size of the global block reserve (512M), we may end
up consuming so much of the free metadata space that we can not commit
any future transaction: the commit will fail with -ENOSPC when trying to
allocate an extent for some COW operation (running delayed refs
generated by running delayed refs, or COWing the root tree's root node
at commit_cowonly_roots(), for example). Such a dramatic scenario can
happen if we have many delayed refs that require the insertion of
non-inline ref items, due to too many reflinks or snapshots. We also
have situations where we use the global block reserve because we could
not know in advance that we would need space to update some trees (block
group creation, for example). All of this adds up and increases the
chances of exhausting the global block reserve, making any future
transaction commit fail with -ENOSPC and turning the fs into RO mode, or
making the mount fail in case the mount needs to start and commit a
transaction, such as when there are orphans to clean up. Such a case was
reported and hit by someone running a SLE (SUSE Linux Enterprise)
distribution: the fs had no more unallocated space that could be used to
allocate a new metadata block group, and the available metadata space
was about 1.5M, not enough to commit a transaction to clean up an orphan
inode (or do relocation of data block groups that were far from being
full).

So reserve space for delayed refs on a per-ref basis rather than per ref
head, as we may need to COW multiple extent tree paths due to non-inline
ref items.
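
The diff below implements this by charging the delayed refs rsv per ref
node (insert_delayed_ref() now bumps trans->delayed_ref_updates) and
releasing one unit whenever a ref is run, merged away or dropped. As a
toy user-space model of that accounting (illustrative only, none of
these names exist in the kernel):

#include <stdio.h>

struct refs_rsv {
	unsigned long units;	/* 1 unit == btrfs_calc_delayed_ref_bytes(fs, 1) */
};

static void add_ref(struct refs_rsv *rsv)
{
	rsv->units++;	/* mirrors trans->delayed_ref_updates++ in insert_delayed_ref() */
}

static void run_or_drop_ref(struct refs_rsv *rsv)
{
	rsv->units--;	/* mirrors the per-ref release added in drop_delayed_ref() */
}

int main(void)
{
	struct refs_rsv rsv = { 0 };
	int i;

	/* An extent gaining three back refs (e.g. from reflinks) now
	 * reserves three units instead of a single per-head unit. */
	for (i = 0; i < 3; i++)
		add_ref(&rsv);
	printf("units reserved: %lu\n", rsv.units);

	for (i = 0; i < 3; i++)
		run_or_drop_ref(&rsv);
	printf("units after running: %lu\n", rsv.units);
	return 0;
}
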
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 8a526c44
@@ -422,7 +422,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
return 0;
}
static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
@@ -433,9 +434,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
btrfs_delayed_refs_rsv_release(fs_info, 1);
}
static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
static bool merge_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref,
u64 seq)
@@ -464,10 +467,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
mod = -next->ref_mod;
}
drop_delayed_ref(delayed_refs, head, next);
drop_delayed_ref(fs_info, delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
drop_delayed_ref(delayed_refs, head, ref);
drop_delayed_ref(fs_info, delayed_refs, head, ref);
done = true;
} else {
/*
@@ -505,7 +508,7 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
if (merge_ref(delayed_refs, head, ref, seq))
if (merge_ref(fs_info, delayed_refs, head, ref, seq))
goto again;
}
}
@@ -584,10 +587,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
* Return true if the ref was merged into an existing one (and therefore can be
* freed by the caller).
*/
static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
struct btrfs_delayed_ref_node *exist;
int mod;
@@ -598,6 +602,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
trans->delayed_ref_updates++;
return false;
}
@@ -626,7 +631,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
/* remove existing tail if its ref_mod is zero */
if (exist->ref_mod == 0)
drop_delayed_ref(root, href, exist);
drop_delayed_ref(trans->fs_info, root, href, exist);
spin_unlock(&href->lock);
return true;
}
@@ -695,6 +700,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
/*
* If we are going to from a positive ref mod to a negative or vice
* versa we need to make sure to adjust pending_csums accordingly.
* We reserve bytes for csum deletion when adding or updating a ref head
* see add_delayed_ref_head() for more details.
*/
if (existing->is_data) {
u64 csum_leaves =
@@ -819,6 +826,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
/*
* We reserve the amount of bytes needed to delete csums when
* adding the ref head and not when adding individual drop refs
* since the csum items are deleted only after running the last
* delayed drop ref (the data extent's ref count drops to 0).
*/
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
@@ -828,7 +841,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -958,7 +970,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1050,7 +1062,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -4563,6 +4563,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
btrfs_delayed_refs_rsv_release(fs_info, 1);
}
if (head->must_insert_reserved)
pin_bytes = true;
@@ -1819,22 +1819,24 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
int nr_items = 1; /* Dropping this ref head update. */
/*
* We had csum deletions accounted for in our delayed refs rsv, we need
* to drop the csum leaves for this update from our delayed_refs_rsv.
*/
if (head->total_ref_mod < 0 && head->is_data) {
int nr_items;
spin_lock(&delayed_refs->lock);
delayed_refs->pending_csums -= head->num_bytes;
spin_unlock(&delayed_refs->lock);
nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
}
nr_items = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
return btrfs_calc_delayed_ref_bytes(fs_info, nr_items);
}
return 0;
}
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
@@ -1884,7 +1886,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
*bytes_released = btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
@@ -1926,7 +1928,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
}
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref)
struct btrfs_delayed_ref_head *locked_ref,
u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1982,7 +1985,8 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
ret = run_one_delayed_ref(trans, ref, extent_op,
must_insert_reserved);
btrfs_delayed_refs_rsv_release(fs_info, 1);
*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
unselect_delayed_ref_head(delayed_refs, locked_ref);
@@ -2048,7 +2052,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
@@ -2056,14 +2060,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
*/
return ret;
} else if (!ret) {
u64 bytes_released = 0;
/*
* Success, perform the usual cleanup of a processed
* head
*/
ret = cleanup_ref_head(trans, locked_ref, &bytes_released);
bytes_processed += bytes_released;
ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
if (ret > 0 ) {
/* We dropped our lock, we need to loop. */
ret = 0;