Commit d7df2c79 authored by Josef Bacik, committed by Chris Mason

Btrfs: attach delayed ref updates to delayed ref heads

Currently we have two rb-trees, one for delayed ref heads and one for all of the
delayed refs, including the delayed ref heads.  When we process the delayed refs
we have to hold onto the delayed ref lock for all of the selecting and merging
and such, which results in quite a bit of lock contention.  This was solved by
having a waitqueue and only one flusher at a time; however, this hurts if we get
a lot of delayed refs queued up.

So instead just have an rb tree for the delayed ref heads, and then attach the
delayed ref updates to an rb tree that is per delayed ref head.  Then we only
need to take the delayed ref lock when adding new delayed refs and when
selecting a delayed ref head to process; the rest of the time we deal with a
per delayed ref head lock, which will be much less contentious.

The locking rules for this get a little more complicated since we have to lock
up to 3 things to properly process delayed refs, but I will address that problem
later.  For now this passes all of xfstests and my overnight stress tests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
parent 5039eddc
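
Before the hunks, a minimal sketch of the data-structure layout this patch ends up with; the field names come from the delayed-ref.h hunk below, while the comments and the pared-down member lists are editorial illustration rather than the complete kernel definitions:

	#include <linux/rbtree.h>
	#include <linux/mutex.h>
	#include <linux/spinlock.h>
	#include <linux/atomic.h>

	/* One head per bytenr with pending updates; heads are the only thing
	 * left in the transaction-wide rb-tree. */
	struct btrfs_delayed_ref_head {
		struct mutex mutex;		/* held while this head's refs are run */
		spinlock_t lock;		/* protects ref_root below */
		struct rb_root ref_root;	/* delayed ref updates attached to this head */
		struct rb_node href_node;	/* linkage into delayed_refs->href_root */
		unsigned int processing:1;	/* set while a worker owns this head */
		/* ... remaining members unchanged ... */
	};

	struct btrfs_delayed_ref_root {
		struct rb_root href_root;	/* rb-tree of delayed ref heads only */
		atomic_t num_entries;		/* queued updates; atomic because it is
						 * now changed under per-head locks */
		/* ... remaining members unchanged ... */
	};
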
@@ -538,14 +538,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 	if (extent_op && extent_op->update_key)
 		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
 
-	while ((n = rb_prev(n))) {
+	spin_lock(&head->lock);
+	n = rb_first(&head->ref_root);
+	while (n) {
 		struct btrfs_delayed_ref_node *node;
 		node = rb_entry(n, struct btrfs_delayed_ref_node,
 				rb_node);
-		if (node->bytenr != head->node.bytenr)
-			break;
-		WARN_ON(node->is_head);
-
+		n = rb_next(n);
 		if (node->seq > seq)
 			continue;
@@ -612,10 +611,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			WARN_ON(1);
 		}
 		if (ret)
-			return ret;
+			break;
 	}
-
-	return 0;
+	spin_unlock(&head->lock);
+	return ret;
 }
 
 /*
@@ -882,15 +881,15 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			btrfs_put_delayed_ref(&head->node);
 			goto again;
 		}
+		spin_unlock(&delayed_refs->lock);
 		ret = __add_delayed_refs(head, time_seq,
 					 &prefs_delayed);
 		mutex_unlock(&head->mutex);
-		if (ret) {
-			spin_unlock(&delayed_refs->lock);
+		if (ret)
 			goto out;
-		}
+	} else {
+		spin_unlock(&delayed_refs->lock);
 	}
-	spin_unlock(&delayed_refs->lock);
 }
 
 if (path->slots[0]) {
(The diff for this file is collapsed in the original view and is not shown.)
@@ -81,7 +81,8 @@ struct btrfs_delayed_ref_head {
 	 */
 	struct mutex mutex;
 
-	struct list_head cluster;
+	spinlock_t lock;
+	struct rb_root ref_root;
 
 	struct rb_node href_node;
@@ -100,6 +101,7 @@ struct btrfs_delayed_ref_head {
 	 */
 	unsigned int must_insert_reserved:1;
 	unsigned int is_data:1;
+	unsigned int processing:1;
 };
 
 struct btrfs_delayed_tree_ref {
@@ -118,8 +120,6 @@ struct btrfs_delayed_data_ref {
 };
 
 struct btrfs_delayed_ref_root {
-	struct rb_root root;
-
 	/* head ref rbtree */
 	struct rb_root href_root;
@@ -129,7 +129,7 @@ struct btrfs_delayed_ref_root {
 	/* how many delayed ref updates we've queued, used by the
 	 * throttling code
 	 */
-	unsigned long num_entries;
+	atomic_t num_entries;
 
 	/* total number of head nodes in tree */
 	unsigned long num_heads;
@@ -137,15 +137,6 @@ struct btrfs_delayed_ref_root {
 	/* total number of head nodes ready for processing */
 	unsigned long num_heads_ready;
 
-	/*
-	 * bumped when someone is making progress on the delayed
-	 * refs, so that other procs know they are just adding to
-	 * contention intead of helping
-	 */
-	atomic_t procs_running_refs;
-	atomic_t ref_seq;
-	wait_queue_head_t wait;
-
 	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
@@ -231,9 +222,9 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
 	mutex_unlock(&head->mutex);
 }
 
-int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
-			   struct list_head *cluster, u64 search_start);
-void btrfs_release_ref_cluster(struct list_head *cluster);
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans);
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
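
The header hunk above drops the cluster API (btrfs_find_ref_cluster()/btrfs_release_ref_cluster()) in favor of handing out one head at a time through btrfs_select_ref_head(). The actual consumer sits in the collapsed diffs, so the following is only a hypothetical sketch of the lock order the commit message describes (tree-level lock to pick a head, the head's mutex to own it, the head's spinlock to walk its ref_root); the helper name and its structure are illustrative, not code from this patch:

	/* Hypothetical helper; illustrates the intended lock order only. */
	static int process_one_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_root *delayed_refs)
	{
		struct btrfs_delayed_ref_head *head;

		spin_lock(&delayed_refs->lock);
		head = btrfs_select_ref_head(trans);	/* picks an unprocessed head */
		if (!head) {
			spin_unlock(&delayed_refs->lock);
			return 0;
		}
		spin_unlock(&delayed_refs->lock);	/* done with the tree-level lock */

		mutex_lock(&head->mutex);		/* own the head while running it */
		spin_lock(&head->lock);
		/* ... walk head->ref_root and run the queued updates ... */
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
		return 0;
	}
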
@@ -3801,58 +3801,55 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 	delayed_refs = &trans->delayed_refs;
 
 	spin_lock(&delayed_refs->lock);
-	if (delayed_refs->num_entries == 0) {
+	if (atomic_read(&delayed_refs->num_entries) == 0) {
 		spin_unlock(&delayed_refs->lock);
 		btrfs_info(root->fs_info, "delayed_refs has NO entry");
 		return ret;
 	}
 
-	while ((node = rb_first(&delayed_refs->root)) != NULL) {
-		struct btrfs_delayed_ref_head *head = NULL;
+	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+		struct btrfs_delayed_ref_head *head;
 		bool pin_bytes = false;
 
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		atomic_set(&ref->refs, 1);
-		if (btrfs_delayed_ref_is_head(ref)) {
-
-			head = btrfs_delayed_node_to_head(ref);
-			if (!mutex_trylock(&head->mutex)) {
-				atomic_inc(&ref->refs);
-				spin_unlock(&delayed_refs->lock);
-
-				/* Need to wait for the delayed ref to run */
-				mutex_lock(&head->mutex);
-				mutex_unlock(&head->mutex);
-				btrfs_put_delayed_ref(ref);
-
-				spin_lock(&delayed_refs->lock);
-				continue;
-			}
-
-			if (head->must_insert_reserved)
-				pin_bytes = true;
-			btrfs_free_delayed_extent_op(head->extent_op);
-			delayed_refs->num_heads--;
-			if (list_empty(&head->cluster))
-				delayed_refs->num_heads_ready--;
-			list_del_init(&head->cluster);
-		}
-
-		ref->in_tree = 0;
-		rb_erase(&ref->rb_node, &delayed_refs->root);
-		if (head)
-			rb_erase(&head->href_node, &delayed_refs->href_root);
-		delayed_refs->num_entries--;
-		spin_unlock(&delayed_refs->lock);
-		if (head) {
-			if (pin_bytes)
-				btrfs_pin_extent(root, ref->bytenr,
-						 ref->num_bytes, 1);
-			mutex_unlock(&head->mutex);
-		}
-		btrfs_put_delayed_ref(ref);
-
+		head = rb_entry(node, struct btrfs_delayed_ref_head,
+				href_node);
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			spin_lock(&delayed_refs->lock);
+			continue;
+		}
+		spin_lock(&head->lock);
+		while ((node = rb_first(&head->ref_root)) != NULL) {
+			ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				       rb_node);
+			ref->in_tree = 0;
+			rb_erase(&ref->rb_node, &head->ref_root);
+			atomic_dec(&delayed_refs->num_entries);
+			btrfs_put_delayed_ref(ref);
+			cond_resched_lock(&head->lock);
+		}
+		if (head->must_insert_reserved)
+			pin_bytes = true;
+		btrfs_free_delayed_extent_op(head->extent_op);
+		delayed_refs->num_heads--;
+		if (head->processing == 0)
+			delayed_refs->num_heads_ready--;
+		atomic_dec(&delayed_refs->num_entries);
+		head->node.in_tree = 0;
+		rb_erase(&head->href_node, &delayed_refs->href_root);
+		spin_unlock(&head->lock);
+		spin_unlock(&delayed_refs->lock);
+		mutex_unlock(&head->mutex);
+
+		if (pin_bytes)
+			btrfs_pin_extent(root, head->node.bytenr,
+					 head->node.num_bytes, 1);
+		btrfs_put_delayed_ref(&head->node);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
(The diff for this file is collapsed in the original view and is not shown.)
@@ -62,7 +62,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 	WARN_ON(atomic_read(&transaction->use_count) == 0);
 	if (atomic_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
-		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.root));
 		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
 		while (!list_empty(&transaction->pending_chunks)) {
 			struct extent_map *em;
@@ -184,9 +183,8 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 	atomic_set(&cur_trans->use_count, 2);
 	cur_trans->start_time = get_seconds();
 
-	cur_trans->delayed_refs.root = RB_ROOT;
 	cur_trans->delayed_refs.href_root = RB_ROOT;
-	cur_trans->delayed_refs.num_entries = 0;
+	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
 	cur_trans->delayed_refs.num_heads_ready = 0;
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
@@ -206,9 +204,6 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 	atomic64_set(&fs_info->tree_mod_seq, 0);
 
 	spin_lock_init(&cur_trans->delayed_refs.lock);
-	atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
-	atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
-	init_waitqueue_head(&cur_trans->delayed_refs.wait);
 
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 	INIT_LIST_HEAD(&cur_trans->ordered_operations);