Commit 6140ba8a authored by David Sterba

btrfs: switch btrfs_root::delayed_nodes_tree to xarray from radix-tree

The radix-tree has been superseded by the xarray
(https://lwn.net/Articles/745073). This patch converts
btrfs_root::delayed_nodes_tree; the APIs are used in a simple way.

The first idea was to use xa_insert(), but that would require a GFP_ATOMIC
allocation, which we want to avoid if possible. Instead, the preload
mechanism of the radix-tree can be emulated within the xarray API:

- xa_reserve() with GFP_NOFS outside of the lock; the reserved entry
  is inserted atomically at most once

- xa_store() under the lock; in case something races in, we can detect
  that because xa_load() returns a valid pointer

All uses of xa_load() must check for a valid pointer in case they run
between the xa_reserve() and the xa_store(); this is handled in
btrfs_get_delayed_node().

Otherwise the functionality is equivalent: the xarray is built on top of
the radix-tree internally, so there should be no performance difference.
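
As an illustration of the scheme above, here is a minimal sketch of the
reserve-then-store pattern with the xarray API. The struct, lock and function
names are made up for the example and are not the btrfs code; only
xa_reserve(), xa_load() and xa_store() are the real API.

#include <linux/xarray.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/err.h>

struct item {
	unsigned long id;
};

/* Hypothetical lookup-or-create helper following the pattern above. */
static struct item *get_or_create_item(struct xarray *xa, spinlock_t *lock,
				       unsigned long index)
{
	struct item *item, *existing;

	item = kzalloc(sizeof(*item), GFP_NOFS);
	if (!item)
		return ERR_PTR(-ENOMEM);
	item->id = index;

	/*
	 * Reserve the slot with a sleeping allocation outside the lock.
	 * From now on xa_load() may return NULL for this index even though
	 * the slot exists, so lookups must tolerate a NULL result.
	 */
	if (xa_reserve(xa, index, GFP_NOFS) == -ENOMEM) {
		kfree(item);
		return ERR_PTR(-ENOMEM);
	}

	spin_lock(lock);
	existing = xa_load(xa, index);
	if (existing) {
		/* Somebody else stored an entry first, use theirs. */
		spin_unlock(lock);
		kfree(item);
		return existing;
	}
	/* The slot was reserved above, so no allocation should happen here. */
	xa_store(xa, index, item, GFP_ATOMIC);
	spin_unlock(lock);

	return item;
}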

The patch continues the efforts started in 253bf575 ("btrfs: turn
delayed_nodes_tree into an XArray") and fixes the locking and GFP-flag
problems that led to the revert in 088aea3b ("Revert "btrfs: turn
delayed_nodes_tree into an XArray"").
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent eefaf0a1
@@ -227,10 +227,10 @@ struct btrfs_root {
 	struct rb_root inode_tree;
 	/*
-	 * radix tree that keeps track of delayed nodes of every inode,
-	 * protected by inode_lock
+	 * Xarray that keeps track of delayed nodes of every inode, protected
+	 * by @inode_lock.
 	 */
-	struct radix_tree_root delayed_nodes_tree;
+	struct xarray delayed_nodes;
 	/*
 	 * right now this just gets used so that a root has its own devid
 	 * for stat. It may be used for more later
...
@@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 	}

 	spin_lock(&root->inode_lock);
-	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+	node = xa_load(&root->delayed_nodes, ino);

 	if (node) {
 		if (btrfs_inode->delayed_node) {
@@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(

 		/*
 		 * It's possible that we're racing into the middle of removing
-		 * this node from the radix tree. In this case, the refcount
+		 * this node from the xarray. In this case, the refcount
 		 * was zero and it should never go back to one. Just return
-		 * NULL like it was never in the radix at all; our release
+		 * NULL like it was never in the xarray at all; our release
 		 * function is in the process of removing it.
 		 *
 		 * Some implementations of refcount_inc refuse to bump the
@@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 		 * here, refcount_inc() may decide to just WARN_ONCE() instead
 		 * of actually bumping the refcount.
 		 *
-		 * If this node is properly in the radix, we want to bump the
+		 * If this node is properly in the xarray, we want to bump the
 		 * refcount twice, once for the inode and once for this get
 		 * operation.
 		 */
@@ -120,6 +120,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 	struct btrfs_root *root = btrfs_inode->root;
 	u64 ino = btrfs_ino(btrfs_inode);
 	int ret;
+	void *ptr;

 again:
 	node = btrfs_get_delayed_node(btrfs_inode);
@@ -131,26 +132,30 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 		return ERR_PTR(-ENOMEM);
 	btrfs_init_delayed_node(node, root, ino);

-	/* cached in the btrfs inode and can be accessed */
+	/* Cached in the inode and can be accessed. */
 	refcount_set(&node->refs, 2);

-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret) {
+	/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
+	ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+	if (ret == -ENOMEM) {
 		kmem_cache_free(delayed_node_cache, node);
-		return ERR_PTR(ret);
+		return ERR_PTR(-ENOMEM);
 	}
-
 	spin_lock(&root->inode_lock);
-	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
-	if (ret == -EEXIST) {
+	ptr = xa_load(&root->delayed_nodes, ino);
+	if (ptr) {
+		/* Somebody inserted it, go back and read it. */
 		spin_unlock(&root->inode_lock);
 		kmem_cache_free(delayed_node_cache, node);
-		radix_tree_preload_end();
+		node = NULL;
 		goto again;
 	}
+	ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+	ASSERT(xa_err(ptr) != -EINVAL);
+	ASSERT(xa_err(ptr) != -ENOMEM);
+	ASSERT(ptr == NULL);
 	btrfs_inode->delayed_node = node;
 	spin_unlock(&root->inode_lock);
-	radix_tree_preload_end();

 	return node;
 }
@@ -269,8 +274,7 @@ static void __btrfs_release_delayed_node(
 		 * back up. We can delete it now.
 		 */
 		ASSERT(refcount_read(&delayed_node->refs) == 0);
-		radix_tree_delete(&root->delayed_nodes_tree,
-				  delayed_node->inode_id);
+		xa_erase(&root->delayed_nodes, delayed_node->inode_id);
 		spin_unlock(&root->inode_lock);
 		kmem_cache_free(delayed_node_cache, delayed_node);
 	}
@@ -2038,34 +2042,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)

 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 {
-	u64 inode_id = 0;
+	unsigned long index = 0;
 	struct btrfs_delayed_node *delayed_nodes[8];
-	int i, n;

 	while (1) {
+		struct btrfs_delayed_node *node;
+		int count;
+
 		spin_lock(&root->inode_lock);
-		n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
-					   (void **)delayed_nodes, inode_id,
-					   ARRAY_SIZE(delayed_nodes));
-		if (!n) {
+		if (xa_empty(&root->delayed_nodes)) {
 			spin_unlock(&root->inode_lock);
-			break;
+			return;
 		}

-		inode_id = delayed_nodes[n - 1]->inode_id + 1;
-		for (i = 0; i < n; i++) {
+		count = 0;
+		xa_for_each_start(&root->delayed_nodes, index, node, index) {
 			/*
 			 * Don't increase refs in case the node is dead and
 			 * about to be removed from the tree in the loop below
 			 */
-			if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
-				delayed_nodes[i] = NULL;
+			if (refcount_inc_not_zero(&node->refs)) {
+				delayed_nodes[count] = node;
+				count++;
+			}
+			if (count >= ARRAY_SIZE(delayed_nodes))
+				break;
 		}
 		spin_unlock(&root->inode_lock);
+		index++;

-		for (i = 0; i < n; i++) {
-			if (!delayed_nodes[i])
-				continue;
+		for (int i = 0; i < count; i++) {
 			__btrfs_kill_delayed_node(delayed_nodes[i]);
 			btrfs_release_delayed_node(delayed_nodes[i]);
 		}
...
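
For context on the batched iteration above, here is a minimal sketch of the
xa_for_each_start() pattern. This is a standalone example, not btrfs code;
process_entry() is a hypothetical callback that is assumed to remove the entry
from the xarray (as the btrfs release path does), otherwise the outer loop
would never see the array become empty. The loop macro leaves the cursor at
the index of the last entry it visited, so after breaking out of a batch the
caller steps the index forward by one to avoid revisiting that entry.

#include <linux/xarray.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>

/* Hypothetical per-entry work done outside the lock; drops the entry. */
static void process_entry(void *entry);

/* Drain an xarray in batches of 8, holding the lock only while scanning. */
static void drain_in_batches(struct xarray *xa, spinlock_t *lock)
{
	unsigned long index = 0;
	void *batch[8];
	void *entry;
	int count;

	while (1) {
		spin_lock(lock);
		if (xa_empty(xa)) {
			spin_unlock(lock);
			return;
		}

		count = 0;
		/* Resume the scan at 'index'; the macro updates 'index' as it goes. */
		xa_for_each_start(xa, index, entry, index) {
			batch[count++] = entry;
			if (count >= ARRAY_SIZE(batch))
				break;
		}
		spin_unlock(lock);
		/* 'index' still points at the last entry seen, step past it. */
		index++;

		for (int i = 0; i < count; i++)
			process_entry(batch[i]);
	}
}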
@@ -655,7 +655,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->nr_delalloc_inodes = 0;
 	root->nr_ordered_extents = 0;
 	root->inode_tree = RB_ROOT;
-	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+	/* GFP flags are compatible with XA_FLAGS_*. */
+	xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);

 	btrfs_init_root_block_rsv(root);
...
@@ -3805,7 +3805,7 @@ static int btrfs_read_locked_inode(struct inode *inode,
 	 * cache.
 	 *
 	 * This is required for both inode re-read from disk and delayed inode
-	 * in delayed_nodes_tree.
+	 * in the delayed_nodes xarray.
 	 */
 	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
...