Commit 0f9dd46c authored by Josef Bacik, committed by Chris Mason

Btrfs: free space accounting redo

1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size.  The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing.  If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible.  When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.

2) remove the extent_io_tree that tracked the block group cache from fs_info and
replace it with an rb-tree that tracks the block group cache via offset.  Also
add a per space_info list that tracks the block group cache for the particular
space so we can look up related block groups easily.

3) cleaned up the allocation code to make it a little easier to read and a
little less complicated.  Basically there are 3 steps, first look from our
provided hint.  If we couldn't find from that given hint, start back at our
original search start and look for space from there.  If that fails try to
allocate space if we can and start looking again.  If not we're screwed and need
to start over again.

4) small fixes.  there were some issues in volumes.c where we wouldn't allocate
the rest of the disk.  fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space.  Generally with data allocations we don't track
where we last allocated from, so every time we did a data allocation we'd search
through every block group that we have looking for free space.  While searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups.  The alloc_hint has fixed
this slight degradation and made things semi-normal.

There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space.  This only happens with metadata
allocations, and only when we are almost full.  So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall.  I'm
still tracking it down, but until then this seems to be pretty stable and makes a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent ef8bbdfe
......@@ -7,7 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
transaction.o bit-radix.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
ref-cache.o export.o tree-log.o acl.o
ref-cache.o export.o tree-log.o acl.o free-space-cache.o
else
# Normal Makefile
......
......@@ -2725,9 +2725,8 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
total_size = total_data + (nr * sizeof(struct btrfs_item));
ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
if (ret == 0) {
if (ret == 0)
return -EEXIST;
}
if (ret < 0)
goto out;
......
......@@ -483,7 +483,6 @@ struct btrfs_csum_item {
#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
struct btrfs_block_group_item {
__le64 used;
__le64 chunk_objectid;
......@@ -498,17 +497,40 @@ struct btrfs_space_info {
int full;
int force_alloc;
struct list_head list;
/* for block groups in our same type */
struct list_head block_groups;
spinlock_t lock;
};
struct btrfs_free_space {
struct rb_node bytes_index;
struct rb_node offset_index;
u64 offset;
u64 bytes;
};
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
struct btrfs_space_info *space_info;
spinlock_t lock;
u64 pinned;
u64 flags;
int cached;
int ro;
int dirty;
struct btrfs_space_info *space_info;
/* free space cache stuff */
struct rb_root free_space_bytes;
struct rb_root free_space_offset;
/* block group cache stuff */
struct rb_node cache_node;
/* for block groups in the same raid type */
struct list_head list;
};
struct btrfs_device;
......@@ -525,8 +547,10 @@ struct btrfs_fs_info {
struct btrfs_root *log_root_tree;
struct radix_tree_root fs_roots_radix;
struct extent_io_tree free_space_cache;
struct extent_io_tree block_group_cache;
/* block group cache stuff */
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
struct extent_io_tree pinned_extents;
struct extent_io_tree pending_del;
struct extent_io_tree extent_ins;
......@@ -1814,4 +1838,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
int btrfs_check_acl(struct inode *inode, int mask);
int btrfs_init_acl(struct inode *inode, struct inode *dir);
int btrfs_acl_chmod(struct inode *inode);
/* free-space-cache.c */
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
*block_group);
struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
*block_group, u64 offset,
u64 bytes);
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes);
u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
#endif
......@@ -1410,10 +1410,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
extent_io_tree_init(&fs_info->free_space_cache,
fs_info->btree_inode->i_mapping, GFP_NOFS);
extent_io_tree_init(&fs_info->block_group_cache,
fs_info->btree_inode->i_mapping, GFP_NOFS);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree.rb_node = NULL;
extent_io_tree_init(&fs_info->pinned_extents,
fs_info->btree_inode->i_mapping, GFP_NOFS);
extent_io_tree_init(&fs_info->pending_del,
......
This diff is collapsed.
......@@ -2634,6 +2634,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (eb) {
atomic_inc(&eb->refs);
spin_unlock(&tree->buffer_lock);
mark_page_accessed(eb->first_page);
return eb;
}
spin_unlock(&tree->buffer_lock);
......@@ -2713,6 +2714,9 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
atomic_inc(&eb->refs);
spin_unlock(&tree->buffer_lock);
if (eb)
mark_page_accessed(eb->first_page);
return eb;
}
EXPORT_SYMBOL(find_extent_buffer);
......
This diff is collapsed.
......@@ -141,7 +141,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
while(num_bytes > 0) {
cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
root->sectorsize, 0, 0,
root->sectorsize, 0, alloc_hint,
(u64)-1, &ins, 1);
if (ret) {
WARN_ON(1);
......@@ -558,7 +558,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
trans->transid, inode->i_ino,
ordered_extent->file_offset, &ins);
BUG_ON(ret);
mutex_lock(&BTRFS_I(inode)->extent_mutex);
ret = btrfs_drop_extents(trans, root, inode,
......
......@@ -64,8 +64,8 @@ static void lock_chunks(struct btrfs_root *root)
static void unlock_chunks(struct btrfs_root *root)
{
mutex_unlock(&root->fs_info->alloc_mutex);
mutex_unlock(&root->fs_info->chunk_mutex);
mutex_unlock(&root->fs_info->alloc_mutex);
}
int btrfs_cleanup_fs_uuids(void)
......@@ -1668,8 +1668,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
else
min_free = calc_size;
/* we add 1MB because we never use the first 1MB of the device */
min_free += 1024 * 1024;
/*
* we add 1MB because we never use the first 1MB of the device, unless
* we've looped, then we are likely allocating the maximum amount of
* space left already
*/
if (!looped)
min_free += 1024 * 1024;
/* build a private list of devices we will allocate from */
while(index < num_stripes) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment