Commit 734db689 authored by Chris Mason, committed by Linus Torvalds

[PATCH] reiserfs: block allocator optimizations

From: <mason@suse.com>
From: <jeffm@suse.com>

The current reiserfs allocator pretty much allocates things sequentially
from the start of the disk.  It works very nicely for desktop loads, but
once you've got more than one proc doing I/O, data files can fragment badly.

One obvious solution is something like ext2's bitmap groups, which put
file data into different areas of the disk based on which subdirectory
they are in.  The problem with bitmap groups is that if you've got a
group of subdirectories, their contents will be spread out all over the
disk, leading to lots of seeks during a sequential read.

This allocator patch uses the packing locality to determine which bitmap
group to allocate from, but when you create a file it looks in the bitmaps
to see how 'full' that packing locality already is.  If it hasn't been
heavily used yet, the packing locality is inherited from the parent
directory, putting files in new subdirs close to the parent subdir;
otherwise it is the inode number of the parent directory, putting new
files far away from the parent subdir.
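
For illustration, the choice reduces to a threshold test on the bitmap
group that the parent's dirid hashes to.  The sketch below is a minimal
userspace rendering of that policy; toy_hash(), NR_BITMAPS and the
group_free_bits[] counters are illustrative stand-ins (the patch uses
keyed_hash() and the superblock's per-bitmap free_count, and returns the
packing in disk byte order, which is omitted here), while the 60%
threshold and the reserved group 0 follow the patch:

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BITMAP (4096 << 3)  /* one bit per block, 4K blocksize */
#define NR_BITMAPS      128          /* illustrative bitmap-group count */

static unsigned group_free_bits[NR_BITMAPS]; /* stand-in for free_count */

/* toy stand-in for reiserfs' keyed_hash() */
static uint32_t toy_hash(uint32_t id)
{
    id ^= id >> 16;
    id *= 0x45d9f3bu;
    id ^= id >> 16;
    return id;
}

/* map an id to a bitmap group, keeping group 0 reserved
 * (mirrors the patch's bmap_hash_id()) */
static unsigned hash_to_group(uint32_t id)
{
    unsigned bm;

    if (id <= 2)
        return 1;
    bm = toy_hash(id) % NR_BITMAPS;
    return bm ? bm : 1;
}

/* pick the packing locality for a new object under a directory with
 * key (dirid, objectid): inherit the dirid while its group still has
 * plenty of room, otherwise start a fresh locality at the directory's
 * own objectid */
static uint32_t choose_packing(uint32_t dirid, uint32_t objectid)
{
    unsigned bm = hash_to_group(dirid);

    /* "heavily used" = no more than 60% of the group's bits free */
    if (group_free_bits[bm] <= (unsigned)BITS_PER_BITMAP * 60 / 100)
        return objectid;
    return dirid;
}

int main(void)
{
    unsigned i;

    for (i = 0; i < NR_BITMAPS; i++)
        group_free_bits[i] = BITS_PER_BITMAP;   /* empty disk */
    printf("new file under (100,200) packs at %u\n",
           choose_packing(100, 200));           /* inherits 100 */
    group_free_bits[hash_to_group(100)] = 0;    /* group fills up */
    printf("after the group fills: %u\n",
           choose_packing(100, 200));           /* now 200 */
    return 0;
}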

The end result is fewer bitmap groups for the same working set.  For
example, one test data set created by 20 procs running in parallel has
6822 subdirs.  With vanilla reiserfs that would mean 6822 packing
localities; this patch turns that into 26.

This makes sequential reads of big directory trees more efficient, but
it also makes the btree more efficient in general.  Things end up sorted
better because groups of subdirs end up with similar keys in the btree,
instead of being spread out all over.

The bitmap grouping code tries to use the start of each bitmap group
for metadata, and offsets the data slightly.  The data and metadata
are still close together, but not completely intermixed like they are
in the default allocator.  The end result is that leaf nodes tend to be
close to each other, making metadata readahead more effective.
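
The offset is plain arithmetic on bit numbers.  Here is a hedged sketch
of the search start computed by the new dirid_groups hint; the function
name group_search_start() and the 4K blocksize in the example are
illustrative, but the half-block bump is the patch's metadata reserve,
assuming one bitmap bit per block:

#include <stdio.h>

/* sketch: bitmap group bm covers block numbers
 * [bm * bits, (bm + 1) * bits) with bits = blocksize * 8.  Data
 * allocations start blocksize/2 bits into the group; the front of
 * the group is left for metadata. */
static unsigned long group_search_start(unsigned bm, unsigned blocksize,
                                        int is_data)
{
    unsigned long bits_per_group = (unsigned long)blocksize << 3;
    unsigned long start = (unsigned long)bm * bits_per_group;

    if (is_data)
        start += blocksize / 2;      /* skip the metadata reserve */
    return start;
}

int main(void)
{
    /* group 3 on a 4K filesystem: metadata scans from block 98304,
     * data from block 100352 */
    printf("meta=%lu data=%lu\n",
           group_search_start(3, 4096, 0),
           group_search_start(3, 4096, 1));
    return 0;
}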

The old block allocator had the ability to enforce a minimum
allocation size but did not use it.  The allocator now tries a pass
looking for larger allocation chunks before falling back to the old
behaviour of taking any blocks it can find.
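
In outline, the retry logic has the shape sketched below.  scan_range(),
DISK_BLOCKS and allocate_blocks() are toy stand-ins (the real code uses
scan_bitmap() over three start/finish windows), but the idea matches the
patch: first demand the whole request as one contiguous run, then rescan
accepting any free block:

#include <stdio.h>

#define DISK_BLOCKS 64

static int used[DISK_BLOCKS];        /* toy disk: 1 = allocated */

/* toy scanner: find a run of at least min_run free blocks, grab up to
 * wanted blocks from it, and return how many were taken */
static int scan_range(int min_run, int wanted)
{
    int i;

    for (i = 0; i < DISK_BLOCKS; i++) {
        int run = 0;

        while (i + run < DISK_BLOCKS && !used[i + run])
            run++;
        if (run >= min_run) {
            int take = run < wanted ? run : wanted;
            int j;

            for (j = 0; j < take; j++)
                used[i + j] = 1;
            return take;
        }
        i += run;                    /* skip past this free run */
    }
    return 0;
}

/* the fallback shape from the patch: insist on one contiguous chunk
 * first, then retry taking any free blocks at all */
static int allocate_blocks(int wanted)
{
    int got = 0, min_run = wanted;

    while (got < wanted) {
        int n = scan_range(min_run, wanted - got);

        if (n > 0)
            got += n;
        else if (min_run > 1)
            min_run = 1;             /* big-chunk pass failed */
        else
            break;                   /* searched everything: no space */
    }
    return got;
}

int main(void)
{
    int i;

    for (i = 0; i < DISK_BLOCKS; i += 2)
        used[i] = 1;                 /* fragment the toy disk */
    printf("allocated %d of 4 blocks\n", allocate_blocks(4));
    return 0;
}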

The patch changes the defaults to:

mount -o alloc=skip_busy:dirid_groups:packing_groups

You can get back the old behaviour with mount -o alloc=skip_busy

mount -o alloc=dirid_groups turns on the bitmap groups
mount -o alloc=packing_groups turns on the packing locality reduction code
mount -o alloc=skip_busy:dirid_groups turns on both dirid_groups and
skip_busy

Finally, the patch adds mount -o alloc=oid_groups, which puts files into
bitmap groups based on a hash of their objectid.  This would be used for
databases or other situations where you have a limited number of very
large files.
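
As with dirid_groups, the target group is just a hash reduced modulo the
number of bitmaps.  A sketch, with toy_hash() and NR_BITMAPS again as
illustrative stand-ins for keyed_hash() and the superblock's bitmap
count (the patch additionally pins objects whose parent dirid is <= 2
near the start of the disk):

#include <stdint.h>
#include <stdio.h>

#define NR_BITMAPS 128               /* illustrative bitmap-group count */

/* toy stand-in for reiserfs' keyed_hash() */
static uint32_t toy_hash(uint32_t id)
{
    id ^= id >> 16;
    id *= 0x45d9f3bu;
    id ^= id >> 16;
    return id;
}

/* with alloc=oid_groups each file is hinted at the bitmap group its own
 * objectid hashes to, so a handful of huge files spread across groups
 * instead of piling into their parent directory's group */
static unsigned oid_group(uint32_t objectid)
{
    unsigned bm = toy_hash(objectid) % NR_BITMAPS;

    return bm ? bm : 1;              /* group 0 stays reserved */
}

int main(void)
{
    printf("oid 4000 -> group %u\n", oid_group(4000));
    printf("oid 4001 -> group %u\n", oid_group(4001));
    return 0;
}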

This command will tell you how many packing localities are actually in
use:

debugreiserfs -d /dev/xxx | grep '^|.*SD' | sed 's/^.....//' | awk '{print $1}' | sort -u | wc -l
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent fab177a4
@@ -30,6 +30,9 @@
#define _ALLOC_hashed_formatted_nodes 7
#define _ALLOC_old_way 8
#define _ALLOC_hundredth_slices 9
#define _ALLOC_dirid_groups 10
#define _ALLOC_oid_groups 11
#define _ALLOC_packing_groups 12
#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
@@ -150,11 +153,6 @@ static int scan_bitmap_block (struct reiserfs_transaction_handle *th,
__wait_on_buffer (bi->bh);
}
/* If we know that first zero bit is only one or first zero bit is
closer to the end of bitmap than our start pointer */
if (bi->first_zero_hint > *beg || bi->free_count == 1)
*beg = bi->first_zero_hint;
while (1) {
cont:
if (bi->free_count < min)
@@ -204,21 +202,12 @@ static int scan_bitmap_block (struct reiserfs_transaction_handle *th,
while (--i >= *beg)
reiserfs_test_and_clear_le_bit (i, bi->bh->b_data);
reiserfs_restore_prepared_buffer (s, bi->bh);
*beg = max(org, (int)bi->first_zero_hint);
*beg = org;
/* ... and search again in current block from beginning */
goto cont;
}
}
bi->free_count -= (end - *beg);
/* if search started from zero_hint bit, and zero hint have not
changed since, then we need to update first_zero_hint */
if ( bi->first_zero_hint >= *beg)
/* no point in looking for free bit if there is not any */
bi->first_zero_hint = (bi->free_count > 0 ) ?
reiserfs_find_next_zero_le_bit
((unsigned long*)(bi->bh->b_data), s->s_blocksize << 3, end) : (s->s_blocksize << 3);
journal_mark_dirty (th, s, bi->bh);
/* free block count calculation */
@@ -231,7 +220,52 @@ static int scan_bitmap_block (struct reiserfs_transaction_handle *th,
*beg = next;
}
}
}
static int bmap_hash_id(struct super_block *s, u32 id) {
char * hash_in = NULL;
unsigned long hash;
unsigned bm;
if (id <= 2) {
bm = 1;
} else {
hash_in = (char *)(&id);
hash = keyed_hash(hash_in, 4);
bm = hash % SB_BMAP_NR(s);
if (!bm)
bm = 1;
}
return bm;
}
/*
* hashes the id and then returns > 0 if the block group for the
* corresponding hash is full
*/
static inline int block_group_used(struct super_block *s, u32 id) {
int bm;
bm = bmap_hash_id(s, id);
if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100) ) {
return 0;
}
return 1;
}
/*
* the packing is returned in disk byte order
*/
u32 reiserfs_choose_packing(struct inode *dir) {
u32 packing;
if (TEST_OPTION(packing_groups, dir->i_sb)) {
if (block_group_used(dir->i_sb,le32_to_cpu(INODE_PKEY(dir)->k_dir_id)))
packing = INODE_PKEY(dir)->k_objectid;
else
packing = INODE_PKEY(dir)->k_dir_id;
} else
packing = INODE_PKEY(dir)->k_objectid;
return packing;
}
/* Tries to find contiguous zero bit window (given size) in given region of
* bitmap and place new blocks there. Returns number of allocated blocks. */
@@ -255,8 +289,18 @@ static int scan_bitmap (struct reiserfs_transaction_handle *th,
get_bit_address (s, *start, &bm, &off);
get_bit_address (s, finish, &end_bm, &end_off);
// With this option set first we try to find a bitmap that is at least 10%
// free, and if that fails, then we fall back to old whole bitmap scanning
/* When the bitmap is more than 10% free, anyone can allocate.
* When it's less than 10% free, only files that already use the
* bitmap are allowed. Once we pass 80% full, this restriction
* is lifted.
*
* We do this so that files that grow later still have space close to
* their original allocation. This improves locality, and presumably
* performance as a result.
*
* This is only an allocation policy and does not make up for getting a
* bad hint. Decent hinting must be implemented for this to work well.
*/
if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
for (;bm < end_bm; bm++, off = 0) {
if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
@@ -314,9 +358,6 @@ static void _reiserfs_free_block (struct reiserfs_transaction_handle *th,
"free_block (%s:%lu)[dev:blocknr]: bit already cleared",
reiserfs_bdevname (s), block);
}
if (offset < apbi[nr].first_zero_hint) {
apbi[nr].first_zero_hint = offset;
}
apbi[nr].free_count ++;
journal_mark_dirty (th, s, apbi[nr].bh);
@@ -396,6 +437,15 @@ void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th)
__discard_prealloc(th, ei);
}
}
void reiserfs_init_alloc_options (struct super_block *s)
{
set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
set_bit (_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
reiserfs_warning (s, "allocator defaults = [%08x]\n", SB_ALLOC_OPTS(s));
}
/* block allocator related options are parsed here */
int reiserfs_parse_alloc_options(struct super_block * s, char * options)
{
@@ -439,6 +489,18 @@ int reiserfs_parse_alloc_options(struct super_block * s, char * options)
continue;
}
if (!strcmp(this_char, "dirid_groups")) {
SET_OPTION(dirid_groups);
continue;
}
if (!strcmp(this_char, "oid_groups")) {
SET_OPTION(oid_groups);
continue;
}
if (!strcmp(this_char, "packing_groups")) {
SET_OPTION(packing_groups);
continue;
}
if (!strcmp(this_char, "hashed_formatted_nodes")) {
SET_OPTION(hashed_formatted_nodes);
continue;
@@ -481,6 +543,7 @@ int reiserfs_parse_alloc_options(struct super_block * s, char * options)
return 1;
}
reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
return 0;
}
@@ -503,17 +566,76 @@ static inline void new_hashed_relocation (reiserfs_blocknr_hint_t * hint)
hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
}
static inline void get_left_neighbor(reiserfs_blocknr_hint_t *hint)
/*
* Relocation based on dirid: hash the dirid to pick a bitmap group
* for file data. Formatted nodes are unaffected; a separate policy covers them.
*/
static void
dirid_groups (reiserfs_blocknr_hint_t *hint)
{
unsigned long hash;
__u32 dirid = 0;
int bm = 0;
struct super_block *sb = hint->th->t_super;
if (hint->inode)
dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
else if (hint->formatted_node)
dirid = hint->key.k_dir_id;
if (dirid) {
bm = bmap_hash_id(sb, dirid);
hash = bm * (sb->s_blocksize << 3);
/* give a portion of the block group to metadata */
if (hint->inode)
hash += sb->s_blocksize/2;
hint->search_start = hash;
}
}
/*
* Relocation based on oid: hash the objectid to pick a bitmap group
* for file data. Formatted nodes are unaffected; a separate policy covers them.
*/
static void
oid_groups (reiserfs_blocknr_hint_t *hint)
{
if (hint->inode) {
unsigned long hash;
__u32 oid;
__u32 dirid;
int bm;
dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
/* keep the root dir and its first set of subdirs close to
* the start of the disk
*/
if (dirid <= 2)
hash = (hint->inode->i_sb->s_blocksize << 3);
else {
oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
bm = bmap_hash_id(hint->inode->i_sb, oid);
hash = bm * (hint->inode->i_sb->s_blocksize << 3);
}
hint->search_start = hash;
}
}
/* returns 1 if it finds an indirect item and gets valid hint info
* from it, otherwise 0
*/
static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
{
struct path * path;
struct buffer_head * bh;
struct item_head * ih;
int pos_in_item;
__u32 * item;
int ret = 0;
if (!hint->path) /* reiserfs code can call this function w/o pointer to path
* structure supplied; then we rely on supplied search_start */
return;
return 0;
path = hint->path;
bh = get_last_bh(path);
@@ -534,15 +656,15 @@ static inline void get_left_neighbor(reiserfs_blocknr_hint_t *hint)
int t=get_block_num(item,pos_in_item);
if (t) {
hint->search_start = t;
ret = 1;
break;
}
pos_in_item --;
}
} else {
}
/* does result value fit into specified region? */
return;
return ret;
}
/* should be, if formatted node, then try to put on first part of the device
@@ -639,10 +761,12 @@ static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint)
}
}
static inline void determine_search_start(reiserfs_blocknr_hint_t *hint,
static void determine_search_start(reiserfs_blocknr_hint_t *hint,
int amount_needed)
{
struct super_block *s = hint->th->t_super;
int unfm_hint;
hint->beg = 0;
hint->end = SB_BLOCK_COUNT(s) - 1;
@@ -673,11 +797,6 @@ static inline void determine_search_start(reiserfs_blocknr_hint_t *hint,
return;
}
/* attempt to copy a feature from old block allocator code */
if (TEST_OPTION(old_hashed_relocation, s) && !hint->formatted_node) {
old_hashed_relocation(hint);
}
/* if none of our special cases is relevant, use the left neighbor in the
tree order of the new node we are allocating for */
if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) {
@@ -685,7 +804,7 @@ static inline void determine_search_start(reiserfs_blocknr_hint_t *hint,
return;
}
get_left_neighbor(hint);
unfm_hint = get_left_neighbor(hint);
/* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
new blocks are displaced based on directory ID. Also, if suggested search_start
@@ -710,10 +829,36 @@ static inline void determine_search_start(reiserfs_blocknr_hint_t *hint,
return;
}
if (TEST_OPTION(old_hashed_relocation, s))
/* old_hashed_relocation only works on unformatted */
if (!unfm_hint && !hint->formatted_node &&
TEST_OPTION(old_hashed_relocation, s))
{
old_hashed_relocation(hint);
if (TEST_OPTION(new_hashed_relocation, s))
}
/* new_hashed_relocation works with both formatted/unformatted nodes */
if ((!unfm_hint || hint->formatted_node) &&
TEST_OPTION(new_hashed_relocation, s))
{
new_hashed_relocation(hint);
}
/* dirid grouping works only on unformatted nodes */
if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s))
{
dirid_groups(hint);
}
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
if (hint->formatted_node && TEST_OPTION(dirid_groups,s))
{
dirid_groups(hint);
}
#endif
/* oid grouping works only on unformatted nodes */
if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s))
{
oid_groups(hint);
}
return;
}
@@ -738,13 +883,14 @@ static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint,
b_blocknr_t * new_blocknrs,
b_blocknr_t start, b_blocknr_t finish,
int min,
int amount_needed, int prealloc_size)
{
int rest = amount_needed;
int nr_allocated;
while (rest > 0 && start <= finish) {
nr_allocated = scan_bitmap (hint->th, &start, finish, 1,
nr_allocated = scan_bitmap (hint->th, &start, finish, min,
rest + prealloc_size, !hint->formatted_node,
hint->block);
@@ -777,8 +923,9 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
struct super_block *s = hint->th->t_super;
b_blocknr_t start = hint->search_start;
b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
int second_pass = 0;
int passno = 0;
int nr_allocated = 0;
int bigalloc = 0;
determine_prealloc_size(hint);
if (!hint->formatted_node) {
@@ -797,15 +944,47 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
if (quota_ret)
hint->preallocate=hint->prealloc_size=0;
}
/* for unformatted nodes, force large allocations */
bigalloc = amount_needed + hint->prealloc_size;
/* try to make things even */
if (bigalloc & 1 && hint->prealloc_size)
bigalloc--;
}
while((nr_allocated
+= allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish,
amount_needed - nr_allocated, hint->prealloc_size))
< amount_needed) {
/* not all blocks were successfully allocated yet*/
if (second_pass) { /* it was a second pass; we must free all blocks */
do {
/* in bigalloc mode, nr_allocated should stay zero until
* the entire allocation is filled
*/
if (unlikely(bigalloc && nr_allocated)) {
reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n",
bigalloc, nr_allocated);
/* reset things to a sane value */
bigalloc = amount_needed - nr_allocated;
}
/*
* try pass 0 and pass 1 looking for a nice big
* contiguous allocation. Then reset and look
* for anything you can find.
*/
if (passno == 2 && bigalloc) {
passno = 0;
bigalloc = 0;
}
switch (passno++) {
case 0: /* Search from hint->search_start to end of disk */
start = hint->search_start;
finish = SB_BLOCK_COUNT(s) - 1;
break;
case 1: /* Search from hint->beg to hint->search_start */
start = hint->beg;
finish = hint->search_start;
break;
case 2: /* Last chance: Search from 0 to hint->beg */
start = 0;
finish = hint->beg;
break;
default: /* We've tried searching everywhere, not enough space */
/* Free the blocks */
if (!hint->formatted_node) {
#ifdef REISERQUOTA_DEBUG
reiserfs_debug (s, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
@@ -816,13 +995,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);
return NO_DISK_SPACE;
} else { /* refine search parameters for next pass */
second_pass = 1;
finish = start;
start = 0;
continue;
}
}
} while ((nr_allocated += allocate_without_wrapping_disk (hint,
new_blocknrs + nr_allocated, start, finish,
bigalloc ? bigalloc : 1,
amount_needed - nr_allocated,
hint->prealloc_size))
< amount_needed);
if ( !hint->formatted_node &&
amount_needed + hint->prealloc_size >
nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
......
@@ -176,12 +176,13 @@ int reiserfs_allocate_blocks_for_region(
hint.formatted_node = 0; // We are allocating blocks for unformatted node.
/* only preallocate if this is a small write */
if (blocks_to_allocate <
REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize)
if (REISERFS_I(inode)->i_prealloc_count ||
(!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
blocks_to_allocate <
REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
hint.preallocate = 1;
else
hint.preallocate = 0;
/* Call block allocator to allocate blocks */
res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
if ( res != CARRY_ON ) {
@@ -467,6 +468,12 @@ int reiserfs_allocate_blocks_for_region(
// the inode.
//
pathrelse(&path);
/*
* clean up preallocation from previous writes
* if this is a partial block write
*/
if (write_bytes & (inode->i_sb->s_blocksize -1))
reiserfs_discard_prealloc(th, inode);
reiserfs_write_unlock(inode->i_sb);
// go through all the pages/buffers and map the buffers to newly allocated
@@ -1254,6 +1261,7 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
journal_end(&th, th.t_super, th.t_blocks_allocated);
reiserfs_write_unlock(inode->i_sb);
}
if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
......
@@ -1660,7 +1660,7 @@ int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
sb = dir->i_sb;
/* item head of new item */
ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid;
ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
if (!ih.ih_key.k_objectid) {
err = -ENOMEM;
@@ -1729,7 +1729,6 @@ int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
err = -EEXIST;
goto out_bad_inode;
}
if (old_format_only (sb)) {
if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
pathrelse (&path_to_key);
......
@@ -492,7 +492,6 @@ static void reiserfs_clear_inode (struct inode *inode)
REISERFS_I(inode)->i_acl_default = NULL;
}
struct super_operations reiserfs_sops =
{
.alloc_inode = reiserfs_alloc_inode,
@@ -1345,15 +1344,17 @@ static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
memset (sbi, 0, sizeof (struct reiserfs_sb_info));
/* Set default values for options: non-aggressive tails */
REISERFS_SB(s)->s_mount_opt = ( 1 << REISERFS_SMALLTAIL );
/* default block allocator option: skip_busy */
REISERFS_SB(s)->s_alloc_options.bits = ( 1 << 5);
/* If file grew past 4 blocks, start preallocation blocks for it. */
REISERFS_SB(s)->s_alloc_options.preallocmin = 4;
/* no preallocation minimum, be smart in
reiserfs_file_write instead */
REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
/* Preallocate by 16 blocks (17-1) at once */
REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
/* Initialize the rwsem for xattr dir */
init_rwsem(&REISERFS_SB(s)->xattr_dir_sem);
/* setup default block allocator options */
reiserfs_init_alloc_options(s);
jdev_name = NULL;
if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) {
goto error;
......
@@ -1247,7 +1247,7 @@ struct path {
#define pos_in_item(path) ((path)->pos_in_item)
#define INITIALIZE_PATH(var) \
struct path var = {ILLEGAL_PATH_ELEMENT_OFFSET, }
struct path var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET,}
/* Get path element by path and path position. */
#define PATH_OFFSET_PELEMENT(p_s_path,n_offset) ((p_s_path)->path_elements +(n_offset))
@@ -2149,6 +2149,15 @@ struct buffer_head * get_FEB (struct tree_balance *);
typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
int reiserfs_parse_alloc_options (struct super_block *, char *);
void reiserfs_init_alloc_options (struct super_block *s);
/*
* given a directory, this will tell you what packing locality
* to use for a new object underneath it. The locality is returned
* in disk byte order (le).
*/
u32 reiserfs_choose_packing(struct inode *dir);
int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *, b_blocknr_t, int for_unformatted);
int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);
......