Commit 5e049663 authored by Linus Torvalds

Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
 "Regression and bug fixes:

   - Performance regression fix from 5.18 on a Raspberry Pi

   - Fix extent parsing bug which triggers a BUG_ON when a (corrupted)
     extent tree has a non-root node with zero entries.

   - Fix a livelock where, in the right (wrong) circumstances, a large
     number of nfsd threads can try to write to a nearly full file
     system and retry for hours(!)"

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: limit the number of retries after discarding preallocations blocks
  ext4: fix bug in extents parsing when eh_entries == 0 and eh_depth > 0
  ext4: use buckets for cr 1 block scan instead of rbtree
  ext4: use locality group preallocation for small closed files
  ext4: make directory inode spreading reflect flexbg size
  ext4: avoid unnecessary spreading of allocations among groups
  ext4: make mballoc try target group first even with mb_optimize_scan
parents 4207d595 80fa46d6
@@ -167,8 +167,6 @@ enum SHIFT_DIRECTION {
 #define EXT4_MB_CR0_OPTIMIZED		0x8000
 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
 #define EXT4_MB_CR1_OPTIMIZED		0x00010000
-/* Perform linear traversal for one group */
-#define EXT4_MB_SEARCH_NEXT_LINEAR	0x00020000
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
 	struct inode *inode;
@@ -1600,8 +1598,8 @@ struct ext4_sb_info {
 	struct list_head s_discard_list;
 	struct work_struct s_discard_work;
 	atomic_t s_retry_alloc_pending;
-	struct rb_root s_mb_avg_fragment_size_root;
-	rwlock_t s_mb_rb_lock;
+	struct list_head *s_mb_avg_fragment_size;
+	rwlock_t *s_mb_avg_fragment_size_locks;
 	struct list_head *s_mb_largest_free_orders;
 	rwlock_t *s_mb_largest_free_orders_locks;
@@ -3413,6 +3411,8 @@ struct ext4_group_info {
 	ext4_grpblk_t bb_first_free;	/* first free block */
 	ext4_grpblk_t bb_free;		/* total free blocks */
 	ext4_grpblk_t bb_fragments;	/* nr of freespace fragments */
+	int bb_avg_fragment_size_order;	/* order of average
+					   fragment in BG */
 	ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
 	ext4_group_t bb_group;		/* Group number */
 	struct list_head bb_prealloc_list;
@@ -3420,7 +3420,7 @@ struct ext4_group_info {
 	void *bb_bitmap;
 #endif
 	struct rw_semaphore alloc_sem;
-	struct rb_node bb_avg_fragment_size_rb;
+	struct list_head bb_avg_fragment_size_node;
 	struct list_head bb_largest_free_order_node;
 	ext4_grpblk_t bb_counters[];	/* Nr of free power-of-two-block
 					 * regions, index is order.
...
@@ -460,6 +460,10 @@ static int __ext4_ext_check(const char *function, unsigned int line,
 		error_msg = "invalid eh_entries";
 		goto corrupted;
 	}
+	if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
+		error_msg = "eh_entries is 0 but eh_depth is > 0";
+		goto corrupted;
+	}
 	if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
 		error_msg = "invalid extent entries";
 		goto corrupted;
...
@@ -510,7 +510,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 		goto fallback;
 	}
 
-	max_dirs = ndirs / ngroups + inodes_per_group / 16;
+	max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
 	min_inodes = avefreei - inodes_per_group*flex_size / 4;
 	if (min_inodes < 1)
 		min_inodes = 1;
...
@@ -140,13 +140,15 @@
  * number of buddy bitmap orders possible) number of lists. Group-infos are
  * placed in appropriate lists.
  *
- * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
+ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
  *
- * Locking: sbi->s_mb_rb_lock (rwlock)
+ * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
  *
- * This is a red black tree consisting of group infos and the tree is sorted
- * by average fragment sizes (which is calculated as ext4_group_info->bb_free
- * / ext4_group_info->bb_fragments).
+ * This is an array of lists where in the i-th list there are groups with
+ * average fragment size >= 2^i and < 2^(i+1). The average fragment size
+ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
+ * Note that we don't bother with a special list for completely empty groups
+ * so we only have MB_NUM_ORDERS(sb) lists.
  *
  * When "mb_optimize_scan" mount option is set, mballoc consults the above data
  * structures to decide the order in which groups are to be traversed for
@@ -160,7 +162,8 @@
  *
  * At CR = 1, we only consider groups where average fragment size > request
 * size. So, we lookup a group which has average fragment size just above or
- * equal to request size using our rb tree (data structure 2) in O(log N) time.
+ * equal to request size using our average fragment size group lists (data
+ * structure 2) in O(1) time.
 *
 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
 * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
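The two comment hunks above describe how groups are bucketed by average fragment size. As a rough standalone sketch of that mapping (plain C outside the kernel; NUM_ORDERS and fls32() are stand-ins for MB_NUM_ORDERS(sb) and the kernel's fls()), the list index can be computed the same way as the mb_avg_fragment_size_order() helper added further down in this patch:

/*
 * Standalone illustration (not kernel code): an average fragment size of
 * "len" blocks is filed under list index fls(len) - 2, clamped to
 * [0, NUM_ORDERS - 1].
 */
#include <stdio.h>

#define NUM_ORDERS 14			/* assumed value, e.g. a 4k-block fs */

static int fls32(unsigned int x)	/* most significant set bit, 1-based */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static int avg_fragment_size_order(unsigned int len)
{
	int order = fls32(len) - 2;

	if (order < 0)
		return 0;		/* 1-block averages share list 0 */
	if (order == NUM_ORDERS)
		order--;		/* clamp the largest possible average */
	return order;
}

int main(void)
{
	unsigned int sizes[] = { 1, 3, 4, 100, 8192, 32768 };

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("avg fragment size %u -> list %d\n",
		       sizes[i], avg_fragment_size_order(sizes[i]));
	return 0;
}

With this bucketing, the CR 1 scan only has to start at the list whose order covers the request size and walk forward, which is the O(1) lookup the comment refers to.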
@@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 	}
 }
 
-static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
-			int (*cmp)(struct rb_node *, struct rb_node *))
+static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
 {
-	struct rb_node **iter = &root->rb_node, *parent = NULL;
+	int order;
 
-	while (*iter) {
-		parent = *iter;
-		if (cmp(new, *iter) > 0)
-			iter = &((*iter)->rb_left);
-		else
-			iter = &((*iter)->rb_right);
-	}
-
-	rb_link_node(new, parent, iter);
-	rb_insert_color(new, root);
-}
-
-static int
-ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
-{
-	struct ext4_group_info *grp1 = rb_entry(rb1,
-						struct ext4_group_info,
-						bb_avg_fragment_size_rb);
-	struct ext4_group_info *grp2 = rb_entry(rb2,
-						struct ext4_group_info,
-						bb_avg_fragment_size_rb);
-	int num_frags_1, num_frags_2;
-
-	num_frags_1 = grp1->bb_fragments ?
-		grp1->bb_free / grp1->bb_fragments : 0;
-	num_frags_2 = grp2->bb_fragments ?
-		grp2->bb_free / grp2->bb_fragments : 0;
-
-	return (num_frags_2 - num_frags_1);
+	/*
+	 * We don't bother with a special lists groups with only 1 block free
+	 * extents and for completely empty groups.
+	 */
+	order = fls(len) - 2;
+	if (order < 0)
+		return 0;
+	if (order == MB_NUM_ORDERS(sb))
+		order--;
+	return order;
 }
 
-/*
- * Reinsert grpinfo into the avg_fragment_size tree with new average
- * fragment size.
- */
+/* Move group to appropriate avg_fragment_size list */
 static void
 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int new_order;
 
 	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
 		return;
 
-	write_lock(&sbi->s_mb_rb_lock);
-	if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
-		rb_erase(&grp->bb_avg_fragment_size_rb,
-				&sbi->s_mb_avg_fragment_size_root);
-		RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
-	}
+	new_order = mb_avg_fragment_size_order(sb,
+					grp->bb_free / grp->bb_fragments);
+	if (new_order == grp->bb_avg_fragment_size_order)
+		return;
 
-	ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
-		&grp->bb_avg_fragment_size_rb,
-		ext4_mb_avg_fragment_size_cmp);
-	write_unlock(&sbi->s_mb_rb_lock);
+	if (grp->bb_avg_fragment_size_order != -1) {
+		write_lock(&sbi->s_mb_avg_fragment_size_locks[
+					grp->bb_avg_fragment_size_order]);
+		list_del(&grp->bb_avg_fragment_size_node);
+		write_unlock(&sbi->s_mb_avg_fragment_size_locks[
+					grp->bb_avg_fragment_size_order]);
+	}
+	grp->bb_avg_fragment_size_order = new_order;
+	write_lock(&sbi->s_mb_avg_fragment_size_locks[
+					grp->bb_avg_fragment_size_order]);
+	list_add_tail(&grp->bb_avg_fragment_size_node,
+		&sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
+	write_unlock(&sbi->s_mb_avg_fragment_size_locks[
+					grp->bb_avg_fragment_size_order]);
 }
 
 /*
@@ -909,86 +898,56 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
 		*new_cr = 1;
 	} else {
 		*group = grp->bb_group;
-		ac->ac_last_optimal_group = *group;
 		ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
 	}
 }
 
 /*
- * Choose next group by traversing average fragment size tree. Updates *new_cr
- * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
- * the linear search should continue for one iteration since there's lock
- * contention on the rb tree lock.
+ * Choose next group by traversing average fragment size list of suitable
+ * order. Updates *new_cr if cr level needs an update.
  */
 static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
 		int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
-	int avg_fragment_size, best_so_far;
-	struct rb_node *node, *found;
-	struct ext4_group_info *grp;
-
-	/*
-	 * If there is contention on the lock, instead of waiting for the lock
-	 * to become available, just continue searching lineraly. We'll resume
-	 * our rb tree search later starting at ac->ac_last_optimal_group.
-	 */
-	if (!read_trylock(&sbi->s_mb_rb_lock)) {
-		ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
-		return;
-	}
+	struct ext4_group_info *grp = NULL, *iter;
+	int i;
 
 	if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
 		if (sbi->s_mb_stats)
 			atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
-		/* We have found something at CR 1 in the past */
-		grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
-		for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
-		     found = rb_next(found)) {
-			grp = rb_entry(found, struct ext4_group_info,
-				       bb_avg_fragment_size_rb);
+	}
+
+	for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
+	     i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+		if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
+			continue;
+		read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
+		if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
+			read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+			continue;
+		}
+		grp = NULL;
+		list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
+				    bb_avg_fragment_size_node) {
 			if (sbi->s_mb_stats)
 				atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
-			if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
+			if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
+				grp = iter;
 				break;
+			}
 		}
-		goto done;
-	}
-
-	node = sbi->s_mb_avg_fragment_size_root.rb_node;
-	best_so_far = 0;
-	found = NULL;
-
-	while (node) {
-		grp = rb_entry(node, struct ext4_group_info,
-			       bb_avg_fragment_size_rb);
-		avg_fragment_size = 0;
-		if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
-			avg_fragment_size = grp->bb_fragments ?
-				grp->bb_free / grp->bb_fragments : 0;
-			if (!best_so_far || avg_fragment_size < best_so_far) {
-				best_so_far = avg_fragment_size;
-				found = node;
-			}
-		}
-		if (avg_fragment_size > ac->ac_g_ex.fe_len)
-			node = node->rb_right;
-		else
-			node = node->rb_left;
-	}
-
-done:
-	if (found) {
-		grp = rb_entry(found, struct ext4_group_info,
-			       bb_avg_fragment_size_rb);
+		read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+		if (grp)
+			break;
+	}
+
+	if (grp) {
 		*group = grp->bb_group;
 		ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
 	} else {
 		*new_cr = 2;
 	}
-
-	read_unlock(&sbi->s_mb_rb_lock);
-	ac->ac_last_optimal_group = *group;
 }
 
 static inline int should_optimize_scan(struct ext4_allocation_context *ac)
@@ -1017,11 +976,6 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
 		goto inc_and_return;
 	}
 
-	if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
-		ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
-		goto inc_and_return;
-	}
-
 	return group;
 inc_and_return:
 	/*
@@ -1049,8 +1003,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
 {
 	*new_cr = ac->ac_criteria;
 
-	if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
+	if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
+		*group = next_linear_group(ac, *group, ngroups);
 		return;
+	}
 
 	if (*new_cr == 0) {
 		ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
@@ -1075,23 +1031,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int i;
 
-	if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) {
+	for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
+		if (grp->bb_counters[i] > 0)
+			break;
+	/* No need to move between order lists? */
+	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
+	    i == grp->bb_largest_free_order) {
+		grp->bb_largest_free_order = i;
+		return;
+	}
+
+	if (grp->bb_largest_free_order >= 0) {
 		write_lock(&sbi->s_mb_largest_free_orders_locks[
 					grp->bb_largest_free_order]);
 		list_del_init(&grp->bb_largest_free_order_node);
 		write_unlock(&sbi->s_mb_largest_free_orders_locks[
 					grp->bb_largest_free_order]);
 	}
-	grp->bb_largest_free_order = -1; /* uninit */
-	for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) {
-		if (grp->bb_counters[i] > 0) {
-			grp->bb_largest_free_order = i;
-			break;
-		}
-	}
-	if (test_opt2(sb, MB_OPTIMIZE_SCAN) &&
-	    grp->bb_largest_free_order >= 0 && grp->bb_free) {
+	grp->bb_largest_free_order = i;
+	if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
 		write_lock(&sbi->s_mb_largest_free_orders_locks[
 					grp->bb_largest_free_order]);
 		list_add_tail(&grp->bb_largest_free_order_node,
@@ -1148,13 +1106,13 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 	}
 	mb_set_largest_free_order(sb, grp);
+	mb_update_avg_fragment_size(sb, grp);
 
 	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 
 	period = get_cycles() - period;
 	atomic_inc(&sbi->s_mb_buddies_generated);
 	atomic64_add(period, &sbi->s_mb_generation_time);
-	mb_update_avg_fragment_size(sb, grp);
 }
 
 /* The buddy information is attached the buddy cache inode
@@ -2636,7 +2594,7 @@ static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
 	ext4_group_t prefetch_grp = 0, ngroups, group, i;
-	int cr = -1;
+	int cr = -1, new_cr;
 	int err = 0, first_err = 0;
 	unsigned int nr = 0, prefetch_ios = 0;
 	struct ext4_sb_info *sbi;
@@ -2707,17 +2665,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	 * from the goal value specified
 	 */
 	group = ac->ac_g_ex.fe_group;
-	ac->ac_last_optimal_group = group;
 	ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
 	prefetch_grp = group;
 
-	for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
-		     i++) {
-		int ret = 0, new_cr;
+	for (i = 0, new_cr = cr; i < ngroups; i++,
+	     ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
+		int ret = 0;
 
 		cond_resched();
-
-		ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
 		if (new_cr != cr) {
 			cr = new_cr;
 			goto repeat;
@@ -2991,9 +2946,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock)
 	struct super_block *sb = pde_data(file_inode(seq->file));
 	unsigned long position;
 
-	read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
-
-	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+	if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
 		return NULL;
 	position = *pos + 1;
 	return (void *) ((unsigned long) position);
@@ -3005,7 +2958,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
 	unsigned long position;
 
 	++*pos;
-	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+	if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
 		return NULL;
 	position = *pos + 1;
 	return (void *) ((unsigned long) position);
@@ -3017,29 +2970,22 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	unsigned long position = ((unsigned long) v);
 	struct ext4_group_info *grp;
-	struct rb_node *n;
-	unsigned int count, min, max;
+	unsigned int count;
 
 	position--;
 	if (position >= MB_NUM_ORDERS(sb)) {
-		seq_puts(seq, "fragment_size_tree:\n");
-		n = rb_first(&sbi->s_mb_avg_fragment_size_root);
-		if (!n) {
-			seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
-			return 0;
-		}
-		grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
-		min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
-		count = 1;
-		while (rb_next(n)) {
-			count++;
-			n = rb_next(n);
-		}
-		grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
-		max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
-		seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
-			   min, max, count);
+		position -= MB_NUM_ORDERS(sb);
+		if (position == 0)
+			seq_puts(seq, "avg_fragment_size_lists:\n");
+
+		count = 0;
+		read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
+		list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
+				    bb_avg_fragment_size_node)
+			count++;
+		read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
+		seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+			   (unsigned int)position, count);
 		return 0;
 	}
@@ -3049,9 +2995,11 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
 		seq_puts(seq, "max_free_order_lists:\n");
 	}
 	count = 0;
+	read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
 	list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
 			    bb_largest_free_order_node)
 		count++;
+	read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
 	seq_printf(seq, "\tlist_order_%u_groups: %u\n",
 		   (unsigned int)position, count);
@@ -3059,11 +3007,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
 }
 
 static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
-__releases(&EXT4_SB(sb)->s_mb_rb_lock)
 {
-	struct super_block *sb = pde_data(file_inode(seq->file));
-
-	read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
 }
 
 const struct seq_operations ext4_mb_seq_structs_summary_ops = {
@@ -3176,8 +3120,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
-	RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
+	INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
 	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
+	meta_group_info[i]->bb_avg_fragment_size_order = -1;  /* uninit */
 	meta_group_info[i]->bb_group = group;
 	mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
@@ -3426,7 +3371,24 @@ int ext4_mb_init(struct super_block *sb)
 		i++;
 	} while (i < MB_NUM_ORDERS(sb));
 
-	sbi->s_mb_avg_fragment_size_root = RB_ROOT;
+	sbi->s_mb_avg_fragment_size =
+		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+			GFP_KERNEL);
+	if (!sbi->s_mb_avg_fragment_size) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	sbi->s_mb_avg_fragment_size_locks =
+		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+			GFP_KERNEL);
+	if (!sbi->s_mb_avg_fragment_size_locks) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+		INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
+		rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
+	}
 	sbi->s_mb_largest_free_orders =
 		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
 			GFP_KERNEL);
@@ -3445,7 +3407,6 @@ int ext4_mb_init(struct super_block *sb)
 		INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
 		rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
 	}
-	rwlock_init(&sbi->s_mb_rb_lock);
 
 	spin_lock_init(&sbi->s_md_lock);
 	sbi->s_mb_free_pending = 0;
@@ -3516,6 +3477,8 @@ int ext4_mb_init(struct super_block *sb)
 	free_percpu(sbi->s_locality_groups);
 	sbi->s_locality_groups = NULL;
 out:
+	kfree(sbi->s_mb_avg_fragment_size);
+	kfree(sbi->s_mb_avg_fragment_size_locks);
 	kfree(sbi->s_mb_largest_free_orders);
 	kfree(sbi->s_mb_largest_free_orders_locks);
 	kfree(sbi->s_mb_offsets);
@@ -3582,6 +3545,8 @@ int ext4_mb_release(struct super_block *sb)
 		kvfree(group_info);
 		rcu_read_unlock();
 	}
+	kfree(sbi->s_mb_avg_fragment_size);
+	kfree(sbi->s_mb_avg_fragment_size_locks);
 	kfree(sbi->s_mb_largest_free_orders);
 	kfree(sbi->s_mb_largest_free_orders_locks);
 	kfree(sbi->s_mb_offsets);
@@ -5193,6 +5158,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	int bsbits = ac->ac_sb->s_blocksize_bits;
 	loff_t size, isize;
+	bool inode_pa_eligible, group_pa_eligible;
 
 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
 		return;
@@ -5200,25 +5166,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
 		return;
 
+	group_pa_eligible = sbi->s_mb_group_prealloc > 0;
+	inode_pa_eligible = true;
 	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
 	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
 		>> bsbits;
 
+	/* No point in using inode preallocation for closed files */
 	if ((size == isize) && !ext4_fs_is_busy(sbi) &&
-	    !inode_is_open_for_write(ac->ac_inode)) {
-		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
-		return;
-	}
-
-	if (sbi->s_mb_group_prealloc <= 0) {
-		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
-		return;
-	}
+	    !inode_is_open_for_write(ac->ac_inode))
+		inode_pa_eligible = false;
 
-	/* don't use group allocation for large files */
 	size = max(size, isize);
-	if (size > sbi->s_mb_stream_request) {
-		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+	/* Don't use group allocation for large files */
+	if (size > sbi->s_mb_stream_request)
+		group_pa_eligible = false;
+
+	if (!group_pa_eligible) {
+		if (inode_pa_eligible)
+			ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+		else
+			ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
 		return;
 	}
@@ -5565,6 +5533,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	ext4_fsblk_t block = 0;
 	unsigned int inquota = 0;
 	unsigned int reserv_clstrs = 0;
+	int retries = 0;
 	u64 seq;
 
 	might_sleep();
@@ -5667,7 +5636,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			ar->len = ac->ac_b_ex.fe_len;
 		}
 	} else {
-		if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
+		if (++retries < 3 &&
+		    ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
 			goto repeat;
 		/*
 		 * If block allocation fails then the pa allocated above
...
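The last mballoc.c hunk above caps the discard-and-retry path at three attempts, which is what stops the "retry for hours" livelock described in the pull message. A minimal standalone sketch of the same bounded-retry shape, assuming placeholder helpers rather than real ext4 APIs:

/*
 * Standalone sketch (not ext4 code): bound a discard-and-retry loop so a
 * persistently failing allocation gives up after a fixed number of rounds.
 * try_alloc() and discard_should_retry() are made-up stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES 3

static bool try_alloc(void)
{
	return false;	/* pretend the filesystem stays too full to satisfy us */
}

static bool discard_should_retry(void)
{
	return true;	/* pretend discarding preallocations always frees a little */
}

int main(void)
{
	int retries = 0;
	bool ok;

repeat:
	ok = try_alloc();
	/* Without the retries cap, this condition could hold forever. */
	if (!ok && ++retries < MAX_RETRIES && discard_should_retry())
		goto repeat;

	printf("%s (retries counter = %d)\n", ok ? "allocated" : "gave up", retries);
	return ok ? 0 : 1;
}

The kernel version keeps the existing "did discarding free anything?" check and simply adds the counter in front of it, so well-behaved allocations are unaffected.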
@@ -178,7 +178,6 @@ struct ext4_allocation_context {
 	/* copy of the best found extent taken before preallocation efforts */
 	struct ext4_free_extent ac_f_ex;
 
-	ext4_group_t ac_last_optimal_group;
 	__u32 ac_groups_considered;
 	__u32 ac_flags;		/* allocation hints */
 	__u16 ac_groups_scanned;
...