Commit bc2d9db4 authored by Lukas Czerner's avatar Lukas Czerner Committed by Theodore Ts'o

ext4: Transfer initialized block to right neighbor if possible

Currently when converting extent to initialized we attempt to transfer
initialized block to the left neighbour if possible when certain
criteria are met. However we do not attempt to do the same for the
right neighbor.

This commit adds the possibility to transfer initialized block to the
right neighbour if:

1. We're not converting the whole extent
2. Both extents are stored in the same extent tree node
3. Right neighbor is initialized
4. Right neighbor is logically abutting the current one
5. Right neighbor is physically abutting the current one
6. Right neighbor would not overflow the length limit

This is basically the same logic as with transferring to the left. This
will gain us some performance benefits since it is faster than inserting
extent and then merging it.

It would also prevent some situation in delalloc patch when we might run
out of metadata reservation. This is due to the fact that we would
attempt to split the extent first (possibly allocating new metadata
block) even though we did not counted for that because it can (and will)
be merged again. This commit fix that scenario, because we no longer
need to split the extent in such case.
Signed-off-by: default avatarLukas Czerner <lczerner@redhat.com>
parent bd86298e
...@@ -3153,29 +3153,28 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ...@@ -3153,29 +3153,28 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_extent_header *eh; struct ext4_extent_header *eh;
struct ext4_map_blocks split_map; struct ext4_map_blocks split_map;
struct ext4_extent zero_ex; struct ext4_extent zero_ex;
struct ext4_extent *ex; struct ext4_extent *ex, *abut_ex;
ext4_lblk_t ee_block, eof_block; ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth; unsigned int ee_len, depth, map_len = map->m_len;
int allocated, max_zeroout = 0; int allocated = 0, max_zeroout = 0;
int err = 0; int err = 0;
int split_flag = 0; int split_flag = 0;
ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino, "block %llu, max_blocks %u\n", inode->i_ino,
(unsigned long long)map->m_lblk, map->m_len); (unsigned long long)map->m_lblk, map_len);
sbi = EXT4_SB(inode->i_sb); sbi = EXT4_SB(inode->i_sb);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits; inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len) if (eof_block < map->m_lblk + map_len)
eof_block = map->m_lblk + map->m_len; eof_block = map->m_lblk + map_len;
depth = ext_depth(inode); depth = ext_depth(inode);
eh = path[depth].p_hdr; eh = path[depth].p_hdr;
ex = path[depth].p_ext; ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block); ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex); ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block);
zero_ex.ee_len = 0; zero_ex.ee_len = 0;
trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
...@@ -3186,77 +3185,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ...@@ -3186,77 +3185,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
/* /*
* Attempt to transfer newly initialized blocks from the currently * Attempt to transfer newly initialized blocks from the currently
* uninitialized extent to its left neighbor. This is much cheaper * uninitialized extent to its neighbor. This is much cheaper
* than an insertion followed by a merge as those involve costly * than an insertion followed by a merge as those involve costly
* memmove() calls. This is the common case in steady state for * memmove() calls. Transferring to the left is the common case in
* workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
* writes. * followed by append writes.
* *
* Limitations of the current logic: * Limitations of the current logic:
* - L1: we only deal with writes at the start of the extent. * - L1: we do not deal with writes covering the whole extent.
* The approach could be extended to writes at the end
* of the extent but this scenario was deemed less common.
* - L2: we do not deal with writes covering the whole extent.
* This would require removing the extent if the transfer * This would require removing the extent if the transfer
* is possible. * is possible.
* - L3: we only attempt to merge with an extent stored in the * - L2: we only attempt to merge with an extent stored in the
* same extent tree node. * same extent tree node.
*/ */
if ((map->m_lblk == ee_block) && /*L1*/ if ((map->m_lblk == ee_block) &&
(map->m_len < ee_len) && /*L2*/ /* See if we can merge left */
(ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ (map_len < ee_len) && /*L1*/
struct ext4_extent *prev_ex; (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/
ext4_lblk_t prev_lblk; ext4_lblk_t prev_lblk;
ext4_fsblk_t prev_pblk, ee_pblk; ext4_fsblk_t prev_pblk, ee_pblk;
unsigned int prev_len, write_len; unsigned int prev_len;
prev_ex = ex - 1; abut_ex = ex - 1;
prev_lblk = le32_to_cpu(prev_ex->ee_block); prev_lblk = le32_to_cpu(abut_ex->ee_block);
prev_len = ext4_ext_get_actual_len(prev_ex); prev_len = ext4_ext_get_actual_len(abut_ex);
prev_pblk = ext4_ext_pblock(prev_ex); prev_pblk = ext4_ext_pblock(abut_ex);
ee_pblk = ext4_ext_pblock(ex); ee_pblk = ext4_ext_pblock(ex);
write_len = map->m_len;
/* /*
* A transfer of blocks from 'ex' to 'prev_ex' is allowed * A transfer of blocks from 'ex' to 'abut_ex' is allowed
* upon those conditions: * upon those conditions:
* - C1: prev_ex is initialized, * - C1: abut_ex is initialized,
* - C2: prev_ex is logically abutting ex, * - C2: abut_ex is logically abutting ex,
* - C3: prev_ex is physically abutting ex, * - C3: abut_ex is physically abutting ex,
* - C4: prev_ex can receive the additional blocks without * - C4: abut_ex can receive the additional blocks without
* overflowing the (initialized) length limit. * overflowing the (initialized) length limit.
*/ */
if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/
((prev_pblk + prev_len) == ee_pblk) && /*C3*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
(prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth); err = ext4_ext_get_access(handle, inode, path + depth);
if (err) if (err)
goto out; goto out;
trace_ext4_ext_convert_to_initialized_fastpath(inode, trace_ext4_ext_convert_to_initialized_fastpath(inode,
map, ex, prev_ex); map, ex, abut_ex);
/* Shift the start of ex by 'write_len' blocks */ /* Shift the start of ex by 'map_len' blocks */
ex->ee_block = cpu_to_le32(ee_block + write_len); ex->ee_block = cpu_to_le32(ee_block + map_len);
ext4_ext_store_pblock(ex, ee_pblk + write_len); ext4_ext_store_pblock(ex, ee_pblk + map_len);
ex->ee_len = cpu_to_le16(ee_len - write_len); ex->ee_len = cpu_to_le16(ee_len - map_len);
ext4_ext_mark_uninitialized(ex); /* Restore the flag */ ext4_ext_mark_uninitialized(ex); /* Restore the flag */
/* Extend prev_ex by 'write_len' blocks */ /* Extend abut_ex by 'map_len' blocks */
prev_ex->ee_len = cpu_to_le16(prev_len + write_len); abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
/* Mark the block containing both extents as dirty */ /* Result: number of initialized blocks past m_lblk */
ext4_ext_dirty(handle, inode, path + depth); allocated = map_len;
}
} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
(map_len < ee_len) && /*L1*/
ex < EXT_LAST_EXTENT(eh)) { /*L2*/
/* See if we can merge right */
ext4_lblk_t next_lblk;
ext4_fsblk_t next_pblk, ee_pblk;
unsigned int next_len;
abut_ex = ex + 1;
next_lblk = le32_to_cpu(abut_ex->ee_block);
next_len = ext4_ext_get_actual_len(abut_ex);
next_pblk = ext4_ext_pblock(abut_ex);
ee_pblk = ext4_ext_pblock(ex);
/* Update path to point to the right extent */ /*
path[depth].p_ext = prev_ex; * A transfer of blocks from 'ex' to 'abut_ex' is allowed
* upon those conditions:
* - C1: abut_ex is initialized,
* - C2: abut_ex is logically abutting ex,
* - C3: abut_ex is physically abutting ex,
* - C4: abut_ex can receive the additional blocks without
* overflowing the (initialized) length limit.
*/
if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
((map->m_lblk + map_len) == next_lblk) && /*C2*/
((ee_pblk + ee_len) == next_pblk) && /*C3*/
(next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
map, ex, abut_ex);
/* Shift the start of abut_ex by 'map_len' blocks */
abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
ex->ee_len = cpu_to_le16(ee_len - map_len);
ext4_ext_mark_uninitialized(ex); /* Restore the flag */
/* Extend abut_ex by 'map_len' blocks */
abut_ex->ee_len = cpu_to_le16(next_len + map_len);
/* Result: number of initialized blocks past m_lblk */ /* Result: number of initialized blocks past m_lblk */
allocated = write_len; allocated = map_len;
goto out;
} }
} }
if (allocated) {
/* Mark the block containing both extents as dirty */
ext4_ext_dirty(handle, inode, path + depth);
/* Update path to point to the right extent */
path[depth].p_ext = abut_ex;
goto out;
} else
allocated = ee_len - (map->m_lblk - ee_block);
WARN_ON(map->m_lblk < ee_block); WARN_ON(map->m_lblk < ee_block);
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment