Commit 744692dc authored by Jiaying Zhang's avatar Jiaying Zhang Committed by Theodore Ts'o

ext4: use ext4_get_block_write in buffer write

Allocate uninitialized extent before ext4 buffer write and
convert the extent to initialized after io completes.
The purpose is to make sure an extent can only be marked
initialized after it has been written with new data so
we can safely drop the i_mutex lock in ext4 DIO read without
exposing stale data. This helps to improve multi-thread DIO
read performance on high-speed disks.

Skip the nobh and data=journal mount cases to make things simple for now.
Signed-off-by: default avatarJiaying Zhang <jiayingz@google.com>
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent c7064ef1
......@@ -138,7 +138,7 @@ typedef struct ext4_io_end {
struct list_head list; /* per-file finished AIO list */
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
int error; /* I/O error code */
struct page *page; /* page struct for buffer write */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
struct work_struct work; /* data work queue */
......@@ -361,7 +361,7 @@ struct ext4_new_group_data {
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_IO_CREATE_EXT)
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
/*
* Flags used by ext4_free_blocks
......@@ -702,6 +702,7 @@ struct ext4_inode_info {
/* completed IOs that might need unwritten extents handling */
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
......@@ -752,6 +753,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
......@@ -1781,6 +1783,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 len, __u64 *moved_len);
/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
= BH_JBDPrivateStart,
};
BUFFER_FNS(Uninit, uninit)
TAS_BUFFER_FNS(Uninit, uninit)
/*
* Add new method to test wether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
......
......@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return 0;
}
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
* i_mutex for direct I/O reads. This only works for extent-based
* files, and it doesn't work for nobh or if data journaling is
* enabled, since the dioread_nolock code uses b_private to pass
* information back to the I/O completion handler, and this conflicts
* with the jbd's use of b_private.
*/
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
return 0;
if (test_opt(inode->i_sb, NOBH))
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
return 0;
if (ext4_should_journal_data(inode))
return 0;
return 1;
}
#endif /* _EXT4_JBD2_H */
......@@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
BUG_ON(path[depth].p_hdr == NULL);
/* try to insert block into found extent and return */
if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO)
if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
&& ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
......@@ -1740,7 +1740,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
merge:
/* try to merge extents to the right */
if (flag != EXT4_GET_BLOCKS_PRE_IO)
if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(inode, path, nearex);
/* try to merge extents to the left */
......@@ -3065,7 +3065,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ext4_ext_show_leaf(inode, path);
/* get_block() before submit the IO, split the extent */
if (flags == EXT4_GET_BLOCKS_PRE_IO) {
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle,
inode, path, iblock,
max_blocks, flags);
......@@ -3078,10 +3078,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
io->flag = EXT4_IO_UNWRITTEN;
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
if (ext4_should_dioread_nolock(inode))
set_buffer_uninit(bh_result);
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags == EXT4_GET_BLOCKS_CONVERT) {
if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
ret = ext4_convert_unwritten_extents_endio(handle, inode,
path);
if (ret >= 0)
......@@ -3351,21 +3353,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
/*
* io_end structure was created for every async
* direct IO write to the middle of the file.
* To avoid unecessary convertion for every aio dio rewrite
* to the mid of file, here we flag the IO that is really
* need the convertion.
* io_end structure was created for every IO write to an
* uninitialized extent. To avoid unecessary conversion,
* here we flag the IO that really needs the conversion.
* For non asycn direct IO case, flag the inode state
* that we need to perform convertion when IO is done.
*/
if (flags == EXT4_GET_BLOCKS_PRE_IO) {
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
if (io)
io->flag = EXT4_IO_UNWRITTEN;
else
ext4_set_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN);
}
if (ext4_should_dioread_nolock(inode))
set_buffer_uninit(bh_result);
}
if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
......
This diff is collapsed.
......@@ -709,6 +709,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_reserved_quota = 0;
#endif
INIT_LIST_HEAD(&ei->i_completed_io_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
......@@ -926,6 +927,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NOLOAD))
seq_puts(seq, ",norecovery");
if (test_opt(sb, DIOREAD_NOLOCK))
seq_puts(seq, ",dioread_nolock");
ext4_show_quota_options(seq, sb);
return 0;
......@@ -1109,6 +1113,7 @@ enum {
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard,
};
......@@ -1176,6 +1181,8 @@ static const match_table_t tokens = {
{Opt_auto_da_alloc, "auto_da_alloc=%u"},
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
{Opt_dioread_nolock, "dioread_nolock"},
{Opt_dioread_lock, "dioread_lock"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_err, NULL},
......@@ -1640,6 +1647,12 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_nodiscard:
clear_opt(sbi->s_mount_opt, DISCARD);
break;
case Opt_dioread_nolock:
set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
break;
case Opt_dioread_lock:
clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
break;
default:
ext4_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" "
......@@ -2795,7 +2808,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
goto failed_mount4;
goto failed_mount_wq;
} else {
clear_opt(sbi->s_mount_opt, DATA_FLAGS);
set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
......@@ -2808,7 +2821,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
goto failed_mount4;
goto failed_mount_wq;
}
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
......@@ -2847,7 +2860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
ext4_msg(sb, KERN_ERR, "Journal does not support "
"requested data journaling mode");
goto failed_mount4;
goto failed_mount_wq;
}
default:
break;
......@@ -2855,13 +2868,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
no_journal:
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
"its supported only with writeback mode");
clear_opt(sbi->s_mount_opt, NOBH);
}
if (test_opt(sb, DIOREAD_NOLOCK)) {
ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
"not supported with nobh mode");
goto failed_mount_wq;
}
}
EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
if (!EXT4_SB(sb)->dio_unwritten_wq) {
......@@ -2926,6 +2943,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"requested data journaling mode");
clear_opt(sbi->s_mount_opt, DELALLOC);
}
if (test_opt(sb, DIOREAD_NOLOCK)) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
"option - requested data journaling mode");
clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
}
if (sb->s_blocksize < PAGE_SIZE) {
ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
"option - block size is too small");
clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
}
}
err = ext4_setup_system_zone(sb);
if (err) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment