Commit 744692dc authored by Jiaying Zhang, committed by Theodore Ts'o

ext4: use ext4_get_block_write in buffer write

Allocate an uninitialized extent before an ext4 buffer write and
convert the extent to initialized after the I/O completes. The
purpose is to make sure that an extent can only be marked
initialized after it has been written with new data, so we can
safely drop the i_mutex lock in ext4 DIO reads without exposing
stale data. This helps to improve multi-threaded DIO read
performance on high-speed disks.

Skip the nobh and data=journal mount cases to make things simple for now.
Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
parent c7064ef1
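
For context (not part of the original commit message): the workload this change targets is many threads issuing O_DIRECT reads against the same file on a filesystem mounted with the dioread_nolock option introduced below. The following userspace sketch is purely illustrative; the file path, block size, iteration count, and thread count are arbitrary placeholders.

/* Illustrative multi-threaded O_DIRECT reader (not part of this patch).
 * With dioread_nolock, ext4 can serve these concurrent reads without
 * serializing them on i_mutex. Assumes the file is large enough and
 * that 4096 is a multiple of the device's logical block size.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NTHREADS 4
#define BLOCK    4096

static const char *path = "/mnt/test/datafile";   /* placeholder path */

static void *reader(void *arg)
{
        long id = (long)arg;
        void *buf = NULL;
        int i, fd;

        fd = open(path, O_RDONLY | O_DIRECT);
        if (fd < 0) {
                perror("open");
                return NULL;
        }
        if (posix_memalign(&buf, BLOCK, BLOCK)) {
                close(fd);
                return NULL;
        }
        /* each thread reads its own aligned region of the file */
        for (i = 0; i < 1024; i++)
                if (pread(fd, buf, BLOCK, (off_t)(id * 1024 + i) * BLOCK) <= 0)
                        break;
        free(buf);
        close(fd);
        return NULL;
}

int main(void)
{
        pthread_t tid[NTHREADS];
        long i;

        for (i = 0; i < NTHREADS; i++)
                pthread_create(&tid[i], NULL, reader, (void *)i);
        for (i = 0; i < NTHREADS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}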
@@ -138,7 +138,7 @@ typedef struct ext4_io_end {
        struct list_head list;          /* per-file finished AIO list */
        struct inode *inode;            /* file being written to */
        unsigned int flag;              /* unwritten or not */
-       int error;                      /* I/O error code */
+       struct page *page;              /* page struct for buffer write */
        loff_t offset;                  /* offset in the file */
        ssize_t size;                   /* size of the extent */
        struct work_struct work;        /* data work queue */
@@ -361,7 +361,7 @@ struct ext4_new_group_data {
                                         EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
        /* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
-                                        EXT4_GET_BLOCKS_IO_CREATE_EXT)
+                                        EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
 /*
  * Flags used by ext4_free_blocks
@@ -702,6 +702,7 @@ struct ext4_inode_info {
        /* completed IOs that might need unwritten extents handling */
        struct list_head i_completed_io_list;
+       spinlock_t i_completed_io_lock;
        /* current io_end structure for async DIO write*/
        ext4_io_end_t *cur_aio_dio;
@@ -752,6 +753,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA                0x80000  /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA             0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA             0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK       0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM     0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
@@ -1781,6 +1783,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 len, __u64 *moved_len);
 
+/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+enum ext4_state_bits {
+       BH_Uninit       /* blocks are allocated but uninitialized on disk */
+        = BH_JBDPrivateStart,
+};
+
+BUFFER_FNS(Uninit, uninit)
+TAS_BUFFER_FNS(Uninit, uninit)
+
 /*
  * Add new method to test wether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
......
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
        return 0;
 }
 
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads.  This only works for extent-based
+ * files, and it doesn't work for nobh or if data journaling is
+ * enabled, since the dioread_nolock code uses b_private to pass
+ * information back to the I/O completion handler, and this conflicts
+ * with the jbd's use of b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+       if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+               return 0;
+       if (test_opt(inode->i_sb, NOBH))
+               return 0;
+       if (!S_ISREG(inode->i_mode))
+               return 0;
+       if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+               return 0;
+       if (ext4_should_journal_data(inode))
+               return 0;
+       return 1;
+}
+
 #endif  /* _EXT4_JBD2_H */
@@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        BUG_ON(path[depth].p_hdr == NULL);
 
        /* try to insert block into found extent and return */
-       if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO)
+       if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                && ext4_can_extents_be_merged(inode, ex, newext)) {
                ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
                          ext4_ext_is_uninitialized(newext),
@@ -1740,7 +1740,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 
 merge:
        /* try to merge extents to the right */
-       if (flag != EXT4_GET_BLOCKS_PRE_IO)
+       if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
                ext4_ext_try_to_merge(inode, path, nearex);
 
        /* try to merge extents to the left */
@@ -3065,7 +3065,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        ext4_ext_show_leaf(inode, path);
 
        /* get_block() before submit the IO, split the extent */
-       if (flags == EXT4_GET_BLOCKS_PRE_IO) {
+       if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                ret = ext4_split_unwritten_extents(handle,
                                                inode, path, iblock,
                                                max_blocks, flags);
@@ -3078,10 +3078,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        io->flag = EXT4_IO_UNWRITTEN;
                else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+               if (ext4_should_dioread_nolock(inode))
+                       set_buffer_uninit(bh_result);
                goto out;
        }
        /* IO end_io complete, convert the filled extent to written */
-       if (flags == EXT4_GET_BLOCKS_CONVERT) {
+       if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
                ret = ext4_convert_unwritten_extents_endio(handle, inode,
                                                        path);
                if (ret >= 0)
@@ -3351,21 +3353,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
                        ext4_ext_mark_uninitialized(&newex);
                        /*
-                        * io_end structure was created for every async
-                        * direct IO write to the middle of the file.
-                        * To avoid unecessary convertion for every aio dio rewrite
-                        * to the mid of file, here we flag the IO that is really
-                        * need the convertion.
+                        * io_end structure was created for every IO write to an
+                        * uninitialized extent. To avoid unecessary conversion,
+                        * here we flag the IO that really needs the conversion.
                         * For non asycn direct IO case, flag the inode state
                         * that we need to perform convertion when IO is done.
                         */
-                       if (flags == EXT4_GET_BLOCKS_PRE_IO) {
+                       if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                                if (io)
                                        io->flag = EXT4_IO_UNWRITTEN;
                                else
                                        ext4_set_inode_state(inode,
                                                EXT4_STATE_DIO_UNWRITTEN);
                        }
+                       if (ext4_should_dioread_nolock(inode))
+                               set_buffer_uninit(bh_result);
                }
 
        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
......
@@ -38,6 +38,7 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/kernel.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -1534,6 +1535,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
        ext4_truncate(inode);
 }
 
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+                  struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
@@ -1575,8 +1578,12 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
        }
        *pagep = page;
 
-       ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-                               ext4_get_block);
+       if (ext4_should_dioread_nolock(inode))
+               ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+                               fsdata, ext4_get_block_write);
+       else
+               ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+                               fsdata, ext4_get_block);
 
        if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
@@ -2092,6 +2099,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                        } else if (buffer_mapped(bh))
                                BUG_ON(bh->b_blocknr != pblock);
 
+                       if (buffer_uninit(exbh))
+                               set_buffer_uninit(bh);
                        cur_logical++;
                        pblock++;
                } while ((bh = bh->b_this_page) != head);
@@ -2221,6 +2230,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         */
        new.b_state = 0;
        get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+       if (ext4_should_dioread_nolock(mpd->inode))
+               get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
        if (mpd->b_state & (1 << BH_Delay))
                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
@@ -2636,6 +2647,9 @@ static int __ext4_journalled_writepage(struct page *page,
        return ret;
 }
 
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2683,7 +2697,7 @@ static int ext4_writepage(struct page *page,
        int ret = 0;
        loff_t size;
        unsigned int len;
-       struct buffer_head *page_bufs;
+       struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
 
        trace_ext4_writepage(inode, page);
@@ -2759,7 +2773,11 @@ static int ext4_writepage(struct page *page,
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
                ret = nobh_writepage(page, noalloc_get_block_write, wbc);
-       else
+       else if (page_bufs && buffer_uninit(page_bufs)) {
+               ext4_set_bh_endio(page_bufs, inode);
+               ret = block_write_full_page_endio(page, noalloc_get_block_write,
+                                           wbc, ext4_end_io_buffer_write);
+       } else
                ret = block_write_full_page(page, noalloc_get_block_write,
                                            wbc);
@@ -3347,10 +3365,44 @@ ext4_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+       BUG_ON(!io);
+       if (io->page)
+               put_page(io->page);
+       iput(io->inode);
+       kfree(io);
+}
+
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+       struct buffer_head *head, *bh;
+       unsigned int curr_off = 0;
+
+       if (!page_has_buffers(page))
+               return;
+       head = bh = page_buffers(page);
+       do {
+               if (offset <= curr_off && test_clear_buffer_uninit(bh)
+                                       && bh->b_private) {
+                       ext4_free_io_end(bh->b_private);
+                       bh->b_private = NULL;
+                       bh->b_end_io = NULL;
+               }
+               curr_off = curr_off + bh->b_size;
+               bh = bh->b_this_page;
+       } while (bh != head);
+}
+
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
        journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
+       /*
+        * free any io_end structure allocated for buffers to be discarded
+        */
+       if (ext4_should_dioread_nolock(page->mapping->host))
+               ext4_invalidatepage_free_endio(page, offset);
        /*
         * If it's a full truncate we just forget about the pending dirtying
         */
@@ -3471,10 +3523,11 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create)
 {
-       handle_t *handle = NULL;
+       handle_t *handle = ext4_journal_current_handle();
        int ret = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
        int dio_credits;
+       int started = 0;
 
        ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
                   inode->i_ino, create);
@@ -3485,37 +3538,36 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
         */
        create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
 
-       if (max_blocks > DIO_MAX_BLOCKS)
-               max_blocks = DIO_MAX_BLOCKS;
-       dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-       handle = ext4_journal_start(inode, dio_credits);
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto out;
+       if (!handle) {
+               if (max_blocks > DIO_MAX_BLOCKS)
+                       max_blocks = DIO_MAX_BLOCKS;
+               dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+               handle = ext4_journal_start(inode, dio_credits);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+               started = 1;
        }
        ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
                              create);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
        }
-       ext4_journal_stop(handle);
+       if (started)
+               ext4_journal_stop(handle);
 out:
        return ret;
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-       BUG_ON(!io);
-       iput(io->inode);
-       kfree(io);
-}
-
 static void dump_completed_IO(struct inode * inode)
 {
 #ifdef EXT4_DEBUG
        struct list_head *cur, *before, *after;
        ext4_io_end_t *io, *io0, *io1;
+       unsigned long flags;
 
        if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
                ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
@@ -3523,6 +3575,7 @@ static void dump_completed_IO(struct inode * inode)
        }
 
        ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
        list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
                cur = &io->list;
                before = cur->prev;
@@ -3533,6 +3586,7 @@ static void dump_completed_IO(struct inode * inode)
                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
                            io, inode->i_ino, io0, io1);
        }
+       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
 #endif
 }
@@ -3556,9 +3610,7 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
        if (io->flag != EXT4_IO_UNWRITTEN)
                return ret;
 
-       if (offset + size <= i_size_read(inode))
-               ret = ext4_convert_unwritten_extents(inode, offset, size);
-
+       ret = ext4_convert_unwritten_extents(inode, offset, size);
        if (ret < 0) {
                printk(KERN_EMERG "%s: failed to convert unwritten"
                        "extents to written extents, error is %d"
@@ -3577,18 +3629,25 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
  */
 static void ext4_end_io_work(struct work_struct *work)
 {
        ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
        struct inode *inode = io->inode;
-       int ret = 0;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long flags;
+       int ret;
 
        mutex_lock(&inode->i_mutex);
        ret = ext4_end_io_nolock(io);
-       if (ret >= 0) {
-               if (!list_empty(&io->list))
-                       list_del_init(&io->list);
-               ext4_free_io_end(io);
+       if (ret < 0) {
+               mutex_unlock(&inode->i_mutex);
+               return;
        }
+
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       if (!list_empty(&io->list))
+               list_del_init(&io->list);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
        mutex_unlock(&inode->i_mutex);
+       ext4_free_io_end(io);
 }
 
 /*
@@ -3607,15 +3666,18 @@ static void ext4_end_io_work(struct work_struct *work)
 int flush_completed_IO(struct inode *inode)
 {
        ext4_io_end_t *io;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long flags;
        int ret = 0;
        int ret2 = 0;
 
-       if (list_empty(&EXT4_I(inode)->i_completed_io_list))
+       if (list_empty(&ei->i_completed_io_list))
                return ret;
 
        dump_completed_IO(inode);
-       while (!list_empty(&EXT4_I(inode)->i_completed_io_list)){
-               io = list_entry(EXT4_I(inode)->i_completed_io_list.next,
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       while (!list_empty(&ei->i_completed_io_list)){
+               io = list_entry(ei->i_completed_io_list.next,
                                ext4_io_end_t, list);
                /*
                 * Calling ext4_end_io_nolock() to convert completed
@@ -3631,20 +3693,23 @@ int flush_completed_IO(struct inode *inode)
                 * avoid double converting from both fsync and background work
                 * queue work.
                 */
+               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
                ret = ext4_end_io_nolock(io);
+               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
                if (ret < 0)
                        ret2 = ret;
                else
                        list_del_init(&io->list);
        }
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
        return (ret2 < 0) ? ret2 : 0;
 }
 
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 {
        ext4_io_end_t *io = NULL;
 
-       io = kmalloc(sizeof(*io), GFP_NOFS);
+       io = kmalloc(sizeof(*io), flags);
 
        if (io) {
                igrab(inode);
@@ -3652,7 +3717,7 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
                io->flag = 0;
                io->offset = 0;
                io->size = 0;
-               io->error = 0;
+               io->page = NULL;
                INIT_WORK(&io->work, ext4_end_io_work);
                INIT_LIST_HEAD(&io->list);
        }
@@ -3665,6 +3730,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 {
        ext4_io_end_t *io_end = iocb->private;
        struct workqueue_struct *wq;
+       unsigned long flags;
+       struct ext4_inode_info *ei;
 
        /* if not async direct IO or dio with 0 bytes write, just return */
        if (!io_end || !size)
@@ -3684,17 +3751,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
        io_end->offset = offset;
        io_end->size = size;
+       io_end->flag = EXT4_IO_UNWRITTEN;
        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
        /* queue the work to convert unwritten extents to written */
        queue_work(wq, &io_end->work);
 
        /* Add the io_end to per-inode completed aio dio list*/
-       list_add_tail(&io_end->list,
-                &EXT4_I(io_end->inode)->i_completed_io_list);
+       ei = EXT4_I(io_end->inode);
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       list_add_tail(&io_end->list, &ei->i_completed_io_list);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 
        iocb->private = NULL;
 }
+
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+       ext4_io_end_t *io_end = bh->b_private;
+       struct workqueue_struct *wq;
+       struct inode *inode;
+       unsigned long flags;
+
+       if (!test_clear_buffer_uninit(bh) || !io_end)
+               goto out;
+
+       if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+               printk("sb umounted, discard end_io request for inode %lu\n",
+                       io_end->inode->i_ino);
+               ext4_free_io_end(io_end);
+               goto out;
+       }
+
+       io_end->flag = EXT4_IO_UNWRITTEN;
+       inode = io_end->inode;
+
+       /* Add the io_end to per-inode completed io list*/
+       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+       list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+       wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+       /* queue the work to convert unwritten extents to written */
+       queue_work(wq, &io_end->work);
+out:
+       bh->b_private = NULL;
+       bh->b_end_io = NULL;
+       clear_buffer_uninit(bh);
+       end_buffer_async_write(bh, uptodate);
+}
+
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+       ext4_io_end_t *io_end;
+       struct page *page = bh->b_page;
+       loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+       size_t size = bh->b_size;
+
+retry:
+       io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+       if (!io_end) {
+               if (printk_ratelimit())
+                       printk(KERN_WARNING "%s: allocation fail\n", __func__);
+               schedule();
+               goto retry;
+       }
+
+       io_end->offset = offset;
+       io_end->size = size;
+       /*
+        * We need to hold a reference to the page to make sure it
+        * doesn't get evicted before ext4_end_io_work() has a chance
+        * to convert the extent from written to unwritten.
+        */
+       io_end->page = page;
+       get_page(io_end->page);
+
+       bh->b_private = io_end;
+       bh->b_end_io = ext4_end_io_buffer_write;
+       return 0;
+}
 
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3748,7 +3883,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                iocb->private = NULL;
                EXT4_I(inode)->cur_aio_dio = NULL;
                if (!is_sync_kiocb(iocb)) {
-                       iocb->private = ext4_init_io_end(inode);
+                       iocb->private = ext4_init_io_end(inode, GFP_NOFS);
                        if (!iocb->private)
                                return -ENOMEM;
                        /*
......
@@ -709,6 +709,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_reserved_quota = 0;
 #endif
        INIT_LIST_HEAD(&ei->i_completed_io_list);
+       spin_lock_init(&ei->i_completed_io_lock);
        ei->cur_aio_dio = NULL;
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
@@ -926,6 +927,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (test_opt(sb, NOLOAD))
                seq_puts(seq, ",norecovery");
 
+       if (test_opt(sb, DIOREAD_NOLOCK))
+               seq_puts(seq, ",dioread_nolock");
+
        ext4_show_quota_options(seq, sb);
 
        return 0;
@@ -1109,6 +1113,7 @@ enum {
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
        Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
+       Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
 };
@@ -1176,6 +1181,8 @@ static const match_table_t tokens = {
        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
        {Opt_auto_da_alloc, "auto_da_alloc"},
        {Opt_noauto_da_alloc, "noauto_da_alloc"},
+       {Opt_dioread_nolock, "dioread_nolock"},
+       {Opt_dioread_lock, "dioread_lock"},
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
        {Opt_err, NULL},
@@ -1640,6 +1647,12 @@ static int parse_options(char *options, struct super_block *sb,
                case Opt_nodiscard:
                        clear_opt(sbi->s_mount_opt, DISCARD);
                        break;
+               case Opt_dioread_nolock:
+                       set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                       break;
+               case Opt_dioread_lock:
+                       clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+                       break;
                default:
                        ext4_msg(sb, KERN_ERR,
                               "Unrecognized mount option \"%s\" "
@@ -2795,7 +2808,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                    EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
                        ext4_msg(sb, KERN_ERR, "required journal recovery "
                               "suppressed and not mounted read-only");
-                       goto failed_mount4;
+                       goto failed_mount_wq;
                } else {
                        clear_opt(sbi->s_mount_opt, DATA_FLAGS);
                        set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2808,7 +2821,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                               JBD2_FEATURE_INCOMPAT_64BIT)) {
                        ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
-                       goto failed_mount4;
+                       goto failed_mount_wq;
                }
 
                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2847,7 +2860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                            (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
                                ext4_msg(sb, KERN_ERR, "Journal does not support "
                                       "requested data journaling mode");
-                               goto failed_mount4;
+                               goto failed_mount_wq;
                        }
                default:
                        break;
@@ -2855,13 +2868,17 @@
                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
        if (test_opt(sb, NOBH)) {
                if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
                        ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
                                "its supported only with writeback mode");
                        clear_opt(sbi->s_mount_opt, NOBH);
                }
+               if (test_opt(sb, DIOREAD_NOLOCK)) {
+                       ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
+                               "not supported with nobh mode");
+                       goto failed_mount_wq;
+               }
        }
        EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
        if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2926,6 +2943,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                         "requested data journaling mode");
                clear_opt(sbi->s_mount_opt, DELALLOC);
        }
+       if (test_opt(sb, DIOREAD_NOLOCK)) {
+               if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+                       ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+                               "option - requested data journaling mode");
+                       clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+               }
+               if (sb->s_blocksize < PAGE_SIZE) {
+                       ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+                               "option - block size is too small");
+                       clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+               }
+       }
 
        err = ext4_setup_system_zone(sb);
        if (err) {
......
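
Usage note (not part of the patch): the new behaviour is opt-in via the dioread_nolock mount option added above, e.g. "mount -o dioread_nolock /dev/sdb1 /mnt" with an illustrative device and mount point. The same thing from C, as a minimal sketch using mount(2) (requires root; device and mount point are placeholders):

/* Illustrative only: pass the new ext4 mount option via mount(2). */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (mount("/dev/sdb1", "/mnt/test", "ext4", 0, "dioread_nolock")) {
                perror("mount");
                return 1;
        }
        return 0;
}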