Commit 678aaf48 authored by Jan Kara, committed by Theodore Ts'o

ext4: Use new framework for data=ordered mode in JBD2

This patch makes ext4 use inode-based implementation of data=ordered mode
in JBD2. It allows us to unify some data=ordered and data=writeback paths
(especially writepage since we don't have to start a transaction anymore)
and remove some buffer walking.

Updated fix from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
to fix file system hang due to corrupt jinode values.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
parent c851ed54
...@@ -150,6 +150,7 @@ struct ext4_inode_info { ...@@ -150,6 +150,7 @@ struct ext4_inode_info {
*/ */
struct rw_semaphore i_data_sem; struct rw_semaphore i_data_sem;
struct inode vfs_inode; struct inode vfs_inode;
struct jbd2_inode jinode;
unsigned long i_ext_generation; unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent; struct ext4_ext_cache i_cached_extent;
......
...@@ -154,8 +154,6 @@ int __ext4_journal_dirty_metadata(const char *where, ...@@ -154,8 +154,6 @@ int __ext4_journal_dirty_metadata(const char *where,
#define ext4_journal_forget(handle, bh) \ #define ext4_journal_forget(handle, bh) \
__ext4_journal_forget(__func__, (handle), (bh)) __ext4_journal_forget(__func__, (handle), (bh))
int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle); int __ext4_journal_stop(const char *where, handle_t *handle);
...@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal) ...@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return jbd2_journal_force_commit(journal); return jbd2_journal_force_commit(journal);
} }
static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
{
return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
}
/* super.c */ /* super.c */
int ext4_force_commit(struct super_block *sb); int ext4_force_commit(struct super_block *sb);
......
...@@ -39,6 +39,13 @@ ...@@ -39,6 +39,13 @@
#include "xattr.h" #include "xattr.h"
#include "acl.h" #include "acl.h"
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
new_size);
}
/* /*
* Test whether an inode is a fast symlink. * Test whether an inode is a fast symlink.
*/ */
...@@ -181,6 +188,8 @@ void ext4_delete_inode (struct inode * inode) ...@@ -181,6 +188,8 @@ void ext4_delete_inode (struct inode * inode)
{ {
handle_t *handle; handle_t *handle;
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode)) if (is_bad_inode(inode))
...@@ -1273,15 +1282,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ...@@ -1273,15 +1282,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
return ret; return ret;
} }
int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
int err = jbd2_journal_dirty_data(handle, bh);
if (err)
ext4_journal_abort_handle(__func__, __func__,
bh, handle, err);
return err;
}
/* For write_end() in data=journal mode */ /* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh) static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{ {
...@@ -1311,8 +1311,7 @@ static int ext4_ordered_write_end(struct file *file, ...@@ -1311,8 +1311,7 @@ static int ext4_ordered_write_end(struct file *file,
from = pos & (PAGE_CACHE_SIZE - 1); from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len; to = from + len;
ret = walk_page_buffers(handle, page_buffers(page), ret = ext4_jbd2_file_inode(handle, inode);
from, to, NULL, ext4_journal_dirty_data);
if (ret == 0) { if (ret == 0) {
/* /*
...@@ -1472,25 +1471,22 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) ...@@ -1472,25 +1471,22 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0; return 0;
} }
static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
if (buffer_mapped(bh))
return ext4_journal_dirty_data(handle, bh);
return 0;
}
static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
{ {
return !buffer_mapped(bh) || buffer_delay(bh); return !buffer_mapped(bh) || buffer_delay(bh);
} }
/* /*
* Note that we don't need to start a transaction unless we're journaling * Note that we don't need to start a transaction unless we're journaling data
* data because we should have holes filled from ext4_page_mkwrite(). If * because we should have holes filled from ext4_page_mkwrite(). We don't even
* we are journaling data, we cannot start transaction directly because * need to file the inode to the transaction's list in ordered mode because if
* transaction start ranks above page lock so we have to do some magic... * we are writing back data added by write(), the inode is already there and if
* we are writing back data modified via mmap(), no one guarantees in which
* transaction the data will hit the disk. In case we are journaling data, we
* cannot start transaction directly because transaction start ranks above page
* lock so we have to do some magic.
* *
* In all journalling modes block_write_full_page() will start the I/O. * In all journaling modes block_write_full_page() will start the I/O.
* *
* Problem: * Problem:
* *
...@@ -1533,86 +1529,7 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) ...@@ -1533,86 +1529,7 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
* us. * us.
* *
*/ */
static int __ext4_ordered_writepage(struct page *page, static int __ext4_normal_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
if (!page_has_buffers(page)) {
create_empty_buffers(page, inode->i_sb->s_blocksize,
(1 << BH_Dirty)|(1 << BH_Uptodate));
}
page_bufs = page_buffers(page);
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bget_one);
ret = block_write_full_page(page, ext4_get_block, wbc);
/*
* The page can become unlocked at any point now, and
* truncate can then come in and change things. So we
* can't touch *page from now on. But *page_bufs is
* safe due to elevated refcount.
*/
/*
* And attach them to the current transaction. But only if
* block_write_full_page() succeeded. Otherwise they are unmapped,
* and generally junk.
*/
if (ret == 0) {
handle = ext4_journal_start(inode,
ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_put;
}
ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, jbd2_journal_dirty_data_fn);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
}
out_put:
walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
bput_one);
return ret;
}
static int ext4_ordered_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
loff_t size = i_size_read(inode);
loff_t len;
J_ASSERT(PageLocked(page));
J_ASSERT(page_has_buffers(page));
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
ext4_bh_unmapped_or_delay));
/*
* We give up here if we're reentered, because it might be for a
* different filesystem.
*/
if (!ext4_journal_current_handle())
return __ext4_ordered_writepage(page, wbc);
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
static int __ext4_writeback_writepage(struct page *page,
struct writeback_control *wbc) struct writeback_control *wbc)
{ {
struct inode *inode = page->mapping->host; struct inode *inode = page->mapping->host;
...@@ -1624,7 +1541,7 @@ static int __ext4_writeback_writepage(struct page *page, ...@@ -1624,7 +1541,7 @@ static int __ext4_writeback_writepage(struct page *page,
} }
static int ext4_writeback_writepage(struct page *page, static int ext4_normal_writepage(struct page *page,
struct writeback_control *wbc) struct writeback_control *wbc)
{ {
struct inode *inode = page->mapping->host; struct inode *inode = page->mapping->host;
...@@ -1641,7 +1558,7 @@ static int ext4_writeback_writepage(struct page *page, ...@@ -1641,7 +1558,7 @@ static int ext4_writeback_writepage(struct page *page,
ext4_bh_unmapped_or_delay)); ext4_bh_unmapped_or_delay));
if (!ext4_journal_current_handle()) if (!ext4_journal_current_handle())
return __ext4_writeback_writepage(page, wbc); return __ext4_normal_writepage(page, wbc);
redirty_page_for_writepage(wbc, page); redirty_page_for_writepage(wbc, page);
unlock_page(page); unlock_page(page);
...@@ -1877,7 +1794,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) ...@@ -1877,7 +1794,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_ordered_aops = { static const struct address_space_operations ext4_ordered_aops = {
.readpage = ext4_readpage, .readpage = ext4_readpage,
.readpages = ext4_readpages, .readpages = ext4_readpages,
.writepage = ext4_ordered_writepage, .writepage = ext4_normal_writepage,
.sync_page = block_sync_page, .sync_page = block_sync_page,
.write_begin = ext4_write_begin, .write_begin = ext4_write_begin,
.write_end = ext4_ordered_write_end, .write_end = ext4_ordered_write_end,
...@@ -1891,7 +1808,7 @@ static const struct address_space_operations ext4_ordered_aops = { ...@@ -1891,7 +1808,7 @@ static const struct address_space_operations ext4_ordered_aops = {
static const struct address_space_operations ext4_writeback_aops = { static const struct address_space_operations ext4_writeback_aops = {
.readpage = ext4_readpage, .readpage = ext4_readpage,
.readpages = ext4_readpages, .readpages = ext4_readpages,
.writepage = ext4_writeback_writepage, .writepage = ext4_normal_writepage,
.sync_page = block_sync_page, .sync_page = block_sync_page,
.write_begin = ext4_write_begin, .write_begin = ext4_write_begin,
.write_end = ext4_writeback_write_end, .write_end = ext4_writeback_write_end,
...@@ -2019,7 +1936,7 @@ int ext4_block_truncate_page(handle_t *handle, ...@@ -2019,7 +1936,7 @@ int ext4_block_truncate_page(handle_t *handle,
err = ext4_journal_dirty_metadata(handle, bh); err = ext4_journal_dirty_metadata(handle, bh);
} else { } else {
if (ext4_should_order_data(inode)) if (ext4_should_order_data(inode))
err = ext4_journal_dirty_data(handle, bh); err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh); mark_buffer_dirty(bh);
} }
...@@ -3171,7 +3088,14 @@ int ext4_write_inode(struct inode *inode, int wait) ...@@ -3171,7 +3088,14 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will * be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.) * leave these blocks visible to the user.)
* *
* Called with inode->sem down. * Another thing we have to assure is that if we are in ordered mode
* and inode is still attached to the committing transaction, we must
* start writeout of all the dirty pages which are being truncated.
* This way we are sure that all the data written in the previous
* transaction are already on disk (truncate waits for pages under
* writeback).
*
* Called with inode->i_mutex down.
*/ */
int ext4_setattr(struct dentry *dentry, struct iattr *attr) int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{ {
...@@ -3237,6 +3161,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -3237,6 +3161,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!error) if (!error)
error = rc; error = rc;
ext4_journal_stop(handle); ext4_journal_stop(handle);
if (ext4_should_order_data(inode)) {
error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
if (error) {
/* Do as much error cleanup as possible */
handle = ext4_journal_start(inode, 3);
if (IS_ERR(handle)) {
ext4_orphan_del(NULL, inode);
goto err_out;
}
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
goto err_out;
}
}
} }
rc = inode_setattr(inode, attr); rc = inode_setattr(inode, attr);
......
...@@ -573,6 +573,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ...@@ -573,6 +573,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list); INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock); spin_lock_init(&ei->i_prealloc_lock);
jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
return &ei->vfs_inode; return &ei->vfs_inode;
} }
...@@ -637,6 +638,8 @@ static void ext4_clear_inode(struct inode *inode) ...@@ -637,6 +638,8 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_block_alloc_info = NULL; EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv)) if (unlikely(rsv))
kfree(rsv); kfree(rsv);
jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
&EXT4_I(inode)->jinode);
} }
static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
...@@ -3378,7 +3381,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ...@@ -3378,7 +3381,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_journal_dirty_metadata(handle, bh); err = ext4_journal_dirty_metadata(handle, bh);
else { else {
/* Always do at least ordered writes for quotas */ /* Always do at least ordered writes for quotas */
err = ext4_journal_dirty_data(handle, bh); err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh); mark_buffer_dirty(bh);
} }
brelse(bh); brelse(bh);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment