Commit cc3e1bea authored by Theodore Ts'o

ext4, jbd2: Add barriers for file systems with external journals

This is a bit complicated because we are trying to optimize when we
send barriers to the fs data disk.  We could just throw in an extra
barrier to the data disk whenever we send a barrier to the journal
disk, but that's not always strictly necessary.

We only need to send a barrier during a commit when there are data
blocks which must be written out due to an inode written in
ordered mode, or if fsync() depends on the commit to force data blocks
to disk.  Finally, before we drop transactions from the beginning of
the journal during a checkpoint operation, we need to guarantee that
any blocks that were flushed out to the data disk are firmly on the
rust platter before we drop the transaction from the journal.

Thanks to Oleg Drokin for pointing out this flaw in ext3/ext4.
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
parent 034fb4c9
...@@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) ...@@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
return ext4_force_commit(inode->i_sb); return ext4_force_commit(inode->i_sb);
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
if (jbd2_log_start_commit(journal, commit_tid)) if (jbd2_log_start_commit(journal, commit_tid)) {
/*
* When the journal is on a different device than the
* fs data disk, we need to issue the barrier in
* writeback mode. (In ordered mode, the jbd2 layer
* will take care of issuing the barrier. In
* data=journal, all of the data blocks are written to
* the journal device.)
*/
if (ext4_should_writeback_data(inode) &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
jbd2_log_wait_commit(journal, commit_tid); jbd2_log_wait_commit(journal, commit_tid);
else if (journal->j_flags & JBD2_BARRIER) } else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, NULL); blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret; return ret;
} }
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <linux/jbd2.h> #include <linux/jbd2.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/blkdev.h>
#include <trace/events/jbd2.h> #include <trace/events/jbd2.h>
/* /*
...@@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal) ...@@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
journal->j_tail_sequence = first_tid; journal->j_tail_sequence = first_tid;
journal->j_tail = blocknr; journal->j_tail = blocknr;
spin_unlock(&journal->j_state_lock); spin_unlock(&journal->j_state_lock);
/*
* If there is an external journal, we need to make sure that
* any data blocks that were recently written out --- perhaps
* by jbd2_log_do_checkpoint() --- are flushed out before we
* drop the transactions from the external journal. It's
unlikely this will be necessary, especially with an
* appropriately sized journal, but we need this to guarantee
* correctness. Fortunately jbd2_cleanup_journal_tail()
* doesn't get called all that often.
*/
if ((journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(journal->j_fs_dev, NULL);
if (!(journal->j_flags & JBD2_ABORT)) if (!(journal->j_flags & JBD2_ABORT))
jbd2_journal_update_superblock(journal, 1); jbd2_journal_update_superblock(journal, 1);
return 0; return 0;
......
...@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal, ...@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal,
ret = err; ret = err;
spin_lock(&journal->j_list_lock); spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction); J_ASSERT(jinode->i_transaction == commit_transaction);
commit_transaction->t_flushed_data_blocks = 1;
jinode->i_flags &= ~JI_COMMIT_RUNNING; jinode->i_flags &= ~JI_COMMIT_RUNNING;
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
} }
...@@ -708,8 +709,17 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -708,8 +709,17 @@ void jbd2_journal_commit_transaction(journal_t *journal)
} }
} }
/* Done it all: now write the commit record asynchronously. */ /*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
* the commit record
*/
if (commit_transaction->t_flushed_data_blocks &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(journal->j_fs_dev, NULL);
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal, if (JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction, err = journal_submit_commit_record(journal, commit_transaction,
...@@ -720,13 +730,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -720,13 +730,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blkdev_issue_flush(journal->j_dev, NULL); blkdev_issue_flush(journal->j_dev, NULL);
} }
/*
* This is the right place to wait for data buffers both for ASYNC
* and !ASYNC commit. If commit is ASYNC, we need to wait only after
* the commit block went to disk (which happens above). If commit is
* SYNC, we need to wait for data buffers before we start writing
* commit block, which happens below in such setting.
*/
err = journal_finish_inode_data_buffers(journal, commit_transaction); err = journal_finish_inode_data_buffers(journal, commit_transaction);
if (err) { if (err) {
printk(KERN_WARNING printk(KERN_WARNING
......
...@@ -653,6 +653,7 @@ struct transaction_s ...@@ -653,6 +653,7 @@ struct transaction_s
* waiting for it to finish. * waiting for it to finish.
*/ */
unsigned int t_synchronous_commit:1; unsigned int t_synchronous_commit:1;
unsigned int t_flushed_data_blocks:1;
/* /*
* For use by the filesystem to store fs-specific data * For use by the filesystem to store fs-specific data
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment