ext4: fix races between changing inode journal mode and ext4_writepages

In ext4, there is a race condition between changing inode journal mode and ext4_writepages(). While ext4_writepages() is executed on a non-journalled mode inode, the inode's journal mode could be enabled by ioctl() and then, some pages dirtied after switching the journal mode will be still exposed to ext4_writepages() in non-journaled mode. To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan Kara's suggestion because we don't want to waste ext4_inode_info's space for this extra rare case. Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Reviewed-by: Jan Kara <jack@suse.cz>

ext4: fix races between changing inode journal mode and ext4_writepages
In ext4, there is a race condition between changing inode journal mode and ext4_writepages(). While ext4_writepages() is executed on a non-journalled mode inode, the inode's journal mode could be enabled by ioctl() and then, some pages dirtied after switching the journal mode will be still exposed to ext4_writepages() in non-journaled mode. To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan Kara's suggestion because we don't want to waste ext4_inode_info's space for this extra rare case. Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Reviewed-by: Jan Kara <jack@suse.cz>
c8585c6f · Daeho Jeong · Theodore Ts'o · 4c546592 · c8585c6f · c8585c6f
Commit c8585c6f authored Apr 25, 2016 by Daeho Jeong Committed by Theodore Ts'o Apr 25, 2016
Showing with 21 additions and 3 deletions

fs/ext4/ext4.h fs/ext4/ext4.h +4 -0

fs/ext4/inode.c fs/ext4/inode.c +12 -3

fs/ext4/super.c fs/ext4/super.c +4 -0

kernel/locking/percpu-rwsem.c kernel/locking/percpu-rwsem.c +1 -0

No files found.
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,6 +33,7 @@
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -1508,6 +1509,9 @@ struct ext4_sb_info {
 	struct ratelimit_state s_err_ratelimit_state;
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
+
+	/* Barrier between changing inodes' journal flags and writepages ops. */
+	struct percpu_rw_semaphore s_journal_flag_rwsem;
 };

 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)

--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2612,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
 	struct blk_plug plug;
 	bool give_up_on_write = false;

+	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);

-	if (dax_mapping(mapping))
-		return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+	if (dax_mapping(mapping)) {
+		ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
 						  wbc);
+		goto out_writepages;
+	}

 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2786,6 +2789,7 @@ static int ext4_writepages(struct address_space *mapping,
 out_writepages:
 	trace_ext4_writepages_result(inode, wbc, ret,
 				     nr_to_write - wbc->nr_to_write);
+	percpu_up_read(&sbi->s_journal_flag_rwsem);
 	return ret;
 }

@@ -5436,6 +5440,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	journal_t *journal;
 	handle_t *handle;
 	int err;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

 	/*
 	 * We have to be very careful here: changing a data block's
@@ -5475,6 +5480,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		}
 	}

+	percpu_down_write(&sbi->s_journal_flag_rwsem);
 	jbd2_journal_lock_updates(journal);

 	/*
@@ -5491,6 +5497,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		err = jbd2_journal_flush(journal);
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
+			percpu_up_write(&sbi->s_journal_flag_rwsem);
 			ext4_inode_resume_unlocked_dio(inode);
 			return err;
 		}
@@ -5499,6 +5506,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	ext4_set_aops(inode);

 	jbd2_journal_unlock_updates(journal);
+	percpu_up_write(&sbi->s_journal_flag_rwsem);
+
 	if (val)
 		up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);

--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3930,6 +3931,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (!err)
 		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
 					  GFP_KERNEL);
+	if (!err)
+		err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount6;

--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
 	free_percpu(brw->fast_read_ctr);
 	brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);

 /*
 * This is the fast-path for down_read/up_read. If it succeeds we rely