Commit 149b3060 authored by Linus Torvalds

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Mostly performance and bug fixes, plus some cleanups.  The one new
  feature this merge window is a new ioctl EXT4_IOC_SWAP_BOOT which
  allows installation of a hidden inode designed for boot loaders."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (50 commits)
  ext4: fix type-widening bug in inode table readahead code
  ext4: add check for inodes_count overflow in new resize ioctl
  ext4: fix Kconfig documentation for CONFIG_EXT4_DEBUG
  ext4: fix online resizing for ext3-compat file systems
  jbd2: trace when lock_buffer in do_get_write_access takes a long time
  ext4: mark metadata blocks using bh flags
  buffer: add BH_Prio and BH_Meta flags
  ext4: mark all metadata I/O with REQ_META
  ext4: fix readdir error in case inline_data+^dir_index.
  ext4: fix readdir error in the case of inline_data+dir_index
  jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset
  ext4: mext_insert_extents should update extent block checksum
  ext4: move quota initialization out of inode allocation transaction
  ext4: reserve xattr index for Rich ACL support
  jbd2: reduce journal_head size
  ext4: clear buffer_uninit flag when submitting IO
  ext4: use io_end for multiple bios
  ext4: make ext4_bio_write_page() use BH_Async_Write flags
  ext4: Use kstrtoul() instead of parse_strtoul()
  ext4: defragmentation code cleanup
  ...
parents b0ca4d01 0d606e2c
@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
 session_write_kbytes    This file is read-only and shows the number of
                         kilobytes of data that have been written to this
                         filesystem since it was mounted.
+reserved_clusters       This is a read-write file and contains the number
+                        of reserved clusters in the file system, which are
+                        used in specific situations to avoid costly
+                        zeroout, unexpected ENOSPC, or possible data
+                        loss. The default is 2% of the file system or
+                        4096 clusters, whichever is smaller; this can be
+                        changed, but it can never exceed the number of
+                        clusters in the file system. If there is not
+                        enough space for the reserved space when mounting
+                        the file system, mount will _not_ fail.
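A minimal userspace sketch of reading and tuning this knob (the device name sda1 is hypothetical; substitute whatever appears under /sys/fs/ext4/, and note that writing requires root):

	#include <stdio.h>

	/* Hypothetical sysfs path; substitute the actual <devname>. */
	#define RESV_PATH "/sys/fs/ext4/sda1/reserved_clusters"

	int main(void)
	{
		unsigned long long resv;
		FILE *f = fopen(RESV_PATH, "r+");

		if (!f || fscanf(f, "%llu", &resv) != 1) {
			perror(RESV_PATH);
			return 1;
		}
		printf("reserved_clusters = %llu\n", resv);
		rewind(f);
		fprintf(f, "%llu\n", resv * 2);	/* e.g. double the pool */
		return fclose(f) ? 1 : 0;
	}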
..............................................................................
Ioctls
@@ -587,6 +598,16 @@ Table of Ext4 specific ioctls
                         bitmaps and inode table, the userspace tool thus
                         just passes the new number of blocks.
 
+EXT4_IOC_SWAP_BOOT      Swap i_data and associated attributes
+                        (like i_blocks, i_size, i_flags, ...) from
+                        the specified inode with inode
+                        EXT4_BOOT_LOADER_INO (#5). This is typically
+                        used to store a boot loader in a secure part of
+                        the filesystem, where it can't be changed by a
+                        normal user by accident.
+                        The data blocks of the previous boot loader
+                        will be associated with the given inode.
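A hedged userspace sketch of invoking the ioctl. The ioctl number mirrors the definition in fs/ext4/ext4.h (it is not exported through a uapi header at this point), the caller must own the inode and have CAP_SYS_ADMIN, and the descriptor must be opened for writing:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>

	/* Mirrors fs/ext4/ext4.h; no uapi header provides this here. */
	#define EXT4_IOC_SWAP_BOOT	_IO('f', 17)

	int main(int argc, char **argv)
	{
		int fd;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <bootloader-image-file>\n",
				argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDWR);	/* FMODE_WRITE is required */
		if (fd < 0 || ioctl(fd, EXT4_IOC_SWAP_BOOT) < 0) {
			perror("EXT4_IOC_SWAP_BOOT");
			return 1;
		}
		close(fd);
		return 0;
	}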
..............................................................................
References
......
@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	/* Take care of bh's that straddle the end of the device */
 	guard_bh_eod(rw, bio, bh);
 
+	if (buffer_meta(bh))
+		rw |= REQ_META;
+	if (buffer_prio(bh))
+		rw |= REQ_PRIO;
+
 	bio_get(bio);
 	submit_bio(rw, bio);
......
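The new flags are consumed transparently at submission time, so a filesystem only has to tag the buffer before handing it to submit_bh(); a minimal sketch, assuming a buffer_head obtained and filled elsewhere (the helper name is hypothetical):

	/* Sketch: tag a metadata buffer; _submit_bh() turns the bh flags
	 * into REQ_META | REQ_PRIO on the resulting bio. */
	static void write_meta_buffer_sync(struct buffer_head *bh)
	{
		set_buffer_meta(bh);	/* BH_Meta -> REQ_META */
		set_buffer_prio(bh);	/* BH_Prio -> REQ_PRIO */
		lock_buffer(bh);
		bh->b_end_io = end_buffer_write_sync;
		get_bh(bh);
		submit_bh(WRITE_SYNC, bh);
		wait_on_buffer(bh);
	}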
@@ -71,4 +71,5 @@ config EXT4_DEBUG
 	  Enables run-time debugging support for the ext4 filesystem.
 
 	  If you select Y here, then you will be able to turn on debugging
-	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+	  with a command such as:
+		echo 1 > /sys/module/ext4/parameters/mballoc_debug
@@ -29,6 +29,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
  * balloc.c contains the blocks allocation and deallocation routines
  */
 
+/*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+				   ext4_fsblk_t block)
+{
+	ext4_group_t group;
+
+	if (test_opt2(sb, STD_GROUP_SIZE))
+		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+			 block) >>
+			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+	else
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+	return group;
+}
+
 /*
  * Calculate the block group number and offset into the block/cluster
  * allocation bitmap, given a block number
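A worked instance of the shift in the STD_GROUP_SIZE fast path, under the assumption of 4 KiB blocks and no bigalloc (so EXT4_CLUSTER_BITS is 0):

	EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3 = 12 + 0 + 3 = 15
	2^15 = 32768 blocks per group (the standard "8 * blocksize" layout)
	group = (s_first_data_block + block) >> 15

The +3 accounts for the fact that one bitmap block of 8 * blocksize bits tracks exactly one group, which is why the division degenerates to a plain shift when groups have the standard size.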
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 }
 
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
-			       ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+				      ext4_fsblk_t block,
+				      ext4_group_t block_group)
 {
 	ext4_group_t actual_group;
-	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
-	if (actual_group == block_group)
-		return 1;
-	return 0;
+
+	actual_group = ext4_get_group_number(sb, block);
+	return (actual_group == block_group) ? 1 : 0;
 }
 
 /* Return the number of clusters used for file system metadata; this
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	trace_ext4_read_block_bitmap_load(sb, block_group);
 	bh->b_end_io = ext4_end_bitmap_read;
 	get_bh(bh);
-	submit_bh(READ, bh);
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
 	return bh;
 verify:
 	ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 				  s64 nclusters, unsigned int flags)
 {
-	s64 free_clusters, dirty_clusters, root_clusters;
+	s64 free_clusters, dirty_clusters, rsv, resv_clusters;
 	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
 	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
 	free_clusters  = percpu_counter_read_positive(fcc);
 	dirty_clusters = percpu_counter_read_positive(dcc);
+	resv_clusters = atomic64_read(&sbi->s_resv_clusters);
 
 	/*
 	 * r_blocks_count should always be a multiple of the cluster ratio so
	 * we are safe to do a plain bit shift only.
 	 */
-	root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+	rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+	      resv_clusters;
 
-	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+	if (free_clusters - (nclusters + rsv + dirty_clusters) <
 					EXT4_FREECLUSTERS_WATERMARK) {
 		free_clusters  = percpu_counter_sum_positive(fcc);
 		dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -499,7 +522,7 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 	/* Check whether we have space after accounting for current
 	 * dirty clusters & root reserved clusters.
 	 */
-	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+	if (free_clusters >= (rsv + nclusters + dirty_clusters))
 		return 1;
 
 	/* Hm, nope. Are (enough) root reserved clusters available? */
@@ -508,6 +531,12 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 	    capable(CAP_SYS_RESOURCE) ||
 	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
+		if (free_clusters >= (nclusters + dirty_clusters +
+				      resv_clusters))
+			return 1;
+	}
+	/* No free blocks. Let's see if we can dip into reserved pool */
+	if (flags & EXT4_MB_USE_RESERVED) {
 		if (free_clusters >= (nclusters + dirty_clusters))
 			return 1;
 	}
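Read together, the patched checks apply three successively weaker thresholds — a hedged summary (EXT4_MB_USE_RESERVED is presumably set only for allocations that must not fail, such as writeback-time extent conversion):

	/*
	 * 1. ordinary allocation:
	 *      free >= nclusters + dirty + root_reserved + s_resv_clusters
	 * 2. root, CAP_SYS_RESOURCE, or EXT4_MB_USE_ROOT_BLOCKS:
	 *      free >= nclusters + dirty + s_resv_clusters
	 * 3. EXT4_MB_USE_RESERVED (may dip into the reserved pool):
	 *      free >= nclusters + dirty
	 */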
......
@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
 	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 		     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
 	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
-	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+	     ext4_has_inline_data(inode)))
 		return 1;
 
 	return 0;
@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
 	int ret = 0;
 	int dir_has_error = 0;
 
-	if (ext4_has_inline_data(inode)) {
-		int has_inline_data = 1;
-		ret = ext4_read_inline_dir(filp, dirent, filldir,
-					   &has_inline_data);
-		if (has_inline_data)
-			return ret;
-	}
-
 	if (is_dx_dir(inode)) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
 		ext4_clear_inode_flag(file_inode(filp),
 				      EXT4_INODE_INDEX);
 	}
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+		ret = ext4_read_inline_dir(filp, dirent, filldir,
+					   &has_inline_data);
+		if (has_inline_data)
+			return ret;
+	}
+
 	stored = 0;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
......
[collapsed diff not shown]
@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 				     0xffff);
 }
 
+#define ext4_ext_dirty(handle, inode, path) \
+		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path);
+
 #endif /* _EXT4_EXTENTS */
@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
 {
 	journal_t *journal;
 
+	might_sleep();
+
 	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_get_write_access(handle, bh);
 		if (err)
@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
+	set_buffer_meta(bh);
+	set_buffer_prio(bh);
+
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
 		if (err) {
......
@@ -29,11 +29,13 @@
  * block to complete the transaction.
  *
  * For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
 	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)	\
-	 ? 27U : 8U)
+	 ? 20U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
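A plausible reading of the new 20U figure, following the rewritten comment: up to 5 tree levels plus the data block itself make 6 newly touched blocks, each charging one credit for the block plus one for its bitmap and one for its group descriptor, and the tree root lives in the inode:

	6 blocks * (block + bitmap + group descriptor) = 6 * 3 = 18
	18 + inode + superblock                        = 20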
@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
  * ext4_journal_callback_del: delete a registered callback
  * @handle: active journal transaction handle on which callback was registered
  * @jce: registered journal callback entry to unregister
+ * Return true if object was successfully removed
  */
-static inline void ext4_journal_callback_del(handle_t *handle,
-					     struct ext4_journal_cb_entry *jce)
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
 {
+	bool deleted;
 	struct ext4_sb_info *sbi =
 			EXT4_SB(handle->h_transaction->t_journal->j_private);
 
 	spin_lock(&sbi->s_md_lock);
+	deleted = !list_empty(&jce->jce_list);
 	list_del_init(&jce->jce_list);
 	spin_unlock(&sbi->s_md_lock);
+	return deleted;
 }
 
 int
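The boolean return exists so that callers can resolve the race between explicitly cancelling a callback and the journal commit running it; a hedged sketch of the caller pattern this enables (the entry type and cache are hypothetical, not part of this diff):

	if (ext4_journal_callback_try_del(handle, &entry->jce))
		kmem_cache_free(entry_cachep, entry);	/* we unlinked it */
	/* otherwise the commit callback owns (or already did) the cleanup */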
......
[collapsed diff not shown]
@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
 		needs_barrier = true;
-	jbd2_log_start_commit(journal, commit_tid);
-	ret = jbd2_log_wait_commit(journal, commit_tid);
+	ret = jbd2_complete_transaction(journal, commit_tid);
 	if (needs_barrier) {
 		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 		if (!ret)
......
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	trace_ext4_load_inode_bitmap(sb, block_group);
 	bh->b_end_io = ext4_end_bitmap_read;
 	get_bh(bh);
-	submit_bh(READ, bh);
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
 	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	ei = EXT4_I(inode);
 	sbi = EXT4_SB(sb);
 
+	/*
+	 * Initialize owners and quota early so that we don't have to account
+	 * for quota initialization worst case in standard inode creating
+	 * transaction
+	 */
+	if (owner) {
+		inode->i_mode = mode;
+		i_uid_write(inode, owner[0]);
+		i_gid_write(inode, owner[1]);
+	} else if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
+	} else
+		inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
+
 	if (!goal)
 		goal = sbi->s_inode_goal;
@@ -697,7 +714,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
 		if (!gdp)
-			goto fail;
+			goto out;
 
 		/*
 		 * Check free inodes count before loading bitmap.
@@ -711,7 +728,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		brelse(inode_bitmap_bh);
 		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
 		if (!inode_bitmap_bh)
-			goto fail;
+			goto out;
 
 repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
@@ -733,13 +750,16 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 							 handle_type, nblocks);
 			if (IS_ERR(handle)) {
 				err = PTR_ERR(handle);
-				goto fail;
+				ext4_std_error(sb, err);
+				goto out;
 			}
 		}
 		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
-		if (err)
-			goto fail;
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
 		ext4_lock_group(sb, group);
 		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
 		ext4_unlock_group(sb, group);
@@ -755,8 +775,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 got:
 	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (ext4_has_group_desc_csum(sb) &&
@@ -768,7 +790,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 		if (err) {
 			brelse(block_bitmap_bh);
-			goto fail;
+			ext4_std_error(sb, err);
+			goto out;
 		}
 
 		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
@@ -787,14 +810,18 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		ext4_unlock_group(sb, group);
 		brelse(block_bitmap_bh);
 
-		if (err)
-			goto fail;
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
 	}
 
 	BUFFER_TRACE(group_desc_bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, group_desc_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	/* Update the relevant bg descriptor fields */
 	if (ext4_has_group_desc_csum(sb)) {
@@ -840,8 +867,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
@@ -851,16 +880,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		flex_group = ext4_flex_group(sbi, group);
 		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
-	if (owner) {
-		inode->i_mode = mode;
-		i_uid_write(inode, owner[0]);
-		i_gid_write(inode, owner[1]);
-	} else if (test_opt(sb, GRPID)) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = dir->i_gid;
-	} else
-		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
@@ -889,7 +908,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		 * twice.
 		 */
 		err = -EIO;
-		goto fail;
+		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+			   inode->i_ino);
+		goto out;
 	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
@@ -899,7 +920,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
 		__u32 csum;
-		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 		__le32 inum = cpu_to_le32(inode->i_ino);
 		__le32 gen = cpu_to_le32(inode->i_generation);
 		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
@@ -918,7 +938,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
 
 	ret = inode;
-	dquot_initialize(inode);
 	err = dquot_alloc_inode(inode);
 	if (err)
 		goto fail_drop;
@@ -952,24 +971,17 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
 	trace_ext4_allocate_inode(inode, dir, mode);
-	goto really_out;
-fail:
-	ext4_std_error(sb, err);
-out:
-	iput(inode);
-	ret = ERR_PTR(err);
-really_out:
 	brelse(inode_bitmap_bh);
 	return ret;
 
 fail_free_drop:
 	dquot_free_inode(inode);
 fail_drop:
-	dquot_drop(inode);
-	inode->i_flags |= S_NOQUOTA;
 	clear_nlink(inode);
 	unlock_new_inode(inode);
+out:
+	dquot_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
 	iput(inode);
 	brelse(inode_bitmap_bh);
 	return ERR_PTR(err);
......
[collapsed diff not shown]
@@ -19,6 +19,7 @@
 #define EXT4_XATTR_SYSTEM_DATA	"data"
 #define EXT4_MIN_INLINE_DATA_SIZE	((sizeof(__le32) * EXT4_N_BLOCKS))
+#define EXT4_INLINE_DOTDOT_OFFSET	2
 #define EXT4_INLINE_DOTDOT_SIZE		4
 
 int ext4_get_inline_size(struct inode *inode)
@@ -1289,6 +1290,120 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
 	return ret;
 }
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir.  It returns the number of directory entries loaded
+ * into the tree.  If there is an error it is returned in err.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+			    struct inode *dir, ext4_lblk_t block,
+			    struct dx_hash_info *hinfo,
+			    __u32 start_hash, __u32 start_minor_hash,
+			    int *has_inline_data)
+{
+	int err = 0, count = 0;
+	unsigned int parent_ino;
+	int pos;
+	struct ext4_dir_entry_2 *de;
+	struct inode *inode = file_inode(dir_file);
+	int ret, inline_size = 0;
+	struct ext4_iloc iloc;
+	void *dir_buf = NULL;
+	struct ext4_dir_entry_2 fake;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+	dir_buf = kmalloc(inline_size, GFP_NOFS);
+	if (!dir_buf) {
+		ret = -ENOMEM;
+		up_read(&EXT4_I(inode)->xattr_sem);
+		goto out;
+	}
+
+	ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	if (ret < 0)
+		goto out;
+
+	pos = 0;
+	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+	while (pos < inline_size) {
+		/*
+		 * As inlined dir doesn't store any information about '.' and
+		 * only the inode number of '..' is stored, we have to handle
+		 * them differently.
+		 */
+		if (pos == 0) {
+			fake.inode = cpu_to_le32(inode->i_ino);
+			fake.name_len = 1;
+			strcpy(fake.name, ".");
+			fake.rec_len = ext4_rec_len_to_disk(
+					EXT4_DIR_REC_LEN(fake.name_len),
+					inline_size);
+			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+			de = &fake;
+			pos = EXT4_INLINE_DOTDOT_OFFSET;
+		} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+			fake.inode = cpu_to_le32(parent_ino);
+			fake.name_len = 2;
+			strcpy(fake.name, "..");
+			fake.rec_len = ext4_rec_len_to_disk(
+					EXT4_DIR_REC_LEN(fake.name_len),
+					inline_size);
+			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+			de = &fake;
+			pos = EXT4_INLINE_DOTDOT_SIZE;
+		} else {
+			de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+			pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+			if (ext4_check_dir_entry(inode, dir_file, de,
+						 iloc.bh, dir_buf,
+						 inline_size, pos)) {
+				ret = count;
+				goto out;
+			}
+		}
+
+		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		err = ext4_htree_store_dirent(dir_file,
+					      hinfo->hash, hinfo->minor_hash, de);
+		if (err) {
+			count = err;
+			goto out;
+		}
+		count++;
+	}
+	ret = count;
+out:
+	kfree(dir_buf);
+	brelse(iloc.bh);
+	return ret;
+}
+/*
+ * So this function is called when the volume is mkfsed with
+ * dir_index disabled.  In order to keep f_pos persistent
+ * after we convert from an inlined dir to a block-based one,
+ * we just pretend that we are a normal dir and return the
+ * offset as if '.' and '..' really take place.
+ */
 int ext4_read_inline_dir(struct file *filp,
 			 void *dirent, filldir_t filldir,
 			 int *has_inline_data)
@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp,
 	int ret, inline_size = 0;
 	struct ext4_iloc iloc;
 	void *dir_buf = NULL;
+	int dotdot_offset, dotdot_size, extra_offset, extra_size;
 
 	ret = ext4_get_inode_loc(inode, &iloc);
 	if (ret)
@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp,
 	sb = inode->i_sb;
 	stored = 0;
 	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+	offset = filp->f_pos;
+
+	/*
+	 * dotdot_offset and dotdot_size are the real offset and
+	 * size for "." and ".." if the dir were block-based, while
+	 * inside the inline dir they only take EXT4_INLINE_DOTDOT_SIZE
+	 * bytes.  So we will use extra_offset and extra_size to
+	 * translate between the two during the inline dir iteration.
+	 */
+	dotdot_offset = EXT4_DIR_REC_LEN(1);
+	dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+	extra_size = extra_offset + inline_size;
-	while (!error && !stored && filp->f_pos < inode->i_size) {
+	while (!error && !stored && filp->f_pos < extra_size) {
 revalidate:
 		/*
 		 * If the version has changed since the last call to
@@ -1340,15 +1469,23 @@ int ext4_read_inline_dir(struct file *filp,
 		 * dir to make sure.
 		 */
 		if (filp->f_version != inode->i_version) {
-			for (i = 0;
-			     i < inode->i_size && i < offset;) {
+			for (i = 0; i < extra_size && i < offset;) {
+				/*
+				 * "." is with offset 0 and
+				 * ".." is dotdot_offset.
+				 */
 				if (!i) {
-					/* skip "." and ".." if needed. */
-					i += EXT4_INLINE_DOTDOT_SIZE;
+					i = dotdot_offset;
+					continue;
+				} else if (i == dotdot_offset) {
+					i = dotdot_size;
 					continue;
 				}
+				/* for other entries, the real offset in
+				 * the buf has to be tuned accordingly.
+				 */
 				de = (struct ext4_dir_entry_2 *)
-					(dir_buf + i);
+					(dir_buf + i - extra_offset);
 				/* It's too expensive to do a full
 				 * dirent test each time round this
 				 * loop, but we do have to test at
@@ -1356,43 +1493,47 @@ int ext4_read_inline_dir(struct file *filp,
 				 * failure will be detected in the
 				 * dirent test below. */
 				if (ext4_rec_len_from_disk(de->rec_len,
-					inline_size) < EXT4_DIR_REC_LEN(1))
+					extra_size) < EXT4_DIR_REC_LEN(1))
 					break;
 				i += ext4_rec_len_from_disk(de->rec_len,
-							    inline_size);
+							    extra_size);
 			}
 			offset = i;
 			filp->f_pos = offset;
 			filp->f_version = inode->i_version;
 		}
-		while (!error && filp->f_pos < inode->i_size) {
+		while (!error && filp->f_pos < extra_size) {
 			if (filp->f_pos == 0) {
 				error = filldir(dirent, ".", 1, 0, inode->i_ino,
 						DT_DIR);
 				if (error)
 					break;
 				stored++;
+				filp->f_pos = dotdot_offset;
+				continue;
+			}
 
-				error = filldir(dirent, "..", 2, 0, parent_ino,
-						DT_DIR);
+			if (filp->f_pos == dotdot_offset) {
+				error = filldir(dirent, "..", 2,
+						dotdot_offset,
+						parent_ino, DT_DIR);
 				if (error)
 					break;
 				stored++;
-				filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
+				filp->f_pos = dotdot_size;
 				continue;
 			}
 
-			de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
+			de = (struct ext4_dir_entry_2 *)
+				(dir_buf + filp->f_pos - extra_offset);
 			if (ext4_check_dir_entry(inode, filp, de,
 						 iloc.bh, dir_buf,
-						 inline_size, offset)) {
+						 extra_size, filp->f_pos)) {
 				ret = stored;
 				goto out;
 			}
-			offset += ext4_rec_len_from_disk(de->rec_len,
-							 inline_size);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -1415,9 +1556,8 @@ int ext4_read_inline_dir(struct file *filp,
 				stored++;
 			}
 			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
-							      inline_size);
+							      extra_size);
 		}
-		offset = 0;
 	}
 out:
 	kfree(dir_buf);
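To make the offset translation above concrete (a worked computation, assuming the usual EXT4_DIR_REC_LEN(n), which rounds 8 + n up to a multiple of 4):

	dotdot_offset = EXT4_DIR_REC_LEN(1)                   = 12
	dotdot_size   = 12 + EXT4_DIR_REC_LEN(2)              = 12 + 12 = 24
	extra_offset  = dotdot_size - EXT4_INLINE_DOTDOT_SIZE = 24 - 4  = 20
	extra_size    = extra_offset + inline_size            = inline_size + 20

So an inline entry stored at buffer offset i is exposed at f_pos i + 20, mimicking a block-based directory whose '.' and '..' entries occupy the first 24 bytes.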
......
[collapsed diff not shown]
@@ -17,9 +17,201 @@
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
+#include "ext4_extents.h"
 
 #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+/**
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a:   pointer to first memory area
+ * @b:   pointer to second memory area
+ * @len: number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+	unsigned char *ap, *bp;
+	unsigned char tmp;
+
+	ap = (unsigned char *)a;
+	bp = (unsigned char *)b;
+	while (len-- > 0) {
+		tmp = *ap;
+		*ap = *bp;
+		*bp = tmp;
+		ap++;
+		bp++;
+	}
+}
+/**
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure that calling this method twice
+ * will revert all changes.
+ *
+ * @inode1: pointer to first inode
+ * @inode2: pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+	loff_t isize;
+	struct ext4_inode_info *ei1;
+	struct ext4_inode_info *ei2;
+
+	ei1 = EXT4_I(inode1);
+	ei2 = EXT4_I(inode2);
+
+	memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+	memswap(&inode1->i_version, &inode2->i_version,
+		sizeof(inode1->i_version));
+	memswap(&inode1->i_blocks, &inode2->i_blocks,
+		sizeof(inode1->i_blocks));
+	memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+	memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+	memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+	memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+	memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+	memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+	memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
+	memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
+
+	isize = i_size_read(inode1);
+	i_size_write(inode1, i_size_read(inode2));
+	i_size_write(inode2, isize);
+}
+/**
+ * Swap the information from the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb:    the super block of the filesystem
+ * @inode: the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+				   struct inode *inode)
+{
+	handle_t *handle;
+	int err;
+	struct inode *inode_bl;
+	struct ext4_inode_info *ei;
+	struct ext4_inode_info *ei_bl;
+	struct ext4_sb_info *sbi;
+
+	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
+		err = -EINVAL;
+		goto swap_boot_out;
+	}
+
+	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto swap_boot_out;
+	}
+
+	sbi = EXT4_SB(sb);
+	ei = EXT4_I(inode);
+
+	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+	if (IS_ERR(inode_bl)) {
+		err = PTR_ERR(inode_bl);
+		goto swap_boot_out;
+	}
+	ei_bl = EXT4_I(inode_bl);
+
+	filemap_flush(inode->i_mapping);
+	filemap_flush(inode_bl->i_mapping);
+
+	/* Protect orig inodes against a truncate and make sure that
+	 * only one swap_inode_boot_loader is running. */
+	ext4_inode_double_lock(inode, inode_bl);
+
+	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages(&inode_bl->i_data, 0);
+
+	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(inode);
+	ext4_inode_block_unlocked_dio(inode_bl);
+	inode_dio_wait(inode);
+	inode_dio_wait(inode_bl);
+
+	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+	if (IS_ERR(handle)) {
+		err = -EINVAL;
+		goto swap_boot_out;
+	}
+
+	/* Protect extent tree against block allocations via delalloc */
+	ext4_double_down_write_data_sem(inode, inode_bl);
+
+	if (inode_bl->i_nlink == 0) {
+		/* this inode has never been used as a BOOT_LOADER */
+		set_nlink(inode_bl, 1);
+		i_uid_write(inode_bl, 0);
+		i_gid_write(inode_bl, 0);
+		inode_bl->i_flags = 0;
+		ei_bl->i_flags = 0;
+		inode_bl->i_version = 1;
+		i_size_write(inode_bl, 0);
+		inode_bl->i_mode = S_IFREG;
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+					      EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+			ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+			ext4_ext_tree_init(handle, inode_bl);
+		} else
+			memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+	}
+
+	swap_inode_data(inode, inode_bl);
+
+	inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	inode_bl->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+
+	ext4_discard_preallocations(inode);
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err < 0) {
+		ext4_warning(inode->i_sb,
+			"couldn't mark inode #%lu dirty (err %d)",
+			inode->i_ino, err);
+		/* Revert all changes: */
+		swap_inode_data(inode, inode_bl);
+	} else {
+		err = ext4_mark_inode_dirty(handle, inode_bl);
+		if (err < 0) {
+			ext4_warning(inode_bl->i_sb,
+				"couldn't mark inode #%lu dirty (err %d)",
+				inode_bl->i_ino, err);
+			/* Revert all changes: */
+			swap_inode_data(inode, inode_bl);
+			ext4_mark_inode_dirty(handle, inode);
+		}
+	}
+
+	ext4_journal_stop(handle);
+
+	ext4_double_up_write_data_sem(inode, inode_bl);
+
+	ext4_inode_resume_unlocked_dio(inode);
+	ext4_inode_resume_unlocked_dio(inode_bl);
+
+	ext4_inode_double_unlock(inode, inode_bl);
+
+	iput(inode_bl);
+
+swap_boot_out:
+	return err;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
 		}
-		if (oldflags & EXT4_EXTENTS_FL) {
-			/* We don't support clearning extent flags */
-			if (!(flags & EXT4_EXTENTS_FL)) {
-				err = -EOPNOTSUPP;
-				goto flags_out;
-			}
-		} else if (flags & EXT4_EXTENTS_FL) {
-			/* migrate the file */
+		if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
 			migrate = 1;
-			flags &= ~EXT4_EXTENTS_FL;
-		}
 
 		if (flags & EXT4_EOFBLOCKS_FL) {
 			/* we don't support adding EOFBLOCKS flag */
@@ -137,8 +320,13 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		err = ext4_change_inode_journal_flag(inode, jflag);
 		if (err)
 			goto flags_out;
-		if (migrate)
+		if (migrate) {
+			if (flags & EXT4_EXTENTS_FL)
 				err = ext4_ext_migrate(inode);
+			else
+				err = ext4_ind_migrate(inode);
+		}
+
 flags_out:
 		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write_file(filp);
@@ -357,9 +545,13 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return err;
 	}
 
+	case EXT4_IOC_SWAP_BOOT:
+		if (!(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+		return swap_inode_boot_loader(sb, inode);
+
 	case EXT4_IOC_RESIZE_FS: {
 		ext4_fsblk_t n_blocks_count;
-		struct super_block *sb = inode->i_sb;
 		int err = 0, err2 = 0;
 		ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
......
[collapsed diff not shown]
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
 			return retval;
 	}
-	return retval;
 }
 
 int ext4_ext_migrate(struct inode *inode)
@@ -606,3 +605,64 @@ int ext4_ext_migrate(struct inode *inode)
 	return retval;
 }
+/*
+ * Migrate a simple extent-based inode to use the i_blocks[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+	struct ext4_extent_header *eh;
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_extent *ex;
+	unsigned int i, len;
+	ext4_fsblk_t blk;
+	handle_t *handle;
+	int ret;
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	    (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return -EINVAL;
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+		return -EOPNOTSUPP;
+
+	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ret = ext4_ext_check_inode(inode);
+	if (ret)
+		goto errout;
+
+	eh = ext_inode_hdr(inode);
+	ex = EXT_FIRST_EXTENT(eh);
+	if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+	    eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+		ret = -EOPNOTSUPP;
+		goto errout;
+	}
+	if (eh->eh_entries == 0)
+		blk = len = 0;
+	else {
+		len = le16_to_cpu(ex->ee_len);
+		blk = ext4_ext_pblock(ex);
+		if (len > EXT4_NDIR_BLOCKS) {
+			ret = -EOPNOTSUPP;
+			goto errout;
+		}
+	}
+
+	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+	memset(ei->i_data, 0, sizeof(ei->i_data));
+	for (i = 0; i < len; i++)
+		ei->i_data[i] = cpu_to_le32(blk++);
+	ext4_mark_inode_dirty(handle, inode);
+errout:
+	ext4_journal_stop(handle);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	return ret;
+}
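The len > EXT4_NDIR_BLOCKS check is what keeps the conversion trivial; a short note on the arithmetic (EXT4_NDIR_BLOCKS is 12):

	/*
	 * i_data[] provides 12 direct slots before the indirect pointers,
	 * so a file whose single, depth-0 extent spans at most 12 blocks
	 * can be re-expressed as direct blocks blk, blk+1, ..., blk+len-1
	 * without allocating any indirect blocks at all.
	 */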
@@ -7,7 +7,7 @@
 #include "ext4.h"
 
 /* Checksumming functions */
-static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int offset = offsetof(struct mmp_struct, mmp_checksum);
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
 	lock_buffer(bh);
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
-	submit_bh(WRITE_SYNC, bh);
+	submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
 	sb_end_write(sb);
 	if (unlikely(!buffer_uptodate(bh)))
@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
 	get_bh(*bh);
 	lock_buffer(*bh);
 	(*bh)->b_end_io = end_buffer_read_sync;
-	submit_bh(READ_SYNC, *bh);
+	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
 	wait_on_buffer(*bh);
 	if (!buffer_uptodate(*bh)) {
 		brelse(*bh);
@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /**
- * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ * of i_data_sem
  *
 * Acquire write lock of i_data_sem of the two inodes
 */
-static void
-double_down_write_data_sem(struct inode *first, struct inode *second)
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
 {
 	if (first < second) {
 		down_write(&EXT4_I(first)->i_data_sem);
@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
 }
 
 /**
- * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
 *
 * @orig_inode:  original inode structure to be released its lock first
 * @donor_inode: donor inode structure to be released its lock second
 * Release write lock of i_data_sem of two inodes (orig and donor).
 */
-static void
-double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+			      struct inode *donor_inode)
 {
 	up_write(&EXT4_I(orig_inode)->i_data_sem);
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
 	mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
 				 end_ext, eh, range_to_move);
 
-	if (depth) {
-		ret = ext4_handle_dirty_metadata(handle, orig_inode,
-						 orig_path->p_bh);
-		if (ret)
-			return ret;
-	} else {
-		ret = ext4_mark_inode_dirty(handle, orig_inode);
-		if (ret < 0)
-			return ret;
-	}
-	return 0;
+	return ext4_ext_dirty(handle, orig_inode, orig_path);
 }
 /**
@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		donor_off += dext_alen;
 		orig_off += dext_alen;
 
+		BUG_ON(replaced_count > count);
 		/* Already moved the expected blocks */
 		if (replaced_count >= count)
 			break;
@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
 		page_cache_release(page[0]);
 		return -ENOMEM;
 	}
+	/*
+	 * grab_cache_page_write_begin() may not wait on the page's writeback
+	 * if the BDI does not demand it.  But it is reasonable to be very
+	 * conservative here and explicitly wait on the page's writeback.
+	 */
+	wait_on_page_writeback(page[0]);
+	wait_on_page_writeback(page[1]);
 	if (inode1 > inode2) {
 		struct page *tmp;
 		tmp = page[0];
@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
 		if (buffer_uptodate(bh))
 			continue;
 		if (!buffer_mapped(bh)) {
-			int err = 0;
 			err = ext4_get_block(inode, block, bh, 0);
 			if (err) {
 				SetPageError(page);
@@ -976,7 +973,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	 * necessary, just swap data blocks between orig and donor.
 	 */
 	if (uninit) {
-		double_down_write_data_sem(orig_inode, donor_inode);
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
 		/* If any of extents in range became initialized we have to
 		 * fallback to data copying */
 		uninit = mext_check_coverage(orig_inode, orig_blk_offset,
@@ -990,7 +987,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 			goto drop_data_sem;
 
 		if (!uninit) {
-			double_up_write_data_sem(orig_inode, donor_inode);
+			ext4_double_up_write_data_sem(orig_inode, donor_inode);
 			goto data_copy;
 		}
 		if ((page_has_private(pagep[0]) &&
@@ -1004,7 +1001,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 					donor_inode, orig_blk_offset,
 					block_len_in_page, err);
 drop_data_sem:
-			double_up_write_data_sem(orig_inode, donor_inode);
+			ext4_double_up_write_data_sem(orig_inode, donor_inode);
 			goto unlock_pages;
 		}
 data_copy:
@@ -1033,7 +1030,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	}
 	/* Perform all necessary steps similar write_begin()/write_end()
 	 * but keeping in mind that i_size will not change */
-	*err = __block_write_begin(pagep[0], from, from + replaced_size,
+	*err = __block_write_begin(pagep[0], from, replaced_size,
 				   ext4_get_block);
 	if (!*err)
 		*err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -1065,11 +1062,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	 * Extents are swapped already, but we are not able to copy data.
 	 * Try to swap extents to it's original places
 	 */
-	double_down_write_data_sem(orig_inode, donor_inode);
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
 	replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
 					       orig_blk_offset,
 					       block_len_in_page, &err2);
-	double_up_write_data_sem(orig_inode, donor_inode);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	if (replaced_count != block_len_in_page) {
 		EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
 				       "Unable to copy data block,"
 }
 
 /**
- * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
 *
 * @inode1: the inode structure
 * @inode2: the inode structure
 *
 * Lock two inodes' i_mutex
 */
-static void
-mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
 	BUG_ON(inode1 == inode2);
 	if (inode1 < inode2) {
@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 }
 
 /**
- * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
 *
 * @inode1: the inode that is released first
 * @inode2: the inode that is released second
 *
 */
-static void
-mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
 	mutex_unlock(&inode1->i_mutex);
 	mutex_unlock(&inode2->i_mutex);
@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		return -EINVAL;
 	}
 	/* Protect orig and donor inodes against a truncate */
-	mext_inode_double_lock(orig_inode, donor_inode);
+	ext4_inode_double_lock(orig_inode, donor_inode);
 
 	/* Wait for all existing dio workers */
 	ext4_inode_block_unlocked_dio(orig_inode);
@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	inode_dio_wait(donor_inode);
 
 	/* Protect extent tree against block allocations via delalloc */
-	double_down_write_data_sem(orig_inode, donor_inode);
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
 	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
 				   donor_start, &len);
@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
 		 *    in move_extent_per_page
 		 */
-		double_up_write_data_sem(orig_inode, donor_inode);
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
 
 		while (orig_page_offset <= seq_end_page) {
 
@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			block_len_in_page = rest_blocks;
 		}
 
-		double_down_write_data_sem(orig_inode, donor_inode);
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
 		if (ret < 0)
 			break;
 
@@ -1538,10 +1535,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
-	double_up_write_data_sem(orig_inode, donor_inode);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	ext4_inode_resume_unlocked_dio(orig_inode);
 	ext4_inode_resume_unlocked_dio(donor_inode);
-	mext_inode_double_unlock(orig_inode, donor_inode);
+	ext4_inode_double_unlock(orig_inode, donor_inode);
 
 	return ret;
 }
...@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, ...@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
{ {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum, old_csum; __u32 csum;
__le32 save_csum;
int size; int size;
size = count_offset + (count * sizeof(struct dx_entry)); size = count_offset + (count * sizeof(struct dx_entry));
old_csum = t->dt_checksum; save_csum = t->dt_checksum;
t->dt_checksum = 0; t->dt_checksum = 0;
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
t->dt_checksum = old_csum; t->dt_checksum = save_csum;
return cpu_to_le32(csum); return cpu_to_le32(csum);
} }
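Besides the rename from old_csum to save_csum, the variable's type changes from __u32 to __le32 to match the on-disk field, which silences sparse endianness warnings without changing behavior. The surrounding save/zero/compute/restore shape is the standard way to checksum a structure that contains its own checksum; a hedged stand-alone sketch (crc32c() here is an assumed stand-in for ext4_chksum()):

    #include <stddef.h>
    #include <stdint.h>

    uint32_t crc32c(uint32_t seed, const void *buf, size_t len);

    struct tail {
            uint32_t other;
            uint32_t checksum;
    };

    /* The checksum field must read as zero while the digest that
     * covers it is computed, then the stored value is put back. */
    static uint32_t tail_csum(struct tail *t)
    {
            uint32_t save = t->checksum;
            uint32_t csum;

            t->checksum = 0;
            csum = crc32c(~0U, t, sizeof(*t));
            t->checksum = save;
            return csum;
    }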
...@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, ...@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
hinfo.hash_version += hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned; EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
count = htree_inlinedir_to_tree(dir_file, dir, 0,
&hinfo, start_hash,
start_minor_hash,
&has_inline_data);
if (has_inline_data) {
*next_hash = ~0;
return count;
}
}
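The has_inline_data flag starts at 1 and is cleared by htree_inlinedir_to_tree() if the directory lost its inline data while being examined (e.g. it was converted to a regular directory block), in which case the code falls through to the normal htree_dirblock_to_tree() path below.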
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash); start_hash, start_minor_hash);
*next_hash = ~0; *next_hash = ~0;
...@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child) ...@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
} }
#define S_SHIFT 12
static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
[S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
[S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
[S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
[S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
[S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
};
static inline void ext4_set_de_type(struct super_block *sb,
struct ext4_dir_entry_2 *de,
umode_t mode) {
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
/* /*
* Move count entries from end of map between two memory locations. * Move count entries from end of map between two memory locations.
* Returns pointer to last entry moved. * Returns pointer to last entry moved.
...@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, ...@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
dquot_initialize(dir); dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry: retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits); NULL, EXT4_HT_DIR, credits);
...@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, ...@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
dquot_initialize(dir); dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry: retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits); NULL, EXT4_HT_DIR, credits);
...@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ...@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dquot_initialize(dir); dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry: retry:
inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
&dentry->d_name, &dentry->d_name,
...@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir, ...@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir,
* quota blocks, sb is already counted in previous macros). * quota blocks, sb is already counted in previous macros).
*/ */
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
} }
retry: retry:
inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
......
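The three credit-estimate hunks above (create, mknod, mkdir) and this symlink one all drop EXT4_MAXQUOTAS_INIT_BLOCKS() for the same reason: quota initialization now happens via dquot_initialize() before the handle is started, so the inode-allocation transaction no longer needs to reserve headroom for quota-file updates, shortening the transaction and its worst-case credit count.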
...@@ -272,7 +272,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ...@@ -272,7 +272,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk >= last_blk) if (start_blk >= last_blk)
goto next_group; goto next_group;
group_data[bb_index].block_bitmap = start_blk++; group_data[bb_index].block_bitmap = start_blk++;
ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group; group -= group_data[0].group;
group_data[group].free_blocks_count--; group_data[group].free_blocks_count--;
if (flexbg_size > 1) if (flexbg_size > 1)
...@@ -284,7 +284,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ...@@ -284,7 +284,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk >= last_blk) if (start_blk >= last_blk)
goto next_group; goto next_group;
group_data[ib_index].inode_bitmap = start_blk++; group_data[ib_index].inode_bitmap = start_blk++;
ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group; group -= group_data[0].group;
group_data[group].free_blocks_count--; group_data[group].free_blocks_count--;
if (flexbg_size > 1) if (flexbg_size > 1)
...@@ -296,7 +296,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ...@@ -296,7 +296,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
goto next_group; goto next_group;
group_data[it_index].inode_table = start_blk; group_data[it_index].inode_table = start_blk;
ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); group = ext4_get_group_number(sb, start_blk);
group -= group_data[0].group; group -= group_data[0].group;
group_data[group].free_blocks_count -= group_data[group].free_blocks_count -=
EXT4_SB(sb)->s_itb_per_group; EXT4_SB(sb)->s_itb_per_group;
...@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ...@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_group_t group; ext4_group_t group;
int err; int err;
ext4_get_group_no_and_offset(sb, block, &group, NULL); group = ext4_get_group_number(sb, block);
start = ext4_group_first_block_no(sb, group); start = ext4_group_first_block_no(sb, group);
group -= flex_gd->groups[0].group; group -= flex_gd->groups[0].group;
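ext4_get_group_number() is the new cheap path for callers that never wanted the intra-group offset: when the filesystem has the standard group size (see the STD_GROUP_SIZE test added in super.c below), the group number can be computed with a shift instead of the 64-bit division that ext4_get_group_no_and_offset() performs.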
...@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb, ...@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb,
/* Update the global fs size fields */ /* Update the global fs size fields */
sbi->s_groups_count += flex_gd->count; sbi->s_groups_count += flex_gd->count;
sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
/* Update the reserved block counts only once the new group is /* Update the reserved block counts only once the new group is
* active. */ * active. */
...@@ -1879,7 +1881,11 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ...@@ -1879,7 +1881,11 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
/* Nothing to do */ /* Nothing to do */
return 0; return 0;
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); n_group = ext4_get_group_number(sb, n_blocks_count - 1);
if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
ext4_warning(sb, "resize would cause inodes_count overflow");
return -EINVAL;
}
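Worked example under an assumed geometry: with 8192 inodes per group, 0xFFFFFFFFUL / 8192 = 524287, so a resize that would create group 524288 or beyond would wrap the 32-bit s_inodes_count; such a request now fails cleanly with -EINVAL instead of corrupting the count.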
ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
n_desc_blocks = num_desc_blocks(sb, n_group + 1); n_desc_blocks = num_desc_blocks(sb, n_group + 1);
......
...@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); ...@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void); static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb); static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void); static void ext4_clear_request_list(void);
static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext2_fs_type = { static struct file_system_type ext2_fs_type = {
...@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) ...@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
struct super_block *sb = journal->j_private; struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
int error = is_journal_aborted(journal); int error = is_journal_aborted(journal);
struct ext4_journal_cb_entry *jce, *tmp; struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
spin_lock(&sbi->s_md_lock); spin_lock(&sbi->s_md_lock);
list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { while (!list_empty(&txn->t_private_list)) {
jce = list_entry(txn->t_private_list.next,
struct ext4_journal_cb_entry, jce_list);
list_del_init(&jce->jce_list); list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock); spin_unlock(&sbi->s_md_lock);
jce->jce_func(sb, jce, error); jce->jce_func(sb, jce, error);
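The open-coded while loop replaces list_for_each_entry_safe() because the iterator's saved next pointer is only safe while s_md_lock is held, and the lock is dropped around each callback; the new BUG_ON documents that callbacks now run before the transaction reaches T_FINISHED (see the jbd2 commit-path change below). A hedged sketch of the pattern in isolation (types and names assumed):

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct cb_entry {
            struct list_head list;
            void (*func)(struct cb_entry *);
    };

    /* Detach one entry at a time under the lock, then invoke the
     * callback with the lock dropped so it may block or requeue. */
    static void run_callbacks(spinlock_t *lock, struct list_head *head)
    {
            struct cb_entry *e;

            spin_lock(lock);
            while (!list_empty(head)) {
                    e = list_first_entry(head, struct cb_entry, list);
                    list_del_init(&e->list);
                    spin_unlock(lock);
                    e->func(e);
                    spin_lock(lock);
            }
            spin_unlock(lock);
    }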
...@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, ...@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
if ((sbi->s_es->s_feature_ro_compat & if ((sbi->s_es->s_feature_ro_compat &
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
/* Use new metadata_csum algorithm */ /* Use new metadata_csum algorithm */
__u16 old_csum; __le16 save_csum;
__u32 csum32; __u32 csum32;
old_csum = gdp->bg_checksum; save_csum = gdp->bg_checksum;
gdp->bg_checksum = 0; gdp->bg_checksum = 0;
csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group)); sizeof(le_group));
csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
sbi->s_desc_size); sbi->s_desc_size);
gdp->bg_checksum = old_csum; gdp->bg_checksum = save_csum;
crc = csum32 & 0xFFFF; crc = csum32 & 0xFFFF;
goto out; goto out;
...@@ -2379,17 +2383,15 @@ struct ext4_attr { ...@@ -2379,17 +2383,15 @@ struct ext4_attr {
int offset; int offset;
}; };
static int parse_strtoul(const char *buf, static int parse_strtoull(const char *buf,
unsigned long max, unsigned long *value) unsigned long long max, unsigned long long *value)
{ {
char *endp; int ret;
*value = simple_strtoul(skip_spaces(buf), &endp, 0);
endp = skip_spaces(endp);
if (*endp || *value > max)
return -EINVAL;
return 0; ret = kstrtoull(skip_spaces(buf), 0, value);
if (!ret && *value > max)
ret = -EINVAL;
return ret;
} }
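The functional gain over simple_strtoul() is overflow detection: kstrtoull() returns -ERANGE when the value does not fit in the destination type instead of silently wrapping, while still tolerating the single trailing newline that echo appends, so existing sysfs usage keeps working.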
static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
...@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, ...@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
const char *buf, size_t count) const char *buf, size_t count)
{ {
unsigned long t; unsigned long t;
int ret;
if (parse_strtoul(buf, 0x40000000, &t)) ret = kstrtoul(skip_spaces(buf), 0, &t);
return -EINVAL; if (ret)
return ret;
if (t && !is_power_of_2(t)) if (t && (!is_power_of_2(t) || t > 0x40000000))
return -EINVAL; return -EINVAL;
sbi->s_inode_readahead_blks = t; sbi->s_inode_readahead_blks = t;
...@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, ...@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
{ {
unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
unsigned long t; unsigned long t;
int ret;
if (parse_strtoul(buf, 0xffffffff, &t)) ret = kstrtoul(skip_spaces(buf), 0, &t);
return -EINVAL; if (ret)
return ret;
*ui = t; *ui = t;
return count; return count;
} }
static ssize_t reserved_clusters_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
}
static ssize_t reserved_clusters_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
unsigned long long val;
int ret;
if (parse_strtoull(buf, -1ULL, &val))
return -EINVAL;
ret = ext4_reserve_clusters(sbi, val);
return ret ? ret : count;
}
static ssize_t trigger_test_error(struct ext4_attr *a, static ssize_t trigger_test_error(struct ext4_attr *a,
struct ext4_sb_info *sbi, struct ext4_sb_info *sbi,
const char *buf, size_t count) const char *buf, size_t count)
...@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) ...@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes);
EXT4_RW_ATTR(reserved_clusters);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks); inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
...@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = { ...@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes), ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(reserved_clusters),
ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal), ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats), ATTR_LIST(mb_stats),
...@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb) ...@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0; return 0;
} }
static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
{
ext4_fsblk_t resv_clusters;
/*
* By default we reserve 2% or 4096 clusters, whichever is smaller.
* This should cover the situations where we cannot afford to run
* out of space, such as a punch-hole operation or converting
* uninitialized extents in the delalloc path. In most cases such
* an allocation requires only one or two blocks; higher numbers
* are very rare.
*/
resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
do_div(resv_clusters, 50);
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
return resv_clusters;
}
static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
{
ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
sbi->s_cluster_bits;
if (count >= clusters)
return -EINVAL;
atomic64_set(&sbi->s_resv_clusters, count);
return 0;
}
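Worked example under an assumed geometry: with 4 KiB clusters, clusters/50 exceeds 4096 once the filesystem is larger than 204800 clusters (800 MiB), so virtually any real filesystem gets the flat 4096-cluster (16 MiB) reservation and only tiny filesystems reserve a full 2%. ext4_reserve_clusters() also backs the sysfs store path above: any value up to, but not including, the total cluster count is accepted.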
static int ext4_fill_super(struct super_block *sb, void *data, int silent) static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{ {
char *orig_data = kstrdup(data, GFP_KERNEL); char *orig_data = kstrdup(data, GFP_KERNEL);
...@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
/* Do we have standard group size of blocksize * 8 blocks ? */
if (sbi->s_blocks_per_group == blocksize << 3)
set_opt2(sb, STD_GROUP_SIZE);
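For the common 4096-byte block size, blocksize << 3 is 32768 blocks per group, i.e. the "standard" layout in which a single bitmap block covers exactly one group; the flag is what lets ext4_get_group_number() use a shift instead of a division.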
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version; sbi->s_def_hash_version = es->s_def_hash_version;
...@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.function = print_daily_error_info;
sbi->s_err_report.data = (unsigned long) sb; sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
ext4_es_register_shrinker(sb);
err = percpu_counter_init(&sbi->s_freeclusters_counter, err = percpu_counter_init(&sbi->s_freeclusters_counter,
ext4_count_free_clusters(sb)); ext4_count_free_clusters(sb));
if (!err) { if (!err) {
...@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_max_writeback_mb_bump = 128; sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32; sbi->s_extent_max_zeroout_kb = 32;
/* Register extent status tree shrinker */
ext4_es_register_shrinker(sb);
/* /*
* set up enough so that it can read an inode * set up enough so that it can read an inode
*/ */
...@@ -3911,6 +3978,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -3911,6 +3978,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available"); "available");
} }
err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
if (err) {
ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
"reserved pool", ext4_calculate_resv_clusters(sbi));
goto failed_mount4a;
}
err = ext4_setup_system_zone(sb); err = ext4_setup_system_zone(sb);
if (err) { if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system " ext4_msg(sb, KERN_ERR, "failed to initialize system "
...@@ -4010,6 +4084,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -4010,6 +4084,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_journal = NULL; sbi->s_journal = NULL;
} }
failed_mount3: failed_mount3:
ext4_es_unregister_shrinker(sb);
del_timer(&sbi->s_err_report); del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups) if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups); ext4_kvfree(sbi->s_flex_groups);
...@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, ...@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev; goto out_bdev;
} }
journal->j_private = sb; journal->j_private = sb;
ll_rw_block(READ, 1, &journal->j_sb_buffer); ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) { if (!buffer_uptodate(journal->j_sb_buffer)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device"); ext4_msg(sb, KERN_ERR, "I/O error on journal device");
...@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) ...@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb; struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es; struct ext4_super_block *es = sbi->s_es;
ext4_fsblk_t overhead = 0; ext4_fsblk_t overhead = 0, resv_blocks;
u64 fsid; u64 fsid;
s64 bfree; s64 bfree;
resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
if (!test_opt(sb, MINIX_DF)) if (!test_opt(sb, MINIX_DF))
overhead = sbi->s_overhead; overhead = sbi->s_overhead;
...@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) ...@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
/* prevent underflow in case little free space is available */ /* prevent underflow in case little free space is available */
buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); buf->f_bavail = buf->f_bfree -
if (buf->f_bfree < ext4_r_blocks_count(es)) (ext4_r_blocks_count(es) + resv_blocks);
if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
buf->f_bavail = 0; buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
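Under the default reservation on a non-bigalloc 4 KiB filesystem this lowers the f_bavail reported to statfs()/df by 16 MiB (4096 clusters), and the clamp keeps f_bavail at zero rather than underflowing when free space drops below the root reservation plus the new pool.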
...@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, ...@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
return PTR_ERR(qf_inode); return PTR_ERR(qf_inode);
} }
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
err = dquot_enable(qf_inode, type, format_id, flags); err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode); iput(qf_inode);
......
...@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, ...@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
struct ext4_xattr_header *hdr) struct ext4_xattr_header *hdr)
{ {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum, old; __u32 csum;
__le32 save_csum;
__le64 dsk_block_nr = cpu_to_le64(block_nr);
old = hdr->h_checksum; save_csum = hdr->h_checksum;
hdr->h_checksum = 0; hdr->h_checksum = 0;
block_nr = cpu_to_le64(block_nr); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, sizeof(dsk_block_nr));
sizeof(block_nr));
csum = ext4_chksum(sbi, csum, (__u8 *)hdr, csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
EXT4_BLOCK_SIZE(inode->i_sb)); EXT4_BLOCK_SIZE(inode->i_sb));
hdr->h_checksum = old; hdr->h_checksum = save_csum;
return cpu_to_le32(csum); return cpu_to_le32(csum);
} }
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SECURITY 6
#define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_SYSTEM 7
#define EXT4_XATTR_INDEX_RICHACL 8
struct ext4_xattr_header { struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */ __le32 h_magic; /* magic number for identification */
......
...@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int space_left = 0; int space_left = 0;
int first_tag = 0; int first_tag = 0;
int tag_flag; int tag_flag;
int i, to_free = 0; int i;
int tag_bytes = journal_tag_bytes(journal); int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */ struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0; __u32 crc32_sum = ~0;
...@@ -1134,7 +1134,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -1134,7 +1134,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
spin_unlock(&journal->j_history_lock); spin_unlock(&journal->j_history_lock);
commit_transaction->t_state = T_FINISHED; commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction); J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid; journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL; journal->j_committing_transaction = NULL;
...@@ -1149,13 +1149,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -1149,13 +1149,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_average_commit_time*3) / 4; journal->j_average_commit_time*3) / 4;
else else
journal->j_average_commit_time = commit_time; journal->j_average_commit_time = commit_time;
write_unlock(&journal->j_state_lock); write_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
to_free = 1;
} else {
if (journal->j_checkpoint_transactions == NULL) { if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction; journal->j_checkpoint_transactions = commit_transaction;
commit_transaction->t_cpnext = commit_transaction; commit_transaction->t_cpnext = commit_transaction;
...@@ -1170,17 +1166,27 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -1170,17 +1166,27 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_cpprev->t_cpnext = commit_transaction->t_cpprev->t_cpnext =
commit_transaction; commit_transaction;
} }
}
spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_list_lock);
/* Drop all spin_locks because commit_callback may block.
* __journal_remove_checkpoint() cannot destroy the transaction
* under us because it is not yet marked as T_FINISHED */
if (journal->j_commit_callback) if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction); journal->j_commit_callback(journal, commit_transaction);
trace_jbd2_end_commit(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n", jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence); journal->j_commit_sequence, journal->j_tail_sequence);
if (to_free)
jbd2_journal_free_transaction(commit_transaction);
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
/* Recheck checkpoint lists after j_list_lock was dropped */
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
jbd2_journal_free_transaction(commit_transaction);
}
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit); wake_up(&journal->j_wait_done_commit);
} }
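The net ordering change: the transaction is parked in the new T_COMMIT_CALLBACK state while the commit callbacks run with no journal locks held; because the checkpoint code only frees transactions marked T_FINISHED, the callbacks can safely dereference it. Only after re-taking j_state_lock and j_list_lock is the transaction marked T_FINISHED and, if both checkpoint lists turn out empty, dropped and freed, replacing the old to_free bookkeeping.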
...@@ -707,6 +707,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) ...@@ -707,6 +707,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
return err; return err;
} }
/*
* When this function returns, the transaction corresponding to tid
* will be completed. If the transaction is currently running, start
* committing that transaction before waiting for it to complete. If
* the transaction id is stale, it is by definition already completed,
* so just return SUCCESS.
*/
int jbd2_complete_transaction(journal_t *journal, tid_t tid)
{
int need_to_wait = 1;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction &&
journal->j_running_transaction->t_tid == tid) {
if (journal->j_commit_request != tid) {
/* transaction not yet started, so request it */
read_unlock(&journal->j_state_lock);
jbd2_log_start_commit(journal, tid);
goto wait_commit;
}
} else if (!(journal->j_committing_transaction &&
journal->j_committing_transaction->t_tid == tid))
need_to_wait = 0;
read_unlock(&journal->j_state_lock);
if (!need_to_wait)
return 0;
wait_commit:
return jbd2_log_wait_commit(journal, tid);
}
EXPORT_SYMBOL(jbd2_complete_transaction);
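A hedged sketch of the intended caller shape (names assumed; a filesystem records the tid that covered an inode's updates and later wants that commit durable):

    #include <linux/jbd2.h>

    /* Hypothetical fsync-style helper: ensure the transaction
     * identified by "commit_tid" has reached disk. The call starts
     * the commit if it is still running, waits if it is mid-commit,
     * and returns immediately if the tid is already stale. */
    static int wait_tid_durable(journal_t *journal, tid_t commit_tid)
    {
            return jbd2_complete_transaction(journal, commit_tid);
    }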
/* /*
* Log buffer allocation routines: * Log buffer allocation routines:
*/ */
......
...@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks) ...@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS); handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle) if (!handle)
return NULL; return NULL;
memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks; handle->h_buffer_credits = nblocks;
handle->h_ref = 1; handle->h_ref = 1;
...@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, ...@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int error; int error;
char *frozen_buffer = NULL; char *frozen_buffer = NULL;
int need_copy = 0; int need_copy = 0;
unsigned long start_lock, time_lock;
if (is_handle_aborted(handle)) if (is_handle_aborted(handle))
return -EROFS; return -EROFS;
...@@ -655,9 +655,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, ...@@ -655,9 +655,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
/* @@@ Need to check for errors here at some point. */ /* @@@ Need to check for errors here at some point. */
start_lock = jiffies;
lock_buffer(bh); lock_buffer(bh);
jbd_lock_bh_state(bh); jbd_lock_bh_state(bh);
/* If it takes too long to lock the buffer, trace it */
time_lock = jbd2_time_diff(start_lock, jiffies);
if (time_lock > HZ/10)
trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
jiffies_to_msecs(time_lock));
/* We now hold the buffer lock so it is safe to query the buffer /* We now hold the buffer lock so it is safe to query the buffer
* state. Is the buffer dirty? * state. Is the buffer dirty?
* *
......
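Because time_lock is measured in jiffies, the HZ/10 threshold is a tenth of a second regardless of the configured tick rate: the new tracepoint fires only when lock_buffer() stalled for more than 100 ms, making long buffer-lock waits visible to tracing without adding overhead to the fast path.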
...@@ -34,6 +34,8 @@ enum bh_state_bits { ...@@ -34,6 +34,8 @@ enum bh_state_bits {
BH_Write_EIO, /* I/O error on write */ BH_Write_EIO, /* I/O error on write */
BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Unwritten, /* Buffer is allocated on disk but not written */
BH_Quiet, /* Buffer Error Prinks to be quiet */ BH_Quiet, /* Buffer Error Prinks to be quiet */
BH_Meta, /* Buffer contains metadata */
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
BH_PrivateStart,/* not a state bit, but the first bit available BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities * for private allocation by other entities
...@@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay) ...@@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error) BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten) BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
......
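Each BUFFER_FNS(Bit, name) line expands to buffer_name(), set_buffer_name() and clear_buffer_name() accessors over the corresponding BH_ bit. The point of the two new bits is to let the generic submission path translate buffer-head state into request flags; roughly, in the spirit of the fs/buffer.c change in this series (a sketch, not the exact hunk):

    #include <linux/blk_types.h>
    #include <linux/buffer_head.h>

    /* Derive the extra request flags a buffer's I/O should carry
     * from the new state bits, so filesystem metadata can be
     * identified and prioritized by the block layer. */
    static int bh_rw_flags(int rw, struct buffer_head *bh)
    {
            if (buffer_meta(bh))
                    rw |= REQ_META;
            if (buffer_prio(bh))
                    rw |= REQ_PRIO;
            return rw;
    }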
...@@ -480,6 +480,7 @@ struct transaction_s ...@@ -480,6 +480,7 @@ struct transaction_s
T_COMMIT, T_COMMIT,
T_COMMIT_DFLUSH, T_COMMIT_DFLUSH,
T_COMMIT_JFLUSH, T_COMMIT_JFLUSH,
T_COMMIT_CALLBACK,
T_FINISHED T_FINISHED
} t_state; } t_state;
...@@ -1144,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache; ...@@ -1144,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache;
static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
{ {
return kmem_cache_alloc(jbd2_handle_cache, gfp_flags); return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
} }
static inline void jbd2_free_handle(handle_t *handle) static inline void jbd2_free_handle(handle_t *handle)
...@@ -1200,6 +1201,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid); ...@@ -1200,6 +1201,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_journal_force_commit_nested(journal_t *journal);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
......
...@@ -31,21 +31,14 @@ struct journal_head { ...@@ -31,21 +31,14 @@ struct journal_head {
/* /*
* Journalling list for this buffer [jbd_lock_bh_state()] * Journalling list for this buffer [jbd_lock_bh_state()]
*/ */
unsigned b_jlist; unsigned b_jlist:4;
/* /*
* This flag signals the buffer has been modified by * This flag signals the buffer has been modified by
* the currently running transaction * the currently running transaction
* [jbd_lock_bh_state()] * [jbd_lock_bh_state()]
*/ */
unsigned b_modified; unsigned b_modified:1;
/*
* This field tracks the last transaction id in which this buffer
* has been cowed
* [jbd_lock_bh_state()]
*/
tid_t b_cow_tid;
/* /*
* Copy of the buffer data frozen for writing to the log. * Copy of the buffer data frozen for writing to the log.
......
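Turning b_jlist and b_modified into bitfields lets them share a single word, and b_cow_tid, left over from never-merged snapshot/copy-on-write work, goes away entirely, so struct journal_head shrinks by roughly eight bytes per tracked buffer (the exact saving depends on alignment).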
...@@ -257,15 +257,7 @@ DECLARE_EVENT_CLASS(ext4__write_end, ...@@ -257,15 +257,7 @@ DECLARE_EVENT_CLASS(ext4__write_end,
__entry->pos, __entry->len, __entry->copied) __entry->pos, __entry->len, __entry->copied)
); );
DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, DEFINE_EVENT(ext4__write_end, ext4_write_end,
TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
unsigned int copied),
TP_ARGS(inode, pos, len, copied)
);
DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
unsigned int copied), unsigned int copied),
...@@ -1956,7 +1948,7 @@ TRACE_EVENT(ext4_remove_blocks, ...@@ -1956,7 +1948,7 @@ TRACE_EVENT(ext4_remove_blocks,
__entry->to = to; __entry->to = to;
__entry->partial = partial_cluster; __entry->partial = partial_cluster;
__entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_pblk = ext4_ext_pblock(ex);
__entry->ee_lblk = cpu_to_le32(ex->ee_block); __entry->ee_lblk = le32_to_cpu(ex->ee_block);
__entry->ee_len = ext4_ext_get_actual_len(ex); __entry->ee_len = ext4_ext_get_actual_len(ex);
), ),
...@@ -2060,7 +2052,7 @@ TRACE_EVENT(ext4_ext_remove_space, ...@@ -2060,7 +2052,7 @@ TRACE_EVENT(ext4_ext_remove_space,
TRACE_EVENT(ext4_ext_remove_space_done, TRACE_EVENT(ext4_ext_remove_space_done,
TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
ext4_lblk_t partial, unsigned short eh_entries), ext4_lblk_t partial, __le16 eh_entries),
TP_ARGS(inode, start, depth, partial, eh_entries), TP_ARGS(inode, start, depth, partial, eh_entries),
...@@ -2079,7 +2071,7 @@ TRACE_EVENT(ext4_ext_remove_space_done, ...@@ -2079,7 +2071,7 @@ TRACE_EVENT(ext4_ext_remove_space_done,
__entry->start = start; __entry->start = start;
__entry->depth = depth; __entry->depth = depth;
__entry->partial = partial; __entry->partial = partial;
__entry->eh_entries = eh_entries; __entry->eh_entries = le16_to_cpu(eh_entries);
), ),
TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
......
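The two endianness fixes point the same way: ee_block and eh_entries are on-disk little-endian fields, so copying them into host-order trace fields requires le32_to_cpu()/le16_to_cpu(). Both directions are no-ops on little-endian machines, which is how the cpu_to_le32() slip went unnoticed; on big-endian hardware the old code byte-swapped the wrong way and logged garbage.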