[XFS] write barrier support Issue all log sync operations as ordered

writes. In addition, flush the disk cache on fsync if the sync cached operation didn't sync the log to disk (this requires some additional bookkeeping in the transaction and log code). If the device doesn't claim to support barriers, the filesystem has an external log volume or the trial superblock write with barriers enabled failed, we disable barriers and print a warning. We should probably fail the mount completely, but that could lead to nasty boot failures for the root filesystem. Not enabled by default yet, needs more destructive testing first. SGI-PV: 912426 SGI-Modid: xfs-linux:xfs-kern:198723a Signed-off-by: Christoph Hellwig <hch@sgi.com> Signed-off-by: Nathan Scott <nathans@sgi.com>

[XFS] write barrier support Issue all log sync operations as ordered
writes. In addition, flush the disk cache on fsync if the sync cached operation didn't sync the log to disk (this requires some additional bookkeeping in the transaction and log code). If the device doesn't claim to support barriers, the filesystem has an external log volume or the trial superblock write with barriers enabled failed, we disable barriers and print a warning. We should probably fail the mount completely, but that could lead to nasty boot failures for the root filesystem. Not enabled by default yet, needs more destructive testing first. SGI-PV: 912426 SGI-Modid: xfs-linux:xfs-kern:198723a Signed-off-by: Christoph Hellwig <hch@sgi.com> Signed-off-by: Nathan Scott <nathans@sgi.com>
f538d4da · Christoph Hellwig · Nathan Scott · 739cafd3 · f538d4da · f538d4da
Commit f538d4da authored Nov 02, 2005 by Christoph Hellwig Committed by Nathan Scott Nov 02, 2005
12 changed files
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1295,6 +1295,11 @@ _pagebuf_ioapply(
 		rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
 	}

+	if (pb->pb_flags & PBF_ORDERED) {
+		ASSERT(!(pb->pb_flags & PBF_READ));
+		rw = WRITE_BARRIER;
+	}
+
 	/* Special code path for reading a sub page size pagebuf in --
 	 * we populate up the whole page, and hence the other metadata
 	 * in the same page.  This optimization is only valid when the

--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -74,7 +74,7 @@ typedef enum page_buf_flags_e {		/* pb_flags values */
 	PBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
 	PBF_STALE = (1 << 7),	/* buffer has been staled, do not find it  */
 	PBF_FS_MANAGED = (1 << 8),  /* filesystem controls freeing memory  */
-	PBF_FLUSH = (1 << 11),	    /* flush disk write cache		   */
+ 	PBF_ORDERED = (1 << 11),    /* use ordered writes		   */
 	PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead		   */

 	/* flags used only as arguments to access routines */
@@ -383,9 +383,9 @@ extern void pagebuf_trace(
 #define XFS_BUF_UNASYNC(x)	 ((x)->pb_flags &= ~PBF_ASYNC)
 #define XFS_BUF_ISASYNC(x)	 ((x)->pb_flags & PBF_ASYNC)

-#define XFS_BUF_FLUSH(x)	 ((x)->pb_flags |= PBF_FLUSH)
-#define XFS_BUF_UNFLUSH(x)	 ((x)->pb_flags &= ~PBF_FLUSH)
-#define XFS_BUF_ISFLUSH(x)	 ((x)->pb_flags & PBF_FLUSH)
+#define XFS_BUF_ORDERED(x)	 ((x)->pb_flags |= PBF_ORDERED)
+#define XFS_BUF_UNORDERED(x)	 ((x)->pb_flags &= ~PBF_ORDERED)
+#define XFS_BUF_ISORDERED(x)	 ((x)->pb_flags & PBF_ORDERED)

 #define XFS_BUF_SHUT(x)		 printk("XFS_BUF_SHUT not implemented yet\n")
 #define XFS_BUF_UNSHUT(x)	 printk("XFS_BUF_UNSHUT not implemented yet\n")

--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -278,6 +278,72 @@ xfs_blkdev_put(
 		close_bdev_excl(bdev);
 }

+/*
+ * Probe barrier support with a synchronous, ordered write of the
+ * superblock buffer.  Returns the xfs_iowait() result (0 on success).
+ */
+STATIC int
+xfs_barrier_test(
+	xfs_mount_t	*mp)
+{
+	xfs_buf_t	*sbp = xfs_getsb(mp, 0);
+	int		error;
+
+	XFS_BUF_UNDONE(sbp);
+	XFS_BUF_UNREAD(sbp);
+	XFS_BUF_UNDELAYWRITE(sbp);
+	XFS_BUF_WRITE(sbp);
+	XFS_BUF_UNASYNC(sbp);
+	XFS_BUF_ORDERED(sbp);
+
+	xfsbdstrat(mp, sbp);
+	error = xfs_iowait(sbp);
+
+	/*
+	 * The write was only a probe and must leave no trace in the
+	 * superblock buffer: mark it done again, clear any error the
+	 * I/O recorded and drop the ordered flag before releasing it.
+	 */
+	XFS_BUF_DONE(sbp);
+	XFS_BUF_ERROR(sbp, 0);
+	XFS_BUF_UNORDERED(sbp);
+
+	xfs_buf_relse(sbp);
+	return error;
+}
+
+/*
+ * Decide whether write barriers can stay enabled for this mount.
+ * XFS_MOUNT_BARRIER is cleared, with a notice, for the first of:
+ * log on a different buftarg than the data device, a data device
+ * queue that is QUEUE_ORDERED_NONE, or a failed trial barrier write.
+ * Later checks are skipped once barriers have been disabled.
+ */
+void
+xfs_mountfs_check_barriers(xfs_mount_t *mp)
+{
+	if (mp->m_logdev_targp != mp->m_ddev_targp) {
+		xfs_fs_cmn_err(CE_NOTE, mp,
+		  "Disabling barriers, not supported with external log device");
+		mp->m_flags &= ~XFS_MOUNT_BARRIER;
+	} else if (mp->m_ddev_targp->pbr_bdev->bd_disk->queue->ordered ==
+					QUEUE_ORDERED_NONE) {
+		xfs_fs_cmn_err(CE_NOTE, mp,
+		  "Disabling barriers, not supported by the underlying device");
+		mp->m_flags &= ~XFS_MOUNT_BARRIER;
+	} else if (xfs_barrier_test(mp)) {
+		xfs_fs_cmn_err(CE_NOTE, mp,
+		  "Disabling barriers, trial barrier write failed");
+		mp->m_flags &= ~XFS_MOUNT_BARRIER;
+	}
+}
+
+/* Flush the disk cache of the block device behind @buftarg. */
+void
+xfs_blkdev_issue_flush(xfs_buftarg_t *buftarg)
+{
+	blkdev_issue_flush(buftarg->pbr_bdev, NULL);
+}

 STATIC struct inode *
 linvfs_alloc_inode(

--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -132,6 +132,7 @@ extern void xfs_flush_device(struct xfs_inode *);
 extern int  xfs_blkdev_get(struct xfs_mount *, const char *,
 				struct block_device **);
 extern void xfs_blkdev_put(struct block_device *);
+extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);

 extern struct export_operations linvfs_export_ops;


--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -99,7 +99,7 @@ struct xfs_mount_args {
 						 * enforcement */
 #define XFSMNT_NOUUID		0x01000000	/* Ignore fs uuid */
 #define XFSMNT_DMAPI		0x02000000	/* enable dmapi/xdsm */
-#define XFSMNT_NOLOGFLUSH	0x04000000	/* Don't flush for log blocks */
+#define XFSMNT_BARRIER		0x04000000	/* use write barriers */
 #define XFSMNT_IDELETE		0x08000000	/* inode cluster delete */
 #define XFSMNT_SWALLOC		0x10000000	/* turn on stripe width
 						 * allocation */

--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -93,8 +93,11 @@ STATIC int  xlog_state_release_iclog(xlog_t		*log,
 STATIC void xlog_state_switch_iclogs(xlog_t		*log,
 				     xlog_in_core_t *iclog,
 				     int		eventual_size);
-STATIC int  xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
-STATIC int  xlog_state_sync_all(xlog_t *log, uint flags);
+STATIC int  xlog_state_sync(xlog_t			*log,
+			    xfs_lsn_t 			lsn,
+			    uint			flags,
+			    int				*log_flushed);
+STATIC int  xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
 STATIC void xlog_state_want_sync(xlog_t	*log, xlog_in_core_t *iclog);

 /* local functions to manipulate grant head */
@@ -312,12 +315,17 @@ xfs_log_done(xfs_mount_t	*mp,
 * semaphore.
 */
 int
-xfs_log_force(xfs_mount_t *mp,
+_xfs_log_force(
+	xfs_mount_t	*mp,
 	xfs_lsn_t	lsn,
-	      uint	  flags)
+	uint		flags,
+	int		*log_flushed)
 {
-	int	rval;
 	xlog_t		*log = mp->m_log;
+	int		dummy;
+
+	if (!log_flushed)
+		log_flushed = &dummy;

 #if defined(DEBUG) || defined(XLOG_NOLOG)
 	if (!xlog_debug && xlog_target == log->l_targ)
@@ -328,17 +336,12 @@ xfs_log_force(xfs_mount_t *mp,

 	XFS_STATS_INC(xs_log_force);

-	if ((log->l_flags & XLOG_IO_ERROR) == 0) {
+	if (log->l_flags & XLOG_IO_ERROR)
+		return XFS_ERROR(EIO);
 	if (lsn == 0)
-			rval = xlog_state_sync_all(log, flags);
+		return xlog_state_sync_all(log, flags, log_flushed);
 	else
-			rval = xlog_state_sync(log, lsn, flags);
-	} else {
-		rval = XFS_ERROR(EIO);
-	}
-
-	return rval;
-
+		return xlog_state_sync(log, lsn, flags, log_flushed);
 }	/* xfs_log_force */

 /*
@@ -1467,14 +1470,13 @@ xlog_sync(xlog_t		*log,
 	XFS_BUF_BUSY(bp);
 	XFS_BUF_ASYNC(bp);
 	/*
-	 * Do a disk write cache flush for the log block.
-	 * This is a bit of a sledgehammer, it would be better
-	 * to use a tag barrier here that just prevents reordering.
+	 * Do an ordered write for the log block.
+	 *
 	 * It may not be needed to flush the first split block in the log wrap
 	 * case, but do it anyways to be safe -AK
 	 */
-	if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
-		XFS_BUF_FLUSH(bp);
+	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+		XFS_BUF_ORDERED(bp);

 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
 	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1505,8 +1507,8 @@ xlog_sync(xlog_t		*log,
 		XFS_BUF_SET_FSPRIVATE(bp, iclog);
 		XFS_BUF_BUSY(bp);
 		XFS_BUF_ASYNC(bp);
-		if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
-			XFS_BUF_FLUSH(bp);
+		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+			XFS_BUF_ORDERED(bp);
 		dptr = XFS_BUF_PTR(bp);
 		/*
 		 * Bump the cycle numbers at the start of each block
@@ -2951,7 +2953,7 @@ xlog_state_switch_iclogs(xlog_t		*log,
 *		not in the active nor dirty state.
 */
 STATIC int
-xlog_state_sync_all(xlog_t *log, uint flags)
+xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 {
 	xlog_in_core_t	*iclog;
 	xfs_lsn_t	lsn;
@@ -3000,6 +3002,7 @@ xlog_state_sync_all(xlog_t *log, uint flags)

 				if (xlog_state_release_iclog(log, iclog))
 					return XFS_ERROR(EIO);
+				*log_flushed = 1;
 				s = LOG_LOCK(log);
 				if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
 				    iclog->ic_state != XLOG_STATE_DIRTY)
@@ -3043,6 +3046,7 @@ xlog_state_sync_all(xlog_t *log, uint flags)
 		 */
 		if (iclog->ic_state & XLOG_STATE_IOERROR)
 			return XFS_ERROR(EIO);
+		*log_flushed = 1;

 	} else {

@@ -3068,7 +3072,8 @@ xlog_state_sync_all(xlog_t *log, uint flags)
 int
 xlog_state_sync(xlog_t	  *log,
 		xfs_lsn_t lsn,
-		uint	  flags)
+		uint	  flags,
+		int	  *log_flushed)
 {
    xlog_in_core_t	*iclog;
    int			already_slept = 0;
@@ -3120,6 +3125,7 @@ xlog_state_sync(xlog_t	  *log,
 			XFS_STATS_INC(xs_log_force_sleep);
 			sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
 				&log->l_icloglock, s);
+			*log_flushed = 1;
 			already_slept = 1;
 			goto try_again;
 		} else {
@@ -3128,6 +3134,7 @@ xlog_state_sync(xlog_t	  *log,
 			LOG_UNLOCK(log, s);
 			if (xlog_state_release_iclog(log, iclog))
 				return XFS_ERROR(EIO);
+			*log_flushed = 1;
 			s = LOG_LOCK(log);
 		}
 	}
@@ -3152,6 +3159,7 @@ xlog_state_sync(xlog_t	  *log,
 		 */
 		if (iclog->ic_state & XLOG_STATE_IOERROR)
 			return XFS_ERROR(EIO);
+		*log_flushed = 1;
 	} else {		/* just return */
 		LOG_UNLOCK(log, s);
 	}
@@ -3606,6 +3614,7 @@ xfs_log_force_umount(
 	xlog_ticket_t	*tic;
 	xlog_t		*log;
 	int		retval;
+	int		dummy;
 	SPLDECL(s);
 	SPLDECL(s2);

@@ -3684,7 +3693,7 @@ xfs_log_force_umount(
 		 * Force the incore logs to disk before shutting the
 		 * log down completely.
 		 */
-		xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC);
+		xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy);
 		s2 = LOG_LOCK(log);
 		retval = xlog_state_ioerror(log);
 		LOG_UNLOCK(log, s2);

--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -174,9 +174,12 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 		       xfs_log_ticket_t ticket,
 		       void		**iclog,
 		       uint		flags);
-int	  xfs_log_force(struct xfs_mount *mp,
+int	  _xfs_log_force(struct xfs_mount *mp,
 			 xfs_lsn_t	lsn,
-			uint		 flags);
+			 uint		flags,
+			 int		*log_forced);
+#define xfs_log_force(mp, lsn, flags) \
+	_xfs_log_force(mp, lsn, flags, NULL)	/* expression; caller adds ';' */
 int	  xfs_log_mount(struct xfs_mount	*mp,
 			struct xfs_buftarg	*log_target,
 			xfs_daddr_t		start_block,

--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -415,7 +415,7 @@ typedef struct xfs_mount {
 						 * 32 bits in size */
 #define XFS_MOUNT_32BITINOOPT	0x00008000	/* saved mount option state */
 #define XFS_MOUNT_NOUUID	0x00010000	/* ignore uuid during mount */
-#define XFS_MOUNT_NOLOGFLUSH	0x00020000
+#define XFS_MOUNT_BARRIER	0x00020000
 #define XFS_MOUNT_IDELETE	0x00040000	/* delete empty inode clusters*/
 #define XFS_MOUNT_SWALLOC	0x00080000	/* turn on stripe width
 						 * allocation */
@@ -542,6 +542,7 @@ extern xfs_mount_t *xfs_mount_init(void);
 extern void	xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern void	xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
 extern int	xfs_mountfs(struct vfs *, xfs_mount_t *mp, int);
+extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);

 extern int	xfs_unmountfs(xfs_mount_t *, struct cred *);
 extern void	xfs_unmountfs_close(xfs_mount_t *, struct cred *);

--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -661,10 +661,11 @@ xfs_trans_unreserve_and_mod_sb(
 */
 /*ARGSUSED*/
 int
-xfs_trans_commit(
+_xfs_trans_commit(
 	xfs_trans_t	*tp,
 	uint		flags,
-	xfs_lsn_t	*commit_lsn_p)
+	xfs_lsn_t	*commit_lsn_p,
+	int		*log_flushed)
 {
 	xfs_log_iovec_t		*log_vector;
 	int			nvec;
@@ -893,9 +894,11 @@ xfs_trans_commit(
 	 * log out now and wait for it.
 	 */
 	if (sync) {
-		if (!error)
-			error = xfs_log_force(mp, commit_lsn,
-				      XFS_LOG_FORCE | XFS_LOG_SYNC);
+		if (!error) {
+			error = _xfs_log_force(mp, commit_lsn,
+				      XFS_LOG_FORCE | XFS_LOG_SYNC,
+				      log_flushed);
+		}
 		XFS_STATS_INC(xs_trans_sync);
 	} else {
 		XFS_STATS_INC(xs_trans_async);

--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -1025,7 +1025,12 @@ void		xfs_trans_log_efd_extent(xfs_trans_t *,
 					 struct xfs_efd_log_item *,
 					 xfs_fsblock_t,
 					 xfs_extlen_t);
-int		xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *);
+int		_xfs_trans_commit(xfs_trans_t *,
+				  uint flags,
+				  xfs_lsn_t *,
+				  int *);
+#define xfs_trans_commit(tp, flags, lsn) \
+	_xfs_trans_commit(tp, flags, lsn, NULL)
 void		xfs_trans_cancel(xfs_trans_t *, int);
 void		xfs_trans_ail_init(struct xfs_mount *);
 xfs_lsn_t	xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);

--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -321,8 +321,8 @@ xfs_start_flags(

 	if (ap->flags & XFSMNT_NOUUID)
 		mp->m_flags |= XFS_MOUNT_NOUUID;
-	if (ap->flags & XFSMNT_NOLOGFLUSH)
-		mp->m_flags |= XFS_MOUNT_NOLOGFLUSH;
+	if (ap->flags & XFSMNT_BARRIER)
+		mp->m_flags |= XFS_MOUNT_BARRIER;

 	return 0;
 }
@@ -512,8 +512,14 @@ xfs_mount(
 		goto error2;

 	error = XFS_IOINIT(vfsp, args, flags);
-	if (!error)
+	if (error)
+		goto error2;
+
+	if ((args->flags & XFSMNT_BARRIER) &&
+	    !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY))
+		xfs_mountfs_check_barriers(mp);
 	return 0;
+
 error2:
 	if (mp->m_sb_bp)
 		xfs_freesb(mp);
@@ -656,19 +662,24 @@ xfs_mntupdate(
 	else
 		mp->m_flags &= ~XFS_MOUNT_NOATIME;

-	if (!(vfsp->vfs_flag & VFS_RDONLY)) {
-		VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+	if ((vfsp->vfs_flag & VFS_RDONLY) &&
+	    !(*flags & MS_RDONLY)) {
+		vfsp->vfs_flag &= ~VFS_RDONLY;
+
+		if (args->flags & XFSMNT_BARRIER)
+			xfs_mountfs_check_barriers(mp);
 	}

-	if (*flags & MS_RDONLY) {
+	if (!(vfsp->vfs_flag & VFS_RDONLY) &&
+	    (*flags & MS_RDONLY)) {
+		VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+
 		xfs_quiesce_fs(mp);

 		/* Ok now write out an unmount record */
 		xfs_log_unmount_write(mp);
 		xfs_unmountfs_writesb(mp);
 		vfsp->vfs_flag |= VFS_RDONLY;
-	} else {
-		vfsp->vfs_flag &= ~VFS_RDONLY;
 	}

 	return 0;
@@ -1628,7 +1639,8 @@ xfs_vget(
 #define MNTOPT_ALLOCSIZE    "allocsize"    /* preferred allocation size */
 #define MNTOPT_IHASHSIZE    "ihashsize"    /* size of inode hash table */
 #define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
-#define MNTOPT_NOLOGFLUSH   "nologflush"   /* don't hard flush on log writes */
+#define MNTOPT_BARRIER	"barrier"	/* use write barriers for log write and
+					   unwritten extent conversion */
 #define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
 #define MNTOPT_64BITINODE   "inode64"	/* inodes can be allocated anywhere */
 #define MNTOPT_IKEEP	"ikeep"		/* do not free empty inode clusters */
@@ -1791,8 +1803,8 @@ xfs_parseargs(
 #endif
 		} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
 			args->flags |= XFSMNT_NOUUID;
-		} else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) {
-			args->flags |= XFSMNT_NOLOGFLUSH;
+		} else if (!strcmp(this_char, MNTOPT_BARRIER)) {
+			args->flags |= XFSMNT_BARRIER;
 		} else if (!strcmp(this_char, MNTOPT_IKEEP)) {
 			args->flags &= ~XFSMNT_IDELETE;
 		} else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
@@ -1866,7 +1878,7 @@ xfs_showargs(
 		{ XFS_MOUNT_NOUUID,		"," MNTOPT_NOUUID },
 		{ XFS_MOUNT_NORECOVERY,		"," MNTOPT_NORECOVERY },
 		{ XFS_MOUNT_OSYNCISOSYNC,	"," MNTOPT_OSYNCISOSYNC },
-		{ XFS_MOUNT_NOLOGFLUSH,		"," MNTOPT_NOLOGFLUSH },
+		{ XFS_MOUNT_BARRIER,		"," MNTOPT_BARRIER },
 		{ XFS_MOUNT_IDELETE,		"," MNTOPT_NOIKEEP },
 		{ 0, NULL }
 	};

--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1118,6 +1118,7 @@ xfs_fsync(
 	xfs_inode_t	*ip;
 	xfs_trans_t	*tp;
 	int		error;
+	int		log_flushed = 0, changed = 1;

 	vn_trace_entry(BHV_TO_VNODE(bdp),
 			__FUNCTION__, (inst_t *)__return_address);
@@ -1171,10 +1172,18 @@ xfs_fsync(
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);

 		if (xfs_ipincount(ip)) {
-			xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+			_xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
 				      XFS_LOG_FORCE |
 				      ((flag & FSYNC_WAIT)
-				       ? XFS_LOG_SYNC : 0));
+				       ? XFS_LOG_SYNC : 0),
+				      &log_flushed);
+		} else {
+			/*
+			 * If the inode is not pinned and nothing
+			 * has changed we don't need to flush the
+			 * cache.
+			 */
+			changed = 0;
 		}
 		error = 0;
 	} else	{
@@ -1210,10 +1219,27 @@ xfs_fsync(
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 		if (flag & FSYNC_WAIT)
 			xfs_trans_set_sync(tp);
-		error = xfs_trans_commit(tp, 0, NULL);
+		error = _xfs_trans_commit(tp, 0, NULL, &log_flushed);

 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
+
+	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
+		/*
+		 * If the log write didn't issue an ordered tag we need
+		 * to flush the disk cache for the data device now.
+		 */
+		if (!log_flushed)
+			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+
+		/*
+		 * If this inode is on the RT dev we need to flush that
+		 * cache as well.
+		 */
+		if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
+			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
+	}
+
 	return error;
 }