Commit 16d4d435 authored by Christoph Hellwig, committed by Dave Chinner

xfs: split direct I/O and DAX path

So far the DAX code has overloaded the direct I/O code path.  There is very little
in common between the two, and untangling them lets us clean up both variants.

As a side effect we also get separate trace points for both I/O types.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
parent fa8d972d
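
To make the resulting structure easier to see, here is a condensed sketch of the read-side dispatch after this patch, pieced together from the hunks below; locking, statistics, the shutdown check, and the bodies of the helpers are omitted, so treat it as an illustration rather than the actual function:

	/*
	 * After the split, xfs_file_read_iter() picks one of three distinct
	 * paths instead of funneling DAX through the direct I/O helper:
	 *   - DAX inodes      -> xfs_file_dax_read()      (dax_do_io)
	 *   - IOCB_DIRECT     -> xfs_file_dio_aio_read()  (__blockdev_direct_IO)
	 *   - everything else -> xfs_file_buffered_aio_read()
	 */
	STATIC ssize_t
	xfs_file_read_iter(
		struct kiocb		*iocb,
		struct iov_iter		*to)
	{
		struct inode		*inode = file_inode(iocb->ki_filp);
		ssize_t			ret;

		if (IS_DAX(inode))
			ret = xfs_file_dax_read(iocb, to);
		else if (iocb->ki_flags & IOCB_DIRECT)
			ret = xfs_file_dio_aio_read(iocb, to);
		else
			ret = xfs_file_buffered_aio_read(iocb, to);
		return ret;
	}

The write side mirrors this (xfs_file_dax_write / xfs_file_dio_aio_write / xfs_file_buffered_aio_write), with the new xfs_file_dax_write() also taking over the iolock selection and page-cache invalidation that the DAX case previously shared with the direct I/O path.
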
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -305,14 +305,12 @@ xfs_file_dio_aio_read(
 	else
 		target = ip->i_mount->m_ddev_targp;
 
-	if (!IS_DAX(inode)) {
-		/* DIO must be aligned to device logical sector size */
-		if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
-			if (iocb->ki_pos == isize)
-				return 0;
-			return -EINVAL;
-		}
+	/* DIO must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+		if (iocb->ki_pos == isize)
+			return 0;
+		return -EINVAL;
 	}
 
 	/*
 	 * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -360,13 +358,37 @@ xfs_file_dio_aio_read(
 	}
 
 	data = *to;
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-				NULL, 0);
-	} else {
-		ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-				xfs_get_blocks_direct, NULL, NULL, 0);
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, NULL, NULL, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(to, ret);
 	}
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_read(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct iov_iter		data = *to;
+	size_t			count = iov_iter_count(to);
+	ssize_t			ret = 0;
+
+	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+	if (!count)
+		return 0; /* skip atime */
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
 	if (ret > 0) {
 		iocb->ki_pos += ret;
 		iov_iter_advance(to, ret);
@@ -399,7 +421,8 @@ xfs_file_read_iter(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct xfs_mount	*mp = XFS_I(file_inode(iocb->ki_filp))->i_mount;
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 	ssize_t			ret = 0;
 
 	XFS_STATS_INC(mp, xs_read_calls);
@@ -407,7 +430,9 @@ xfs_file_read_iter(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (iocb->ki_flags & IOCB_DIRECT)
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_read(iocb, to);
+	else if (iocb->ki_flags & IOCB_DIRECT)
 		ret = xfs_file_dio_aio_read(iocb, to);
 	else
 		ret = xfs_file_buffered_aio_read(iocb, to);
@@ -755,8 +780,7 @@ xfs_file_dio_aio_write(
 			mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if (!IS_DAX(inode) &&
-	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
@@ -825,14 +849,9 @@ xfs_file_dio_aio_write(
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
 	data = *from;
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-				xfs_end_io_direct_write, 0);
-	} else {
-		ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-				xfs_get_blocks_direct, xfs_end_io_direct_write,
-				NULL, DIO_ASYNC_EXTEND);
-	}
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, xfs_end_io_direct_write,
+			NULL, DIO_ASYNC_EXTEND);
 
 	/* see generic_file_direct_write() for why this is necessary */
 	if (mapping->nrpages) {
@@ -849,10 +868,70 @@ xfs_file_dio_aio_write(
 	xfs_rw_iunlock(ip, iolock);
 
 	/*
-	 * No fallback to buffered IO on errors for XFS. DAX can result in
-	 * partial writes, but direct IO will either complete fully or fail.
+	 * No fallback to buffered IO on errors for XFS, direct IO will either
+	 * complete fully or fail.
 	 */
-	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	int			unaligned_io = 0;
+	int			iolock;
+	struct iov_iter		data;
+
+	/* "unaligned" here means not aligned to a filesystem block */
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+		unaligned_io = 1;
+		iolock = XFS_IOLOCK_EXCL;
+	} else if (mapping->nrpages) {
+		iolock = XFS_IOLOCK_EXCL;
+	} else {
+		iolock = XFS_IOLOCK_SHARED;
+	}
+	xfs_rw_ilock(ip, iolock);
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2(mapping);
+		WARN_ON_ONCE(ret);
+	}
+
+	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+	data = *from;
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+			xfs_end_io_direct_write, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(from, ret);
+	}
+out:
+	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -934,7 +1013,9 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_write(iocb, from);
+	else if (iocb->ki_flags & IOCB_DIRECT)
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1164,8 +1164,10 @@ DEFINE_EVENT(xfs_file_class, name, \
 		TP_ARGS(ip, count, offset))
 DEFINE_RW_EVENT(xfs_file_buffered_read);
 DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 
 DECLARE_EVENT_CLASS(xfs_page_class,