Commit 4f016a31 authored by Linus Torvalds

Merge tag 'iomap-5.12-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull iomap updates from Darrick Wong:
 "The big change in this cycle is some new code to make it possible for
  XFS to try unaligned directio overwrites without taking locks. If the
  block is fully written and within EOF (i.e. doesn't require any
  further fs intervention) then we can let the unlocked write proceed.
  If not, we fall back to synchronizing direct writes.

  Summary:

   - Adjust the final parameter of iomap_dio_rw.

   - Add a new flag to request that iomap directio writes return EAGAIN
     if the write is not a pure overwrite within EOF; this will be used
     to reduce lock contention with unaligned direct writes on XFS.

   - Amend XFS' directio code to eliminate exclusive locking for
     unaligned direct writes if the circumstances permit"
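
  As a quick illustration of the API change (a sketch distilled from the
  diff below, using placeholder ops names rather than any real
  filesystem's): the final wait_for_completion bool of iomap_dio_rw() and
  __iomap_dio_rw() becomes a dio_flags word, so callers pass 0,
  IOMAP_DIO_FORCE_WAIT, or IOMAP_DIO_OVERWRITE_ONLY instead of an
  is_sync_kiocb()-derived boolean:

	/* before: last argument was a wait_for_completion bool */
	ret = iomap_dio_rw(iocb, to, &fs_iomap_ops, NULL, is_sync_kiocb(iocb));

	/* after: 0 means "wait only if the kiocb itself is synchronous" */
	ret = iomap_dio_rw(iocb, to, &fs_iomap_ops, NULL, 0);

	/* writes that must drain before returning ask for it explicitly */
	ret = iomap_dio_rw(iocb, from, &fs_iomap_ops, &fs_dio_ops,
			   IOMAP_DIO_FORCE_WAIT);

	/*
	 * Optimistic unaligned overwrite: -EAGAIN tells the caller to retry
	 * under an exclusive lock, as the XFS code in this series does.
	 */
	ret = iomap_dio_rw(iocb, from, &fs_iomap_ops, &fs_dio_ops,
			   IOMAP_DIO_OVERWRITE_ONLY);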

* tag 'iomap-5.12-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: reduce exclusive locking on unaligned dio
  xfs: split the unaligned DIO write code out
  xfs: improve the reflink_bounce_dio_write tracepoint
  xfs: simplify the read/write tracepoints
  xfs: remove the buffered I/O fallback assert
  xfs: cleanup the read/write helper naming
  xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware
  xfs: factor out a xfs_ilock_iocb helper
  iomap: add a IOMAP_DIO_OVERWRITE_ONLY flag
  iomap: pass a flags argument to iomap_dio_rw
  iomap: rename the flags variable in __iomap_dio_rw
parents f0236163 ed1128c2
@@ -1942,8 +1942,8 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 		goto buffered;
 	}
-	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
-			&btrfs_dio_ops, is_sync_kiocb(iocb));
+	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+			0);
 	btrfs_inode_unlock(inode, ilock_flags);
@@ -3618,8 +3618,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 		return 0;
 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
-	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 	return ret;
 }
@@ -74,8 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		return generic_file_read_iter(iocb, to);
 	}
-	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
-			is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
 	inode_unlock_shared(inode);
 	file_accessed(iocb->ki_filp);
@@ -550,7 +549,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ilock_shared)
 		iomap_ops = &ext4_iomap_overwrite_ops;
 	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
-			is_sync_kiocb(iocb) || unaligned_io || extend);
+			(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
 	if (ret == -ENOTBLK)
 		ret = 0;
@@ -797,9 +797,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 	if (ret)
 		goto out_uninit;
-	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
-			is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
 	gfs2_glock_dq(gh);
 out_uninit:
 	gfs2_holder_uninit(gh);
@@ -833,8 +831,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	if (offset + len > i_size_read(&ip->i_inode))
 		goto out;
-	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
-			is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
 	if (ret == -ENOTBLK)
 		ret = 0;
 out:
@@ -451,23 +451,22 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 struct iomap_dio *
 __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion)
+		unsigned int dio_flags)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
 	loff_t end = iocb->ki_pos + count - 1, ret = 0;
-	unsigned int flags = IOMAP_DIRECT;
+	bool wait_for_completion =
+		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
+	unsigned int iomap_flags = IOMAP_DIRECT;
 	struct blk_plug plug;
 	struct iomap_dio *dio;
 	if (!count)
 		return NULL;
-	if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
-		return ERR_PTR(-EIO);
 	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	if (!dio)
 		return ERR_PTR(-ENOMEM);
@@ -492,7 +491,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		if (iter_is_iovec(iter))
 			dio->flags |= IOMAP_DIO_DIRTY;
 	} else {
-		flags |= IOMAP_WRITE;
+		iomap_flags |= IOMAP_WRITE;
 		dio->flags |= IOMAP_DIO_WRITE;
 		/* for data sync or sync, we need sync completion processing */
@@ -514,7 +513,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			ret = -EAGAIN;
 			goto out_free_dio;
 		}
-		flags |= IOMAP_NOWAIT;
+		iomap_flags |= IOMAP_NOWAIT;
+	}
+	if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
+		ret = -EAGAIN;
+		if (pos >= dio->i_size || pos + count > dio->i_size)
+			goto out_free_dio;
+		iomap_flags |= IOMAP_OVERWRITE_ONLY;
 	}
 	ret = filemap_write_and_wait_range(mapping, pos, end);
@@ -545,7 +551,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	blk_start_plug(&plug);
 	do {
-		ret = iomap_apply(inode, pos, count, flags, ops, dio,
+		ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio,
 				iomap_dio_actor);
 		if (ret <= 0) {
 			/* magic error code to fall back to buffered I/O */
@@ -629,11 +635,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion)
+		unsigned int dio_flags)
 {
 	struct iomap_dio *dio;
-	dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
+	dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
 	if (IS_ERR_OR_NULL(dio))
 		return PTR_ERR_OR_ZERO(dio);
 	return iomap_dio_complete(dio);
@@ -197,30 +197,42 @@ xfs_file_fsync(
 	return error;
 }
+static int
+xfs_ilock_iocb(
+	struct kiocb *iocb,
+	unsigned int lock_mode)
+{
+	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!xfs_ilock_nowait(ip, lock_mode))
+			return -EAGAIN;
+	} else {
+		xfs_ilock(ip, lock_mode);
+	}
+	return 0;
+}
 STATIC ssize_t
-xfs_file_dio_aio_read(
+xfs_file_dio_read(
 	struct kiocb *iocb,
 	struct iov_iter *to)
 {
 	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
-	size_t count = iov_iter_count(to);
 	ssize_t ret;
-	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
-	if (!count)
+	trace_xfs_file_direct_read(iocb, to);
+	if (!iov_iter_count(to))
 		return 0; /* skip atime */
 	file_accessed(iocb->ki_filp);
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
-			is_sync_kiocb(iocb));
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
+	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
@@ -232,21 +244,16 @@ xfs_file_dax_read(
 	struct iov_iter *to)
 {
 	struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
-	size_t count = iov_iter_count(to);
 	ssize_t ret = 0;
-	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
-	if (!count)
+	trace_xfs_file_dax_read(iocb, to);
+	if (!iov_iter_count(to))
 		return 0; /* skip atime */
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -255,21 +262,18 @@ xfs_file_dax_read(
 }
 STATIC ssize_t
-xfs_file_buffered_aio_read(
+xfs_file_buffered_read(
 	struct kiocb *iocb,
 	struct iov_iter *to)
 {
 	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
 	ssize_t ret;
-	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+	trace_xfs_file_buffered_read(iocb, to);
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
+	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+	if (ret)
+		return ret;
 	ret = generic_file_read_iter(iocb, to);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -293,9 +297,9 @@ xfs_file_read_iter(
 	if (IS_DAX(inode))
 		ret = xfs_file_dax_read(iocb, to);
 	else if (iocb->ki_flags & IOCB_DIRECT)
-		ret = xfs_file_dio_aio_read(iocb, to);
+		ret = xfs_file_dio_read(iocb, to);
 	else
-		ret = xfs_file_buffered_aio_read(iocb, to);
+		ret = xfs_file_buffered_read(iocb, to);
 	if (ret > 0)
 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
@@ -310,7 +314,7 @@ xfs_file_read_iter(
  * if called for a direct write beyond i_size.
  */
 STATIC ssize_t
-xfs_file_aio_write_checks(
+xfs_file_write_checks(
 	struct kiocb *iocb,
 	struct iov_iter *from,
 	int *iolock)
@@ -328,7 +332,14 @@ xfs_file_aio_write_checks(
 	if (error <= 0)
 		return error;
-	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		error = break_layout(inode, false);
+		if (error == -EWOULDBLOCK)
+			error = -EAGAIN;
+	} else {
+		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+	}
 	if (error)
 		return error;
@@ -339,7 +350,11 @@ xfs_file_aio_write_checks(
 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
 		xfs_iunlock(ip, *iolock);
 		*iolock = XFS_IOLOCK_EXCL;
-		xfs_ilock(ip, *iolock);
+		error = xfs_ilock_iocb(iocb, *iolock);
+		if (error) {
+			*iolock = 0;
+			return error;
+		}
 		goto restart;
 	}
 	/*
@@ -361,6 +376,10 @@ xfs_file_aio_write_checks(
 	isize = i_size_read(inode);
 	if (iocb->ki_pos > isize) {
 		spin_unlock(&ip->i_flags_lock);
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
 		if (!drained_dio) {
 			if (*iolock == XFS_IOLOCK_SHARED) {
 				xfs_iunlock(ip, *iolock);
@@ -480,122 +499,149 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
 };
 /*
- * xfs_file_dio_aio_write - handle direct IO writes
- *
- * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
- *
- * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- * pages are flushed out.
- *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer. To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * Handle block aligned direct I/O writes
  */
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+	struct xfs_inode *ip,
 	struct kiocb *iocb,
 	struct iov_iter *from)
 {
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	struct xfs_inode *ip = XFS_I(inode);
-	struct xfs_mount *mp = ip->i_mount;
-	ssize_t ret = 0;
-	int unaligned_io = 0;
-	int iolock;
-	size_t count = iov_iter_count(from);
-	struct xfs_buftarg *target = xfs_inode_buftarg(ip);
-	/* DIO must be aligned to device logical sector size */
-	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
-		return -EINVAL;
+	int iolock = XFS_IOLOCK_SHARED;
+	ssize_t ret;
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
+	ret = xfs_file_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out_unlock;
 	/*
-	 * Don't take the exclusive iolock here unless the I/O is unaligned to
-	 * the file system block size. We don't need to consider the EOF
-	 * extension case here because xfs_file_aio_write_checks() will relock
-	 * the inode as necessary for EOF zeroing cases and fill out the new
-	 * inode size as appropriate.
+	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
+	 * the iolock back to shared if we had to take the exclusive lock in
+	 * xfs_file_write_checks() for other reasons.
 	 */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
-		unaligned_io = 1;
-		/*
-		 * We can't properly handle unaligned direct I/O to reflink
-		 * files yet, as we can't unshare a partial block.
-		 */
-		if (xfs_is_cow_inode(ip)) {
-			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
-			return -ENOTBLK;
-		}
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
+	if (iolock == XFS_IOLOCK_EXCL) {
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		/* unaligned dio always waits, bail */
-		if (unaligned_io)
-			return -EAGAIN;
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
-	}
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
-	if (ret)
-		goto out;
-	count = iov_iter_count(from);
-	/*
-	 * If we are doing unaligned IO, we can't allow any other overlapping IO
-	 * in-flight at the same time or we risk data corruption. Wait for all
-	 * other IO to drain before we submit. If the IO is aligned, demote the
-	 * iolock if we had to take the exclusive lock in
-	 * xfs_file_aio_write_checks() for other reasons.
-	 */
-	if (unaligned_io) {
-		inode_dio_wait(inode);
-	} else if (iolock == XFS_IOLOCK_EXCL) {
-		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
-	}
-	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
-	/*
-	 * If unaligned, this is the only IO in-flight. Wait on it before we
-	 * release the iolock to prevent subsequent overlapping IO.
-	 */
-	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
-			   &xfs_dio_write_ops,
-			   is_sync_kiocb(iocb) || unaligned_io);
-out:
-	xfs_iunlock(ip, iolock);
-	/*
-	 * No fallback to buffered IO after short writes for XFS, direct I/O
-	 * will either complete fully or return an error.
-	 */
-	ASSERT(ret < 0 || ret == count);
-	return ret;
-}
+	trace_xfs_file_direct_write(iocb, from);
+	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+			   &xfs_dio_write_ops, 0);
+out_unlock:
+	if (iolock)
+		xfs_iunlock(ip, iolock);
+	return ret;
+}
+/*
+ * Handle block unaligned direct I/O writes
+ *
+ * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct I/O writes. However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ * to do sub-block zeroing and that requires serialisation against other direct
+ * I/O to the same block. In this case we need to serialise the submission of
+ * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ * In the case where sub-block zeroing is not required, we can do concurrent
+ * sub-block dios to the same block successfully.
+ *
+ * Optimistically submit the I/O using the shared lock first, but use the
+ * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ * if block allocation or partial block zeroing would be required. In that case
+ * we try again with the exclusive lock.
+ */
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+	struct xfs_inode *ip,
+	struct kiocb *iocb,
+	struct iov_iter *from)
+{
+	size_t isize = i_size_read(VFS_I(ip));
+	size_t count = iov_iter_count(from);
+	int iolock = XFS_IOLOCK_SHARED;
+	unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
+	ssize_t ret;
+	/*
+	 * Extending writes need exclusivity because of the sub-block zeroing
+	 * that the DIO code always does for partial tail blocks beyond EOF, so
+	 * don't even bother trying the fast path in this case.
+	 */
+	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+retry_exclusive:
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+		iolock = XFS_IOLOCK_EXCL;
+		flags = IOMAP_DIO_FORCE_WAIT;
+	}
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
+	/*
+	 * We can't properly handle unaligned direct I/O to reflink files yet,
+	 * as we can't unshare a partial block.
+	 */
+	if (xfs_is_cow_inode(ip)) {
+		trace_xfs_reflink_bounce_dio_write(iocb, from);
+		ret = -ENOTBLK;
+		goto out_unlock;
+	}
+	ret = xfs_file_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out_unlock;
+	/*
+	 * If we are doing exclusive unaligned I/O, this must be the only I/O
+	 * in-flight. Otherwise we risk data corruption due to unwritten extent
+	 * conversions from the AIO end_io handler. Wait for all other I/O to
+	 * drain first.
+	 */
+	if (flags & IOMAP_DIO_FORCE_WAIT)
+		inode_dio_wait(VFS_I(ip));
+	trace_xfs_file_direct_write(iocb, from);
+	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+			   &xfs_dio_write_ops, flags);
+	/*
+	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
+	 * layer rejected it for mapping or locking reasons. If we are doing
+	 * nonblocking user I/O, propagate the error.
+	 */
+	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+		xfs_iunlock(ip, iolock);
+		goto retry_exclusive;
+	}
+out_unlock:
+	if (iolock)
+		xfs_iunlock(ip, iolock);
+	return ret;
+}
+static ssize_t
+xfs_file_dio_write(
+	struct kiocb *iocb,
+	struct iov_iter *from)
+{
+	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+	struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+	size_t count = iov_iter_count(from);
+	/* direct I/O must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+		return -EINVAL;
+	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+		return xfs_file_dio_write_unaligned(ip, iocb, from);
+	return xfs_file_dio_write_aligned(ip, iocb, from);
+}
 static noinline ssize_t
 xfs_file_dax_write(
 	struct kiocb *iocb,
@@ -605,31 +651,26 @@ xfs_file_dax_write(
 	struct xfs_inode *ip = XFS_I(inode);
 	int iolock = XFS_IOLOCK_EXCL;
 	ssize_t ret, error = 0;
-	size_t count;
 	loff_t pos;
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
-	}
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
+	ret = xfs_file_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	pos = iocb->ki_pos;
-	count = iov_iter_count(from);
-	trace_xfs_file_dax_write(ip, count, pos);
+	trace_xfs_file_dax_write(iocb, from);
 	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 		i_size_write(inode, iocb->ki_pos);
 		error = xfs_setfilesize(ip, pos, ret);
 	}
 out:
-	xfs_iunlock(ip, iolock);
+	if (iolock)
+		xfs_iunlock(ip, iolock);
 	if (error)
 		return error;
@@ -643,7 +684,7 @@ xfs_file_dax_write(
 }
 STATIC ssize_t
-xfs_file_buffered_aio_write(
+xfs_file_buffered_write(
 	struct kiocb *iocb,
 	struct iov_iter *from)
 {
@@ -662,14 +703,14 @@ xfs_file_buffered_aio_write(
 		iolock = XFS_IOLOCK_EXCL;
 	xfs_ilock(ip, iolock);
-	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	ret = xfs_file_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
-	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+	trace_xfs_file_buffered_write(iocb, from);
 	ret = iomap_file_buffered_write(iocb, from,
 			&xfs_buffered_write_iomap_ops);
 	if (likely(ret >= 0))
@@ -749,12 +790,12 @@ xfs_file_write_iter(
 		 * CoW. In all other directio scenarios we do not
 		 * allow an operation to fall back to buffered mode.
 		 */
-		ret = xfs_file_dio_aio_write(iocb, from);
+		ret = xfs_file_dio_write(iocb, from);
 		if (ret != -ENOTBLK)
 			return ret;
 	}
-	return xfs_file_buffered_aio_write(iocb, from);
+	return xfs_file_buffered_write(iocb, from);
 }
 static void
@@ -784,15 +784,28 @@ xfs_direct_write_iomap_begin(
 		goto allocate_blocks;
 	/*
-	 * NOWAIT IO needs to span the entire requested IO with a single map so
-	 * that we avoid partial IO failures due to the rest of the IO range not
-	 * covered by this map triggering an EAGAIN condition when it is
-	 * subsequently mapped and aborting the IO.
+	 * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
+	 * a single map so that we avoid partial IO failures due to the rest of
+	 * the I/O range not covered by this map triggering an EAGAIN condition
+	 * when it is subsequently mapped and aborting the I/O.
 	 */
-	if ((flags & IOMAP_NOWAIT) &&
-	    !imap_spans_range(&imap, offset_fsb, end_fsb)) {
+	if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
 		error = -EAGAIN;
-		goto out_unlock;
+		if (!imap_spans_range(&imap, offset_fsb, end_fsb))
+			goto out_unlock;
+	}
+	/*
+	 * For overwrite only I/O, we cannot convert unwritten extents without
+	 * requiring sub-block zeroing. This can only be done under an
+	 * exclusive IOLOCK, hence return -EAGAIN if this is not a written
+	 * extent to tell the caller to try again.
+	 */
+	if (flags & IOMAP_OVERWRITE_ONLY) {
+		error = -EAGAIN;
+		if (imap.br_state != XFS_EXT_NORM &&
+		    ((offset | length) & mp->m_blockmask))
+			goto out_unlock;
 	}
 	xfs_iunlock(ip, lockmode);
@@ -801,7 +814,7 @@ xfs_direct_write_iomap_begin(
 allocate_blocks:
 	error = -EAGAIN;
-	if (flags & IOMAP_NOWAIT)
+	if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
 		goto out_unlock;
 	/*
@@ -1287,8 +1287,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
 )
 DECLARE_EVENT_CLASS(xfs_file_class,
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
-	TP_ARGS(ip, count, offset),
+	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
+	TP_ARGS(iocb, iter),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
@@ -1297,11 +1297,11 @@ DECLARE_EVENT_CLASS(xfs_file_class,
 		__field(size_t, count)
 	),
 	TP_fast_assign(
-		__entry->dev = VFS_I(ip)->i_sb->s_dev;
-		__entry->ino = ip->i_ino;
-		__entry->size = ip->i_d.di_size;
-		__entry->offset = offset;
-		__entry->count = count;
+		__entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+		__entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
+		__entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size;
+		__entry->offset = iocb->ki_pos;
+		__entry->count = iov_iter_count(iter);
 	),
 	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1313,14 +1313,16 @@ DECLARE_EVENT_CLASS(xfs_file_class,
 #define DEFINE_RW_EVENT(name) \
 DEFINE_EVENT(xfs_file_class, name, \
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
-	TP_ARGS(ip, count, offset))
+	TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), \
+	TP_ARGS(iocb, iter))
 DEFINE_RW_EVENT(xfs_file_buffered_read);
 DEFINE_RW_EVENT(xfs_file_direct_read);
 DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
 DEFINE_RW_EVENT(xfs_file_dax_write);
+DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
 DECLARE_EVENT_CLASS(xfs_imap_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
@@ -3294,8 +3296,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
@@ -780,7 +780,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 		ret = zonefs_file_dio_append(iocb, from);
 	else
 		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
-				   &zonefs_write_dio_ops, sync);
+				   &zonefs_write_dio_ops, 0);
 	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
 	    (ret > 0 || ret == -EIOCBQUEUED)) {
 		if (ret > 0)
@@ -917,7 +917,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		}
 		file_accessed(iocb->ki_filp);
 		ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
-				   &zonefs_read_dio_ops, is_sync_kiocb(iocb));
+				   &zonefs_read_dio_ops, 0);
 	} else {
 		ret = generic_file_read_iter(iocb, to);
 		if (ret == -EIO)
@@ -123,6 +123,7 @@ struct iomap_page_ops {
 #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
 #define IOMAP_NOWAIT		(1 << 5) /* do not block */
+#define IOMAP_OVERWRITE_ONLY	(1 << 6) /* only pure overwrites allowed */
 struct iomap_ops {
 	/*
@@ -257,12 +258,25 @@ struct iomap_dio_ops {
 			struct bio *bio, loff_t file_offset);
 };
+/*
+ * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not
+ * synchronous.
+ */
+#define IOMAP_DIO_FORCE_WAIT	(1 << 0)
+
+/*
+ * Do not allocate blocks or zero partial blocks, but instead fall back to
+ * the caller by returning -EAGAIN. Used to optimize direct I/O writes that
+ * are not aligned to the file system block size.
+ */
+#define IOMAP_DIO_OVERWRITE_ONLY	(1 << 1)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion);
+		unsigned int dio_flags);
 struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion);
+		unsigned int dio_flags);
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
 int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);