Commit 4f016a31 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'iomap-5.12-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull iomap updates from Darrick Wong:
 "The big change in this cycle is some new code to make it possible for
  XFS to try unaligned directio overwrites without taking locks. If the
  block is fully written and within EOF (i.e. doesn't require any
  further fs intervention) then we can let the unlocked write proceed.
  If not, we fall back to synchronizing direct writes.

  Summary:

   - Adjust the final parameter of iomap_dio_rw.

   - Add a new flag to request that iomap directio writes return EAGAIN
     if the write is not a pure overwrite within EOF; this will be used
     to reduce lock contention with unaligned direct writes on XFS.

   - Amend XFS' directio code to eliminate exclusive locking for
     unaligned direct writes if the circumstances permit"

* tag 'iomap-5.12-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: reduce exclusive locking on unaligned dio
  xfs: split the unaligned DIO write code out
  xfs: improve the reflink_bounce_dio_write tracepoint
  xfs: simplify the read/write tracepoints
  xfs: remove the buffered I/O fallback assert
  xfs: cleanup the read/write helper naming
  xfs: make xfs_file_aio_write_checks IOCB_NOWAIT-aware
  xfs: factor out a xfs_ilock_iocb helper
  iomap: add a IOMAP_DIO_OVERWRITE_ONLY flag
  iomap: pass a flags argument to iomap_dio_rw
  iomap: rename the flags variable in __iomap_dio_rw
parents f0236163 ed1128c2
......@@ -1942,8 +1942,8 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
goto buffered;
}
dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
&btrfs_dio_ops, is_sync_kiocb(iocb));
dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
0);
btrfs_inode_unlock(inode, ilock_flags);
......@@ -3618,8 +3618,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
return 0;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
is_sync_kiocb(iocb));
ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
......
......@@ -74,8 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
is_sync_kiocb(iocb));
ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
......@@ -550,7 +549,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ilock_shared)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
is_sync_kiocb(iocb) || unaligned_io || extend);
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
if (ret == -ENOTBLK)
ret = 0;
......
......@@ -797,9 +797,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
if (ret)
goto out_uninit;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
is_sync_kiocb(iocb));
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
......@@ -833,8 +831,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
if (offset + len > i_size_read(&ip->i_inode))
goto out;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
is_sync_kiocb(iocb));
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
out:
......
......@@ -451,23 +451,22 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
bool wait_for_completion)
unsigned int dio_flags)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
loff_t end = iocb->ki_pos + count - 1, ret = 0;
unsigned int flags = IOMAP_DIRECT;
bool wait_for_completion =
is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
unsigned int iomap_flags = IOMAP_DIRECT;
struct blk_plug plug;
struct iomap_dio *dio;
if (!count)
return NULL;
if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
return ERR_PTR(-EIO);
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return ERR_PTR(-ENOMEM);
......@@ -492,7 +491,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iter_is_iovec(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
flags |= IOMAP_WRITE;
iomap_flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
/* for data sync or sync, we need sync completion processing */
......@@ -514,7 +513,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
ret = -EAGAIN;
goto out_free_dio;
}
flags |= IOMAP_NOWAIT;
iomap_flags |= IOMAP_NOWAIT;
}
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (pos >= dio->i_size || pos + count > dio->i_size)
goto out_free_dio;
iomap_flags |= IOMAP_OVERWRITE_ONLY;
}
ret = filemap_write_and_wait_range(mapping, pos, end);
......@@ -545,7 +551,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
do {
ret = iomap_apply(inode, pos, count, flags, ops, dio,
ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio,
iomap_dio_actor);
if (ret <= 0) {
/* magic error code to fall back to buffered I/O */
......@@ -629,11 +635,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
bool wait_for_completion)
unsigned int dio_flags)
{
struct iomap_dio *dio;
dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
......
This diff is collapsed.
......@@ -784,15 +784,28 @@ xfs_direct_write_iomap_begin(
goto allocate_blocks;
/*
* NOWAIT IO needs to span the entire requested IO with a single map so
* that we avoid partial IO failures due to the rest of the IO range not
* covered by this map triggering an EAGAIN condition when it is
* subsequently mapped and aborting the IO.
* NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
* a single map so that we avoid partial IO failures due to the rest of
* the I/O range not covered by this map triggering an EAGAIN condition
* when it is subsequently mapped and aborting the I/O.
*/
if ((flags & IOMAP_NOWAIT) &&
!imap_spans_range(&imap, offset_fsb, end_fsb)) {
if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
error = -EAGAIN;
goto out_unlock;
if (!imap_spans_range(&imap, offset_fsb, end_fsb))
goto out_unlock;
}
/*
* For overwrite only I/O, we cannot convert unwritten extents without
* requiring sub-block zeroing. This can only be done under an
* exclusive IOLOCK, hence return -EAGAIN if this is not a written
* extent to tell the caller to try again.
*/
if (flags & IOMAP_OVERWRITE_ONLY) {
error = -EAGAIN;
if (imap.br_state != XFS_EXT_NORM &&
((offset | length) & mp->m_blockmask))
goto out_unlock;
}
xfs_iunlock(ip, lockmode);
......@@ -801,7 +814,7 @@ xfs_direct_write_iomap_begin(
allocate_blocks:
error = -EAGAIN;
if (flags & IOMAP_NOWAIT)
if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
goto out_unlock;
/*
......
......@@ -1287,8 +1287,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
)
DECLARE_EVENT_CLASS(xfs_file_class,
TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
TP_ARGS(ip, count, offset),
TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
TP_ARGS(iocb, iter),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
......@@ -1297,11 +1297,11 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__field(size_t, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
__entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
__entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
__entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size;
__entry->offset = iocb->ki_pos;
__entry->count = iov_iter_count(iter);
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
......@@ -1313,14 +1313,16 @@ DECLARE_EVENT_CLASS(xfs_file_class,
#define DEFINE_RW_EVENT(name) \
DEFINE_EVENT(xfs_file_class, name, \
TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
TP_ARGS(ip, count, offset))
TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), \
TP_ARGS(iocb, iter))
DEFINE_RW_EVENT(xfs_file_buffered_read);
DEFINE_RW_EVENT(xfs_file_direct_read);
DEFINE_RW_EVENT(xfs_file_dax_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
......@@ -3294,8 +3296,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
......
......@@ -780,7 +780,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
&zonefs_write_dio_ops, sync);
&zonefs_write_dio_ops, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
......@@ -917,7 +917,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
&zonefs_read_dio_ops, is_sync_kiocb(iocb));
&zonefs_read_dio_ops, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
......
......@@ -123,6 +123,7 @@ struct iomap_page_ops {
#define IOMAP_FAULT (1 << 3) /* mapping for page fault */
#define IOMAP_DIRECT (1 << 4) /* direct I/O */
#define IOMAP_NOWAIT (1 << 5) /* do not block */
#define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */
struct iomap_ops {
/*
......@@ -257,12 +258,25 @@ struct iomap_dio_ops {
struct bio *bio, loff_t file_offset);
};
/*
* Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not
* synchronous.
*/
#define IOMAP_DIO_FORCE_WAIT (1 << 0)
/*
* Do not allocate blocks or zero partial blocks, but instead fall back to
* the caller by returning -EAGAIN. Used to optimize direct I/O writes that
* are not aligned to the file system block size.
*/
#define IOMAP_DIO_OVERWRITE_ONLY (1 << 1)
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
bool wait_for_completion);
unsigned int dio_flags);
struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
bool wait_for_completion);
unsigned int dio_flags);
ssize_t iomap_dio_complete(struct iomap_dio *dio);
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment