Commit d5432de8 authored by Nathan Scott's avatar Nathan Scott Committed by Linus Torvalds

[PATCH] Fix an XFS direct I/O deadlock

Here's a patch which fixes the deadlock Brad Fitzpatrick reported here:
http://lkml.org/lkml/2004/11/14/98

The meat of the problem is a locking order reversal between the XFS I/O
lock and the inode semaphore (i_sem).  When we mix a number of threads
doing both direct reads and writes, we hit an ABBA deadlock after a while
because direct-io.c is taking and dropping i_sem after the XFS read path
has already taken its I/O lock.  This is the wrong way around from XFS's
point of view, in particular its the opposite order to the XFS write path. 
So this patch changes the logic for direct reads in the DIO_OWN_LOCKING
case (i.e.  XFS-only case), but leaves things as is for the other two types
of locking.

Not real pretty, but fixes up the lock ordering and deadlock.

Oh, I have tested direct reads and writes on ext3 and block devices as
well, to ensure they still function correctly with the change (i.e.  they
don't regress, should be a no-op there) which covers the other locking
cases in __blockdev_direct_IO.
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 26a4480e
......@@ -1147,11 +1147,23 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
/*
* This is a library function for use by filesystem drivers.
* The locking rules are governed by the dio_lock_type parameter.
*
* For writes to S_ISREG files, we are called under i_sem and return with i_sem
* held, even though it is internally dropped.
* DIO_NO_LOCKING (no locking, for raw block device access)
* For writes, i_sem is not held on entry; it is never taken.
*
* For writes to S_ISBLK files, i_sem is not held on entry; it is never taken.
* DIO_LOCKING (simple locking for regular files)
* For writes we are called under i_sem and return with i_sem held, even though
* it is internally dropped.
* For reads, i_sem is not held on entry, but it is taken and dropped before
* returning.
*
* DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
* uninitialised data, allowing parallel direct readers and writers)
* For writes we are called without i_sem, return without it, never touch it.
* For reads, i_sem is held on entry and will be released before returning.
*
* Additional i_alloc_sem locking requirements described inline below.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
......@@ -1168,6 +1180,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
int reader_with_isem = (rw == READ && dio_lock_type == DIO_OWN_LOCKING);
if (bdev)
bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
......@@ -1200,12 +1213,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
goto out;
/*
* For block device access DIO_NO_LOCKING is used,
* neither readers nor writers do any locking at all
* For regular files using DIO_LOCKING,
* readers need to grab i_sem and i_alloc_sem
* writers need to grab i_alloc_sem only (i_sem is already held)
* For regular files using DIO_OWN_LOCKING,
* both readers and writers need to grab i_alloc_sem
* neither readers nor writers hold i_sem on entry (nor exit)
* readers need to grab i_alloc_sem only (i_sem is already held)
* writers need to grab i_alloc_sem only
*/
dio->lock_type = dio_lock_type;
if (dio_lock_type != DIO_NO_LOCKING) {
......@@ -1213,20 +1228,25 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct address_space *mapping;
mapping = iocb->ki_filp->f_mapping;
if (dio_lock_type != DIO_OWN_LOCKING) {
down(&inode->i_sem);
reader_with_isem = 1;
}
retval = filemap_write_and_wait(mapping);
if (retval) {
up(&inode->i_sem);
kfree(dio);
goto out;
}
down_read(&inode->i_alloc_sem);
if (dio_lock_type == DIO_OWN_LOCKING)
if (dio_lock_type == DIO_OWN_LOCKING) {
up(&inode->i_sem);
reader_with_isem = 0;
}
} else {
down_read(&inode->i_alloc_sem);
}
}
/*
* For file extending writes updating i_size before data
* writeouts complete can expose uninitialized blocks. So
......@@ -1238,7 +1258,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_blocks, end_io, dio);
if (rw == READ && dio_lock_type == DIO_LOCKING)
reader_with_isem = 0;
out:
if (reader_with_isem)
up(&inode->i_sem);
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
......@@ -244,6 +244,7 @@ xfs_read(
cred_t *credp)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
size_t size = 0;
ssize_t ret;
xfs_fsize_t n;
......@@ -272,7 +273,7 @@ xfs_read(
}
/* END copy & waste from filemap.c */
if (ioflags & IO_ISDIRECT) {
if (unlikely(ioflags & IO_ISDIRECT)) {
xfs_buftarg_t *target =
(ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
......@@ -296,18 +297,20 @@ xfs_read(
return -EIO;
}
if (unlikely(ioflags & IO_ISDIRECT))
down(&inode->i_sem);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
!(ioflags & IO_INVIS)) {
vrwlock_t locktype = VRWLOCK_READ;
ret = XFS_SEND_DATA(mp, DM_EVENT_READ,
ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
BHV_TO_VNODE(bdp), *offset, size,
FILP_DELAY_FLAG(file), &locktype);
if (ret) {
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return -ret;
goto unlock_isem;
}
}
......@@ -316,15 +319,17 @@ xfs_read(
ret = __generic_file_aio_read(iocb, iovp, segs, offset);
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(iocb);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
if (likely(!(ioflags & IO_INVIS)))
xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
unlock_isem:
if (unlikely(ioflags & IO_ISDIRECT))
up(&inode->i_sem);
return ret;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment