Commit bc0e2bbf authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] O_DIRECT data exposure fixes

From: Badari Pulavarty, Suparna Bhattacharya, Andrew Morton

Forward port of Stephen Tweedie's DIO fixes from 2.4, to fix various DIO vs
buffered IO exposures involving races causing:

(a) stale data from uninstantiated blocks to be read, e.g.

    - O_DIRECT reads against buffered writes to a sparse region

    - O_DIRECT writes to a sparse region against buffered reads

(b) potential data corruption with

    - O_DIRECT IOs against truncate

    due to writes to truncated blocks (which may have been reallocated to
    another file).

Summary of fixes:

1) All the changes affect only regular files.  RAW/O_DIRECT on block are
   unaffected. 

2) The DIO code will not fill in sparse regions on a write.  Instead
   -ENOTBLK is returned and the generic file write code would fallthrough to
   buffered IO in this case followed by writing through the pages to disk
   using filemap_fdatawrite/wait.

3) i_sem is held during both DIO reads and writes.  For reads, and writes
   to already allocated blocks, it is released right after IO is issued,
   while for writes to newly allocated blocks (e.g file extending writes and
   hole overwrites) it is held all the way through until IO completes (and
   data is committed to disk).

4) filemap_fdatawrite/wait are called under i_sem to synchronize buffered
   pages to disk blocks before issuing DIO.

5) A new rwsem (i_alloc_sem) is held in shared mode all the while a DIO
   (read or write) is in progress, and in exclusive mode by truncate to guard
   against deallocation of data blocks during DIO. 

6) All this new locking has been pushed down into blockdev_direct_IO to
   avoid interfering with NFS direct IO.  The locks are taken in the order
   i_sem followed by i_alloc_sem.  While i_sem may be released after IO
   submission in some cases, i_alloc_sem is held through until dio_complete
   (in the case of AIO-DIO this happens through the IO completion callback).

7) i_sem and i_alloc_sem are not held for the _nolock versions of write
   routines, as used by blockdev and XFS.  Filesystems can specify the
   needs_special_locking parameter to __blockdev_direct_IO from their direct
   IO address space op accordingly.

Note from Badari:
Here is the locking (when needs_special_locking is true):

(1) generic_file_*_write() holds i_sem (as before) and calls
    ->direct_IO().  blockdev_direct_IO gets i_alloc_sem and call
    direct_io_worker().

(2) generic_file_*_read() does not hold any locks.  blockdev_direct_IO()
    gets i_sem and then i_alloc_sem and calls direct_io_worker() to do the
    work

(3) direct_io_worker() does the work and drops i_sem after submitting IOs
    if appropriate and drops i_alloc_sem after completing IOs.
parent 62a36b1f
...@@ -52,6 +52,10 @@ ...@@ -52,6 +52,10 @@
* *
* If blkfactor is zero then the user's request was aligned to the filesystem's * If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize. * blocksize.
*
* needs_locking is set for regular files on direct-IO-naive filesystems. It
* determines whether we need to do the fancy locking which prevents direct-IO
* from being able to read uninitialised disk blocks.
*/ */
struct dio { struct dio {
...@@ -59,6 +63,7 @@ struct dio { ...@@ -59,6 +63,7 @@ struct dio {
struct bio *bio; /* bio under assembly */ struct bio *bio; /* bio under assembly */
struct inode *inode; struct inode *inode;
int rw; int rw;
int needs_locking; /* doesn't change */
unsigned blkbits; /* doesn't change */ unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft is finer than the filesystem's soft
...@@ -206,6 +211,8 @@ static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes) ...@@ -206,6 +211,8 @@ static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
{ {
if (dio->end_io) if (dio->end_io)
dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
if (dio->needs_locking)
up_read(&dio->inode->i_alloc_sem);
} }
/* /*
...@@ -449,6 +456,7 @@ static int get_more_blocks(struct dio *dio) ...@@ -449,6 +456,7 @@ static int get_more_blocks(struct dio *dio)
unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */
unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */
unsigned long blkmask; unsigned long blkmask;
int beyond_eof = 0;
/* /*
* If there was a memory error and we've overwritten all the * If there was a memory error and we've overwritten all the
...@@ -466,8 +474,19 @@ static int get_more_blocks(struct dio *dio) ...@@ -466,8 +474,19 @@ static int get_more_blocks(struct dio *dio)
if (dio_count & blkmask) if (dio_count & blkmask)
fs_count++; fs_count++;
if (dio->needs_locking) {
if (dio->block_in_file >= (i_size_read(dio->inode) >>
dio->blkbits))
beyond_eof = 1;
}
/*
* For writes inside i_size we forbid block creations: only
* overwrites are permitted. We fall back to buffered writes
* at a higher level for inside-i_size block-instantiating
* writes.
*/
ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
map_bh, dio->rw == WRITE); map_bh, (dio->rw == WRITE) && beyond_eof);
} }
return ret; return ret;
} }
...@@ -774,6 +793,10 @@ static int do_direct_IO(struct dio *dio) ...@@ -774,6 +793,10 @@ static int do_direct_IO(struct dio *dio)
if (!buffer_mapped(map_bh)) { if (!buffer_mapped(map_bh)) {
char *kaddr; char *kaddr;
/* AKPM: eargh, -ENOTBLK is a hack */
if (dio->rw == WRITE)
return -ENOTBLK;
if (dio->block_in_file >= if (dio->block_in_file >=
i_size_read(dio->inode)>>blkbits) { i_size_read(dio->inode)>>blkbits) {
/* We hit eof */ /* We hit eof */
...@@ -839,21 +862,21 @@ static int do_direct_IO(struct dio *dio) ...@@ -839,21 +862,21 @@ static int do_direct_IO(struct dio *dio)
return ret; return ret;
} }
/*
* Releases both i_sem and i_alloc_sem
*/
static int static int
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
const struct iovec *iov, loff_t offset, unsigned long nr_segs, const struct iovec *iov, loff_t offset, unsigned long nr_segs,
unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io) unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io,
struct dio *dio)
{ {
unsigned long user_addr; unsigned long user_addr;
int seg; int seg;
int ret = 0; int ret = 0;
int ret2; int ret2;
struct dio *dio;
size_t bytes; size_t bytes;
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return -ENOMEM;
dio->is_async = !is_sync_kiocb(iocb); dio->is_async = !is_sync_kiocb(iocb);
dio->bio = NULL; dio->bio = NULL;
...@@ -864,7 +887,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -864,7 +887,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
dio->start_zero_done = 0; dio->start_zero_done = 0;
dio->block_in_file = offset >> blkbits; dio->block_in_file = offset >> blkbits;
dio->blocks_available = 0; dio->blocks_available = 0;
dio->cur_page = NULL; dio->cur_page = NULL;
dio->boundary = 0; dio->boundary = 0;
...@@ -952,6 +974,13 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -952,6 +974,13 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
*/ */
dio_cleanup(dio); dio_cleanup(dio);
/*
* All new block allocations have been performed. We can let i_sem
* go now.
*/
if (dio->needs_locking)
up(&dio->inode->i_sem);
/* /*
* OK, all BIOs are submitted, so we can decrement bio_count to truly * OK, all BIOs are submitted, so we can decrement bio_count to truly
* reflect the number of to-be-processed BIOs. * reflect the number of to-be-processed BIOs.
...@@ -987,11 +1016,17 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -987,11 +1016,17 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
/* /*
* This is a library function for use by filesystem drivers. * This is a library function for use by filesystem drivers.
*
* For writes to S_ISREG files, we are called under i_sem and return with i_sem
* held, even though it is internally dropped.
*
* For writes to S_ISBLK files, i_sem is not held on entry; it is never taken.
*/ */
int int
blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset, struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io) unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
int needs_special_locking)
{ {
int seg; int seg;
size_t size; size_t size;
...@@ -1000,6 +1035,8 @@ blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1000,6 +1035,8 @@ blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
unsigned bdev_blkbits = 0; unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1; unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL; ssize_t retval = -EINVAL;
struct dio *dio;
int needs_locking;
if (bdev) if (bdev)
bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
...@@ -1025,10 +1062,40 @@ blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1025,10 +1062,40 @@ blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
} }
} }
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
/*
* For regular files,
* readers need to grab i_sem and i_alloc_sem
* writers need to grab i_alloc_sem only (i_sem is already held)
*/
needs_locking = 0;
if (S_ISREG(inode->i_mode) && needs_special_locking) {
needs_locking = 1;
if (rw == READ) {
struct address_space *mapping;
mapping = iocb->ki_filp->f_mapping;
down(&inode->i_sem);
retval = filemap_write_and_wait(mapping);
if (retval) {
up(&inode->i_sem);
kfree(dio);
goto out;
}
}
down_read(&inode->i_alloc_sem);
}
dio->needs_locking = needs_locking;
retval = direct_io_worker(rw, iocb, inode, iov, offset, retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_blocks, end_io); nr_segs, blkbits, get_blocks, end_io, dio);
if (needs_locking && rw == WRITE)
down(&inode->i_sem);
out: out:
return retval; return retval;
} }
EXPORT_SYMBOL(__blockdev_direct_IO);
EXPORT_SYMBOL(blockdev_direct_IO);
...@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode) ...@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_devices);
sema_init(&inode->i_sem, 1); sema_init(&inode->i_sem, 1);
init_rwsem(&inode->i_alloc_sem);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.page_lock); spin_lock_init(&inode->i_data.page_lock);
init_MUTEX(&inode->i_data.i_shared_sem); init_MUTEX(&inode->i_data.i_shared_sem);
......
...@@ -192,7 +192,9 @@ int do_truncate(struct dentry *dentry, loff_t length) ...@@ -192,7 +192,9 @@ int do_truncate(struct dentry *dentry, loff_t length)
newattrs.ia_size = length; newattrs.ia_size = length;
newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
down(&dentry->d_inode->i_sem); down(&dentry->d_inode->i_sem);
down_write(&dentry->d_inode->i_alloc_sem);
err = notify_change(dentry, &newattrs); err = notify_change(dentry, &newattrs);
up_write(&dentry->d_inode->i_alloc_sem);
up(&dentry->d_inode->i_sem); up(&dentry->d_inode->i_sem);
return err; return err;
} }
......
...@@ -1032,7 +1032,8 @@ linvfs_direct_IO( ...@@ -1032,7 +1032,8 @@ linvfs_direct_IO(
if (error) if (error)
return -error; return -error;
return blockdev_direct_IO(rw, iocb, inode, iomap.iomap_target->pbr_bdev, return blockdev_direct_IO_no_locking(rw, iocb, inode,
iomap.iomap_target->pbr_bdev,
iov, offset, nr_segs, iov, offset, nr_segs,
linvfs_get_blocks_direct, linvfs_get_blocks_direct,
linvfs_unwritten_convert_direct); linvfs_unwritten_convert_direct);
......
...@@ -397,6 +397,7 @@ struct inode { ...@@ -397,6 +397,7 @@ struct inode {
unsigned short i_bytes; unsigned short i_bytes;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
struct semaphore i_sem; struct semaphore i_sem;
struct rw_semaphore i_alloc_sem;
struct inode_operations *i_op; struct inode_operations *i_op;
struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct super_block *i_sb; struct super_block *i_sb;
...@@ -1235,6 +1236,7 @@ extern void write_inode_now(struct inode *, int); ...@@ -1235,6 +1236,7 @@ extern void write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *); extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *); extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *); extern int filemap_fdatawait(struct address_space *);
extern int filemap_write_and_wait(struct address_space *mapping);
extern void sync_supers(void); extern void sync_supers(void);
extern void sync_filesystems(int wait); extern void sync_filesystems(int wait);
extern void emergency_sync(void); extern void emergency_sync(void);
...@@ -1347,9 +1349,6 @@ extern void ...@@ -1347,9 +1349,6 @@ extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs); const struct iovec *iov, loff_t offset, unsigned long nr_segs);
extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io);
extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos); unsigned long nr_segs, loff_t *ppos);
ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, ssize_t generic_file_writev(struct file *filp, const struct iovec *iov,
...@@ -1371,6 +1370,32 @@ static inline void do_generic_file_read(struct file * filp, loff_t *ppos, ...@@ -1371,6 +1370,32 @@ static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
actor); actor);
} }
int __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
int needs_special_locking);
/*
* For filesystems which need locking between buffered and direct access
*/
static inline int blockdev_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode, struct block_device *bdev, const struct iovec *iov,
loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
nr_segs, get_blocks, end_io, 1);
}
static inline int blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
struct inode *inode, struct block_device *bdev, const struct iovec *iov,
loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
nr_segs, get_blocks, end_io, 0);
}
extern struct file_operations generic_ro_fops; extern struct file_operations generic_ro_fops;
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
......
...@@ -73,6 +73,9 @@ ...@@ -73,6 +73,9 @@
* ->mmap_sem * ->mmap_sem
* ->i_sem (msync) * ->i_sem (msync)
* *
* ->i_sem
* ->i_alloc_sem (various)
*
* ->inode_lock * ->inode_lock
* ->sb_lock (fs/fs-writeback.c) * ->sb_lock (fs/fs-writeback.c)
* ->mapping->page_lock (__sync_single_inode) * ->mapping->page_lock (__sync_single_inode)
...@@ -228,6 +231,18 @@ int filemap_fdatawait(struct address_space * mapping) ...@@ -228,6 +231,18 @@ int filemap_fdatawait(struct address_space * mapping)
EXPORT_SYMBOL(filemap_fdatawait); EXPORT_SYMBOL(filemap_fdatawait);
int filemap_write_and_wait(struct address_space *mapping)
{
int retval = 0;
if (mapping->nrpages) {
retval = filemap_fdatawrite(mapping);
if (retval == 0)
retval = filemap_fdatawait(mapping);
}
return retval;
}
/* /*
* This adds a page to the page cache, starting out as locked, unreferenced, * This adds a page to the page cache, starting out as locked, unreferenced,
* not uptodate and with no errors. * not uptodate and with no errors.
...@@ -1716,6 +1731,7 @@ EXPORT_SYMBOL(generic_write_checks); ...@@ -1716,6 +1731,7 @@ EXPORT_SYMBOL(generic_write_checks);
/* /*
* Write to a file through the page cache. * Write to a file through the page cache.
* Called under i_sem for S_ISREG files.
* *
* We put everything into the page cache prior to writing it. This is not a * We put everything into the page cache prior to writing it. This is not a
* problem when writing full pages. With partial pages, however, we first have * problem when writing full pages. With partial pages, however, we first have
...@@ -1806,12 +1822,19 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, ...@@ -1806,12 +1822,19 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
/* /*
* Sync the fs metadata but not the minor inode changes and * Sync the fs metadata but not the minor inode changes and
* of course not the data as we did direct DMA for the IO. * of course not the data as we did direct DMA for the IO.
* i_sem is held, which protects generic_osync_inode() from
* livelocking.
*/ */
if (written >= 0 && file->f_flags & O_SYNC) if (written >= 0 && file->f_flags & O_SYNC)
status = generic_osync_inode(inode, mapping, OSYNC_METADATA); status = generic_osync_inode(inode, mapping, OSYNC_METADATA);
if (written >= 0 && !is_sync_kiocb(iocb)) if (written >= 0 && !is_sync_kiocb(iocb))
written = -EIOCBQUEUED; written = -EIOCBQUEUED;
if (written != -ENOTBLK)
goto out_status; goto out_status;
/*
* direct-io write to a hole: fall through to buffered I/O
*/
written = 0;
} }
buf = iov->iov_base; buf = iov->iov_base;
...@@ -1900,6 +1923,14 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, ...@@ -1900,6 +1923,14 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
OSYNC_METADATA|OSYNC_DATA); OSYNC_METADATA|OSYNC_DATA);
} }
/*
* If we get here for O_DIRECT writes then we must have fallen through
* to buffered writes (block instantiation inside i_size). So we sync
* the file data here, to try to honour O_DIRECT expectations.
*/
if (unlikely(file->f_flags & O_DIRECT) && written)
status = filemap_write_and_wait(mapping);
out_status: out_status:
err = written ? written : status; err = written ? written : status;
out: out:
...@@ -1991,6 +2022,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, ...@@ -1991,6 +2022,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
EXPORT_SYMBOL(generic_file_writev); EXPORT_SYMBOL(generic_file_writev);
/*
* Called under i_sem for writes to S_ISREG files
*/
ssize_t ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs) loff_t offset, unsigned long nr_segs)
...@@ -1999,18 +2033,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, ...@@ -1999,18 +2033,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
struct address_space *mapping = file->f_mapping; struct address_space *mapping = file->f_mapping;
ssize_t retval; ssize_t retval;
if (mapping->nrpages) { retval = filemap_write_and_wait(mapping);
retval = filemap_fdatawrite(mapping); if (retval == 0) {
if (retval == 0) retval = mapping->a_ops->direct_IO(rw, iocb, iov,
retval = filemap_fdatawait(mapping); offset, nr_segs);
if (retval)
goto out;
}
retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
if (rw == WRITE && mapping->nrpages) if (rw == WRITE && mapping->nrpages)
invalidate_inode_pages2(mapping); invalidate_inode_pages2(mapping);
out: }
return retval; return retval;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment