Commit 4a4c6811 authored by Andrew Morton's avatar Andrew Morton Committed by Jens Axboe

[PATCH] permit direct IO with finer-than-fs-blocksize alignments

Mainly from Badari Pulavarty

Traditionally we have only supported O_DIRECT I/O at an alignment and
granularity which matches the underlying filesystem.  That typically
means that all IO must be 4k-aligned and a multiple of 4k in size.

Here, we relax that so that direct I/O happens with (typically)
512-byte alignment and multiple-of-512-byte size.

The tricky part is when a write starts and/or ends partway through a
filesystem block which has just been added.  We need to zero out the
parts of that block which lie outside the written region.

We handle that by putting appropriately-sized parts of the ZERO_PAGE
into separate BIOs.

The generic_direct_IO() function has been changed so that the
filesystem must pass in the address of the block_device against which
the IO is to be performed.  I'd have preferred to not do this, but we
do need that info at that time so that alignment checks can be
performed.

If the filesystem passes in a NULL block_device pointer then we fall
back to the old behaviour - must align with the fs blocksize.

There is no trivial way for userspace to know what the minimum
alignment is - it depends on what bdev_hardsect_size() says about the
device.  It is _usually_ 512 bytes, but not always.  This introduces
the risk that someone will develop and test applications which work
fine on their hardware, but will fail on someone else's hardware.

It is possible to query the hardsect size using the BLKSSZGET ioctl
against the backing block device.  This can be performed at runtime or
at application installation time.
parent a9577554
...@@ -120,7 +120,7 @@ blkdev_direct_IO(int rw, struct file *file, const struct iovec *iov, ...@@ -120,7 +120,7 @@ blkdev_direct_IO(int rw, struct file *file, const struct iovec *iov,
{ {
struct inode *inode = file->f_dentry->d_inode->i_mapping->host; struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
return generic_direct_IO(rw, inode, iov, offset, return generic_direct_IO(rw, inode, inode->i_bdev, iov, offset,
nr_segs, blkdev_get_blocks); nr_segs, blkdev_get_blocks);
} }
......
...@@ -29,14 +29,35 @@ ...@@ -29,14 +29,35 @@
*/ */
#define DIO_PAGES 64 #define DIO_PAGES 64
/*
* This code generally works in units of "dio_blocks". A dio_block is
* somewhere between the hard sector size and the filesystem block size. It
* is determined on a per-invocation basis. When talking to the filesystem
* we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
* down by dio->blkfactor. Similarly, fs-blocksize quantities are converted
* to dio_block quantities by shifting left by blkfactor.
*
* If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize.
*/
struct dio { struct dio {
/* BIO submission state */ /* BIO submission state */
struct bio *bio; /* bio under assembly */ struct bio *bio; /* bio under assembly */
struct inode *inode; struct inode *inode;
int rw; int rw;
unsigned blkbits; /* doesn't change */ unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
blocksize, this specifies how much
finer. blkfactor=2 means 1/4-block
alignment. Does not change */
unsigned start_zero_done; /* flag: sub-blocksize zeroing has
been performed at the start of a
write */
int pages_in_io; /* approximate total IO pages */ int pages_in_io; /* approximate total IO pages */
sector_t block_in_file; /* changes */ sector_t block_in_file; /* Current offset into the underlying
file in dio_block units. */
unsigned blocks_available; /* At block_in_file. changes */ unsigned blocks_available; /* At block_in_file. changes */
sector_t final_block_in_request;/* doesn't change */ sector_t final_block_in_request;/* doesn't change */
unsigned first_block_in_page; /* doesn't change, Used only once */ unsigned first_block_in_page; /* doesn't change, Used only once */
...@@ -44,7 +65,8 @@ struct dio { ...@@ -44,7 +65,8 @@ struct dio {
int reap_counter; /* rate limit reaping */ int reap_counter; /* rate limit reaping */
get_blocks_t *get_blocks; /* block mapping function */ get_blocks_t *get_blocks; /* block mapping function */
sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t final_block_in_bio; /* current final block in bio + 1 */
sector_t next_block_for_io; /* next block to be put under IO */ sector_t next_block_for_io; /* next block to be put under IO,
in dio_blocks units */
struct buffer_head map_bh; /* last get_blocks() result */ struct buffer_head map_bh; /* last get_blocks() result */
/* /*
...@@ -340,6 +362,10 @@ static int get_more_blocks(struct dio *dio) ...@@ -340,6 +362,10 @@ static int get_more_blocks(struct dio *dio)
{ {
int ret; int ret;
struct buffer_head *map_bh = &dio->map_bh; struct buffer_head *map_bh = &dio->map_bh;
sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
unsigned long fs_count; /* Number of filesystem-sized blocks */
unsigned long dio_count;/* Number of dio_block-sized blocks */
unsigned long blkmask;
/* /*
* If there was a memory error and we've overwritten all the * If there was a memory error and we've overwritten all the
...@@ -350,8 +376,14 @@ static int get_more_blocks(struct dio *dio) ...@@ -350,8 +376,14 @@ static int get_more_blocks(struct dio *dio)
map_bh->b_state = 0; map_bh->b_state = 0;
map_bh->b_size = 0; map_bh->b_size = 0;
BUG_ON(dio->block_in_file >= dio->final_block_in_request); BUG_ON(dio->block_in_file >= dio->final_block_in_request);
ret = (*dio->get_blocks)(dio->inode, dio->block_in_file, fs_startblk = dio->block_in_file >> dio->blkfactor;
dio->final_block_in_request-dio->block_in_file, dio_count = dio->final_block_in_request - dio->block_in_file;
fs_count = dio_count >> dio->blkfactor;
blkmask = (1 << dio->blkfactor) - 1;
if (dio_count & blkmask)
fs_count++;
ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
map_bh, dio->rw == WRITE); map_bh, dio->rw == WRITE);
} }
return ret; return ret;
...@@ -523,6 +555,49 @@ static void clean_blockdev_aliases(struct dio *dio) ...@@ -523,6 +555,49 @@ static void clean_blockdev_aliases(struct dio *dio)
} }
} }
/*
 * Zero-fill the portion of a newly allocated filesystem block which the
 * current write does not cover.
 *
 * This only does anything when the IO is running at an alignment finer
 * than the filesystem blocksize (dio->blkfactor != 0), the current mapping
 * in dio->map_bh is flagged buffer_new(), and dio->block_in_file sits
 * partway through a filesystem block.
 *
 * @dio: the direct-IO state
 * @end: 0 when called for the start of the IO, in which case the leading
 *       part of the fs block (up to block_in_file) is zeroed; non-zero when
 *       called for the end of the IO, in which case the trailing part of
 *       the fs block (from block_in_file onwards) is zeroed.
 *
 * The zeroes are queued by handing a section of the ZERO_PAGE to
 * submit_page_section().  A non-zero return from that call is not
 * propagated; in that case dio->next_block_for_io is simply left alone.
 */
static void dio_zero_block(struct dio *dio, int end)
{
	unsigned per_fs_block;		/* dio_blocks in one fs block */
	unsigned zero_blocks;		/* In dio_blocks */
	unsigned zero_bytes;
	struct page *zero_page;

	/*
	 * Set on every call, before any of the early returns below, so the
	 * flag is raised whether or not zeroing turns out to be needed.
	 */
	dio->start_zero_done = 1;

	/* fs-blocksize-aligned IO never leaves a partial block behind */
	if (dio->blkfactor == 0)
		return;

	/* Only freshly allocated blocks need their unused part zeroed */
	if (!buffer_new(&dio->map_bh))
		return;

	per_fs_block = 1 << dio->blkfactor;

	/* Offset of the current position within its fs block */
	zero_blocks = dio->block_in_file & (per_fs_block - 1);
	if (zero_blocks == 0)
		return;

	/*
	 * Part of an fs block must be zeroed.  At the start of the IO that
	 * is the piece in front of the current position; at the end it is
	 * whatever remains up to the end of the fs block.
	 */
	if (end)
		zero_blocks = per_fs_block - zero_blocks;

	zero_bytes = zero_blocks << dio->blkbits;
	zero_page = ZERO_PAGE(dio->cur_user_address);

	/* Advance the IO cursor only if the zero section was accepted */
	if (!submit_page_section(dio, zero_page, 0, zero_bytes,
				 dio->next_block_for_io))
		dio->next_block_for_io += zero_blocks;
}
/* /*
* Walk the user pages, and the file, mapping blocks to disk and generating * Walk the user pages, and the file, mapping blocks to disk and generating
* a sequence of (page,offset,len,block) mappings. These mappings are injected * a sequence of (page,offset,len,block) mappings. These mappings are injected
...@@ -565,21 +640,49 @@ static int do_direct_IO(struct dio *dio) ...@@ -565,21 +640,49 @@ static int do_direct_IO(struct dio *dio)
unsigned u; unsigned u;
if (dio->blocks_available == 0) { if (dio->blocks_available == 0) {
/*
* Need to go and map some more disk
*/
unsigned long blkmask;
unsigned long dio_remainder;
ret = get_more_blocks(dio); ret = get_more_blocks(dio);
if (ret) { if (ret) {
page_cache_release(page); page_cache_release(page);
goto out; goto out;
} }
if (buffer_mapped(map_bh)) { if (!buffer_mapped(map_bh))
goto do_holes;
dio->blocks_available = dio->blocks_available =
map_bh->b_size >> dio->blkbits; map_bh->b_size >> dio->blkbits;
dio->next_block_for_io = dio->next_block_for_io =
map_bh->b_blocknr; map_bh->b_blocknr << dio->blkfactor;
if (buffer_new(map_bh)) if (buffer_new(map_bh))
clean_blockdev_aliases(dio); clean_blockdev_aliases(dio);
}
}
if (!dio->blkfactor)
goto do_holes;
blkmask = (1 << dio->blkfactor) - 1;
dio_remainder = (dio->block_in_file & blkmask);
/*
* If we are at the start of IO and that IO
* starts partway into a fs-block,
* dio_remainder will be non-zero. If the IO
* is a read then we can simply advance the IO
* cursor to the first block which is to be
* read. But if the IO is a write and the
* block was newly allocated we cannot do that;
* the start of the fs block must be zeroed out
* on-disk
*/
if (!buffer_new(map_bh))
dio->next_block_for_io += dio_remainder;
dio->blocks_available -= dio_remainder;
}
do_holes:
/* Handle holes */ /* Handle holes */
if (!buffer_mapped(map_bh)) { if (!buffer_mapped(map_bh)) {
char *kaddr = kmap_atomic(page, KM_USER0); char *kaddr = kmap_atomic(page, KM_USER0);
...@@ -592,6 +695,14 @@ static int do_direct_IO(struct dio *dio) ...@@ -592,6 +695,14 @@ static int do_direct_IO(struct dio *dio)
goto next_block; goto next_block;
} }
/*
* If we're performing IO which has an alignment which
* is finer than the underlying fs, go check to see if
* we must zero out the start of this block.
*/
if (unlikely(dio->blkfactor && !dio->start_zero_done))
dio_zero_block(dio, 0);
/* /*
* Work out, in this_chunk_blocks, how much disk we * Work out, in this_chunk_blocks, how much disk we
* can add to this page * can add to this page
...@@ -635,9 +746,9 @@ static int do_direct_IO(struct dio *dio) ...@@ -635,9 +746,9 @@ static int do_direct_IO(struct dio *dio)
static int static int
direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks) loff_t offset, unsigned long nr_segs, unsigned blkbits,
get_blocks_t get_blocks)
{ {
const unsigned blkbits = inode->i_blkbits;
unsigned long user_addr; unsigned long user_addr;
int seg, ret2, ret = 0; int seg, ret2, ret = 0;
struct dio dio; struct dio dio;
...@@ -647,6 +758,8 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, ...@@ -647,6 +758,8 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
dio.inode = inode; dio.inode = inode;
dio.rw = rw; dio.rw = rw;
dio.blkbits = blkbits; dio.blkbits = blkbits;
dio.blkfactor = inode->i_blkbits - blkbits;
dio.start_zero_done = 0;
dio.block_in_file = offset >> blkbits; dio.block_in_file = offset >> blkbits;
dio.blocks_available = 0; dio.blocks_available = 0;
...@@ -702,6 +815,12 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, ...@@ -702,6 +815,12 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
} /* end iovec loop */ } /* end iovec loop */
/*
* There may be some unwritten disk at the end of a part-written
* fs-block-sized block. Go zero that now.
*/
dio_zero_block(&dio, 1);
if (dio.cur_page) { if (dio.cur_page) {
ret2 = dio_send_cur_page(&dio); ret2 = dio_send_cur_page(&dio);
page_cache_release(dio.cur_page); page_cache_release(dio.cur_page);
...@@ -723,27 +842,44 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, ...@@ -723,27 +842,44 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
* This is a library function for use by filesystem drivers. * This is a library function for use by filesystem drivers.
*/ */
int int
generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov, generic_direct_IO(int rw, struct inode *inode, struct block_device *bdev,
loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks) const struct iovec *iov, loff_t offset, unsigned long nr_segs,
get_blocks_t get_blocks)
{ {
int seg; int seg;
size_t size; size_t size;
unsigned long addr; unsigned long addr;
unsigned blocksize_mask = (1 << inode->i_blkbits) - 1; unsigned blkbits = inode->i_blkbits;
unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL; ssize_t retval = -EINVAL;
if (bdev)
bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
if (offset & blocksize_mask) {
if (bdev)
blkbits = bdev_blkbits;
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask) if (offset & blocksize_mask)
goto out; goto out;
}
/* Check the memory alignment. Blocks cannot straddle pages */ /* Check the memory alignment. Blocks cannot straddle pages */
for (seg = 0; seg < nr_segs; seg++) { for (seg = 0; seg < nr_segs; seg++) {
addr = (unsigned long)iov[seg].iov_base; addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len; size = iov[seg].iov_len;
if ((addr & blocksize_mask) || (size & blocksize_mask)) {
if (bdev)
blkbits = bdev_blkbits;
blocksize_mask = (1 << blkbits) - 1;
if ((addr & blocksize_mask) || (size & blocksize_mask)) if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out; goto out;
} }
}
retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks); retval = direct_io_worker(rw, inode, iov, offset,
nr_segs, blkbits, get_blocks);
out: out:
return retval; return retval;
} }
......
...@@ -624,7 +624,7 @@ ext2_direct_IO(int rw, struct file *file, const struct iovec *iov, ...@@ -624,7 +624,7 @@ ext2_direct_IO(int rw, struct file *file, const struct iovec *iov,
{ {
struct inode *inode = file->f_dentry->d_inode->i_mapping->host; struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
return generic_direct_IO(rw, inode, iov, return generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs, ext2_get_blocks); offset, nr_segs, ext2_get_blocks);
} }
......
...@@ -1431,7 +1431,7 @@ static int ext3_direct_IO(int rw, struct file *file, ...@@ -1431,7 +1431,7 @@ static int ext3_direct_IO(int rw, struct file *file,
} }
} }
ret = generic_direct_IO(rw, inode, iov, offset, ret = generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov, offset,
nr_segs, ext3_direct_io_get_blocks); nr_segs, ext3_direct_io_get_blocks);
out_stop: out_stop:
......
...@@ -315,7 +315,7 @@ static int jfs_direct_IO(int rw, struct file *file, const struct iovec *iov, ...@@ -315,7 +315,7 @@ static int jfs_direct_IO(int rw, struct file *file, const struct iovec *iov,
{ {
struct inode *inode = file->f_dentry->d_inode->i_mapping->host; struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
return generic_direct_IO(rw, inode, iov, return generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs, jfs_get_blocks); offset, nr_segs, jfs_get_blocks);
} }
......
...@@ -607,8 +607,8 @@ linvfs_direct_IO( ...@@ -607,8 +607,8 @@ linvfs_direct_IO(
{ {
struct inode *inode = file->f_dentry->d_inode->i_mapping->host; struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
return generic_direct_IO(rw, inode, iov, offset, nr_segs, return generic_direct_IO(rw, inode, NULL,
linvfs_get_blocks_direct); iov, offset, nr_segs, linvfs_get_blocks_direct);
} }
......
...@@ -1252,8 +1252,8 @@ extern void do_generic_mapping_read(struct address_space *, struct file_ra_state ...@@ -1252,8 +1252,8 @@ extern void do_generic_mapping_read(struct address_space *, struct file_ra_state
loff_t *, read_descriptor_t *, read_actor_t); loff_t *, read_descriptor_t *, read_actor_t);
extern ssize_t generic_file_direct_IO(int rw, struct file *file, extern ssize_t generic_file_direct_IO(int rw, struct file *file,
const struct iovec *iov, loff_t offset, unsigned long nr_segs); const struct iovec *iov, loff_t offset, unsigned long nr_segs);
extern int generic_direct_IO(int rw, struct inode *inode, const struct iovec extern int generic_direct_IO(int rw, struct inode *inode, struct block_device *bdev,
*iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks); const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks);
extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos); unsigned long nr_segs, loff_t *ppos);
ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, ssize_t generic_file_writev(struct file *filp, const struct iovec *iov,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment