Commit 42ec8bc1 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] direct-to-BIO for O_DIRECT

Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
the kiovec layer.  It's followed by a patch which converts the raw
driver to use the O_DIRECT engine.

CPU utilisation is about the same as the kiovec-based implementation.
Read and write bandwidth are the same too, for 128k chunks.   But with
one megabyte chunks, this implementation is 20% faster at writing.

I assume this is because the kiobuf-based implementation has to stop
and wait for each 128k chunk, whereas this code streams the entire
request, regardless of its size.

This is with a single (oldish) scsi disk on aic7xxx.  I'd expect the
margin to widen on higher-end hardware which likes to have more
requests in flight.
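
For reference, here is a minimal userspace sketch of the kind of request that
exercises this engine.  It is illustrative only (the device path, alignment
and sizes are assumptions), but it shows the rules an O_DIRECT caller must
follow: the buffer, the file offset and the transfer size all have to be
block-aligned, which is what the blocksize_mask checks in generic_direct_IO()
below enforce.

/* Hypothetical test program: issue one aligned 1MB O_DIRECT read. */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 1024 * 1024;		/* one 1MB chunk */
	void *buf;
	ssize_t n;
	int fd;

	/* 4096-byte alignment satisfies any plausible blocksize */
	if (posix_memalign(&buf, 4096, len))
		return 1;
	fd = open("/dev/sda", O_RDONLY | O_DIRECT);	/* device is an example */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, len);		/* streamed as BIOs, no page cache */
	printf("read %zd bytes\n", n);
	close(fd);
	free(buf);
	return 0;
}

The whole megabyte is mapped and submitted before we wait for completion,
which is where the 20% write gain over the 128k-at-a-time kiobuf code
comes from.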

Question is: what do we want to do with this sucker?  These are the
remaining users of kiovecs:

	drivers/md/lvm-snap.c
	drivers/media/video/video-buf.c
	drivers/mtd/devices/blkmtd.c
	drivers/scsi/sg.c

The video and mtd drivers seem to be fairly easy to de-kiobufize.
I'm aware of one proprietary driver which uses kiobufs.  XFS uses
kiobufs a little bit - just to map the pages.

So with a bit of effort and maintainer-irritation, we can extract
the kiobuf layer from the kernel.
parent 2dbd1502
@@ -8,8 +8,8 @@
  * device are used to bind the other minor numbers to block devices.
  */
+#include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/iobuf.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
 #include <linux/raw.h>
@@ -86,12 +86,6 @@ int raw_open(struct inode *inode, struct file *filp)
 		return 0;
 	}
 
-	if (!filp->f_iobuf) {
-		err = alloc_kiovec(1, &filp->f_iobuf);
-		if (err)
-			return err;
-	}
-
 	down(&raw_devices[minor].mutex);
 	/*
 	 * No, it is a normal raw device.  All we need to do on open is
@@ -256,124 +250,46 @@ int raw_ctl_ioctl(struct inode *inode,
 	return err;
 }
 
-ssize_t	raw_read(struct file *filp, char * buf,
-			size_t size, loff_t *offp)
+ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp)
 {
 	return rw_raw_dev(READ, filp, buf, size, offp);
 }
 
-ssize_t	raw_write(struct file *filp, const char *buf,
-			size_t size, loff_t *offp)
+ssize_t raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
 {
 	return rw_raw_dev(WRITE, filp, (char *) buf, size, offp);
 }
 
-#define SECTOR_BITS 9
-#define SECTOR_SIZE (1U << SECTOR_BITS)
-#define SECTOR_MASK (SECTOR_SIZE - 1)
-
-ssize_t	rw_raw_dev(int rw, struct file *filp, char *buf,
-			size_t size, loff_t *offp)
+ssize_t
+rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp)
 {
-	struct kiobuf * iobuf;
-	int		new_iobuf;
-	int		err = 0;
-	unsigned long	blocks;
-	size_t		transferred;
-	int		iosize;
-	int		minor;
-	kdev_t		dev;
-	unsigned long	limit;
-	int		sector_size, sector_bits, sector_mask;
-	sector_t	blocknr;
 	struct block_device *bdev;
+	struct inode *inode;
+	int minor;
+	ssize_t ret = 0;
 
-	/*
-	 * First, a few checks on device size limits
-	 */
 	minor = minor(filp->f_dentry->d_inode->i_rdev);
-
-	new_iobuf = 0;
-	iobuf = filp->f_iobuf;
-	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
-		/*
-		 * A parallel read/write is using the preallocated iobuf
-		 * so just run slow and allocate a new one.
-		 */
-		err = alloc_kiovec(1, &iobuf);
-		if (err)
-			goto out;
-		new_iobuf = 1;
-	}
-
 	bdev = raw_devices[minor].binding;
-	dev = to_kdev_t(bdev->bd_dev);
-	sector_size = raw_devices[minor].sector_size;
-	sector_bits = raw_devices[minor].sector_bits;
-	sector_mask = sector_size - 1;
-
-	limit = bdev->bd_inode->i_size >> sector_bits;
-	if (!limit)
-		limit = INT_MAX;
-	dprintk ("rw_raw_dev: dev %d:%d (+%d)\n",
-		major(dev), minor(dev), limit);
-
-	err = -EINVAL;
-	if ((*offp & sector_mask) || (size & sector_mask))
-		goto out_free;
-	err = 0;
-	if (size)
-		err = -ENXIO;
-	if ((*offp >> sector_bits) >= limit)
-		goto out_free;
-
-	transferred = 0;
-	blocknr = *offp >> sector_bits;
-	while (size > 0) {
-		blocks = size >> sector_bits;
-		if (blocks > limit - blocknr)
-			blocks = limit - blocknr;
-		if (!blocks)
-			break;
-
-		iosize = blocks << sector_bits;
-
-		err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
-		if (err)
-			break;
-
-		err = brw_kiovec(rw, 1, &iobuf, raw_devices[minor].binding, &blocknr, sector_size);
-
-		if (rw == READ && err > 0)
-			mark_dirty_kiobuf(iobuf, err);
-
-		if (err >= 0) {
-			transferred += err;
-			size -= err;
-			buf += err;
-		}
-
-		blocknr += blocks;
-
-		unmap_kiobuf(iobuf);
-
-		if (err != iosize)
-			break;
+	inode = bdev->bd_inode;
+
+	if (size == 0)
+		goto out;
+	if (size < 0) {
+		ret = -EINVAL;
+		goto out;
 	}
-
-	if (transferred) {
-		*offp += transferred;
-		err = transferred;
+	if (*offp >= inode->i_size) {
+		ret = -ENXIO;
+		goto out;
 	}
-
-out_free:
-	if (!new_iobuf)
-		clear_bit(0, &filp->f_iobuf_lock);
-	else
-		free_kiovec(1, &iobuf);
+	if (size + *offp > inode->i_size)
+		size = inode->i_size - *offp;
+
+	ret = generic_file_direct_IO(rw, inode, buf, *offp, size);
+	if (ret > 0)
+		*offp += ret;
+	if (inode->i_mapping->nrpages)
+		invalidate_inode_pages2(inode->i_mapping);
 out:
-	return err;
+	return ret;
 }
@@ -15,7 +15,7 @@ obj-y :=	open.o read_write.o devices.o file_table.o buffer.o \
 		namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
 		dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
 		filesystems.o namespace.o seq_file.o xattr.o libfs.o \
-		fs-writeback.o mpage.o
+		fs-writeback.o mpage.o direct-io.o
 
 ifneq ($(CONFIG_NFSD),n)
 ifneq ($(CONFIG_NFSD),)
...
@@ -106,9 +106,12 @@ static int blkdev_get_block(struct inode * inode, sector_t iblock, struct buffer
 	return 0;
 }
 
-static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+static int
+blkdev_direct_IO(int rw, struct inode *inode, char *buf,
+			loff_t offset, size_t count)
 {
-	return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block);
+	return generic_direct_IO(rw, inode, buf, offset,
+				count, blkdev_get_block);
 }
 
 static int blkdev_writepage(struct page * page)
...
@@ -2311,6 +2311,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 	return tmp.b_blocknr;
 }
 
+#if 0
 int generic_direct_IO(int rw, struct inode *inode,
 		struct kiobuf *iobuf, unsigned long blocknr,
 		int blocksize, get_block_t *get_block)
@@ -2355,6 +2356,7 @@ int generic_direct_IO(int rw, struct inode *inode,
 out:
 	return retval;
 }
+#endif
 
 /*
  * Start I/O on a physical range of kernel memory, defined by a vector
...
/*
* mm/direct-io.c
*
* Copyright (C) 2002, Linus Torvalds.
*
* O_DIRECT
*
* 04Jul2002 akpm@zip.com.au
* Initial version
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <linux/buffer_head.h>
#include <linux/rwsem.h>
#include <asm/atomic.h>
/*
* The largest-sized BIO which this code will assemble, in bytes. Set this
* to PAGE_SIZE if your drivers are broken.
*/
#define DIO_BIO_MAX_SIZE BIO_MAX_SIZE
/*
* How many user pages to map in one call to get_user_pages(). This determines
* the size of a structure on the stack.
*/
#define DIO_PAGES 64
struct dio {
	/* BIO submission state */
	struct bio *bio;		/* bio under assembly */
	struct bio_vec *bvec;		/* current bvec in that bio */
	struct inode *inode;
	int rw;
	sector_t block_in_file;		/* changes */
	sector_t final_block_in_request;/* doesn't change */
	unsigned first_block_in_page;	/* doesn't change */
	int boundary;			/* prev block is at a boundary */
	int reap_counter;		/* rate limit reaping */
	get_block_t *get_block;
	sector_t last_block_in_bio;

	/* Page fetching state */
	int curr_page;			/* changes */
	int total_pages;		/* doesn't change */
	unsigned long curr_user_address;/* changes */

	/* Page queue */
	struct page *pages[DIO_PAGES];
	unsigned head;
	unsigned tail;

	/* BIO completion state */
	atomic_t bio_count;
	spinlock_t bio_list_lock;
	struct bio *bio_list;		/* singly linked via bi_private */
	wait_queue_head_t wait_q;
};

/*
 * How many pages are in the queue?
 */
static inline unsigned dio_pages_present(struct dio *dio)
{
	return dio->head - dio->tail;
}

/*
 * Go grab and pin some userspace pages.  Typically we'll get 64 at a time.
 */
static int dio_refill_pages(struct dio *dio)
{
	int ret;
	int nr_pages;

	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
	ret = get_user_pages(
		current,			/* Task for fault accounting */
		current->mm,			/* whose pages? */
		dio->curr_user_address,		/* Where from? */
		nr_pages,			/* How many pages? */
		dio->rw == READ,		/* Write to memory? */
		0,				/* force (?) */
		&dio->pages[0],
		NULL);				/* vmas */

	if (ret >= 0) {
		dio->curr_user_address += ret * PAGE_SIZE;
		dio->curr_page += ret;
		dio->head = 0;
		dio->tail = ret;
		ret = 0;
	}
	return ret;
}

/*
 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 * buffered inside the dio so that we can call get_user_pages() against a
 * decent number of pages, less frequently, which provides nicer use of the
 * L1 cache.
 */
static struct page *dio_get_page(struct dio *dio)
{
	if (dio_pages_present(dio) == 0) {
		int ret;

		ret = dio_refill_pages(dio);
		if (ret) {
			printk("%s: dio_refill_pages returns %d\n",
				__FUNCTION__, ret);
			return ERR_PTR(ret);
		}
		BUG_ON(dio_pages_present(dio) == 0);
	}
	return dio->pages[dio->head++];
}

/*
 * The BIO completion handler simply queues the BIO up for the process-context
 * handler.
 *
 * During I/O bi_private points at the dio.  After I/O, bi_private is used to
 * implement a singly-linked list of completed BIOs, at dio->bio_list.
 */
static void dio_bio_end_io(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_list_lock, flags);
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	spin_unlock_irqrestore(&dio->bio_list_lock, flags);
	wake_up(&dio->wait_q);
}

static int
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
		sector_t first_sector, int nr_vecs)
{
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, nr_vecs);
	if (bio == NULL)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_vcnt = nr_vecs;
	bio->bi_idx = 0;
	bio->bi_size = 0;
	bio->bi_sector = first_sector;
	bio->bi_io_vec[0].bv_page = NULL;
	bio->bi_end_io = dio_bio_end_io;

	dio->bio = bio;
	dio->bvec = NULL;		/* debug */
	return 0;
}

static void dio_bio_submit(struct dio *dio)
{
	struct bio *bio = dio->bio;

	bio->bi_vcnt = bio->bi_idx;
	bio->bi_idx = 0;
	bio->bi_private = dio;
	atomic_inc(&dio->bio_count);
	submit_bio(dio->rw, bio);
	dio->bio = NULL;
	dio->bvec = NULL;
}

/*
 * Release any resources in case of a failure
 */
static void dio_cleanup(struct dio *dio)
{
	while (dio_pages_present(dio))
		page_cache_release(dio_get_page(dio));
}

/*
 * Wait for the next BIO to complete.  Remove it and return it.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	DECLARE_WAITQUEUE(wait, current);
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&dio->bio_list_lock, flags);
	while (dio->bio_list == NULL) {
		add_wait_queue(&dio->wait_q, &wait);
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (dio->bio_list == NULL) {
			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
			blk_run_queues();
			schedule();
			spin_lock_irqsave(&dio->bio_list_lock, flags);
		}
		set_current_state(TASK_RUNNING);
		remove_wait_queue(&dio->wait_q, &wait);
	}
	bio = dio->bio_list;
	dio->bio_list = bio->bi_private;
	spin_unlock_irqrestore(&dio->bio_list_lock, flags);
	return bio;
}

/*
 * Process one completed BIO.  No locks are held.
 */
static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec;
	int page_no;
	int ret = 0;

	for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
		struct page *page = bvec[page_no].bv_page;

		if (!uptodate) {
			if (ret == 0)
				ret = -EIO;
		}
		if (dio->rw == READ)
			set_page_dirty(page);
		page_cache_release(page);
	}
	atomic_dec(&dio->bio_count);
	bio_put(bio);
	return ret;
}

/*
 * Wait on and process all in-flight BIOs.
 */
static int dio_await_completion(struct dio *dio)
{
	int ret = 0;

	while (atomic_read(&dio->bio_count)) {
		struct bio *bio = dio_await_one(dio);
		int ret2;

		ret2 = dio_bio_complete(dio, bio);
		if (ret == 0)
			ret = ret2;
	}
	return ret;
}

/*
 * A really large O_DIRECT read or write can generate a lot of BIOs.  So
 * to keep the memory consumption sane we periodically reap any completed BIOs
 * during the BIO generation phase.
 *
 * This also helps to limit the peak amount of pinned userspace memory.
 */
static int dio_bio_reap(struct dio *dio)
{
	int ret = 0;

	if (dio->reap_counter++ >= 64) {
		while (dio->bio_list) {
			unsigned long flags;
			struct bio *bio;
			int ret2;

			spin_lock_irqsave(&dio->bio_list_lock, flags);
			bio = dio->bio_list;
			dio->bio_list = bio->bi_private;
			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
			ret2 = dio_bio_complete(dio, bio);
			if (ret == 0)
				ret = ret2;
		}
		dio->reap_counter = 0;
	}
	return ret;
}

/*
 * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
 */
int do_direct_IO(struct dio *dio)
{
	struct inode * const inode = dio->inode;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	struct page *page;
	unsigned block_in_page;
	int ret;

	/* The I/O can start at any block offset within the first page */
	block_in_page = dio->first_block_in_page;

	while (dio->block_in_file < dio->final_block_in_request) {
		int new_page;	/* Need to insert this page into the BIO? */

		page = dio_get_page(dio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}

		new_page = 1;
		for ( ; block_in_page < blocks_per_page; block_in_page++) {
			struct buffer_head map_bh;
			struct bio *bio;

			map_bh.b_state = 0;
			ret = (*dio->get_block)(inode, dio->block_in_file,
						&map_bh, dio->rw == WRITE);
			if (ret) {
				printk("%s: get_block returns %d\n",
					__FUNCTION__, ret);
				goto fail_release;
			}
			/* blockdevs do not set buffer_new */
			if (buffer_new(&map_bh))
				unmap_underlying_metadata(map_bh.b_bdev,
							map_bh.b_blocknr);
			if (!buffer_mapped(&map_bh)) {
				ret = -EINVAL;		/* A hole */
				goto fail_release;
			}
			if (dio->bio) {
				if (dio->bio->bi_idx == dio->bio->bi_vcnt ||
						dio->boundary ||
						dio->last_block_in_bio !=
							map_bh.b_blocknr - 1) {
					dio_bio_submit(dio);
					dio->boundary = 0;
				}
			}
			if (dio->bio == NULL) {
				ret = dio_bio_reap(dio);
				if (ret)
					goto fail_release;
				ret = dio_bio_alloc(dio, map_bh.b_bdev,
					map_bh.b_blocknr << (blkbits - 9),
					DIO_BIO_MAX_SIZE / PAGE_SIZE);
				if (ret)
					goto fail_release;
				new_page = 1;
				dio->boundary = 0;
			}

			bio = dio->bio;
			if (new_page) {
				dio->bvec = &bio->bi_io_vec[bio->bi_idx];
				page_cache_get(page);
				dio->bvec->bv_page = page;
				dio->bvec->bv_len = 0;
				dio->bvec->bv_offset = block_in_page*blocksize;
				bio->bi_idx++;
			}
			new_page = 0;
			dio->bvec->bv_len += blocksize;
			bio->bi_size += blocksize;
			dio->last_block_in_bio = map_bh.b_blocknr;
			dio->boundary = buffer_boundary(&map_bh);

			dio->block_in_file++;
			if (dio->block_in_file >= dio->final_block_in_request)
				break;
		}
		block_in_page = 0;
		page_cache_release(page);
	}
	ret = 0;
	goto out;
fail_release:
	page_cache_release(page);
out:
	return ret;
}

struct dio *g_dio;

int
generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
			size_t count, get_block_t get_block)
{
	const unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
	const unsigned long user_addr = (unsigned long)buf;
	int ret = 0;
	int ret2;
	struct dio dio;
	size_t bytes;

	/* Check the memory alignment.  Blocks cannot straddle pages */
	if ((user_addr & blocksize_mask) || (count & blocksize_mask)) {
		ret = -EINVAL;
		goto out;
	}

	g_dio = &dio;

	/* BIO submission state */
	dio.bio = NULL;
	dio.bvec = NULL;
	dio.inode = inode;
	dio.rw = rw;
	dio.block_in_file = offset >> inode->i_blkbits;
	dio.final_block_in_request = (offset + count) >> inode->i_blkbits;

	/* Index into the first page of the first block */
	dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1))
					>> inode->i_blkbits;
	dio.boundary = 0;
	dio.reap_counter = 0;
	dio.get_block = get_block;
	dio.last_block_in_bio = -1;

	/* Page fetching state */
	dio.curr_page = 0;
	bytes = count;
	dio.total_pages = 0;
	if (offset & PAGE_SIZE) {
		dio.total_pages++;
		bytes -= PAGE_SIZE - (offset & ~(PAGE_SIZE - 1));
	}
	dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
	dio.curr_user_address = user_addr;

	/* Page queue */
	dio.head = 0;
	dio.tail = 0;

	/* BIO completion state */
	atomic_set(&dio.bio_count, 0);
	spin_lock_init(&dio.bio_list_lock);
	dio.bio_list = NULL;
	init_waitqueue_head(&dio.wait_q);

	down_read(&current->mm->mmap_sem);
	ret = do_direct_IO(&dio);
	up_read(&current->mm->mmap_sem);

	if (dio.bio)
		dio_bio_submit(&dio);
	if (ret)
		dio_cleanup(&dio);
	ret2 = dio_await_completion(&dio);
	if (ret == 0)
		ret = ret2;
	if (ret == 0)
		ret = count - ((dio.final_block_in_request -
				dio.block_in_file) << inode->i_blkbits);
out:
	return ret;
}

ssize_t
generic_file_direct_IO(int rw, struct inode *inode, char *buf,
			loff_t offset, size_t count)
{
	struct address_space *mapping = inode->i_mapping;
	unsigned blocksize_mask;
	ssize_t retval;

	blocksize_mask = (1 << inode->i_blkbits) - 1;
	if ((offset & blocksize_mask) || (count & blocksize_mask)) {
		retval = -EINVAL;
		goto out;
	}

	if (mapping->nrpages) {
		retval = filemap_fdatawrite(mapping);
		if (retval == 0)
			retval = filemap_fdatawait(mapping);
		if (retval)
			goto out;
	}

	retval = mapping->a_ops->direct_IO(rw, inode, buf, offset, count);
out:
	return retval;
}

@@ -607,11 +607,10 @@ static int ext2_bmap(struct address_space *mapping, long block)
 }
 
 static int
-ext2_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
-		unsigned long blocknr, int blocksize)
+ext2_direct_IO(int rw, struct inode *inode, char *buf,
+		loff_t offset, size_t count)
 {
-	return generic_direct_IO(rw, inode, iobuf, blocknr,
-				blocksize, ext2_get_block);
+	return generic_direct_IO(rw, inode, buf, offset, count, ext2_get_block);
 }
 
 static int
...
@@ -185,8 +185,6 @@ int block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
-int generic_direct_IO(int, struct inode *, struct kiobuf *,
-		unsigned long, int, get_block_t *);
 int file_fsync(struct file *, struct dentry *, int);
 
 #define OSYNC_METADATA	(1<<0)
...
@@ -303,8 +303,8 @@ struct address_space_operations {
 	int (*bmap)(struct address_space *, long);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
-#define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
-	int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
+	int (*direct_IO)(int, struct inode *, char *buf,
+				loff_t offset, size_t count);
 };
 
 struct backing_dev_info;
@@ -1128,7 +1128,7 @@ extern int check_disk_change(kdev_t);
 extern int invalidate_inodes(struct super_block *);
 extern int invalidate_device(kdev_t, int);
 extern void invalidate_inode_pages(struct inode *);
-extern void invalidate_inode_pages2(struct address_space *);
+extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
@@ -1233,6 +1233,11 @@ extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned
 extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
 extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
 extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
+ssize_t generic_file_direct_IO(int rw, struct inode *inode, char *buf,
+			loff_t offset, size_t count);
+int generic_direct_IO(int rw, struct inode *inode, char *buf,
+			loff_t offset, size_t count, get_block_t *get_block);
+
 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
 extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
...
@@ -414,7 +414,7 @@ static int invalidate_list_pages2(struct address_space * mapping,
  * free the pages because they're mapped.
  * @mapping: the address_space which pages we want to invalidate
  */
-void invalidate_inode_pages2(struct address_space * mapping)
+void invalidate_inode_pages2(struct address_space *mapping)
 {
 	int unlocked;
@@ -1102,6 +1102,7 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
 	UPDATE_ATIME(inode);
 }
 
+#if 0
 static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
 {
 	ssize_t retval;
@@ -1182,6 +1183,7 @@ static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, si
 out:
 	return retval;
 }
+#endif
 
 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
@@ -1209,15 +1211,36 @@ int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long o
  * This is the "read()" routine for all filesystems
  * that can use the page cache directly.
  */
-ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+ssize_t
+generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
 {
 	ssize_t retval;
 
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 
-	if (filp->f_flags & O_DIRECT)
-		goto o_direct;
+	if (filp->f_flags & O_DIRECT) {
+		loff_t pos = *ppos, size;
+		struct address_space *mapping;
+		struct inode *inode;
+
+		mapping = filp->f_dentry->d_inode->i_mapping;
+		inode = mapping->host;
+		retval = 0;
+		if (!count)
+			goto out; /* skip atime */
+		size = inode->i_size;
+		if (pos < size) {
+			if (pos + count > size)
+				count = size - pos;
+			retval = generic_file_direct_IO(READ, inode,
+						buf, pos, count);
+			if (retval > 0)
+				*ppos = pos + retval;
+		}
+		UPDATE_ATIME(filp->f_dentry->d_inode);
+		goto out;
+	}
 
 	retval = -EFAULT;
 	if (access_ok(VERIFY_WRITE, buf, count)) {
@@ -1230,36 +1253,14 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
 			desc.count = count;
 			desc.buf = buf;
 			desc.error = 0;
-			do_generic_file_read(filp, ppos, &desc, file_read_actor);
+			do_generic_file_read(filp,ppos,&desc,file_read_actor);
 			retval = desc.written;
 			if (!retval)
 				retval = desc.error;
 		}
 	}
 out:
 	return retval;
-
-o_direct:
-	{
-		loff_t pos = *ppos, size;
-		struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
-		struct inode *inode = mapping->host;
-
-		retval = 0;
-		if (!count)
-			goto out; /* skip atime */
-		size = inode->i_size;
-		if (pos < size) {
-			if (pos + count > size)
-				count = size - pos;
-			retval = generic_file_direct_IO(READ, filp, buf, count, pos);
-			if (retval > 0)
-				*ppos = pos + retval;
-		}
-		UPDATE_ATIME(filp->f_dentry->d_inode);
-		goto out;
-	}
 }
 
 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
@@ -2186,8 +2187,8 @@ generic_file_write(struct file *file, const char *buf,
 	}
 
 	if (unlikely(file->f_flags & O_DIRECT)) {
-		written = generic_file_direct_IO(WRITE, file,
-					(char *) buf, count, pos);
+		written = generic_file_direct_IO(WRITE, inode,
+					(char *)buf, pos, count);
 		if (written > 0) {
 			loff_t end = pos + written;
 			if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
@@ -2195,7 +2196,8 @@ generic_file_write(struct file *file, const char *buf,
 				mark_inode_dirty(inode);
 			}
 			*ppos = end;
-			invalidate_inode_pages2(mapping);
+			if (mapping->nrpages)
+				invalidate_inode_pages2(mapping);
 		}
 		/*
 		 * Sync the fs metadata but not the minor inode changes and
...