Commit 4b46fce2 authored by Josef Bacik's avatar Josef Bacik Committed by Chris Mason

Btrfs: add basic DIO read/write support

This provides basic DIO support for reading and writing.  It does not do the
work to recover from mismatching checksums, that will come later.  A few design
changes have been made from Jim's code (sorry Jim!)

1) Use the generic direct-io code.  Jim originally re-wrote all the generic DIO
code in order to account for all of BTRFS's oddities, but thanks to that work it
seems like the best bet is to just ignore compression and such and just opt to
fallback on buffered IO.

2) Fallback on buffered IO for compressed or inline extents.  Jim's code did
it's own buffering to make dio with compressed extents work.  Now we just
fallback onto normal buffered IO.

3) Use ordered extents for the writes so that all of the

lock_extent()
lookup_ordered()

type checks continue to work.

4) Do the lock_extent() lookup_ordered() loop in readpage so we don't race with
DIO writes.

I've tested this with fsx and everything works great.  This patch depends on my
dio and filemap.c patches to work.  Thanks,
Signed-off-by: default avatarJosef Bacik <josef@redhat.com>
Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent c2c6ca41
......@@ -2317,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 logical_offset, u32 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
......
......@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
}
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst)
static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
{
u32 sum;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
u64 offset;
u64 offset = 0;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
......@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
WARN_ON(bio->bi_vcnt <= 0);
disk_bytenr = (u64)bio->bi_sector << 9;
if (dio)
offset = logical_offset;
while (bio_index < bio->bi_vcnt) {
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
if (!dio)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
if (ret == 0)
goto found;
......@@ -238,6 +242,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
else
set_state_private(io_tree, offset, sum);
disk_bytenr += bvec->bv_len;
offset += bvec->bv_len;
bio_index++;
bvec++;
}
......@@ -245,6 +250,18 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
return 0;
}
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst)
{
return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
}
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 offset, u32 *dst)
{
return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list)
{
......
......@@ -822,6 +822,47 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
return 0;
}
/* Copied from read-write.c */
static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
{
set_current_state(TASK_UNINTERRUPTIBLE);
if (!kiocbIsKicked(iocb))
schedule();
else
kiocbClearKicked(iocb);
__set_current_state(TASK_RUNNING);
}
/*
* Just a copy of what do_sync_write does.
*/
static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf,
size_t count, loff_t pos, loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
unsigned long nr_segs = 1;
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos;
kiocb.ki_left = count;
kiocb.ki_nbytes = count;
while (1) {
ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos,
ppos, count, count);
if (ret != -EIOCBRETRY)
break;
wait_on_retry_sync_kiocb(&kiocb);
}
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
......@@ -838,12 +879,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
unsigned long first_index;
unsigned long last_index;
int will_write;
int buffered = 0;
will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
(file->f_flags & O_DIRECT));
nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
PAGE_CACHE_SIZE / (sizeof(struct page *)));
pinned[0] = NULL;
pinned[1] = NULL;
......@@ -867,13 +907,34 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
goto out;
file_update_time(file);
BTRFS_I(inode)->sequence++;
if (unlikely(file->f_flags & O_DIRECT)) {
num_written = __btrfs_direct_write(file, buf, count, pos,
ppos);
pos += num_written;
count -= num_written;
/* We've written everything we wanted to, exit */
if (num_written < 0 || !count)
goto out;
/*
* We are going to do buffered for the rest of the range, so we
* need to make sure to invalidate the buffered pages when we're
* done.
*/
buffered = 1;
buf += num_written;
}
nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
PAGE_CACHE_SIZE / (sizeof(struct page *)));
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
/* generic_write_checks can change our pos */
start_pos = pos;
BTRFS_I(inode)->sequence++;
first_index = pos >> PAGE_CACHE_SHIFT;
last_index = (pos + count) >> PAGE_CACHE_SHIFT;
......@@ -1007,7 +1068,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
btrfs_end_transaction(trans, root);
}
}
if (file->f_flags & O_DIRECT) {
if (file->f_flags & O_DIRECT && buffered) {
invalidate_mapping_pages(inode->i_mapping,
start_pos >> PAGE_CACHE_SHIFT,
(start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
......
This diff is collapsed.
......@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
return 1;
}
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
u64 len)
{
if (file_offset + len <= entry->file_offset ||
entry->file_offset + entry->len <= file_offset)
return 0;
return 1;
}
/*
* look find the first ordered struct that has this offset, otherwise
* the first one less than this offset
......@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int dio)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
......@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
if (dio)
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
/* one ref for the tree */
atomic_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
......@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
return 0;
}
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
disk_len, type, 0);
}
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
disk_len, type, 1);
}
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
......@@ -484,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
* start IO on any dirty ones so the wait doesn't stall waiting
* for pdflush to find them
*/
filemap_fdatawrite_range(inode->i_mapping, start, end);
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
filemap_fdatawrite_range(inode->i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
......@@ -581,6 +609,47 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
return entry;
}
/* Since the DIO code tries to lock a wide area we need to look for any ordered
* extents that exist in the range, rather than just the start of the range.
*/
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 file_offset,
u64 len)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock(&tree->lock);
node = tree_search(tree, file_offset);
if (!node) {
node = tree_search(tree, file_offset + len);
if (!node)
goto out;
}
while (1) {
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (range_overlaps(entry, file_offset, len))
break;
if (entry->file_offset >= file_offset + len) {
entry = NULL;
break;
}
entry = NULL;
node = rb_next(node);
if (!node)
break;
}
out:
if (entry)
atomic_inc(&entry->refs);
spin_unlock(&tree->lock);
return entry;
}
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
......
......@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
......@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int tyep);
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
......@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 file_offset,
u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment