Commit 9d645db8 authored by Linus Torvalds

Merge tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "This reverts the direct io port to iomap infrastructure of btrfs
  merged in the first pull request. We found problems in invalidate page
  that don't seem to be fixable as regressions or without changing iomap
  code that would not affect other filesystems.

  There are four reverts in total, but three of them are followup
  cleanups needed to revert a43a67a2 cleanly. The result is the
  buffer head based implementation of direct io.

  Reverts are not great, but under current circumstances I don't see
  better options"

* tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  Revert "btrfs: switch to iomap_dio_rw() for dio"
  Revert "fs: remove dio_end_io()"
  Revert "btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK"
  Revert "btrfs: split btrfs_direct_IO to read and write part"
parents 96144c58 55e20bd1
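
For context: the buffer head based path that these reverts restore hangs off the .direct_IO address_space operation and delegates to __blockdev_direct_IO(), whose declaration appears at the end of this diff. A minimal sketch of that wiring, with hypothetical examplefs_* names standing in for a real filesystem's callbacks (illustrative only, not code from this commit):

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Map iblock to a disk block: fill bh_result (b_blocknr, b_size, b_bdev)
 * and set_buffer_mapped() on success. examplefs_map_block() is a
 * placeholder for the filesystem's real mapping routine. */
static int examplefs_get_block(struct inode *inode, sector_t iblock,
                               struct buffer_head *bh_result, int create)
{
    return examplefs_map_block(inode, iblock, bh_result, create);
}

static ssize_t examplefs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
    struct inode *inode = file_inode(iocb->ki_filp);

    /* DIO_LOCKING lets the dio core take i_mutex around reads;
     * DIO_SKIP_HOLES refuses to instantiate blocks over holes. */
    return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
                                examplefs_get_block, NULL, NULL,
                                DIO_LOCKING | DIO_SKIP_HOLES);
}

The iomap port being reverted replaced exactly this shape with iomap_dio_rw() plus btrfs_dio_iomap_ops, which is what the hunks below undo.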
@@ -14,7 +14,6 @@ config BTRFS_FS
     select LZO_DECOMPRESS
     select ZSTD_COMPRESS
     select ZSTD_DECOMPRESS
-    select FS_IOMAP
     select RAID6_PQ
     select XOR_BLOCKS
     select SRCU
...
@@ -28,6 +28,7 @@ enum {
     BTRFS_INODE_NEEDS_FULL_SYNC,
     BTRFS_INODE_COPY_EVERYTHING,
     BTRFS_INODE_IN_DELALLOC_LIST,
+    BTRFS_INODE_READDIO_NEED_LOCK,
     BTRFS_INODE_HAS_PROPS,
     BTRFS_INODE_SNAPSHOT_FLUSH,
 };
@@ -312,6 +313,23 @@ struct btrfs_dio_private {
     u8 csums[];
 };
 
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
+{
+    set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
+    smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
+{
+    smp_mb__before_atomic();
+    clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
+}
+
 /* Array of bytes with variable length, hexadecimal format 0x1234 */
 #define CSUM_FMT                "0x%*phN"
 #define CSUM_FMT_VALUE(size, bytes)     size, bytes
...
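
The two helpers restored above pair with the reader-side test_bit() in btrfs_direct_IO() further down in this diff: truncate sets the flag and then waits for in-flight DIO via inode_dio_wait(), while a nolock reader first bumps the inode's DIO count via inode_dio_begin() and then tests the flag, so at least one side must observe the other. A userspace model of that ordering, with C11 atomics and fences standing in for set_bit()/smp_mb() and the i_dio_count accounting (a sketch of the idea, not kernel code):

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

static atomic_int need_lock;   /* models BTRFS_INODE_READDIO_NEED_LOCK */
static atomic_int readers;     /* models i_dio_count */

static int truncate_side(void *arg)
{
    (void)arg;
    atomic_store_explicit(&need_lock, 1, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);   /* smp_mb() */
    while (atomic_load_explicit(&readers, memory_order_relaxed))
        ;                                        /* inode_dio_wait() */
    puts("truncate: no unlocked readers in flight");
    return 0;
}

static int reader_side(void *arg)
{
    (void)arg;
    atomic_fetch_add_explicit(&readers, 1, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);   /* pairs with smp_mb() */
    if (atomic_load_explicit(&need_lock, memory_order_relaxed))
        puts("reader: flag seen, taking the locked path");
    atomic_fetch_sub_explicit(&readers, 1, memory_order_relaxed);
    return 0;
}

int main(void)
{
    thrd_t a, b;

    thrd_create(&a, reader_side, NULL);
    thrd_create(&b, truncate_side, NULL);
    thrd_join(a, NULL);
    thrd_join(b, NULL);
    return 0;
}

Without the full barriers, both stores could be delayed past both loads, letting an unlocked reader slip past an in-progress truncate; that is the scenario the restored comment calls the "endless truncate".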
@@ -28,7 +28,6 @@
 #include <linux/dynamic_debug.h>
 #include <linux/refcount.h>
 #include <linux/crc32c.h>
-#include <linux/iomap.h>
 #include "extent-io-tree.h"
 #include "extent_io.h"
 #include "extent_map.h"
@@ -2934,9 +2933,6 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
 void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
                       u64 end, int uptodate);
 extern const struct dentry_operations btrfs_dentry_operations;
-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
-extern const struct iomap_ops btrfs_dio_iomap_ops;
-extern const struct iomap_dio_ops btrfs_dops;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
...
@@ -1809,61 +1809,21 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
     return num_written ? num_written : ret;
 }
 
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
-                   const struct iov_iter *iter, loff_t offset)
-{
-    const unsigned int blocksize_mask = fs_info->sectorsize - 1;
-
-    if (offset & blocksize_mask)
-        return -EINVAL;
-
-    if (iov_iter_alignment(iter) & blocksize_mask)
-        return -EINVAL;
-
-    return 0;
-}
-
-static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
     struct file *file = iocb->ki_filp;
     struct inode *inode = file_inode(file);
-    struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-    loff_t pos = iocb->ki_pos;
-    ssize_t written = 0;
+    loff_t pos;
+    ssize_t written;
     ssize_t written_buffered;
     loff_t endbyte;
     int err;
-    size_t count = 0;
-    bool relock = false;
 
-    if (check_direct_IO(fs_info, from, pos))
-        goto buffered;
-
-    count = iov_iter_count(from);
-
-    /*
-     * If the write DIO is beyond the EOF, we need update the isize, but it
-     * is protected by i_mutex. So we can not unlock the i_mutex at this
-     * case.
-     */
-    if (pos + count <= inode->i_size) {
-        inode_unlock(inode);
-        relock = true;
-    } else if (iocb->ki_flags & IOCB_NOWAIT) {
-        return -EAGAIN;
-    }
-
-    down_read(&BTRFS_I(inode)->dio_sem);
-    written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dops,
-            is_sync_kiocb(iocb));
-    up_read(&BTRFS_I(inode)->dio_sem);
-
-    if (relock)
-        inode_lock(inode);
+    written = generic_file_direct_write(iocb, from);
 
     if (written < 0 || !iov_iter_count(from))
         return written;
 
-buffered:
     pos = iocb->ki_pos;
     written_buffered = btrfs_buffered_write(iocb, from);
     if (written_buffered < 0) {
@@ -2002,7 +1962,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
     atomic_inc(&BTRFS_I(inode)->sync_writers);
 
     if (iocb->ki_flags & IOCB_DIRECT) {
-        num_written = btrfs_direct_write(iocb, from);
+        num_written = __btrfs_direct_write(iocb, from);
     } else {
         num_written = btrfs_buffered_write(iocb, from);
         if (num_written > 0)
@@ -3516,54 +3476,9 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
     return generic_file_open(inode, filp);
 }
 
-static int check_direct_read(struct btrfs_fs_info *fs_info,
-                 const struct iov_iter *iter, loff_t offset)
-{
-    int ret;
-    int i, seg;
-
-    ret = check_direct_IO(fs_info, iter, offset);
-    if (ret < 0)
-        return ret;
-
-    for (seg = 0; seg < iter->nr_segs; seg++)
-        for (i = seg + 1; i < iter->nr_segs; i++)
-            if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
-                return -EINVAL;
-    return 0;
-}
-
-static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
-{
-    struct inode *inode = file_inode(iocb->ki_filp);
-    ssize_t ret;
-
-    if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
-        return 0;
-
-    inode_lock_shared(inode);
-    ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dops,
-            is_sync_kiocb(iocb));
-    inode_unlock_shared(inode);
-    return ret;
-}
-
-static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
-    ssize_t ret = 0;
-
-    if (iocb->ki_flags & IOCB_DIRECT) {
-        ret = btrfs_direct_read(iocb, to);
-        if (ret < 0)
-            return ret;
-    }
-
-    return generic_file_buffered_read(iocb, to, ret);
-}
-
 const struct file_operations btrfs_file_operations = {
     .llseek      = btrfs_file_llseek,
-    .read_iter   = btrfs_file_read_iter,
+    .read_iter   = generic_file_read_iter,
     .splice_read = generic_file_splice_read,
     .write_iter  = btrfs_file_write_iter,
     .mmap        = btrfs_file_mmap,
...
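
Both implementations in this diff enforce the same user-visible contract in their check_direct_IO() helpers: the file offset and the memory buffer must be aligned to the sector size, or direct I/O fails with EINVAL. A small self-contained userspace demonstration of that contract (assumes a file named testfile and 4096-byte alignment; illustrative only):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    int fd = open("testfile", O_RDONLY | O_DIRECT);
    void *buf;
    ssize_t n;

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* 4096 covers the common sectorsize; an unaligned malloc()
     * buffer would typically make this read fail with EINVAL. */
    if (posix_memalign(&buf, 4096, 4096)) {
        close(fd);
        return 1;
    }
    n = pread(fd, buf, 4096, 0);    /* the offset must be aligned too */
    if (n < 0)
        perror("pread");
    else
        printf("read %zd bytes via O_DIRECT\n", n);
    free(buf);
    close(fd);
    return 0;
}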
@@ -5,6 +5,7 @@
 
 #include <linux/kernel.h>
 #include <linux/bio.h>
+#include <linux/buffer_head.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -57,9 +58,9 @@ struct btrfs_iget_args {
 
 struct btrfs_dio_data {
     u64 reserve;
-    loff_t length;
-    ssize_t submitted;
-    struct extent_changeset *data_reserved;
+    u64 unsubmitted_oe_range_start;
+    u64 unsubmitted_oe_range_end;
+    int overwrite;
 };
 
 static const struct inode_operations btrfs_dir_inode_operations;
@@ -4810,7 +4811,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 
     truncate_setsize(inode, newsize);
 
+    /* Disable nonlocked read DIO to avoid the endless truncate */
+    btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
     inode_dio_wait(inode);
+    btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
 
     ret = btrfs_truncate(inode, newsize == oldsize);
     if (ret && inode->i_nlink) {
@@ -7041,7 +7045,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 }
 
 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
-                  struct extent_state **cached_state, bool writing)
+                  struct extent_state **cached_state, int writing)
 {
     struct btrfs_ordered_extent *ordered;
     int ret = 0;
@@ -7179,7 +7183,30 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
 }
 
+static int btrfs_get_blocks_direct_read(struct extent_map *em,
+                    struct buffer_head *bh_result,
+                    struct inode *inode,
+                    u64 start, u64 len)
+{
+    struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+    if (em->block_start == EXTENT_MAP_HOLE ||
+            test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+        return -ENOENT;
+
+    len = min(len, em->len - (start - em->start));
+
+    bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+        inode->i_blkbits;
+    bh_result->b_size = len;
+    bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
+    set_buffer_mapped(bh_result);
+
+    return 0;
+}
+
 static int btrfs_get_blocks_direct_write(struct extent_map **map,
+                     struct buffer_head *bh_result,
                      struct inode *inode,
                      struct btrfs_dio_data *dio_data,
                      u64 start, u64 len)
@@ -7241,6 +7268,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
     }
 
     /* this will cow the extent */
+    len = bh_result->b_size;
     free_extent_map(em);
     *map = em = btrfs_new_extent_direct(inode, start, len);
     if (IS_ERR(em)) {
@@ -7251,73 +7279,64 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
     len = min(len, em->len - (start - em->start));
 skip_cow:
+    bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+        inode->i_blkbits;
+    bh_result->b_size = len;
+    bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
+    set_buffer_mapped(bh_result);
+
+    if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+        set_buffer_new(bh_result);
+
     /*
      * Need to update the i_size under the extent lock so buffered
      * readers will get the updated i_size when we unlock.
      */
-    if (start + len > i_size_read(inode))
+    if (!dio_data->overwrite && start + len > i_size_read(inode))
         i_size_write(inode, start + len);
 
+    WARN_ON(dio_data->reserve < len);
     dio_data->reserve -= len;
+    dio_data->unsubmitted_oe_range_end = start + len;
+    current->journal_info = dio_data;
 out:
     return ret;
 }
 
-static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
-        loff_t length, unsigned flags, struct iomap *iomap,
-        struct iomap *srcmap)
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+                   struct buffer_head *bh_result, int create)
 {
     struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
     struct extent_map *em;
     struct extent_state *cached_state = NULL;
     struct btrfs_dio_data *dio_data = NULL;
+    u64 start = iblock << inode->i_blkbits;
     u64 lockstart, lockend;
-    const bool write = !!(flags & IOMAP_WRITE);
+    u64 len = bh_result->b_size;
     int ret = 0;
-    u64 len = length;
-    bool unlock_extents = false;
 
-    if (!write)
+    if (!create)
         len = min_t(u64, len, fs_info->sectorsize);
 
     lockstart = start;
     lockend = start + len - 1;
 
-    /*
-     * The generic stuff only does filemap_write_and_wait_range, which
-     * isn't enough if we've written compressed pages to this area, so we
-     * need to flush the dirty pages again to make absolutely sure that any
-     * outstanding dirty pages are on disk.
-     */
-    if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-             &BTRFS_I(inode)->runtime_flags))
-        ret = filemap_fdatawrite_range(inode->i_mapping, start,
-                           start + length - 1);
-
-    dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
-    if (!dio_data)
-        return -ENOMEM;
-
-    dio_data->length = length;
-    if (write) {
-        dio_data->reserve = round_up(length, fs_info->sectorsize);
-        ret = btrfs_delalloc_reserve_space(inode,
-                &dio_data->data_reserved,
-                start, dio_data->reserve);
-        if (ret) {
-            extent_changeset_free(dio_data->data_reserved);
-            kfree(dio_data);
-            return ret;
-        }
+    if (current->journal_info) {
+        /*
+         * Need to pull our outstanding extents and set journal_info to NULL so
+         * that anything that needs to check if there's a transaction doesn't get
+         * confused.
+         */
+        dio_data = current->journal_info;
+        current->journal_info = NULL;
     }
-    iomap->private = dio_data;
 
     /*
      * If this errors out it's because we couldn't invalidate pagecache for
      * this range and we need to fallback to buffered.
      */
-    if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
+    if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+                   create)) {
         ret = -ENOTBLK;
         goto err;
     }
@@ -7349,47 +7368,35 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
         goto unlock_err;
     }
 
-    len = min(len, em->len - (start - em->start));
-    if (write) {
-        ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
-                            start, len);
+    if (create) {
+        ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
+                            dio_data, start, len);
         if (ret < 0)
             goto unlock_err;
-        unlock_extents = true;
-        /* Recalc len in case the new em is smaller than requested */
-        len = min(len, em->len - (start - em->start));
+
+        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+                     lockend, &cached_state);
     } else {
+        ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+                           start, len);
+        /* Can be negative only if we read from a hole */
+        if (ret < 0) {
+            ret = 0;
+            free_extent_map(em);
+            goto unlock_err;
+        }
+
         /*
          * We need to unlock only the end area that we aren't using.
          * The rest is going to be unlocked by the endio routine.
          */
-        lockstart = start + len;
-        if (lockstart < lockend)
-            unlock_extents = true;
-    }
-
-    if (unlock_extents)
-        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                     lockstart, lockend, &cached_state);
-    else
-        free_extent_state(cached_state);
-
-    /*
-     * Translate extent map information to iomap.
-     * We trim the extents (and move the addr) even though iomap code does
-     * that, since we have locked only the parts we are performing I/O in.
-     */
-    if ((em->block_start == EXTENT_MAP_HOLE) ||
-        (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
-        iomap->addr = IOMAP_NULL_ADDR;
-        iomap->type = IOMAP_HOLE;
-    } else {
-        iomap->addr = em->block_start + (start - em->start);
-        iomap->type = IOMAP_MAPPED;
+        lockstart = start + bh_result->b_size;
+        if (lockstart < lockend) {
+            unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                         lockstart, lockend, &cached_state);
+        } else {
+            free_extent_state(cached_state);
+        }
     }
-    iomap->offset = start;
-    iomap->bdev = fs_info->fs_devices->latest_bdev;
-    iomap->length = len;
 
     free_extent_map(em);
@@ -7399,53 +7406,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
     unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                  &cached_state);
 err:
-    if (dio_data) {
-        btrfs_delalloc_release_space(inode, dio_data->data_reserved,
-                start, dio_data->reserve, true);
-        btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
-        extent_changeset_free(dio_data->data_reserved);
-        kfree(dio_data);
-    }
-    return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
-        ssize_t written, unsigned flags, struct iomap *iomap)
-{
-    int ret = 0;
-    struct btrfs_dio_data *dio_data = iomap->private;
-    size_t submitted = dio_data->submitted;
-    const bool write = !!(flags & IOMAP_WRITE);
-
-    if (!write && (iomap->type == IOMAP_HOLE)) {
-        /* If reading from a hole, unlock and return */
-        unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
-        goto out;
-    }
-
-    if (submitted < length) {
-        pos += submitted;
-        length -= submitted;
-        if (write)
-            __endio_write_update_ordered(inode, pos, length, false);
-        else
-            unlock_extent(&BTRFS_I(inode)->io_tree, pos,
-                    pos + length - 1);
-        ret = -ENOTBLK;
-    }
-
-    if (write) {
-        if (dio_data->reserve)
-            btrfs_delalloc_release_space(inode,
-                    dio_data->data_reserved, pos,
-                    dio_data->reserve, true);
-        btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
-        extent_changeset_free(dio_data->data_reserved);
-    }
-out:
-    kfree(dio_data);
-    iomap->private = NULL;
+    if (dio_data)
+        current->journal_info = dio_data;
     return ret;
 }
@@ -7468,7 +7430,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
                   dip->logical_offset + dip->bytes - 1);
     }
 
-    bio_endio(dip->dio_bio);
+    dio_end_io(dip->dio_bio);
     kfree(dip);
 }
@@ -7704,11 +7666,24 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
     dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
     dip->dio_bio = dio_bio;
     refcount_set(&dip->refs, 1);
+
+    if (write) {
+        struct btrfs_dio_data *dio_data = current->journal_info;
+
+        /*
+         * Setting range start and end to the same value means that
+         * no cleanup will happen in btrfs_direct_IO
+         */
+        dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+            dip->bytes;
+        dio_data->unsubmitted_oe_range_start =
+            dio_data->unsubmitted_oe_range_end;
+    }
     return dip;
 }
 
-static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
-        struct bio *dio_bio, loff_t file_offset)
+static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
+                loff_t file_offset)
 {
     const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
     const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
@@ -7725,7 +7700,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
     int ret;
     blk_status_t status;
     struct btrfs_io_geometry geom;
-    struct btrfs_dio_data *dio_data = iomap->private;
 
     dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
     if (!dip) {
@@ -7734,8 +7708,8 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
                 file_offset + dio_bio->bi_iter.bi_size - 1);
         }
         dio_bio->bi_status = BLK_STS_RESOURCE;
-        bio_endio(dio_bio);
-        return BLK_QC_T_NONE;
+        dio_end_io(dio_bio);
+        return;
     }
 
     if (!write && csum) {
@@ -7806,27 +7780,156 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
             goto out_err;
         }
 
-        dio_data->submitted += clone_len;
         clone_offset += clone_len;
         start_sector += clone_len >> 9;
         file_offset += clone_len;
     } while (submit_len > 0);
-    return BLK_QC_T_NONE;
+    return;
 
 out_err:
     dip->dio_bio->bi_status = status;
     btrfs_dio_private_put(dip);
-    return BLK_QC_T_NONE;
 }
 
-const struct iomap_ops btrfs_dio_iomap_ops = {
-    .iomap_begin = btrfs_dio_iomap_begin,
-    .iomap_end = btrfs_dio_iomap_end,
-};
-
-const struct iomap_dio_ops btrfs_dops = {
-    .submit_io = btrfs_submit_direct,
-};
-
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+                   const struct iov_iter *iter, loff_t offset)
+{
+    int seg;
+    int i;
+    unsigned int blocksize_mask = fs_info->sectorsize - 1;
+    ssize_t retval = -EINVAL;
+
+    if (offset & blocksize_mask)
+        goto out;
+
+    if (iov_iter_alignment(iter) & blocksize_mask)
+        goto out;
+
+    /* If this is a write we don't need to check anymore */
+    if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
+        return 0;
+    /*
+     * Check to make sure we don't have duplicate iov_base's in this
+     * iovec, if so return EINVAL, otherwise we'll get csum errors
+     * when reading back.
+     */
+    for (seg = 0; seg < iter->nr_segs; seg++) {
+        for (i = seg + 1; i < iter->nr_segs; i++) {
+            if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+                goto out;
+        }
+    }
+    retval = 0;
+out:
+    return retval;
+}
+
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+    struct file *file = iocb->ki_filp;
+    struct inode *inode = file->f_mapping->host;
+    struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+    struct btrfs_dio_data dio_data = { 0 };
+    struct extent_changeset *data_reserved = NULL;
+    loff_t offset = iocb->ki_pos;
+    size_t count = 0;
+    int flags = 0;
+    bool wakeup = true;
+    bool relock = false;
+    ssize_t ret;
+
+    if (check_direct_IO(fs_info, iter, offset))
+        return 0;
+
+    inode_dio_begin(inode);
+
+    /*
+     * The generic stuff only does filemap_write_and_wait_range, which
+     * isn't enough if we've written compressed pages to this area, so
+     * we need to flush the dirty pages again to make absolutely sure
+     * that any outstanding dirty pages are on disk.
+     */
+    count = iov_iter_count(iter);
+    if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+             &BTRFS_I(inode)->runtime_flags))
+        filemap_fdatawrite_range(inode->i_mapping, offset,
+                     offset + count - 1);
+
+    if (iov_iter_rw(iter) == WRITE) {
+        /*
+         * If the write DIO is beyond the EOF, we need update
+         * the isize, but it is protected by i_mutex. So we can
+         * not unlock the i_mutex at this case.
+         */
+        if (offset + count <= inode->i_size) {
+            dio_data.overwrite = 1;
+            inode_unlock(inode);
+            relock = true;
+        } else if (iocb->ki_flags & IOCB_NOWAIT) {
+            ret = -EAGAIN;
+            goto out;
+        }
+        ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+                           offset, count);
+        if (ret)
+            goto out;
+
+        /*
+         * We need to know how many extents we reserved so that we can
+         * do the accounting properly if we go over the number we
+         * originally calculated. Abuse current->journal_info for this.
+         */
+        dio_data.reserve = round_up(count,
+                        fs_info->sectorsize);
+        dio_data.unsubmitted_oe_range_start = (u64)offset;
+        dio_data.unsubmitted_oe_range_end = (u64)offset;
+        current->journal_info = &dio_data;
+        down_read(&BTRFS_I(inode)->dio_sem);
+    } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                &BTRFS_I(inode)->runtime_flags)) {
+        inode_dio_end(inode);
+        flags = DIO_LOCKING | DIO_SKIP_HOLES;
+        wakeup = false;
+    }
+
+    ret = __blockdev_direct_IO(iocb, inode,
+                   fs_info->fs_devices->latest_bdev,
+                   iter, btrfs_get_blocks_direct, NULL,
+                   btrfs_submit_direct, flags);
+    if (iov_iter_rw(iter) == WRITE) {
+        up_read(&BTRFS_I(inode)->dio_sem);
+        current->journal_info = NULL;
+        if (ret < 0 && ret != -EIOCBQUEUED) {
+            if (dio_data.reserve)
+                btrfs_delalloc_release_space(inode, data_reserved,
+                    offset, dio_data.reserve, true);
+            /*
+             * On error we might have left some ordered extents
+             * without submitting corresponding bios for them, so
+             * cleanup them up to avoid other tasks getting them
+             * and waiting for them to complete forever.
+             */
+            if (dio_data.unsubmitted_oe_range_start <
+                dio_data.unsubmitted_oe_range_end)
+                __endio_write_update_ordered(inode,
                    dio_data.unsubmitted_oe_range_start,
+                    dio_data.unsubmitted_oe_range_end -
+                    dio_data.unsubmitted_oe_range_start,
+                    false);
+        } else if (ret >= 0 && (size_t)ret < count)
+            btrfs_delalloc_release_space(inode, data_reserved,
+                    offset, count - (size_t)ret, true);
+        btrfs_delalloc_release_extents(BTRFS_I(inode), count);
+    }
+out:
+    if (wakeup)
+        inode_dio_end(inode);
+    if (relock)
+        inode_lock(inode);
+
+    extent_changeset_free(data_reserved);
+    return ret;
+}
+
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
         __u64 start, __u64 len)
@@ -10122,7 +10225,7 @@ static const struct address_space_operations btrfs_aops = {
     .writepage      = btrfs_writepage,
     .writepages     = btrfs_writepages,
     .readahead      = btrfs_readahead,
-    .direct_IO      = noop_direct_IO,
+    .direct_IO      = btrfs_direct_IO,
     .invalidatepage = btrfs_invalidatepage,
    .releasepage    = btrfs_releasepage,
 #ifdef CONFIG_MIGRATION
...
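
A recurring trick in the restored code above: struct btrfs_dio_data is smuggled to btrfs_get_blocks_direct() through current->journal_info, because the get_block_t callback has no context argument ("Abuse current->journal_info for this", as the restored comment puts it). The same pattern in miniature, as a userspace analogy using C11 thread-local storage (hypothetical names, illustrative only):

#include <stdio.h>

struct dio_state { long reserved; };

/* stand-in for current->journal_info */
static _Thread_local struct dio_state *cur_state;

/* stand-in for a callback-driven API that offers no user-context
 * pointer, like __blockdev_direct_IO() and its get_block_t */
static void run_io(void (*get_block)(long))
{
    get_block(42);
}

static void my_get_block(long block)
{
    struct dio_state *s = cur_state;    /* pull the smuggled state */

    printf("block %ld, reserved %ld\n", block, s->reserved);
}

int main(void)
{
    struct dio_state s = { .reserved = 4096 };

    cur_state = &s;     /* like current->journal_info = &dio_data */
    run_io(my_get_block);
    cur_state = NULL;   /* always restore, as btrfs_direct_IO() does */
    return 0;
}

The cost of the trick is visible in the diff too: btrfs_get_blocks_direct() must NULL the pointer while it runs so that transaction checks elsewhere are not confused, and the error paths must restore it.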
@@ -386,6 +386,25 @@ static void dio_bio_end_io(struct bio *bio)
     spin_unlock_irqrestore(&dio->bio_lock, flags);
 }
 
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio thats being completed
+ *
+ * This is meant to be called by any filesystem that uses their own dio_submit_t
+ * so that the DIO specific endio actions are dealt with after the filesystem
+ * has done it's completion work.
+ */
+void dio_end_io(struct bio *bio)
+{
+    struct dio *dio = bio->bi_private;
+
+    if (dio->is_async)
+        dio_bio_end_aio(bio);
+    else
+        dio_bio_end_io(bio);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
+
 static inline void
 dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
           struct block_device *bdev,
...
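
dio_end_io(), restored above, exists for filesystems that pass their own submit_io hook to __blockdev_direct_IO() and therefore complete the dio_bio themselves; btrfs_dio_private_put() earlier in this diff is the real caller. A hypothetical sketch of such a completion path (examplefs_* names are placeholders):

/* Once the filesystem's private completion work is done, hand the
 * original dio_bio back to the dio core so it can run its own
 * is_async/sync completion logic. */
static void examplefs_dio_done(struct examplefs_dio *priv)
{
    examplefs_finish_io(priv);    /* placeholder: csums, range unlock... */
    dio_end_io(priv->dio_bio);    /* dio core finishes the request */
    kfree(priv);
}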
@@ -3204,6 +3204,8 @@ enum {
     DIO_SKIP_HOLES  = 0x02,
 };
 
+void dio_end_io(struct bio *bio);
+
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                  struct block_device *bdev, struct iov_iter *iter,
                  get_block_t get_block,
...