Commit 11e97398 authored by Linus Torvalds

Merge tag 'xfs-for-linus-3.17-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs

Pull xfs fixes from Dave Chinner:
 "The fixes all address recently discovered data corruption issues.

  The original Direct IO issue was discovered by Chris Mason @ Facebook
  on a production workload which mixed buffered reads with direct read
  and write IO to the same file.  The fix for that exposed other issues
  with page invalidation failing due to dirty buffers beyond EOF (turned
  up by millions of fsx operations).

  Finally, the collapse_range code could also cause problems due to
  racing writeback changing the extent map while it was being shifted
  around.  The commits for that problem are simple mitigation fixes that
  prevent the problem from occurring.  A more robust fix for 3.18 that
  addresses the underlying problem is currently being worked on by
  Brian.

  Summary of fixes:
   - a direct IO read/buffered read data corruption
   - the associated fallout from the DIO data corruption fix
   - collapse range bugs that are potential data corruption issues"
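For readers unfamiliar with the failure mode behind the first fix, the sketch below is a minimal, hypothetical userspace illustration — the file name, sizes, and 4k alignment are assumptions, not the actual Facebook workload or fsx — of how mixing buffered and direct IO on one file can expose stale page cache data if the kernel's invalidation misses the written range:

	#define _GNU_SOURCE		/* for O_DIRECT */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		void *mem;
		char *buf;
		char cached[4096];
		int bfd, dfd;

		/* O_DIRECT buffers generally need sector/page alignment. */
		if (posix_memalign(&mem, 4096, 4096))
			return 1;
		buf = mem;

		bfd = open("testfile", O_RDWR | O_CREAT, 0644);	/* buffered path */
		dfd = open("testfile", O_RDWR | O_DIRECT);	/* direct path */
		if (bfd < 0 || dfd < 0)
			return 1;

		/* Write old data through the page cache and read it back,
		 * so the cache holds a copy of the 'a' bytes. */
		memset(buf, 'a', 4096);
		pwrite(bfd, buf, 4096, 0);
		fsync(bfd);
		pread(bfd, cached, 4096, 0);

		/* A direct write of new data bypasses the page cache, so
		 * the kernel must invalidate the cached pages it covers... */
		memset(buf, 'b', 4096);
		pwrite(dfd, buf, 4096, 0);

		/* ...otherwise this buffered read returns the stale 'a' data. */
		pread(bfd, cached, 4096, 0);
		printf("buffered read sees '%c' (expect 'b')\n", cached[0]);

		free(mem);
		return 0;
	}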

* tag 'xfs-for-linus-3.17-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs:
  xfs: trim eofblocks before collapse range
  xfs: xfs_file_collapse_range is delalloc challenged
  xfs: don't log inode unless extent shift makes extent modifications
  xfs: use ranged writeback and invalidation for direct IO
  xfs: don't zero partial page cache pages during O_DIRECT writes
  xfs: don't zero partial page cache pages during O_DIRECT writes
  xfs: don't dirty buffers beyond EOF
parents 925e0ea4 41b9d726
@@ -5424,7 +5424,7 @@ xfs_bmap_shift_extents(
 	struct xfs_bmap_free	*flist,
 	int			num_exts)
 {
-	struct xfs_btree_cur		*cur;
+	struct xfs_btree_cur		*cur = NULL;
 	struct xfs_bmbt_rec_host	*gotp;
 	struct xfs_bmbt_irec		got;
 	struct xfs_bmbt_irec		left;
@@ -5435,7 +5435,7 @@ xfs_bmap_shift_extents(
 	int				error = 0;
 	int				i;
 	int				whichfork = XFS_DATA_FORK;
-	int				logflags;
+	int				logflags = 0;
 	xfs_filblks_t			blockcount = 0;
 	int				total_extents;
 
@@ -5478,16 +5478,11 @@ xfs_bmap_shift_extents(
 		}
 	}
 
-	/* We are going to change core inode */
-	logflags = XFS_ILOG_CORE;
 	if (ifp->if_flags & XFS_IFBROOT) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
 		cur->bc_private.b.flist = flist;
 		cur->bc_private.b.flags = 0;
-	} else {
-		cur = NULL;
-		logflags |= XFS_ILOG_DEXT;
 	}
 
 	/*
@@ -5545,11 +5540,14 @@ xfs_bmap_shift_extents(
 				blockcount = left.br_blockcount +
 					got.br_blockcount;
 				xfs_iext_remove(ip, *current_ext, 1, 0);
+				logflags |= XFS_ILOG_CORE;
 				if (cur) {
 					error = xfs_btree_delete(cur, &i);
 					if (error)
 						goto del_cursor;
 					XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+				} else {
+					logflags |= XFS_ILOG_DEXT;
 				}
 				XFS_IFORK_NEXT_SET(ip, whichfork,
 					XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -5575,6 +5573,7 @@ xfs_bmap_shift_extents(
 			got.br_startoff = startoff;
 		}
 
+		logflags |= XFS_ILOG_CORE;
 		if (cur) {
 			error = xfs_bmbt_update(cur, got.br_startoff,
 					got.br_startblock,
@@ -5582,6 +5581,8 @@ xfs_bmap_shift_extents(
 					got.br_state);
 			if (error)
 				goto del_cursor;
+		} else {
+			logflags |= XFS_ILOG_DEXT;
 		}
 
 		(*current_ext)++;
@@ -5597,6 +5598,7 @@ xfs_bmap_shift_extents(
 		xfs_btree_del_cursor(cur,
 			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 
-	xfs_trans_log_inode(tp, ip, logflags);
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
 	return error;
 }
@@ -1753,11 +1753,72 @@ xfs_vm_readpages(
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+	struct page		*page)
+{
+	struct address_space	*mapping = page->mapping;
+	struct inode		*inode = mapping->host;
+	loff_t			end_offset;
+	loff_t			offset;
+	int			newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	end_offset = i_size_read(inode);
+	offset = page_offset(page);
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			if (offset < end_offset)
+				set_buffer_dirty(bh);
+			bh = bh->b_this_page;
+			offset += 1 << inode->i_blkbits;
+		} while (bh != head);
+	}
+	newly_dirty = !TestSetPageDirty(page);
+	spin_unlock(&mapping->private_lock);
+
+	if (newly_dirty) {
+		/* sigh - __set_page_dirty() is static, so copy it here, too */
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (page->mapping) {	/* Race with truncate? */
+			WARN_ON_ONCE(!PageUptodate(page));
+			account_page_dirtied(page, mapping);
+			radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	}
+	return newly_dirty;
+}
+
 const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
+	.set_page_dirty		= xfs_vm_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
...
@@ -1470,6 +1470,26 @@ xfs_collapse_file_space(
 	start_fsb = XFS_B_TO_FSB(mp, offset + len);
 	shift_fsb = XFS_B_TO_FSB(mp, len);
 
+	/*
+	 * Writeback the entire file and force remove any post-eof blocks. The
+	 * writeback prevents changes to the extent list via concurrent
+	 * writeback and the eofblocks trim prevents the extent shift algorithm
+	 * from running into a post-eof delalloc extent.
+	 *
+	 * XXX: This is a temporary fix until the extent shift loop below is
+	 * converted to use offsets and lookups within the ILOCK rather than
+	 * carrying around the index into the extent list for the next
+	 * iteration.
+	 */
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		return error;
+	if (xfs_can_free_eofblocks(ip, true)) {
+		error = xfs_free_eofblocks(mp, ip, false);
+		if (error)
+			return error;
+	}
+
 	error = xfs_free_file_space(ip, offset, len);
 	if (error)
 		return error;
...
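For context, the operation this hunk hardens is driven from userspace via fallocate(2) with FALLOC_FL_COLLAPSE_RANGE (introduced in Linux 3.15). A minimal sketch — hypothetical file name and offsets, and assuming both offset and length are multiples of the filesystem block size:

	#define _GNU_SOURCE		/* for fallocate() */
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_RDWR);

		if (fd < 0)
			return 1;

		/*
		 * Delete bytes [64k, 128k) and shift all following data
		 * down by 64k, shrinking the file. Racing writeback
		 * modifying the extent map while those extents are shifted
		 * is the corruption window the flush/trim above closes.
		 */
		if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 65536, 65536) < 0) {
			perror("fallocate(FALLOC_FL_COLLAPSE_RANGE)");
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}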
@@ -291,12 +291,22 @@ xfs_file_read_iter(
 		if (inode->i_mapping->nrpages) {
 			ret = filemap_write_and_wait_range(
 							VFS_I(ip)->i_mapping,
-							pos, -1);
+							pos, pos + size - 1);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
 			}
-			truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+			/*
+			 * Invalidate whole pages. This can return an error if
+			 * we fail to invalidate a page, but this should never
+			 * happen on XFS. Warn if it does fail.
+			 */
+			ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + size - 1) >> PAGE_CACHE_SHIFT);
+			WARN_ON_ONCE(ret);
+			ret = 0;
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
@@ -632,10 +642,19 @@ xfs_file_dio_aio_write(
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						    pos, -1);
+						    pos, pos + count - 1);
 		if (ret)
 			goto out;
-		truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+						pos >> PAGE_CACHE_SHIFT,
+						(pos + count - 1) >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 	}
 
 	/*
...
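A note on the rounding in both hunks: invalidate_inode_pages2_range() takes page indices, so the byte range [pos, pos + count - 1] is widened to every page it touches. A tiny standalone sketch of that arithmetic, assuming 4k pages for illustration:

	#include <stdio.h>

	#define PAGE_SHIFT	12	/* 4096-byte pages, for illustration */

	int main(void)
	{
		long long pos = 5000, count = 8000;	/* arbitrary example range */
		long long first = pos >> PAGE_SHIFT;		  /* 5000 / 4096 = 1 */
		long long last = (pos + count - 1) >> PAGE_SHIFT; /* 12999 / 4096 = 3 */

		/* Bytes [5000, 13000) touch pages 1, 2 and 3, invalidated whole. */
		printf("invalidate pages %lld..%lld\n", first, last);
		return 0;
	}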