Commit d3eb546e authored by Andrew Morton, committed by Linus Torvalds

[PATCH] fdatasync integrity fix

fdatasync can fail to wait on some pages due to a race.

If some task (e.g. pdflush) is flushing the same mapping, it can remove a page's
dirty tag without then marking that page as being under writeback, because
pdflush hit a locked buffer in __block_write_full_page().  This will happen
because kjournald is writing the buffer.  In this situation
__block_write_full_page() will redirty the page so that fsync notices it, but
there is a window where the page eludes the radix tree dirty page walk.

Consequently a concurrent fsync will fail to notice the page when walking the
radix tree's dirty pages.

The approach taken by this patch is to leave the page marked as dirty in the
radix tree while ->writepage is working out what to do with it.  This ensures
that a concurrent write-for-sync will successfully locate the page and will
then block in lock_page() until the non-write-for-sync code has finished
altering the page state.
parent be5ceb40
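
The following is a minimal editorial sketch, not part of the patch, of the protocol described above as seen from a ->writepage implementation: the caller has already run clear_page_dirty_for_io() under the page lock, so PG_dirty is clear but the radix-tree PAGECACHE_TAG_DIRTY tag is still set while ->writepage decides between starting writeback and redirtying. The names example_writepage() and example_buffers_ready() are hypothetical; the real decision in __block_write_full_page() turns on whether a buffer is locked (e.g. by kjournald).

#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

/* Hypothetical stand-in for "can every buffer be written right now?" */
static int example_buffers_ready(struct page *page)
{
	return !PagePrivate(page);	/* placeholder test, not the real one */
}

static int example_writepage(struct page *page, struct writeback_control *wbc)
{
	/*
	 * The caller (mpage_writepages, write_one_page or shrink_list) holds
	 * the page lock and has already called clear_page_dirty_for_io(), so
	 * a concurrent write-for-sync can still find this page via the dirty
	 * tag and will block in lock_page() until we unlock below.
	 */
	if (example_buffers_ready(page)) {
		/*
		 * Start writeout.  test_set_page_writeback() sees that the
		 * page is no longer PageDirty and clears the radix-tree
		 * dirty tag, bringing flag and tag back into sync.
		 */
		set_page_writeback(page);
		unlock_page(page);
		/* ... submit the I/O here ... */
	} else {
		/*
		 * Cannot write now (e.g. a buffer is locked by kjournald).
		 * set_page_dirty() re-establishes PG_dirty, matching the
		 * radix-tree dirty tag that clear_page_dirty_for_io() left
		 * in place, so the page is coherent again before we unlock.
		 */
		set_page_dirty(page);
		unlock_page(page);
	}
	return 0;
}
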
@@ -643,7 +643,7 @@ mpage_writepages(struct address_space *mapping,
 				wait_on_page_writeback(page);
 
 			if (page->mapping == mapping && !PageWriteback(page) &&
-					test_clear_page_dirty(page)) {
+					clear_page_dirty_for_io(page)) {
 				if (writepage) {
 					ret = (*writepage)(page, wbc);
 					if (ret) {
@@ -472,6 +472,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
 int set_page_dirty_lock(struct page *page);
+int clear_page_dirty_for_io(struct page *page);
 
 /*
  * Prototype to add a shrinker callback for ageable caches.
@@ -472,7 +472,7 @@ int write_one_page(struct page *page, int wait)
 	if (wait)
 		wait_on_page_writeback(page);
 
-	if (test_clear_page_dirty(page)) {
+	if (clear_page_dirty_for_io(page)) {
 		page_cache_get(page);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
@@ -573,6 +573,36 @@ int test_clear_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL(test_clear_page_dirty);
 
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout. We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping) {
+		if (TestClearPageDirty(page)) {
+			if (!mapping->backing_dev_info->memory_backed)
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
 /*
  * Clear a page's dirty flag while ignoring dirty memory accounting
  */
@@ -629,6 +659,9 @@ int test_set_page_writeback(struct page *page)
 		if (!ret)
 			radix_tree_tag_set(&mapping->page_tree, page->index,
 						PAGECACHE_TAG_WRITEBACK);
+		if (!PageDirty(page))
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
@@ -354,7 +354,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			if (test_clear_page_dirty(page)) {
+			if (clear_page_dirty_for_io(page)) {
 				int res;
 				struct writeback_control wbc = {
 					.sync_mode = WB_SYNC_NONE,
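
For reference, below is a rough sketch of the write-for-sync side that the retained dirty tag protects, loosely modeled on the tag-based page walk used for writeback at the time; example_sync_dirty_pages() is a hypothetical simplification, not code from the tree, and it assumes the pagevec_lookup_tag()/PAGECACHE_TAG_DIRTY interfaces of that era. Because clear_page_dirty_for_io() leaves the dirty tag set, the page is still found here, and lock_page() blocks until the non-sync path has either started writeback or redirtied the page.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

static void example_sync_dirty_pages(struct address_space *mapping)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};
	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned int i, nr;

	pagevec_init(&pvec, 0);
	while ((nr = pagevec_lookup_tag(&pvec, mapping, &index,
				PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) != 0) {
		for (i = 0; i < nr; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * A non-sync writeback that has already cleared
			 * PG_dirty still holds the page lock; wait for it
			 * rather than skipping the page.
			 */
			lock_page(page);
			wait_on_page_writeback(page);

			if (page->mapping == mapping &&
					clear_page_dirty_for_io(page)) {
				wbc.nr_to_write = 1;
				/* ->writepage unlocks the page itself */
				mapping->a_ops->writepage(page, &wbc);
			} else {
				unlock_page(page);
			}
		}
		pagevec_release(&pvec);
	}
}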