Commit bd134f27, authored by Andrew Morton, committed by Linus Torvalds

[PATCH] don't allow background writes to hide dirty buffers

If pdflush hits a locked-and-clean buffer in __block_write_full_page() it
will just pass over the buffer.  Typically the buffer is an ext3 data=ordered
buffer which is being written by kjournald, but a similar thing can happen
with blockdev buffers and ll_rw_block().

This is bad because the buffer is still under I/O and a subsequent fsync's
fdatawait() needs to know about it.

It is not practical to tag the page for writeback - only the submitter of the
I/O can do that, because the submitter has control of the end_io handler.

So instead, redirty the page so that a subsequent fsync's fdatawrite() will
wait on the in-flight I/O.

There is a risk that pdflush::background_writeout() will lock up, repeatedly
trying and failing to write the same page.  This is prevented by ensuring
that background_writeout() always throttles when it made no progress.
parent d3eb546e
...@@ -1802,14 +1802,18 @@ static int __block_write_full_page(struct inode *inode, struct page *page, ...@@ -1802,14 +1802,18 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
get_bh(bh); get_bh(bh);
if (!buffer_mapped(bh)) if (!buffer_mapped(bh))
continue; continue;
if (wbc->sync_mode != WB_SYNC_NONE) { /*
* If it's a fully non-blocking write attempt and we cannot
* lock the buffer then redirty the page. Note that this can
* potentially cause a busy-wait loop from pdflush and kswapd
* activity, but those code paths have their own higher-level
* throttling.
*/
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh); lock_buffer(bh);
} else { } else if (test_set_buffer_locked(bh)) {
if (test_set_buffer_locked(bh)) { __set_page_dirty_nobuffers(page);
if (buffer_dirty(bh)) continue;
__set_page_dirty_nobuffers(page);
continue;
}
} }
if (test_clear_buffer_dirty(bh)) { if (test_clear_buffer_dirty(bh)) {
if (!buffer_uptodate(bh)) if (!buffer_uptodate(bh))
...@@ -1857,6 +1861,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, ...@@ -1857,6 +1861,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
if (uptodate) if (uptodate)
SetPageUptodate(page); SetPageUptodate(page);
end_page_writeback(page); end_page_writeback(page);
wbc->pages_skipped++; /* We didn't write this page */
} }
return err; return err;
......
...@@ -279,6 +279,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) ...@@ -279,6 +279,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
struct inode, i_list); struct inode, i_list);
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
struct backing_dev_info *bdi = mapping->backing_dev_info; struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;
if (bdi->memory_backed) { if (bdi->memory_backed) {
if (sb == blockdev_superblock) { if (sb == blockdev_superblock) {
...@@ -326,6 +327,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) ...@@ -326,6 +327,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
BUG_ON(inode->i_state & I_FREEING); BUG_ON(inode->i_state & I_FREEING);
__iget(inode); __iget(inode);
pages_skipped = wbc->pages_skipped;
__writeback_single_inode(inode, wbc); __writeback_single_inode(inode, wbc);
if (wbc->sync_mode == WB_SYNC_HOLD) { if (wbc->sync_mode == WB_SYNC_HOLD) {
inode->dirtied_when = jiffies; inode->dirtied_when = jiffies;
...@@ -333,6 +335,13 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) ...@@ -333,6 +335,13 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
} }
if (current_is_pdflush()) if (current_is_pdflush())
writeback_release(bdi); writeback_release(bdi);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
list_move(&inode->i_list, &sb->s_dirty);
}
spin_unlock(&inode_lock); spin_unlock(&inode_lock);
iput(inode); iput(inode);
spin_lock(&inode_lock); spin_lock(&inode_lock);
......
...@@ -39,6 +39,7 @@ struct writeback_control { ...@@ -39,6 +39,7 @@ struct writeback_control {
older than this */ older than this */
long nr_to_write; /* Write this many pages, and decrement long nr_to_write; /* Write this many pages, and decrement
this for each page written */ this for each page written */
long pages_skipped; /* Pages which were not written */
int nonblocking; /* Don't get stuck on request queues */ int nonblocking; /* Don't get stuck on request queues */
int encountered_congestion; /* An output: a queue is full */ int encountered_congestion; /* An output: a queue is full */
int for_kupdate; /* A kupdate writeback */ int for_kupdate; /* A kupdate writeback */
......
...@@ -261,13 +261,13 @@ static void background_writeout(unsigned long _min_pages) ...@@ -261,13 +261,13 @@ static void background_writeout(unsigned long _min_pages)
break; break;
wbc.encountered_congestion = 0; wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
writeback_inodes(&wbc); writeback_inodes(&wbc);
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
if (wbc.nr_to_write > 0) { if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
/* Wrote less than expected */ /* Wrote less than expected */
if (wbc.encountered_congestion) blk_congestion_wait(WRITE, HZ/10);
blk_congestion_wait(WRITE, HZ/10); if (!wbc.encountered_congestion)
else
break; break;
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment