Commit f9cae926 authored by Jan Kara's avatar Jan Kara

writeback: Fix sync livelock due to b_dirty_time processing

When we are processing writeback for sync(2), move_expired_inodes()
didn't set any inode expiry value (older_than_this). This can result in
writeback never completing if there's steady stream of inodes added to
b_dirty_time list as writeback rechecks dirty lists after each writeback
round whether there's more work to be done. Fix the problem by using
sync(2) start time is inode expiry value when processing b_dirty_time
list similarly as for ordinarily dirtied inodes. This requires some
refactoring of older_than_this handling which simplifies the code
noticeably as a bonus.

Fixes: 0ae45f63 ("vfs: add support for a lazytime mount option")
CC: stable@vger.kernel.org
Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarJan Kara <jack@suse.cz>
parent 5afced3b
...@@ -42,7 +42,6 @@ ...@@ -42,7 +42,6 @@
struct wb_writeback_work { struct wb_writeback_work {
long nr_pages; long nr_pages;
struct super_block *sb; struct super_block *sb;
unsigned long *older_than_this;
enum writeback_sync_modes sync_mode; enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1; unsigned int tagged_writepages:1;
unsigned int for_kupdate:1; unsigned int for_kupdate:1;
...@@ -1234,16 +1233,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) ...@@ -1234,16 +1233,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
#define EXPIRE_DIRTY_ATIME 0x0001 #define EXPIRE_DIRTY_ATIME 0x0001
/* /*
* Move expired (dirtied before work->older_than_this) dirty inodes from * Move expired (dirtied before dirtied_before) dirty inodes from
* @delaying_queue to @dispatch_queue. * @delaying_queue to @dispatch_queue.
*/ */
static int move_expired_inodes(struct list_head *delaying_queue, static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue, struct list_head *dispatch_queue,
int flags, int flags, unsigned long dirtied_before)
struct wb_writeback_work *work)
{ {
unsigned long *older_than_this = NULL;
unsigned long expire_time;
LIST_HEAD(tmp); LIST_HEAD(tmp);
struct list_head *pos, *node; struct list_head *pos, *node;
struct super_block *sb = NULL; struct super_block *sb = NULL;
...@@ -1251,16 +1247,9 @@ static int move_expired_inodes(struct list_head *delaying_queue, ...@@ -1251,16 +1247,9 @@ static int move_expired_inodes(struct list_head *delaying_queue,
int do_sb_sort = 0; int do_sb_sort = 0;
int moved = 0; int moved = 0;
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
else if (!work->for_sync) {
expire_time = jiffies - (dirtytime_expire_interval * HZ);
older_than_this = &expire_time;
}
while (!list_empty(delaying_queue)) { while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev); inode = wb_inode(delaying_queue->prev);
if (older_than_this && if (inode_dirtied_after(inode, dirtied_before))
inode_dirtied_after(inode, *older_than_this))
break; break;
list_move(&inode->i_io_list, &tmp); list_move(&inode->i_io_list, &tmp);
moved++; moved++;
...@@ -1306,18 +1295,22 @@ static int move_expired_inodes(struct list_head *delaying_queue, ...@@ -1306,18 +1295,22 @@ static int move_expired_inodes(struct list_head *delaying_queue,
* | * |
* +--> dequeue for IO * +--> dequeue for IO
*/ */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
unsigned long dirtied_before)
{ {
int moved; int moved;
unsigned long time_expire_jif = dirtied_before;
assert_spin_locked(&wb->list_lock); assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io); list_splice_init(&wb->b_more_io, &wb->b_io);
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, dirtied_before);
if (!work->for_sync)
time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
EXPIRE_DIRTY_ATIME, work); EXPIRE_DIRTY_ATIME, time_expire_jif);
if (moved) if (moved)
wb_io_lists_populated(wb); wb_io_lists_populated(wb);
trace_writeback_queue_io(wb, work, moved); trace_writeback_queue_io(wb, work, dirtied_before, moved);
} }
static int write_inode(struct inode *inode, struct writeback_control *wbc) static int write_inode(struct inode *inode, struct writeback_control *wbc)
...@@ -1829,7 +1822,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, ...@@ -1829,7 +1822,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
blk_start_plug(&plug); blk_start_plug(&plug);
spin_lock(&wb->list_lock); spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io)) if (list_empty(&wb->b_io))
queue_io(wb, &work); queue_io(wb, &work, jiffies);
__writeback_inodes_wb(wb, &work); __writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock); spin_unlock(&wb->list_lock);
blk_finish_plug(&plug); blk_finish_plug(&plug);
...@@ -1849,7 +1842,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, ...@@ -1849,7 +1842,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
* takes longer than a dirty_writeback_interval interval, then leave a * takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap. * one-second gap.
* *
* older_than_this takes precedence over nr_to_write. So we'll only write back * dirtied_before takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings. * all dirty pages if they are all attached to "old" mappings.
*/ */
static long wb_writeback(struct bdi_writeback *wb, static long wb_writeback(struct bdi_writeback *wb,
...@@ -1857,14 +1850,11 @@ static long wb_writeback(struct bdi_writeback *wb, ...@@ -1857,14 +1850,11 @@ static long wb_writeback(struct bdi_writeback *wb,
{ {
unsigned long wb_start = jiffies; unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages; long nr_pages = work->nr_pages;
unsigned long oldest_jif; unsigned long dirtied_before = jiffies;
struct inode *inode; struct inode *inode;
long progress; long progress;
struct blk_plug plug; struct blk_plug plug;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
blk_start_plug(&plug); blk_start_plug(&plug);
spin_lock(&wb->list_lock); spin_lock(&wb->list_lock);
for (;;) { for (;;) {
...@@ -1898,14 +1888,14 @@ static long wb_writeback(struct bdi_writeback *wb, ...@@ -1898,14 +1888,14 @@ static long wb_writeback(struct bdi_writeback *wb,
* safe. * safe.
*/ */
if (work->for_kupdate) { if (work->for_kupdate) {
oldest_jif = jiffies - dirtied_before = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10); msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background) } else if (work->for_background)
oldest_jif = jiffies; dirtied_before = jiffies;
trace_writeback_start(wb, work); trace_writeback_start(wb, work);
if (list_empty(&wb->b_io)) if (list_empty(&wb->b_io))
queue_io(wb, work); queue_io(wb, work, dirtied_before);
if (work->sb) if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work); progress = writeback_sb_inodes(work->sb, wb, work);
else else
......
...@@ -498,8 +498,9 @@ DEFINE_WBC_EVENT(wbc_writepage); ...@@ -498,8 +498,9 @@ DEFINE_WBC_EVENT(wbc_writepage);
TRACE_EVENT(writeback_queue_io, TRACE_EVENT(writeback_queue_io,
TP_PROTO(struct bdi_writeback *wb, TP_PROTO(struct bdi_writeback *wb,
struct wb_writeback_work *work, struct wb_writeback_work *work,
unsigned long dirtied_before,
int moved), int moved),
TP_ARGS(wb, work, moved), TP_ARGS(wb, work, dirtied_before, moved),
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, older) __field(unsigned long, older)
...@@ -509,19 +510,17 @@ TRACE_EVENT(writeback_queue_io, ...@@ -509,19 +510,17 @@ TRACE_EVENT(writeback_queue_io,
__field(ino_t, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
unsigned long *older_than_this = work->older_than_this;
strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
__entry->older = older_than_this ? *older_than_this : 0; __entry->older = dirtied_before;
__entry->age = older_than_this ? __entry->age = (jiffies - dirtied_before) * 1000 / HZ;
(jiffies - *older_than_this) * 1000 / HZ : -1;
__entry->moved = moved; __entry->moved = moved;
__entry->reason = work->reason; __entry->reason = work->reason;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
), ),
TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->older, /* older_than_this in jiffies */ __entry->older, /* dirtied_before in jiffies */
__entry->age, /* older_than_this in relative milliseconds */ __entry->age, /* dirtied_before in relative milliseconds */
__entry->moved, __entry->moved,
__print_symbolic(__entry->reason, WB_WORK_REASON), __print_symbolic(__entry->reason, WB_WORK_REASON),
(unsigned long)__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment