Commit 3ee7e869 authored by Jan Kara's avatar Jan Kara Committed by Jens Axboe

bdi: Fix another oops in wb_workfn()

syzbot is reporting NULL pointer dereference at wb_workfn() [1] due to
wb->bdi->dev being NULL. And Dmitry confirmed that wb->state was
WB_shutting_down after wb->bdi->dev became NULL. This indicates that
unregister_bdi() failed to call wb_shutdown() on one of wb objects.

The problem is in cgwb_bdi_unregister() which does cgwb_kill() and thus
drops bdi's reference to wb structures before going through the list of
wbs again and calling wb_shutdown() on each of them. This way the loop
iterating through all wbs can easily miss a wb if that wb has already
passed through cgwb_remove_from_bdi_list() called from wb_shutdown()
from cgwb_release_workfn() and as a result fully shutdown bdi although
wb_workfn() for this wb structure is still running. In fact there are
also other ways cgwb_bdi_unregister() can race with
cgwb_release_workfn() leading e.g. to use-after-free issues:

CPU1                            CPU2
                                cgwb_bdi_unregister()
                                  cgwb_kill(*slot);

cgwb_release()
  queue_work(cgwb_release_wq, &wb->release_work);
cgwb_release_workfn()
                                  wb = list_first_entry(&bdi->wb_list, ...)
                                  spin_unlock_irq(&cgwb_lock);
  wb_shutdown(wb);
  ...
  kfree_rcu(wb, rcu);
                                  wb_shutdown(wb); -> oops use-after-free

We solve these issues by synchronizing writeback structure shutdown from
cgwb_bdi_unregister() with cgwb_release_workfn() using a new mutex. That
way we also no longer need synchronization using WB_shutting_down as the
mutex provides it for CONFIG_CGROUP_WRITEBACK case and without
CONFIG_CGROUP_WRITEBACK wb_shutdown() can be called only once from
bdi_unregister().
Reported-by: default avatarsyzbot <syzbot+4a7438e774b21ddd8eca@syzkaller.appspotmail.com>
Acked-by: default avatarTejun Heo <tj@kernel.org>
Signed-off-by: default avatarJan Kara <jack@suse.cz>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 0ae52ddf
...@@ -22,7 +22,6 @@ struct dentry; ...@@ -22,7 +22,6 @@ struct dentry;
*/ */
enum wb_state { enum wb_state {
WB_registered, /* bdi_register() was done */ WB_registered, /* bdi_register() was done */
WB_shutting_down, /* wb_shutdown() in progress */
WB_writeback_running, /* Writeback is in progress */ WB_writeback_running, /* Writeback is in progress */
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
WB_start_all, /* nr_pages == 0 (all) work pending */ WB_start_all, /* nr_pages == 0 (all) work pending */
...@@ -189,6 +188,7 @@ struct backing_dev_info { ...@@ -189,6 +188,7 @@ struct backing_dev_info {
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */ struct rb_root cgwb_congested_tree; /* their congested states */
struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */
#else #else
struct bdi_writeback_congested *wb_congested; struct bdi_writeback_congested *wb_congested;
#endif #endif
......
...@@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb) ...@@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb)
spin_lock_bh(&wb->work_lock); spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) { if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock); spin_unlock_bh(&wb->work_lock);
/*
* Wait for wb shutdown to finish if someone else is just
* running wb_shutdown(). Otherwise we could proceed to wb /
* bdi destruction before wb_shutdown() is finished.
*/
wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
return; return;
} }
set_bit(WB_shutting_down, &wb->state);
spin_unlock_bh(&wb->work_lock); spin_unlock_bh(&wb->work_lock);
cgwb_remove_from_bdi_list(wb); cgwb_remove_from_bdi_list(wb);
...@@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb) ...@@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0); mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork); flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list)); WARN_ON(!list_empty(&wb->work_list));
/*
* Make sure bit gets cleared after shutdown is finished. Matches with
* the barrier provided by test_and_clear_bit() above.
*/
smp_wmb();
clear_and_wake_up_bit(WB_shutting_down, &wb->state);
} }
static void wb_exit(struct bdi_writeback *wb) static void wb_exit(struct bdi_writeback *wb)
...@@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work) ...@@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work)
struct bdi_writeback *wb = container_of(work, struct bdi_writeback, struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work); release_work);
mutex_lock(&wb->bdi->cgwb_release_mutex);
wb_shutdown(wb); wb_shutdown(wb);
css_put(wb->memcg_css); css_put(wb->memcg_css);
css_put(wb->blkcg_css); css_put(wb->blkcg_css);
mutex_unlock(&wb->bdi->cgwb_release_mutex);
fprop_local_destroy_percpu(&wb->memcg_completions); fprop_local_destroy_percpu(&wb->memcg_completions);
percpu_ref_exit(&wb->refcnt); percpu_ref_exit(&wb->refcnt);
...@@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) ...@@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT; bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) { if (!ret) {
...@@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) ...@@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
spin_lock_irq(&cgwb_lock); spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot); cgwb_kill(*slot);
spin_unlock_irq(&cgwb_lock);
mutex_lock(&bdi->cgwb_release_mutex);
spin_lock_irq(&cgwb_lock);
while (!list_empty(&bdi->wb_list)) { while (!list_empty(&bdi->wb_list)) {
wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
bdi_node); bdi_node);
...@@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) ...@@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
spin_lock_irq(&cgwb_lock); spin_lock_irq(&cgwb_lock);
} }
spin_unlock_irq(&cgwb_lock); spin_unlock_irq(&cgwb_lock);
mutex_unlock(&bdi->cgwb_release_mutex);
} }
/** /**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment