Commit 59b57717 authored by Dennis Zhou (Facebook), committed by Jens Axboe

blkcg: delay blkg destruction until after writeback has finished

Currently, blkcg destruction relies on a sequence of events:
  1. Destruction starts. blkcg_css_offline() is called and blkgs
     release their reference to the blkcg. This immediately destroys
     the cgwbs (writeback).
  2. With blkgs giving up their reference, the blkcg ref count should
     become zero, eventually allowing blkcg_css_free() to be called,
     which finally frees the blkcg.

Jiufei Xue reported that there is a race between blkcg_bio_issue_check()
and cgroup_rmdir(). To remedy this, blkg destruction becomes contingent
on the completion of all writeback associated with the blkcg. A count of
the number of cgwbs is maintained and once that goes to zero, blkg
destruction can follow. This should prevent premature blkg destruction
related to writeback.

The new process for blkcg cleanup is as follows:
  1. Destruction starts. blkcg_css_offline() is called which offlines
     writeback. Blkg destruction is delayed on the cgwb_refcnt count to
     avoid punting potentially large amounts of outstanding writeback
     to root while maintaining any ongoing policies. Here, the base
     cgwb_refcnt is put back.
  2. When the cgwb_refcnt becomes zero, blkcg_destroy_blkgs() is called
     and handles destruction of blkgs. This is where the css reference
     held by each blkg is released (see the sketch after this list).
  3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
     This finally frees the blkcg.
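As a rough illustration of the refcounting scheme described in this list, here is a minimal userspace C model. It is only a sketch: C11 atomics stand in for the kernel's refcount_t, and struct blkcg_model, cgwb_get()/cgwb_put(), and destroy_blkgs() are hypothetical stand-ins for struct blkcg, blkcg_cgwb_get()/blkcg_cgwb_put(), and blkcg_destroy_blkgs().

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical stand-in for struct blkcg; only the cgwb refcount is modeled. */
struct blkcg_model {
        atomic_int cgwb_refcnt;         /* one base ref + one per active cgwb */
};

/* Stands in for blkcg_destroy_blkgs(): stage 2 of the teardown. */
static void destroy_blkgs(struct blkcg_model *blkcg)
{
        printf("blkcg %p: writeback finished, destroying blkgs\n", (void *)blkcg);
}

/* Stands in for blkcg_cgwb_get(): a cgwb went active. */
static void cgwb_get(struct blkcg_model *blkcg)
{
        atomic_fetch_add(&blkcg->cgwb_refcnt, 1);
}

/* Stands in for blkcg_cgwb_put(): whoever drops the last ref runs stage 2. */
static void cgwb_put(struct blkcg_model *blkcg)
{
        if (atomic_fetch_sub(&blkcg->cgwb_refcnt, 1) == 1)
                destroy_blkgs(blkcg);
}

int main(void)
{
        /* like blkcg_css_alloc(): count starts at 1, the base reference */
        struct blkcg_model blkcg = { .cgwb_refcnt = 1 };

        cgwb_get(&blkcg);       /* cgwb_create(): writeback goes active */
        cgwb_put(&blkcg);       /* blkcg_css_offline(): base ref dropped */
        cgwb_put(&blkcg);       /* cgwb_release_workfn(): last ref, stage 2 */
        return 0;
}

The two puts can arrive in either order; whichever call drops the count to zero (the css going offline or the last cgwb being released) is the one that triggers blkg destruction, so blkgs never go away while writeback is still outstanding.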

Historically, blk-throttle took data from a blkg while associating with
current in ways that were not the most transparent, so the recent
simplification and unification of blk-throttle's behavior is what
exposed this race.

Fixes: 08e18eab ("block: add bi_blkg to the bio for cgroups")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Dennis Zhou <dennisszhou@gmail.com>
Cc: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 6b065462
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1042,21 +1042,59 @@ static struct cftype blkcg_legacy_files[] = {
 	{ }	/* terminate */
 };
 
+/*
+ * blkcg destruction is a three-stage process.
+ *
+ * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
+ *    which offlines writeback.  Here we tie the next stage of blkg destruction
+ *    to the completion of writeback associated with the blkcg.  This lets us
+ *    avoid punting potentially large amounts of outstanding writeback to root
+ *    while maintaining any ongoing policies.  The next stage is triggered when
+ *    the nr_cgwbs count goes to zero.
+ *
+ * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
+ *    and handles the destruction of blkgs.  Here the css reference held by
+ *    the blkg is put back eventually allowing blkcg_css_free() to be called.
+ *    This work may occur in cgwb_release_workfn() on the cgwb_release
+ *    workqueue.  Any submitted ios that fail to get the blkg ref will be
+ *    punted to the root_blkg.
+ *
+ * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
+ *    This finally frees the blkcg.
+ */
+
 /**
  * blkcg_css_offline - cgroup css_offline callback
  * @css: css of interest
  *
- * This function is called when @css is about to go away and responsible
- * for shooting down all blkgs associated with @css. blkgs should be
- * removed while holding both q and blkcg locks. As blkcg lock is nested
- * inside q lock, this function performs reverse double lock dancing.
- *
- * This is the blkcg counterpart of ioc_release_fn().
+ * This function is called when @css is about to go away.  Here the cgwbs are
+ * offlined first and only once writeback associated with the blkcg has
+ * finished do we start step 2 (see above).
  */
 static void blkcg_css_offline(struct cgroup_subsys_state *css)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 
+	/* this prevents anyone from attaching or migrating to this blkcg */
+	wb_blkcg_offline(blkcg);
+
+	/* put the base cgwb reference allowing step 2 to be triggered */
+	blkcg_cgwb_put(blkcg);
+}
+
+/**
+ * blkcg_destroy_blkgs - responsible for shooting down blkgs
+ * @blkcg: blkcg of interest
+ *
+ * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
+ * is nested inside q lock, this function performs reverse double lock dancing.
+ * Destroying the blkgs releases the reference held on the blkcg's css allowing
+ * blkcg_css_free to eventually be called.
+ *
+ * This is the blkcg counterpart of ioc_release_fn().
+ */
+void blkcg_destroy_blkgs(struct blkcg *blkcg)
+{
 	spin_lock_irq(&blkcg->lock);
 
 	while (!hlist_empty(&blkcg->blkg_list)) {
@@ -1075,8 +1113,6 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 	}
 
 	spin_unlock_irq(&blkcg->lock);
-
-	wb_blkcg_offline(blkcg);
 }
 
 static void blkcg_css_free(struct cgroup_subsys_state *css)
@@ -1146,6 +1182,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&blkcg->cgwb_list);
+	refcount_set(&blkcg->cgwb_refcnt, 1);
 #endif
 
 	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -56,6 +56,7 @@ struct blkcg {
 	struct list_head	all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head	cgwb_list;
+	refcount_t		cgwb_refcnt;
 #endif
 };
 
@@ -386,6 +387,49 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
 	return cpd ? cpd->blkcg : NULL;
 }
 
+extern void blkcg_destroy_blkgs(struct blkcg *blkcg);
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * blkcg_cgwb_get - get a reference for blkcg->cgwb_list
+ * @blkcg: blkcg of interest
+ *
+ * This is used to track the number of active wb's related to a blkcg.
+ */
+static inline void blkcg_cgwb_get(struct blkcg *blkcg)
+{
+	refcount_inc(&blkcg->cgwb_refcnt);
+}
+
+/**
+ * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list
+ * @blkcg: blkcg of interest
+ *
+ * This is used to track the number of active wb's related to a blkcg.
+ * When this count goes to zero, all active wbs have finished so the
+ * blkcg can continue destruction by calling blkcg_destroy_blkgs().
+ * This work may occur in cgwb_release_workfn() on the cgwb_release
+ * workqueue.
+ */
+static inline void blkcg_cgwb_put(struct blkcg *blkcg)
+{
+	if (refcount_dec_and_test(&blkcg->cgwb_refcnt))
+		blkcg_destroy_blkgs(blkcg);
+}
+
+#else
+
+static inline void blkcg_cgwb_get(struct blkcg *blkcg) { }
+
+static inline void blkcg_cgwb_put(struct blkcg *blkcg)
+{
+	/* wb isn't being accounted, so trigger destruction right away */
+	blkcg_destroy_blkgs(blkcg);
+}
+
+#endif
+
 /**
  * blkg_path - format cgroup path of blkg
  * @blkg: blkg of interest
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -491,6 +491,7 @@ static void cgwb_release_workfn(struct work_struct *work)
 {
 	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
 						release_work);
+	struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
 
 	mutex_lock(&wb->bdi->cgwb_release_mutex);
 	wb_shutdown(wb);
@@ -499,6 +500,9 @@ static void cgwb_release_workfn(struct work_struct *work)
 	css_put(wb->blkcg_css);
 	mutex_unlock(&wb->bdi->cgwb_release_mutex);
 
+	/* triggers blkg destruction if cgwb_refcnt becomes zero */
+	blkcg_cgwb_put(blkcg);
+
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
 	wb_exit(wb);
@@ -597,6 +601,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 		list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
 		list_add(&wb->memcg_node, memcg_cgwb_list);
 		list_add(&wb->blkcg_node, blkcg_cgwb_list);
+		blkcg_cgwb_get(blkcg);
 		css_get(memcg_css);
 		css_get(blkcg_css);
 	}