Commit a637120e authored by Tejun Heo, committed by Jens Axboe

blkcg: use radix tree to index blkgs from blkcg

blkg lookup is currently performed by traversing linked list anchored
at blkcg->blkg_list.  This is very unscalable and with blk-throttle
enabled and enough request queues on the system, this can get very
ugly quickly (blk-throttle performs look up on every bio submission).

This patch makes blkcg use radix tree to index blkgs combined with
simple last-looked-up hint.  This is mostly identical to how icqs are
indexed from ioc.
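
For comparison, the icq lookup this mirrors (ioc_lookup_icq() in
block/blk-ioc.c at the time of this patch) reads roughly as follows;
note that it may update its hint directly because it requires
queue_lock, which the lockless __blkg_lookup() below cannot do:

	struct io_cq *ioc_lookup_icq(struct io_context *ioc,
				     struct request_queue *q)
	{
		struct io_cq *icq;

		lockdep_assert_held(q->queue_lock);

		rcu_read_lock();
		/* fast path: last successfully looked-up icq */
		icq = rcu_dereference(ioc->icq_hint);
		if (icq && icq->q == q)
			goto out;

		/* slow path: index by request_queue id */
		icq = radix_tree_lookup(&ioc->icq_tree, q->id);
		if (icq && icq->q == q)
			rcu_assign_pointer(ioc->icq_hint, icq);	/* allowed to race */
		else
			icq = NULL;
	out:
		rcu_read_unlock();
		return icq;
	}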

Note that because __blkg_lookup() may be invoked without holding queue
lock, hint is only updated from __blkg_lookup_create().  Due to cfq's
cfqq caching, this makes hint updates overly lazy.  This will be
improved with scheduled blkcg aware request allocation.
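
In summary, the hint protocol the patch assumes is: lockless readers
only consume the hint, while both writers (creation and destruction)
run under queue_lock.  A condensed sketch:

	/*
	 * Hint protocol (sketch):
	 *
	 *   __blkg_lookup()         RCU only     reads blkg_hint, never writes
	 *   __blkg_lookup_create()  queue_lock   rcu_assign_pointer(hint, blkg)
	 *   blkg_destroy()          queue_lock   clears hint if it points to blkg
	 *
	 * Because hint update and hint clearing both happen under
	 * queue_lock, a hint set in the create path can never point at
	 * an already-removed blkg.  A lockless reader has no such
	 * guarantee, so it must leave the hint alone.
	 */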
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 496fb780
@@ -142,11 +142,21 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
 				      struct request_queue *q)
 {
 	struct blkcg_gq *blkg;
-	struct hlist_node *n;
 
-	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
-		if (blkg->q == q)
-			return blkg;
+	blkg = rcu_dereference(blkcg->blkg_hint);
+	if (blkg && blkg->q == q)
+		return blkg;
+
+	/*
+	 * Hint didn't match.  Look up from the radix tree.  Note that we
+	 * may not be holding queue_lock and thus are not sure whether
+	 * @blkg from blkg_tree has already been removed or not, so we
+	 * can't update hint to the lookup result.  Leave it to the caller.
+	 */
+	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
+	if (blkg && blkg->q == q)
+		return blkg;
+
 	return NULL;
 }
@@ -179,9 +189,12 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
 
+	/* lookup and update hint on success, see __blkg_lookup() for details */
 	blkg = __blkg_lookup(blkcg, q);
-	if (blkg)
+	if (blkg) {
+		rcu_assign_pointer(blkcg->blkg_hint, blkg);
 		return blkg;
+	}
 
 	/* blkg holds a reference to blkcg */
 	if (!css_tryget(&blkcg->css))
@@ -194,12 +207,24 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 		goto err_put;
 
 	/* insert */
+	ret = radix_tree_preload(GFP_ATOMIC);
+	if (ret)
+		goto err_free;
+
 	spin_lock(&blkcg->lock);
-	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
-	list_add(&blkg->q_node, &q->blkg_list);
+	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
+	if (likely(!ret)) {
+		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+		list_add(&blkg->q_node, &q->blkg_list);
+	}
 	spin_unlock(&blkcg->lock);
-	return blkg;
 
+	radix_tree_preload_end();
+
+	if (!ret)
+		return blkg;
+err_free:
+	blkg_free(blkg);
 err_put:
 	css_put(&blkcg->css);
 	return ERR_PTR(ret);
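
The insertion above follows the standard radix tree preload idiom:
radix_tree_preload() fills a per-cpu node pool (disabling preemption
on success) before the spinlock is taken, so radix_tree_insert()
cannot fail on allocation inside the critical section, and
radix_tree_preload_end() re-enables preemption afterwards.
GFP_ATOMIC is needed because the caller already holds queue_lock.
Condensed into a hypothetical helper (not part of the patch):

	static int blkg_tree_insert(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *blkg)
	{
		int ret;

		/* preallocate tree nodes; disables preemption on success */
		ret = radix_tree_preload(GFP_ATOMIC);
		if (ret)
			return ret;

		/* cannot hit -ENOMEM here thanks to the preload above */
		spin_lock(&blkcg->lock);
		ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
		spin_unlock(&blkcg->lock);

		radix_tree_preload_end();	/* re-enable preemption */
		return ret;
	}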
@@ -229,9 +254,19 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	/* Something wrong if we are trying to remove same group twice */
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
 	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
+
+	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
 	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
 
 	/*
+	 * Both setting lookup hint to and clearing it from @blkg are done
+	 * under queue_lock.  If it's not pointing to @blkg now, it never
+	 * will.  Hint assignment itself can race safely.
+	 */
+	if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
+		rcu_assign_pointer(blkcg->blkg_hint, NULL);
+
+	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
@@ -593,6 +628,7 @@ static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
 	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 done:
 	spin_lock_init(&blkcg->lock);
+	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 	return &blkcg->css;
@@ -16,6 +16,7 @@
 #include <linux/cgroup.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/seq_file.h>
+#include <linux/radix-tree.h>
 
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
@@ -37,9 +38,14 @@ enum blkg_rwstat_type {
 	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
 };
 
+struct blkcg_gq;
+
 struct blkcg {
 	struct cgroup_subsys_state	css;
 	spinlock_t			lock;
+
+	struct radix_tree_root		blkg_tree;
+	struct blkcg_gq			*blkg_hint;
 	struct hlist_head		blkg_list;
 
 	/* for policies to test whether associated blkcg has changed */