Commit b735a872, authored by Tejun Heo, committed by Kleber Sacilotto de Souza

sched/fair: Fix O(nr_cgroups) in load balance path

BugLink: https://bugs.launchpad.net/bugs/1747896

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small CPU consumption and scheduling latency
increase in the load balancing path in systems with CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to spend several tens of
percent of its CPU cycles running update_blocked_averages(), each run
taking multiple milliseconds.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.
Signed-off-by: Tejun Heo <tj@kernel.org>
[ Added cfs_rq_is_decayed() ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

(backported from commit a9e7f654)
Signed-off-by: Gavin Guo <gavin.guo@canonical.com>
Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
Acked-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
parent 96fb9ee9
...@@ -314,8 +314,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) ...@@ -314,8 +314,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
} }
/* Iterate thr' all leaf cfs_rq's on a runqueue */ /* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */ /* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq * static inline struct cfs_rq *
...@@ -408,8 +409,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) ...@@ -408,8 +409,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{ {
} }
#define for_each_leaf_cfs_rq(rq, cfs_rq) \ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
static inline struct sched_entity *parent_entity(struct sched_entity *se) static inline struct sched_entity *parent_entity(struct sched_entity *se)
{ {
...@@ -4065,9 +4066,9 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) ...@@ -4065,9 +4066,9 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void __maybe_unused update_runtime_enabled(struct rq *rq) static void __maybe_unused update_runtime_enabled(struct rq *rq)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq, *pos;
for_each_leaf_cfs_rq(rq, cfs_rq) { for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
raw_spin_lock(&cfs_b->lock); raw_spin_lock(&cfs_b->lock);
...@@ -4078,9 +4079,9 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) ...@@ -4078,9 +4079,9 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq, *pos;
for_each_leaf_cfs_rq(rq, cfs_rq) { for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
if (!cfs_rq->runtime_enabled) if (!cfs_rq->runtime_enabled)
continue; continue;
...@@ -5966,10 +5967,28 @@ static void attach_tasks(struct lb_env *env) ...@@ -5966,10 +5967,28 @@ static void attach_tasks(struct lb_env *env)
} }
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Report whether @cfs_rq carries no load at all.
 *
 * Returns true only when load.weight, avg.load_sum, avg.util_sum and
 * runnable_load_sum are all zero; any non-zero field yields false.
 * update_blocked_averages() uses the result to decide whether the cfs_rq
 * can be taken off the leaf list.
 */
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
	/* Fields are tested in the same order as before; || short-circuits. */
	return !(cfs_rq->load.weight ||
		 cfs_rq->avg.load_sum ||
		 cfs_rq->avg.util_sum ||
		 cfs_rq->runnable_load_sum);
}
static void update_blocked_averages(int cpu) static void update_blocked_averages(int cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq, *pos;
unsigned long flags; unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags); raw_spin_lock_irqsave(&rq->lock, flags);
...@@ -5979,13 +5998,21 @@ static void update_blocked_averages(int cpu) ...@@ -5979,13 +5998,21 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see * Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details. * list_add_leaf_cfs_rq() for details.
*/ */
for_each_leaf_cfs_rq(rq, cfs_rq) { for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
/* throttled entities do not contribute to load */ /* throttled entities do not contribute to load */
if (throttled_hierarchy(cfs_rq)) if (throttled_hierarchy(cfs_rq))
continue; continue;
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0); update_tg_load_avg(cfs_rq, 0);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
* decayed cfs_rqs linger on the list.
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
} }
raw_spin_unlock_irqrestore(&rq->lock, flags); raw_spin_unlock_irqrestore(&rq->lock, flags);
} }
...@@ -8386,10 +8413,10 @@ const struct sched_class fair_sched_class = { ...@@ -8386,10 +8413,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu) void print_cfs_stats(struct seq_file *m, int cpu)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq, *pos;
rcu_read_lock(); rcu_read_lock();
for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq); print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock(); rcu_read_unlock();
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment