Commit 83cd4fe2 authored by Venkatesh Pallipadi's avatar Venkatesh Pallipadi Committed by Ingo Molnar

sched: Change nohz idle load balancing logic to push model

In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of idle load balancer (performing the load balancing
on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz
balancer when any of the nohz CPUs need idle load balancing.
The kickee CPU does the idle load balancing on behalf of all idle CPUs
instead of the normal idle balance.

This addresses the below two problems with the current nohz ilb logic:
* the idle load balancer continued to have periodic ticks during idle and
  wokeup frequently, even though it did not have any rebalancing to do on
  behalf of any of the idle CPUs.
* On x86 and CPUs that have APIC timer stoppage on idle CPUs, this
  periodic wakeup can result in a periodic additional interrupt on a CPU
  doing the timer broadcast.

Also currently we are migrating the unpinned timers from an idle to the cpu
doing idle load balancing (when all the cpus in the system are idle,
there is no idle load balancing cpu and timers get added to the same idle cpu
where the request was made. So the existing optimization works only on semi idle
system).

And In semi idle system, we no longer have periodic ticks on the idle load
balancer CPU. Using that cpu will add more delays to the timers than intended
(as that cpu's timer base may not be uptodate wrt jiffies etc). This was
causing mysterious slowdowns during boot etc.

For now, in the semi idle case, use the nearest busy cpu for migrating timers
from an idle cpu.  This is good for power-savings anyway.
Signed-off-by: default avatarVenkatesh Pallipadi <venki@google.com>
Signed-off-by: default avatarSuresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: default avatarPeter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
parent fdf3e95d
......@@ -271,14 +271,11 @@ extern int runqueue_is_locked(int cpu);
extern cpumask_var_t nohz_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
extern int select_nohz_load_balancer(int cpu);
extern int get_nohz_load_balancer(void);
extern void select_nohz_load_balancer(int stop_tick);
extern int get_nohz_timer_target(void);
extern int nohz_ratelimit(int cpu);
#else
static inline int select_nohz_load_balancer(int cpu)
{
return 0;
}
static inline void select_nohz_load_balancer(int stop_tick) { }
static inline int nohz_ratelimit(int cpu)
{
......
......@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
static int hrtimer_get_target(int this_cpu, int pinned)
{
#ifdef CONFIG_NO_HZ
if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
int preferred_cpu = get_nohz_load_balancer();
if (preferred_cpu >= 0)
return preferred_cpu;
}
if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
return get_nohz_timer_target();
#endif
return this_cpu;
}
......
......@@ -460,7 +460,7 @@ struct rq {
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char in_nohz_recently;
unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;
......@@ -1194,6 +1194,27 @@ static void resched_cpu(int cpu)
}
#ifdef CONFIG_NO_HZ
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
* from an idle cpu. This is good for power-savings.
*
* We don't do similar optimization for completely idle system, as
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
int cpu = smp_processor_id();
int i;
struct sched_domain *sd;
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd))
if (!idle_cpu(i))
return i;
}
return cpu;
}
/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
......@@ -7791,6 +7812,10 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ
rq->nohz_balance_kick = 0;
init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
......@@ -7835,8 +7860,11 @@ void __init sched_init(void)
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
atomic_set(&nohz.load_balancer, nr_cpu_ids);
atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
#endif
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
......
This diff is collapsed.
......@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
if (select_nohz_load_balancer(1)) {
/*
* sched tick not stopped!
*/
cpumask_clear_cpu(cpu, nohz_cpu_mask);
goto out;
}
select_nohz_load_balancer(1);
ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
......
......@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
cpu = smp_processor_id();
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
int preferred_cpu = get_nohz_load_balancer();
if (preferred_cpu >= 0)
cpu = preferred_cpu;
}
if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
cpu = get_nohz_timer_target();
#endif
new_base = per_cpu(tvec_bases, cpu);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment