Commit 850f7d78 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] sched: trivial fixes, cleanups

From: Ingo Molnar <mingo@elte.hu>

A batch of trivial fixes and cleanups:

- added recent trivial bits from Nick's patches and mine
- hotplug CPU fix: add the missing unlock_cpu_hotplug() on the early-return
  path in sched_migrate_task()
- early init cleanup: call sched_init() early in start_kernel(), before the
  IRQ and timer setup; init_timers() is no longer called from sched_init()
parent fa8f2c50
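One of the cleanups converts the open-coded millisecond scaling in rebalance_tick() (interval * HZ / 1000) into a MSEC_TO_JIFFIES() helper, with identity versions of JIFFIES_TO_MSEC()/MSEC_TO_JIFFIES() for the HZ == 1000 case and generic fallbacks added alongside the existing NS_TO_JIFFIES()/JIFFIES_TO_NS() definitions. Below is a minimal, stand-alone sketch of how those helpers behave. It is illustration only, not part of the patch: the HZ value and the main() driver are assumptions made purely for the example.

/*
 * Stand-alone sketch of the new jiffies <-> millisecond helpers.
 * Not kernel code: HZ and main() are chosen for illustration only.
 */
#include <stdio.h>

#define HZ 1000	/* internal timer frequency (illustrative value) */

#if HZ == 1000
/* With a 1000 Hz tick, one jiffy is one millisecond: no arithmetic needed. */
# define JIFFIES_TO_MSEC(x) (x)
# define MSEC_TO_JIFFIES(x) (x)
#else
/* Generic fallbacks for other HZ values. */
# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ)
# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000)
#endif

int main(void)
{
	/* A 10 ms balance interval scaled to jiffies and back again. */
	unsigned long msec = 10;
	unsigned long jiffies_val = MSEC_TO_JIFFIES(msec);

	printf("%lu ms -> %lu jiffies -> %lu ms at HZ=%d\n",
	       msec, jiffies_val,
	       (unsigned long)JIFFIES_TO_MSEC(jiffies_val), HZ);
	return 0;
}

With these helpers in place, the rebalance path simply does interval = MSEC_TO_JIFFIES(interval) instead of repeating the HZ arithmetic, as the scheduler hunk further down shows.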
@@ -719,7 +719,7 @@ config X86_PAE
# Common NUMA Features # Common NUMA Features
config NUMA config NUMA
bool "Numa Memory Allocation Support" bool "Numa Memory Allocation and Scheduler Support"
depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI))
default n if X86_PC default n if X86_PC
default y if (X86_NUMAQ || X86_SUMMIT) default y if (X86_NUMAQ || X86_SUMMIT)
...
@@ -1162,9 +1162,9 @@ __init void arch_init_sched_domains(void)
first_cpu = last_cpu = NULL; first_cpu = last_cpu = NULL;
if (i != first_cpu(cpu_domain->span)) { if (i != first_cpu(cpu_domain->span)) {
cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER;
cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
SD_FLAG_SHARE_CPUPOWER; SD_SHARE_CPUPOWER;
continue; continue;
} }
@@ -1258,7 +1258,7 @@ __init void arch_init_sched_domains(void)
cpu_domain->groups = cpu_group; cpu_domain->groups = cpu_group;
} }
} }
#else /* CONFIG_NUMA */ #else /* !CONFIG_NUMA */
static struct sched_group sched_group_cpus[NR_CPUS]; static struct sched_group sched_group_cpus[NR_CPUS];
static struct sched_group sched_group_phys[NR_CPUS]; static struct sched_group sched_group_phys[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_domain, phys_domains);
@@ -1286,9 +1286,9 @@ __init void arch_init_sched_domains(void)
first_cpu = last_cpu = NULL; first_cpu = last_cpu = NULL;
if (i != first_cpu(cpu_domain->span)) { if (i != first_cpu(cpu_domain->span)) {
cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER;
cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
SD_FLAG_SHARE_CPUPOWER; SD_SHARE_CPUPOWER;
continue; continue;
} }
...
@@ -5,6 +5,8 @@
# define HZ 1000 /* Internal kernel timer frequency */ # define HZ 1000 /* Internal kernel timer frequency */
# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */
# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */
# define JIFFIES_TO_MSEC(x) (x)
# define MSEC_TO_JIFFIES(x) (x)
#endif #endif
#ifndef HZ #ifndef HZ
...
@@ -650,7 +650,7 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c);
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
#define ARCH_HAS_SCHED_DOMAIN #define ARCH_HAS_SCHED_DOMAIN
#define ARCH_HAS_SCHED_WAKE_BALANCE #define ARCH_HAS_SCHED_WAKE_IDLE
#endif #endif
#endif /* __ASM_I386_PROCESSOR_H */ #endif /* __ASM_I386_PROCESSOR_H */
@@ -543,14 +543,13 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define SCHED_LOAD_SHIFT 7 /* increase resolution of load calculations */ #define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)
#define SD_FLAG_NEWIDLE 1 /* Balance when about to become idle */ #define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */
#define SD_FLAG_EXEC 2 /* Balance on exec */ #define SD_BALANCE_EXEC 2 /* Balance on exec */
#define SD_FLAG_WAKE 4 /* Balance on task wakeup */ #define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */
#define SD_FLAG_FASTMIGRATE 8 /* Sync wakes put task on waking CPU */ #define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */
#define SD_FLAG_SHARE_CPUPOWER 16 /* Domain members share cpu power */ #define SD_SHARE_CPUPOWER 16 /* Domain members share cpu power */
struct sched_group { struct sched_group {
struct sched_group *next; /* Must be a circular list */ struct sched_group *next; /* Must be a circular list */
@@ -577,7 +576,7 @@ struct sched_domain {
unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */
int flags; /* See SD_FLAG_* */ int flags; /* See SD_* */
/* Runtime fields. */ /* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */ unsigned long last_balance; /* init to jiffies. units in jiffies */
@@ -597,7 +596,9 @@ struct sched_domain {
.cache_hot_time = 0, \ .cache_hot_time = 0, \
.cache_nice_tries = 0, \ .cache_nice_tries = 0, \
.per_cpu_gain = 15, \ .per_cpu_gain = 15, \
.flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\ .flags = SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
.nr_balance_failed = 0, \ .nr_balance_failed = 0, \
@@ -615,7 +616,8 @@ struct sched_domain {
.cache_hot_time = (5*1000000/2), \ .cache_hot_time = (5*1000000/2), \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\ .flags = SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
.nr_balance_failed = 0, \ .nr_balance_failed = 0, \
@@ -634,7 +636,7 @@ struct sched_domain {
.cache_hot_time = (10*1000000), \ .cache_hot_time = (10*1000000), \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_FLAG_EXEC, \ .flags = SD_BALANCE_EXEC, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
.nr_balance_failed = 0, \ .nr_balance_failed = 0, \
@@ -645,6 +647,9 @@ DECLARE_PER_CPU(struct sched_domain, base_domains);
#define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu))) #define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu)))
#define this_sched_domain() (&__get_cpu_var(base_domains)) #define this_sched_domain() (&__get_cpu_var(base_domains))
#define for_each_domain(cpu, domain) \
for (domain = cpu_sched_domain(cpu); domain; domain = domain->parent)
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
#else #else
static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
...
@@ -417,6 +417,13 @@ asmlinkage void __init start_kernel(void)
*/ */
smp_prepare_boot_cpu(); smp_prepare_boot_cpu();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
build_all_zonelists(); build_all_zonelists();
page_alloc_init(); page_alloc_init();
printk("Kernel command line: %s\n", saved_command_line); printk("Kernel command line: %s\n", saved_command_line);
@@ -428,7 +435,7 @@ asmlinkage void __init start_kernel(void)
rcu_init(); rcu_init();
init_IRQ(); init_IRQ();
pidhash_init(); pidhash_init();
sched_init(); init_timers();
softirq_init(); softirq_init();
time_init(); time_init();
...
@@ -75,6 +75,13 @@
#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
#ifndef JIFFIES_TO_MSEC
# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ)
#endif
#ifndef MSEC_TO_JIFFIES
# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000)
#endif
/* /*
* These are the 'tuning knobs' of the scheduler: * These are the 'tuning knobs' of the scheduler:
* *
@@ -257,16 +264,6 @@ const unsigned long scheduling_functions_end_here =
# define task_running(rq, p) ((rq)->curr == (p)) # define task_running(rq, p) ((rq)->curr == (p))
#endif #endif
static inline void nr_running_inc(runqueue_t *rq)
{
rq->nr_running++;
}
static inline void nr_running_dec(runqueue_t *rq)
{
rq->nr_running--;
}
/* /*
* task_rq_lock - lock the runqueue a given task resides on and disable * task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -367,7 +364,7 @@ static int effective_prio(task_t *p)
static inline void __activate_task(task_t *p, runqueue_t *rq) static inline void __activate_task(task_t *p, runqueue_t *rq)
{ {
enqueue_task(p, rq->active); enqueue_task(p, rq->active);
nr_running_inc(rq); rq->nr_running++;
} }
static void recalc_task_prio(task_t *p, unsigned long long now) static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -488,7 +485,7 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
*/ */
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{ {
nr_running_dec(rq); rq->nr_running--;
if (p->state == TASK_UNINTERRUPTIBLE) if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++; rq->nr_uninterruptible++;
dequeue_task(p, p->array); dequeue_task(p, p->array);
@@ -502,9 +499,9 @@ static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
* might also involve a cross-CPU call to trigger the scheduler on * might also involve a cross-CPU call to trigger the scheduler on
* the target CPU. * the target CPU.
*/ */
#ifdef CONFIG_SMP
static inline void resched_task(task_t *p) static inline void resched_task(task_t *p)
{ {
#ifdef CONFIG_SMP
int need_resched, nrpolling; int need_resched, nrpolling;
preempt_disable(); preempt_disable();
@@ -516,10 +513,13 @@ static inline void resched_task(task_t *p)
if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
smp_send_reschedule(task_cpu(p)); smp_send_reschedule(task_cpu(p));
preempt_enable(); preempt_enable();
}
#else #else
static inline void resched_task(task_t *p)
{
set_tsk_need_resched(p); set_tsk_need_resched(p);
#endif
} }
#endif
/** /**
* task_curr - is this task currently executing on a CPU? * task_curr - is this task currently executing on a CPU?
@@ -611,13 +611,14 @@ void kick_process(task_t *p)
} }
EXPORT_SYMBOL_GPL(kick_process); EXPORT_SYMBOL_GPL(kick_process);
/* /*
* Return a low guess at the load of cpu. * Return a low guess at the load of cpu.
*/ */
static inline unsigned long get_low_cpu_load(int cpu) static inline unsigned long get_low_cpu_load(int cpu)
{ {
runqueue_t *rq = cpu_rq(cpu); runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT; unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
return min(rq->cpu_load, load_now); return min(rq->cpu_load, load_now);
} }
@@ -625,7 +626,7 @@ static inline unsigned long get_low_cpu_load(int cpu)
static inline unsigned long get_high_cpu_load(int cpu) static inline unsigned long get_high_cpu_load(int cpu)
{ {
runqueue_t *rq = cpu_rq(cpu); runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT; unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
return max(rq->cpu_load, load_now); return max(rq->cpu_load, load_now);
} }
@@ -633,26 +634,27 @@ static inline unsigned long get_high_cpu_load(int cpu)
#endif #endif
/* /*
* sched_balance_wake can be used with SMT architectures to wake a * wake_idle() is useful especially on SMT architectures to wake a
* task onto an idle sibling if cpu is not idle. Returns cpu if * task onto an idle sibling if we would otherwise wake it onto a
* cpu is idle or no siblings are idle, otherwise returns an idle * busy sibling.
* sibling. *
* Returns the CPU we should wake onto.
*/ */
#if defined(CONFIG_SMP) && defined(ARCH_HAS_SCHED_WAKE_BALANCE) #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int sched_balance_wake(int cpu, task_t *p) static int wake_idle(int cpu, task_t *p)
{ {
cpumask_t tmp; cpumask_t tmp;
struct sched_domain *domain; struct sched_domain *sd;
int i; int i;
if (idle_cpu(cpu)) if (idle_cpu(cpu))
return cpu; return cpu;
domain = cpu_sched_domain(cpu); sd = cpu_sched_domain(cpu);
if (!(domain->flags & SD_FLAG_WAKE)) if (!(sd->flags & SD_WAKE_IDLE))
return cpu; return cpu;
cpus_and(tmp, domain->span, cpu_online_map); cpus_and(tmp, sd->span, cpu_online_map);
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
if (!cpu_isset(i, p->cpus_allowed)) if (!cpu_isset(i, p->cpus_allowed))
continue; continue;
@@ -664,7 +666,7 @@ static int sched_balance_wake(int cpu, task_t *p)
return cpu; return cpu;
} }
#else #else
static inline int sched_balance_wake(int cpu, task_t *p) static inline int wake_idle(int cpu, task_t *p)
{ {
return cpu; return cpu;
} }
@@ -694,8 +696,8 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
unsigned long long now; unsigned long long now;
unsigned long load, this_load; unsigned long load, this_load;
int new_cpu;
struct sched_domain *sd; struct sched_domain *sd;
int new_cpu;
#endif #endif
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
@@ -706,49 +708,44 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
if (p->array) if (p->array)
goto out_running; goto out_running;
this_cpu = smp_processor_id();
cpu = task_cpu(p); cpu = task_cpu(p);
this_cpu = smp_processor_id();
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (cpu == this_cpu || unlikely(cpu_is_offline(this_cpu))) if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
goto out_activate; goto out_activate;
if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed) new_cpu = this_cpu; /* Wake to this CPU if we can */
|| task_running(rq, p)))
goto out_activate; if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
/* Passive load balancing */ /* Passive load balancing */
load = get_low_cpu_load(cpu); load = get_low_cpu_load(cpu);
this_load = get_high_cpu_load(this_cpu) + SCHED_LOAD_SCALE; this_load = get_high_cpu_load(this_cpu) + SCHED_LOAD_SCALE;
if (load > this_load) { if (load > this_load)
new_cpu = sched_balance_wake(this_cpu, p); goto out_set_cpu;
set_task_cpu(p, new_cpu);
goto repeat_lock_task;
}
now = sched_clock(); now = sched_clock();
sd = cpu_sched_domain(this_cpu);
/* /*
* Fast-migrate the task if it's not running or * Migrate the task to the waking domain.
* runnable currently. Do not violate hard affinity. * Do not violate hard affinity.
*/ */
do { for_each_domain(this_cpu, sd) {
if (!(sd->flags & SD_FLAG_FASTMIGRATE)) if (!(sd->flags & SD_WAKE_AFFINE))
break; break;
if (now - p->timestamp < sd->cache_hot_time) if (now - p->timestamp < sd->cache_hot_time)
break; break;
if (cpu_isset(cpu, sd->span)) { if (cpu_isset(cpu, sd->span))
new_cpu = sched_balance_wake(this_cpu, p); goto out_set_cpu;
set_task_cpu(p, new_cpu);
goto repeat_lock_task;
} }
sd = sd->parent;
} while (sd);
new_cpu = sched_balance_wake(cpu, p); new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
if (new_cpu != cpu) { out_set_cpu:
new_cpu = wake_idle(new_cpu, p);
if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
set_task_cpu(p, new_cpu); set_task_cpu(p, new_cpu);
goto repeat_lock_task; goto repeat_lock_task;
} }
@@ -778,6 +775,14 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
p->activated = -1; p->activated = -1;
} }
/*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
* don't trigger a preemption, if the woken up task will run on
* this cpu. (in this case the 'I will reschedule' promise of
* the waker guarantees that the freshly woken up task is going
* to be considered on this CPU.)
*/
if (sync && cpu == this_cpu) { if (sync && cpu == this_cpu) {
__activate_task(p, rq); __activate_task(p, rq);
} else { } else {
@@ -794,6 +799,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
return success; return success;
} }
int fastcall wake_up_process(task_t * p) int fastcall wake_up_process(task_t * p)
{ {
return try_to_wake_up(p, TASK_STOPPED | return try_to_wake_up(p, TASK_STOPPED |
@@ -897,7 +903,7 @@ void fastcall wake_up_forked_process(task_t * p)
list_add_tail(&p->run_list, &current->run_list); list_add_tail(&p->run_list, &current->run_list);
p->array = current->array; p->array = current->array;
p->array->nr_active++; p->array->nr_active++;
nr_running_inc(rq); rq->nr_running++;
} }
task_rq_unlock(rq, &flags); task_rq_unlock(rq, &flags);
} }
@@ -1114,8 +1120,8 @@ enum idle_type
*/ */
static void sched_migrate_task(task_t *p, int dest_cpu) static void sched_migrate_task(task_t *p, int dest_cpu)
{ {
runqueue_t *rq;
migration_req_t req; migration_req_t req;
runqueue_t *rq;
unsigned long flags; unsigned long flags;
lock_cpu_hotplug(); lock_cpu_hotplug();
@@ -1136,6 +1142,7 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
* the migration. * the migration.
*/ */
tlb_migrate_prepare(current->mm); tlb_migrate_prepare(current->mm);
unlock_cpu_hotplug();
return; return;
} }
@@ -1146,9 +1153,9 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
/* /*
* Find the least loaded CPU. Slightly favor the current CPU by * Find the least loaded CPU. Slightly favor the current CPU by
* setting its runqueue length as the minimum to start. * setting its load as the minimum to start.
*/ */
static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain) static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
{ {
cpumask_t tmp; cpumask_t tmp;
int i, min_load, this_cpu, best_cpu; int i, min_load, this_cpu, best_cpu;
@@ -1156,7 +1163,7 @@ static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain)
best_cpu = this_cpu = task_cpu(p); best_cpu = this_cpu = task_cpu(p);
min_load = INT_MAX; min_load = INT_MAX;
cpus_and(tmp, domain->span, cpu_online_map); cpus_and(tmp, sd->span, cpu_online_map);
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
unsigned long load; unsigned long load;
if (i == this_cpu) if (i == this_cpu)
@@ -1168,30 +1175,42 @@ static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain)
best_cpu = i; best_cpu = i;
min_load = load; min_load = load;
} }
} }
return best_cpu; return best_cpu;
} }
/*
* sched_balance_exec(): find the highest-level, exec-balance-capable
* domain and try to migrate the task to the least loaded CPU.
*
* execve() is a valuable balancing opportunity, because at this point
* the task has the smallest effective memory and cache footprint.
*/
void sched_balance_exec(void) void sched_balance_exec(void)
{ {
struct sched_domain *domain = this_sched_domain(); struct sched_domain *sd, *best_sd = NULL;
int new_cpu; int new_cpu;
int this_cpu = smp_processor_id(); int this_cpu = get_cpu();
if (numnodes == 1)
return;
/* Prefer the current CPU if there's only this task running */
if (this_rq()->nr_running <= 1) if (this_rq()->nr_running <= 1)
return; goto out;
while (domain->parent && !(domain->flags & SD_FLAG_EXEC)) for_each_domain(this_cpu, sd) {
domain = domain->parent; if (sd->flags & SD_BALANCE_EXEC)
best_sd = sd;
}
if (domain->flags & SD_FLAG_EXEC) { if (best_sd) {
new_cpu = sched_best_cpu(current, domain); new_cpu = sched_best_cpu(current, best_sd);
if (new_cpu != this_cpu) if (new_cpu != this_cpu) {
put_cpu();
sched_migrate_task(current, new_cpu); sched_migrate_task(current, new_cpu);
return;
} }
}
out:
put_cpu();
} }
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
@@ -1214,14 +1233,14 @@ static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
* pull_task - move a task from a remote runqueue to the local runqueue. * pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked. * Both runqueues must be locked.
*/ */
static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, static inline
task_t *p, runqueue_t *this_rq, prio_array_t *this_array, void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
int this_cpu) runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
{ {
dequeue_task(p, src_array); dequeue_task(p, src_array);
nr_running_dec(src_rq); src_rq->nr_running--;
set_task_cpu(p, this_cpu); set_task_cpu(p, this_cpu);
nr_running_inc(this_rq); this_rq->nr_running++;
enqueue_task(p, this_array); enqueue_task(p, this_array);
p->timestamp = sched_clock() - p->timestamp = sched_clock() -
(src_rq->timestamp_last_tick - p->timestamp); (src_rq->timestamp_last_tick - p->timestamp);
@@ -1238,7 +1257,7 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array,
*/ */
static inline static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
struct sched_domain *domain, enum idle_type idle) struct sched_domain *sd, enum idle_type idle)
{ {
/* /*
* We do not migrate tasks that are: * We do not migrate tasks that are:
@@ -1253,9 +1272,9 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
/* Aggressive migration if we've failed balancing */ /* Aggressive migration if we've failed balancing */
if (idle == NEWLY_IDLE || if (idle == NEWLY_IDLE ||
domain->nr_balance_failed < domain->cache_nice_tries) { sd->nr_balance_failed < sd->cache_nice_tries) {
if ((rq->timestamp_last_tick - p->timestamp) if ((rq->timestamp_last_tick - p->timestamp)
< domain->cache_hot_time) < sd->cache_hot_time)
return 0; return 0;
} }
@@ -1270,7 +1289,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
* Called with both runqueues locked. * Called with both runqueues locked.
*/ */
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
unsigned long max_nr_move, struct sched_domain *domain, unsigned long max_nr_move, struct sched_domain *sd,
enum idle_type idle) enum idle_type idle)
{ {
int idx; int idx;
@@ -1305,7 +1324,7 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
else else
idx = find_next_bit(array->bitmap, MAX_PRIO, idx); idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
if (idx >= MAX_PRIO) { if (idx >= MAX_PRIO) {
if (array == busiest->expired) { if (array == busiest->expired && busiest->active->nr_active) {
array = busiest->active; array = busiest->active;
dst_array = this_rq->active; dst_array = this_rq->active;
goto new_array; goto new_array;
@@ -1320,7 +1339,7 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
curr = curr->prev; curr = curr->prev;
if (!can_migrate_task(tmp, busiest, this_cpu, domain, idle)) { if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
if (curr != head) if (curr != head)
goto skip_queue; goto skip_queue;
idx++; idx++;
@@ -1346,20 +1365,16 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
* moved to restore balance via the imbalance parameter. * moved to restore balance via the imbalance parameter.
*/ */
static struct sched_group * static struct sched_group *
find_busiest_group(struct sched_domain *domain, int this_cpu, find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum idle_type idle) unsigned long *imbalance, enum idle_type idle)
{ {
unsigned long max_load, avg_load, total_load, this_load; struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned int total_pwr; unsigned long max_load, avg_load, total_load, this_load, total_pwr;
struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
max_load = 0; if (unlikely(!group))
this_load = 0; return NULL;
total_load = 0;
total_pwr = 0;
if (group == NULL) max_load = this_load = total_load = total_pwr = 0;
goto out_balanced;
do { do {
cpumask_t tmp; cpumask_t tmp;
@@ -1372,6 +1387,11 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
/* Tally up the load of all CPUs in the group */ /* Tally up the load of all CPUs in the group */
avg_load = 0; avg_load = 0;
cpus_and(tmp, group->cpumask, cpu_online_map); cpus_and(tmp, group->cpumask, cpu_online_map);
if (unlikely(cpus_empty(tmp))) {
WARN_ON(1);
return NULL;
}
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
/* Bias balancing toward cpus of our domain */ /* Bias balancing toward cpus of our domain */
if (local_group) { if (local_group) {
@@ -1390,7 +1410,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
total_pwr += group->cpu_power; total_pwr += group->cpu_power;
/* Adjust by relative CPU power of the group */ /* Adjust by relative CPU power of the group */
avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power; avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
if (local_group) { if (local_group) {
this_load = avg_load; this_load = avg_load;
@@ -1403,7 +1423,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
} }
nextgroup: nextgroup:
group = group->next; group = group->next;
} while (group != domain->groups); } while (group != sd->groups);
if (!busiest || this_load >= max_load) if (!busiest || this_load >= max_load)
goto out_balanced; goto out_balanced;
@@ -1412,7 +1432,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
if (idle == NOT_IDLE) { if (idle == NOT_IDLE) {
if (this_load >= avg_load || if (this_load >= avg_load ||
100*max_load <= domain->imbalance_pct*this_load) 100*max_load <= sd->imbalance_pct*this_load)
goto out_balanced; goto out_balanced;
} }
@@ -1441,7 +1461,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
pwr_now >>= SCHED_LOAD_SHIFT; pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */ /* Amount of load we'd subtract */
tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
@@ -1452,7 +1472,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
/* Amount of load we'd add */ /* Amount of load we'd add */
tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp); pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
pwr_move >>= SCHED_LOAD_SHIFT; pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain another 8th of a CPU worth of throughput */ /* Move if we gain another 8th of a CPU worth of throughput */
if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
@@ -1463,9 +1483,9 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
/* How many tasks to actually move to equalise the imbalance */ /* How many tasks to actually move to equalise the imbalance */
*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
>> SCHED_LOAD_SHIFT; / SCHED_LOAD_SCALE;
/* Get rid of the scaling factor, rounding *up* as we divide */ /* Get rid of the scaling factor, rounding *up* as we divide */
*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT; *imbalance = (*imbalance + SCHED_LOAD_SCALE/2) / SCHED_LOAD_SCALE;
return busiest; return busiest;
@@ -1485,14 +1505,12 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
static runqueue_t *find_busiest_queue(struct sched_group *group) static runqueue_t *find_busiest_queue(struct sched_group *group)
{ {
cpumask_t tmp; cpumask_t tmp;
int i; unsigned long load, max_load = 0;
unsigned long max_load = 0;
runqueue_t *busiest = NULL; runqueue_t *busiest = NULL;
int i;
cpus_and(tmp, group->cpumask, cpu_online_map); cpus_and(tmp, group->cpumask, cpu_online_map);
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
unsigned long load;
load = get_low_cpu_load(i); load = get_low_cpu_load(i);
if (load > max_load) { if (load > max_load) {
@@ -1511,42 +1529,38 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
* Called with this_rq unlocked. * Called with this_rq unlocked.
*/ */
static int load_balance(int this_cpu, runqueue_t *this_rq, static int load_balance(int this_cpu, runqueue_t *this_rq,
struct sched_domain *domain, enum idle_type idle) struct sched_domain *sd, enum idle_type idle)
{ {
struct sched_group *group; struct sched_group *group;
runqueue_t *busiest = NULL; runqueue_t *busiest;
unsigned long imbalance; unsigned long imbalance;
int balanced = 0, failed = 0; int nr_moved;
int nr_moved = 0;
spin_lock(&this_rq->lock); spin_lock(&this_rq->lock);
group = find_busiest_group(domain, this_cpu, &imbalance, idle); group = find_busiest_group(sd, this_cpu, &imbalance, idle);
if (!group) { if (!group)
balanced = 1; goto out_balanced;
goto out;
}
busiest = find_busiest_queue(group); busiest = find_busiest_queue(group);
if (!busiest || busiest == this_rq) { if (!busiest)
balanced = 1; goto out_balanced;
goto out; if (unlikely(busiest == this_rq)) {
WARN_ON(1);
goto out_balanced;
} }
/* Attempt to move tasks */ /* Attempt to move tasks */
double_lock_balance(this_rq, busiest); double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest, nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle);
imbalance, domain, idle);
spin_unlock(&busiest->lock);
out:
spin_unlock(&this_rq->lock); spin_unlock(&this_rq->lock);
spin_unlock(&busiest->lock);
if (!balanced && nr_moved == 0) if (!nr_moved) {
failed = 1; sd->nr_balance_failed++;
if (failed && busiest && if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
domain->nr_balance_failed > domain->cache_nice_tries) {
int wake = 0; int wake = 0;
spin_lock(&busiest->lock); spin_lock(&busiest->lock);
@@ -1558,21 +1572,29 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
spin_unlock(&busiest->lock); spin_unlock(&busiest->lock);
if (wake) if (wake)
wake_up_process(busiest->migration_thread); wake_up_process(busiest->migration_thread);
}
if (failed) /*
domain->nr_balance_failed++; * We've kicked active balancing, reset the failure
else * counter.
domain->nr_balance_failed = 0; */
sd->nr_balance_failed = sd->cache_nice_tries;
if (balanced) {
if (domain->balance_interval < domain->max_interval)
domain->balance_interval *= 2;
} else {
domain->balance_interval = domain->min_interval;
} }
} else
sd->nr_balance_failed = 0;
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
return nr_moved; return nr_moved;
out_balanced:
spin_unlock(&this_rq->lock);
/* tune up the balancing interval */
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
return 0;
} }
/* /*
@@ -1583,14 +1605,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
* this_rq is locked. * this_rq is locked.
*/ */
static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
struct sched_domain *domain) struct sched_domain *sd)
{ {
struct sched_group *group; struct sched_group *group;
runqueue_t *busiest = NULL; runqueue_t *busiest = NULL;
unsigned long imbalance; unsigned long imbalance;
int nr_moved = 0; int nr_moved = 0;
group = find_busiest_group(domain, this_cpu, &imbalance, NEWLY_IDLE); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
if (!group) if (!group)
goto out; goto out;
@@ -1602,7 +1624,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
double_lock_balance(this_rq, busiest); double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest, nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, domain, NEWLY_IDLE); imbalance, sd, NEWLY_IDLE);
spin_unlock(&busiest->lock); spin_unlock(&busiest->lock);
@@ -1616,25 +1638,22 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
*/ */
static inline void idle_balance(int this_cpu, runqueue_t *this_rq) static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
{ {
struct sched_domain *domain = this_sched_domain(); struct sched_domain *sd;
if (unlikely(cpu_is_offline(this_cpu))) if (unlikely(cpu_is_offline(this_cpu)))
return; return;
do { for_each_domain(this_cpu, sd) {
if (unlikely(!domain->groups)) if (unlikely(!sd->groups))
/* hasn't been setup yet */ return;
break;
if (domain->flags & SD_FLAG_NEWIDLE) { if (sd->flags & SD_BALANCE_NEWIDLE) {
if (load_balance_newidle(this_cpu, this_rq, domain)) { if (load_balance_newidle(this_cpu, this_rq, sd)) {
/* We've pulled tasks over so stop searching */ /* We've pulled tasks over so stop searching */
break; break;
} }
} }
}
domain = domain->parent;
} while (domain);
} }
/* /*
@@ -1647,36 +1666,26 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
*/ */
static void active_load_balance(runqueue_t *busiest, int busiest_cpu) static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
{ {
int i; struct sched_domain *sd;
struct sched_domain *sd = cpu_sched_domain(busiest_cpu);
struct sched_group *group, *busy_group; struct sched_group *group, *busy_group;
int i;
if (busiest->nr_running <= 1) if (busiest->nr_running <= 1)
return; return;
/* sd->parent should never cause a NULL dereference, if it did so, for_each_domain(busiest_cpu, sd) {
* then push_cpu was set to a buggy value */ if (cpu_isset(busiest->push_cpu, sd->span))
while (!cpu_isset(busiest->push_cpu, sd->span)) { break;
sd = sd->parent;
if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
WARN_ON(1);
return;
}
} }
if (!sd->groups) { if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
WARN_ON(1); WARN_ON(1);
return; return;
} }
group = sd->groups; group = sd->groups;
while (!cpu_isset(busiest_cpu, group->cpumask)) { while (!cpu_isset(busiest_cpu, group->cpumask))
group = group->next; group = group->next;
if (group == sd->groups) {
WARN_ON(1);
return;
}
}
busy_group = group; busy_group = group;
group = sd->groups; group = sd->groups;
@@ -1719,59 +1728,60 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
/* Don't have all balancing operations going off at once */ /* Don't have all balancing operations going off at once */
#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
enum idle_type idle)
{ {
unsigned long old_load, this_load; unsigned long old_load, this_load;
unsigned long j = jiffies + CPU_OFFSET(this_cpu); unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *domain = this_sched_domain(); struct sched_domain *sd;
if (unlikely(cpu_is_offline(this_cpu))) if (unlikely(cpu_is_offline(this_cpu)))
return; return;
/* Update our load */ /* Update our load */
old_load = this_rq->cpu_load; old_load = this_rq->cpu_load;
this_load = this_rq->nr_running << SCHED_LOAD_SHIFT; this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
this_rq->cpu_load = (old_load + this_load) / 2; this_rq->cpu_load = (old_load + this_load) / 2;
/* Run through all this CPU's domains */ for_each_domain(this_cpu, sd) {
do { unsigned long interval = sd->balance_interval;
unsigned long interval;
if (unlikely(!domain->groups)) if (unlikely(!sd->groups))
break; return;
interval = domain->balance_interval;
if (idle != IDLE) if (idle != IDLE)
interval *= domain->busy_factor; interval *= sd->busy_factor;
/* scale ms to jiffies */ /* scale ms to jiffies */
interval = interval * HZ / 1000; interval = MSEC_TO_JIFFIES(interval);
if (unlikely(interval == 0)) if (unlikely(interval == 0))
interval = 1; interval = 1;
if (j - domain->last_balance >= interval) { if (j - sd->last_balance >= interval) {
if (load_balance(this_cpu, this_rq, domain, idle)) { if (load_balance(this_cpu, this_rq, sd, idle)) {
/* We've pulled tasks over so no longer idle */ /* We've pulled tasks over so no longer idle */
idle = NOT_IDLE; idle = NOT_IDLE;
} }
domain->last_balance += interval; sd->last_balance += interval;
}
} }
domain = domain->parent;
} while (domain);
} }
#else #else
/* /*
* on UP we do not need to balance between CPUs: * on UP we do not need to balance between CPUs:
*/ */
static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
{
}
static inline void idle_balance(int cpu, runqueue_t *rq)
{ {
} }
#endif #endif
#ifdef CONFIG_SCHED_SMT
static inline int wake_priority_sleeper(runqueue_t *rq) static inline int wake_priority_sleeper(runqueue_t *rq)
{ /* {
#ifdef CONFIG_SCHED_SMT
/*
* If an SMT sibling task has been put to sleep for priority * If an SMT sibling task has been put to sleep for priority
* reasons reschedule the idle task to see if it can now run. * reasons reschedule the idle task to see if it can now run.
*/ */
@@ -1779,14 +1789,9 @@ static inline int wake_priority_sleeper(runqueue_t *rq)
resched_task(rq->idle); resched_task(rq->idle);
return 1; return 1;
} }
#endif
return 0; return 0;
} }
#else
static inline int wake_priority_sleeper(runqueue_t *rq)
{
return 0;
}
#endif
DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -1937,10 +1942,8 @@ static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
struct sched_domain *sd = cpu_sched_domain(cpu); struct sched_domain *sd = cpu_sched_domain(cpu);
cpumask_t sibling_map; cpumask_t sibling_map;
if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { if (!(sd->flags & SD_SHARE_CPUPOWER))
/* Not SMT */
return; return;
}
cpus_and(sibling_map, sd->span, cpu_online_map); cpus_and(sibling_map, sd->span, cpu_online_map);
cpu_clear(cpu, sibling_map); cpu_clear(cpu, sibling_map);
@@ -1960,14 +1963,12 @@ static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
{ {
int ret = 0, i;
struct sched_domain *sd = cpu_sched_domain(cpu); struct sched_domain *sd = cpu_sched_domain(cpu);
cpumask_t sibling_map; cpumask_t sibling_map;
int ret = 0, i;
if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { if (!(sd->flags & SD_SHARE_CPUPOWER))
/* Not SMT */
return 0; return 0;
}
cpus_and(sibling_map, sd->span, cpu_online_map); cpus_and(sibling_map, sd->span, cpu_online_map);
cpu_clear(cpu, sibling_map); cpu_clear(cpu, sibling_map);
@@ -1989,7 +1990,7 @@ static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
task_timeslice(p) || rt_task(smt_curr)) && task_timeslice(p) || rt_task(smt_curr)) &&
p->mm && smt_curr->mm && !rt_task(p)) p->mm && smt_curr->mm && !rt_task(p))
ret |= 1; ret = 1;
/* /*
* Reschedule a lower priority task on the SMT sibling, * Reschedule a lower priority task on the SMT sibling,
@@ -2079,9 +2080,7 @@ asmlinkage void __sched schedule(void)
cpu = smp_processor_id(); cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) { if (unlikely(!rq->nr_running)) {
#ifdef CONFIG_SMP
idle_balance(cpu, rq); idle_balance(cpu, rq);
#endif
if (!rq->nr_running) { if (!rq->nr_running) {
next = rq->idle; next = rq->idle;
rq->expired_timestamp = 0; rq->expired_timestamp = 0;
@@ -2627,7 +2626,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
if (task_running(rq, p)) { if (task_running(rq, p)) {
if (p->prio > oldprio) if (p->prio > oldprio)
resched_task(rq->curr); resched_task(rq->curr);
} else if (p->prio < rq->curr->prio) } else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr); resched_task(rq->curr);
} }
@@ -3406,24 +3405,24 @@ static void __init arch_init_sched_domains(void)
for_each_cpu(i) { for_each_cpu(i) {
int node = cpu_to_node(i); int node = cpu_to_node(i);
cpumask_t nodemask = node_to_cpumask(node); cpumask_t nodemask = node_to_cpumask(node);
struct sched_domain *node_domain = &per_cpu(node_domains, i); struct sched_domain *node_sd = &per_cpu(node_domains, i);
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
*node_domain = SD_NODE_INIT; *node_sd = SD_NODE_INIT;
node_domain->span = cpu_possible_map; node_sd->span = cpu_possible_map;
*cpu_domain = SD_CPU_INIT; *cpu_sd = SD_CPU_INIT;
cpus_and(cpu_domain->span, nodemask, cpu_possible_map); cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
cpu_domain->parent = node_domain; cpu_sd->parent = node_sd;
} }
/* Set up groups */ /* Set up groups */
for (i = 0; i < MAX_NUMNODES; i++) { for (i = 0; i < MAX_NUMNODES; i++) {
struct sched_group *first_cpu = NULL, *last_cpu = NULL; cpumask_t tmp = node_to_cpumask(i);
int j;
cpumask_t nodemask; cpumask_t nodemask;
struct sched_group *first_cpu = NULL, *last_cpu = NULL;
struct sched_group *node = &sched_group_nodes[i]; struct sched_group *node = &sched_group_nodes[i];
cpumask_t tmp = node_to_cpumask(i); int j;
cpus_and(nodemask, tmp, cpu_possible_map); cpus_and(nodemask, tmp, cpu_possible_map);
@@ -3458,14 +3457,14 @@ static void __init arch_init_sched_domains(void)
mb(); mb();
for_each_cpu(i) { for_each_cpu(i) {
struct sched_domain *node_domain = &per_cpu(node_domains, i); struct sched_domain *node_sd = &per_cpu(node_domains, i);
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
cpu_domain->groups = &sched_group_cpus[i]; cpu_sd->groups = &sched_group_cpus[i];
} }
} }
#else /* CONFIG_NUMA */ #else /* !CONFIG_NUMA */
static void __init arch_init_sched_domains(void) static void __init arch_init_sched_domains(void)
{ {
int i; int i;
@@ -3473,10 +3472,10 @@ static void __init arch_init_sched_domains(void)
/* Set up domains */ /* Set up domains */
for_each_cpu(i) { for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
*cpu_domain = SD_CPU_INIT; *cpu_sd = SD_CPU_INIT;
cpu_domain->span = cpu_possible_map; cpu_sd->span = cpu_possible_map;
} }
/* Set up CPU groups */ /* Set up CPU groups */
@@ -3497,15 +3496,15 @@ static void __init arch_init_sched_domains(void)
mb(); mb();
for_each_cpu(i) { for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
cpu_domain->groups = &sched_group_cpus[i]; cpu_sd->groups = &sched_group_cpus[i];
} }
} }
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
#endif /* ARCH_HAS_SCHED_DOMAIN */ #endif /* ARCH_HAS_SCHED_DOMAIN */
#undef SCHED_DOMAIN_DEBUG #define SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG
void sched_domain_debug(void) void sched_domain_debug(void)
{ {
@@ -3513,7 +3512,7 @@ void sched_domain_debug(void)
for_each_cpu(i) { for_each_cpu(i) {
int level = 0; int level = 0;
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
printk(KERN_DEBUG "CPU%d: %s\n", printk(KERN_DEBUG "CPU%d: %s\n",
i, (cpu_online(i) ? " online" : "offline")); i, (cpu_online(i) ? " online" : "offline"));
@@ -3521,10 +3520,10 @@ void sched_domain_debug(void)
do { do {
int j; int j;
char str[NR_CPUS]; char str[NR_CPUS];
struct sched_group *group = cpu_domain->groups; struct sched_group *group = cpu_sd->groups;
cpumask_t groupmask, tmp; cpumask_t groupmask, tmp;
cpumask_snprintf(str, NR_CPUS, cpu_domain->span); cpumask_scnprintf(str, NR_CPUS, cpu_sd->span);
cpus_clear(groupmask); cpus_clear(groupmask);
printk(KERN_DEBUG); printk(KERN_DEBUG);
@@ -3532,10 +3531,12 @@ void sched_domain_debug(void)
printk(" "); printk(" ");
printk("domain %d: span %s\n", level, str); printk("domain %d: span %s\n", level, str);
if (!cpu_isset(i, cpu_domain->span)) if (!cpu_isset(i, cpu_sd->span))
printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
if (!cpu_isset(i, group->cpumask)) if (!cpu_isset(i, group->cpumask))
printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
if (!group->cpu_power)
printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
printk(KERN_DEBUG); printk(KERN_DEBUG);
for (j = 0; j < level + 2; j++) for (j = 0; j < level + 2; j++)
@@ -3556,26 +3557,26 @@ void sched_domain_debug(void)
cpus_or(groupmask, groupmask, group->cpumask); cpus_or(groupmask, groupmask, group->cpumask);
cpumask_snprintf(str, NR_CPUS, group->cpumask); cpumask_scnprintf(str, NR_CPUS, group->cpumask);
printk(" %s", str); printk(" %s", str);
group = group->next; group = group->next;
} while (group != cpu_domain->groups); } while (group != cpu_sd->groups);
printk("\n"); printk("\n");
if (!cpus_equal(cpu_domain->span, groupmask)) if (!cpus_equal(cpu_sd->span, groupmask))
printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
level++; level++;
cpu_domain = cpu_domain->parent; cpu_sd = cpu_sd->parent;
if (cpu_domain) { if (cpu_sd) {
cpus_and(tmp, groupmask, cpu_domain->span); cpus_and(tmp, groupmask, cpu_sd->span);
if (!cpus_equal(tmp, groupmask)) if (!cpus_equal(tmp, groupmask))
printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
} }
} while (cpu_domain); } while (cpu_sd);
} }
} }
#else #else
@@ -3635,8 +3636,6 @@ void __init sched_init(void)
set_task_cpu(current, smp_processor_id()); set_task_cpu(current, smp_processor_id());
wake_up_forked_process(current); wake_up_forked_process(current);
init_timers();
/* /*
* The boot idle thread does lazy MMU switching as well: * The boot idle thread does lazy MMU switching as well:
*/ */
...