Commit 66fef08f, authored Mar 26, 2009 by Ingo Molnar

    Merge branch 'sched/balancing' into sched/core

Parents: b6d98422 b7bb4c9b
Showing 1 changed file with 515 additions and 250 deletions:

kernel/sched.c (+515 -250)
...
...
@@ -3189,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
+
+/********** Helpers for find_busiest_group ************************/
+/**
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *		during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest; /* Busiest group in this sd */
+	struct sched_group *this;  /* Local group in this sd */
+	unsigned long total_load;  /* Total load of all groups in sd */
+	unsigned long total_pwr;   /* Total power of all groups in sd */
+	unsigned long avg_load;	   /* Average load across all groups in sd */
+
+	/** Statistics of this group */
+	unsigned long this_load;
+	unsigned long this_load_per_task;
+	unsigned long this_nr_running;
+
+	/* Statistics of the busiest group */
+	unsigned long max_load;
+	unsigned long busiest_load_per_task;
+	unsigned long busiest_nr_running;
+
+	int group_imb; /* Is there imbalance in this sd */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance; /* Is powersave balance needed for this sd */
+	struct sched_group *group_min; /* Least loaded group in sd */
+	struct sched_group *group_leader; /* Group which relieves group_min */
+	unsigned long min_load_per_task; /* load_per_task in group_min */
+	unsigned long leader_nr_running; /* Nr running of group_leader */
+	unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
- */
+
+/**
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+	unsigned long avg_load; /* Avg load across the CPUs of the group */
+	unsigned long group_load; /* Total load over the CPUs of the group */
+	unsigned long sum_nr_running; /* Nr tasks running in the group */
+	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+	unsigned long group_capacity;
+	int group_imb; /* Is there an imbalance in the group ? */
+};
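
The two statistics structures above split up the bookkeeping that the old monolithic find_busiest_group() kept in a pile of local variables: an sg_lb_stats is filled in per sched_group, and its fields are folded into the domain-wide sd_lb_stats from which the busiest group is chosen. A rough, hedged user-space sketch of that roll-up (plain C with hypothetical sample numbers; it only mirrors the shape of the selection in update_sd_lb_stats(), not the kernel's scaled load arithmetic, and assumes uniform __cpu_power so group_load and avg_load coincide):

#include <stdio.h>

/* Simplified stand-ins for sg_lb_stats / sd_lb_stats (illustration only). */
struct sg_stats { unsigned long group_load, sum_nr_running, group_capacity; };
struct sd_stats { unsigned long total_load, total_pwr, max_load; int busiest; };

int main(void)
{
	/* Hypothetical per-group numbers, one entry per sched_group. */
	struct sg_stats sg[] = { { 1024, 1, 1 }, { 3072, 3, 1 }, { 2048, 2, 1 } };
	struct sd_stats sd = { 0, 0, 0, -1 };

	for (int i = 0; i < 3; i++) {
		sd.total_load += sg[i].group_load;	/* like sds->total_load    */
		sd.total_pwr  += 1024;			/* like group->__cpu_power */
		/* Busiest group: highest load among over-capacity groups. */
		if (sg[i].group_load > sd.max_load &&
		    sg[i].sum_nr_running > sg[i].group_capacity) {
			sd.max_load = sg[i].group_load;
			sd.busiest  = i;
		}
	}
	printf("busiest group: %d, total load: %lu\n", sd.busiest, sd.total_load);
	return 0;
}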
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const struct cpumask *cpus, int *balance)
+static inline unsigned int group_first_cpu(struct sched_group *group)
 {
-	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
-	unsigned long max_pull;
-	unsigned long busiest_load_per_task, busiest_nr_running;
-	unsigned long this_load_per_task, this_nr_running;
-	int load_idx, group_imb = 0;
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	int power_savings_balance = 1;
-	unsigned long leader_nr_running = 0, min_load_per_task = 0;
-	unsigned long min_nr_running = ULONG_MAX;
-	struct sched_group *group_min = NULL, *group_leader = NULL;
-#endif
+	return cpumask_first(sched_group_cpus(group));
+}
-
-	max_load = this_load = total_load = total_pwr = 0;
-	busiest_load_per_task = busiest_nr_running = 0;
-	this_load_per_task = this_nr_running = 0;
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+					enum cpu_idle_type idle)
+{
+	int load_idx;
+
-	if (idle == CPU_NOT_IDLE)
+	switch (idle) {
+	case CPU_NOT_IDLE:
 		load_idx = sd->busy_idx;
-	else if (idle == CPU_NEWLY_IDLE)
+		break;
+	case CPU_NEWLY_IDLE:
 		load_idx = sd->newidle_idx;
-	else
+		break;
+	default:
 		load_idx = sd->idle_idx;
+		break;
+	}
-
-	do {
-		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
-		int local_group;
-		int i;
-		int __group_imb = 0;
-		unsigned int balance_cpu = -1, first_idle_cpu = 0;
-		unsigned long sum_nr_running, sum_weighted_load;
-		unsigned long sum_avg_load_per_task;
-		unsigned long avg_load_per_task;
+
+	return load_idx;
+}
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
-
-		if (local_group)
-			balance_cpu = cpumask_first(sched_group_cpus(group));
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+	struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+	/*
+	 * Busy processors will not participate in power savings
+	 * balance.
+	 */
+	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		sds->power_savings_balance = 0;
+	else {
+		sds->power_savings_balance = 1;
+		sds->min_nr_running = ULONG_MAX;
+		sds->leader_nr_running = 0;
+	}
+}
-		/* Tally up the load of all CPUs in the group */
-		sum_weighted_load = sum_nr_running = avg_load = 0;
-		sum_avg_load_per_task = avg_load_per_task = 0;
+
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ *		load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
-		max_cpu_load = 0;
-		min_cpu_load = ~0UL;
+	if (!sds->power_savings_balance)
+		return;
-
-		for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-			struct rq *rq = cpu_rq(i);
+
+	/*
+	 * If the local group is idle or completely loaded
+	 * no need to do power savings balance at this domain
+	 */
+	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+				!sds->this_nr_running))
+		sds->power_savings_balance = 0;
-
-			if (*sd_idle && rq->nr_running)
-				*sd_idle = 0;
+
+	/*
+	 * If a group is already running at full capacity or idle,
+	 * don't include that group in power savings calculations
+	 */
+	if (!sds->power_savings_balance ||
+		sgs->sum_nr_running >= sgs->group_capacity ||
+		!sgs->sum_nr_running)
+		return;
-
-			/* Bias balancing toward cpus of our domain */
-			if (local_group) {
-				if (idle_cpu(i) && !first_idle_cpu) {
-					first_idle_cpu = 1;
-					balance_cpu = i;
-				}
+
+	/*
+	 * Calculate the group which has the least non-idle load.
+	 * This is the group from where we need to pick up the load
+	 * for saving power
+	 */
+	if ((sgs->sum_nr_running < sds->min_nr_running) ||
+	    (sgs->sum_nr_running == sds->min_nr_running &&
+	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+		sds->group_min = group;
+		sds->min_nr_running = sgs->sum_nr_running;
+		sds->min_load_per_task = sgs->sum_weighted_load /
+					 sgs->sum_nr_running;
+	}
-
-				load = target_load(i, load_idx);
-			} else {
-				load = source_load(i, load_idx);
-				if (load > max_cpu_load)
-					max_cpu_load = load;
-				if (min_cpu_load > load)
-					min_cpu_load = load;
-			}
+
+	/*
+	 * Calculate the group which is almost near its
+	 * capacity but still has some space to pick up some load
+	 * from other group and save more power
+	 */
+	if (sgs->sum_nr_running > sgs->group_capacity - 1)
+		return;
-
-			avg_load += load;
-			sum_nr_running += rq->nr_running;
-			sum_weighted_load += weighted_cpuload(i);
+
+	if (sgs->sum_nr_running > sds->leader_nr_running ||
+	    (sgs->sum_nr_running == sds->leader_nr_running &&
+	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+		sds->group_leader = group;
+		sds->leader_nr_running = sgs->sum_nr_running;
+	}
+}
-
-			sum_avg_load_per_task += cpu_avg_load_per_task(i);
-		}
+
+/**
+ * check_power_save_busiest_group - Check if we have potential to perform
+ *	some power-savings balance. If yes, set the busiest group to be
+ *	the least loaded group in the sched_domain, so that its CPUs can
+ *	be put to idle.
+ *
+ * @sds: Variable containing the statistics of the sched_domain
+ *	under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+					int this_cpu, unsigned long *imbalance)
+{
+	if (!sds->power_savings_balance)
+		return 0;
-
-		/*
-		 * First idle cpu or the first cpu(busiest) in this sched group
-		 * is eligible for doing load balancing at this and above
-		 * domains. In the newly idle case, we will allow all the cpu's
-		 * to do the newly idle load balance.
-		 */
-		if (idle != CPU_NEWLY_IDLE && local_group &&
-		    balance_cpu != this_cpu && balance) {
-			*balance = 0;
-			goto ret;
-		}
+
+	if (sds->this != sds->group_leader ||
+			sds->group_leader == sds->group_min)
+		return 0;
-
-		total_load += avg_load;
-		total_pwr += group->__cpu_power;
+
+	*imbalance = sds->min_load_per_task;
+	sds->busiest = sds->group_min;
-
-		/* Adjust by relative CPU power of the group */
-		avg_load = sg_div_cpu_power(group,
-				avg_load * SCHED_LOAD_SCALE);
+
+	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+			group_first_cpu(sds->group_leader);
+	}
+
+	return 1;
-
-		/*
-		 * Consider the group unbalanced when the imbalance is larger
-		 * than the average weight of two tasks.
-		 *
-		 * APZ: with cgroup the avg task weight can vary wildly and
-		 *      might not be a suitable number - should we keep a
-		 *      normalized nr_running number somewhere that negates
-		 *      the hierarchy?
-		 */
-		avg_load_per_task = sg_div_cpu_power(group,
-				sum_avg_load_per_task * SCHED_LOAD_SCALE);
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+	struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+	return;
+}
+
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+	return;
+}
+
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+					int this_cpu, unsigned long *imbalance)
+{
+	return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
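
The three power-savings helpers above (and their no-op stubs when CONFIG_SCHED_MC/CONFIG_SCHED_SMT are off) implement one idea: track the least loaded non-idle, non-full group (group_min) and the most loaded group that still has a free slot (group_leader), then propose pulling group_min's tasks over so its CPUs can go idle. A hedged, stand-alone toy model of that bookkeeping (hypothetical task counts, not kernel code):

#include <stdio.h>

int main(void)
{
	/* Hypothetical per-group runnable-task counts; capacity 4 tasks each. */
	unsigned long nr_running[] = { 1, 3, 2, 0 };
	unsigned long capacity = 4;
	unsigned long min_nr = (unsigned long)-1, leader_nr = 0;
	int group_min = -1, group_leader = -1;

	for (int g = 0; g < 4; g++) {
		/* Skip idle or full groups, as update_sd_power_savings_stats() does. */
		if (!nr_running[g] || nr_running[g] >= capacity)
			continue;
		if (nr_running[g] < min_nr) {		/* least loaded, non-idle  */
			min_nr = nr_running[g];
			group_min = g;
		}
		if (nr_running[g] <= capacity - 1 &&	/* near capacity, has room */
		    nr_running[g] > leader_nr) {
			leader_nr = nr_running[g];
			group_leader = g;
		}
	}
	/* check_power_save_busiest_group() only acts when the two differ. */
	if (group_min >= 0 && group_leader >= 0 && group_min != group_leader)
		printf("pull %lu task(s) from group %d toward group %d\n",
		       min_nr, group_min, group_leader);
	return 0;
}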
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			int local_group, const struct cpumask *cpus,
+			int *balance, struct sg_lb_stats *sgs)
+{
+	unsigned long load, max_cpu_load, min_cpu_load;
+	int i;
+	unsigned int balance_cpu = -1, first_idle_cpu = 0;
+	unsigned long sum_avg_load_per_task;
+	unsigned long avg_load_per_task;
-		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-			__group_imb = 1;
+
+	if (local_group)
+		balance_cpu = group_first_cpu(group);
-
-		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+	/* Tally up the load of all CPUs in the group */
+	sum_avg_load_per_task = avg_load_per_task = 0;
+	max_cpu_load = 0;
+	min_cpu_load = ~0UL;
+
+	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+		struct rq *rq = cpu_rq(i);
+
+		if (*sd_idle && rq->nr_running)
+			*sd_idle = 0;
+
+		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			this_load = avg_load;
-			this = group;
-			this_nr_running = sum_nr_running;
-			this_load_per_task = sum_weighted_load;
-		} else if (avg_load > max_load &&
-			   (sum_nr_running > group_capacity || __group_imb)) {
-			max_load = avg_load;
-			busiest = group;
-			busiest_nr_running = sum_nr_running;
-			busiest_load_per_task = sum_weighted_load;
-			group_imb = __group_imb;
+			if (idle_cpu(i) && !first_idle_cpu) {
+				first_idle_cpu = 1;
+				balance_cpu = i;
+			}
+
+			load = target_load(i, load_idx);
 		} else {
+			load = source_load(i, load_idx);
+			if (load > max_cpu_load)
+				max_cpu_load = load;
+			if (min_cpu_load > load)
+				min_cpu_load = load;
 		}
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-		/*
-		 * Busy processors will not participate in power savings
-		 * balance.
-		 */
-		if (idle == CPU_NOT_IDLE ||
-				!(sd->flags & SD_POWERSAVINGS_BALANCE))
-			goto group_next;
+
+		sgs->group_load += load;
+		sgs->sum_nr_running += rq->nr_running;
+		sgs->sum_weighted_load += weighted_cpuload(i);
-
-		/*
-		 * If the local group is idle or completely loaded
-		 * no need to do power savings balance at this domain
-		 */
-		if (local_group && (this_nr_running >= group_capacity ||
-				    !this_nr_running))
-			power_savings_balance = 0;
+
+		sum_avg_load_per_task += cpu_avg_load_per_task(i);
+	}
-
-		/*
-		 * If a group is already running at full capacity or idle,
-		 * don't include that group in power savings calculations
-		 */
-		if (!power_savings_balance || sum_nr_running >= group_capacity
-		    || !sum_nr_running)
-			goto group_next;
+
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above
+	 * domains. In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (idle != CPU_NEWLY_IDLE && local_group &&
+	    balance_cpu != this_cpu && balance) {
+		*balance = 0;
+		return;
+	}
-
-		/*
-		 * Calculate the group which has the least non-idle load.
-		 * This is the group from where we need to pick up the load
-		 * for saving power
-		 */
-		if ((sum_nr_running < min_nr_running) ||
-		    (sum_nr_running == min_nr_running &&
-		     cpumask_first(sched_group_cpus(group)) >
-		     cpumask_first(sched_group_cpus(group_min)))) {
-			group_min = group;
-			min_nr_running = sum_nr_running;
-			min_load_per_task = sum_weighted_load /
-						sum_nr_running;
-		}
+
+	/* Adjust by relative CPU power of the group */
+	sgs->avg_load = sg_div_cpu_power(group,
+			sgs->group_load * SCHED_LOAD_SCALE);
-
-		/*
-		 * Calculate the group which is almost near its
-		 * capacity but still has some space to pick up some load
-		 * from other group and save more power
-		 */
-		if (sum_nr_running <= group_capacity - 1) {
-			if (sum_nr_running > leader_nr_running ||
-			    (sum_nr_running == leader_nr_running &&
-			     cpumask_first(sched_group_cpus(group)) <
-			     cpumask_first(sched_group_cpus(group_leader)))) {
-				group_leader = group;
-				leader_nr_running = sum_nr_running;
-			}
-		}
+
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of two tasks.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 *      might not be a suitable number - should we keep a
+	 *      normalized nr_running number somewhere that negates
+	 *      the hierarchy?
+	 */
+	avg_load_per_task = sg_div_cpu_power(group,
+			sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+		sgs->group_imb = 1;
+
+	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+}
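
One detail worth calling out in update_sg_lb_stats() above: the group is flagged as internally imbalanced (sgs->group_imb) when the spread between its most and least loaded CPU exceeds twice the group's average per-task load, which later lets find_busiest_group() pick an over-spread group even if its average load is not the highest. A small runnable arithmetic check with made-up numbers:

#include <stdio.h>

/* Worked example of the group_imb test:
 *	(max_cpu_load - min_cpu_load) > 2 * avg_load_per_task
 * Numbers are hypothetical, in the kernel's fixed-point load units
 * (a nice-0 task contributes 1024). */
int main(void)
{
	unsigned long max_cpu_load = 3072;	/* three tasks stacked on one cpu     */
	unsigned long min_cpu_load = 0;		/* another cpu in the same group idle */
	unsigned long avg_load_per_task = 1024;

	int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;
	printf("group_imb = %d\n", group_imb);	/* 3072 > 2048 -> 1 */
	return 0;
}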
+
+/**
+ * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+			enum cpu_idle_type idle, int *sd_idle,
+			const struct cpumask *cpus, int *balance,
+			struct sd_lb_stats *sds)
+{
+	struct sched_group *group = sd->groups;
+	struct sg_lb_stats sgs;
+	int load_idx;
+
+	init_sd_power_savings_stats(sd, sds, idle);
+	load_idx = get_sd_load_idx(sd, idle);
+
+	do {
+		int local_group;
+
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_cpus(group));
+		memset(&sgs, 0, sizeof(sgs));
+		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+				local_group, cpus, balance, &sgs);
+
+		if (local_group && balance && !(*balance))
+			return;
+
+		sds->total_load += sgs.group_load;
+		sds->total_pwr += group->__cpu_power;
+
+		if (local_group) {
+			sds->this_load = sgs.avg_load;
+			sds->this = group;
+			sds->this_nr_running = sgs.sum_nr_running;
+			sds->this_load_per_task = sgs.sum_weighted_load;
+		} else if (sgs.avg_load > sds->max_load &&
+			   (sgs.sum_nr_running > sgs.group_capacity ||
+				sgs.group_imb)) {
+			sds->max_load = sgs.avg_load;
+			sds->busiest = group;
+			sds->busiest_nr_running = sgs.sum_nr_running;
+			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->group_imb = sgs.group_imb;
+		}
-group_next:
-#endif
+		update_sd_power_savings_stats(group, sds, local_group, &sgs);
 		group = group->next;
 	} while (group != sd->groups);
-
-	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
-		goto out_balanced;
-
-	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+}
-	if (this_load >= avg_load ||
-			100*max_load <= sd->imbalance_pct*this_load)
-		goto out_balanced;
+
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ *			amongst the groups of a sched_domain, during
+ *			load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+				int this_cpu, unsigned long *imbalance)
+{
+	unsigned long tmp, pwr_now = 0, pwr_move = 0;
+	unsigned int imbn = 2;
+
+	if (sds->this_nr_running) {
+		sds->this_load_per_task /= sds->this_nr_running;
+		if (sds->busiest_load_per_task >
+				sds->this_load_per_task)
+			imbn = 1;
+	} else
+		sds->this_load_per_task =
+			cpu_avg_load_per_task(this_cpu);
-
-	busiest_load_per_task /= busiest_nr_running;
-	if (group_imb)
-		busiest_load_per_task = min(busiest_load_per_task, avg_load);
+
+	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+			sds->busiest_load_per_task * imbn) {
+		*imbalance = sds->busiest_load_per_task;
+		return;
+	}
+
 	/*
-	 * We're trying to get all the cpus to the average_load, so we don't
-	 * want to push ourselves above the average load, nor do we wish to
-	 * reduce the max loaded cpu below the average load, as either of these
-	 * actions would just result in more rebalancing later, and ping-pong
-	 * tasks around. Thus we look for the minimum possible imbalance.
-	 * Negative imbalances (*we* are more loaded than anyone else) will
-	 * be counted as no imbalance for these purposes -- we can't fix that
-	 * by pulling tasks to us. Be careful of negative numbers as they'll
-	 * appear as very large values with unsigned longs.
+	 * OK, we don't have enough imbalance to justify moving tasks,
+	 * however we may be able to increase total CPU power used by
+	 * moving them.
 	 */
-	if (max_load <= busiest_load_per_task)
-		goto out_balanced;
+
+	pwr_now += sds->busiest->__cpu_power *
+			min(sds->busiest_load_per_task, sds->max_load);
+	pwr_now += sds->this->__cpu_power *
+			min(sds->this_load_per_task, sds->this_load);
+	pwr_now /= SCHED_LOAD_SCALE;
+
+	/* Amount of load we'd subtract */
+	tmp = sg_div_cpu_power(sds->busiest,
+			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	if (sds->max_load > tmp)
+		pwr_move += sds->busiest->__cpu_power *
+			min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+	/* Amount of load we'd add */
+	if (sds->max_load * sds->busiest->__cpu_power <
+		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+		tmp = sg_div_cpu_power(sds->this,
+			sds->max_load * sds->busiest->__cpu_power);
+	else
+		tmp = sg_div_cpu_power(sds->this,
+			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	pwr_move += sds->this->__cpu_power *
+			min(sds->this_load_per_task, sds->this_load + tmp);
+	pwr_move /= SCHED_LOAD_SCALE;
+
+	/* Move if we gain throughput */
+	if (pwr_move > pwr_now)
+		*imbalance = sds->busiest_load_per_task;
+}
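
fix_small_imbalance() handles the case where the computed imbalance is smaller than one task: it compares the throughput obtained now (pwr_now) with the throughput that would be obtained after moving a single task (pwr_move), both capped by min() so a CPU is never credited for more load than it actually has. Below is a hedged, stand-alone rerun of that arithmetic with made-up numbers (two groups of one CPU each, __cpu_power = SCHED_LOAD_SCALE = 1024, busiest runs two tasks, the local group is idle; the second 'tmp' only models the else branch of the kernel code):

#include <stdio.h>

int main(void)
{
	const unsigned long SCALE = 1024, pwr = 1024;
	unsigned long max_load = 2048, busiest_lpt = 1024;	/* busiest group */
	unsigned long this_load = 0,  this_lpt = 1024;		/* local group   */
	unsigned long tmp, pwr_now = 0, pwr_move = 0;

	pwr_now += pwr * (busiest_lpt < max_load ? busiest_lpt : max_load);
	pwr_now += pwr * (this_lpt < this_load ? this_lpt : this_load);
	pwr_now /= SCALE;					/* -> 1024 */

	tmp = busiest_lpt * SCALE / pwr;			/* load we'd subtract */
	if (max_load > tmp)
		pwr_move += pwr * (busiest_lpt < max_load - tmp ?
				   busiest_lpt : max_load - tmp);
	tmp = busiest_lpt * SCALE / pwr;			/* load we'd add */
	pwr_move += pwr * (this_lpt < this_load + tmp ? this_lpt : this_load + tmp);
	pwr_move /= SCALE;					/* -> 2048 */

	printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
	       pwr_move > pwr_now ? "move one task" : "leave as is");
	return 0;
}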
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *			 groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+		unsigned long *imbalance)
+{
+	unsigned long max_pull;
 	/*
 	 * In the presence of smp nice balancing, certain scenarios can have
 	 * max load less than avg load(as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
-	if (max_load < avg_load) {
+	if (sds->max_load < sds->avg_load) {
 		*imbalance = 0;
-		goto small_imbalance;
+		return fix_small_imbalance(sds, this_cpu, imbalance);
 	}

 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+	max_pull = min(sds->max_load - sds->avg_load,
+			sds->max_load - sds->busiest_load_per_task);

 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * busiest->__cpu_power,
-				(avg_load - this_load) * this->__cpu_power)
+	*imbalance = min(max_pull * sds->busiest->__cpu_power,
+		(sds->avg_load - sds->this_load) * sds->this->__cpu_power)
 			/ SCHED_LOAD_SCALE;

 	/*
...
...
@@ -3437,78 +3670,110 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance < busiest_load_per_task) {
-		unsigned long tmp, pwr_now, pwr_move;
-		unsigned int imbn;
-
-small_imbalance:
-		pwr_move = pwr_now = 0;
-		imbn = 2;
-		if (this_nr_running) {
-			this_load_per_task /= this_nr_running;
-			if (busiest_load_per_task > this_load_per_task)
-				imbn = 1;
-		} else
-			this_load_per_task = cpu_avg_load_per_task(this_cpu);
+	if (*imbalance < sds->busiest_load_per_task)
+		return fix_small_imbalance(sds, this_cpu, imbalance);
-
-		if (max_load - this_load + busiest_load_per_task >=
-					busiest_load_per_task * imbn) {
-			*imbalance = busiest_load_per_task;
-			return busiest;
-		}
+}
+/******* find_busiest_group() helpers end here *********************/
-
-		/*
-		 * OK, we don't have enough imbalance to justify moving tasks,
-		 * however we may be able to increase total CPU power used by
-		 * moving them.
-		 */
+
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *		be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *	is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:	- the busiest group if imbalance exists.
+ *		- If no imbalance and user has opted for power-savings balance,
+ *		   return the least loaded group whose CPUs can be
+ *		   put to idle by rebalancing its tasks onto our group.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+		   unsigned long *imbalance, enum cpu_idle_type idle,
+		   int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+	struct sd_lb_stats sds;
-		pwr_now += busiest->__cpu_power *
-				min(busiest_load_per_task, max_load);
-		pwr_now += this->__cpu_power *
-				min(this_load_per_task, this_load);
-		pwr_now /= SCHED_LOAD_SCALE;
-
-		/* Amount of load we'd subtract */
-		tmp = sg_div_cpu_power(busiest,
-				busiest_load_per_task * SCHED_LOAD_SCALE);
-		if (max_load > tmp)
-			pwr_move += busiest->__cpu_power *
-				min(busiest_load_per_task, max_load - tmp);
-
-		/* Amount of load we'd add */
-		if (max_load * busiest->__cpu_power <
-				busiest_load_per_task * SCHED_LOAD_SCALE)
-			tmp = sg_div_cpu_power(this,
-				max_load * busiest->__cpu_power);
-		else
-			tmp = sg_div_cpu_power(this,
-				busiest_load_per_task * SCHED_LOAD_SCALE);
-		pwr_move += this->__cpu_power *
-				min(this_load_per_task, this_load + tmp);
-		pwr_move /= SCHED_LOAD_SCALE;
+
+	memset(&sds, 0, sizeof(sds));
-
-		/* Move if we gain throughput */
-		if (pwr_move > pwr_now)
-			*imbalance = busiest_load_per_task;
-	}
+	/*
+	 * Compute the various statistics relevant for load balancing at
+	 * this level.
+	 */
+	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+					balance, &sds);
+
+	/* Cases where imbalance does not exist from POV of this_cpu */
+	/* 1) this_cpu is not the appropriate cpu to perform load balancing
+	 *    at this level.
+	 * 2) There is no busy sibling group to pull from.
+	 * 3) This group is the busiest group.
+	 * 4) This group is more busy than the avg busyness at this
+	 *    sched_domain.
+	 * 5) The imbalance is within the specified limit.
+	 * 6) Any rebalance would lead to ping-pong
+	 */
+	if (balance && !(*balance))
+		goto ret;
-
-	return busiest;
+
+	if (!sds.busiest || sds.busiest_nr_running == 0)
+		goto out_balanced;
-
-out_balanced:
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-		goto ret;
+
+	if (sds.this_load >= sds.max_load)
+		goto out_balanced;
-
-	if (this == group_leader && group_leader != group_min) {
-		*imbalance = min_load_per_task;
-		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-				cpumask_first(sched_group_cpus(group_leader));
-		}
-		return group_min;
-	}
-#endif
+
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
+	if (sds.this_load >= sds.avg_load)
+		goto out_balanced;
+
+	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+		goto out_balanced;
+
+	sds.busiest_load_per_task /= sds.busiest_nr_running;
+	if (sds.group_imb)
+		sds.busiest_load_per_task =
+			min(sds.busiest_load_per_task, sds.avg_load);
+
+	/*
+	 * We're trying to get all the cpus to the average_load, so we don't
+	 * want to push ourselves above the average load, nor do we wish to
+	 * reduce the max loaded cpu below the average load, as either of these
+	 * actions would just result in more rebalancing later, and ping-pong
+	 * tasks around. Thus we look for the minimum possible imbalance.
+	 * Negative imbalances (*we* are more loaded than anyone else) will
+	 * be counted as no imbalance for these purposes -- we can't fix that
+	 * by pulling tasks to us. Be careful of negative numbers as they'll
+	 * appear as very large values with unsigned longs.
+	 */
+	if (sds.max_load <= sds.busiest_load_per_task)
+		goto out_balanced;
+
+	/* Looks like there is an imbalance. Compute it */
+	calculate_imbalance(&sds, this_cpu, imbalance);
+	return sds.busiest;
+
+out_balanced:
+	/*
+	 * There is no obvious imbalance. But check if we can do some balancing
+	 * to save power.
+	 */
+	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+		return sds.busiest;
 ret:
 	*imbalance = 0;
 	return NULL;
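
The "imbalance is within the specified limit" case above (check 5 in the list) reduces to the comparison 100 * sds.max_load <= sd->imbalance_pct * sds.this_load. A quick runnable example with hypothetical loads (imbalance_pct is typically around 125 for these domains):

#include <stdio.h>

int main(void)
{
	unsigned long imbalance_pct = 125;
	unsigned long this_load = 2048, max_load = 2304;

	if (100 * max_load <= imbalance_pct * this_load)
		printf("out_balanced: %lu <= %lu\n",
		       100 * max_load, imbalance_pct * this_load);
	else
		printf("imbalanced: %lu > %lu\n",
		       100 * max_load, imbalance_pct * this_load);
	return 0;
}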
...
...