Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
linux
Commits
291c54ff
Commit
291c54ff
authored
Sep 06, 2008
by
Ingo Molnar
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'sched/cpuset' into sched/urgent
parents
49048622
dfb512ec
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
196 additions
and
137 deletions
+196
-137
include/linux/cpuset.h
include/linux/cpuset.h
+1
-1
kernel/cpuset.c
kernel/cpuset.c
+182
-130
kernel/sched.c
kernel/sched.c
+13
-6
No files found.
include/linux/cpuset.h
View file @
291c54ff
...
...
@@ -160,7 +160,7 @@ static inline int current_cpuset_is_being_rebound(void)
static
inline
void
rebuild_sched_domains
(
void
)
{
partition_sched_domains
(
0
,
NULL
,
NULL
);
partition_sched_domains
(
1
,
NULL
,
NULL
);
}
#endif
/* !CONFIG_CPUSETS */
...
...
kernel/cpuset.c
View file @
291c54ff
...
...
@@ -14,6 +14,8 @@
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* 2006 Rework by Paul Menage to use generic cgroups
* 2008 Rework of the scheduler domains and CPU hotplug handling
* by Max Krasnyansky
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
...
...
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
static
DEFINE_MUTEX
(
callback_mutex
);
/* This is ugly, but preserves the userspace API for existing cpuset
/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead */
* silently switch it to mount "cgroup" instead
*/
static
int
cpuset_get_sb
(
struct
file_system_type
*
fs_type
,
int
flags
,
const
char
*
unused_dev_name
,
void
*
data
,
struct
vfsmount
*
mnt
)
...
...
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
}
/*
* Helper routine for
rebuild
_sched_domains().
* Helper routine for
generate
_sched_domains().
* Do cpusets a, b have overlapping cpus_allowed masks?
*/
static
int
cpusets_overlap
(
struct
cpuset
*
a
,
struct
cpuset
*
b
)
{
return
cpus_intersects
(
a
->
cpus_allowed
,
b
->
cpus_allowed
);
...
...
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
}
/*
* rebuild_sched_domains()
*
* This routine will be called to rebuild the scheduler's dynamic
* sched domains:
* - if the flag 'sched_load_balance' of any cpuset with non-empty
* 'cpus' changes,
* - or if the 'cpus' allowed changes in any cpuset which has that
* flag enabled,
* - or if the 'sched_relax_domain_level' of any cpuset which has
* that flag enabled and with non-empty 'cpus' changes,
* - or if any cpuset with non-empty 'cpus' is removed,
* - or if a cpu gets offlined.
*
* This routine builds a partial partition of the system's CPUs
* (the set of non-overlapping cpumask_t's in the array 'part'
* below), and passes that partial partition to the kernel/sched.c
* partition_sched_domains() routine, which will rebuild the
* schedulers load balancing domains (sched domains) as specified
* by that partial partition. A 'partial partition' is a set of
* non-overlapping subsets whose union is a subset of that set.
* generate_sched_domains()
*
* This function builds a partial partition of the system's CPUs.
* A 'partial partition' is a set of non-overlapping subsets whose
* union is a subset of that set.
* The output of this function needs to be passed to kernel/sched.c
* partition_sched_domains() routine, which will rebuild the scheduler's
* load balancing domains (sched domains) as specified by that partial
* partition.
*
* See "What is sched_load_balance" in Documentation/cpusets.txt
* for a background explanation of this.
...
...
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
* Call with cgroup_mutex held. May take callback_mutex during
* call due to the kfifo_alloc() and kmalloc() calls. May nest
* a call to the get_online_cpus()/put_online_cpus() pair.
* Must not be called holding callback_mutex, because we must not
* call get_online_cpus() while holding callback_mutex. Elsewhere
* the kernel nests callback_mutex inside get_online_cpus() calls.
* So the reverse nesting would risk an ABBA deadlock.
* Must be called with cgroup_lock held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
...
...
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* element of the partition (one sched domain) to be passed to
* partition_sched_domains().
*/
void
rebuild_sched_domains
(
void
)
static
int
generate_sched_domains
(
cpumask_t
**
domains
,
struct
sched_domain_attr
**
attributes
)
{
LIST_HEAD
(
q
);
/* queue of cpusets to be scanned*/
LIST_HEAD
(
q
);
/* queue of cpusets to be scanned
*/
struct
cpuset
*
cp
;
/* scans q */
struct
cpuset
**
csa
;
/* array of all cpuset ptrs */
int
csn
;
/* how many cpuset ptrs in csa so far */
...
...
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
int
ndoms
;
/* number of sched domains in result */
int
nslot
;
/* next empty doms[] cpumask_t slot */
csa
=
NULL
;
ndoms
=
0
;
doms
=
NULL
;
dattr
=
NULL
;
csa
=
NULL
;
/* Special case for the 99% of systems with one, full, sched domain */
if
(
is_sched_load_balance
(
&
top_cpuset
))
{
ndoms
=
1
;
doms
=
kmalloc
(
sizeof
(
cpumask_t
),
GFP_KERNEL
);
if
(
!
doms
)
goto
rebuild
;
goto
done
;
dattr
=
kmalloc
(
sizeof
(
struct
sched_domain_attr
),
GFP_KERNEL
);
if
(
dattr
)
{
*
dattr
=
SD_ATTR_INIT
;
update_domain_attr_tree
(
dattr
,
&
top_cpuset
);
}
*
doms
=
top_cpuset
.
cpus_allowed
;
goto
rebuild
;
ndoms
=
1
;
goto
done
;
}
csa
=
kmalloc
(
number_of_cpusets
*
sizeof
(
cp
),
GFP_KERNEL
);
...
...
@@ -680,61 +669,141 @@ void rebuild_sched_domains(void)
}
}
/* Convert <csn, csa> to <ndoms, doms> */
/*
* Now we know how many domains to create.
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
*/
doms
=
kmalloc
(
ndoms
*
sizeof
(
cpumask_t
),
GFP_KERNEL
);
if
(
!
doms
)
goto
rebuild
;
if
(
!
doms
)
{
ndoms
=
0
;
goto
done
;
}
/*
* The rest of the code, including the scheduler, can deal with
* dattr==NULL case. No need to abort if alloc fails.
*/
dattr
=
kmalloc
(
ndoms
*
sizeof
(
struct
sched_domain_attr
),
GFP_KERNEL
);
for
(
nslot
=
0
,
i
=
0
;
i
<
csn
;
i
++
)
{
struct
cpuset
*
a
=
csa
[
i
];
cpumask_t
*
dp
;
int
apn
=
a
->
pn
;
if
(
apn
>=
0
)
{
cpumask_t
*
dp
=
doms
+
nslot
;
if
(
nslot
==
ndoms
)
{
static
int
warnings
=
10
;
if
(
warnings
)
{
printk
(
KERN_WARNING
"rebuild_sched_domains confused:"
" nslot %d, ndoms %d, csn %d, i %d,"
" apn %d
\n
"
,
nslot
,
ndoms
,
csn
,
i
,
apn
);
warnings
--
;
}
continue
;
if
(
apn
<
0
)
{
/* Skip completed partitions */
continue
;
}
dp
=
doms
+
nslot
;
if
(
nslot
==
ndoms
)
{
static
int
warnings
=
10
;
if
(
warnings
)
{
printk
(
KERN_WARNING
"rebuild_sched_domains confused:"
" nslot %d, ndoms %d, csn %d, i %d,"
" apn %d
\n
"
,
nslot
,
ndoms
,
csn
,
i
,
apn
);
warnings
--
;
}
continue
;
}
cpus_clear
(
*
dp
);
if
(
dattr
)
*
(
dattr
+
nslot
)
=
SD_ATTR_INIT
;
for
(
j
=
i
;
j
<
csn
;
j
++
)
{
struct
cpuset
*
b
=
csa
[
j
];
if
(
apn
==
b
->
pn
)
{
cpus_or
(
*
dp
,
*
dp
,
b
->
cpus_allowed
);
b
->
pn
=
-
1
;
if
(
dattr
)
update_domain_attr_tree
(
dattr
+
nslot
,
b
);
}
cpus_clear
(
*
dp
);
if
(
dattr
)
*
(
dattr
+
nslot
)
=
SD_ATTR_INIT
;
for
(
j
=
i
;
j
<
csn
;
j
++
)
{
struct
cpuset
*
b
=
csa
[
j
];
if
(
apn
==
b
->
pn
)
{
cpus_or
(
*
dp
,
*
dp
,
b
->
cpus_allowed
);
if
(
dattr
)
update_domain_attr_tree
(
dattr
+
nslot
,
b
);
/* Done with this partition */
b
->
pn
=
-
1
;
}
nslot
++
;
}
nslot
++
;
}
BUG_ON
(
nslot
!=
ndoms
);
rebuild:
/* Have scheduler rebuild sched domains */
done:
kfree
(
csa
);
*
domains
=
doms
;
*
attributes
=
dattr
;
return
ndoms
;
}
/*
* Rebuild scheduler domains.
*
* Call with neither cgroup_mutex held nor within get_online_cpus().
* Takes both cgroup_mutex and get_online_cpus().
*
* Cannot be directly called from cpuset code handling changes
* to the cpuset pseudo-filesystem, because it cannot be called
* from code that already holds cgroup_mutex.
*/
static
void
do_rebuild_sched_domains
(
struct
work_struct
*
unused
)
{
struct
sched_domain_attr
*
attr
;
cpumask_t
*
doms
;
int
ndoms
;
get_online_cpus
();
partition_sched_domains
(
ndoms
,
doms
,
dattr
);
/* Generate domain masks and attrs */
cgroup_lock
();
ndoms
=
generate_sched_domains
(
&
doms
,
&
attr
);
cgroup_unlock
();
/* Have scheduler rebuild the domains */
partition_sched_domains
(
ndoms
,
doms
,
attr
);
put_online_cpus
();
}
done:
kfree
(
csa
);
/* Don't kfree(doms) -- partition_sched_domains() does that. */
/* Don't kfree(dattr) -- partition_sched_domains() does that. */
static
DECLARE_WORK
(
rebuild_sched_domains_work
,
do_rebuild_sched_domains
);
/*
* Rebuild scheduler domains, asynchronously via workqueue.
*
* If the flag 'sched_load_balance' of any cpuset with non-empty
* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
* which has that flag enabled, or if any cpuset with a non-empty
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
* The rebuild_sched_domains() and partition_sched_domains()
* routines must nest cgroup_lock() inside get_online_cpus(),
* but such cpuset changes as these must nest that locking the
* other way, holding cgroup_lock() for much of the code.
*
* So in order to avoid an ABBA deadlock, the cpuset code handling
* these user changes delegates the actual sched domain rebuilding
* to a separate workqueue thread, which ends up processing the
* above do_rebuild_sched_domains() function.
*/
static
void
async_rebuild_sched_domains
(
void
)
{
schedule_work
(
&
rebuild_sched_domains_work
);
}
/*
* Accomplishes the same scheduler domain rebuild as the above
* async_rebuild_sched_domains(), however it directly calls the
* rebuild routine synchronously rather than calling it via an
* asynchronous work thread.
*
* This can only be called from code that is not holding
* cgroup_mutex (not nested in a cgroup_lock() call.)
*/
void
rebuild_sched_domains
(
void
)
{
do_rebuild_sched_domains
(
NULL
);
}
/**
...
...
@@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
return
retval
;
if
(
is_load_balanced
)
rebuild_sched_domains
();
async_
rebuild_sched_domains
();
return
0
;
}
...
...
@@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
if
(
val
!=
cs
->
relax_domain_level
)
{
cs
->
relax_domain_level
=
val
;
if
(
!
cpus_empty
(
cs
->
cpus_allowed
)
&&
is_sched_load_balance
(
cs
))
rebuild_sched_domains
();
async_
rebuild_sched_domains
();
}
return
0
;
...
...
@@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock
(
&
callback_mutex
);
if
(
cpus_nonempty
&&
balance_flag_changed
)
rebuild_sched_domains
();
async_
rebuild_sched_domains
();
return
0
;
}
...
...
@@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
default:
BUG
();
}
/* Unreachable but makes gcc happy */
return
0
;
}
static
s64
cpuset_read_s64
(
struct
cgroup
*
cont
,
struct
cftype
*
cft
)
...
...
@@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
default:
BUG
();
}
/* Unreachable but makes gcc happy */
return
0
;
}
...
...
@@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create(
}
/*
* Locking note on the strange update_flag() call below:
*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
* will call rebuild_sched_domains(). The get_online_cpus()
* call in rebuild_sched_domains() must not be made while holding
* callback_mutex. Elsewhere the kernel nests callback_mutex inside
* get_online_cpus() calls. So the reverse nesting would risk an
* ABBA deadlock.
* will call async_rebuild_sched_domains().
*/
static
void
cpuset_destroy
(
struct
cgroup_subsys
*
ss
,
struct
cgroup
*
cont
)
...
...
@@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
struct
cgroup_subsys
cpuset_subsys
=
{
.
name
=
"cpuset"
,
.
create
=
cpuset_create
,
.
destroy
=
cpuset_destroy
,
.
destroy
=
cpuset_destroy
,
.
can_attach
=
cpuset_can_attach
,
.
attach
=
cpuset_attach
,
.
populate
=
cpuset_populate
,
...
...
@@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
}
/*
* If
common_cpu_mem_hotplug_unplug(), below, unplugs
any CPUs
* If
CPU and/or memory hotplug handlers, below, unplug
any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
...
...
@@ -1902,35 +1971,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
}
}
/*
* The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
* cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
* track what's online after any CPU or memory node hotplug or unplug event.
*
* Since there are two callers of this routine, one for CPU hotplug
* events and one for memory node hotplug events, we could have coded
* two separate routines here. We code it as a single common routine
* in order to minimize text size.
*/
static
void
common_cpu_mem_hotplug_unplug
(
int
rebuild_sd
)
{
cgroup_lock
();
top_cpuset
.
cpus_allowed
=
cpu_online_map
;
top_cpuset
.
mems_allowed
=
node_states
[
N_HIGH_MEMORY
];
scan_for_empty_cpusets
(
&
top_cpuset
);
/*
* Scheduler destroys domains on hotplug events.
* Rebuild them based on the current settings.
*/
if
(
rebuild_sd
)
rebuild_sched_domains
();
cgroup_unlock
();
}
/*
* The top_cpuset tracks what CPUs and Memory Nodes are online,
* period. This is necessary in order to make cpusets transparent
...
...
@@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
*
* This routine ensures that top_cpuset.cpus_allowed tracks
* cpu_online_map on each CPU hotplug (cpuhp) event.
*
* Called within get_online_cpus(). Needs to call cgroup_lock()
* before calling generate_sched_domains().
*/
static
int
cpuset_handle_cpuhp
(
struct
notifier_block
*
unused_nb
,
static
int
cpuset_track_online_cpus
(
struct
notifier_block
*
unused_nb
,
unsigned
long
phase
,
void
*
unused_cpu
)
{
struct
sched_domain_attr
*
attr
;
cpumask_t
*
doms
;
int
ndoms
;
switch
(
phase
)
{
case
CPU_UP_CANCELED
:
case
CPU_UP_CANCELED_FROZEN
:
case
CPU_DOWN_FAILED
:
case
CPU_DOWN_FAILED_FROZEN
:
case
CPU_ONLINE
:
case
CPU_ONLINE_FROZEN
:
case
CPU_DEAD
:
case
CPU_DEAD_FROZEN
:
common_cpu_mem_hotplug_unplug
(
1
);
break
;
default:
return
NOTIFY_DONE
;
}
cgroup_lock
();
top_cpuset
.
cpus_allowed
=
cpu_online_map
;
scan_for_empty_cpusets
(
&
top_cpuset
);
ndoms
=
generate_sched_domains
(
&
doms
,
&
attr
);
cgroup_unlock
();
/* Have scheduler rebuild the domains */
partition_sched_domains
(
ndoms
,
doms
,
attr
);
return
NOTIFY_OK
;
}
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
* Call this routine anytime after you change
* node_states[N_HIGH_MEMORY].
* See also the previous routine cpuset_handle_cpuhp().
* Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
* See also the previous routine cpuset_track_online_cpus().
*/
void
cpuset_track_online_nodes
(
void
)
{
common_cpu_mem_hotplug_unplug
(
0
);
cgroup_lock
();
top_cpuset
.
mems_allowed
=
node_states
[
N_HIGH_MEMORY
];
scan_for_empty_cpusets
(
&
top_cpuset
);
cgroup_unlock
();
}
#endif
...
...
@@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void)
top_cpuset
.
cpus_allowed
=
cpu_online_map
;
top_cpuset
.
mems_allowed
=
node_states
[
N_HIGH_MEMORY
];
hotcpu_notifier
(
cpuset_
handle_cpuhp
,
0
);
hotcpu_notifier
(
cpuset_
track_online_cpus
,
0
);
}
/**
...
...
kernel/sched.c
View file @
291c54ff
...
...
@@ -7696,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* and partition_sched_domains() will fallback to the single partition
* 'fallback_doms', it also forces the domains to be rebuilt.
*
* If doms_new==NULL it will be replaced with cpu_online_map.
* ndoms_new==0 is a special case for destroying existing domains.
* It will not create the default domain.
*
* Call with hotplug lock held
*/
void
partition_sched_domains
(
int
ndoms_new
,
cpumask_t
*
doms_new
,
struct
sched_domain_attr
*
dattr_new
)
{
int
i
,
j
;
int
i
,
j
,
n
;
mutex_lock
(
&
sched_domains_mutex
);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl
();
if
(
doms_new
==
NULL
)
ndoms_new
=
0
;
n
=
doms_new
?
ndoms_new
:
0
;
/* Destroy deleted domains */
for
(
i
=
0
;
i
<
ndoms_cur
;
i
++
)
{
for
(
j
=
0
;
j
<
n
doms_new
;
j
++
)
{
for
(
j
=
0
;
j
<
n
;
j
++
)
{
if
(
cpus_equal
(
doms_cur
[
i
],
doms_new
[
j
])
&&
dattrs_equal
(
dattr_cur
,
i
,
dattr_new
,
j
))
goto
match1
;
...
...
@@ -7726,7 +7729,6 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
if
(
doms_new
==
NULL
)
{
ndoms_cur
=
0
;
ndoms_new
=
1
;
doms_new
=
&
fallback_doms
;
cpus_andnot
(
doms_new
[
0
],
cpu_online_map
,
cpu_isolated_map
);
dattr_new
=
NULL
;
...
...
@@ -7763,8 +7765,13 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
int
arch_reinit_sched_domains
(
void
)
{
get_online_cpus
();
/* Destroy domains first to force the rebuild */
partition_sched_domains
(
0
,
NULL
,
NULL
);
rebuild_sched_domains
();
put_online_cpus
();
return
0
;
}
...
...
@@ -7848,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb,
case
CPU_ONLINE_FROZEN
:
case
CPU_DEAD
:
case
CPU_DEAD_FROZEN
:
partition_sched_domains
(
0
,
NULL
,
NULL
);
partition_sched_domains
(
1
,
NULL
,
NULL
);
return
NOTIFY_OK
;
default:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment