Commit be53f58f authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a
  cputime fix and two cpuacct cleanups"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/cpuacct: Simplify the cpuacct code
  sched/cpuacct: Rename parameter in cpuusage_write() for readability
  sched/fair: Add comments to explain select_idle_sibling()
  sched/fair: Fix fairness issue on migration
  sched/cgroup: Fix/cleanup cgroup teardown/init
  sched/cputime: Fix steal time accounting vs. CPU hotplug
parents 19d6f04c 73e6aafd
@@ -5371,6 +5371,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
     case CPU_UP_PREPARE:
         rq->calc_load_update = calc_load_update;
+        account_reset_rq(rq);
         break;
     case CPU_ONLINE:
@@ -7537,7 +7538,7 @@ void set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
 {
     free_fair_sched_group(tg);
     free_rt_sched_group(tg);
@@ -7563,7 +7564,7 @@ struct task_group *sched_create_group(struct task_group *parent)
     return tg;
 
 err:
-    free_sched_group(tg);
+    sched_free_group(tg);
     return ERR_PTR(-ENOMEM);
 }
@@ -7583,17 +7584,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
 {
     /* now it should be safe to free those cfs_rqs */
-    free_sched_group(container_of(rhp, struct task_group, rcu));
+    sched_free_group(container_of(rhp, struct task_group, rcu));
 }
 
-/* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
     /* wait for possible concurrent references to cfs_rqs complete */
-    call_rcu(&tg->rcu, free_sched_group_rcu);
+    call_rcu(&tg->rcu, sched_free_group_rcu);
 }
 
 void sched_offline_group(struct task_group *tg)
@@ -8052,31 +8052,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
     if (IS_ERR(tg))
         return ERR_PTR(-ENOMEM);
 
+    sched_online_group(tg, parent);
+
     return &tg->css;
 }
 
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
     struct task_group *tg = css_tg(css);
-    struct task_group *parent = css_tg(css->parent);
 
-    if (parent)
-        sched_online_group(tg, parent);
-    return 0;
+    sched_offline_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
     struct task_group *tg = css_tg(css);
 
-    sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
-    struct task_group *tg = css_tg(css);
-
-    sched_offline_group(tg);
+    /*
+     * Relies on the RCU grace period between css_released() and this.
+     */
+    sched_free_group(tg);
 }
 
 static void cpu_cgroup_fork(struct task_struct *task)
@@ -8436,9 +8431,8 @@ static struct cftype cpu_files[] = {
 struct cgroup_subsys cpu_cgrp_subsys = {
     .css_alloc = cpu_cgroup_css_alloc,
+    .css_released = cpu_cgroup_css_released,
     .css_free = cpu_cgroup_css_free,
-    .css_online = cpu_cgroup_css_online,
-    .css_offline = cpu_cgroup_css_offline,
     .fork = cpu_cgroup_fork,
     .can_attach = cpu_cgroup_can_attach,
     .attach = cpu_cgroup_attach,
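The cgroup teardown/init fix changes when the scheduler hooks into the task group lifecycle: the group is fully set up directly in css_alloc(), taken offline in the new css_released() callback, and freed in css_free(). Because the cgroup core guarantees an RCU grace period between css_released() and css_free(), the free path can call sched_free_group() directly instead of deferring through call_rcu(). A minimal userspace model of that ordering, purely illustrative (the stubs and printouts are invented, not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct task_group { int id; };

/* Stand-in for the grace period the cgroup core inserts between
 * css_released() and css_free(); once it has elapsed, no RCU reader
 * can still be walking the group. */
static void grace_period(void) { }

static void css_released(struct task_group *tg)
{
    printf("group %d: offline, no longer visible to new users\n", tg->id);
}

static void css_free(struct task_group *tg)
{
    /* Safe to free directly; the grace period already ran. */
    printf("group %d: free\n", tg->id);
    free(tg);
}

int main(void)
{
    struct task_group *tg = malloc(sizeof(*tg));

    if (!tg)
        return 1;
    tg->id = 1;

    css_released(tg);   /* kernel side: sched_offline_group() */
    grace_period();     /* provided by the cgroup core */
    css_free(tg);       /* kernel side: sched_free_group(), no call_rcu() needed */
    return 0;
}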
...
@@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
 }
 
 static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
-              u64 reset)
+              u64 val)
 {
     struct cpuacct *ca = css_ca(css);
     int err = 0;
     int i;
 
-    if (reset) {
+    /*
+     * Only allow '0' here to do a reset.
+     */
+    if (val) {
         err = -EINVAL;
         goto out;
     }
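The parameter rename makes the interface semantics explicit: writing "0" to cpuacct.usage resets the accumulated usage, and any nonzero value is rejected with -EINVAL (unchanged behaviour, clearer name). A small usage sketch from userspace; the cgroup-v1 mount point and group name are assumptions, adjust them to the actual system:

#include <stdio.h>

int main(void)
{
    /* Typical cgroup-v1 location; adjust to the real mount point. */
    const char *path = "/sys/fs/cgroup/cpuacct/mygroup/cpuacct.usage";
    FILE *f = fopen(path, "w");

    if (!f) {
        perror("fopen");
        return 1;
    }
    /* Only "0" is accepted; any other value fails with EINVAL. */
    if (fputs("0\n", f) == EOF)
        perror("write");
    fclose(f);
    return 0;
}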
@@ -235,23 +238,10 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
     struct cpuacct *ca;
-    int cpu;
-
-    cpu = task_cpu(tsk);
 
     rcu_read_lock();
 
-    ca = task_ca(tsk);
-    while (true) {
-        u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-        *cpuusage += cputime;
-
-        ca = parent_ca(ca);
-        if (!ca)
-            break;
-    }
+    for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+        *this_cpu_ptr(ca->cpuusage) += cputime;
 
     rcu_read_unlock();
 }
@@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  *
  * Note: it's the caller that updates the account of the root cgroup.
  */
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
-    struct kernel_cpustat *kcpustat;
     struct cpuacct *ca;
 
     rcu_read_lock();
-    ca = task_ca(p);
-    while (ca != &root_cpuacct) {
-        kcpustat = this_cpu_ptr(ca->cpustat);
-        kcpustat->cpustat[index] += val;
-        ca = parent_ca(ca);
-    }
+    for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+        this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
     rcu_read_unlock();
 }
...
 #ifdef CONFIG_CGROUP_CPUACCT
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
 #else
@@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 }
 
 static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
+cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }
...
@@ -3181,17 +3181,25 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+    bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+    bool curr = cfs_rq->curr == se;
+
     /*
-     * Update the normalized vruntime before updating min_vruntime
-     * through calling update_curr().
+     * If we're the current task, we must renormalise before calling
+     * update_curr().
      */
-    if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+    if (renorm && curr)
         se->vruntime += cfs_rq->min_vruntime;
 
+    update_curr(cfs_rq);
+
     /*
-     * Update run-time statistics of the 'current'.
+     * Otherwise, renormalise after, such that we're placed at the current
+     * moment in time, instead of some random moment in the past.
      */
-    update_curr(cfs_rq);
+    if (renorm && !curr)
+        se->vruntime += cfs_rq->min_vruntime;
+
     enqueue_entity_load_avg(cfs_rq, se);
     account_entity_enqueue(cfs_rq, se);
     update_cfs_shares(cfs_rq);
@@ -3207,7 +3215,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         update_stats_enqueue(cfs_rq, se);
         check_spread(cfs_rq, se);
     }
-    if (se != cfs_rq->curr)
+    if (!curr)
         __enqueue_entity(cfs_rq, se);
     se->on_rq = 1;
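The fairness fix is about when a migrated (normalised) entity's vruntime gets rebased onto the destination queue's min_vruntime: for an entity that is not current this must happen after update_curr() has advanced min_vruntime, otherwise the entity is placed at a stale point in the past and ends up with an unfair head start. A self-contained toy model of that difference (all values invented, not the kernel's code):

#include <stdio.h>

struct toy_rq { unsigned long long min_vruntime; };
struct toy_se { unsigned long long vruntime; };  /* normalised value carried across the migration */

/* Stand-in for update_curr(): charging the running task pushes min_vruntime forward. */
static void toy_update_curr(struct toy_rq *rq)
{
    rq->min_vruntime += 3000000;    /* say 3ms of progress since the last update */
}

int main(void)
{
    struct toy_rq rq = { .min_vruntime = 100000000 };
    struct toy_se se = { .vruntime = 500000 };

    /* Old order: rebase against the not-yet-updated min_vruntime... */
    unsigned long long placed_stale = se.vruntime + rq.min_vruntime;

    /* ...then update_curr() advances the clock underneath us. */
    toy_update_curr(&rq);

    /* New order for a non-current entity: update_curr() first, rebase after. */
    unsigned long long placed_now = se.vruntime + rq.min_vruntime;

    printf("stale placement: %llu\nfresh placement: %llu\nunfair head start: %llu\n",
           placed_stale, placed_now, placed_now - placed_stale);
    return 0;
}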
@@ -5071,7 +5079,19 @@ static int select_idle_sibling(struct task_struct *p, int target)
         return i;
 
     /*
-     * Otherwise, iterate the domains and find an elegible idle cpu.
+     * Otherwise, iterate the domains and find an eligible idle cpu.
+     *
+     * A completely idle sched group at higher domains is more
+     * desirable than an idle group at a lower level, because lower
+     * domains have smaller groups and usually share hardware
+     * resources which causes tasks to contend on them, e.g. x86
+     * hyperthread siblings in the lowest domain (SMT) can contend
+     * on the shared cpu pipeline.
+     *
+     * However, while we prefer idle groups at higher domains
+     * finding an idle cpu at the lowest domain is still better than
+     * returning 'target', which we've already established, isn't
+     * idle.
      */
     sd = rcu_dereference(per_cpu(sd_llc, target));
     for_each_lower_domain(sd) {
@@ -5081,11 +5101,16 @@ static int select_idle_sibling(struct task_struct *p, int target)
                         tsk_cpus_allowed(p)))
                 goto next;
 
+            /* Ensure the entire group is idle */
             for_each_cpu(i, sched_group_cpus(sg)) {
                 if (i == target || !idle_cpu(i))
                     goto next;
             }
 
+            /*
+             * It doesn't matter which cpu we pick, the
+             * whole group is idle.
+             */
             target = cpumask_first_and(sched_group_cpus(sg),
                     tsk_cpus_allowed(p));
             goto done;
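The new comments spell out the policy: prefer a completely idle sched group, ideally at a higher domain where groups are larger and share less hardware, and once such a group is found any of its CPUs is an equally good pick. A standalone sketch of the "entire group is idle" test on a toy topology; the helper name and the array-based topology are illustrative assumptions, not the kernel's data structures:

#include <stdbool.h>
#include <stdio.h>

/* Returns true when every CPU in the group is idle; 'target' counts as
 * busy because the caller has already established it is not idle. */
static bool group_fully_idle(const bool *cpu_idle, const int *group_cpus,
                             int n, int target)
{
    for (int i = 0; i < n; i++) {
        int cpu = group_cpus[i];

        if (cpu == target || !cpu_idle[cpu])
            return false;
    }
    return true;    /* any CPU in the group is an equally good pick */
}

int main(void)
{
    bool cpu_idle[4] = { false, true, true, true }; /* cpu0 = target, busy */
    int same_core[2] = { 0, 1 };                    /* SMT siblings of target */
    int other_core[2] = { 2, 3 };

    printf("target's own core fully idle: %d\n",
           group_fully_idle(cpu_idle, same_core, 2, 0));
    printf("other core fully idle:        %d\n",
           group_fully_idle(cpu_idle, other_core, 2, 0));
    return 0;
}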
...
@@ -1841,3 +1841,16 @@ static inline void cpufreq_trigger_update(u64 time)
 static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
 static inline void cpufreq_trigger_update(u64 time) {}
 #endif /* CONFIG_CPU_FREQ */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+    rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+    rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+    rq->prev_steal_time_rq = 0;
+#endif
+}
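account_reset_rq() clears the cached per-rq baselines (prev_irq_time, prev_steal_time, prev_steal_time_rq) when a CPU is brought back up (see the CPU_UP_PREPARE hunk above). The accounting paths compute unsigned deltas of the form "counter_now - prev", so a baseline left over from before the CPU went down can end up larger than a fresh reading (for example if the underlying clock restarted) and the subtraction wraps into an enormous bogus value. A toy illustration of that hazard, with invented numbers rather than the kernel's accounting code:

#include <stdio.h>

int main(void)
{
    unsigned long long prev_steal_time = 5000000000ULL; /* baseline cached before the CPU went down */
    unsigned long long counter_now     =   10000000ULL; /* counter reading after the CPU comes back */

    unsigned long long delta = counter_now - prev_steal_time;  /* unsigned math wraps */
    printf("stale baseline, bogus delta: %llu\n", delta);

    prev_steal_time = 0;                                /* what account_reset_rq() does */
    delta = counter_now - prev_steal_time;
    printf("reset baseline, sane delta:  %llu\n", delta);
    return 0;
}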