Commit cca08cd6 authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - introduce and use task_rcu_dereference()/try_get_task_struct() to fix
   and generalize task_struct handling (Oleg Nesterov)

 - do various per entity load tracking (PELT) fixes and optimizations
   (Peter Zijlstra)

 - cputime virt-steal time accounting enhancements/fixes (Wanpeng Li)

 - introduce consolidated cputime output file cpuacct.usage_all and
   related refactorings (Zhao Lei)

 - ... plus misc fixes and enhancements

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/core: Panic on scheduling while atomic bugs if kernel.panic_on_warn is set
  sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats together
  sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show()
  sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums
  sched/fair: Rework throttle_count sync
  sched/core: Fix sched_getaffinity() return value kerneldoc comment
  sched/fair: Reorder cgroup creation code
  sched/fair: Apply more PELT fixes
  sched/fair: Fix PELT integrity for new tasks
  sched/cgroup: Fix cpu_cgroup_fork() handling
  sched/fair: Fix PELT integrity for new groups
  sched/fair: Fix and optimize the fork() path
  sched/cputime: Add steal time support to full dynticks CPU time accounting
  sched/cputime: Fix prev steal time accouting during CPU hotplug
  KVM: Fix steal clock warp during guest CPU hotplug
  sched/debug: Always show 'nr_migrations'
  sched/fair: Use task_rcu_dereference()
  sched/api: Introduce task_rcu_dereference() and try_get_task_struct()
  sched/idle: Optimize the generic idle loop
  sched/fair: Fix the wrong throttled clock time for cfs_rq_clock_task()
parents 7e4dc77b 748c7201
...@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) ...@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void)
if (!has_steal_clock) if (!has_steal_clock)
return; return;
memset(st, 0, sizeof(*st));
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
pr_info("kvm-stealtime: cpu %d, msr %llx\n", pr_info("kvm-stealtime: cpu %d, msr %llx\n",
cpu, (unsigned long long) slow_virt_to_phys(st)); cpu, (unsigned long long) slow_virt_to_phys(st));
......
...@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p); ...@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p);
#define TASK_WAKING 256 #define TASK_WAKING 256
#define TASK_PARKED 512 #define TASK_PARKED 512
#define TASK_NOLOAD 1024 #define TASK_NOLOAD 1024
#define TASK_STATE_MAX 2048 #define TASK_NEW 2048
#define TASK_STATE_MAX 4096
#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
extern char ___assert_task_state[1 - 2*!!( extern char ___assert_task_state[1 - 2*!!(
sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
...@@ -2139,6 +2140,9 @@ static inline void put_task_struct(struct task_struct *t) ...@@ -2139,6 +2140,9 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t); __put_task_struct(t);
} }
struct task_struct *task_rcu_dereference(struct task_struct **ptask);
struct task_struct *try_get_task_struct(struct task_struct **ptask);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void task_cputime(struct task_struct *t, extern void task_cputime(struct task_struct *t,
cputime_t *utime, cputime_t *stime); cputime_t *utime, cputime_t *stime);
......
...@@ -210,6 +210,82 @@ void release_task(struct task_struct *p) ...@@ -210,6 +210,82 @@ void release_task(struct task_struct *p)
goto repeat; goto repeat;
} }
/*
* Note that if this function returns a valid task_struct pointer (!NULL)
* task->usage must remain >0 for the duration of the RCU critical section.
*/
struct task_struct *task_rcu_dereference(struct task_struct **ptask)
{
struct sighand_struct *sighand;
struct task_struct *task;
/*
* We need to verify that release_task() was not called and thus
* delayed_put_task_struct() can't run and drop the last reference
* before rcu_read_unlock(). We check task->sighand != NULL,
* but we can read the already freed and reused memory.
*/
retry:
task = rcu_dereference(*ptask);
if (!task)
return NULL;
probe_kernel_address(&task->sighand, sighand);
/*
* Pairs with atomic_dec_and_test() in put_task_struct(). If this task
* was already freed we can not miss the preceding update of this
* pointer.
*/
smp_rmb();
if (unlikely(task != READ_ONCE(*ptask)))
goto retry;
/*
* We've re-checked that "task == *ptask", now we have two different
* cases:
*
* 1. This is actually the same task/task_struct. In this case
* sighand != NULL tells us it is still alive.
*
* 2. This is another task which got the same memory for task_struct.
* We can't know this of course, and we can not trust
* sighand != NULL.
*
* In this case we actually return a random value, but this is
* correct.
*
* If we return NULL - we can pretend that we actually noticed that
* *ptask was updated when the previous task has exited. Or pretend
* that probe_slab_address(&sighand) reads NULL.
*
* If we return the new task (because sighand is not NULL for any
* reason) - this is fine too. This (new) task can't go away before
* another gp pass.
*
* And note: We could even eliminate the false positive if re-read
* task->sighand once again to avoid the falsely NULL. But this case
* is very unlikely so we don't care.
*/
if (!sighand)
return NULL;
return task;
}
struct task_struct *try_get_task_struct(struct task_struct **ptask)
{
struct task_struct *task;
rcu_read_lock();
task = task_rcu_dereference(ptask);
if (task)
get_task_struct(task);
rcu_read_unlock();
return task;
}
/* /*
* Determine if a process group is "orphaned", according to the POSIX * Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected * definition in 2.2.2.52. Orphaned process groups are not to be affected
......
...@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p); __sched_fork(clone_flags, p);
/* /*
* We mark the process as running here. This guarantees that * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external * nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either. * event cannot wake it up and insert it on the runqueue either.
*/ */
p->state = TASK_RUNNING; p->state = TASK_NEW;
/* /*
* Make sure we do not leak PI boosting priority to the child. * Make sure we do not leak PI boosting priority to the child.
...@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class; p->sched_class = &fair_sched_class;
} }
if (p->sched_class->task_fork) init_entity_runnable_average(&p->se);
p->sched_class->task_fork(p);
/* /*
* The child is not yet in the pid-hash so no cgroup attach races, * The child is not yet in the pid-hash so no cgroup attach races,
...@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU. * Silence PROVE_RCU.
*/ */
raw_spin_lock_irqsave(&p->pi_lock, flags); raw_spin_lock_irqsave(&p->pi_lock, flags);
set_task_cpu(p, cpu); /*
* We're setting the cpu for the first time, we don't migrate,
* so use __set_task_cpu().
*/
__set_task_cpu(p, cpu);
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO #ifdef CONFIG_SCHED_INFO
...@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p) ...@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf; struct rq_flags rf;
struct rq *rq; struct rq *rq;
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags); raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* /*
* Fork balancing, do it here and not earlier because: * Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path * - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug * - any previously selected cpu might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/ */
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif #endif
rq = __task_rq_lock(p, &rf); rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se); post_init_entity_util_avg(&p->se);
...@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) ...@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n"); pr_cont("\n");
} }
#endif #endif
if (panic_on_warn)
panic("scheduling while atomic\n");
dump_stack(); dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
} }
...@@ -4752,7 +4762,8 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) ...@@ -4752,7 +4762,8 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
* @len: length in bytes of the bitmask pointed to by user_mask_ptr * @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask * @user_mask_ptr: user-space pointer to hold the current cpu mask
* *
* Return: 0 on success. An error code otherwise. * Return: size of CPU mask copied to user_mask_ptr on success. An
* error code otherwise.
*/ */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr) unsigned long __user *, user_mask_ptr)
...@@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) ...@@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update; rq->calc_load_update = calc_load_update;
account_reset_rq(rq);
update_max_interval(); update_max_interval();
} }
...@@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) ...@@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
INIT_LIST_HEAD(&tg->children); INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children); list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags); spin_unlock_irqrestore(&task_group_lock, flags);
online_fair_sched_group(tg);
} }
/* rcu callback to free various structures associated with a task group */ /* rcu callback to free various structures associated with a task group */
...@@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg) ...@@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags); spin_unlock_irqrestore(&task_group_lock, flags);
} }
/* change task's runqueue when it moves between groups. static void sched_change_group(struct task_struct *tsk, int type)
* The caller of this function should have put the task in its new group
* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
* reflect its new group.
*/
void sched_move_task(struct task_struct *tsk)
{ {
struct task_group *tg; struct task_group *tg;
int queued, running;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
/* /*
* All callers are synchronized by task_rq_lock(); we do not use RCU * All callers are synchronized by task_rq_lock(); we do not use RCU
...@@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk) ...@@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg; tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group) if (tsk->sched_class->task_change_group)
tsk->sched_class->task_move_group(tsk); tsk->sched_class->task_change_group(tsk, type);
else else
#endif #endif
set_task_rq(tsk, task_cpu(tsk)); set_task_rq(tsk, task_cpu(tsk));
}
/*
* Change task's runqueue when it moves between groups.
*
* The caller of this function should have put the task in its new group by
* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
* its new group.
*/
void sched_move_task(struct task_struct *tsk)
{
int queued, running;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running)) if (unlikely(running))
tsk->sched_class->set_curr_task(rq); tsk->sched_class->set_curr_task(rq);
...@@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ...@@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg); sched_free_group(tg);
} }
/*
* This is called before wake_up_new_task(), therefore we really only
* have to set its group bits, all the other stuff does not apply.
*/
static void cpu_cgroup_fork(struct task_struct *task) static void cpu_cgroup_fork(struct task_struct *task)
{ {
sched_move_task(task); struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(task, &rf);
sched_change_group(task, TASK_SET_GROUP);
task_rq_unlock(rq, task, &rf);
} }
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{ {
struct task_struct *task; struct task_struct *task;
struct cgroup_subsys_state *css; struct cgroup_subsys_state *css;
int ret = 0;
cgroup_taskset_for_each(task, css, tset) { cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
...@@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ...@@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class) if (task->sched_class != &fair_sched_class)
return -EINVAL; return -EINVAL;
#endif #endif
/*
* Serialize against wake_up_new_task() such that if its
* running, we're sure to observe its full state.
*/
raw_spin_lock_irq(&task->pi_lock);
/*
* Avoid calling sched_move_task() before wake_up_new_task()
* has happened. This would lead to problems with PELT, due to
* move wanting to detach+attach while we're not attached yet.
*/
if (task->state == TASK_NEW)
ret = -EINVAL;
raw_spin_unlock_irq(&task->pi_lock);
if (ret)
break;
} }
return 0; return ret;
} }
static void cpu_cgroup_attach(struct cgroup_taskset *tset) static void cpu_cgroup_attach(struct cgroup_taskset *tset)
......
...@@ -25,15 +25,13 @@ enum cpuacct_stat_index { ...@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
CPUACCT_STAT_NSTATS, CPUACCT_STAT_NSTATS,
}; };
enum cpuacct_usage_index { static const char * const cpuacct_stat_desc[] = {
CPUACCT_USAGE_USER, /* ... user mode */ [CPUACCT_STAT_USER] = "user",
CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ [CPUACCT_STAT_SYSTEM] = "system",
CPUACCT_USAGE_NRUSAGE,
}; };
struct cpuacct_usage { struct cpuacct_usage {
u64 usages[CPUACCT_USAGE_NRUSAGE]; u64 usages[CPUACCT_STAT_NSTATS];
}; };
/* track cpu usage of a group of tasks and its child groups */ /* track cpu usage of a group of tasks and its child groups */
...@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) ...@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
} }
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_usage_index index) enum cpuacct_stat_index index)
{ {
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data; u64 data;
/* /*
* We allow index == CPUACCT_USAGE_NRUSAGE here to read * We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of suages. * the sum of suages.
*/ */
BUG_ON(index > CPUACCT_USAGE_NRUSAGE); BUG_ON(index > CPUACCT_STAT_NSTATS);
#ifndef CONFIG_64BIT #ifndef CONFIG_64BIT
/* /*
...@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, ...@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_lock_irq(&cpu_rq(cpu)->lock); raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif #endif
if (index == CPUACCT_USAGE_NRUSAGE) { if (index == CPUACCT_STAT_NSTATS) {
int i = 0; int i = 0;
data = 0; data = 0;
for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
data += cpuusage->usages[i]; data += cpuusage->usages[i];
} else { } else {
data = cpuusage->usages[index]; data = cpuusage->usages[index];
...@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) ...@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
raw_spin_lock_irq(&cpu_rq(cpu)->lock); raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif #endif
for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val; cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT #ifndef CONFIG_64BIT
...@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) ...@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/* return total cpu usage (in nanoseconds) of a group */ /* return total cpu usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 __cpuusage_read(struct cgroup_subsys_state *css,
enum cpuacct_usage_index index) enum cpuacct_stat_index index)
{ {
struct cpuacct *ca = css_ca(css); struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0; u64 totalcpuusage = 0;
...@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, ...@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
static u64 cpuusage_user_read(struct cgroup_subsys_state *css, static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
struct cftype *cft) struct cftype *cft)
{ {
return __cpuusage_read(css, CPUACCT_USAGE_USER); return __cpuusage_read(css, CPUACCT_STAT_USER);
} }
static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
struct cftype *cft) struct cftype *cft)
{ {
return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
} }
static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{ {
return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
} }
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
...@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, ...@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
} }
static int __cpuacct_percpu_seq_show(struct seq_file *m, static int __cpuacct_percpu_seq_show(struct seq_file *m,
enum cpuacct_usage_index index) enum cpuacct_stat_index index)
{ {
struct cpuacct *ca = css_ca(seq_css(m)); struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu; u64 percpu;
...@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, ...@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
{ {
return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
} }
static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
{ {
return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
} }
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{ {
return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
} }
static const char * const cpuacct_stat_desc[] = { static int cpuacct_all_seq_show(struct seq_file *m, void *V)
[CPUACCT_STAT_USER] = "user", {
[CPUACCT_STAT_SYSTEM] = "system", struct cpuacct *ca = css_ca(seq_css(m));
}; int index;
int cpu;
seq_puts(m, "cpu");
for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
seq_printf(m, " %s", cpuacct_stat_desc[index]);
seq_puts(m, "\n");
for_each_possible_cpu(cpu) {
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
seq_printf(m, "%d", cpu);
for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit
* platforms.
*/
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
seq_printf(m, " %llu", cpuusage->usages[index]);
#ifndef CONFIG_64BIT
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif
}
seq_puts(m, "\n");
}
return 0;
}
static int cpuacct_stats_show(struct seq_file *sf, void *v) static int cpuacct_stats_show(struct seq_file *sf, void *v)
{ {
struct cpuacct *ca = css_ca(seq_css(sf)); struct cpuacct *ca = css_ca(seq_css(sf));
s64 val[CPUACCT_STAT_NSTATS];
int cpu; int cpu;
s64 val = 0; int stat;
memset(val, 0, sizeof(val));
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
val += kcpustat->cpustat[CPUTIME_USER];
val += kcpustat->cpustat[CPUTIME_NICE];
}
val = cputime64_to_clock_t(val);
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
val = 0; val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
for_each_possible_cpu(cpu) { val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
val += kcpustat->cpustat[CPUTIME_SYSTEM]; val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
val += kcpustat->cpustat[CPUTIME_IRQ]; val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
} }
val = cputime64_to_clock_t(val); for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); seq_printf(sf, "%s %lld\n",
cpuacct_stat_desc[stat],
cputime64_to_clock_t(val[stat]));
}
return 0; return 0;
} }
...@@ -301,6 +329,10 @@ static struct cftype files[] = { ...@@ -301,6 +329,10 @@ static struct cftype files[] = {
.name = "usage_percpu_sys", .name = "usage_percpu_sys",
.seq_show = cpuacct_percpu_sys_seq_show, .seq_show = cpuacct_percpu_sys_seq_show,
}, },
{
.name = "usage_all",
.seq_show = cpuacct_all_seq_show,
},
{ {
.name = "stat", .name = "stat",
.seq_show = cpuacct_stats_show, .seq_show = cpuacct_stats_show,
...@@ -316,11 +348,11 @@ static struct cftype files[] = { ...@@ -316,11 +348,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime) void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{ {
struct cpuacct *ca; struct cpuacct *ca;
int index = CPUACCT_USAGE_SYSTEM; int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = task_pt_regs(tsk); struct pt_regs *regs = task_pt_regs(tsk);
if (regs && user_mode(regs)) if (regs && user_mode(regs))
index = CPUACCT_USAGE_USER; index = CPUACCT_STAT_USER;
rcu_read_lock(); rcu_read_lock();
......
...@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime) ...@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime; cpustat[CPUTIME_IDLE] += (__force u64) cputime;
} }
static __always_inline bool steal_account_process_tick(void) static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
{ {
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) { if (static_key_false(&paravirt_steal_enabled)) {
...@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void) ...@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void)
* time in jiffies. Lets cast the result to jiffies * time in jiffies. Lets cast the result to jiffies
* granularity and account the rest on the next rounds. * granularity and account the rest on the next rounds.
*/ */
steal_jiffies = nsecs_to_jiffies(steal); steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
account_steal_time(jiffies_to_cputime(steal_jiffies)); account_steal_time(jiffies_to_cputime(steal_jiffies));
return steal_jiffies; return steal_jiffies;
} }
#endif #endif
return false; return 0;
} }
/* /*
...@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, ...@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
u64 cputime = (__force u64) cputime_one_jiffy; u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat; u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick()) if (steal_account_process_tick(ULONG_MAX))
return; return;
cputime *= ticks; cputime *= ticks;
...@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick) ...@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return; return;
} }
if (steal_account_process_tick()) if (steal_account_process_tick(ULONG_MAX))
return; return;
if (user_tick) if (user_tick)
...@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) ...@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
static cputime_t get_vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk)
{ {
unsigned long now = READ_ONCE(jiffies); unsigned long now = READ_ONCE(jiffies);
unsigned long delta = now - tsk->vtime_snap; unsigned long delta_jiffies, steal_jiffies;
delta_jiffies = now - tsk->vtime_snap;
steal_jiffies = steal_account_process_tick(delta_jiffies);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now; tsk->vtime_snap = now;
return jiffies_to_cputime(delta); return jiffies_to_cputime(delta_jiffies - steal_jiffies);
} }
static void __vtime_account_system(struct task_struct *tsk) static void __vtime_account_system(struct task_struct *tsk)
......
...@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw; nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
P(se.nr_migrations); P(se.nr_migrations);
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) { if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu; u64 avg_atom, avg_per_cpu;
......
This diff is collapsed.
...@@ -201,6 +201,8 @@ static void cpuidle_idle_call(void) ...@@ -201,6 +201,8 @@ static void cpuidle_idle_call(void)
*/ */
static void cpu_idle_loop(void) static void cpu_idle_loop(void)
{ {
int cpu = smp_processor_id();
while (1) { while (1) {
/* /*
* If the arch has a polling bit, we maintain an invariant: * If the arch has a polling bit, we maintain an invariant:
...@@ -219,7 +221,7 @@ static void cpu_idle_loop(void) ...@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
check_pgt_cache(); check_pgt_cache();
rmb(); rmb();
if (cpu_is_offline(smp_processor_id())) { if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead(); cpuhp_report_idle_dead();
arch_cpu_idle_dead(); arch_cpu_idle_dead();
} }
......
...@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); ...@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg); extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu, struct sched_entity *se, int cpu,
...@@ -437,7 +438,7 @@ struct cfs_rq { ...@@ -437,7 +438,7 @@ struct cfs_rq {
u64 throttled_clock, throttled_clock_task; u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time; u64 throttled_clock_task_time;
int throttled, throttle_count, throttle_uptodate; int throttled, throttle_count;
struct list_head throttled_list; struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
...@@ -1246,8 +1247,11 @@ struct sched_class { ...@@ -1246,8 +1247,11 @@ struct sched_class {
void (*update_curr) (struct rq *rq); void (*update_curr) (struct rq *rq);
#define TASK_SET_GROUP 0
#define TASK_MOVE_GROUP 1
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p); void (*task_change_group) (struct task_struct *p, int type);
#endif #endif
}; };
...@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {} ...@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
#else /* arch_scale_freq_capacity */ #else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false) #define arch_scale_freq_invariant() (false)
#endif #endif
static inline void account_reset_rq(struct rq *rq)
{
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
rq->prev_irq_time = 0;
#endif
#ifdef CONFIG_PARAVIRT
rq->prev_steal_time = 0;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
rq->prev_steal_time_rq = 0;
#endif
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment