Commit f57d54ba authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The biggest change affects group scheduling: we now track the runnable
  average on a per-task entity basis, allowing a smoother, exponential
  decay average based load/weight estimation instead of the previous
  binary on-the-runqueue/off-the-runqueue load weight method.

  This will inevitably disturb workloads that were in some sort of
  borderline balancing state or unstable equilibrium, so an eye has to
  be kept on regressions.

  For that reason the new load average is limited to group scheduling
  (shares distribution) for now - which also suffered the most from the
  prior, crude weight calculation and whose scheduling quality gains the
  most from this change - but we plan to extend it to regular SMP
  balancing as well in the future, which should simplify and speed things
  up a bit.

  Other changes involve ongoing preparatory work to extend NOHZ to the
  scheduler as well, eventually allowing completely irq-free user-space
  execution."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled"
  cputime: Comment cputime's adjusting code
  cputime: Consolidate cputime adjustment code
  cputime: Rename thread_group_times to thread_group_cputime_adjusted
  cputime: Move thread_group_cputime() to sched code
  vtime: Warn if irqs aren't disabled on system time accounting APIs
  vtime: No need to disable irqs on vtime_account()
  vtime: Consolidate a bit the ctx switch code
  vtime: Explicitly account pending user time on process tick
  vtime: Remove the underscore prefix invasion
  sched/autogroup: Fix crash on reboot when autogroup is disabled
  cputime: Separate irqtime accounting from generic vtime
  cputime: Specialize irq vtime hooks
  kvm: Directly account vtime to system on guest switch
  vtime: Make vtime_account_system() irqsafe
  vtime: Gather vtime declarations to their own header file
  sched: Describe CFS load-balancer
  sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking
  sched: Make __update_entity_runnable_avg() fast
  sched: Update_cfs_shares at period edge
  ...
parents da830e58 c1ad41f1
......@@ -103,5 +103,7 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
#define cputime64_to_clock_t(__ct) \
cputime_to_clock_t((__force cputime_t)__ct)
extern void arch_vtime_task_switch(struct task_struct *tsk);
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#endif /* __IA64_CPUTIME_H */
......@@ -83,7 +83,7 @@ static struct clocksource *itc_clocksource;
extern cputime_t cycle_to_cputime(u64 cyc);
static void vtime_account_user(struct task_struct *tsk)
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_utime;
struct thread_info *ti = task_thread_info(tsk);
......@@ -100,18 +100,11 @@ static void vtime_account_user(struct task_struct *tsk)
* accumulated times to the current process, and to prepare accounting on
* the next process.
*/
void vtime_task_switch(struct task_struct *prev)
void arch_vtime_task_switch(struct task_struct *prev)
{
struct thread_info *pi = task_thread_info(prev);
struct thread_info *ni = task_thread_info(current);
if (idle_task(smp_processor_id()) != prev)
vtime_account_system(prev);
else
vtime_account_idle(prev);
vtime_account_user(prev);
pi->ac_stamp = ni->ac_stamp;
ni->ac_stime = ni->ac_utime = 0;
}
......@@ -126,6 +119,8 @@ static cputime_t vtime_delta(struct task_struct *tsk)
cputime_t delta_stime;
__u64 now;
WARN_ON_ONCE(!irqs_disabled());
now = ia64_get_itc();
delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp));
......@@ -147,15 +142,6 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(vtime_delta(tsk));
}
/*
* Called from the timer interrupt handler to charge accumulated user time
* to the current process. Must be called with interrupts disabled.
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
vtime_account_user(p);
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
static irqreturn_t
......
......@@ -228,6 +228,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk)
#define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct))
static inline void arch_vtime_task_switch(struct task_struct *tsk) { }
#endif /* __KERNEL__ */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#endif /* __POWERPC_CPUTIME_H */
......@@ -297,6 +297,8 @@ static u64 vtime_delta(struct task_struct *tsk,
u64 now, nowscaled, deltascaled;
u64 udelta, delta, user_scaled;
WARN_ON_ONCE(!irqs_disabled());
now = mftb();
nowscaled = read_spurr(now);
get_paca()->system_time += now - get_paca()->starttime;
......@@ -355,15 +357,15 @@ void vtime_account_idle(struct task_struct *tsk)
}
/*
* Transfer the user and system times accumulated in the paca
* by the exception entry and exit code to the generic process
* user and system time records.
* Transfer the user time accumulated in the paca
* by the exception entry and exit code to the generic
* process user time records.
* Must be called with interrupts disabled.
* Assumes that vtime_account() has been called recently
* (i.e. since the last entry from usermode) so that
* Assumes that vtime_account_system/idle() has been called
* recently (i.e. since the last entry from usermode) so that
* get_paca()->user_time_scaled is up to date.
*/
void account_process_tick(struct task_struct *tsk, int user_tick)
void vtime_account_user(struct task_struct *tsk)
{
cputime_t utime, utimescaled;
......@@ -375,12 +377,6 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
account_user_time(tsk, utime, utimescaled);
}
void vtime_task_switch(struct task_struct *prev)
{
vtime_account(prev);
account_process_tick(prev, 0);
}
#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
#define calc_cputime_factors()
#endif
......
......@@ -14,6 +14,7 @@
#define __ARCH_HAS_VTIME_ACCOUNT
#define __ARCH_HAS_VTIME_TASK_SWITCH
/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
......
......@@ -112,7 +112,12 @@ void vtime_task_switch(struct task_struct *prev)
S390_lowcore.system_timer = ti->system_timer;
}
void account_process_tick(struct task_struct *tsk, int user_tick)
/*
 * On s390, accounting pending user time also implies
 * accounting system time, in order to correctly compute
 * stolen time.
*/
void vtime_account_user(struct task_struct *tsk)
{
if (do_account_vtime(tsk, HARDIRQ_OFFSET))
virt_timer_expire();
......@@ -127,6 +132,8 @@ void vtime_account(struct task_struct *tsk)
struct thread_info *ti = task_thread_info(tsk);
u64 timer, system;
WARN_ON_ONCE(!irqs_disabled());
timer = S390_lowcore.last_update_timer;
S390_lowcore.last_update_timer = get_vtimer();
S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
......@@ -140,6 +147,10 @@ void vtime_account(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(vtime_account);
void vtime_account_system(struct task_struct *tsk)
__attribute__((alias("vtime_account")));
EXPORT_SYMBOL_GPL(vtime_account_system);
void __kprobes vtime_stop_cpu(void)
{
struct s390_idle_data *idle = &__get_cpu_var(s390_idle);
......
......@@ -608,9 +608,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
kvm_s390_deliver_pending_interrupts(vcpu);
vcpu->arch.sie_block->icptcode = 0;
local_irq_disable();
kvm_guest_enter();
local_irq_enable();
VCPU_EVENT(vcpu, 6, "entering sie flags %x",
atomic_read(&vcpu->arch.sie_block->cpuflags));
trace_kvm_s390_sie_enter(vcpu,
......@@ -629,9 +627,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
vcpu->arch.sie_block->icptcode);
trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
local_irq_disable();
kvm_guest_exit();
local_irq_enable();
memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
return rc;
......
......@@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
thread_group_times(task, &utime, &stime);
thread_group_cputime_adjusted(task, &utime, &stime);
gtime += sig->gtime;
}
......@@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
task_times(task, &utime, &stime);
task_cputime_adjusted(task, &utime, &stime);
gtime = task->gtime;
}
......
......@@ -4,6 +4,7 @@
#include <linux/preempt.h>
#include <linux/lockdep.h>
#include <linux/ftrace_irq.h>
#include <linux/vtime.h>
#include <asm/hardirq.h>
/*
......@@ -129,16 +130,6 @@ extern void synchronize_irq(unsigned int irq);
# define synchronize_irq(irq) barrier()
#endif
struct task_struct;
#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
static inline void vtime_account(struct task_struct *tsk)
{
}
#else
extern void vtime_account(struct task_struct *tsk);
#endif
#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
static inline void rcu_nmi_enter(void)
......@@ -162,7 +153,7 @@ extern void rcu_nmi_exit(void);
*/
#define __irq_enter() \
do { \
vtime_account(current); \
vtime_account_irq_enter(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
} while (0)
......@@ -178,7 +169,7 @@ extern void irq_enter(void);
#define __irq_exit() \
do { \
trace_hardirq_exit(); \
vtime_account(current); \
vtime_account_irq_exit(current); \
sub_preempt_count(HARDIRQ_OFFSET); \
} while (0)
......
......@@ -7,6 +7,7 @@
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/vtime.h>
#include <asm/irq.h>
#include <asm/cputime.h>
......@@ -126,16 +127,16 @@ extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t)
extern void account_steal_time(cputime_t);
extern void account_idle_time(cputime_t);
extern void account_process_tick(struct task_struct *, int user);
extern void account_steal_ticks(unsigned long ticks);
extern void account_idle_ticks(unsigned long ticks);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void vtime_task_switch(struct task_struct *prev);
extern void vtime_account_system(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
static inline void account_process_tick(struct task_struct *tsk, int user)
{
vtime_account_user(tsk);
}
#else
static inline void vtime_task_switch(struct task_struct *prev) { }
extern void account_process_tick(struct task_struct *, int user);
#endif
extern void account_steal_ticks(unsigned long ticks);
extern void account_idle_ticks(unsigned long ticks);
#endif /* _LINUX_KERNEL_STAT_H */
......@@ -726,7 +726,11 @@ static inline int kvm_deassign_device(struct kvm *kvm,
static inline void kvm_guest_enter(void)
{
BUG_ON(preemptible());
vtime_account(current);
/*
* This is running in ioctl context so we can avoid
* the call to vtime_account() with its unnecessary idle check.
*/
vtime_account_system_irqsafe(current);
current->flags |= PF_VCPU;
/* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
......@@ -740,7 +744,11 @@ static inline void kvm_guest_enter(void)
static inline void kvm_guest_exit(void)
{
vtime_account(current);
/*
* This is running in ioctl context so we can avoid
* the call to vtime_account() with its unnecessary idle check.
*/
vtime_account_system_irqsafe(current);
current->flags &= ~PF_VCPU;
}
......
......@@ -435,14 +435,29 @@ struct cpu_itimer {
u32 incr_error;
};
/**
 * struct cputime - snapshot of system and user cputime
* @utime: time spent in user mode
* @stime: time spent in system mode
*
* Gathers a generic snapshot of user and system time.
*/
struct cputime {
cputime_t utime;
cputime_t stime;
};
/**
* struct task_cputime - collected CPU time counts
* @utime: time spent in user mode, in &cputime_t units
* @stime: time spent in kernel mode, in &cputime_t units
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
*
* This structure groups together three kinds of CPU time that are
* tracked for threads and thread groups. Most things considering
* This is an extension of struct cputime that includes the total runtime
* spent by the task from the scheduler point of view.
*
* As a result, this structure groups together three kinds of CPU time
* that are tracked for threads and thread groups. Most things considering
* CPU time want to group these counts together and treat all three
* of them in parallel.
*/
......@@ -583,7 +598,7 @@ struct signal_struct {
cputime_t gtime;
cputime_t cgtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
struct cputime prev_cputime;
#endif
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
......@@ -1064,6 +1079,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
......@@ -1098,6 +1114,18 @@ struct load_weight {
unsigned long weight, inv_weight;
};
struct sched_avg {
/*
* These sums represent an infinite geometric series and so are bound
 * above by 1024/(1-y). Thus we only need a u32 to store them for all
* choices of y < 1-2^(-32)*1024.
*/
u32 runnable_avg_sum, runnable_avg_period;
u64 last_runnable_update;
s64 decay_count;
unsigned long load_avg_contrib;
};
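
As a quick sanity check on the bound mentioned in the comment above (an editor-added illustration, not kernel code): with y chosen so that y^32 = 1/2, the series 1024 + 1024*y + 1024*y^2 + ... converges to 1024/(1-y), which is on the order of 4.8e4 and therefore fits comfortably in a u32.

#include <stdio.h>

int main(void)
{
        const double y = 0.97857206;    /* ~ 2^(-1/32), so y^32 ~ 1/2 */
        double sum = 0.0;
        int i;

        /* Accumulate 1024 per period while decaying by y; converges quickly. */
        for (i = 0; i < 10000; i++)
                sum = sum * y + 1024.0;

        printf("series limit ~ %.0f, closed form 1024/(1-y) ~ %.0f\n",
               sum, 1024.0 / (1.0 - y));        /* both ~ 4.8e4, well below 2^32 - 1 */
        return 0;
}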
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
......@@ -1158,6 +1186,15 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
/*
 * Load-tracking depends only on SMP; the FAIR_GROUP_SCHED dependency below may
 * be removed once load-tracking is useful for purposes beyond shares
 * distribution (e.g. load-balancing).
*/
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/* Per-entity load-tracking */
struct sched_avg avg;
#endif
};
struct sched_rt_entity {
......@@ -1321,7 +1358,7 @@ struct task_struct {
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
struct cputime prev_cputime;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
......@@ -1732,8 +1769,8 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t);
}
extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
/*
* Per process flags
......
#ifndef _LINUX_KERNEL_VTIME_H
#define _LINUX_KERNEL_VTIME_H
struct task_struct;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void vtime_task_switch(struct task_struct *prev);
extern void vtime_account_system(struct task_struct *tsk);
extern void vtime_account_system_irqsafe(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
extern void vtime_account_user(struct task_struct *tsk);
extern void vtime_account(struct task_struct *tsk);
#else
static inline void vtime_task_switch(struct task_struct *prev) { }
static inline void vtime_account_system(struct task_struct *tsk) { }
static inline void vtime_account_system_irqsafe(struct task_struct *tsk) { }
static inline void vtime_account(struct task_struct *tsk) { }
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
extern void irqtime_account_irq(struct task_struct *tsk);
#else
static inline void irqtime_account_irq(struct task_struct *tsk) { }
#endif
static inline void vtime_account_irq_enter(struct task_struct *tsk)
{
/*
 * A hardirq can interrupt the idle task at any time, so we need
 * vtime_account(), which performs the idle check under
 * CONFIG_VIRT_CPU_ACCOUNTING.
 * A softirq can also interrupt the idle task directly if it calls
 * local_bh_enable(). Such a case probably doesn't exist, but we never know.
 * Ksoftirqd is not a concern because idle time is flushed on context
 * switch. Softirqs at the end of hardirqs are not a problem either, because
 * idle time has already been flushed at hardirq time.
*/
vtime_account(tsk);
irqtime_account_irq(tsk);
}
static inline void vtime_account_irq_exit(struct task_struct *tsk)
{
/* On hard|softirq exit we always account to hard|softirq cputime */
vtime_account_system(tsk);
irqtime_account_irq(tsk);
}
#endif /* _LINUX_KERNEL_VTIME_H */
......@@ -1186,11 +1186,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
* We use thread_group_times() to get times for the thread
* We use thread_group_cputime_adjusted() to get times for the thread
* group, which consolidates times for all threads in the
* group including the group leader.
*/
thread_group_times(p, &tgutime, &tgstime);
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
......
......@@ -1224,7 +1224,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
p->prev_utime = p->prev_stime = 0;
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
......
......@@ -217,30 +217,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
struct task_struct *t;
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
rcu_read_lock();
/* make sure we can trust tsk->thread_group list */
if (!likely(pid_alive(tsk)))
goto out;
t = tsk;
do {
times->utime += t->utime;
times->stime += t->stime;
times->sum_exec_runtime += task_sched_runtime(t);
} while_each_thread(tsk, t);
out:
rcu_read_unlock();
}
static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
{
if (b->utime > a->utime)
......
......@@ -143,11 +143,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
goto out;
t = p;
do {
sched_move_task(t);
} while_each_thread(p, t);
out:
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
......
......@@ -4,6 +4,11 @@
#include <linux/rwsem.h>
struct autogroup {
/*
 * The reference count is not the number of threads currently attached
 * to this autogroup; it is the number of tasks that could use this
 * autogroup.
*/
struct kref kref;
struct task_group *tg;
struct rw_semaphore lock;
......
......@@ -953,6 +953,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
......@@ -1525,6 +1527,15 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
/*
 * Load-tracking depends only on SMP; the FAIR_GROUP_SCHED dependency below may
 * be removed once load-tracking is useful for purposes beyond shares
 * distribution (e.g. load-balancing).
*/
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
p->se.avg.runnable_avg_period = 0;
p->se.avg.runnable_avg_sum = 0;
#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
......
......@@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
void vtime_account(struct task_struct *curr)
void irqtime_account_irq(struct task_struct *curr)
{
unsigned long flags;
s64 delta;
......@@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr)
irq_time_write_end();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(vtime_account);
EXPORT_SYMBOL_GPL(irqtime_account_irq);
static int irqtime_account_hi_update(void)
{
......@@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void)
return false;
}
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
*/
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
struct task_struct *t;
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
rcu_read_lock();
/* make sure we can trust tsk->thread_group list */
if (!likely(pid_alive(tsk)))
goto out;
t = tsk;
do {
times->utime += t->utime;
times->stime += t->stime;
times->sum_exec_runtime += task_sched_runtime(t);
} while_each_thread(tsk, t);
out:
rcu_read_unlock();
}
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
......@@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks)
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
*ut = p->utime;
*st = p->stime;
}
void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime;
......@@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*st = cputime.stime;
}
void vtime_account_system_irqsafe(struct task_struct *tsk)
{
unsigned long flags;
local_irq_save(flags);
vtime_account_system(tsk);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
vtime_account_idle(prev);
else
vtime_account_system(prev);
vtime_account_user(prev);
arch_vtime_task_switch(prev);
}
#endif
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
......@@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account(struct task_struct *tsk)
{
unsigned long flags;
local_irq_save(flags);
if (in_interrupt() || !is_idle_task(tsk))
vtime_account_system(tsk);
else
vtime_account_idle(tsk);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(vtime_account);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
......@@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
return (__force cputime_t) temp;
}
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
/*
* Adjust tick based cputime random precision against scheduler
* runtime accounting.
*/
static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
cputime_t rtime, utime = p->utime, total = utime + p->stime;
cputime_t rtime, utime, total;
utime = curr->utime;
total = utime + curr->stime;
/*
* Use CFS's precise accounting:
 * Tick based cputime accounting depends on whether the timer happens
 * to interrupt a task's timeslices. Depending on these circumstances,
 * the tick counts may over- or under-estimate the real user and system
 * cputime, with variable precision.
*
* Fix this by scaling these tick based values against the total
* runtime accounted by the CFS scheduler.
*/
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
if (total)
utime = scale_utime(utime, rtime, total);
......@@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
utime = rtime;
/*
* Compare with previous values, to keep monotonicity:
* If the tick based count grows faster than the scheduler one,
* the result of the scaling may go backward.
* Let's enforce monotonicity.
*/
p->prev_utime = max(p->prev_utime, utime);
p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
prev->utime = max(prev->utime, utime);
prev->stime = max(prev->stime, rtime - prev->utime);
*ut = p->prev_utime;
*st = p->prev_stime;
*ut = prev->utime;
*st = prev->stime;
}
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime = {
.utime = p->utime,
.stime = p->stime,
.sum_exec_runtime = p->se.sum_exec_runtime,
};
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
/*
* Must be called with siglock held.
*/
void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct signal_struct *sig = p->signal;
struct task_cputime cputime;
cputime_t rtime, utime, total;
thread_group_cputime(p, &cputime);
total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
if (total)
utime = scale_utime(cputime.utime, rtime, total);
else
utime = rtime;
sig->prev_utime = max(sig->prev_utime, utime);
sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
*ut = sig->prev_utime;
*st = sig->prev_stime;
cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif
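
To make the adjustment above concrete, here is a small editor-added sketch (plain userspace C with hypothetical names, ignoring the overflow handling that scale_utime() does in the kernel): the tick-sampled user time is rescaled to its share of the precise CFS runtime, and the previously reported values act as a floor so the numbers exposed to userspace never go backwards.

#include <stdio.h>

typedef unsigned long long cputime_t;   /* stand-in for the kernel type */

static cputime_t maxc(cputime_t a, cputime_t b) { return a > b ? a : b; }

/* Mirrors the logic of cputime_adjust() above, with plain integers. */
static void toy_cputime_adjust(cputime_t utime, cputime_t stime, cputime_t rtime,
                               cputime_t *prev_utime, cputime_t *prev_stime,
                               cputime_t *ut, cputime_t *st)
{
        cputime_t total = utime + stime;

        if (total)
                utime = utime * rtime / total;  /* user share of precise runtime */
        else
                utime = rtime;

        *prev_utime = maxc(*prev_utime, utime);
        *prev_stime = maxc(*prev_stime, rtime - *prev_utime);

        *ut = *prev_utime;
        *st = *prev_stime;
}

int main(void)
{
        cputime_t prev_u = 0, prev_s = 0, ut, st;

        /* 600 user ticks, 400 system ticks, but CFS accounted 2000 units. */
        toy_cputime_adjust(600, 400, 2000, &prev_u, &prev_s, &ut, &st);
        printf("adjusted utime=%llu stime=%llu\n", ut, st);     /* 1200 / 800 */
        return 0;
}

With 600 user and 400 system ticks against 2000 units of precise runtime, the user share scales to 1200 and the system share to the remaining 800, and subsequent calls can only report values at or above these.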
......@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
if (!se)
return;
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
if (!se) {
struct sched_avg *avg = &cpu_rq(cpu)->avg;
P(avg->runnable_avg_sum);
P(avg->runnable_avg_period);
return;
}
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
......@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.runnable_avg_sum);
P(se->avg.runnable_avg_period);
P(se->avg.load_avg_contrib);
P(se->avg.decay_count);
#endif
#undef PN
#undef P
}
......@@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
SPLIT_NS(cfs_rq->load_avg));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
SPLIT_NS(cfs_rq->load_period));
SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
cfs_rq->load_contribution);
SEQ_printf(m, " .%-30s: %d\n", "load_tg",
atomic_read(&cfs_rq->tg->load_weight));
SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic64_read(&cfs_rq->tg->load_avg));
SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
cfs_rq->tg_runnable_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
atomic_read(&cfs_rq->tg->runnable_avg));
#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
......
......@@ -31,6 +31,11 @@ SCHED_FEAT(LAST_BUDDY, true)
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
* Allow wakeup-time preemption of the current task:
*/
SCHED_FEAT(WAKEUP_PREEMPTION, true)
/*
* Use arch dependent cpu power functions
*/
......
......@@ -112,6 +112,8 @@ struct task_group {
unsigned long shares;
atomic_t load_weight;
atomic64_t load_avg;
atomic_t runnable_avg;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
......@@ -222,22 +224,29 @@ struct cfs_rq {
unsigned int nr_spread_over;
#endif
#ifdef CONFIG_SMP
/*
 * Load-tracking depends only on SMP; the FAIR_GROUP_SCHED dependency below may
 * be removed once load-tracking is useful for purposes beyond shares
 * distribution (e.g. load-balancing).
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
* CFS Load tracking
* Under CFS, load is tracked on a per-entity basis and aggregated up.
* This allows for the description of both thread and group usage (in
* the FAIR_GROUP_SCHED case).
*/
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
u64 runnable_load_avg, blocked_load_avg;
atomic64_t decay_counter, removed_load;
u64 last_decay;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* These always depend on CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
u32 tg_runnable_contrib;
u64 tg_load_contrib;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
/*
* h_load = weight * f(tg)
*
......@@ -245,26 +254,30 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
* Maintaining per-cpu shares distribution for group scheduling
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* load_stamp is the last time we updated the load average
* load_last is the last time we updated the load average and saw load
* load_unacc_exec_time is currently unaccounted execution time
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
u64 load_avg;
u64 load_period;
u64 load_stamp, load_last, load_unacc_exec_time;
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
unsigned long load_contribution;
#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_timestamp;
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
......@@ -467,6 +480,8 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
struct sched_avg avg;
};
static inline int cpu_of(struct rq *rq)
......@@ -1212,4 +1227,3 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
......@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
vtime_account(current);
vtime_account_irq_enter(current);
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
......@@ -272,7 +272,7 @@ asmlinkage void __do_softirq(void)
lockdep_softirq_exit();
vtime_account(current);
vtime_account_irq_exit(current);
__local_bh_enable(SOFTIRQ_OFFSET);
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
......@@ -341,7 +341,7 @@ static inline void invoke_softirq(void)
*/
void irq_exit(void)
{
vtime_account(current);
vtime_account_irq_exit(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
......
......@@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms)
cputime_t tgutime, tgstime, cutime, cstime;
spin_lock_irq(&current->sighand->siglock);
thread_group_times(current, &tgutime, &tgstime);
thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
spin_unlock_irq(&current->sighand->siglock);
......@@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
utime = stime = 0;
if (who == RUSAGE_THREAD) {
task_times(current, &utime, &stime);
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = p->signal->maxrss;
goto out;
......@@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
break;
case RUSAGE_SELF:
thread_group_times(p, &tgutime, &tgstime);
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
utime += tgutime;
stime += tgstime;
r->ru_nvcsw += p->signal->nvcsw;
......