Commit a70a9322 authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched:
  sched: proper prototype for kernel/sched.c:migration_init()
  sched: avoid large irq-latencies in smp-balancing
  sched: fix copy_namespace() <-> sched_fork() dependency in do_fork
  sched: clean up the wakeup preempt check, #2
  sched: clean up the wakeup preempt check
  sched: wakeup preemption fix
  sched: remove PREEMPT_RESTRICT
  sched: turn off PREEMPT_RESTRICT
  KVM: fix !SMP build error
  x86: make nmi_cpu_busy() always defined
  x86: make ipi_handler() always defined
  sched: cleanup, use NSEC_PER_MSEC and NSEC_PER_SEC
  sched: reintroduce SMP tunings again
  sched: restore deterministic CPU accounting on powerpc
  sched: fix delay accounting regression
  sched: reintroduce the sched_min_granularity tunable
  sched: documentation: place_entity() comments
  sched: fix vslice
parents a80b824f e6fe6649
@@ -350,7 +350,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	local_irq_save(flags);
 	account_system_vtime(current);
-	account_process_vtime(current);
+	account_process_tick(current, 0);
 	calculate_steal_time();
 	last = _switch(old_thread, new_thread);
...
@@ -259,7 +259,7 @@ void account_system_vtime(struct task_struct *tsk)
  * user and system time records.
  * Must be called with interrupts disabled.
  */
-void account_process_vtime(struct task_struct *tsk)
+void account_process_tick(struct task_struct *tsk, int user_tick)
 {
 	cputime_t utime, utimescaled;
@@ -274,18 +274,6 @@ void account_process_vtime(struct task_struct *tsk)
 	account_user_time_scaled(tsk, utimescaled);
 }
 
-static void account_process_time(struct pt_regs *regs)
-{
-	int cpu = smp_processor_id();
-
-	account_process_vtime(current);
-	run_local_timers();
-	if (rcu_pending(cpu))
-		rcu_check_callbacks(cpu, user_mode(regs));
-	scheduler_tick();
-	run_posix_cpu_timers(current);
-}
-
 /*
  * Stuff for accounting stolen time.
  */
@@ -375,7 +363,6 @@ static void snapshot_purr(void)
 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
 #define calc_cputime_factors()
-#define account_process_time(regs)	update_process_times(user_mode(regs))
 #define calculate_steal_time()		do { } while (0)
 #endif
@@ -599,16 +586,6 @@ void timer_interrupt(struct pt_regs * regs)
 	get_lppaca()->int_dword.fields.decr_int = 0;
 #endif
 
-	/*
-	 * We cannot disable the decrementer, so in the period
-	 * between this cpu's being marked offline in cpu_online_map
-	 * and calling stop-self, it is taking timer interrupts.
-	 * Avoid calling into the scheduler rebalancing code if this
-	 * is the case.
-	 */
-	if (!cpu_is_offline(cpu))
-		account_process_time(regs);
-
 	if (evt->event_handler)
 		evt->event_handler(evt);
...
@@ -145,12 +145,8 @@ void account_ticks(u64 time)
 	do_timer(ticks);
 #endif
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-	account_tick_vtime(current);
-#else
 	while (ticks--)
 		update_process_times(user_mode(get_irq_regs()));
-#endif
 
 	s390_do_profile();
 }
...
@@ -32,7 +32,7 @@ static DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer);
  * Update process times based on virtual cpu times stored by entry.S
  * to the lowcore fields user_timer, system_timer & steal_clock.
  */
-void account_tick_vtime(struct task_struct *tsk)
+void account_process_tick(struct task_struct *tsk, int user_tick)
 {
 	cputime_t cputime;
 	__u64 timer, clock;
@@ -64,12 +64,6 @@ void account_tick_vtime(struct task_struct *tsk)
 		S390_lowcore.steal_clock -= cputime << 12;
 		account_steal_time(tsk, cputime);
 	}
-
-	run_local_timers();
-	if (rcu_pending(smp_processor_id()))
-		rcu_check_callbacks(smp_processor_id(), rcu_user_flag);
-	scheduler_tick();
-	run_posix_cpu_timers(tsk);
 }
 
 /*
...
@@ -139,13 +139,12 @@ struct set_mtrr_data {
 	mtrr_type	smp_type;
 };
 
-#ifdef CONFIG_SMP
-
 static void ipi_handler(void *info)
 /*  [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
     [RETURNS] Nothing.
 */
 {
+#ifdef CONFIG_SMP
 	struct set_mtrr_data *data = info;
 	unsigned long flags;
@@ -168,9 +167,8 @@ static void ipi_handler(void *info)
 	atomic_dec(&data->count);
 	local_irq_restore(flags);
-}
 #endif
+}
 
 static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
 	return type1 == MTRR_TYPE_UNCACHABLE ||
...
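Instead of compiling ipi_handler() out of UP kernels, only its body is guarded now, so the symbol is always defined and callers need no matching #ifdef; the nmi_cpu_busy() hunk below applies the same idiom. A small standalone illustration of the pattern, with hypothetical names (CONFIG_SMP is defined locally here only so the sketch builds on its own):

#include <stdio.h>

#define CONFIG_SMP 1	/* comment this out to mimic a UP build */

/* The symbol is always defined; only its body is conditional. */
static void example_handler(void *info)
{
#ifdef CONFIG_SMP
	printf("doing the SMP-only work for %p\n", info);
#endif
}

int main(void)
{
	/* Callers never need an #ifdef around the reference. */
	example_handler(NULL);
	return 0;
}

On a UP configuration the function body collapses to nothing, but the reference in the caller still compiles, which is exactly what the "always defined" commits in this merge rely on.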
@@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
 
 static int endflag __initdata = 0;
 
-#ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
  * CPUs during the test make them busy.
  */
 static __init void nmi_cpu_busy(void *data)
 {
+#ifdef CONFIG_SMP
 	local_irq_enable_in_hardirq();
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
@@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data)
 	   care if they get somewhat less cycles. */
 	while (endflag == 0)
 		mb();
-}
 #endif
+}
 
 static int __init check_nmi_watchdog(void)
 {
...
@@ -254,6 +254,7 @@ long io_schedule_timeout(long timeout);
 
 extern void cpu_init (void);
 extern void trap_init(void);
+extern void account_process_tick(struct task_struct *task, int user);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
@@ -862,7 +863,6 @@ struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
 	unsigned int		on_rq;
-	int			peer_preempt;
 
 	u64			exec_start;
 	u64			sum_exec_runtime;
@@ -1460,12 +1460,17 @@ extern void sched_idle_next(void);
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_nr_latency;
+extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_nr_migrate;
+
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos);
 #endif
 
 extern unsigned int sysctl_sched_compat_yield;
@@ -1983,6 +1988,14 @@ static inline void inc_syscw(struct task_struct *tsk)
 }
 #endif
 
+#ifdef CONFIG_SMP
+void migration_init(void);
+#else
+static inline void migration_init(void)
+{
+}
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif
@@ -84,11 +84,12 @@ void smp_prepare_boot_cpu(void);
  *	These macros fold the SMP functionality into a single CPU system
  */
 #define raw_smp_processor_id()			0
-static inline int up_smp_call_function(void)
+static inline int up_smp_call_function(void (*func)(void *), void *info)
 {
 	return 0;
 }
-#define smp_call_function(func,info,retry,wait)	(up_smp_call_function())
+#define smp_call_function(func, info, retry, wait) \
+			(up_smp_call_function(func, info))
 
 #define on_each_cpu(func,info,retry,wait)	\
 	({	\
 		local_irq_disable();	\
@@ -107,6 +108,8 @@ static inline void smp_send_reschedule(int cpu) { }
 		local_irq_enable();	\
 		0;	\
 	})
+#define smp_call_function_mask(mask, func, info, wait) \
+			(up_smp_call_function(func, info))
 
 #endif /* !SMP */
...
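On !SMP builds the stub now swallows the function pointer and its argument, which lets the new smp_call_function_mask() definition reuse it (the piece the KVM !SMP build fix needed) and keeps the callback referenced in the expansion. A self-contained sketch of how a UP caller expands, with hypothetical names and the two macros copied from the hunk above:

#include <stdio.h>

/* UP stubs as in the include/linux/smp.h hunk above, reproduced so this
 * sketch compiles on its own. */
static inline int up_smp_call_function(void (*func)(void *), void *info)
{
	return 0;
}
#define smp_call_function(func, info, retry, wait) \
			(up_smp_call_function(func, info))
#define smp_call_function_mask(mask, func, info, wait) \
			(up_smp_call_function(func, info))

/* Hypothetical per-CPU callback; never invoked on UP because
 * up_smp_call_function() just returns 0. */
static void flush_my_state(void *info)
{
	printf("flushing %p\n", info);
}

int main(void)
{
	/* Both forms compile on UP and evaluate to 0; flush_my_state stays
	 * referenced, so it does not look like dead code to the compiler. */
	int ret  = smp_call_function(flush_my_state, NULL, 0, 1);
	int ret2 = smp_call_function_mask(0 /* mask ignored on UP */,
					  flush_my_state, NULL, 0);

	printf("%d %d\n", ret, ret2);
	return 0;
}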
@@ -56,6 +56,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
+#include <linux/sched.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -747,11 +748,8 @@ __setup("nosoftlockup", nosoftlockup_setup);
 static void __init do_pre_smp_initcalls(void)
 {
 	extern int spawn_ksoftirqd(void);
-#ifdef CONFIG_SMP
-	extern int migration_init(void);
 
 	migration_init();
-#endif
 	spawn_ksoftirqd();
 	if (!nosoftlockup)
 		spawn_softlockup_task();
...
@@ -1123,6 +1123,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
 
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
+
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup_policy;
 	if ((retval = audit_alloc(p)))
@@ -1212,9 +1215,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
-	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
-
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
 	 * on the tasklist. */
...
@@ -75,7 +75,7 @@
  */
 unsigned long long __attribute__((weak)) sched_clock(void)
 {
-	return (unsigned long long)jiffies * (1000000000 / HZ);
+	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 }
 
 /*
@@ -99,8 +99,8 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 /*
  * Some helpers for converting nanosecond timing to jiffy resolution
  */
-#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (1000000000 / HZ))
-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (NSEC_PER_SEC / HZ))
 
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
@@ -460,7 +460,6 @@ enum {
 	SCHED_FEAT_TREE_AVG		= 4,
 	SCHED_FEAT_APPROX_AVG		= 8,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 16,
-	SCHED_FEAT_PREEMPT_RESTRICT	= 32,
 };
 
 const_debug unsigned int sysctl_sched_features =
@@ -468,11 +467,16 @@ const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_START_DEBIT		* 1 |
 		SCHED_FEAT_TREE_AVG		* 0 |
 		SCHED_FEAT_APPROX_AVG		* 0 |
-		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
-		SCHED_FEAT_PREEMPT_RESTRICT	* 1;
+		SCHED_FEAT_WAKEUP_PREEMPT	* 1;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
+/*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
@@ -2237,7 +2241,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      enum cpu_idle_type idle, int *all_pinned,
 		      int *this_best_prio, struct rq_iterator *iterator)
 {
-	int pulled = 0, pinned = 0, skip_for_load;
+	int loops = 0, pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
@@ -2251,10 +2255,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	 */
 	p = iterator->start(iterator->arg);
 next:
-	if (!p)
+	if (!p || loops++ > sysctl_sched_nr_migrate)
 		goto out;
 	/*
-	 * To help distribute high priority tasks accross CPUs we don't
+	 * To help distribute high priority tasks across CPUs we don't
 	 * skip a task if it will be the highest priority task (i.e. smallest
 	 * prio value) on its new queue regardless of its load weight
 	 */
@@ -2271,8 +2275,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	rem_load_move -= p->se.load.weight;
 
 	/*
-	 * We only want to steal up to the prescribed number of tasks
-	 * and the prescribed amount of weighted load.
+	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
 	if (rem_load_move > 0) {
 		if (p->prio < *this_best_prio)
@@ -4992,6 +4995,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
  */
 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
 
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static inline void sched_init_granularity(void)
+{
+	unsigned int factor = 1 + ilog2(num_online_cpus());
+	const unsigned long limit = 200000000;
+
+	sysctl_sched_min_granularity *= factor;
+	if (sysctl_sched_min_granularity > limit)
+		sysctl_sched_min_granularity = limit;
+
+	sysctl_sched_latency *= factor;
+	if (sysctl_sched_latency > limit)
+		sysctl_sched_latency = limit;
+
+	sysctl_sched_wakeup_granularity *= factor;
+	sysctl_sched_batch_wakeup_granularity *= factor;
+}
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -5621,7 +5650,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = 10
 };
 
-int __init migration_init(void)
+void __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
@@ -5631,8 +5660,6 @@ int __init migration_init(void)
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
-
-	return 0;
 }
 #endif
@@ -6688,10 +6715,12 @@ void __init sched_init_smp(void)
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
+	sched_init_granularity();
 }
 #else
 void __init sched_init_smp(void)
 {
+	sched_init_granularity();
 }
 #endif /* CONFIG_SMP */
@@ -7228,7 +7257,7 @@ static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft)
 		spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
 	}
 	/* Convert from ns to ms */
-	do_div(res, 1000000);
+	do_div(res, NSEC_PER_MSEC);
 	return res;
 }
...
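The sched_init_granularity() helper added in the hunks above scales the CFS tunables by 1 + ilog2(num_online_cpus()) and caps sysctl_sched_latency and sysctl_sched_min_granularity at 200 ms. A small, self-contained sketch of that arithmetic; ilog2_u() is a hypothetical stand-in for the kernel's ilog2(), and the starting values are the defaults from the CFS tunable hunks later in this diff:

#include <stdio.h>

/* Hypothetical stand-in for the kernel's ilog2(): floor(log2(n)). */
static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long limit = 200000000;	/* 200 ms cap */
	unsigned int cpus;

	for (cpus = 1; cpus <= 64; cpus *= 4) {
		unsigned int factor = 1 + ilog2_u(cpus);
		unsigned long latency  = 20000000UL * factor;	/* 20 ms default */
		unsigned long min_gran =  1000000UL * factor;	/*  1 ms default */

		if (latency > limit)
			latency = limit;
		if (min_gran > limit)
			min_gran = limit;
		printf("%2u cpus: factor %u, latency %lu ns, min_granularity %lu ns\n",
		       cpus, factor, latency, min_gran);
	}
	return 0;
}

With the defaults this prints factors 1, 3, 5 and 7 for 1, 4, 16 and 64 CPUs, so a 64-way machine ends up with a 140 ms targeted latency instead of 20 ms.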
@@ -211,7 +211,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #define PN(x) \
 	SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_latency);
-	PN(sysctl_sched_nr_latency);
+	PN(sysctl_sched_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
 	PN(sysctl_sched_batch_wakeup_granularity);
 	PN(sysctl_sched_child_runs_first);
...
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
/* /*
* Targeted preemption latency for CPU-bound tasks: * Targeted preemption latency for CPU-bound tasks:
* (default: 20ms, units: nanoseconds) * (default: 20ms * ilog(ncpus), units: nanoseconds)
* *
* NOTE: this latency value is not the same as the concept of * NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length * 'timeslice length' - timeslices in CFS are of variable length
...@@ -32,19 +32,24 @@ ...@@ -32,19 +32,24 @@
* (to see the precise effective timeslice length of your workload, * (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field) * run vmstat and monitor the context-switches (cs) field)
*/ */
const_debug unsigned int sysctl_sched_latency = 20000000ULL; unsigned int sysctl_sched_latency = 20000000ULL;
/* /*
* After fork, child runs first. (default) If set to 0 then * Minimal preemption granularity for CPU-bound tasks:
* parent will (try to) run first. * (default: 1 msec * ilog(ncpus), units: nanoseconds)
*/ */
const_debug unsigned int sysctl_sched_child_runs_first = 1; unsigned int sysctl_sched_min_granularity = 1000000ULL;
/* /*
* Minimal preemption granularity for CPU-bound tasks: * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
* (default: 2 msec, units: nanoseconds) */
unsigned int sched_nr_latency = 20;
/*
* After fork, child runs first. (default) If set to 0 then
* parent will (try to) run first.
*/ */
const_debug unsigned int sysctl_sched_nr_latency = 20; const_debug unsigned int sysctl_sched_child_runs_first = 1;
/* /*
* sys_sched_yield() compat mode * sys_sched_yield() compat mode
...@@ -56,23 +61,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield; ...@@ -56,23 +61,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
/* /*
* SCHED_BATCH wake-up granularity. * SCHED_BATCH wake-up granularity.
* (default: 10 msec, units: nanoseconds) * (default: 10 msec * ilog(ncpus), units: nanoseconds)
* *
* This option delays the preemption effects of decoupled workloads * This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still * and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies. * have immediate wakeup/sleep latencies.
*/ */
const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
/* /*
* SCHED_OTHER wake-up granularity. * SCHED_OTHER wake-up granularity.
* (default: 10 msec, units: nanoseconds) * (default: 10 msec * ilog(ncpus), units: nanoseconds)
* *
* This option delays the preemption effects of decoupled workloads * This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still * and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies. * have immediate wakeup/sleep latencies.
*/ */
const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
...@@ -212,6 +217,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) ...@@ -212,6 +217,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
* Scheduling class statistics methods: * Scheduling class statistics methods:
*/ */
#ifdef CONFIG_SCHED_DEBUG
int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *filp, void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
if (ret || !write)
return ret;
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);
return 0;
}
#endif
/* /*
* The idea is to set a period in which each task runs once. * The idea is to set a period in which each task runs once.
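sched_nr_latency is no longer a tunable of its own: the handler above recomputes it as DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity) whenever either sysctl is written. A tiny standalone sketch of that arithmetic with the default values (DIV_ROUND_UP spelled out the way the kernel defines it):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* defaults: 20 ms latency, 1 ms minimum granularity -> 20 */
	printf("sched_nr_latency = %u\n",
	       DIV_ROUND_UP(20000000u, 1000000u));

	/* writing 3 ms to sched_min_granularity_ns would give ceil(20/3) = 7 */
	printf("sched_nr_latency = %u\n",
	       DIV_ROUND_UP(20000000u, 3000000u));
	return 0;
}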
@@ -224,7 +245,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 static u64 __sched_period(unsigned long nr_running)
 {
 	u64 period = sysctl_sched_latency;
-	unsigned long nr_latency = sysctl_sched_nr_latency;
+	unsigned long nr_latency = sched_nr_latency;
 
 	if (unlikely(nr_running > nr_latency)) {
 		period *= nr_running;
@@ -259,6 +280,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
 {
 	u64 vslice = __sched_period(nr_running);
 
+	vslice *= NICE_0_LOAD;
 	do_div(vslice, rq_weight);
 
 	return vslice;
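The one-line vslice fix matters more than it looks: without the NICE_0_LOAD scaling, dividing the period by the runqueue weight yields a value roughly 1024 times too small. A rough standalone sketch of the before/after arithmetic (NICE_0_LOAD = 1024 and a runqueue of three nice-0 tasks are assumed for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long long period = 20000000ULL;	/* __sched_period(): 20 ms */
	unsigned long nice_0_load = 1024;
	unsigned long rq_weight   = 3 * 1024;		/* three nice-0 tasks */

	/* before the fix: 20 ms / 3072 ~= 6.5 us */
	printf("old vslice: %llu ns\n", period / rq_weight);

	/* after the fix: (20 ms * 1024) / 3072 ~= 6.7 ms, i.e. period / 3 */
	printf("new vslice: %llu ns\n", period * nice_0_load / rq_weight);
	return 0;
}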
@@ -472,19 +494,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
 		vruntime += sched_vslice(cfs_rq)/2;
 
+	/*
+	 * The 'current' period is already promised to the current tasks,
+	 * however the extra weight of the new task will slow them down a
+	 * little, place the new task so that it fits in the slot that
+	 * stays open at the end.
+	 */
 	if (initial && sched_feat(START_DEBIT))
 		vruntime += sched_vslice_add(cfs_rq, se);
 
 	if (!initial) {
+		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
 				task_of(se)->policy != SCHED_BATCH)
 			vruntime -= sysctl_sched_latency;
 
-		vruntime = max_t(s64, vruntime, se->vruntime);
+		/* ensure we never gain time by being placed backwards. */
+		vruntime = max_vruntime(se->vruntime, vruntime);
 	}
 
 	se->vruntime = vruntime;
 }
 
 static void
@@ -517,7 +546,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 	update_stats_dequeue(cfs_rq, se);
 
 	if (sleep) {
-		se->peer_preempt = 0;
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -545,10 +573,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-	if (delta_exec > ideal_runtime ||
-			(sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
+	if (delta_exec > ideal_runtime)
 		resched_task(rq_of(cfs_rq)->curr);
-	curr->peer_preempt = 0;
 }
 
 static void
@@ -811,7 +837,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta, gran;
+	unsigned long gran;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -826,24 +852,20 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->policy == SCHED_BATCH))
 		return;
 
-	if (sched_feat(WAKEUP_PREEMPT)) {
-		while (!is_same_group(se, pse)) {
-			se = parent_entity(se);
-			pse = parent_entity(pse);
-		}
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
 
-		delta = se->vruntime - pse->vruntime;
-		gran = sysctl_sched_wakeup_granularity;
-		if (unlikely(se->load.weight != NICE_0_LOAD))
-			gran = calc_delta_fair(gran, &se->load);
+	while (!is_same_group(se, pse)) {
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 
-		if (delta > gran) {
-			int now = !sched_feat(PREEMPT_RESTRICT);
+	gran = sysctl_sched_wakeup_granularity;
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
 
-			if (now || p->prio < curr->prio || !se->peer_preempt++)
-				resched_task(curr);
-		}
-	}
+	if (pse->vruntime + gran < se->vruntime)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
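The rewritten wakeup check preempts only when the waking task's vruntime trails the running task's by more than one (weight-scaled) wakeup granularity. A tiny standalone sketch with made-up numbers, 10 ms granularity, values in nanoseconds:

#include <stdio.h>

int main(void)
{
	unsigned long long se_vruntime  = 105000000ULL;	/* currently running task */
	unsigned long long pse_vruntime =  90000000ULL;	/* waking task */
	unsigned long gran = 10000000UL;		/* wakeup granularity */

	/* mirrors: if (pse->vruntime + gran < se->vruntime) resched_task(curr); */
	if (pse_vruntime + gran < se_vruntime)
		printf("preempt: waking task is %llu ns behind\n",
		       se_vruntime - pse_vruntime);
	else
		printf("no preemption\n");
	return 0;
}

A waking task only 5 ms behind would not preempt here; this granularity test is what throttles over-eager wakeup preemption now that PREEMPT_RESTRICT and peer_preempt are gone.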
@@ -1045,8 +1067,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	update_curr(cfs_rq);
 	place_entity(cfs_rq, se, 1);
 
+	/* 'curr' will be NULL if the child belongs to a different group */
 	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr->vruntime < se->vruntime) {
+			curr && curr->vruntime < se->vruntime) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1054,7 +1077,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 	}
 
-	se->peer_preempt = 0;
 	enqueue_task_fair(rq, p, 0);
 	resched_task(rq->curr);
 }
...
@@ -127,7 +127,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define schedstat_set(var, val)	do { } while (0)
 #endif
 
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
  * the cpu. We should note that with the exception of interactive
@@ -155,7 +155,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
  */
 static void sched_info_arrive(struct task_struct *t)
 {
-	unsigned long long now = sched_clock(), delta = 0;
+	unsigned long long now = task_rq(t)->clock, delta = 0;
 
 	if (t->sched_info.last_queued)
 		delta = now - t->sched_info.last_queued;
@@ -186,7 +186,7 @@ static inline void sched_info_queued(struct task_struct *t)
 {
 	if (unlikely(sched_info_on()))
 		if (!t->sched_info.last_queued)
-			t->sched_info.last_queued = sched_clock();
+			t->sched_info.last_queued = task_rq(t)->clock;
 }
 
 /*
@@ -195,7 +195,8 @@ static inline void sched_info_queued(struct task_struct *t)
  */
 static inline void sched_info_depart(struct task_struct *t)
 {
-	unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
+	unsigned long long delta = task_rq(t)->clock -
+					t->sched_info.last_arrival;
 
 	t->sched_info.cpu_time += delta;
 	rq_sched_info_depart(task_rq(t), delta);
@@ -231,5 +232,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 #else
 #define sched_info_queued(t)		do { } while (0)
 #define sched_info_switch(t, next)	do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
@@ -226,20 +226,23 @@ static struct ctl_table root_table[] = {
 
 #ifdef CONFIG_SCHED_DEBUG
 static unsigned long min_sched_granularity_ns = 100000;		/* 100 usecs */
-static unsigned long max_sched_granularity_ns = 1000000000;	/* 1 second */
+static unsigned long max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static unsigned long min_wakeup_granularity_ns;			/* 0 usecs */
-static unsigned long max_wakeup_granularity_ns = 1000000000;	/* 1 second */
+static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 #endif
 
 static struct ctl_table kern_table[] = {
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_nr_latency",
-		.data		= &sysctl_sched_nr_latency,
+		.procname	= "sched_min_granularity_ns",
+		.data		= &sysctl_sched_min_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &sched_nr_latency_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -247,7 +250,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_latency,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &sched_nr_latency_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
@@ -298,6 +301,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_nr_migrate",
+		.data		= &sysctl_sched_nr_migrate,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
...
@@ -817,6 +817,19 @@ unsigned long next_timer_interrupt(void)
 
 #endif
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+	if (user_tick) {
+		account_user_time(p, jiffies_to_cputime(1));
+		account_user_time_scaled(p, jiffies_to_cputime(1));
+	} else {
+		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+		account_system_time_scaled(p, jiffies_to_cputime(1));
+	}
+}
+#endif
+
 /*
  * Called from the timer interrupt handler to charge one tick to the current
  * process. user_tick is 1 if the tick is user time, 0 for system.
@@ -827,13 +840,7 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick) {
-		account_user_time(p, jiffies_to_cputime(1));
-		account_user_time_scaled(p, jiffies_to_cputime(1));
-	} else {
-		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
-		account_system_time_scaled(p, jiffies_to_cputime(1));
-	}
+	account_process_tick(p, user_tick);
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);
...