Commit 774a694f authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of...

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (64 commits)
  sched: Fix sched::sched_stat_wait tracepoint field
  sched: Disable NEW_FAIR_SLEEPERS for now
  sched: Keep kthreads at default priority
  sched: Re-tune the scheduler latency defaults to decrease worst-case latencies
  sched: Turn off child_runs_first
  sched: Ensure that a child can't gain time over its parent after fork()
  sched: enable SD_WAKE_IDLE
  sched: Deal with low-load in wake_affine()
  sched: Remove short cut from select_task_rq_fair()
  sched: Turn on SD_BALANCE_NEWIDLE
  sched: Clean up topology.h
  sched: Fix dynamic power-balancing crash
  sched: Remove reciprocal for cpu_power
  sched: Try to deal with low capacity, fix update_sd_power_savings_stats()
  sched: Try to deal with low capacity
  sched: Scale down cpu_power due to RT tasks
  sched: Implement dynamic cpu_power
  sched: Add smt_gain
  sched: Update the cpu_power sum during load-balance
  sched: Add SD_PREFER_SIBLING
  ...
parents 4f0ac854 e1f84508
...@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[]; ...@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[];
#endif #endif
/* sched_domains SD_NODE_INIT for NUMA machines */ /* sched_domains SD_NODE_INIT for NUMA machines */
#define SD_NODE_INIT (struct sched_domain) { \ #define SD_NODE_INIT (struct sched_domain) { \
.min_interval = 8, \ .min_interval = 8, \
.max_interval = 32, \ .max_interval = 32, \
.busy_factor = 32, \ .busy_factor = 32, \
.imbalance_pct = 125, \ .imbalance_pct = 125, \
.cache_nice_tries = SD_CACHE_NICE_TRIES, \ .cache_nice_tries = SD_CACHE_NICE_TRIES, \
.busy_idx = 3, \ .busy_idx = 3, \
.idle_idx = SD_IDLE_IDX, \ .idle_idx = SD_IDLE_IDX, \
.newidle_idx = SD_NEWIDLE_IDX, \ .newidle_idx = SD_NEWIDLE_IDX, \
.wake_idx = 1, \ .wake_idx = 1, \
.forkexec_idx = SD_FORKEXEC_IDX, \ .forkexec_idx = SD_FORKEXEC_IDX, \
.flags = SD_LOAD_BALANCE \ \
| SD_BALANCE_EXEC \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_FORK \ | 1*SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \ | 1*SD_BALANCE_EXEC \
| SD_WAKE_BALANCE \ | 1*SD_BALANCE_FORK \
| SD_SERIALIZE, \ | 0*SD_WAKE_IDLE \
.last_balance = jiffies, \ | 1*SD_WAKE_AFFINE \
.balance_interval = 1, \ | 1*SD_WAKE_BALANCE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 1*SD_SERIALIZE \
| 1*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
} }
#ifdef CONFIG_X86_64_ACPI_NUMA #ifdef CONFIG_X86_64_ACPI_NUMA
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include <linux/swap.h> #include <linux/swap.h>
#include <linux/bootmem.h> #include <linux/bootmem.h>
#include <linux/fs_struct.h> #include <linux/fs_struct.h>
#include <linux/hardirq.h>
#include "internal.h" #include "internal.h"
int sysctl_vfs_cache_pressure __read_mostly = 100; int sysctl_vfs_cache_pressure __read_mostly = 100;
......
...@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) ...@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
* give it the opportunity to lock the file. * give it the opportunity to lock the file.
*/ */
if (found) if (found)
cond_resched_bkl(); cond_resched();
find_conflict: find_conflict:
for_each_lock(inode, before) { for_each_lock(inode, before) {
......
...@@ -64,6 +64,12 @@ ...@@ -64,6 +64,12 @@
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT)
/*
 * Fallback used only when PREEMPT_ACTIVE has not already been defined:
 * a PREEMPT_ACTIVE_BITS-wide field that starts at the first bit position
 * above the NMI field (NMI_SHIFT + NMI_BITS).
 */
#ifndef PREEMPT_ACTIVE
#define PREEMPT_ACTIVE_BITS 1
#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
#endif
#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
#error PREEMPT_ACTIVE is too low! #error PREEMPT_ACTIVE is too low!
#endif #endif
......
...@@ -125,7 +125,7 @@ extern int _cond_resched(void); ...@@ -125,7 +125,7 @@ extern int _cond_resched(void);
#endif #endif
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
void __might_sleep(char *file, int line); void __might_sleep(char *file, int line, int preempt_offset);
/** /**
* might_sleep - annotation for functions that can sleep * might_sleep - annotation for functions that can sleep
* *
...@@ -137,8 +137,9 @@ extern int _cond_resched(void); ...@@ -137,8 +137,9 @@ extern int _cond_resched(void);
* supposed to. * supposed to.
*/ */
# define might_sleep() \ # define might_sleep() \
do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
#else #else
static inline void __might_sleep(char *file, int line, int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0) # define might_sleep() do { might_resched(); } while (0)
#endif #endif
......
...@@ -38,6 +38,8 @@ ...@@ -38,6 +38,8 @@
#define SCHED_BATCH 3 #define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */ /* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5 #define SCHED_IDLE 5
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
#ifdef __KERNEL__ #ifdef __KERNEL__
...@@ -796,18 +798,19 @@ enum cpu_idle_type { ...@@ -796,18 +798,19 @@ enum cpu_idle_type {
#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 4 /* Balance on exec */ #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ #define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ #define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 1024 /* Only a single load balancing instance */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ #define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
enum powersavings_balance_level { enum powersavings_balance_level {
POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
...@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void) ...@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void)
if (sched_smt_power_savings) if (sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE; return SD_POWERSAVINGS_BALANCE;
return 0; return SD_PREFER_SIBLING;
} }
static inline int sd_balance_for_package_power(void) static inline int sd_balance_for_package_power(void)
...@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void) ...@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void)
if (sched_mc_power_savings | sched_smt_power_savings) if (sched_mc_power_savings | sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE; return SD_POWERSAVINGS_BALANCE;
return 0; return SD_PREFER_SIBLING;
} }
/* /*
...@@ -857,15 +860,9 @@ struct sched_group { ...@@ -857,15 +860,9 @@ struct sched_group {
/* /*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a * CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU. This is read only (except for setup, hotplug CPU). * single CPU.
* Note : Never change cpu_power without recompute its reciprocal
*/
unsigned int __cpu_power;
/*
* reciprocal value of cpu_power to avoid expensive divides
* (see include/linux/reciprocal_div.h)
*/ */
u32 reciprocal_cpu_power; unsigned int cpu_power;
/* /*
* The CPUs this group covers. * The CPUs this group covers.
...@@ -918,6 +915,7 @@ struct sched_domain { ...@@ -918,6 +915,7 @@ struct sched_domain {
unsigned int newidle_idx; unsigned int newidle_idx;
unsigned int wake_idx; unsigned int wake_idx;
unsigned int forkexec_idx; unsigned int forkexec_idx;
unsigned int smt_gain;
int flags; /* See SD_* */ int flags; /* See SD_* */
enum sched_domain_level level; enum sched_domain_level level;
...@@ -1045,7 +1043,6 @@ struct sched_class { ...@@ -1045,7 +1043,6 @@ struct sched_class {
struct rq *busiest, struct sched_domain *sd, struct rq *busiest, struct sched_domain *sd,
enum cpu_idle_type idle); enum cpu_idle_type idle);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
int (*needs_post_schedule) (struct rq *this_rq);
void (*post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
...@@ -1110,6 +1107,8 @@ struct sched_entity { ...@@ -1110,6 +1107,8 @@ struct sched_entity {
u64 wait_max; u64 wait_max;
u64 wait_count; u64 wait_count;
u64 wait_sum; u64 wait_sum;
u64 iowait_count;
u64 iowait_sum;
u64 sleep_start; u64 sleep_start;
u64 sleep_max; u64 sleep_max;
...@@ -1234,11 +1233,19 @@ struct task_struct { ...@@ -1234,11 +1233,19 @@ struct task_struct {
unsigned did_exec:1; unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */ * execve */
unsigned in_iowait:1;
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
pid_t pid; pid_t pid;
pid_t tgid; pid_t tgid;
#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */ /* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary; unsigned long stack_canary;
#endif
/* /*
* pointers to (original) parent process, youngest child, younger sibling, * pointers to (original) parent process, youngest child, younger sibling,
...@@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity; ...@@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_shares_ratelimit; extern unsigned int sysctl_sched_shares_ratelimit;
extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_shares_thresh;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_child_runs_first;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration; extern unsigned int sysctl_timer_migration;
int sched_nr_latency_handler(struct ctl_table *table, int write, int sched_nr_latency_handler(struct ctl_table *table, int write,
...@@ -2308,23 +2316,31 @@ static inline int need_resched(void) ...@@ -2308,23 +2316,31 @@ static inline int need_resched(void)
* cond_resched_softirq() will enable bhs before scheduling. * cond_resched_softirq() will enable bhs before scheduling.
*/ */
extern int _cond_resched(void); extern int _cond_resched(void);
#ifdef CONFIG_PREEMPT_BKL
static inline int cond_resched(void) #define cond_resched() ({ \
{ __might_sleep(__FILE__, __LINE__, 0); \
return 0; _cond_resched(); \
} })
extern int __cond_resched_lock(spinlock_t *lock);
#ifdef CONFIG_PREEMPT
#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
#else #else
static inline int cond_resched(void) #define PREEMPT_LOCK_OFFSET 0
{
return _cond_resched();
}
#endif #endif
extern int cond_resched_lock(spinlock_t * lock);
extern int cond_resched_softirq(void); #define cond_resched_lock(lock) ({ \
static inline int cond_resched_bkl(void) __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
{ __cond_resched_lock(lock); \
return _cond_resched(); })
}
extern int __cond_resched_softirq(void);
#define cond_resched_softirq() ({ \
__might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \
__cond_resched_softirq(); \
})
/* /*
* Does a critical section need to be broken due to another * Does a critical section need to be broken due to another
......
...@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void); ...@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void);
#define ARCH_HAS_SCHED_WAKE_IDLE #define ARCH_HAS_SCHED_WAKE_IDLE
/* Common values for SMT siblings */ /* Common values for SMT siblings */
#ifndef SD_SIBLING_INIT #ifndef SD_SIBLING_INIT
#define SD_SIBLING_INIT (struct sched_domain) { \ #define SD_SIBLING_INIT (struct sched_domain) { \
.min_interval = 1, \ .min_interval = 1, \
.max_interval = 2, \ .max_interval = 2, \
.busy_factor = 64, \ .busy_factor = 64, \
.imbalance_pct = 110, \ .imbalance_pct = 110, \
.flags = SD_LOAD_BALANCE \ \
| SD_BALANCE_NEWIDLE \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_FORK \ | 1*SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \ | 1*SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \ | 1*SD_BALANCE_FORK \
| SD_WAKE_BALANCE \ | 0*SD_WAKE_IDLE \
| SD_SHARE_CPUPOWER, \ | 1*SD_WAKE_AFFINE \
.last_balance = jiffies, \ | 1*SD_WAKE_BALANCE \
.balance_interval = 1, \ | 1*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
| 0*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
.smt_gain = 1178, /* 15% */ \
} }
#endif #endif
#endif /* CONFIG_SCHED_SMT */ #endif /* CONFIG_SCHED_SMT */
...@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void); ...@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void);
#ifdef CONFIG_SCHED_MC #ifdef CONFIG_SCHED_MC
/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
#ifndef SD_MC_INIT #ifndef SD_MC_INIT
#define SD_MC_INIT (struct sched_domain) { \ #define SD_MC_INIT (struct sched_domain) { \
.min_interval = 1, \ .min_interval = 1, \
.max_interval = 4, \ .max_interval = 4, \
.busy_factor = 64, \ .busy_factor = 64, \
.imbalance_pct = 125, \ .imbalance_pct = 125, \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.busy_idx = 2, \ .busy_idx = 2, \
.wake_idx = 1, \ .wake_idx = 1, \
.forkexec_idx = 1, \ .forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \ \
| SD_BALANCE_FORK \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \ | 1*SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \ | 1*SD_BALANCE_EXEC \
| SD_WAKE_BALANCE \ | 1*SD_BALANCE_FORK \
| SD_SHARE_PKG_RESOURCES\ | 1*SD_WAKE_IDLE \
| sd_balance_for_mc_power()\ | 1*SD_WAKE_AFFINE \
| sd_power_saving_flags(),\ | 1*SD_WAKE_BALANCE \
.last_balance = jiffies, \ | 0*SD_SHARE_CPUPOWER \
.balance_interval = 1, \ | 1*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
| 0*SD_WAKE_IDLE_FAR \
| sd_balance_for_mc_power() \
| sd_power_saving_flags() \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
} }
#endif #endif
#endif /* CONFIG_SCHED_MC */ #endif /* CONFIG_SCHED_MC */
/* Common values for CPUs */ /* Common values for CPUs */
#ifndef SD_CPU_INIT #ifndef SD_CPU_INIT
#define SD_CPU_INIT (struct sched_domain) { \ #define SD_CPU_INIT (struct sched_domain) { \
.min_interval = 1, \ .min_interval = 1, \
.max_interval = 4, \ .max_interval = 4, \
.busy_factor = 64, \ .busy_factor = 64, \
.imbalance_pct = 125, \ .imbalance_pct = 125, \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.busy_idx = 2, \ .busy_idx = 2, \
.idle_idx = 1, \ .idle_idx = 1, \
.newidle_idx = 2, \ .newidle_idx = 2, \
.wake_idx = 1, \ .wake_idx = 1, \
.forkexec_idx = 1, \ .forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \ \
| SD_BALANCE_EXEC \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_FORK \ | 1*SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \ | 1*SD_BALANCE_EXEC \
| SD_WAKE_BALANCE \ | 1*SD_BALANCE_FORK \
| sd_balance_for_package_power()\ | 1*SD_WAKE_IDLE \
| sd_power_saving_flags(),\ | 0*SD_WAKE_AFFINE \
.last_balance = jiffies, \ | 1*SD_WAKE_BALANCE \
.balance_interval = 1, \ | 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
| 0*SD_WAKE_IDLE_FAR \
| sd_balance_for_package_power() \
| sd_power_saving_flags() \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
} }
#endif #endif
/* sched_domains SD_ALLNODES_INIT for NUMA machines */ /* sched_domains SD_ALLNODES_INIT for NUMA machines */
#define SD_ALLNODES_INIT (struct sched_domain) { \ #define SD_ALLNODES_INIT (struct sched_domain) { \
.min_interval = 64, \ .min_interval = 64, \
.max_interval = 64*num_online_cpus(), \ .max_interval = 64*num_online_cpus(), \
.busy_factor = 128, \ .busy_factor = 128, \
.imbalance_pct = 133, \ .imbalance_pct = 133, \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.busy_idx = 3, \ .busy_idx = 3, \
.idle_idx = 3, \ .idle_idx = 3, \
.flags = SD_LOAD_BALANCE \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \ | 0*SD_BALANCE_EXEC \
| SD_SERIALIZE, \ | 0*SD_BALANCE_FORK \
.last_balance = jiffies, \ | 0*SD_WAKE_IDLE \
.balance_interval = 64, \ | 1*SD_WAKE_AFFINE \
| 0*SD_WAKE_BALANCE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 1*SD_SERIALIZE \
| 1*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
.balance_interval = 64, \
} }
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
......
...@@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send, ...@@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send,
__entry->sig, __entry->comm, __entry->pid) __entry->sig, __entry->comm, __entry->pid)
); );
/*
* XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
* adding sched_stat support to SCHED_FIFO/RR would be welcome.
*/
/*
 * Tracepoint for accounting wait time (time the task is runnable
 * but not actually running due to scheduler contention).
 *
 * Arguments: the task and a u64 delay. The entry stores the task's comm
 * and pid together with the delay; the output format labels the delay
 * as nanoseconds.
 */
TRACE_EVENT(sched_stat_wait,
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, delay )
),
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->delay = delay;
)
/* NOTE(review): presumably makes delay the value counted by perf - confirm */
TP_perf_assign(
__perf_count(delay);
),
TP_printk("task: %s:%d wait: %Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->delay)
);
/*
 * Tracepoint for accounting sleep time (time the task is not runnable,
 * including iowait, see below).
 *
 * Arguments: the task and a u64 delay. The entry stores the task's comm
 * and pid together with the delay; the output format labels the delay
 * as nanoseconds.
 */
TRACE_EVENT(sched_stat_sleep,
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, delay )
),
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->delay = delay;
)
/* NOTE(review): presumably makes delay the value counted by perf - confirm */
TP_perf_assign(
__perf_count(delay);
),
TP_printk("task: %s:%d sleep: %Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->delay)
);
/*
 * Tracepoint for accounting iowait time (time the task is not runnable
 * due to waiting on IO to complete).
 *
 * Arguments: the task and a u64 delay. The entry stores the task's comm
 * and pid together with the delay; the output format labels the delay
 * as nanoseconds.
 */
TRACE_EVENT(sched_stat_iowait,
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, delay )
),
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->delay = delay;
)
/* NOTE(review): presumably makes delay the value counted by perf - confirm */
TP_perf_assign(
__perf_count(delay);
),
TP_printk("task: %s:%d iowait: %Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->delay)
);
#endif /* _TRACE_SCHED_H */ #endif /* _TRACE_SCHED_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
...@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void) ...@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void)
softirq_init(); softirq_init();
timekeeping_init(); timekeeping_init();
time_init(); time_init();
sched_clock_init();
profile_init(); profile_init();
if (!irqs_disabled()) if (!irqs_disabled())
printk(KERN_CRIT "start_kernel(): bug: interrupts were " printk(KERN_CRIT "start_kernel(): bug: interrupts were "
...@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void) ...@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void)
numa_policy_init(); numa_policy_init();
if (late_time_init) if (late_time_init)
late_time_init(); late_time_init();
sched_clock_init();
calibrate_delay(); calibrate_delay();
pidmap_init(); pidmap_init();
anon_vma_init(); anon_vma_init();
......
...@@ -16,8 +16,6 @@ ...@@ -16,8 +16,6 @@
#include <linux/mutex.h> #include <linux/mutex.h>
#include <trace/events/sched.h> #include <trace/events/sched.h>
#define KTHREAD_NICE_LEVEL (-5)
static DEFINE_SPINLOCK(kthread_create_lock); static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list); static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task; struct task_struct *kthreadd_task;
...@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), ...@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
* The kernel thread should not inherit these properties. * The kernel thread should not inherit these properties.
*/ */
sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
set_user_nice(create.result, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(create.result, cpu_all_mask); set_cpus_allowed_ptr(create.result, cpu_all_mask);
} }
return create.result; return create.result;
...@@ -221,7 +218,6 @@ int kthreadd(void *unused) ...@@ -221,7 +218,6 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */ /* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd"); set_task_comm(tsk, "kthreadd");
ignore_signals(tsk); ignore_signals(tsk);
set_user_nice(tsk, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(tsk, cpu_all_mask); set_cpus_allowed_ptr(tsk, cpu_all_mask);
set_mems_allowed(node_possible_map); set_mems_allowed(node_possible_map);
......
...@@ -64,7 +64,6 @@ ...@@ -64,7 +64,6 @@
#include <linux/tsacct_kern.h> #include <linux/tsacct_kern.h>
#include <linux/kprobes.h> #include <linux/kprobes.h>
#include <linux/delayacct.h> #include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h> #include <linux/unistd.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/hrtimer.h> #include <linux/hrtimer.h>
...@@ -120,30 +119,8 @@ ...@@ -120,30 +119,8 @@
*/ */
#define RUNTIME_INF ((u64)~0ULL) #define RUNTIME_INF ((u64)~0ULL)
#ifdef CONFIG_SMP
static void double_rq_lock(struct rq *rq1, struct rq *rq2); static void double_rq_lock(struct rq *rq1, struct rq *rq2);
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
* Since cpu_power is a 'constant', we can use a reciprocal divide.
*/
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
return reciprocal_divide(load, sg->reciprocal_cpu_power);
}
/*
* Each time a sched group cpu_power is changed,
* we must compute its reciprocal value
*/
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
sg->__cpu_power += val;
sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif
static inline int rt_policy(int policy) static inline int rt_policy(int policy)
{ {
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
...@@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user) ...@@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user)
/* /*
* Root task group. * Root task group.
* Every UID task group (including init_task_group aka UID-0) will * Every UID task group (including init_task_group aka UID-0) will
* be a child to this group. * be a child to this group.
*/ */
struct task_group root_task_group; struct task_group root_task_group;
...@@ -318,7 +295,7 @@ struct task_group root_task_group; ...@@ -318,7 +295,7 @@ struct task_group root_task_group;
/* Default task group's sched entity on each cpu */ /* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */ /* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
...@@ -616,6 +593,7 @@ struct rq { ...@@ -616,6 +593,7 @@ struct rq {
unsigned char idle_at_tick; unsigned char idle_at_tick;
/* For active balancing */ /* For active balancing */
int post_schedule;
int active_balance; int active_balance;
int push_cpu; int push_cpu;
/* cpu of this runqueue: */ /* cpu of this runqueue: */
...@@ -626,6 +604,9 @@ struct rq { ...@@ -626,6 +604,9 @@ struct rq {
struct task_struct *migration_thread; struct task_struct *migration_thread;
struct list_head migration_queue; struct list_head migration_queue;
u64 rt_avg;
u64 age_stamp;
#endif #endif
/* calc_load related fields */ /* calc_load related fields */
...@@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq) ...@@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq)
#define this_rq() (&__get_cpu_var(runqueues)) #define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p)) #define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
inline void update_rq_clock(struct rq *rq) inline void update_rq_clock(struct rq *rq)
{ {
...@@ -860,6 +842,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; ...@@ -860,6 +842,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
*/ */
unsigned int sysctl_sched_shares_thresh = 4; unsigned int sysctl_sched_shares_thresh = 4;
/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 *
 * sched_avg_period() converts this value to nanoseconds and halves it;
 * that half-period is the step at which sched_avg_update() ages rt_avg.
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
/* /*
* period over which we measure -rt task cpu usage in us. * period over which we measure -rt task cpu usage in us.
* default: 1s * default: 1s
...@@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu) ...@@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu)
} }
#endif /* CONFIG_NO_HZ */ #endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
{
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
/*
 * Age rq->rt_avg: for every full sched_avg_period() that has elapsed
 * between rq->age_stamp and rq->clock, advance age_stamp by one period
 * and halve rt_avg.
 *
 * A zero period (sysctl_sched_time_avg == 0) would make the loop below
 * spin forever, because age_stamp would never advance. That case is
 * handled as the limit of an ever shorter averaging window: all history
 * is dropped and age_stamp catches up with the clock.
 */
static void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	if (period <= 0) {
		rq->age_stamp = rq->clock;
		rq->rt_avg = 0;
		return;
	}

	while ((s64)(rq->clock - rq->age_stamp) > period) {
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}
/*
 * Account rt_delta into rq->rt_avg, then age the running sum via
 * sched_avg_update(). The order matters: the new delta is added before
 * any halving takes place.
 */
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
	rq->rt_avg = rq->rt_avg + rt_delta;
	sched_avg_update(rq);
}
#else /* !CONFIG_SMP */ #else /* !CONFIG_SMP */
static void resched_task(struct task_struct *p) static void resched_task(struct task_struct *p)
{ {
assert_spin_locked(&task_rq(p)->lock); assert_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p); set_tsk_need_resched(p);
} }
/* Stub for the !CONFIG_SMP branch: intentionally does nothing. */
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32 #if BITS_PER_LONG == 32
...@@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) ...@@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Per-cpu scratch area for the shares update: tg_shares_up() stores each
 * cpu's cfs_rq load weight in rq_weight[] and update_group_shares_cpu()
 * reads it back through the usd pointer it is handed.
 */
struct update_shares_data {
unsigned long rq_weight[NR_CPUS];
};
static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
static void __set_se_shares(struct sched_entity *se, unsigned long shares); static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/* /*
* Calculate and set the cpu's group shares. * Calculate and set the cpu's group shares.
*/ */
static void static void update_group_shares_cpu(struct task_group *tg, int cpu,
update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares,
unsigned long sd_shares, unsigned long sd_rq_weight) unsigned long sd_rq_weight,
struct update_shares_data *usd)
{ {
unsigned long shares; unsigned long shares, rq_weight;
unsigned long rq_weight; int boost = 0;
if (!tg->se[cpu]) rq_weight = usd->rq_weight[cpu];
return; if (!rq_weight) {
boost = 1;
rq_weight = tg->cfs_rq[cpu]->rq_weight; rq_weight = NICE_0_LOAD;
}
/* /*
* \Sum shares * rq_weight * \Sum_j shares_j * rq_weight_i
* shares = ----------------------- * shares_i = -----------------------------
* \Sum rq_weight * \Sum_j rq_weight_j
*
*/ */
shares = (sd_shares * rq_weight) / sd_rq_weight; shares = (sd_shares * rq_weight) / sd_rq_weight;
shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
...@@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, ...@@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&rq->lock, flags); spin_lock_irqsave(&rq->lock, flags);
tg->cfs_rq[cpu]->shares = shares; tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
__set_se_shares(tg->se[cpu], shares); __set_se_shares(tg->se[cpu], shares);
spin_unlock_irqrestore(&rq->lock, flags); spin_unlock_irqrestore(&rq->lock, flags);
} }
...@@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, ...@@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
*/ */
static int tg_shares_up(struct task_group *tg, void *data) static int tg_shares_up(struct task_group *tg, void *data)
{ {
unsigned long weight, rq_weight = 0; unsigned long weight, rq_weight = 0, shares = 0;
unsigned long shares = 0; struct update_shares_data *usd;
struct sched_domain *sd = data; struct sched_domain *sd = data;
unsigned long flags;
int i; int i;
if (!tg->se[0])
return 0;
local_irq_save(flags);
usd = &__get_cpu_var(update_shares_data);
for_each_cpu(i, sched_domain_span(sd)) { for_each_cpu(i, sched_domain_span(sd)) {
weight = tg->cfs_rq[i]->load.weight;
usd->rq_weight[i] = weight;
/* /*
* If there are currently no tasks on the cpu pretend there * If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to * is one of average load so that when a new task gets to
* run here it will not get delayed by group starvation. * run here it will not get delayed by group starvation.
*/ */
weight = tg->cfs_rq[i]->load.weight;
if (!weight) if (!weight)
weight = NICE_0_LOAD; weight = NICE_0_LOAD;
tg->cfs_rq[i]->rq_weight = weight;
rq_weight += weight; rq_weight += weight;
shares += tg->cfs_rq[i]->shares; shares += tg->cfs_rq[i]->shares;
} }
...@@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data) ...@@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
shares = tg->shares; shares = tg->shares;
for_each_cpu(i, sched_domain_span(sd)) for_each_cpu(i, sched_domain_span(sd))
update_group_shares_cpu(tg, i, shares, rq_weight); update_group_shares_cpu(tg, i, shares, rq_weight, usd);
local_irq_restore(flags);
return 0; return 0;
} }
...@@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data) ...@@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data)
static void update_shares(struct sched_domain *sd) static void update_shares(struct sched_domain *sd)
{ {
u64 now = cpu_clock(raw_smp_processor_id()); s64 elapsed;
s64 elapsed = now - sd->last_update; u64 now;
if (root_task_group_empty())
return;
now = cpu_clock(raw_smp_processor_id());
elapsed = now - sd->last_update;
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now; sd->last_update = now;
...@@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd) ...@@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd)
static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{ {
if (root_task_group_empty())
return;
spin_unlock(&rq->lock); spin_unlock(&rq->lock);
update_shares(sd); update_shares(sd);
spin_lock(&rq->lock); spin_lock(&rq->lock);
...@@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) ...@@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
static void update_h_load(long cpu) static void update_h_load(long cpu)
{ {
if (root_task_group_empty())
return;
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
} }
...@@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) ...@@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
} }
/* Adjust by relative CPU power of the group */ /* Adjust by relative CPU power of the group */
avg_load = sg_div_cpu_power(group, avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
avg_load * SCHED_LOAD_SCALE);
if (local_group) { if (local_group) {
this_load = avg_load; this_load = avg_load;
...@@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags) ...@@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
set_task_cpu(p, cpu); set_task_cpu(p, cpu);
/* /*
* Make sure we do not leak PI boosting priority to the child: * Make sure we do not leak PI boosting priority to the child.
*/ */
p->prio = current->normal_prio; p->prio = current->normal_prio;
/*
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
p->policy = SCHED_NORMAL;
if (p->normal_prio < DEFAULT_PRIO)
p->prio = DEFAULT_PRIO;
if (PRIO_TO_NICE(p->static_prio) < 0) {
p->static_prio = NICE_TO_PRIO(0);
set_load_weight(p);
}
/*
* We don't need the reset flag anymore after the fork. It has
* fulfilled its duty:
*/
p->sched_reset_on_fork = 0;
}
if (!rt_prio(p->prio)) if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class; p->sched_class = &fair_sched_class;
...@@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) ...@@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
{ {
struct mm_struct *mm = rq->prev_mm; struct mm_struct *mm = rq->prev_mm;
long prev_state; long prev_state;
#ifdef CONFIG_SMP
int post_schedule = 0;
if (current->sched_class->needs_post_schedule)
post_schedule = current->sched_class->needs_post_schedule(rq);
#endif
rq->prev_mm = NULL; rq->prev_mm = NULL;
...@@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) ...@@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
finish_arch_switch(prev); finish_arch_switch(prev);
perf_counter_task_sched_in(current, cpu_of(rq)); perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev); finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
if (post_schedule)
current->sched_class->post_schedule(rq);
#endif
fire_sched_in_preempt_notifiers(current); fire_sched_in_preempt_notifiers(current);
if (mm) if (mm)
...@@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) ...@@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
} }
} }
#ifdef CONFIG_SMP
/* assumes rq->lock is held */
static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
{
if (prev->sched_class->pre_schedule)
prev->sched_class->pre_schedule(rq, prev);
}
/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
if (rq->post_schedule) {
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
if (rq->curr->sched_class->post_schedule)
rq->curr->sched_class->post_schedule(rq);
spin_unlock_irqrestore(&rq->lock, flags);
rq->post_schedule = 0;
}
}
#else
/*
 * !CONFIG_SMP variant: there is no pre-schedule work to do, so this is an
 * empty stub that lets callers invoke pre_schedule() unconditionally.
 */
static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}
/*
 * !CONFIG_SMP variant: empty stub, see pre_schedule() above.
 */
static inline void post_schedule(struct rq *rq)
{
}
#endif
/** /**
* schedule_tail - first thing a freshly forked thread must call. * schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from. * @prev: the thread we just switched away from.
...@@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) ...@@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
struct rq *rq = this_rq(); struct rq *rq = this_rq();
finish_task_switch(rq, prev); finish_task_switch(rq, prev);
/*
* FIXME: do we need to worry about rq being invalidated by the
* task_switch?
*/
post_schedule(rq);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW #ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */ /* In this case, finish_task_switch does not reenable preemption */
preempt_enable(); preempt_enable();
...@@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, ...@@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
{ {
const struct sched_class *class; const struct sched_class *class;
for (class = sched_class_highest; class; class = class->next) for_each_class(class) {
if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
return 1; return 1;
}
return 0; return 0;
} }
...@@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, ...@@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
* capacity but still has some space to pick up some load * capacity but still has some space to pick up some load
* from other group and save more power * from other group and save more power
*/ */
if (sgs->sum_nr_running > sgs->group_capacity - 1) if (sgs->sum_nr_running + 1 > sgs->group_capacity)
return; return;
if (sgs->sum_nr_running > sds->leader_nr_running || if (sgs->sum_nr_running > sds->leader_nr_running ||
...@@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, ...@@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
} }
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
/*
 * arch_scale_smt_power - default SMT power scaling for a sched domain.
 * @sd:  domain whose span and smt_gain are used.
 * @cpu: unused by this default implementation.
 *
 * Returns sd->smt_gain divided (integer division) by the number of CPUs in
 * the domain's span.  Declared __weak, so another definition of the same
 * symbol takes precedence at link time.
 */
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
	unsigned long nr_cpus = cpumask_weight(sched_domain_span(sd));
	unsigned long gain = sd->smt_gain;

	return gain / nr_cpus;
}
/*
 * scale_rt_power - scale factor for the share of @cpu's averaging window
 * that is not accounted to rq->rt_avg.
 * @cpu: CPU whose runqueue is inspected.
 *
 * The window is sched_avg_period() plus the time elapsed since
 * rq->age_stamp.  The result is (window - rt_avg) divided by the window
 * pre-shifted right by SCHED_LOAD_SHIFT, i.e. a value on the
 * SCHED_LOAD_SCALE fixed-point scale.
 *
 * Returns 0 when rt_avg consumes the whole window (or more).
 */
unsigned long scale_rt_power(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	u64 total, available;

	sched_avg_update(rq);

	total = sched_avg_period() + (rq->clock - rq->age_stamp);

	/*
	 * 'available' is unsigned: if rt_avg ever exceeds the window, a plain
	 * subtraction would wrap around to a huge value and grossly inflate
	 * the resulting power.  Clamp to zero instead.
	 */
	if (unlikely(total < rq->rt_avg))
		available = 0;
	else
		available = total - rq->rt_avg;

	/* Keep the divisor at least 1 after the shift below. */
	if (unlikely((s64)total < SCHED_LOAD_SCALE))
		total = SCHED_LOAD_SCALE;

	total >>= SCHED_LOAD_SHIFT;

	return div_u64(available, total);
}
/*
 * update_cpu_power - recompute cpu_power for the first group of @sd.
 * @sd:  domain whose first group (sd->groups) is updated.
 * @cpu: CPU passed on to the SMT and RT scaling helpers.
 *
 * Starts from SCHED_LOAD_SCALE, applies the SMT factor when the domain has
 * SD_SHARE_CPUPOWER set and spans more than one CPU, then applies the RT
 * factor.  Each factor is a SCHED_LOAD_SHIFT fixed-point multiplier.  The
 * stored value is never zero.
 */
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
	unsigned long nr_cpus = cpumask_weight(sched_domain_span(sd));
	unsigned long scaled = SCHED_LOAD_SCALE;
	struct sched_group *first = sd->groups;

	/* here we could scale based on cpufreq */

	if ((sd->flags & SD_SHARE_CPUPOWER) && nr_cpus > 1)
		scaled = (scaled * arch_scale_smt_power(sd, cpu)) >> SCHED_LOAD_SHIFT;

	scaled = (scaled * scale_rt_power(cpu)) >> SCHED_LOAD_SHIFT;

	/* A group must never end up with zero power. */
	first->cpu_power = scaled ? scaled : 1;
}
/*
 * update_group_power - refresh cpu_power of the first group of @sd.
 * @sd:  domain to update.
 * @cpu: only used for a domain without a child (forwarded to
 *       update_cpu_power()).
 *
 * A domain without a child gets its power computed directly.  Otherwise the
 * power is the sum of cpu_power over the child's circular group list.
 */
static void update_group_power(struct sched_domain *sd, int cpu)
{
	struct sched_domain *child = sd->child;
	struct sched_group *target = sd->groups;
	struct sched_group *first, *sg;
	unsigned long sum;

	if (!child) {
		update_cpu_power(sd, cpu);
		return;
	}

	/* Walk the ring exactly once, starting with its head. */
	first = child->groups;
	sum = first->cpu_power;
	for (sg = first->next; sg != first; sg = sg->next)
		sum += sg->cpu_power;

	target->cpu_power = sum;
}
/** /**
* update_sg_lb_stats - Update sched_group's statistics for load balancing. * update_sg_lb_stats - Update sched_group's statistics for load balancing.
...@@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, ...@@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
* @balance: Should we balance. * @balance: Should we balance.
* @sgs: variable to hold the statistics for this group. * @sgs: variable to hold the statistics for this group.
*/ */
static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, static inline void update_sg_lb_stats(struct sched_domain *sd,
struct sched_group *group, int this_cpu,
enum cpu_idle_type idle, int load_idx, int *sd_idle, enum cpu_idle_type idle, int load_idx, int *sd_idle,
int local_group, const struct cpumask *cpus, int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs) int *balance, struct sg_lb_stats *sgs)
...@@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, ...@@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
unsigned long sum_avg_load_per_task; unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task; unsigned long avg_load_per_task;
if (local_group) if (local_group) {
balance_cpu = group_first_cpu(group); balance_cpu = group_first_cpu(group);
if (balance_cpu == this_cpu)
update_group_power(sd, this_cpu);
}
/* Tally up the load of all CPUs in the group */ /* Tally up the load of all CPUs in the group */
sum_avg_load_per_task = avg_load_per_task = 0; sum_avg_load_per_task = avg_load_per_task = 0;
...@@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, ...@@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
} }
/* Adjust by relative CPU power of the group */ /* Adjust by relative CPU power of the group */
sgs->avg_load = sg_div_cpu_power(group, sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
sgs->group_load * SCHED_LOAD_SCALE);
/* /*
...@@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, ...@@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
* normalized nr_running number somewhere that negates * normalized nr_running number somewhere that negates
* the hierarchy? * the hierarchy?
*/ */
avg_load_per_task = sg_div_cpu_power(group, avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
sum_avg_load_per_task * SCHED_LOAD_SCALE); group->cpu_power;
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
sgs->group_imb = 1; sgs->group_imb = 1;
sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; sgs->group_capacity =
DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
} }
/** /**
...@@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, ...@@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
const struct cpumask *cpus, int *balance, const struct cpumask *cpus, int *balance,
struct sd_lb_stats *sds) struct sd_lb_stats *sds)
{ {
struct sched_domain *child = sd->child;
struct sched_group *group = sd->groups; struct sched_group *group = sd->groups;
struct sg_lb_stats sgs; struct sg_lb_stats sgs;
int load_idx; int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
init_sd_power_savings_stats(sd, sds, idle); init_sd_power_savings_stats(sd, sds, idle);
load_idx = get_sd_load_idx(sd, idle); load_idx = get_sd_load_idx(sd, idle);
...@@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, ...@@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
local_group = cpumask_test_cpu(this_cpu, local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group)); sched_group_cpus(group));
memset(&sgs, 0, sizeof(sgs)); memset(&sgs, 0, sizeof(sgs));
update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs); local_group, cpus, balance, &sgs);
if (local_group && balance && !(*balance)) if (local_group && balance && !(*balance))
return; return;
sds->total_load += sgs.group_load; sds->total_load += sgs.group_load;
sds->total_pwr += group->__cpu_power; sds->total_pwr += group->cpu_power;
/*
* In case the child domain prefers tasks go to siblings
* first, lower the group capacity to one so that we'll try
* and move all the excess tasks away.
*/
if (prefer_sibling)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) { if (local_group) {
sds->this_load = sgs.avg_load; sds->this_load = sgs.avg_load;
...@@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, ...@@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
update_sd_power_savings_stats(group, sds, local_group, &sgs); update_sd_power_savings_stats(group, sds, local_group, &sgs);
group = group->next; group = group->next;
} while (group != sd->groups); } while (group != sd->groups);
} }
/** /**
...@@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, ...@@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
* moving them. * moving them.
*/ */
pwr_now += sds->busiest->__cpu_power * pwr_now += sds->busiest->cpu_power *
min(sds->busiest_load_per_task, sds->max_load); min(sds->busiest_load_per_task, sds->max_load);
pwr_now += sds->this->__cpu_power * pwr_now += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load); min(sds->this_load_per_task, sds->this_load);
pwr_now /= SCHED_LOAD_SCALE; pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */ /* Amount of load we'd subtract */
tmp = sg_div_cpu_power(sds->busiest, tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
sds->busiest_load_per_task * SCHED_LOAD_SCALE); sds->busiest->cpu_power;
if (sds->max_load > tmp) if (sds->max_load > tmp)
pwr_move += sds->busiest->__cpu_power * pwr_move += sds->busiest->cpu_power *
min(sds->busiest_load_per_task, sds->max_load - tmp); min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */ /* Amount of load we'd add */
if (sds->max_load * sds->busiest->__cpu_power < if (sds->max_load * sds->busiest->cpu_power <
sds->busiest_load_per_task * SCHED_LOAD_SCALE) sds->busiest_load_per_task * SCHED_LOAD_SCALE)
tmp = sg_div_cpu_power(sds->this, tmp = (sds->max_load * sds->busiest->cpu_power) /
sds->max_load * sds->busiest->__cpu_power); sds->this->cpu_power;
else else
tmp = sg_div_cpu_power(sds->this, tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
sds->busiest_load_per_task * SCHED_LOAD_SCALE); sds->this->cpu_power;
pwr_move += sds->this->__cpu_power * pwr_move += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load + tmp); min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE; pwr_move /= SCHED_LOAD_SCALE;
...@@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, ...@@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
sds->max_load - sds->busiest_load_per_task); sds->max_load - sds->busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */ /* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * sds->busiest->__cpu_power, *imbalance = min(max_pull * sds->busiest->cpu_power,
(sds->avg_load - sds->this_load) * sds->this->__cpu_power) (sds->avg_load - sds->this_load) * sds->this->cpu_power)
/ SCHED_LOAD_SCALE; / SCHED_LOAD_SCALE;
/* /*
...@@ -3976,6 +4161,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, ...@@ -3976,6 +4161,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
return NULL; return NULL;
} }
/*
 * group_of - first sched group of the domain attached to @cpu's runqueue.
 *
 * The domain pointer is read with rcu_dereference().  Returns NULL when the
 * runqueue has no domain attached.
 */
static struct sched_group *group_of(int cpu)
{
	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);

	return sd ? sd->groups : NULL;
}
/*
 * power_of - cpu_power of the group returned by group_of(@cpu).
 *
 * Falls back to SCHED_LOAD_SCALE when @cpu has no group.
 */
static unsigned long power_of(int cpu)
{
	struct sched_group *sg = group_of(cpu);

	return sg ? sg->cpu_power : SCHED_LOAD_SCALE;
}
/* /*
* find_busiest_queue - find the busiest runqueue among the cpus in group. * find_busiest_queue - find the busiest runqueue among the cpus in group.
*/ */
...@@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, ...@@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
int i; int i;
for_each_cpu(i, sched_group_cpus(group)) { for_each_cpu(i, sched_group_cpus(group)) {
unsigned long power = power_of(i);
unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
unsigned long wl; unsigned long wl;
if (!cpumask_test_cpu(i, cpus)) if (!cpumask_test_cpu(i, cpus))
continue; continue;
rq = cpu_rq(i); rq = cpu_rq(i);
wl = weighted_cpuload(i); wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
wl /= power;
if (rq->nr_running == 1 && wl > imbalance) if (capacity && rq->nr_running == 1 && wl > imbalance)
continue; continue;
if (wl > max_load) { if (wl > max_load) {
...@@ -5349,10 +5557,7 @@ asmlinkage void __sched schedule(void) ...@@ -5349,10 +5557,7 @@ asmlinkage void __sched schedule(void)
switch_count = &prev->nvcsw; switch_count = &prev->nvcsw;
} }
#ifdef CONFIG_SMP pre_schedule(rq, prev);
if (prev->sched_class->pre_schedule)
prev->sched_class->pre_schedule(rq, prev);
#endif
if (unlikely(!rq->nr_running)) if (unlikely(!rq->nr_running))
idle_balance(cpu, rq); idle_balance(cpu, rq);
...@@ -5378,6 +5583,8 @@ asmlinkage void __sched schedule(void) ...@@ -5378,6 +5583,8 @@ asmlinkage void __sched schedule(void)
} else } else
spin_unlock_irq(&rq->lock); spin_unlock_irq(&rq->lock);
post_schedule(rq);
if (unlikely(reacquire_kernel_lock(current) < 0)) if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible; goto need_resched_nonpreemptible;
...@@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, ...@@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
unsigned long flags; unsigned long flags;
const struct sched_class *prev_class = p->sched_class; const struct sched_class *prev_class = p->sched_class;
struct rq *rq; struct rq *rq;
int reset_on_fork;
/* may grab non-irq protected spin_locks */ /* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt()); BUG_ON(in_interrupt());
recheck: recheck:
/* double check policy once rq lock held */ /* double check policy once rq lock held */
if (policy < 0) if (policy < 0) {
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy; policy = oldpolicy = p->policy;
else if (policy != SCHED_FIFO && policy != SCHED_RR && } else {
policy != SCHED_NORMAL && policy != SCHED_BATCH && reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
policy != SCHED_IDLE) policy &= ~SCHED_RESET_ON_FORK;
return -EINVAL;
if (policy != SCHED_FIFO && policy != SCHED_RR &&
policy != SCHED_NORMAL && policy != SCHED_BATCH &&
policy != SCHED_IDLE)
return -EINVAL;
}
/* /*
* Valid priorities for SCHED_FIFO and SCHED_RR are * Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
...@@ -6177,6 +6392,10 @@ static int __sched_setscheduler(struct task_struct *p, int policy, ...@@ -6177,6 +6392,10 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
/* can't change other user's priorities */ /* can't change other user's priorities */
if (!check_same_owner(p)) if (!check_same_owner(p))
return -EPERM; return -EPERM;
/* Normal users shall not reset the sched_reset_on_fork flag */
if (p->sched_reset_on_fork && !reset_on_fork)
return -EPERM;
} }
if (user) { if (user) {
...@@ -6220,6 +6439,8 @@ static int __sched_setscheduler(struct task_struct *p, int policy, ...@@ -6220,6 +6439,8 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
if (running) if (running)
p->sched_class->put_prev_task(rq, p); p->sched_class->put_prev_task(rq, p);
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio; oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority); __setscheduler(rq, p, policy, param->sched_priority);
...@@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ...@@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
if (p) { if (p) {
retval = security_task_getscheduler(p); retval = security_task_getscheduler(p);
if (!retval) if (!retval)
retval = p->policy; retval = p->policy
| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
} }
read_unlock(&tasklist_lock); read_unlock(&tasklist_lock);
return retval; return retval;
} }
/** /**
* sys_sched_getscheduler - get the RT priority of a thread * sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question. * @pid: the pid in question.
* @param: structure containing the RT priority. * @param: structure containing the RT priority.
*/ */
...@@ -6571,19 +6793,9 @@ static inline int should_resched(void) ...@@ -6571,19 +6793,9 @@ static inline int should_resched(void)
static void __cond_resched(void) static void __cond_resched(void)
{ {
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP add_preempt_count(PREEMPT_ACTIVE);
__might_sleep(__FILE__, __LINE__); schedule();
#endif sub_preempt_count(PREEMPT_ACTIVE);
/*
* The BKS might be reacquired before we have dropped
* PREEMPT_ACTIVE, which could trigger a second
* cond_resched() call.
*/
do {
add_preempt_count(PREEMPT_ACTIVE);
schedule();
sub_preempt_count(PREEMPT_ACTIVE);
} while (need_resched());
} }
int __sched _cond_resched(void) int __sched _cond_resched(void)
...@@ -6597,14 +6809,14 @@ int __sched _cond_resched(void) ...@@ -6597,14 +6809,14 @@ int __sched _cond_resched(void)
EXPORT_SYMBOL(_cond_resched); EXPORT_SYMBOL(_cond_resched);
/* /*
* cond_resched_lock() - if a reschedule is pending, drop the given lock, * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock. * call schedule, and on return reacquire the lock.
* *
* This works OK both with and without CONFIG_PREEMPT. We do strange low-level * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
* operations here to prevent schedule() from being called twice (once via * operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand). * spin_unlock(), once by hand).
*/ */
int cond_resched_lock(spinlock_t *lock) int __cond_resched_lock(spinlock_t *lock)
{ {
int resched = should_resched(); int resched = should_resched();
int ret = 0; int ret = 0;
...@@ -6622,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock) ...@@ -6622,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock)
} }
return ret; return ret;
} }
EXPORT_SYMBOL(cond_resched_lock); EXPORT_SYMBOL(__cond_resched_lock);
int __sched cond_resched_softirq(void) int __sched __cond_resched_softirq(void)
{ {
BUG_ON(!in_softirq()); BUG_ON(!in_softirq());
...@@ -6636,7 +6848,7 @@ int __sched cond_resched_softirq(void) ...@@ -6636,7 +6848,7 @@ int __sched cond_resched_softirq(void)
} }
return 0; return 0;
} }
EXPORT_SYMBOL(cond_resched_softirq); EXPORT_SYMBOL(__cond_resched_softirq);
/** /**
* yield - yield the current processor to other threads. * yield - yield the current processor to other threads.
...@@ -6660,11 +6872,13 @@ EXPORT_SYMBOL(yield); ...@@ -6660,11 +6872,13 @@ EXPORT_SYMBOL(yield);
*/ */
void __sched io_schedule(void) void __sched io_schedule(void)
{ {
struct rq *rq = &__raw_get_cpu_var(runqueues); struct rq *rq = raw_rq();
delayacct_blkio_start(); delayacct_blkio_start();
atomic_inc(&rq->nr_iowait); atomic_inc(&rq->nr_iowait);
current->in_iowait = 1;
schedule(); schedule();
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait); atomic_dec(&rq->nr_iowait);
delayacct_blkio_end(); delayacct_blkio_end();
} }
...@@ -6672,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule); ...@@ -6672,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout) long __sched io_schedule_timeout(long timeout)
{ {
struct rq *rq = &__raw_get_cpu_var(runqueues); struct rq *rq = raw_rq();
long ret; long ret;
delayacct_blkio_start(); delayacct_blkio_start();
atomic_inc(&rq->nr_iowait); atomic_inc(&rq->nr_iowait);
current->in_iowait = 1;
ret = schedule_timeout(timeout); ret = schedule_timeout(timeout);
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait); atomic_dec(&rq->nr_iowait);
delayacct_blkio_end(); delayacct_blkio_end();
return ret; return ret;
...@@ -6994,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ...@@ -6994,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
/* Need help from migration thread: drop lock and wait. */ /* Need help from migration thread: drop lock and wait. */
struct task_struct *mt = rq->migration_thread;
get_task_struct(mt);
task_rq_unlock(rq, &flags); task_rq_unlock(rq, &flags);
wake_up_process(rq->migration_thread); wake_up_process(rq->migration_thread);
put_task_struct(mt);
wait_for_completion(&req.done); wait_for_completion(&req.done);
tlb_migrate_finish(p->mm); tlb_migrate_finish(p->mm);
return 0; return 0;
...@@ -7642,7 +7862,7 @@ static int __init migration_init(void) ...@@ -7642,7 +7862,7 @@ static int __init migration_init(void)
migration_call(&migration_notifier, CPU_ONLINE, cpu); migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier); register_cpu_notifier(&migration_notifier);
return err; return 0;
} }
early_initcall(migration_init); early_initcall(migration_init);
#endif #endif
...@@ -7689,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ...@@ -7689,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break; break;
} }
if (!group->__cpu_power) { if (!group->cpu_power) {
printk(KERN_CONT "\n"); printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not " printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n"); "set\n");
...@@ -7713,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ...@@ -7713,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str); printk(KERN_CONT " %s", str);
if (group->__cpu_power != SCHED_LOAD_SCALE) { if (group->cpu_power != SCHED_LOAD_SCALE) {
printk(KERN_CONT " (__cpu_power = %d)", printk(KERN_CONT " (cpu_power = %d)",
group->__cpu_power); group->cpu_power);
} }
group = group->next; group = group->next;
...@@ -7858,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) ...@@ -7858,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
rq->rd = rd; rq->rd = rd;
cpumask_set_cpu(rq->cpu, rd->span); cpumask_set_cpu(rq->cpu, rd->span);
if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq); set_rq_online(rq);
spin_unlock_irqrestore(&rq->lock, flags); spin_unlock_irqrestore(&rq->lock, flags);
...@@ -8000,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span, ...@@ -8000,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span,
continue; continue;
cpumask_clear(sched_group_cpus(sg)); cpumask_clear(sched_group_cpus(sg));
sg->__cpu_power = 0; sg->cpu_power = 0;
for_each_cpu(j, span) { for_each_cpu(j, span) {
if (group_fn(j, cpu_map, NULL, tmpmask) != group) if (group_fn(j, cpu_map, NULL, tmpmask) != group)
...@@ -8108,6 +8328,39 @@ struct static_sched_domain { ...@@ -8108,6 +8328,39 @@ struct static_sched_domain {
DECLARE_BITMAP(span, CONFIG_NR_CPUS); DECLARE_BITMAP(span, CONFIG_NR_CPUS);
}; };
/*
 * Scratch state bundled together for building sched domains, so that it can
 * be handed around as a single pointer (see build_numa_sched_groups()).
 * NOTE(review): allocation and release of these members are not visible in
 * this chunk - confirm against the setup/teardown code.
 */
struct s_data {
#ifdef CONFIG_NUMA
/* NUMA-only members */
int sd_allnodes;
/* CPUs spanned by the node-level domain currently being built */
cpumask_var_t domainspan;
/* CPUs that already belong to a group of the current node's list */
cpumask_var_t covered;
/* complement of 'covered', recomputed per iteration */
cpumask_var_t notcovered;
#endif
/* CPUs of the node being processed, restricted to the cpu_map */
cpumask_var_t nodemask;
/* presumably per-core / per-sibling scratch masks - TODO confirm */
cpumask_var_t this_sibling_map;
cpumask_var_t this_core_map;
cpumask_var_t send_covered;
/* general-purpose scratch mask */
cpumask_var_t tmpmask;
/* per-node head of the circular sched_group list (NULL for empty nodes) */
struct sched_group **sched_group_nodes;
struct root_domain *rd;
};
/*
 * Names for the allocation stages associated with struct s_data members.
 * NOTE(review): the ordering looks deliberate (presumably the reverse of the
 * order in which things are allocated, so cleanup can start at a given stage
 * and fall through) - confirm against the code that consumes this enum
 * before reordering or inserting values.
 */
enum s_alloc {
sa_sched_groups = 0,
sa_rootdomain,
sa_tmpmask,
sa_send_covered,
sa_this_core_map,
sa_this_sibling_map,
sa_nodemask,
sa_sched_group_nodes,
#ifdef CONFIG_NUMA
sa_notcovered,
sa_covered,
sa_domainspan,
#endif
/* last value: no stage */
sa_none,
};
/* /*
* SMT sched-domains: * SMT sched-domains:
*/ */
...@@ -8225,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) ...@@ -8225,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
continue; continue;
} }
sg_inc_cpu_power(sg, sd->groups->__cpu_power); sg->cpu_power += sd->groups->cpu_power;
} }
sg = sg->next; sg = sg->next;
} while (sg != group_head); } while (sg != group_head);
} }
/*
 * build_numa_sched_groups - build the circular sched_group list for node @num.
 * @d:       scratch data; uses the covered/notcovered/nodemask/domainspan/
 *           tmpmask cpumasks and stores the list head in
 *           d->sched_group_nodes[@num].
 * @cpu_map: CPUs that take part in the domains being built.
 * @num:     node whose group list is built.
 *
 * The first group holds the CPUs of @num itself (restricted to @cpu_map) and
 * becomes ->groups of every such CPU's node_domains entry.  Further groups
 * are appended, one per other node, for those CPUs of the node's domain span
 * that are not yet covered.  All groups start with cpu_power == 0.
 *
 * Returns 0 on success (including the case where @num has no CPUs in
 * @cpu_map, in which case the list head is set to NULL) and -ENOMEM if a
 * group cannot be allocated.  On failure, groups allocated so far stay
 * linked from d->sched_group_nodes[@num]; releasing them is presumably the
 * caller's job - not visible here.
 */
static int build_numa_sched_groups(struct s_data *d,
				   const struct cpumask *cpu_map, int num)
{
	struct sched_domain *sd;
	struct sched_group *sg, *prev;
	int n, j;

	cpumask_clear(d->covered);
	cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
	if (cpumask_empty(d->nodemask)) {
		d->sched_group_nodes[num] = NULL;
		goto out;
	}

	sched_domain_node_span(num, d->domainspan);
	cpumask_and(d->domainspan, d->domainspan, cpu_map);

	sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
			  GFP_KERNEL, num);
	if (!sg) {
		printk(KERN_WARNING "Can not alloc domain group for node %d\n",
		       num);
		return -ENOMEM;
	}
	d->sched_group_nodes[num] = sg;

	/* Every CPU of this node points its node-level domain at the head. */
	for_each_cpu(j, d->nodemask) {
		sd = &per_cpu(node_domains, j).sd;
		sd->groups = sg;
	}

	sg->cpu_power = 0;
	cpumask_copy(sched_group_cpus(sg), d->nodemask);
	sg->next = sg;		/* single-element ring */
	cpumask_or(d->covered, d->covered, d->nodemask);

	prev = sg;
	for (j = 0; j < nr_node_ids; j++) {
		/* Visit nodes starting at @num and wrapping around. */
		n = (num + j) % nr_node_ids;
		cpumask_complement(d->notcovered, d->covered);
		cpumask_and(d->tmpmask, d->notcovered, cpu_map);
		cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
		/* Whole span covered: nothing left to add. */
		if (cpumask_empty(d->tmpmask))
			break;
		cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
		/* Node n contributes no uncovered CPUs. */
		if (cpumask_empty(d->tmpmask))
			continue;
		/* Memory is placed on @num, the node that owns this list. */
		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
				  GFP_KERNEL, num);
		if (!sg) {
			/* Report the node being covered, not the loop offset. */
			printk(KERN_WARNING
			       "Can not alloc domain group for node %d\n", n);
			return -ENOMEM;
		}
		sg->cpu_power = 0;
		cpumask_copy(sched_group_cpus(sg), d->tmpmask);
		/* Insert after 'prev', keeping the ring closed. */
		sg->next = prev->next;
		cpumask_or(d->covered, d->covered, d->tmpmask);
		prev->next = sg;
		prev = sg;
	}
out:
	return 0;
}
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
...@@ -8283,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, ...@@ -8283,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
* there are asymmetries in the topology. If there are asymmetries, group * there are asymmetries in the topology. If there are asymmetries, group
* having more cpu_power will pickup more load compared to the group having * having more cpu_power will pickup more load compared to the group having
* less cpu_power. * less cpu_power.
*
* cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
* the maximum number of tasks a group can handle in the presence of other idle
* or lightly loaded groups in the same sched domain.
*/ */
static void init_sched_groups_power(int cpu, struct sched_domain *sd) static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{ {
struct sched_domain *child; struct sched_domain *child;
struct sched_group *group; struct sched_group *group;
long power;
int weight;
WARN_ON(!sd || !sd->groups); WARN_ON(!sd || !sd->groups);
...@@ -8300,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) ...@@ -8300,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
child = sd->child; child = sd->child;
sd->groups->__cpu_power = 0; sd->groups->cpu_power = 0;
/* if (!child) {
* For perf policy, if the groups in child domain share resources power = SCHED_LOAD_SCALE;
* (for example cores sharing some portions of the cache hierarchy weight = cpumask_weight(sched_domain_span(sd));
* or SMT), then set this domain groups cpu_power such that each group /*
* can handle only one task, when there are other idle groups in the * SMT siblings share the power of a single core.
* same sched domain. * Usually multiple threads get a better yield out of
*/ * that one core than a single thread would have,
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && * reflect that in sd->smt_gain.
(child->flags & */
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); power *= sd->smt_gain;
power /= weight;
power >>= SCHED_LOAD_SHIFT;
}
sd->groups->cpu_power += power;
return; return;
} }
/* /*
* add cpu_power of each child group to this groups cpu_power * Add cpu_power of each child group to this groups cpu_power.
*/ */
group = child->groups; group = child->groups;
do { do {
sg_inc_cpu_power(sd->groups, group->__cpu_power); sd->groups->cpu_power += group->cpu_power;
group = group->next; group = group->next;
} while (group != child->groups); } while (group != child->groups);
} }
...@@ -8395,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd, ...@@ -8395,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd,
} }
} }
/* static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
* Build sched domains for a given set of cpus and attach the sched domains const struct cpumask *cpu_map)
* to the individual cpus {
*/ switch (what) {
static int __build_sched_domains(const struct cpumask *cpu_map, case sa_sched_groups:
struct sched_domain_attr *attr) free_sched_groups(cpu_map, d->tmpmask); /* fall through */
{ d->sched_group_nodes = NULL;
int i, err = -ENOMEM; case sa_rootdomain:
struct root_domain *rd; free_rootdomain(d->rd); /* fall through */
cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, case sa_tmpmask:
tmpmask; free_cpumask_var(d->tmpmask); /* fall through */
case sa_send_covered:
free_cpumask_var(d->send_covered); /* fall through */
case sa_this_core_map:
free_cpumask_var(d->this_core_map); /* fall through */
case sa_this_sibling_map:
free_cpumask_var(d->this_sibling_map); /* fall through */
case sa_nodemask:
free_cpumask_var(d->nodemask); /* fall through */
case sa_sched_group_nodes:
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
cpumask_var_t domainspan, covered, notcovered; kfree(d->sched_group_nodes); /* fall through */
struct sched_group **sched_group_nodes = NULL; case sa_notcovered:
int sd_allnodes = 0; free_cpumask_var(d->notcovered); /* fall through */
case sa_covered:
if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) free_cpumask_var(d->covered); /* fall through */
goto out; case sa_domainspan:
if (!alloc_cpumask_var(&covered, GFP_KERNEL)) free_cpumask_var(d->domainspan); /* fall through */
goto free_domainspan; #endif
if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) case sa_none:
goto free_covered; break;
#endif }
}
if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
goto free_notcovered;
if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
goto free_nodemask;
if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
goto free_this_sibling_map;
if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
goto free_this_core_map;
if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
goto free_send_covered;
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
/* if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
* Allocate the per-node list of sched groups return sa_none;
*/ if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), return sa_domainspan;
GFP_KERNEL); if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
if (!sched_group_nodes) { return sa_covered;
/* Allocate the per-node list of sched groups */
d->sched_group_nodes = kcalloc(nr_node_ids,
sizeof(struct sched_group *), GFP_KERNEL);
if (!d->sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n"); printk(KERN_WARNING "Can not alloc sched group node list\n");
goto free_tmpmask; return sa_notcovered;
} }
#endif sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
#endif
rd = alloc_rootdomain(); if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
if (!rd) { return sa_sched_group_nodes;
if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
return sa_nodemask;
if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
return sa_this_sibling_map;
if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
return sa_this_core_map;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
return sa_send_covered;
d->rd = alloc_rootdomain();
if (!d->rd) {
printk(KERN_WARNING "Cannot alloc root domain\n"); printk(KERN_WARNING "Cannot alloc root domain\n");
goto free_sched_groups; return sa_tmpmask;
} }
return sa_rootdomain;
}
static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
{
struct sched_domain *sd = NULL;
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; struct sched_domain *parent;
#endif
/*
* Set up domains for cpus specified by the cpu_map.
*/
for_each_cpu(i, cpu_map) {
struct sched_domain *sd = NULL, *p;
cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); d->sd_allnodes = 0;
if (cpumask_weight(cpu_map) >
#ifdef CONFIG_NUMA SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
if (cpumask_weight(cpu_map) > sd = &per_cpu(allnodes_domains, i).sd;
SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { SD_INIT(sd, ALLNODES);
sd = &per_cpu(allnodes_domains, i).sd;
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
cpumask_copy(sched_domain_span(sd), cpu_map);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
p = sd;
sd_allnodes = 1;
} else
p = NULL;
sd = &per_cpu(node_domains, i).sd;
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr); set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); cpumask_copy(sched_domain_span(sd), cpu_map);
sd->parent = p; cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
if (p) d->sd_allnodes = 1;
p->child = sd; }
cpumask_and(sched_domain_span(sd), parent = sd;
sched_domain_span(sd), cpu_map);
sd = &per_cpu(node_domains, i).sd;
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
sd->parent = parent;
if (parent)
parent->child = sd;
cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
#endif #endif
return sd;
}
p = sd; static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
sd = &per_cpu(phys_domains, i).sd; const struct cpumask *cpu_map, struct sched_domain_attr *attr,
SD_INIT(sd, CPU); struct sched_domain *parent, int i)
set_domain_attribute(sd, attr); {
cpumask_copy(sched_domain_span(sd), nodemask); struct sched_domain *sd;
sd->parent = p; sd = &per_cpu(phys_domains, i).sd;
if (p) SD_INIT(sd, CPU);
p->child = sd; set_domain_attribute(sd, attr);
cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); cpumask_copy(sched_domain_span(sd), d->nodemask);
sd->parent = parent;
if (parent)
parent->child = sd;
cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
return sd;
}
static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *parent, int i)
{
struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_MC #ifdef CONFIG_SCHED_MC
p = sd; sd = &per_cpu(core_domains, i).sd;
sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC);
SD_INIT(sd, MC); set_domain_attribute(sd, attr);
set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
cpumask_and(sched_domain_span(sd), cpu_map, sd->parent = parent;
cpu_coregroup_mask(i)); parent->child = sd;
sd->parent = p; cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
p->child = sd;
cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
#endif #endif
return sd;
}
static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *parent, int i)
{
struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
p = sd; sd = &per_cpu(cpu_domains, i).sd;
sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING);
SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr);
set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
cpumask_and(sched_domain_span(sd), sd->parent = parent;
topology_thread_cpumask(i), cpu_map); parent->child = sd;
sd->parent = p; cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
p->child = sd;
cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
#endif #endif
} return sd;
}
static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
const struct cpumask *cpu_map, int cpu)
{
switch (l) {
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */ case SD_LV_SIBLING: /* set up CPU (sibling) groups */
for_each_cpu(i, cpu_map) { cpumask_and(d->this_sibling_map, cpu_map,
cpumask_and(this_sibling_map, topology_thread_cpumask(cpu));
topology_thread_cpumask(i), cpu_map); if (cpu == cpumask_first(d->this_sibling_map))
if (i != cpumask_first(this_sibling_map)) init_sched_build_groups(d->this_sibling_map, cpu_map,
continue; &cpu_to_cpu_group,
d->send_covered, d->tmpmask);
init_sched_build_groups(this_sibling_map, cpu_map, break;
&cpu_to_cpu_group,
send_covered, tmpmask);
}
#endif #endif
#ifdef CONFIG_SCHED_MC #ifdef CONFIG_SCHED_MC
/* Set up multi-core groups */ case SD_LV_MC: /* set up multi-core groups */
for_each_cpu(i, cpu_map) { cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); if (cpu == cpumask_first(d->this_core_map))
if (i != cpumask_first(this_core_map)) init_sched_build_groups(d->this_core_map, cpu_map,
continue; &cpu_to_core_group,
d->send_covered, d->tmpmask);
init_sched_build_groups(this_core_map, cpu_map, break;
&cpu_to_core_group,
send_covered, tmpmask);
}
#endif #endif
case SD_LV_CPU: /* set up physical groups */
/* Set up physical groups */ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
for (i = 0; i < nr_node_ids; i++) { if (!cpumask_empty(d->nodemask))
cpumask_and(nodemask, cpumask_of_node(i), cpu_map); init_sched_build_groups(d->nodemask, cpu_map,
if (cpumask_empty(nodemask)) &cpu_to_phys_group,
continue; d->send_covered, d->tmpmask);
break;
init_sched_build_groups(nodemask, cpu_map,
&cpu_to_phys_group,
send_covered, tmpmask);
}
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
/* Set up node groups */ case SD_LV_ALLNODES:
if (sd_allnodes) { init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
init_sched_build_groups(cpu_map, cpu_map, d->send_covered, d->tmpmask);
&cpu_to_allnodes_group, break;
send_covered, tmpmask); #endif
default:
break;
} }
}
for (i = 0; i < nr_node_ids; i++) { /*
/* Set up node groups */ * Build sched domains for a given set of cpus and attach the sched domains
struct sched_group *sg, *prev; * to the individual cpus
int j; */
static int __build_sched_domains(const struct cpumask *cpu_map,
cpumask_clear(covered); struct sched_domain_attr *attr)
cpumask_and(nodemask, cpumask_of_node(i), cpu_map); {
if (cpumask_empty(nodemask)) { enum s_alloc alloc_state = sa_none;
sched_group_nodes[i] = NULL; struct s_data d;
continue; struct sched_domain *sd;
} int i;
#ifdef CONFIG_NUMA
d.sd_allnodes = 0;
#endif
sched_domain_node_span(i, domainspan); alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
cpumask_and(domainspan, domainspan, cpu_map); if (alloc_state != sa_rootdomain)
goto error;
alloc_state = sa_sched_groups;
sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), /*
GFP_KERNEL, i); * Set up domains for cpus specified by the cpu_map.
if (!sg) { */
printk(KERN_WARNING "Can not alloc domain group for " for_each_cpu(i, cpu_map) {
"node %d\n", i); cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
goto error; cpu_map);
}
sched_group_nodes[i] = sg;
for_each_cpu(j, nodemask) {
struct sched_domain *sd;
sd = &per_cpu(node_domains, j).sd; sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
sd->groups = sg; sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
} sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
sg->__cpu_power = 0; sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
cpumask_copy(sched_group_cpus(sg), nodemask); }
sg->next = sg;
cpumask_or(covered, covered, nodemask);
prev = sg;
for (j = 0; j < nr_node_ids; j++) { for_each_cpu(i, cpu_map) {
int n = (i + j) % nr_node_ids; build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
}
cpumask_complement(notcovered, covered); /* Set up physical groups */
cpumask_and(tmpmask, notcovered, cpu_map); for (i = 0; i < nr_node_ids; i++)
cpumask_and(tmpmask, tmpmask, domainspan); build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
if (cpumask_empty(tmpmask))
break;
cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); #ifdef CONFIG_NUMA
if (cpumask_empty(tmpmask)) /* Set up node groups */
continue; if (d.sd_allnodes)
build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
sg = kmalloc_node(sizeof(struct sched_group) + for (i = 0; i < nr_node_ids; i++)
cpumask_size(), if (build_numa_sched_groups(&d, cpu_map, i))
GFP_KERNEL, i); goto error;
if (!sg) {
printk(KERN_WARNING
"Can not alloc domain group for node %d\n", j);
goto error;
}
sg->__cpu_power = 0;
cpumask_copy(sched_group_cpus(sg), tmpmask);
sg->next = prev->next;
cpumask_or(covered, covered, tmpmask);
prev->next = sg;
prev = sg;
}
}
#endif #endif
/* Calculate CPU power for physical packages and nodes */ /* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; sd = &per_cpu(cpu_domains, i).sd;
init_sched_groups_power(i, sd); init_sched_groups_power(i, sd);
} }
#endif #endif
#ifdef CONFIG_SCHED_MC #ifdef CONFIG_SCHED_MC
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
struct sched_domain *sd = &per_cpu(core_domains, i).sd; sd = &per_cpu(core_domains, i).sd;
init_sched_groups_power(i, sd); init_sched_groups_power(i, sd);
} }
#endif #endif
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
struct sched_domain *sd = &per_cpu(phys_domains, i).sd; sd = &per_cpu(phys_domains, i).sd;
init_sched_groups_power(i, sd); init_sched_groups_power(i, sd);
} }
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
for (i = 0; i < nr_node_ids; i++) for (i = 0; i < nr_node_ids; i++)
init_numa_sched_groups_power(sched_group_nodes[i]); init_numa_sched_groups_power(d.sched_group_nodes[i]);
if (sd_allnodes) { if (d.sd_allnodes) {
struct sched_group *sg; struct sched_group *sg;
cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
tmpmask); d.tmpmask);
init_numa_sched_groups_power(sg); init_numa_sched_groups_power(sg);
} }
#endif #endif
/* Attach the domains */ /* Attach the domains */
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i).sd; sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC) #elif defined(CONFIG_SCHED_MC)
...@@ -8676,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, ...@@ -8676,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
#else #else
sd = &per_cpu(phys_domains, i).sd; sd = &per_cpu(phys_domains, i).sd;
#endif #endif
cpu_attach_domain(sd, rd, i); cpu_attach_domain(sd, d.rd, i);
} }
err = 0; d.sched_group_nodes = NULL; /* don't free this we still need it */
__free_domain_allocs(&d, sa_tmpmask, cpu_map);
free_tmpmask: return 0;
free_cpumask_var(tmpmask);
free_send_covered:
free_cpumask_var(send_covered);
free_this_core_map:
free_cpumask_var(this_core_map);
free_this_sibling_map:
free_cpumask_var(this_sibling_map);
free_nodemask:
free_cpumask_var(nodemask);
free_notcovered:
#ifdef CONFIG_NUMA
free_cpumask_var(notcovered);
free_covered:
free_cpumask_var(covered);
free_domainspan:
free_cpumask_var(domainspan);
out:
#endif
return err;
free_sched_groups:
#ifdef CONFIG_NUMA
kfree(sched_group_nodes);
#endif
goto free_tmpmask;
#ifdef CONFIG_NUMA
error: error:
free_sched_groups(cpu_map, tmpmask); __free_domain_allocs(&d, alloc_state, cpu_map);
free_rootdomain(rd); return -ENOMEM;
goto free_tmpmask;
#endif
} }
static int build_sched_domains(const struct cpumask *cpu_map) static int build_sched_domains(const struct cpumask *cpu_map)
...@@ -9321,11 +9618,11 @@ void __init sched_init(void) ...@@ -9321,11 +9618,11 @@ void __init sched_init(void)
* system cpu resource, based on the weight assigned to root * system cpu resource, based on the weight assigned to root
* user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
* by letting tasks of init_task_group sit in a separate cfs_rq * by letting tasks of init_task_group sit in a separate cfs_rq
* (init_cfs_rq) and having one entity represent this group of * (init_tg_cfs_rq) and having one entity represent this group of
* tasks in rq->cfs (i.e init_task_group->se[] != NULL). * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
*/ */
init_tg_cfs_entry(&init_task_group, init_tg_cfs_entry(&init_task_group,
&per_cpu(init_cfs_rq, i), &per_cpu(init_tg_cfs_rq, i),
&per_cpu(init_sched_entity, i), i, 1, &per_cpu(init_sched_entity, i), i, 1,
root_task_group.se[i]); root_task_group.se[i]);
...@@ -9351,6 +9648,7 @@ void __init sched_init(void) ...@@ -9351,6 +9648,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
rq->sd = NULL; rq->sd = NULL;
rq->rd = NULL; rq->rd = NULL;
rq->post_schedule = 0;
rq->active_balance = 0; rq->active_balance = 0;
rq->next_balance = jiffies; rq->next_balance = jiffies;
rq->push_cpu = 0; rq->push_cpu = 0;
...@@ -9415,13 +9713,20 @@ void __init sched_init(void) ...@@ -9415,13 +9713,20 @@ void __init sched_init(void)
} }
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
void __might_sleep(char *file, int line) static inline int preempt_count_equals(int preempt_offset)
{
int nested = preempt_count() & ~PREEMPT_ACTIVE;
return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
}
void __might_sleep(char *file, int line, int preempt_offset)
{ {
#ifdef in_atomic #ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */ static unsigned long prev_jiffy; /* ratelimiting */
if ((!in_atomic() && !irqs_disabled()) || if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
system_state != SYSTEM_RUNNING || oops_in_progress) system_state != SYSTEM_RUNNING || oops_in_progress)
return; return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return; return;
......
...@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) ...@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
/* /*
* If the cpu was currently mapped to a different value, we * If the cpu was currently mapped to a different value, we
* first need to unmap the old value * need to map it to the new value then remove the old value.
* Note, we must add the new value first, otherwise we risk the
* cpu being cleared from pri_active, and this cpu could be
* missed for a push or pull.
*/ */
if (likely(oldpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
spin_lock_irqsave(&vec->lock, flags);
vec->count--;
if (!vec->count)
clear_bit(oldpri, cp->pri_active);
cpumask_clear_cpu(cpu, vec->mask);
spin_unlock_irqrestore(&vec->lock, flags);
}
if (likely(newpri != CPUPRI_INVALID)) { if (likely(newpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
...@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) ...@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
spin_unlock_irqrestore(&vec->lock, flags); spin_unlock_irqrestore(&vec->lock, flags);
} }
if (likely(oldpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
spin_lock_irqsave(&vec->lock, flags);
vec->count--;
if (!vec->count)
clear_bit(oldpri, cp->pri_active);
cpumask_clear_cpu(cpu, vec->mask);
spin_unlock_irqrestore(&vec->lock, flags);
}
*currpri = newpri; *currpri = newpri;
} }
......
...@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.wait_max); PN(se.wait_max);
PN(se.wait_sum); PN(se.wait_sum);
P(se.wait_count); P(se.wait_count);
PN(se.iowait_sum);
P(se.iowait_count);
P(sched_info.bkl_count); P(sched_info.bkl_count);
P(se.nr_migrations); P(se.nr_migrations);
P(se.nr_migrations_cold); P(se.nr_migrations_cold);
...@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p) ...@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
p->se.wait_max = 0; p->se.wait_max = 0;
p->se.wait_sum = 0; p->se.wait_sum = 0;
p->se.wait_count = 0; p->se.wait_count = 0;
p->se.iowait_sum = 0;
p->se.iowait_count = 0;
p->se.sleep_max = 0; p->se.sleep_max = 0;
p->se.sum_sleep_runtime = 0; p->se.sum_sleep_runtime = 0;
p->se.block_max = 0; p->se.block_max = 0;
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
/* /*
* Targeted preemption latency for CPU-bound tasks: * Targeted preemption latency for CPU-bound tasks:
* (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
* *
* NOTE: this latency value is not the same as the concept of * NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length * 'timeslice length' - timeslices in CFS are of variable length
...@@ -34,13 +34,13 @@ ...@@ -34,13 +34,13 @@
* (to see the precise effective timeslice length of your workload, * (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field) * run vmstat and monitor the context-switches (cs) field)
*/ */
unsigned int sysctl_sched_latency = 20000000ULL; unsigned int sysctl_sched_latency = 5000000ULL;
/* /*
* Minimal preemption granularity for CPU-bound tasks: * Minimal preemption granularity for CPU-bound tasks:
* (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/ */
unsigned int sysctl_sched_min_granularity = 4000000ULL; unsigned int sysctl_sched_min_granularity = 1000000ULL;
/* /*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
...@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL; ...@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
static unsigned int sched_nr_latency = 5; static unsigned int sched_nr_latency = 5;
/* /*
* After fork, child runs first. (default) If set to 0 then * After fork, child runs first. If set to 0 (default) then
* parent will (try to) run first. * parent will (try to) run first.
*/ */
const_debug unsigned int sysctl_sched_child_runs_first = 1; unsigned int sysctl_sched_child_runs_first __read_mostly;
/* /*
* sys_sched_yield() compat mode * sys_sched_yield() compat mode
...@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; ...@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
/* /*
* SCHED_OTHER wake-up granularity. * SCHED_OTHER wake-up granularity.
* (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
* *
* This option delays the preemption effects of decoupled workloads * This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still * and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies. * have immediate wakeup/sleep latencies.
*/ */
unsigned int sysctl_sched_wakeup_granularity = 5000000UL; unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
...@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; ...@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
* CFS operations on generic schedulable entities: * CFS operations on generic schedulable entities:
*/ */
static inline struct task_struct *task_of(struct sched_entity *se)
{
return container_of(se, struct task_struct, se);
}
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */ /* cpu runqueue to which this cfs_rq is attached */
...@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) ...@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
/* An entity is a task if it doesn't "own" a runqueue */ /* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q) #define entity_is_task(se) (!se->my_q)
/*
 * Map a scheduling entity back to the task_struct that embeds it.
 * With CONFIG_SCHED_DEBUG a one-time warning is emitted when @se is not
 * a task entity according to entity_is_task().
 */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	struct task_struct *owner = container_of(se, struct task_struct, se);

#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return owner;
}
/* Walk up scheduling entities hierarchy */ /* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \ #define for_each_sched_entity(se) \
for (; se; se = se->parent) for (; se; se = se->parent)
...@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) ...@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
} }
} }
#else /* CONFIG_FAIR_GROUP_SCHED */ #else /* !CONFIG_FAIR_GROUP_SCHED */
/* Map a scheduling entity back to the task_struct that embeds it. */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	struct task_struct *owner;

	owner = container_of(se, struct task_struct, se);
	return owner;
}
static inline struct rq *rq_of(struct cfs_rq *cfs_rq) static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{ {
...@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
schedstat_set(se->wait_count, se->wait_count + 1); schedstat_set(se->wait_count, se->wait_count + 1);
schedstat_set(se->wait_sum, se->wait_sum + schedstat_set(se->wait_sum, se->wait_sum +
rq_of(cfs_rq)->clock - se->wait_start); rq_of(cfs_rq)->clock - se->wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
rq_of(cfs_rq)->clock - se->wait_start);
}
#endif
schedstat_set(se->wait_start, 0); schedstat_set(se->wait_start, 0);
} }
...@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sleep_start = 0; se->sleep_start = 0;
se->sum_sleep_runtime += delta; se->sum_sleep_runtime += delta;
if (tsk) if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1); account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
} }
if (se->block_start) { if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start; u64 delta = rq_of(cfs_rq)->clock - se->block_start;
...@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sum_sleep_runtime += delta; se->sum_sleep_runtime += delta;
if (tsk) { if (tsk) {
if (tsk->in_iowait) {
se->iowait_sum += delta;
se->iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
/* /*
* Blocking time is in units of nanosecs, so shift by * Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the * 20 to get a milliseconds-range estimation of the
...@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) ...@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime -= thresh; vruntime -= thresh;
} }
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
} }
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
se->vruntime = vruntime; se->vruntime = vruntime;
} }
...@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq) ...@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq)
* search starts with cpus closest then further out as needed, * search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu. * so we always favor a closer, idle cpu.
* Domains may include CPUs that are not usable for migration, * Domains may include CPUs that are not usable for migration,
* hence we need to mask them out (cpu_active_mask) * hence we need to mask them out (rq->rd->online)
* *
* Returns the CPU we should wake onto. * Returns the CPU we should wake onto.
*/ */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE) #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
/*
 * Test whether @cpu is set in the ->rd->online mask of @rq.
 * Arguments are parenthesized so that any expression may be passed.
 */
#define cpu_rd_active(cpu, rq) cpumask_test_cpu((cpu), (rq)->rd->online)
static int wake_idle(int cpu, struct task_struct *p) static int wake_idle(int cpu, struct task_struct *p)
{ {
struct sched_domain *sd; struct sched_domain *sd;
int i; int i;
unsigned int chosen_wakeup_cpu; unsigned int chosen_wakeup_cpu;
int this_cpu; int this_cpu;
struct rq *task_rq = task_rq(p);
/* /*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
...@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p) ...@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p)
for_each_domain(cpu, sd) { for_each_domain(cpu, sd) {
if ((sd->flags & SD_WAKE_IDLE) if ((sd->flags & SD_WAKE_IDLE)
|| ((sd->flags & SD_WAKE_IDLE_FAR) || ((sd->flags & SD_WAKE_IDLE_FAR)
&& !task_hot(p, task_rq(p)->clock, sd))) { && !task_hot(p, task_rq->clock, sd))) {
for_each_cpu_and(i, sched_domain_span(sd), for_each_cpu_and(i, sched_domain_span(sd),
&p->cpus_allowed) { &p->cpus_allowed) {
if (cpu_active(i) && idle_cpu(i)) { if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
if (i != task_cpu(p)) { if (i != task_cpu(p)) {
schedstat_inc(p, schedstat_inc(p,
se.nr_wakeups_idle); se.nr_wakeups_idle);
...@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, ...@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
tg = task_group(p); tg = task_group(p);
weight = p->se.load.weight; weight = p->se.load.weight;
balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= /*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync clause above having dropped tl to 0, we'll always have
* an imbalance, but there's really nothing you can do about that, so
* that's good too.
*
* Otherwise check if the two cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
balanced = !tl ||
100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/* /*
...@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync) ...@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
this_rq = cpu_rq(this_cpu); this_rq = cpu_rq(this_cpu);
new_cpu = prev_cpu; new_cpu = prev_cpu;
if (prev_cpu == this_cpu)
goto out;
/* /*
* 'this_sd' is the first domain that both * 'this_sd' is the first domain that both
* this_cpu and prev_cpu are present in: * this_cpu and prev_cpu are present in:
...@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) ...@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
sched_info_queued(p); sched_info_queued(p);
update_curr(cfs_rq); update_curr(cfs_rq);
if (curr)
se->vruntime = curr->vruntime;
place_entity(cfs_rq, se, 1); place_entity(cfs_rq, se, 1);
/* 'curr' will be NULL if the child belongs to a different group */ /* 'curr' will be NULL if the child belongs to a different group */
......
SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
SCHED_FEAT(NORMALIZED_SLEEPER, 0) SCHED_FEAT(NORMALIZED_SLEEPER, 0)
SCHED_FEAT(ADAPTIVE_GRAN, 1) SCHED_FEAT(ADAPTIVE_GRAN, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1)
......
...@@ -3,15 +3,18 @@ ...@@ -3,15 +3,18 @@
* policies) * policies)
*/ */
#ifdef CONFIG_RT_GROUP_SCHED
#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{ {
#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
return container_of(rt_se, struct task_struct, rt); return container_of(rt_se, struct task_struct, rt);
} }
#ifdef CONFIG_RT_GROUP_SCHED
#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{ {
return rt_rq->rq; return rt_rq->rq;
...@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) ...@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
#define rt_entity_is_task(rt_se) (1) #define rt_entity_is_task(rt_se) (1)
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
return container_of(rt_se, struct task_struct, rt);
}
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{ {
return container_of(rt_rq, struct rq, rt); return container_of(rt_rq, struct rq, rt);
...@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) ...@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
} }
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
}
#else #else
static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
...@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq) ...@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = rq->clock; curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec); cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
if (!rt_bandwidth_enabled()) if (!rt_bandwidth_enabled())
return; return;
...@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) ...@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p); enqueue_pushable_task(rq, p);
inc_cpu_load(rq, p->se.load.weight);
} }
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
...@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) ...@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
dequeue_rt_entity(rt_se); dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p); dequeue_pushable_task(rq, p);
dec_cpu_load(rq, p->se.load.weight);
} }
/* /*
...@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) ...@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
if (p) if (p)
dequeue_pushable_task(rq, p); dequeue_pushable_task(rq, p);
#ifdef CONFIG_SMP
/*
* We detect this state here so that we can avoid taking the RQ
* lock again later if there is no need to push
*/
rq->post_schedule = has_pushable_tasks(rq);
#endif
return p; return p;
} }
...@@ -1161,13 +1180,6 @@ static int find_lowest_rq(struct task_struct *task) ...@@ -1161,13 +1180,6 @@ static int find_lowest_rq(struct task_struct *task)
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
return -1; /* No targets found */ return -1; /* No targets found */
/*
* Only consider CPUs that are usable for migration.
* I guess we might want to change cpupri_find() to ignore those
* in the first place.
*/
cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/* /*
* At this point we have built a mask of cpus representing the * At this point we have built a mask of cpus representing the
* lowest priority tasks in the system. Now we want to elect * lowest priority tasks in the system. Now we want to elect
...@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) ...@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq; return lowest_rq;
} }
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
}
static struct task_struct *pick_next_pushable_task(struct rq *rq) static struct task_struct *pick_next_pushable_task(struct rq *rq)
{ {
struct task_struct *p; struct task_struct *p;
...@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) ...@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
pull_rt_task(rq); pull_rt_task(rq);
} }
/*
* assumes rq->lock is held
*/
static int needs_post_schedule_rt(struct rq *rq)
{
return has_pushable_tasks(rq);
}
static void post_schedule_rt(struct rq *rq) static void post_schedule_rt(struct rq *rq)
{ {
/*
* This is only called if needs_post_schedule_rt() indicates that
* we need to push tasks away
*/
spin_lock_irq(&rq->lock);
push_rt_tasks(rq); push_rt_tasks(rq);
spin_unlock_irq(&rq->lock);
} }
/* /*
...@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = { ...@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = {
.rq_online = rq_online_rt, .rq_online = rq_online_rt,
.rq_offline = rq_offline_rt, .rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt, .pre_schedule = pre_schedule_rt,
.needs_post_schedule = needs_post_schedule_rt,
.post_schedule = post_schedule_rt, .post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt, .task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt, .switched_from = switched_from_rt,
......
...@@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ ...@@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
#endif #endif
static struct ctl_table kern_table[] = { static struct ctl_table kern_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
...@@ -297,14 +305,6 @@ static struct ctl_table kern_table[] = { ...@@ -297,14 +305,6 @@ static struct ctl_table kern_table[] = {
.strategy = &sysctl_intvec, .strategy = &sysctl_intvec,
.extra1 = &zero, .extra1 = &zero,
}, },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
.procname = "sched_features", .procname = "sched_features",
...@@ -329,6 +329,14 @@ static struct ctl_table kern_table[] = { ...@@ -329,6 +329,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = &proc_dointvec, .proc_handler = &proc_dointvec,
}, },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_time_avg",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
.procname = "timer_migration", .procname = "timer_migration",
......
...@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq) ...@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
if (cwq->wq->freezeable) if (cwq->wq->freezeable)
set_freezable(); set_freezable();
set_user_nice(current, -5);
for (;;) { for (;;) {
prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
if (!freezing(current) && if (!freezing(current) &&
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment