Commit b5869ce7 authored by Linus Torvalds's avatar Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched: (140 commits)
  sched: sync wakeups preempt too
  sched: affine sync wakeups
  sched: guest CPU accounting: maintain guest state in KVM
  sched: guest CPU accounting: maintain stats in account_system_time()
  sched: guest CPU accounting: add guest-CPU /proc/<pid>/stat fields
  sched: guest CPU accounting: add guest-CPU /proc/stat field
  sched: domain sysctl fixes: add terminator comment
  sched: domain sysctl fixes: do not crash on allocation failure
  sched: domain sysctl fixes: unregister the sysctl table before domains
  sched: domain sysctl fixes: use for_each_online_cpu()
  sched: domain sysctl fixes: use kcalloc()
  Make scheduler debug file operations const
  sched: enable wake-idle on CONFIG_SCHED_MC=y
  sched: reintroduce topology.h tunings
  sched: allow the immediate migration of cache-cold tasks
  sched: debug, improve migration statistics
  sched: debug: increase width of debug line
  sched: activate task_hot() only on fair-scheduled tasks
  sched: reintroduce cache-hot affinity
  sched: speed up context-switches a bit
  ...
parents df3d80f5 9c63d9c0
......@@ -117,3 +117,70 @@ Some implementation details:
iterators of the scheduling modules are used. The balancing code got
quite a bit simpler as a result.
Group scheduler extension to CFS
================================
Normally the scheduler operates on individual tasks and strives to provide
fair CPU time to each task. Sometimes, it may be desirable to group tasks
and provide fair CPU time to each such task group. For example, it may
be desirable to first provide fair CPU time to each user on the system
and then to each task belonging to a user.
CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
groups. At present, there are two (mutually exclusive) mechanisms to group
tasks for CPU bandwidth control purpose:
- Based on user id (CONFIG_FAIR_USER_SCHED)
In this option, tasks are grouped according to their user id.
- Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
This options lets the administrator create arbitrary groups
of tasks, using the "cgroup" pseudo filesystem. See
Documentation/cgroups.txt for more information about this
filesystem.
Only one of these options to group tasks can be chosen and not both.
Group scheduler tunables:
When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
each new user and a "cpu_share" file is added in that directory.
# cd /sys/kernel/uids
# cat 512/cpu_share # Display user 512's CPU share
1024
# echo 2048 > 512/cpu_share # Modify user 512's CPU share
# cat 512/cpu_share # Display user 512's CPU share
2048
#
CPU bandwidth between two users are divided in the ratio of their CPU shares.
For ex: if you would like user "root" to get twice the bandwidth of user
"guest", then set the cpu_share for both the users such that "root"'s
cpu_share is twice "guest"'s cpu_share
When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
for each group created using the pseudo filesystem. See example steps
below to create task groups and modify their CPU share using the "cgroups"
pseudo filesystem
# mkdir /dev/cpuctl
# mount -t cgroup -ocpu none /dev/cpuctl
# cd /dev/cpuctl
# mkdir multimedia # create "multimedia" group of tasks
# mkdir browser # create "browser" group of tasks
# #Configure the multimedia group to receive twice the CPU bandwidth
# #that of browser group
# echo 2048 > multimedia/cpu.shares
# echo 1024 > browser/cpu.shares
# firefox & # Launch firefox and move it to "browser" group
# echo <firefox_pid> > browser/tasks
# #Launch gmplayer (or your favourite movie player)
# echo <movie_player_pid> > multimedia/tasks
......@@ -214,6 +214,17 @@ config X86_ES7000
endchoice
config SCHED_NO_NO_OMIT_FRAME_POINTER
bool "Single-depth WCHAN output"
default y
help
Calculate simpler /proc/<PID>/wchan values. If this option
is disabled then wchan values will recurse back to the
caller function. This provides more accurate wchan values,
at the expense of slightly more scheduling overhead.
If in doubt, say "Y".
config PARAVIRT
bool "Paravirtualization support (EXPERIMENTAL)"
depends on EXPERIMENTAL
......
......@@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
static inline void kvm_guest_enter(void)
{
current->flags |= PF_VCPU;
}
static inline void kvm_guest_exit(void)
{
current->flags &= ~PF_VCPU;
}
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
......
......@@ -2046,6 +2046,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
vcpu->guest_mode = 1;
kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
......@@ -2053,6 +2054,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_x86_ops->run(vcpu, kvm_run);
kvm_guest_exit();
vcpu->guest_mode = 0;
local_irq_enable();
......
......@@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe)
* Pipes are system-local resources, so sleeping on them
* is considered a noninteractive wait:
*/
prepare_to_wait(&pipe->wait, &wait,
TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
if (pipe->inode)
mutex_unlock(&pipe->inode->i_mutex);
schedule();
......@@ -383,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
wake_up_interruptible(&pipe->wait);
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
......@@ -556,7 +555,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
out:
mutex_unlock(&inode->i_mutex);
if (do_wakeup) {
wake_up_interruptible(&pipe->wait);
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0)
......@@ -650,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw)
if (!pipe->readers && !pipe->writers) {
free_pipe_info(inode);
} else {
wake_up_interruptible(&pipe->wait);
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
......
......@@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p)
}
#endif
static cputime_t task_gtime(struct task_struct *p)
{
return p->gtime;
}
static int do_task_stat(struct task_struct *task, char *buffer, int whole)
{
unsigned long vsize, eip, esp, wchan = ~0UL;
......@@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
unsigned long cmin_flt = 0, cmaj_flt = 0;
unsigned long min_flt = 0, maj_flt = 0;
cputime_t cutime, cstime, utime, stime;
cputime_t cgtime, gtime;
unsigned long rsslim = 0;
char tcomm[sizeof(task->comm)];
unsigned long flags;
......@@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
sigemptyset(&sigign);
sigemptyset(&sigcatch);
cutime = cstime = utime = stime = cputime_zero;
cgtime = gtime = cputime_zero;
rcu_read_lock();
if (lock_task_sighand(task, &flags)) {
......@@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
cmaj_flt = sig->cmaj_flt;
cutime = sig->cutime;
cstime = sig->cstime;
cgtime = sig->cgtime;
rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
/* add up live thread stats at the group level */
......@@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt += t->maj_flt;
utime = cputime_add(utime, task_utime(t));
stime = cputime_add(stime, task_stime(t));
gtime = cputime_add(gtime, task_gtime(t));
t = next_thread(t);
} while (t != task);
......@@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt += sig->maj_flt;
utime = cputime_add(utime, sig->utime);
stime = cputime_add(stime, sig->stime);
gtime += cputime_add(gtime, sig->gtime);
}
sid = signal_session(sig);
......@@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt = task->maj_flt;
utime = task_utime(task);
stime = task_stime(task);
gtime = task_gtime(task);
}
/* scale priority and nice values from timeslices to -20..20 */
......@@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
task->pid,
tcomm,
state,
......@@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
task_cpu(task),
task->rt_priority,
task->policy,
(unsigned long long)delayacct_blkio_ticks(task));
(unsigned long long)delayacct_blkio_ticks(task),
cputime_to_clock_t(gtime),
cputime_to_clock_t(cgtime));
if (mm)
mmput(mm);
return res;
......
......@@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
return sprintf(buffer, "%llu %llu %lu\n",
task->sched_info.cpu_time,
task->sched_info.run_delay,
task->sched_info.pcnt);
task->sched_info.pcount);
}
#endif
......
......@@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v)
int i;
unsigned long jif;
cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
cputime64_t guest;
u64 sum = 0;
struct timespec boottime;
unsigned int *per_irq_sum;
......@@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v)
user = nice = system = idle = iowait =
irq = softirq = steal = cputime64_zero;
guest = cputime64_zero;
getboottime(&boottime);
jif = boottime.tv_sec;
......@@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v)
irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
for (j = 0; j < NR_IRQS; j++) {
unsigned int temp = kstat_cpu(i).irqs[j];
sum += temp;
......@@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v)
}
}
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n",
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
(unsigned long long)cputime64_to_clock_t(system),
......@@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
(unsigned long long)cputime64_to_clock_t(steal));
(unsigned long long)cputime64_to_clock_t(steal),
(unsigned long long)cputime64_to_clock_t(guest));
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
......@@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v)
irq = kstat_cpu(i).cpustat.irq;
softirq = kstat_cpu(i).cpustat.softirq;
steal = kstat_cpu(i).cpustat.steal;
seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n",
guest = kstat_cpu(i).cpustat.guest;
seq_printf(p,
"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
i,
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
......@@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
(unsigned long long)cputime64_to_clock_t(steal));
(unsigned long long)cputime64_to_clock_t(steal),
(unsigned long long)cputime64_to_clock_t(guest));
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
......
......@@ -23,6 +23,7 @@ struct cpu_usage_stat {
cputime64_t idle;
cputime64_t iowait;
cputime64_t steal;
cputime64_t guest;
};
struct kernel_stat {
......
......@@ -87,6 +87,7 @@ struct sched_param {
#include <linux/timer.h>
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
#include <linux/kobject.h>
#include <asm/processor.h>
......@@ -136,6 +137,7 @@ extern unsigned long weighted_cpuload(const int cpu);
struct seq_file;
struct cfs_rq;
struct task_group;
#ifdef CONFIG_SCHED_DEBUG
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p);
......@@ -174,8 +176,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#define EXIT_ZOMBIE 16
#define EXIT_DEAD 32
/* in tsk->state again */
#define TASK_NONINTERACTIVE 64
#define TASK_DEAD 128
#define TASK_DEAD 64
#define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0)
......@@ -516,6 +517,8 @@ struct signal_struct {
* in __exit_signal, except for the group leader.
*/
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
......@@ -596,8 +599,21 @@ struct user_struct {
/* Hash table maintenance information */
struct hlist_node uidhash_node;
uid_t uid;
#ifdef CONFIG_FAIR_USER_SCHED
struct task_group *tg;
struct kset kset;
struct subsys_attribute user_attr;
struct work_struct work;
#endif
};
#ifdef CONFIG_FAIR_USER_SCHED
extern int uids_kobject_init(void);
#else
static inline int uids_kobject_init(void) { return 0; }
#endif
extern struct user_struct *find_user(uid_t);
extern struct user_struct root_user;
......@@ -609,13 +625,17 @@ struct reclaim_state;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info {
/* cumulative counters */
unsigned long pcnt; /* # of times run on this cpu */
unsigned long pcount; /* # of times run on this cpu */
unsigned long long cpu_time, /* time spent on the cpu */
run_delay; /* time spent waiting on a runqueue */
/* timestamps */
unsigned long long last_arrival,/* when we last ran on a cpu */
last_queued; /* when we were last queued to run */
#ifdef CONFIG_SCHEDSTATS
/* BKL stats */
unsigned long bkl_count;
#endif
};
#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
......@@ -750,7 +770,7 @@ struct sched_domain {
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
unsigned long lb_count[CPU_MAX_IDLE_TYPES];
unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
......@@ -760,17 +780,17 @@ struct sched_domain {
unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
unsigned long alb_cnt;
unsigned long alb_count;
unsigned long alb_failed;
unsigned long alb_pushed;
/* SD_BALANCE_EXEC stats */
unsigned long sbe_cnt;
unsigned long sbe_count;
unsigned long sbe_balanced;
unsigned long sbe_pushed;
/* SD_BALANCE_FORK stats */
unsigned long sbf_cnt;
unsigned long sbf_count;
unsigned long sbf_balanced;
unsigned long sbf_pushed;
......@@ -854,11 +874,11 @@ struct rq;
struct sched_domain;
struct sched_class {
struct sched_class *next;
const struct sched_class *next;
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
void (*yield_task) (struct rq *rq, struct task_struct *p);
void (*yield_task) (struct rq *rq);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
......@@ -888,31 +908,22 @@ struct load_weight {
* 4 se->block_start
* 4 se->run_node
* 4 se->sleep_start
* 4 se->sleep_start_fair
* 6 se->load.weight
* 7 se->delta_fair
* 15 se->wait_runtime
*/
struct sched_entity {
long wait_runtime;
unsigned long delta_fair_run;
unsigned long delta_fair_sleep;
unsigned long delta_exec;
s64 fair_key;
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
unsigned int on_rq;
int peer_preempt;
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
u64 wait_start_fair;
u64 sleep_start_fair;
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
s64 sum_wait_runtime;
u64 sleep_start;
u64 sleep_max;
......@@ -921,9 +932,25 @@ struct sched_entity {
u64 block_start;
u64 block_max;
u64 exec_max;
unsigned long wait_runtime_overruns;
unsigned long wait_runtime_underruns;
u64 slice_max;
u64 nr_migrations;
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
u64 nr_forced2_migrations;
u64 nr_wakeups;
u64 nr_wakeups_sync;
u64 nr_wakeups_migrate;
u64 nr_wakeups_local;
u64 nr_wakeups_remote;
u64 nr_wakeups_affine;
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
......@@ -952,7 +979,7 @@ struct task_struct {
int prio, static_prio, normal_prio;
struct list_head run_list;
struct sched_class *sched_class;
const struct sched_class *sched_class;
struct sched_entity se;
#ifdef CONFIG_PREEMPT_NOTIFIERS
......@@ -1023,6 +1050,7 @@ struct task_struct {
unsigned int rt_priority;
cputime_t utime, stime;
cputime_t gtime;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
......@@ -1314,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t)
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
......@@ -1401,15 +1430,17 @@ static inline void idle_task_exit(void) {}
extern void sched_idle_next(void);
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_nr_latency;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_batch_wakeup_granularity;
extern unsigned int sysctl_sched_stat_granularity;
extern unsigned int sysctl_sched_runtime_limit;
extern unsigned int sysctl_sched_compat_yield;
extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
#endif
extern unsigned int sysctl_sched_compat_yield;
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
......@@ -1843,6 +1874,18 @@ extern int sched_mc_power_savings, sched_smt_power_savings;
extern void normalize_rt_tasks(void);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern struct task_group init_task_group;
extern struct task_group *sched_create_group(void);
extern void sched_destroy_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern unsigned long sched_group_shares(struct task_group *tg);
#endif
#ifdef CONFIG_TASK_XACCT
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
......
......@@ -159,15 +159,14 @@
.imbalance_pct = 125, \
.cache_nice_tries = 1, \
.busy_idx = 2, \
.idle_idx = 0, \
.newidle_idx = 0, \
.idle_idx = 1, \
.newidle_idx = 2, \
.wake_idx = 1, \
.forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE \
| BALANCE_FOR_PKG_POWER,\
.last_balance = jiffies, \
.balance_interval = 1, \
......
......@@ -281,6 +281,27 @@ config CPUSETS
Say N if unsure.
config FAIR_GROUP_SCHED
bool "Fair group CPU scheduler"
default y
depends on EXPERIMENTAL
help
This feature lets CPU scheduler recognize task groups and control CPU
bandwidth allocation to such task groups.
choice
depends on FAIR_GROUP_SCHED
prompt "Basis for grouping tasks"
default FAIR_USER_SCHED
config FAIR_USER_SCHED
bool "user id"
help
This option will choose userid as the basis for grouping
tasks, thus providing equal CPU bandwidth to each user.
endchoice
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
......
......@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
* No locking available for sched_info (and too expensive to add one)
* Mitigate by taking snapshot of values
*/
t1 = tsk->sched_info.pcnt;
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->sched_info.cpu_time;
......
......@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk)
*/
sig->utime = cputime_add(sig->utime, tsk->utime);
sig->stime = cputime_add(sig->stime, tsk->stime);
sig->gtime = cputime_add(sig->gtime, tsk->gtime);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
......@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
cputime_add(p->stime,
cputime_add(sig->stime,
sig->cstime)));
psig->cgtime =
cputime_add(psig->cgtime,
cputime_add(p->gtime,
cputime_add(sig->gtime,
sig->cgtime)));
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
......
......@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
sig->tty_old_pgrp = NULL;
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
sig->cgtime = cputime_zero;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
......@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = cputime_zero;
p->stime = cputime_zero;
p->gtime = cputime_zero;
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
......
......@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
......@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
&notes_attr);
}
/*
* Create "/sys/kernel/uids" directory and corresponding root user's
* directory under it.
*/
if (!error)
error = uids_kobject_init();
return error;
}
......
......@@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
/*
* Some helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
#define NICE_0_LOAD SCHED_LOAD_SCALE
......@@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
/*
* These are the 'tuning knobs' of the scheduler:
*
* Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
* default timeslice is 100 msecs, maximum timeslice is 800 msecs.
* default timeslice is 100 msecs (used only for SCHED_RR tasks).
* Timeslices get refilled after they expire.
*/
#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE (100 * HZ / 1000)
#ifdef CONFIG_SMP
......@@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
}
#endif
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
/*
* static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
*/
static unsigned int static_prio_timeslice(int static_prio)
{
if (static_prio == NICE_TO_PRIO(19))
return 1;
if (static_prio < NICE_TO_PRIO(0))
return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
else
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
......@@ -171,31 +151,91 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};
struct load_stat {
struct load_weight load;
u64 load_update_start, load_update_last;
unsigned long delta_fair, delta_exec, delta_stat;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq;
/* task group related information */
struct task_group {
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
/* spinlock to serialize modification to shares */
spinlock_t lock;
};
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
static struct sched_entity *init_sched_entity_p[NR_CPUS];
static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
/* Default task group.
* Every task in system belong to this group at bootup.
*/
struct task_group init_task_group = {
.se = init_sched_entity_p,
.cfs_rq = init_cfs_rq_p,
};
#ifdef CONFIG_FAIR_USER_SCHED
# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD
#else
# define INIT_TASK_GRP_LOAD NICE_0_LOAD
#endif
static int init_task_group_load = INIT_TASK_GRP_LOAD;
/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
struct task_group *tg;
#ifdef CONFIG_FAIR_USER_SCHED
tg = p->user->tg;
#else
tg = &init_task_group;
#endif
return tg;
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_cfs_rq(struct task_struct *p)
{
p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
p->se.parent = task_group(p)->se[task_cpu(p)];
}
#else
static inline void set_task_cfs_rq(struct task_struct *p) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
s64 fair_clock;
u64 exec_clock;
s64 wait_runtime;
u64 sleeper_bonus;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
u64 min_vruntime;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct rb_node *rb_load_balance_curr;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
unsigned long nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
......@@ -206,6 +246,8 @@ struct cfs_rq {
* list is used during load balance.
*/
struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
struct task_group *tg; /* group that "owns" this runqueue */
struct rcu_head rcu;
#endif
};
......@@ -237,7 +279,7 @@ struct rq {
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
struct load_stat ls; /* capture load from *all* tasks on this cpu */
struct load_weight load; /* capture load from *all* tasks on this cpu */
unsigned long nr_load_updates;
u64 nr_switches;
......@@ -289,16 +331,19 @@ struct rq {
unsigned long yld_exp_empty;
unsigned long yld_act_empty;
unsigned long yld_both_empty;
unsigned long yld_cnt;
unsigned long yld_count;
/* schedule() stats */
unsigned long sched_switch;
unsigned long sched_cnt;
unsigned long sched_count;
unsigned long sched_goidle;
/* try_to_wake_up() stats */
unsigned long ttwu_cnt;
unsigned long ttwu_count;
unsigned long ttwu_local;
/* BKL stats */
unsigned long bkl_count;
#endif
struct lock_class_key rq_lock_key;
};
......@@ -382,6 +427,37 @@ static void update_rq_clock(struct rq *rq)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif
/*
* Debugging: various feature bits
*/
enum {
SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
SCHED_FEAT_START_DEBIT = 2,
SCHED_FEAT_TREE_AVG = 4,
SCHED_FEAT_APPROX_AVG = 8,
SCHED_FEAT_WAKEUP_PREEMPT = 16,
SCHED_FEAT_PREEMPT_RESTRICT = 32,
};
const_debug unsigned int sysctl_sched_features =
SCHED_FEAT_NEW_FAIR_SLEEPERS *1 |
SCHED_FEAT_START_DEBIT *1 |
SCHED_FEAT_TREE_AVG *0 |
SCHED_FEAT_APPROX_AVG *0 |
SCHED_FEAT_WAKEUP_PREEMPT *1 |
SCHED_FEAT_PREEMPT_RESTRICT *1;
#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
* clock constructed from sched_clock():
......@@ -400,18 +476,7 @@ unsigned long long cpu_clock(int cpu)
return now;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
{
p->se.cfs_rq = &task_rq(p)->cfs;
}
#else
static inline void set_task_cfs_rq(struct task_struct *p)
{
}
#endif
EXPORT_SYMBOL_GPL(cpu_clock);
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
......@@ -497,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;
repeat_lock_task:
rq = task_rq(p);
spin_lock(&rq->lock);
if (unlikely(rq != task_rq(p))) {
for (;;) {
struct rq *rq = task_rq(p);
spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
spin_unlock(&rq->lock);
goto repeat_lock_task;
}
return rq;
}
/*
......@@ -519,18 +581,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
{
struct rq *rq;
repeat_lock_task:
local_irq_save(*flags);
rq = task_rq(p);
spin_lock(&rq->lock);
if (unlikely(rq != task_rq(p))) {
for (;;) {
local_irq_save(*flags);
rq = task_rq(p);
spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
spin_unlock_irqrestore(&rq->lock, *flags);
goto repeat_lock_task;
}
return rq;
}
static inline void __task_rq_unlock(struct rq *rq)
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
spin_unlock(&rq->lock);
......@@ -545,7 +606,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
static inline struct rq *this_rq_lock(void)
static struct rq *this_rq_lock(void)
__acquires(rq->lock)
{
struct rq *rq;
......@@ -645,19 +706,6 @@ static inline void resched_task(struct task_struct *p)
}
#endif
static u64 div64_likely32(u64 divident, unsigned long divisor)
{
#if BITS_PER_LONG == 32
if (likely(divident <= 0xffffffffULL))
return (u32)divident / divisor;
do_div(divident, divisor);
return divident;
#else
return divident / divisor;
#endif
}
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
......@@ -699,16 +747,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}
static void update_load_add(struct load_weight *lw, unsigned long inc)
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
lw->inv_weight = 0;
}
static void update_load_sub(struct load_weight *lw, unsigned long dec)
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
lw->inv_weight = 0;
}
/*
......@@ -784,29 +830,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
int *this_best_prio, struct rq_iterator *iterator);
#include "sched_stats.h"
#include "sched_rt.c"
#include "sched_fair.c"
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
#define sched_class_highest (&rt_sched_class)
static void __update_curr_load(struct rq *rq, struct load_stat *ls)
{
if (rq->curr != rq->idle && ls->load.weight) {
ls->delta_exec += ls->delta_stat;
ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
ls->delta_stat = 0;
}
}
/*
* Update delta_exec, delta_fair fields for rq.
*
* delta_fair clock advances at a rate inversely proportional to
* total load (rq->ls.load.weight) on the runqueue, while
* total load (rq->load.weight) on the runqueue, while
* delta_exec advances at the same rate as wall-clock (provided
* cpu is not idle).
*
......@@ -814,35 +851,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls)
* runqueue over any given interval. This (smoothened) load is used
* during load balance.
*
* This function is called /before/ updating rq->ls.load
* This function is called /before/ updating rq->load
* and when switching tasks.
*/
static void update_curr_load(struct rq *rq)
{
struct load_stat *ls = &rq->ls;
u64 start;
start = ls->load_update_start;
ls->load_update_start = rq->clock;
ls->delta_stat += rq->clock - start;
/*
* Stagger updates to ls->delta_fair. Very frequent updates
* can be expensive.
*/
if (ls->delta_stat >= sysctl_sched_stat_granularity)
__update_curr_load(rq, ls);
}
static inline void inc_load(struct rq *rq, const struct task_struct *p)
{
update_curr_load(rq);
update_load_add(&rq->ls.load, p->se.load.weight);
update_load_add(&rq->load, p->se.load.weight);
}
static inline void dec_load(struct rq *rq, const struct task_struct *p)
{
update_curr_load(rq);
update_load_sub(&rq->ls.load, p->se.load.weight);
update_load_sub(&rq->load, p->se.load.weight);
}
static void inc_nr_running(struct task_struct *p, struct rq *rq)
......@@ -859,8 +878,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
p->se.wait_runtime = 0;
if (task_has_rt_policy(p)) {
p->se.load.weight = prio_to_weight[0] * 2;
p->se.load.inv_weight = prio_to_wmult[0] >> 1;
......@@ -951,20 +968,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
inc_nr_running(p, rq);
}
/*
* activate_idle_task - move idle task to the _front_ of runqueue.
*/
static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
{
update_rq_clock(rq);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
enqueue_task(rq, p, 0);
inc_nr_running(p, rq);
}
/*
* deactivate_task - remove a task from the runqueue.
*/
......@@ -989,32 +992,50 @@ inline int task_curr(const struct task_struct *p)
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
return cpu_rq(cpu)->ls.load.weight;
return cpu_rq(cpu)->load.weight;
}
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
task_thread_info(p)->cpu = cpu;
set_task_cfs_rq(p);
#endif
set_task_cfs_rq(p);
}
#ifdef CONFIG_SMP
/*
* Is this task likely cache-hot:
*/
static inline int
task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
s64 delta;
if (p->sched_class != &fair_sched_class)
return 0;
if (sysctl_sched_migration_cost == -1)
return 1;
if (sysctl_sched_migration_cost == 0)
return 0;
delta = now - p->se.exec_start;
return delta < (s64)sysctl_sched_migration_cost;
}
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
int old_cpu = task_cpu(p);
struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
u64 clock_offset, fair_clock_offset;
struct cfs_rq *old_cfsrq = task_cfs_rq(p),
*new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
u64 clock_offset;
clock_offset = old_rq->clock - new_rq->clock;
fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
if (p->se.wait_start_fair)
p->se.wait_start_fair -= fair_clock_offset;
if (p->se.sleep_start_fair)
p->se.sleep_start_fair -= fair_clock_offset;
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
......@@ -1023,7 +1044,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
if (old_cpu != new_cpu) {
schedstat_inc(p, se.nr_migrations);
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
}
#endif
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;
__set_task_cpu(p, new_cpu);
}
......@@ -1078,69 +1106,71 @@ void wait_task_inactive(struct task_struct *p)
int running, on_rq;
struct rq *rq;
repeat:
/*
* We do the initial early heuristics without holding
* any task-queue locks at all. We'll only try to get
* the runqueue lock when things look like they will
* work out!
*/
rq = task_rq(p);
for (;;) {
/*
* We do the initial early heuristics without holding
* any task-queue locks at all. We'll only try to get
* the runqueue lock when things look like they will
* work out!
*/
rq = task_rq(p);
/*
* If the task is actively running on another CPU
* still, just relax and busy-wait without holding
* any locks.
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
* But we don't care, since "task_running()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
while (task_running(rq, p))
cpu_relax();
/*
* If the task is actively running on another CPU
* still, just relax and busy-wait without holding
* any locks.
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
* But we don't care, since "task_running()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
while (task_running(rq, p))
cpu_relax();
/*
* Ok, time to look more closely! We need the rq
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
on_rq = p->se.on_rq;
task_rq_unlock(rq, &flags);
/*
* Ok, time to look more closely! We need the rq
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
on_rq = p->se.on_rq;
task_rq_unlock(rq, &flags);
/*
* Was it really running after all now that we
* checked with the proper locks actually held?
*
* Oops. Go back and try again..
*/
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/*
* Was it really running after all now that we
* checked with the proper locks actually held?
*
* Oops. Go back and try again..
*/
if (unlikely(running)) {
cpu_relax();
continue;
}
/*
* It's not enough that it's not actively running,
* it must be off the runqueue _entirely_, and not
* preempted!
*
* So if it wa still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
if (unlikely(on_rq)) {
yield();
goto repeat;
}
/*
* It's not enough that it's not actively running,
* it must be off the runqueue _entirely_, and not
* preempted!
*
* So if it wa still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
if (unlikely(on_rq)) {
schedule_timeout_uninterruptible(1);
continue;
}
/*
* Ahh, all good. It wasn't running, and it wasn't
* runnable, which means that it will never become
* running in the future either. We're all done!
*/
/*
* Ahh, all good. It wasn't running, and it wasn't
* runnable, which means that it will never become
* running in the future either. We're all done!
*/
break;
}
}
/***
......@@ -1174,7 +1204,7 @@ void kick_process(struct task_struct *p)
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
static inline unsigned long source_load(int cpu, int type)
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
......@@ -1189,7 +1219,7 @@ static inline unsigned long source_load(int cpu, int type)
* Return a high guess at the load of a migration-target cpu weighted
* according to the scheduling class and "nice" value.
*/
static inline unsigned long target_load(int cpu, int type)
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
......@@ -1231,7 +1261,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
/* Skip over this group if it has no CPUs allowed */
if (!cpus_intersects(group->cpumask, p->cpus_allowed))
goto nextgroup;
continue;
local_group = cpu_isset(this_cpu, group->cpumask);
......@@ -1259,9 +1289,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
min_load = avg_load;
idlest = group;
}
nextgroup:
group = group->next;
} while (group != sd->groups);
} while (group = group->next, group != sd->groups);
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
......@@ -1393,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p)
if (sd->flags & SD_WAKE_IDLE) {
cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
if (idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
}
return i;
}
}
} else {
break;
......@@ -1425,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
struct rq *rq;
......@@ -1444,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
goto out_running;
cpu = task_cpu(p);
orig_cpu = cpu;
this_cpu = smp_processor_id();
#ifdef CONFIG_SMP
......@@ -1452,7 +1486,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
new_cpu = cpu;
schedstat_inc(rq, ttwu_cnt);
schedstat_inc(rq, ttwu_count);
if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local);
goto out_set_cpu;
......@@ -1487,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
unsigned long tl = this_load;
unsigned long tl_per_task;
/*
* Attract cache-cold tasks on sync wakeups:
*/
if (sync && !task_hot(p, rq->clock, this_sd))
goto out_set_cpu;
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
/*
......@@ -1506,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
* there is no bad imbalance.
*/
schedstat_inc(this_sd, ttwu_move_affine);
schedstat_inc(p, se.nr_wakeups_affine);
goto out_set_cpu;
}
}
......@@ -1517,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
if (this_sd->flags & SD_WAKE_BALANCE) {
if (imbalance*this_load <= 100*load) {
schedstat_inc(this_sd, ttwu_move_balance);
schedstat_inc(p, se.nr_wakeups_passive);
goto out_set_cpu;
}
}
......@@ -1542,18 +1585,18 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
out_activate:
#endif /* CONFIG_SMP */
schedstat_inc(p, se.nr_wakeups);
if (sync)
schedstat_inc(p, se.nr_wakeups_sync);
if (orig_cpu != cpu)
schedstat_inc(p, se.nr_wakeups_migrate);
if (cpu == this_cpu)
schedstat_inc(p, se.nr_wakeups_local);
else
schedstat_inc(p, se.nr_wakeups_remote);
update_rq_clock(rq);
activate_task(rq, p, 1);
/*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
* don't trigger a preemption, if the woken up task will run on
* this cpu. (in this case the 'I will reschedule' promise of
* the waker guarantees that the freshly woken up task is going
* to be considered on this CPU.)
*/
if (!sync || cpu != this_cpu)
check_preempt_curr(rq, p);
check_preempt_curr(rq, p);
success = 1;
out_running:
......@@ -1584,28 +1627,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
*/
static void __sched_fork(struct task_struct *p)
{
p->se.wait_start_fair = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.delta_exec = 0;
p->se.delta_fair_run = 0;
p->se.delta_fair_sleep = 0;
p->se.wait_runtime = 0;
p->se.sleep_start_fair = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
p->se.sum_wait_runtime = 0;
p->se.sum_sleep_runtime = 0;
p->se.sleep_start = 0;
p->se.block_start = 0;
p->se.sleep_max = 0;
p->se.block_max = 0;
p->se.exec_max = 0;
p->se.slice_max = 0;
p->se.wait_max = 0;
p->se.wait_runtime_overruns = 0;
p->se.wait_runtime_underruns = 0;
#endif
INIT_LIST_HEAD(&p->run_list);
......@@ -1636,12 +1671,14 @@ void sched_fork(struct task_struct *p, int clone_flags)
#ifdef CONFIG_SMP
cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
__set_task_cpu(p, cpu);
set_task_cpu(p, cpu);
/*
* Make sure we do not leak PI boosting priority to the child:
*/
p->prio = current->normal_prio;
if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
......@@ -1657,12 +1694,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
put_cpu();
}
/*
* After fork, child runs first. (default) If set to 0 then
* parent will (try to) run first.
*/
unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
......@@ -1674,24 +1705,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
unsigned long flags;
struct rq *rq;
int this_cpu;
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
this_cpu = smp_processor_id(); /* parent's CPU */
update_rq_clock(rq);
p->prio = effective_prio(p);
if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
(clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
!current->se.on_rq) {
if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) {
activate_task(rq, p, 0);
} else {
/*
......@@ -1800,7 +1821,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* with the lock held can cause deadlocks; see schedule() for
* details.)
*/
static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
__releases(rq->lock)
{
struct mm_struct *mm = rq->prev_mm;
......@@ -1982,42 +2003,10 @@ unsigned long nr_active(void)
*/
static void update_cpu_load(struct rq *this_rq)
{
u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
unsigned long total_load = this_rq->ls.load.weight;
unsigned long this_load = total_load;
struct load_stat *ls = &this_rq->ls;
unsigned long this_load = this_rq->load.weight;
int i, scale;
this_rq->nr_load_updates++;
if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
goto do_avg;
/* Update delta_fair/delta_exec fields first */
update_curr_load(this_rq);
fair_delta64 = ls->delta_fair + 1;
ls->delta_fair = 0;
exec_delta64 = ls->delta_exec + 1;
ls->delta_exec = 0;
sample_interval64 = this_rq->clock - ls->load_update_last;
ls->load_update_last = this_rq->clock;
if ((s64)sample_interval64 < (s64)TICK_NSEC)
sample_interval64 = TICK_NSEC;
if (exec_delta64 > sample_interval64)
exec_delta64 = sample_interval64;
idle_delta64 = sample_interval64 - exec_delta64;
tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
this_load = (unsigned long)tmp64;
do_avg:
/* Update our load: */
for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
......@@ -2027,7 +2016,13 @@ static void update_cpu_load(struct rq *this_rq)
old_load = this_rq->cpu_load[i];
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
* prevents us from getting stuck on 9 if the load is 10, for
* example.
*/
if (new_load > old_load)
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
}
......@@ -2179,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
if (!cpu_isset(this_cpu, p->cpus_allowed))
if (!cpu_isset(this_cpu, p->cpus_allowed)) {
schedstat_inc(p, se.nr_failed_migrations_affine);
return 0;
}
*all_pinned = 0;
if (task_running(rq, p))
if (task_running(rq, p)) {
schedstat_inc(p, se.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
* 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
if (!task_hot(p, rq->clock, sd) ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
if (task_hot(p, rq->clock, sd)) {
schedstat_inc(sd, lb_hot_gained[idle]);
schedstat_inc(p, se.nr_forced_migrations);
}
#endif
return 1;
}
if (task_hot(p, rq->clock, sd)) {
schedstat_inc(p, se.nr_failed_migrations_hot);
return 0;
}
return 1;
}
......@@ -2264,7 +2284,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
struct sched_class *class = sched_class_highest;
const struct sched_class *class = sched_class_highest;
unsigned long total_load_moved = 0;
int this_best_prio = this_rq->curr->prio;
......@@ -2289,7 +2309,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct sched_domain *sd, enum cpu_idle_type idle)
{
struct sched_class *class;
const struct sched_class *class;
int this_best_prio = MAX_PRIO;
for (class = sched_class_highest; class; class = class->next)
......@@ -2653,7 +2673,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
schedstat_inc(sd, lb_cnt[idle]);
schedstat_inc(sd, lb_count[idle]);
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
......@@ -2806,7 +2826,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
&sd_idle, &cpus, NULL);
......@@ -2940,7 +2960,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
}
if (likely(sd)) {
schedstat_inc(sd, alb_cnt);
schedstat_inc(sd, alb_count);
if (move_one_task(target_rq, target_cpu, busiest_rq,
sd, CPU_IDLE))
......@@ -3033,7 +3053,7 @@ static DEFINE_SPINLOCK(balancing);
*
* Balancing parameters are set up in arch_init_sched_domains.
*/
static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
int balance = 1;
struct rq *rq = cpu_rq(cpu);
......@@ -3279,6 +3299,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
cpustat->user = cputime64_add(cpustat->user, tmp);
}
/*
* Account guest cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in virtual machine since the last update
*/
void account_guest_time(struct task_struct *p, cputime_t cputime)
{
cputime64_t tmp;
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
tmp = cputime_to_cputime64(cputime);
p->utime = cputime_add(p->utime, cputime);
p->gtime = cputime_add(p->gtime, cputime);
cpustat->user = cputime64_add(cpustat->user, tmp);
cpustat->guest = cputime64_add(cpustat->guest, tmp);
}
/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
......@@ -3292,6 +3331,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
struct rq *rq = this_rq();
cputime64_t tmp;
if (p->flags & PF_VCPU) {
account_guest_time(p, cputime);
p->flags &= ~PF_VCPU;
return;
}
p->stime = cputime_add(p->stime, cputime);
/* Add system time to cpustat. */
......@@ -3430,7 +3475,13 @@ static inline void schedule_debug(struct task_struct *prev)
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq(), sched_cnt);
schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
if (unlikely(prev->lock_depth >= 0)) {
schedstat_inc(this_rq(), bkl_count);
schedstat_inc(prev, sched_info.bkl_count);
}
#endif
}
/*
......@@ -3439,7 +3490,7 @@ static inline void schedule_debug(struct task_struct *prev)
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
struct sched_class *class;
const struct sched_class *class;
struct task_struct *p;
/*
......@@ -3488,9 +3539,13 @@ asmlinkage void __sched schedule(void)
schedule_debug(prev);
spin_lock_irq(&rq->lock);
clear_tsk_need_resched(prev);
/*
* Do the rq-clock update outside the rq lock:
*/
local_irq_disable();
__update_rq_clock(rq);
spin_lock(&rq->lock);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
......@@ -3550,27 +3605,30 @@ asmlinkage void __sched preempt_schedule(void)
if (likely(ti->preempt_count || irqs_disabled()))
return;
need_resched:
add_preempt_count(PREEMPT_ACTIVE);
/*
* We keep the big kernel semaphore locked, but we
* clear ->lock_depth so that schedule() doesnt
* auto-release the semaphore:
*/
do {
add_preempt_count(PREEMPT_ACTIVE);
/*
* We keep the big kernel semaphore locked, but we
* clear ->lock_depth so that schedule() doesnt
* auto-release the semaphore:
*/
#ifdef CONFIG_PREEMPT_BKL
saved_lock_depth = task->lock_depth;
task->lock_depth = -1;
saved_lock_depth = task->lock_depth;
task->lock_depth = -1;
#endif
schedule();
schedule();
#ifdef CONFIG_PREEMPT_BKL
task->lock_depth = saved_lock_depth;
task->lock_depth = saved_lock_depth;
#endif
sub_preempt_count(PREEMPT_ACTIVE);
sub_preempt_count(PREEMPT_ACTIVE);
/* we could miss a preemption opportunity between schedule and now */
barrier();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
barrier();
} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
}
EXPORT_SYMBOL(preempt_schedule);
......@@ -3590,29 +3648,32 @@ asmlinkage void __sched preempt_schedule_irq(void)
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
need_resched:
add_preempt_count(PREEMPT_ACTIVE);
/*
* We keep the big kernel semaphore locked, but we
* clear ->lock_depth so that schedule() doesnt
* auto-release the semaphore:
*/
do {
add_preempt_count(PREEMPT_ACTIVE);
/*
* We keep the big kernel semaphore locked, but we
* clear ->lock_depth so that schedule() doesnt
* auto-release the semaphore:
*/
#ifdef CONFIG_PREEMPT_BKL
saved_lock_depth = task->lock_depth;
task->lock_depth = -1;
saved_lock_depth = task->lock_depth;
task->lock_depth = -1;
#endif
local_irq_enable();
schedule();
local_irq_disable();
local_irq_enable();
schedule();
local_irq_disable();
#ifdef CONFIG_PREEMPT_BKL
task->lock_depth = saved_lock_depth;
task->lock_depth = saved_lock_depth;
#endif
sub_preempt_count(PREEMPT_ACTIVE);
sub_preempt_count(PREEMPT_ACTIVE);
/* we could miss a preemption opportunity between schedule and now */
barrier();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
barrier();
} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
}
#endif /* CONFIG_PREEMPT */
......@@ -3636,10 +3697,9 @@ EXPORT_SYMBOL(default_wake_function);
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int sync, void *key)
{
struct list_head *tmp, *next;
wait_queue_t *curr, *next;
list_for_each_safe(tmp, next, &q->task_list) {
wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, sync, key) &&
......@@ -3729,206 +3789,116 @@ void fastcall complete_all(struct completion *x)
}
EXPORT_SYMBOL(complete_all);
void fastcall __sched wait_for_completion(struct completion *x)
static inline long __sched
do_wait_for_common(struct completion *x, long timeout, int state)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&x->wait.lock);
schedule();
spin_lock_irq(&x->wait.lock);
} while (!x->done);
__remove_wait_queue(&x->wait, &wait);
}
x->done--;
spin_unlock_irq(&x->wait.lock);
}
EXPORT_SYMBOL(wait_for_completion);
unsigned long fastcall __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
__set_current_state(TASK_UNINTERRUPTIBLE);
if (state == TASK_INTERRUPTIBLE &&
signal_pending(current)) {
__remove_wait_queue(&x->wait, &wait);
return -ERESTARTSYS;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&x->wait.lock);
if (!timeout) {
__remove_wait_queue(&x->wait, &wait);
goto out;
return timeout;
}
} while (!x->done);
__remove_wait_queue(&x->wait, &wait);
}
x->done--;
out:
spin_unlock_irq(&x->wait.lock);
return timeout;
}
EXPORT_SYMBOL(wait_for_completion_timeout);
int fastcall __sched wait_for_completion_interruptible(struct completion *x)
static long __sched
wait_for_common(struct completion *x, long timeout, int state)
{
int ret = 0;
might_sleep();
spin_lock_irq(&x->wait.lock);
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
if (signal_pending(current)) {
ret = -ERESTARTSYS;
__remove_wait_queue(&x->wait, &wait);
goto out;
}
__set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irq(&x->wait.lock);
schedule();
spin_lock_irq(&x->wait.lock);
} while (!x->done);
__remove_wait_queue(&x->wait, &wait);
}
x->done--;
out:
timeout = do_wait_for_common(x, timeout, state);
spin_unlock_irq(&x->wait.lock);
return timeout;
}
return ret;
void fastcall __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
EXPORT_SYMBOL(wait_for_completion);
unsigned long fastcall __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
if (signal_pending(current)) {
timeout = -ERESTARTSYS;
__remove_wait_queue(&x->wait, &wait);
goto out;
}
__set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irq(&x->wait.lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&x->wait.lock);
if (!timeout) {
__remove_wait_queue(&x->wait, &wait);
goto out;
}
} while (!x->done);
__remove_wait_queue(&x->wait, &wait);
}
x->done--;
out:
spin_unlock_irq(&x->wait.lock);
return timeout;
return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
EXPORT_SYMBOL(wait_for_completion_timeout);
static inline void
sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
int __sched wait_for_completion_interruptible(struct completion *x)
{
spin_lock_irqsave(&q->lock, *flags);
__add_wait_queue(q, wait);
spin_unlock(&q->lock);
return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
static inline void
sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
unsigned long fastcall __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
{
spin_lock_irq(&q->lock);
__remove_wait_queue(q, wait);
spin_unlock_irqrestore(&q->lock, *flags);
return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
void __sched interruptible_sleep_on(wait_queue_head_t *q)
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
unsigned long flags;
wait_queue_t wait;
init_waitqueue_entry(&wait, current);
current->state = TASK_INTERRUPTIBLE;
__set_current_state(state);
sleep_on_head(q, &wait, &flags);
schedule();
sleep_on_tail(q, &wait, &flags);
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, &wait);
spin_unlock(&q->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&q->lock);
__remove_wait_queue(q, &wait);
spin_unlock_irqrestore(&q->lock, flags);
return timeout;
}
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);
long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
unsigned long flags;
wait_queue_t wait;
init_waitqueue_entry(&wait, current);
current->state = TASK_INTERRUPTIBLE;
sleep_on_head(q, &wait, &flags);
timeout = schedule_timeout(timeout);
sleep_on_tail(q, &wait, &flags);
return timeout;
return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
void __sched sleep_on(wait_queue_head_t *q)
{
unsigned long flags;
wait_queue_t wait;
init_waitqueue_entry(&wait, current);
current->state = TASK_UNINTERRUPTIBLE;
sleep_on_head(q, &wait, &flags);
schedule();
sleep_on_tail(q, &wait, &flags);
sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
unsigned long flags;
wait_queue_t wait;
init_waitqueue_entry(&wait, current);
current->state = TASK_UNINTERRUPTIBLE;
sleep_on_head(q, &wait, &flags);
timeout = schedule_timeout(timeout);
sleep_on_tail(q, &wait, &flags);
return timeout;
return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(sleep_on_timeout);
......@@ -3947,7 +3917,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
void rt_mutex_setprio(struct task_struct *p, int prio)
{
unsigned long flags;
int oldprio, on_rq;
int oldprio, on_rq, running;
struct rq *rq;
BUG_ON(prio < 0 || prio > MAX_PRIO);
......@@ -3957,8 +3927,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
oldprio = p->prio;
on_rq = p->se.on_rq;
if (on_rq)
running = task_running(rq, p);
if (on_rq) {
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
}
if (rt_prio(prio))
p->sched_class = &rt_sched_class;
......@@ -3968,13 +3942,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio;
if (on_rq) {
if (running)
p->sched_class->set_curr_task(rq);
enqueue_task(rq, p, 0);
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
if (task_running(rq, p)) {
if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else {
......@@ -4138,7 +4114,7 @@ struct task_struct *idle_task(int cpu)
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
*/
static inline struct task_struct *find_process_by_pid(pid_t pid)
static struct task_struct *find_process_by_pid(pid_t pid)
{
return pid ? find_task_by_pid(pid) : current;
}
......@@ -4180,7 +4156,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
{
int retval, oldprio, oldpolicy = -1, on_rq;
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
struct rq *rq;
......@@ -4262,18 +4238,26 @@ int sched_setscheduler(struct task_struct *p, int policy,
}
update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
running = task_running(rq, p);
if (on_rq) {
deactivate_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
}
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
if (on_rq) {
if (running)
p->sched_class->set_curr_task(rq);
activate_task(rq, p, 0);
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
if (task_running(rq, p)) {
if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else {
......@@ -4344,10 +4328,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
asmlinkage long sys_sched_getscheduler(pid_t pid)
{
struct task_struct *p;
int retval = -EINVAL;
int retval;
if (pid < 0)
goto out_nounlock;
return -EINVAL;
retval = -ESRCH;
read_lock(&tasklist_lock);
......@@ -4358,8 +4342,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
retval = p->policy;
}
read_unlock(&tasklist_lock);
out_nounlock:
return retval;
}
......@@ -4372,10 +4354,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
{
struct sched_param lp;
struct task_struct *p;
int retval = -EINVAL;
int retval;
if (!param || pid < 0)
goto out_nounlock;
return -EINVAL;
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
......@@ -4395,7 +4377,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
*/
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
out_nounlock:
return retval;
out_unlock:
......@@ -4555,8 +4536,8 @@ asmlinkage long sys_sched_yield(void)
{
struct rq *rq = this_rq_lock();
schedstat_inc(rq, yld_cnt);
current->sched_class->yield_task(rq, current);
schedstat_inc(rq, yld_count);
current->sched_class->yield_task(rq);
/*
* Since we are going to call schedule() anyway, there's
......@@ -4750,11 +4731,12 @@ asmlinkage
long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
{
struct task_struct *p;
int retval = -EINVAL;
unsigned int time_slice;
int retval;
struct timespec t;
if (pid < 0)
goto out_nounlock;
return -EINVAL;
retval = -ESRCH;
read_lock(&tasklist_lock);
......@@ -4766,12 +4748,24 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
if (retval)
goto out_unlock;
jiffies_to_timespec(p->policy == SCHED_FIFO ?
0 : static_prio_timeslice(p->static_prio), &t);
if (p->policy == SCHED_FIFO)
time_slice = 0;
else if (p->policy == SCHED_RR)
time_slice = DEF_TIMESLICE;
else {
struct sched_entity *se = &p->se;
unsigned long flags;
struct rq *rq;
rq = task_rq_lock(p, &flags);
time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
task_rq_unlock(rq, &flags);
}
read_unlock(&tasklist_lock);
jiffies_to_timespec(time_slice, &t);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:
return retval;
out_unlock:
read_unlock(&tasklist_lock);
return retval;
......@@ -4900,32 +4894,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
* to users decreases. But the relationship is not linear,
* so pick a second-best guess by going with the log2 of the
* number of CPUs.
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
static inline void sched_init_granularity(void)
{
unsigned int factor = 1 + ilog2(num_online_cpus());
const unsigned long limit = 100000000;
sysctl_sched_min_granularity *= factor;
if (sysctl_sched_min_granularity > limit)
sysctl_sched_min_granularity = limit;
sysctl_sched_latency *= factor;
if (sysctl_sched_latency > limit)
sysctl_sched_latency = limit;
sysctl_sched_runtime_limit = sysctl_sched_latency;
sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
}
#ifdef CONFIG_SMP
/*
* This is how migration works:
......@@ -5103,35 +5071,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
struct rq *rq;
int dest_cpu;
restart:
/* On same node? */
mask = node_to_cpumask(cpu_to_node(dead_cpu));
cpus_and(mask, mask, p->cpus_allowed);
dest_cpu = any_online_cpu(mask);
/* On any allowed CPU? */
if (dest_cpu == NR_CPUS)
dest_cpu = any_online_cpu(p->cpus_allowed);
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
rq = task_rq_lock(p, &flags);
cpus_setall(p->cpus_allowed);
dest_cpu = any_online_cpu(p->cpus_allowed);
task_rq_unlock(rq, &flags);
do {
/* On same node? */
mask = node_to_cpumask(cpu_to_node(dead_cpu));
cpus_and(mask, mask, p->cpus_allowed);
dest_cpu = any_online_cpu(mask);
/* On any allowed CPU? */
if (dest_cpu == NR_CPUS)
dest_cpu = any_online_cpu(p->cpus_allowed);
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
rq = task_rq_lock(p, &flags);
cpus_setall(p->cpus_allowed);
dest_cpu = any_online_cpu(p->cpus_allowed);
task_rq_unlock(rq, &flags);
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
* leave kernel.
*/
if (p->mm && printk_ratelimit())
printk(KERN_INFO "process %d (%s) no "
"longer affine to cpu%d\n",
p->pid, p->comm, dead_cpu);
}
if (!__migrate_task(p, dead_cpu, dest_cpu))
goto restart;
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
* leave kernel.
*/
if (p->mm && printk_ratelimit())
printk(KERN_INFO "process %d (%s) no "
"longer affine to cpu%d\n",
p->pid, p->comm, dead_cpu);
}
} while (!__migrate_task(p, dead_cpu, dest_cpu));
}
/*
......@@ -5172,6 +5139,20 @@ static void migrate_live_tasks(int src_cpu)
write_unlock_irq(&tasklist_lock);
}
/*
* activate_idle_task - move idle task to the _front_ of runqueue.
*/
static void activate_idle_task(struct task_struct *p, struct rq *rq)
{
update_rq_clock(rq);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
enqueue_task(rq, p, 0);
inc_nr_running(p, rq);
}
/*
* Schedules idle task to be the next runnable task on current CPU.
* It does so by boosting its priority to highest possible and adding it to
......@@ -5284,14 +5265,23 @@ static struct ctl_table sd_ctl_root[] = {
static struct ctl_table *sd_alloc_ctl_entry(int n)
{
struct ctl_table *entry =
kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
BUG_ON(!entry);
memset(entry, 0, n * sizeof(struct ctl_table));
kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
return entry;
}
static void sd_free_ctl_entry(struct ctl_table **tablep)
{
struct ctl_table *entry = *tablep;
for (entry = *tablep; entry->procname; entry++)
if (entry->child)
sd_free_ctl_entry(&entry->child);
kfree(*tablep);
*tablep = NULL;
}
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
......@@ -5307,7 +5297,10 @@ set_table_entry(struct ctl_table *entry,
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
struct ctl_table *table = sd_alloc_ctl_entry(14);
struct ctl_table *table = sd_alloc_ctl_entry(12);
if (table == NULL)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
sizeof(long), 0644, proc_doulongvec_minmax);
......@@ -5327,11 +5320,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[10], "cache_nice_tries",
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[12], "flags", &sd->flags,
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
/* &table[11] is terminator */
return table;
}
......@@ -5346,6 +5340,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
for_each_domain(cpu, sd)
domain_num++;
entry = table = sd_alloc_ctl_entry(domain_num + 1);
if (table == NULL)
return NULL;
i = 0;
for_each_domain(cpu, sd) {
......@@ -5360,24 +5356,38 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
}
static struct ctl_table_header *sd_sysctl_header;
static void init_sched_domain_sysctl(void)
static void register_sched_domain_sysctl(void)
{
int i, cpu_num = num_online_cpus();
struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
char buf[32];
if (entry == NULL)
return;
sd_ctl_dir[0].child = entry;
for (i = 0; i < cpu_num; i++, entry++) {
for_each_online_cpu(i) {
snprintf(buf, 32, "cpu%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
entry->child = sd_alloc_ctl_cpu_table(i);
entry++;
}
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
static void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#else
static void init_sched_domain_sysctl(void)
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}
#endif
......@@ -5499,8 +5509,7 @@ int __init migration_init(void)
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#undef SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
#ifdef CONFIG_SCHED_DEBUG
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
......@@ -5558,16 +5567,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
printk("\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
break;
}
if (!cpus_weight(group->cpumask)) {
printk("\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
}
if (cpus_intersects(groupmask, group->cpumask)) {
printk("\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
}
cpus_or(groupmask, groupmask, group->cpumask);
......@@ -5701,7 +5713,7 @@ static int __init isolated_cpu_setup(char *str)
return 1;
}
__setup ("isolcpus=", isolated_cpu_setup);
__setup("isolcpus=", isolated_cpu_setup);
/*
* init_sched_build_groups takes the cpumask we wish to span, and a pointer
......@@ -5930,24 +5942,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
if (!sg)
return;
next_sg:
for_each_cpu_mask(j, sg->cpumask) {
struct sched_domain *sd;
do {
for_each_cpu_mask(j, sg->cpumask) {
struct sched_domain *sd;
sd = &per_cpu(phys_domains, j);
if (j != first_cpu(sd->groups->cpumask)) {
/*
* Only add "power" once for each
* physical package.
*/
continue;
}
sd = &per_cpu(phys_domains, j);
if (j != first_cpu(sd->groups->cpumask)) {
/*
* Only add "power" once for each
* physical package.
*/
continue;
}
sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
if (sg != group_head)
goto next_sg;
sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
} while (sg != group_head);
}
#endif
......@@ -6058,7 +6069,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
/*
* Allocate the per-node list of sched groups
*/
sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
GFP_KERNEL);
if (!sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
......@@ -6311,6 +6322,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
err = build_sched_domains(&cpu_default_map);
register_sched_domain_sysctl();
return err;
}
......@@ -6327,6 +6340,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
{
int i;
unregister_sched_domain_sysctl();
for_each_cpu_mask(i, *cpu_map)
cpu_attach_domain(NULL, i);
synchronize_sched();
......@@ -6357,6 +6372,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
if (!err && !cpus_empty(*partition2))
err = build_sched_domains(partition2);
register_sched_domain_sysctl();
return err;
}
......@@ -6488,17 +6505,13 @@ void __init sched_init_smp(void)
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
init_sched_domain_sysctl();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
sched_init_granularity();
}
#else
void __init sched_init_smp(void)
{
sched_init_granularity();
}
#endif /* CONFIG_SMP */
......@@ -6512,28 +6525,20 @@ int in_sched_functions(unsigned long addr)
&& addr < (unsigned long)__sched_text_end);
}
static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
cfs_rq->fair_clock = 1;
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->rq = rq;
#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}
void __init sched_init(void)
{
u64 now = sched_clock();
int highest_cpu = 0;
int i, j;
/*
* Link up the scheduling class hierarchy:
*/
rt_sched_class.next = &fair_sched_class;
fair_sched_class.next = &idle_sched_class;
idle_sched_class.next = NULL;
for_each_possible_cpu(i) {
struct rt_prio_array *array;
struct rq *rq;
......@@ -6546,10 +6551,28 @@ void __init sched_init(void)
init_cfs_rq(&rq->cfs, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
{
struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
struct sched_entity *se =
&per_cpu(init_sched_entity, i);
init_cfs_rq_p[i] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = &init_task_group;
list_add(&cfs_rq->leaf_cfs_rq_list,
&rq->leaf_cfs_rq_list);
init_sched_entity_p[i] = se;
se->cfs_rq = &rq->cfs;
se->my_q = cfs_rq;
se->load.weight = init_task_group_load;
se->load.inv_weight =
div64_64(1ULL<<32, init_task_group_load);
se->parent = NULL;
}
init_task_group.shares = init_task_group_load;
spin_lock_init(&init_task_group.lock);
#endif
rq->ls.load_update_last = now;
rq->ls.load_update_start = now;
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
......@@ -6634,26 +6657,40 @@ EXPORT_SYMBOL(__might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
int on_rq;
update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
if (on_rq) {
activate_task(rq, p, 0);
resched_task(rq->curr);
}
}
void normalize_rt_tasks(void)
{
struct task_struct *g, *p;
unsigned long flags;
struct rq *rq;
int on_rq;
read_lock_irq(&tasklist_lock);
do_each_thread(g, p) {
p->se.fair_key = 0;
p->se.wait_runtime = 0;
/*
* Only normalize user tasks:
*/
if (!p->mm)
continue;
p->se.exec_start = 0;
p->se.wait_start_fair = 0;
p->se.sleep_start_fair = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
p->se.sleep_start = 0;
p->se.block_start = 0;
#endif
task_rq(p)->cfs.fair_clock = 0;
task_rq(p)->clock = 0;
if (!rt_task(p)) {
......@@ -6668,26 +6705,9 @@ void normalize_rt_tasks(void)
spin_lock_irqsave(&p->pi_lock, flags);
rq = __task_rq_lock(p);
#ifdef CONFIG_SMP
/*
* Do not touch the migration thread:
*/
if (p == rq->migration_thread)
goto out_unlock;
#endif
update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
if (on_rq) {
activate_task(rq, p, 0);
resched_task(rq->curr);
}
#ifdef CONFIG_SMP
out_unlock:
#endif
normalize_task(rq, p);
__task_rq_unlock(rq);
spin_unlock_irqrestore(&p->pi_lock, flags);
} while_each_thread(g, p);
......@@ -6740,3 +6760,201 @@ void set_curr_task(int cpu, struct task_struct *p)
}
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(void)
{
struct task_group *tg;
struct cfs_rq *cfs_rq;
struct sched_entity *se;
struct rq *rq;
int i;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
return ERR_PTR(-ENOMEM);
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
if (!tg->se)
goto err;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
cpu_to_node(i));
if (!cfs_rq)
goto err;
se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
cpu_to_node(i));
if (!se)
goto err;
memset(cfs_rq, 0, sizeof(struct cfs_rq));
memset(se, 0, sizeof(struct sched_entity));
tg->cfs_rq[i] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = tg;
tg->se[i] = se;
se->cfs_rq = &rq->cfs;
se->my_q = cfs_rq;
se->load.weight = NICE_0_LOAD;
se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
se->parent = NULL;
}
for_each_possible_cpu(i) {
rq = cpu_rq(i);
cfs_rq = tg->cfs_rq[i];
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
}
tg->shares = NICE_0_LOAD;
spin_lock_init(&tg->lock);
return tg;
err:
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
if (tg->se)
kfree(tg->se[i]);
}
kfree(tg->cfs_rq);
kfree(tg->se);
kfree(tg);
return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
static void free_sched_group(struct rcu_head *rhp)
{
struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu);
struct task_group *tg = cfs_rq->tg;
struct sched_entity *se;
int i;
/* now it should be safe to free those cfs_rqs */
for_each_possible_cpu(i) {
cfs_rq = tg->cfs_rq[i];
kfree(cfs_rq);
se = tg->se[i];
kfree(se);
}
kfree(tg->cfs_rq);
kfree(tg->se);
kfree(tg);
}
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
struct cfs_rq *cfs_rq;
int i;
for_each_possible_cpu(i) {
cfs_rq = tg->cfs_rq[i];
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
}
cfs_rq = tg->cfs_rq[0];
/* wait for possible concurrent references to cfs_rqs complete */
call_rcu(&cfs_rq->rcu, free_sched_group);
}
/* change task's runqueue when it moves between groups.
* The caller of this function should have put the task in its new group
* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
* reflect its new group.
*/
void sched_move_task(struct task_struct *tsk)
{
int on_rq, running;
unsigned long flags;
struct rq *rq;
rq = task_rq_lock(tsk, &flags);
if (tsk->sched_class != &fair_sched_class)
goto done;
update_rq_clock(rq);
running = task_running(rq, tsk);
on_rq = tsk->se.on_rq;
if (on_rq) {
dequeue_task(rq, tsk, 0);
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
}
set_task_cfs_rq(tsk);
if (on_rq) {
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
enqueue_task(rq, tsk, 0);
}
done:
task_rq_unlock(rq, &flags);
}
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
struct rq *rq = cfs_rq->rq;
int on_rq;
spin_lock_irq(&rq->lock);
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
se->load.weight = shares;
se->load.inv_weight = div64_64((1ULL<<32), shares);
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
spin_unlock_irq(&rq->lock);
}
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
spin_lock(&tg->lock);
if (tg->shares == shares)
goto done;
tg->shares = shares;
for_each_possible_cpu(i)
set_se_shares(tg->se[i], shares);
done:
spin_unlock(&tg->lock);
return 0;
}
unsigned long sched_group_shares(struct task_group *tg)
{
return tg->shares;
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
......@@ -28,6 +28,31 @@
printk(x); \
} while (0)
/*
* Ease the printing of nsec fields:
*/
static long long nsec_high(long long nsec)
{
if (nsec < 0) {
nsec = -nsec;
do_div(nsec, 1000000);
return -nsec;
}
do_div(nsec, 1000000);
return nsec;
}
static unsigned long nsec_low(long long nsec)
{
if (nsec < 0)
nsec = -nsec;
return do_div(nsec, 1000000);
}
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
......@@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " ");
SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ",
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, p->pid,
(long long)p->se.fair_key,
(long long)(p->se.fair_key - rq->cfs.fair_clock),
(long long)p->se.wait_runtime,
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
(long long)p->se.sum_exec_runtime,
(long long)p->se.sum_wait_runtime,
(long long)p->se.sum_sleep_runtime,
(long long)p->se.wait_runtime_overruns,
(long long)p->se.wait_runtime_underruns);
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n",
SPLIT_NS(p->se.vruntime),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.sum_sleep_runtime));
#else
SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
0LL, 0LL, 0LL, 0LL, 0LL);
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
}
......@@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
" task PID tree-key delta waiting"
" switches prio"
" sum-exec sum-wait sum-sleep"
" wait-overrun wait-underrun\n"
"------------------------------------------------------------------"
"----------------"
"------------------------------------------------"
"--------------------------------\n");
" task PID tree-key switches prio"
" exec-runtime sum-exec sum-sleep\n"
"------------------------------------------------------"
"----------------------------------------------------\n");
read_lock_irq(&tasklist_lock);
......@@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_unlock_irq(&tasklist_lock);
}
static void
print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 wait_runtime_rq_sum = 0;
struct task_struct *p;
struct rb_node *curr;
unsigned long flags;
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
spread, rq0_min_vruntime, spread0;
struct rq *rq = &per_cpu(runqueues, cpu);
struct sched_entity *last;
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
curr = first_fair(cfs_rq);
while (curr) {
p = rb_entry(curr, struct task_struct, se.run_node);
wait_runtime_rq_sum += p->se.wait_runtime;
curr = rb_next(curr);
}
spin_unlock_irqrestore(&rq->lock, flags);
SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
(long long)wait_runtime_rq_sum);
}
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
SEQ_printf(m, "\ncfs_rq\n");
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
P(fair_clock);
P(exec_clock);
P(wait_runtime);
P(wait_runtime_overruns);
P(wait_runtime_underruns);
P(sleeper_bonus);
#undef P
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
spin_lock_irqsave(&rq->lock, flags);
if (cfs_rq->rb_leftmost)
MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
max_vruntime = last->vruntime;
min_vruntime = rq->cfs.min_vruntime;
rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
spin_unlock_irqrestore(&rq->lock, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
SPLIT_NS(MIN_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
SPLIT_NS(max_vruntime));
spread = max_vruntime - MIN_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
SPLIT_NS(spread));
spread0 = min_vruntime - rq0_min_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, " .%-30s: %ld\n", "bkl_count",
rq->bkl_count);
#endif
SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
cfs_rq->nr_spread_over);
}
static void print_cpu(struct seq_file *m, int cpu)
......@@ -141,31 +161,32 @@ static void print_cpu(struct seq_file *m, int cpu)
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
#define PN(x) \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
P(nr_running);
SEQ_printf(m, " .%-30s: %lu\n", "load",
rq->ls.load.weight);
P(ls.delta_fair);
P(ls.delta_exec);
rq->load.weight);
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
P(next_balance);
PN(next_balance);
P(curr->pid);
P(clock);
P(idle_clock);
P(prev_clock_raw);
PN(clock);
PN(idle_clock);
PN(prev_clock_raw);
P(clock_warps);
P(clock_overflows);
P(clock_deep_idle_events);
P(clock_max_delta);
PN(clock_max_delta);
P(cpu_load[0]);
P(cpu_load[1]);
P(cpu_load[2]);
P(cpu_load[3]);
P(cpu_load[4]);
#undef P
#undef PN
print_cfs_stats(m, cpu);
......@@ -177,12 +198,25 @@ static int sched_debug_show(struct seq_file *m, void *v)
u64 now = ktime_to_ns(ktime_get());
int cpu;
SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n",
SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
#define P(x) \
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
PN(sysctl_sched_latency);
PN(sysctl_sched_nr_latency);
PN(sysctl_sched_wakeup_granularity);
PN(sysctl_sched_batch_wakeup_granularity);
PN(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
#undef P
for_each_online_cpu(cpu)
print_cpu(m, cpu);
......@@ -202,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp)
return single_open(filp, sched_debug_show, NULL);
}
static struct file_operations sched_debug_fops = {
static const struct file_operations sched_debug_fops = {
.open = sched_debug_open,
.read = seq_read,
.llseek = seq_lseek,
......@@ -226,6 +260,7 @@ __initcall(init_sched_debug_procfs);
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
unsigned long flags;
int num_threads = 1;
......@@ -237,41 +272,89 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
rcu_read_unlock();
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
SEQ_printf(m, "----------------------------------------------\n");
SEQ_printf(m,
"---------------------------------------------------------\n");
#define __P(F) \
SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
#define P(F) \
SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) \
SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
P(se.wait_runtime);
P(se.wait_start_fair);
P(se.exec_start);
P(se.sleep_start_fair);
P(se.sum_exec_runtime);
PN(se.exec_start);
PN(se.vruntime);
PN(se.sum_exec_runtime);
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
P(se.wait_start);
P(se.sleep_start);
P(se.block_start);
P(se.sleep_max);
P(se.block_max);
P(se.exec_max);
P(se.wait_max);
P(se.wait_runtime_overruns);
P(se.wait_runtime_underruns);
P(se.sum_wait_runtime);
PN(se.wait_start);
PN(se.sleep_start);
PN(se.block_start);
PN(se.sleep_max);
PN(se.block_max);
PN(se.exec_max);
PN(se.slice_max);
PN(se.wait_max);
P(sched_info.bkl_count);
P(se.nr_migrations);
P(se.nr_migrations_cold);
P(se.nr_failed_migrations_affine);
P(se.nr_failed_migrations_running);
P(se.nr_failed_migrations_hot);
P(se.nr_forced_migrations);
P(se.nr_forced2_migrations);
P(se.nr_wakeups);
P(se.nr_wakeups_sync);
P(se.nr_wakeups_migrate);
P(se.nr_wakeups_local);
P(se.nr_wakeups_remote);
P(se.nr_wakeups_affine);
P(se.nr_wakeups_affine_attempts);
P(se.nr_wakeups_passive);
P(se.nr_wakeups_idle);
{
u64 avg_atom, avg_per_cpu;
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
do_div(avg_atom, nr_switches);
else
avg_atom = -1LL;
avg_per_cpu = p->se.sum_exec_runtime;
if (p->se.nr_migrations)
avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations);
else
avg_per_cpu = -1LL;
__PN(avg_atom);
__PN(avg_per_cpu);
}
#endif
SEQ_printf(m, "%-25s:%20Ld\n",
"nr_switches", (long long)(p->nvcsw + p->nivcsw));
__P(nr_switches);
SEQ_printf(m, "%-35s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw);
SEQ_printf(m, "%-35s:%21Ld\n",
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
P(policy);
P(prio);
#undef PN
#undef __PN
#undef P
#undef __P
{
u64 t0, t1;
t0 = sched_clock();
t1 = sched_clock();
SEQ_printf(m, "%-25s:%20Ld\n",
SEQ_printf(m, "%-35s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
}
}
......@@ -279,9 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
p->se.wait_max = 0;
p->se.sleep_max = 0;
p->se.sum_sleep_runtime = 0;
p->se.block_max = 0;
p->se.exec_max = 0;
p->se.slice_max = 0;
p->se.nr_migrations = 0;
p->se.nr_migrations_cold = 0;
p->se.nr_failed_migrations_affine = 0;
p->se.nr_failed_migrations_running = 0;
p->se.nr_failed_migrations_hot = 0;
p->se.nr_forced_migrations = 0;
p->se.nr_forced2_migrations = 0;
p->se.nr_wakeups = 0;
p->se.nr_wakeups_sync = 0;
p->se.nr_wakeups_migrate = 0;
p->se.nr_wakeups_local = 0;
p->se.nr_wakeups_remote = 0;
p->se.nr_wakeups_affine = 0;
p->se.nr_wakeups_affine_attempts = 0;
p->se.nr_wakeups_passive = 0;
p->se.nr_wakeups_idle = 0;
p->sched_info.bkl_count = 0;
#endif
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->nvcsw = 0;
p->nivcsw = 0;
}
......@@ -25,22 +25,26 @@
* (default: 20ms, units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length.
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches field)
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.
*
* On SMP systems the value of this is multiplied by the log2 of the
* number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
* systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
* Targeted preemption latency for CPU-bound tasks:
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
const_debug unsigned int sysctl_sched_latency = 20000000ULL;
/*
* After fork, child runs first. (default) If set to 0 then
* parent will (try to) run first.
*/
const_debug unsigned int sysctl_sched_child_runs_first = 1;
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 2 msec, units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
const_debug unsigned int sysctl_sched_nr_latency = 20;
/*
* sys_sched_yield() compat mode
......@@ -52,52 +56,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
/*
* SCHED_BATCH wake-up granularity.
* (default: 25 msec, units: nanoseconds)
* (default: 10 msec, units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL;
const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec, units: nanoseconds)
* (default: 10 msec, units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL;
unsigned int sysctl_sched_stat_granularity __read_mostly;
/*
* Initialized in sched_init_granularity() [to 5 times the base granularity]:
*/
unsigned int sysctl_sched_runtime_limit __read_mostly;
/*
* Debugging: various feature bits
*/
enum {
SCHED_FEAT_FAIR_SLEEPERS = 1,
SCHED_FEAT_SLEEPER_AVG = 2,
SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
SCHED_FEAT_PRECISE_CPU_LOAD = 8,
SCHED_FEAT_START_DEBIT = 16,
SCHED_FEAT_SKIP_INITIAL = 32,
};
const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
unsigned int sysctl_sched_features __read_mostly =
SCHED_FEAT_FAIR_SLEEPERS *1 |
SCHED_FEAT_SLEEPER_AVG *0 |
SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
SCHED_FEAT_PRECISE_CPU_LOAD *1 |
SCHED_FEAT_START_DEBIT *1 |
SCHED_FEAT_SKIP_INITIAL *0;
extern struct sched_class fair_sched_class;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
/**************************************************************
* CFS operations on generic schedulable entities:
......@@ -111,21 +88,9 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return cfs_rq->rq;
}
/* currently running entity (if any) on this cfs_rq */
static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
{
return cfs_rq->curr;
}
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
static inline void
set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq->curr = se;
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
......@@ -133,21 +98,8 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return container_of(cfs_rq, struct rq, cfs);
}
static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
if (unlikely(rq->curr->sched_class != &fair_sched_class))
return NULL;
return &rq->curr->se;
}
#define entity_is_task(se) 1
static inline void
set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline struct task_struct *task_of(struct sched_entity *se)
......@@ -160,16 +112,38 @@ static inline struct task_struct *task_of(struct sched_entity *se)
* Scheduling class tree data structure manipulation methods:
*/
static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta > 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return se->vruntime - cfs_rq->min_vruntime;
}
/*
* Enqueue an entity into the rb-tree:
*/
static inline void
__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
s64 key = se->fair_key;
s64 key = entity_key(cfs_rq, se);
int leftmost = 1;
/*
......@@ -182,7 +156,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* We dont care about collisions. Nodes with
* the same key stay together.
*/
if (key - entry->fair_key < 0) {
if (key < entity_key(cfs_rq, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
......@@ -199,24 +173,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
update_load_add(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
}
static inline void
__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node)
cfs_rq->rb_leftmost = rb_next(&se->run_node);
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
update_load_sub(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
......@@ -229,118 +193,86 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
}
static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct sched_entity *se = NULL;
struct rb_node *parent;
while (*link) {
parent = *link;
se = rb_entry(parent, struct sched_entity, run_node);
link = &parent->rb_right;
}
return se;
}
/**************************************************************
* Scheduling class statistics methods:
*/
/*
* Calculate the preemption granularity needed to schedule every
* runnable task once per sysctl_sched_latency amount of time.
* (down to a sensible low limit on granularity)
*
* For example, if there are 2 tasks running and latency is 10 msecs,
* we switch tasks every 5 msecs. If we have 3 tasks running, we have
* to switch tasks every 3.33 msecs to get a 10 msecs observed latency
* for each task. We do finer and finer scheduling up to until we
* reach the minimum granularity value.
*
* To achieve this we use the following dynamic-granularity rule:
* The idea is to set a period in which each task runs once.
*
* gran = lat/nr - lat/nr/nr
* When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* This comes out of the following equations:
*
* kA1 + gran = kB1
* kB2 + gran = kA2
* kA2 = kA1
* kB2 = kB1 - d + d/nr
* lat = d * nr
*
* Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
* '1' is start of time, '2' is end of time, 'd' is delay between
* 1 and 2 (during which task B was running), 'nr' is number of tasks
* running, 'lat' is the the period of each task. ('lat' is the
* sched_latency that we aim for.)
* p = (nr <= nl) ? l : l*nr/nl
*/
static long
sched_granularity(struct cfs_rq *cfs_rq)
static u64 __sched_period(unsigned long nr_running)
{
unsigned int gran = sysctl_sched_latency;
unsigned int nr = cfs_rq->nr_running;
u64 period = sysctl_sched_latency;
unsigned long nr_latency = sysctl_sched_nr_latency;
if (nr > 1) {
gran = gran/nr - gran/nr/nr;
gran = max(gran, sysctl_sched_min_granularity);
if (unlikely(nr_running > nr_latency)) {
period *= nr_running;
do_div(period, nr_latency);
}
return gran;
return period;
}
/*
* We rescale the rescheduling granularity of tasks according to their
* nice level, but only linearly, not exponentially:
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
* s = p*w/rw
*/
static long
niced_granularity(struct sched_entity *curr, unsigned long granularity)
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 tmp;
u64 slice = __sched_period(cfs_rq->nr_running);
if (likely(curr->load.weight == NICE_0_LOAD))
return granularity;
/*
* Positive nice levels get the same granularity as nice-0:
*/
if (likely(curr->load.weight < NICE_0_LOAD)) {
tmp = curr->load.weight * (u64)granularity;
return (long) (tmp >> NICE_0_SHIFT);
}
/*
* Negative nice level tasks get linearly finer
* granularity:
*/
tmp = curr->load.inv_weight * (u64)granularity;
slice *= se->load.weight;
do_div(slice, cfs_rq->load.weight);
/*
* It will always fit into 'long':
*/
return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
return slice;
}
static inline void
limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We calculate the vruntime slice.
*
* vs = s/w = p/rw
*/
static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
{
long limit = sysctl_sched_runtime_limit;
u64 vslice = __sched_period(nr_running);
/*
* Niced tasks have the same history dynamic range as
* non-niced tasks:
*/
if (unlikely(se->wait_runtime > limit)) {
se->wait_runtime = limit;
schedstat_inc(se, wait_runtime_overruns);
schedstat_inc(cfs_rq, wait_runtime_overruns);
}
if (unlikely(se->wait_runtime < -limit)) {
se->wait_runtime = -limit;
schedstat_inc(se, wait_runtime_underruns);
schedstat_inc(cfs_rq, wait_runtime_underruns);
}
do_div(vslice, rq_weight);
return vslice;
}
static inline void
__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
static u64 sched_vslice(struct cfs_rq *cfs_rq)
{
se->wait_runtime += delta;
schedstat_add(se, sum_wait_runtime, delta);
limit_wait_runtime(cfs_rq, se);
return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
}
static void
add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
__add_wait_runtime(cfs_rq, se, delta);
schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
return __sched_vslice(cfs_rq->load.weight + se->load.weight,
cfs_rq->nr_running + 1);
}
/*
......@@ -348,46 +280,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
* are not in our scheduling class.
*/
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr)
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta, delta_exec, delta_fair, delta_mine;
struct load_weight *lw = &cfs_rq->load;
unsigned long load = lw->weight;
unsigned long delta_exec_weighted;
u64 vruntime;
delta_exec = curr->delta_exec;
schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
curr->sum_exec_runtime += delta_exec;
cfs_rq->exec_clock += delta_exec;
if (unlikely(!load))
return;
delta_fair = calc_delta_fair(delta_exec, lw);
delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
delta = min(delta, (unsigned long)(
(long)sysctl_sched_runtime_limit - curr->wait_runtime));
cfs_rq->sleeper_bonus -= delta;
delta_mine -= delta;
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = delta_exec;
if (unlikely(curr->load.weight != NICE_0_LOAD)) {
delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
&curr->load);
}
curr->vruntime += delta_exec_weighted;
cfs_rq->fair_clock += delta_fair;
/*
* We executed delta_exec amount of time on the CPU,
* but we were only entitled to delta_mine amount of
* time during that period (if nr_running == 1 then
* the two values are equal)
* [Note: delta_mine - delta_exec is negative]:
* maintain cfs_rq->min_vruntime to be a monotonic increasing
* value tracking the leftmost vruntime in the tree.
*/
add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
if (first_fair(cfs_rq)) {
vruntime = min_vruntime(curr->vruntime,
__pick_next_entity(cfs_rq)->vruntime);
} else
vruntime = curr->vruntime;
cfs_rq->min_vruntime =
max_vruntime(cfs_rq->min_vruntime, vruntime);
}
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq_curr(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_of(cfs_rq)->clock;
unsigned long delta_exec;
if (unlikely(!curr))
......@@ -398,135 +325,47 @@ static void update_curr(struct cfs_rq *cfs_rq)
* since the last time we changed load (this cannot
* overflow on 32 bits):
*/
delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start);
delta_exec = (unsigned long)(now - curr->exec_start);
curr->delta_exec += delta_exec;
if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
__update_curr(cfs_rq, curr);
curr->delta_exec = 0;
}
curr->exec_start = rq_of(cfs_rq)->clock;
__update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
}
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
se->wait_start_fair = cfs_rq->fair_clock;
schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
}
/*
* We calculate fair deltas here, so protect against the random effects
* of a multiplication overflow by capping it to the runtime limit:
*/
#if BITS_PER_LONG == 32
static inline unsigned long
calc_weighted(unsigned long delta, unsigned long weight, int shift)
{
u64 tmp = (u64)delta * weight >> shift;
if (unlikely(tmp > sysctl_sched_runtime_limit*2))
return sysctl_sched_runtime_limit*2;
return tmp;
}
#else
static inline unsigned long
calc_weighted(unsigned long delta, unsigned long weight, int shift)
{
return delta * weight >> shift;
}
#endif
/*
* Task is being enqueued - update stats:
*/
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 key;
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq_curr(cfs_rq))
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
/*
* Update the key:
*/
key = cfs_rq->fair_clock;
/*
* Optimize the common nice 0 case:
*/
if (likely(se->load.weight == NICE_0_LOAD)) {
key -= se->wait_runtime;
} else {
u64 tmp;
if (se->wait_runtime < 0) {
tmp = -se->wait_runtime;
key += (tmp * se->load.inv_weight) >>
(WMULT_SHIFT - NICE_0_SHIFT);
} else {
tmp = se->wait_runtime;
key -= (tmp * se->load.inv_weight) >>
(WMULT_SHIFT - NICE_0_SHIFT);
}
}
se->fair_key = key;
}
/*
* Note: must be called with a freshly updated rq->fair_clock.
*/
static inline void
__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long delta_fair = se->delta_fair_run;
schedstat_set(se->wait_max, max(se->wait_max,
rq_of(cfs_rq)->clock - se->wait_start));
if (unlikely(se->load.weight != NICE_0_LOAD))
delta_fair = calc_weighted(delta_fair, se->load.weight,
NICE_0_SHIFT);
add_wait_runtime(cfs_rq, se, delta_fair);
}
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long delta_fair;
if (unlikely(!se->wait_start_fair))
return;
delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
(u64)(cfs_rq->fair_clock - se->wait_start_fair));
se->delta_fair_run += delta_fair;
if (unlikely(abs(se->delta_fair_run) >=
sysctl_sched_stat_granularity)) {
__update_stats_wait_end(cfs_rq, se);
se->delta_fair_run = 0;
}
se->wait_start_fair = 0;
schedstat_set(se->wait_max, max(se->wait_max,
rq_of(cfs_rq)->clock - se->wait_start));
schedstat_set(se->wait_start, 0);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_curr(cfs_rq);
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
*/
if (se != cfs_rq_curr(cfs_rq))
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
}
......@@ -542,79 +381,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->exec_start = rq_of(cfs_rq)->clock;
}
/*
* We are descheduling a task - update its stats:
*/
static inline void
update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
se->exec_start = 0;
}
/**************************************************
* Scheduling class queueing methods:
*/
static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long load = cfs_rq->load.weight, delta_fair;
long prev_runtime;
/*
* Do not boost sleepers if there's too much bonus 'in flight'
* already:
*/
if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
return;
if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
load = rq_of(cfs_rq)->cpu_load[2];
delta_fair = se->delta_fair_sleep;
/*
* Fix up delta_fair with the effect of us running
* during the whole sleep period:
*/
if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
delta_fair = div64_likely32((u64)delta_fair * load,
load + se->load.weight);
if (unlikely(se->load.weight != NICE_0_LOAD))
delta_fair = calc_weighted(delta_fair, se->load.weight,
NICE_0_SHIFT);
prev_runtime = se->wait_runtime;
__add_wait_runtime(cfs_rq, se, delta_fair);
delta_fair = se->wait_runtime - prev_runtime;
update_load_add(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
}
/*
* Track the amount of bonus we've given to sleepers:
*/
cfs_rq->sleeper_bonus += delta_fair;
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
}
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *tsk = task_of(se);
unsigned long delta_fair;
if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
!(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
return;
delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
(u64)(cfs_rq->fair_clock - se->sleep_start_fair));
se->delta_fair_sleep += delta_fair;
if (unlikely(abs(se->delta_fair_sleep) >=
sysctl_sched_stat_granularity)) {
__enqueue_sleeper(cfs_rq, se);
se->delta_fair_sleep = 0;
}
se->sleep_start_fair = 0;
#ifdef CONFIG_SCHEDSTATS
if (se->sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
......@@ -646,6 +434,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
struct task_struct *tsk = task_of(se);
profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
delta >> 20);
}
......@@ -653,27 +443,81 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
#endif
}
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
s64 d = se->vruntime - cfs_rq->min_vruntime;
if (d < 0)
d = -d;
if (d > 3*sysctl_sched_latency)
schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime;
vruntime = cfs_rq->min_vruntime;
if (sched_feat(TREE_AVG)) {
struct sched_entity *last = __pick_last_entity(cfs_rq);
if (last) {
vruntime += last->vruntime;
vruntime >>= 1;
}
} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
vruntime += sched_vslice(cfs_rq)/2;
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice_add(cfs_rq, se);
if (!initial) {
if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
task_of(se)->policy != SCHED_BATCH)
vruntime -= sysctl_sched_latency;
vruntime = max_t(s64, vruntime, se->vruntime);
}
se->vruntime = vruntime;
}
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
/*
* Update the fair clock.
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
if (wakeup)
if (wakeup) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
}
update_stats_enqueue(cfs_rq, se);
__enqueue_entity(cfs_rq, se);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
}
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
update_stats_dequeue(cfs_rq, se);
if (sleep) {
se->sleep_start_fair = cfs_rq->fair_clock;
se->peer_preempt = 0;
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
......@@ -685,68 +529,66 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
}
#endif
}
__dequeue_entity(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
struct sched_entity *curr, unsigned long granularity)
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
s64 __delta = curr->fair_key - se->fair_key;
unsigned long ideal_runtime, delta_exec;
/*
* ideal_runtime is compared against sum_exec_runtime, which is
* walltime, hence do not scale.
*/
ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
(unsigned long)sysctl_sched_min_granularity);
/*
* If we executed more than what the latency constraint suggests,
* reduce the rescheduling granularity. This way the total latency
* of how much a task is not scheduled converges to
* sysctl_sched_latency:
*/
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime)
granularity = 0;
/*
* Take scheduling granularity into account - do not
* preempt the current task unless the best task has
* a larger than sched_granularity fairness advantage:
*
* scale granularity as key space is in fair_clock.
*/
if (__delta > niced_granularity(curr, granularity))
if (delta_exec > ideal_runtime ||
(sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
resched_task(rq_of(cfs_rq)->curr);
curr->peer_preempt = 0;
}
static inline void
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue. (note, here we rely on pick_next_task() having
* done a put_prev_task_fair() shortly before this, which
* updated rq->fair_clock - used by update_stats_wait_end())
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
update_stats_wait_end(cfs_rq, se);
update_stats_curr_start(cfs_rq, se);
set_cfs_rq_curr(cfs_rq, se);
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->slice_max = max(se->slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_next_entity(cfs_rq);
struct sched_entity *se = NULL;
set_next_entity(cfs_rq, se);
if (first_fair(cfs_rq)) {
se = __pick_next_entity(cfs_rq);
set_next_entity(cfs_rq, se);
}
return se;
}
......@@ -760,33 +602,24 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
if (prev->on_rq)
update_curr(cfs_rq);
update_stats_curr_end(cfs_rq, prev);
if (prev->on_rq)
check_spread(cfs_rq, prev);
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
set_cfs_rq_curr(cfs_rq, NULL);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
}
cfs_rq->curr = NULL;
}
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
struct sched_entity *next;
/*
* Dequeue and enqueue the task to update its
* position within the tree:
* Update run-time statistics of the 'current'.
*/
dequeue_entity(cfs_rq, curr, 0);
enqueue_entity(cfs_rq, curr, 0);
/*
* Reschedule if another task tops the current one.
*/
next = __pick_next_entity(cfs_rq);
if (next == curr)
return;
update_curr(cfs_rq);
__check_preempt_curr_fair(cfs_rq, next, curr,
sched_granularity(cfs_rq));
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
check_preempt_tick(cfs_rq, curr);
}
/**************************************************
......@@ -821,23 +654,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
*/
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
/* A later patch will take group into account */
return &cpu_rq(this_cpu)->cfs;
return cfs_rq->tg->cfs_rq[this_cpu];
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) tasks belong to the same group ? */
static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (curr->se.cfs_rq == p->se.cfs_rq)
if (se->cfs_rq == pse->cfs_rq)
return 1;
return 0;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return se->parent;
}
#else /* CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
......@@ -870,11 +708,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
return 1;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
......@@ -892,6 +736,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1;
}
}
......@@ -911,6 +756,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
sleep = 1;
}
}
......@@ -919,12 +765,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
*
* If compat_yield is turned on then we requeue to the end of the tree.
*/
static void yield_task_fair(struct rq *rq, struct task_struct *p)
static void yield_task_fair(struct rq *rq)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct sched_entity *rightmost, *se = &p->se;
struct rb_node *parent;
struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr);
struct sched_entity *rightmost, *se = &rq->curr->se;
/*
* Are we the only task in the tree?
......@@ -935,52 +779,39 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p)
if (likely(!sysctl_sched_compat_yield)) {
__update_rq_clock(rq);
/*
* Dequeue and enqueue the task to update its
* position within the tree:
* Update run-time statistics of the 'current'.
*/
dequeue_entity(cfs_rq, &p->se, 0);
enqueue_entity(cfs_rq, &p->se, 0);
update_curr(cfs_rq);
return;
}
/*
* Find the rightmost entry in the rbtree:
*/
do {
parent = *link;
link = &parent->rb_right;
} while (*link);
rightmost = rb_entry(parent, struct sched_entity, run_node);
rightmost = __pick_last_entity(cfs_rq);
/*
* Already in the rightmost position?
*/
if (unlikely(rightmost == se))
if (unlikely(rightmost->vruntime < se->vruntime))
return;
/*
* Minimally necessary key value to be last in the tree:
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
*/
se->fair_key = rightmost->fair_key + 1;
if (cfs_rq->rb_leftmost == &se->run_node)
cfs_rq->rb_leftmost = rb_next(&se->run_node);
/*
* Relink the task to the rightmost position:
*/
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
se->vruntime = rightmost->vruntime + 1;
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
unsigned long gran;
struct sched_entity *se = &curr->se, *pse = &p->se;
s64 delta, gran;
if (unlikely(rt_prio(p->prio))) {
update_rq_clock(rq);
......@@ -988,16 +819,31 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
resched_task(curr);
return;
}
gran = sysctl_sched_wakeup_granularity;
/*
* Batch tasks prefer throughput over latency:
* Batch tasks do not preempt (their preemption is driven by
* the tick):
*/
if (unlikely(p->policy == SCHED_BATCH))
gran = sysctl_sched_batch_wakeup_granularity;
return;
if (sched_feat(WAKEUP_PREEMPT)) {
while (!is_same_group(se, pse)) {
se = parent_entity(se);
pse = parent_entity(pse);
}
if (is_same_group(curr, p))
__check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
delta = se->vruntime - pse->vruntime;
gran = sysctl_sched_wakeup_granularity;
if (unlikely(se->load.weight != NICE_0_LOAD))
gran = calc_delta_fair(gran, &se->load);
if (delta > gran) {
int now = !sched_feat(PREEMPT_RESTRICT);
if (now || p->prio < curr->prio || !se->peer_preempt++)
resched_task(curr);
}
}
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
......@@ -1041,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
* achieve that by always pre-iterating before returning
* the current task:
*/
static inline struct task_struct *
static struct task_struct *
__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
{
struct task_struct *p;
......@@ -1078,7 +924,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
if (!cfs_rq->nr_running)
return MAX_PRIO;
curr = __pick_next_entity(cfs_rq);
curr = cfs_rq->curr;
if (!curr)
curr = __pick_next_entity(cfs_rq);
p = task_of(curr);
return p->prio;
......@@ -1153,6 +1002,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
}
}
#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
/*
* Share the fairness runtime between parent and child, thus the
* total amount of pressure for CPU stays equal - new tasks
......@@ -1163,37 +1014,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
static void task_new_fair(struct rq *rq, struct task_struct *p)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq);
struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
int this_cpu = smp_processor_id();
sched_info_queued(p);
update_curr(cfs_rq);
update_stats_enqueue(cfs_rq, se);
/*
* Child runs first: we let it run before the parent
* until it reschedules once. We set up the key so that
* it will preempt the parent:
*/
se->fair_key = curr->fair_key -
niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
/*
* The first wait is dominated by the child-runs-first logic,
* so do not credit it with that waiting time yet:
*/
if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
se->wait_start_fair = 0;
place_entity(cfs_rq, se, 1);
/*
* The statistical average of wait_runtime is about
* -granularity/2, so initialize the task with that:
*/
if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
se->wait_runtime = -(sched_granularity(cfs_rq) / 2);
if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
curr->vruntime < se->vruntime) {
/*
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
}
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
check_spread(cfs_rq, curr);
__enqueue_entity(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
se->peer_preempt = 0;
resched_task(rq->curr);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Account for a task changing its policy or group.
*
* This routine is mostly called to set cfs_rq->curr field when a task
......@@ -1206,21 +1052,17 @@ static void set_curr_task_fair(struct rq *rq)
for_each_sched_entity(se)
set_next_entity(cfs_rq_of(se), se);
}
#else
static void set_curr_task_fair(struct rq *rq)
{
}
#endif
/*
* All the scheduling class methods:
*/
struct sched_class fair_sched_class __read_mostly = {
static const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.check_preempt_curr = check_preempt_curr_fair,
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
......@@ -1237,6 +1079,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq;
#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
#endif
for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
print_cfs_rq(m, cpu, cfs_rq);
}
......
......@@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
{
}
static void set_curr_task_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
static struct sched_class idle_sched_class __read_mostly = {
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
......@@ -66,6 +71,7 @@ static struct sched_class idle_sched_class __read_mostly = {
.load_balance = load_balance_idle,
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
/* no .task_new for idle tasks */
};
......@@ -7,7 +7,7 @@
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static inline void update_curr_rt(struct rq *rq)
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
u64 delta_exec;
......@@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p)
}
static void
yield_task_rt(struct rq *rq, struct task_struct *p)
yield_task_rt(struct rq *rq)
{
requeue_task_rt(rq, p);
requeue_task_rt(rq, rq->curr);
}
/*
......@@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
if (--p->time_slice)
return;
p->time_slice = static_prio_timeslice(p->static_prio);
p->time_slice = DEF_TIMESLICE;
/*
* Requeue to the end of queue if we are not the only element
......@@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
}
}
static struct sched_class rt_sched_class __read_mostly = {
static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
}
const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
......@@ -230,5 +238,6 @@ static struct sched_class rt_sched_class __read_mostly = {
.load_balance = load_balance_rt,
.set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
};
......@@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v)
struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcnt = 0;
int dcount = 0;
#endif
/* runqueue-specific stats */
seq_printf(seq,
"cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
cpu, rq->yld_both_empty,
rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
rq->ttwu_cnt, rq->ttwu_local,
rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
rq->sched_switch, rq->sched_count, rq->sched_goidle,
rq->ttwu_count, rq->ttwu_local,
rq->rq_sched_info.cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
seq_printf(seq, "\n");
......@@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v)
char mask_str[NR_CPUS];
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
seq_printf(seq, "domain%d %s", dcount++, mask_str);
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
itype++) {
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
"%lu",
sd->lb_cnt[itype],
sd->lb_count[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
sd->lb_imbalance[itype],
......@@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
}
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
" %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
sd->alb_count, sd->alb_failed, sd->alb_pushed,
sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
}
......@@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
if (rq) {
rq->rq_sched_info.run_delay += delta;
rq->rq_sched_info.pcnt++;
rq->rq_sched_info.pcount++;
}
}
......@@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
# define schedstat_set(var, val) do { } while (0)
#endif
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
#ifdef CONFIG_SCHEDSTATS
/*
* Called when a process is dequeued from the active array and given
* the cpu. We should note that with the exception of interactive
......@@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t)
sched_info_dequeued(t);
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcnt++;
t->sched_info.pcount++;
rq_sched_info_arrive(task_rq(t), delta);
}
......@@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
#endif /* CONFIG_SCHEDSTATS */
......@@ -222,14 +222,11 @@ static ctl_table kern_table[] = {
#ifdef CONFIG_SCHED_DEBUG
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_min_granularity_ns",
.data = &sysctl_sched_min_granularity,
.procname = "sched_nr_latency",
.data = &sysctl_sched_nr_latency,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
......@@ -266,38 +263,24 @@ static ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_stat_granularity_ns",
.data = &sysctl_sched_stat_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_runtime_limit_ns",
.data = &sysctl_sched_runtime_limit,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.procname = "sched_features",
.data = &sysctl_sched_features,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_features",
.data = &sysctl_sched_features,
.procname = "sched_migration_cost",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
......
......@@ -50,12 +50,16 @@ struct user_struct root_user = {
.uid_keyring = &root_user_keyring,
.session_keyring = &root_session_keyring,
#endif
#ifdef CONFIG_FAIR_USER_SCHED
.tg = &init_task_group,
#endif
};
/*
* These routines must be called with the uidhash spinlock held!
*/
static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
static inline void uid_hash_insert(struct user_struct *up,
struct hlist_head *hashent)
{
hlist_add_head(&up->uidhash_node, hashent);
}
......@@ -65,13 +69,14 @@ static inline void uid_hash_remove(struct user_struct *up)
hlist_del_init(&up->uidhash_node);
}
static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
static inline struct user_struct *uid_hash_find(uid_t uid,
struct hlist_head *hashent)
{
struct user_struct *user;
struct hlist_node *h;
hlist_for_each_entry(user, h, hashent, uidhash_node) {
if(user->uid == uid) {
if (user->uid == uid) {
atomic_inc(&user->__count);
return user;
}
......@@ -80,6 +85,203 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *ha
return NULL;
}
#ifdef CONFIG_FAIR_USER_SCHED
static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
static DEFINE_MUTEX(uids_mutex);
static void sched_destroy_user(struct user_struct *up)
{
sched_destroy_group(up->tg);
}
static int sched_create_user(struct user_struct *up)
{
int rc = 0;
up->tg = sched_create_group();
if (IS_ERR(up->tg))
rc = -ENOMEM;
return rc;
}
static void sched_switch_user(struct task_struct *p)
{
sched_move_task(p);
}
static inline void uids_mutex_lock(void)
{
mutex_lock(&uids_mutex);
}
static inline void uids_mutex_unlock(void)
{
mutex_unlock(&uids_mutex);
}
/* return cpu shares held by the user */
ssize_t cpu_shares_show(struct kset *kset, char *buffer)
{
struct user_struct *up = container_of(kset, struct user_struct, kset);
return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
}
/* modify cpu shares held by the user */
ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
{
struct user_struct *up = container_of(kset, struct user_struct, kset);
unsigned long shares;
int rc;
sscanf(buffer, "%lu", &shares);
rc = sched_group_set_shares(up->tg, shares);
return (rc ? rc : size);
}
static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
{
sa->attr.name = name;
sa->attr.mode = mode;
sa->show = cpu_shares_show;
sa->store = cpu_shares_store;
}
/* Create "/sys/kernel/uids/<uid>" directory and
* "/sys/kernel/uids/<uid>/cpu_share" file for this user.
*/
static int user_kobject_create(struct user_struct *up)
{
struct kset *kset = &up->kset;
struct kobject *kobj = &kset->kobj;
int error;
memset(kset, 0, sizeof(struct kset));
kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */
kobject_set_name(kobj, "%d", up->uid);
kset_init(kset);
user_attr_init(&up->user_attr, "cpu_share", 0644);
error = kobject_add(kobj);
if (error)
goto done;
error = sysfs_create_file(kobj, &up->user_attr.attr);
if (error)
kobject_del(kobj);
kobject_uevent(kobj, KOBJ_ADD);
done:
return error;
}
/* create these in sysfs filesystem:
* "/sys/kernel/uids" directory
* "/sys/kernel/uids/0" directory (for root user)
* "/sys/kernel/uids/0/cpu_share" file (for root user)
*/
int __init uids_kobject_init(void)
{
int error;
/* create under /sys/kernel dir */
uids_kobject.parent = &kernel_subsys.kobj;
uids_kobject.kset = &kernel_subsys;
kobject_set_name(&uids_kobject, "uids");
kobject_init(&uids_kobject);
error = kobject_add(&uids_kobject);
if (!error)
error = user_kobject_create(&root_user);
return error;
}
/* work function to remove sysfs directory for a user and free up
* corresponding structures.
*/
static void remove_user_sysfs_dir(struct work_struct *w)
{
struct user_struct *up = container_of(w, struct user_struct, work);
struct kobject *kobj = &up->kset.kobj;
unsigned long flags;
int remove_user = 0;
/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
* atomic.
*/
uids_mutex_lock();
local_irq_save(flags);
if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
uid_hash_remove(up);
remove_user = 1;
spin_unlock_irqrestore(&uidhash_lock, flags);
} else {
local_irq_restore(flags);
}
if (!remove_user)
goto done;
sysfs_remove_file(kobj, &up->user_attr.attr);
kobject_uevent(kobj, KOBJ_REMOVE);
kobject_del(kobj);
sched_destroy_user(up);
key_put(up->uid_keyring);
key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
done:
uids_mutex_unlock();
}
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
*/
static inline void free_user(struct user_struct *up, unsigned long flags)
{
/* restore back the count */
atomic_inc(&up->__count);
spin_unlock_irqrestore(&uidhash_lock, flags);
INIT_WORK(&up->work, remove_user_sysfs_dir);
schedule_work(&up->work);
}
#else /* CONFIG_FAIR_USER_SCHED */
static void sched_destroy_user(struct user_struct *up) { }
static int sched_create_user(struct user_struct *up) { return 0; }
static void sched_switch_user(struct task_struct *p) { }
static inline int user_kobject_create(struct user_struct *up) { return 0; }
static inline void uids_mutex_lock(void) { }
static inline void uids_mutex_unlock(void) { }
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
*/
static inline void free_user(struct user_struct *up, unsigned long flags)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
sched_destroy_user(up);
key_put(up->uid_keyring);
key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
}
#endif /* CONFIG_FAIR_USER_SCHED */
/*
* Locate the user_struct for the passed UID. If found, take a ref on it. The
* caller must undo that ref with free_uid().
......@@ -106,15 +308,10 @@ void free_uid(struct user_struct *up)
return;
local_irq_save(flags);
if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
key_put(up->uid_keyring);
key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
} else {
if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
free_user(up, flags);
else
local_irq_restore(flags);
}
}
struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
......@@ -122,6 +319,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up;
/* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
* atomic.
*/
uids_mutex_lock();
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
spin_unlock_irq(&uidhash_lock);
......@@ -150,6 +352,22 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
return NULL;
}
if (sched_create_user(new) < 0) {
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
return NULL;
}
if (user_kobject_create(new)) {
sched_destroy_user(new);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
uids_mutex_unlock();
return NULL;
}
/*
* Before adding this, check whether we raced
* on adding the same user already..
......@@ -157,6 +375,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
/* This case is not possible when CONFIG_FAIR_USER_SCHED
* is defined, since we serialize alloc_uid() using
* uids_mutex. Hence no need to call
* sched_destroy_user() or remove_user_sysfs_dir().
*/
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
......@@ -167,6 +390,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_unlock_irq(&uidhash_lock);
}
uids_mutex_unlock();
return up;
}
......@@ -184,6 +410,7 @@ void switch_uid(struct user_struct *new_user)
atomic_dec(&old_user->processes);
switch_uid_keyring(new_user);
current->user = new_user;
sched_switch_user(current);
/*
* We need to synchronize with __sigqueue_alloc()
......
......@@ -334,7 +334,7 @@ static void unix_write_space(struct sock *sk)
read_lock(&sk->sk_callback_lock);
if (unix_writable(sk)) {
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible(sk->sk_sleep);
wake_up_interruptible_sync(sk->sk_sleep);
sk_wake_async(sk, 2, POLL_OUT);
}
read_unlock(&sk->sk_callback_lock);
......@@ -1639,7 +1639,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
if (!skb)
goto out_unlock;
wake_up_interruptible(&u->peer_wait);
wake_up_interruptible_sync(&u->peer_wait);
if (msg->msg_name)
unix_copy_addr(msg, skb->sk);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment