Commit 17489c05 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (76 commits)
  sched_clock: and multiplier for TSC to gtod drift
  sched_clock: record TSC after gtod
  sched_clock: only update deltas with local reads.
  sched_clock: fix calculation of other CPU
  sched_clock: stop maximum check on NO HZ
  sched_clock: widen the max and min time
  sched_clock: record from last tick
  sched: fix accounting in task delay accounting & migration
  sched: add avg-overlap support to RT tasks
  sched: terminate newidle balancing once at least one task has moved over
  sched: fix warning
  sched: build fix
  sched: sched_clock_cpu() based cpu_clock(), lockdep fix
  sched: export cpu_clock
  sched: make sched_{rt,fair}.c ifdefs more readable
  sched: bias effective_load() error towards failing wake_affine().
  sched: incremental effective_load()
  sched: correct wakeup weight calculations
  sched: fix mult overflow
  sched: update shares on wakeup
  ...
parents a3da5bf8 873a6ed6
......@@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
arch_init_sched_domains function. This function will attach domains to all
CPUs using cpu_attach_domain.
Implementors should change the line
#undef SCHED_DOMAIN_DEBUG
to
#define SCHED_DOMAIN_DEBUG
in kernel/sched.c as this enables an error checking parse of the sched domains
The sched-domains debugging infrastructure can be enabled by enabling
CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
which should catch most possible errors (described above). It also prints out
the domain structure in a visual format.
......@@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
0.00015s. So this group can be scheduled with a period of 0.005s and a run time
of 0.00015s.
The remaining CPU time will be used for user input and other tass. Because
The remaining CPU time will be used for user input and other tasks. Because
realtime tasks have explicitly allocated the CPU time they need to perform
their tasks, buffer underruns in the graphocs or audio can be eliminated.
their tasks, buffer underruns in the graphics or audio can be eliminated.
NOTE: the above example is not fully implemented as of yet (2.6.25). We still
lack an EDF scheduler to make non-uniform periods usable.
......
......@@ -134,7 +134,6 @@ extern unsigned long nr_running(void);
extern unsigned long nr_uninterruptible(void);
extern unsigned long nr_active(void);
extern unsigned long nr_iowait(void);
extern unsigned long weighted_cpuload(const int cpu);
struct seq_file;
struct cfs_rq;
......@@ -784,6 +783,8 @@ struct sched_domain {
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */
u64 last_update;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
......@@ -823,23 +824,6 @@ extern int arch_reinit_sched_domains(void);
#endif /* CONFIG_SMP */
/*
* A runqueue laden with a single nice 0 task scores a weighted_cpuload of
* SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
* task of nice 0 or enough lower priority tasks to bring up the
* weighted_cpuload
*/
static inline int above_background_load(void)
{
unsigned long cpu;
for_each_online_cpu(cpu) {
if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
return 1;
}
return 0;
}
struct io_context; /* See blkdev.h */
#define NGROUPS_SMALL 32
#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
......@@ -921,8 +905,8 @@ struct sched_class {
void (*set_cpus_allowed)(struct task_struct *p,
const cpumask_t *newmask);
void (*join_domain)(struct rq *rq);
void (*leave_domain)(struct rq *rq);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
void (*switched_from) (struct rq *this_rq, struct task_struct *task,
int running);
......@@ -1039,6 +1023,7 @@ struct task_struct {
#endif
int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
......@@ -1122,7 +1107,6 @@ struct task_struct {
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
unsigned int rt_priority;
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
cputime_t prev_utime, prev_stime;
......@@ -1141,12 +1125,12 @@ struct task_struct {
gid_t gid,egid,sgid,fsgid;
struct group_info *group_info;
kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset;
unsigned securebits;
struct user_struct *user;
unsigned securebits;
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested keys to */
struct key *request_key_auth; /* assumed request_key authority */
struct key *thread_keyring; /* keyring private to this thread */
unsigned char jit_keyring; /* default keyring to attach requested keys to */
#endif
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
......@@ -1233,8 +1217,8 @@ struct task_struct {
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
struct held_lock held_locks[MAX_LOCK_DEPTH];
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif
/* journalling filesystem info */
......@@ -1262,10 +1246,6 @@ struct task_struct {
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_stimexpd;/* stime since last update */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next;
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed;
int cpuset_mems_generation;
......@@ -1284,6 +1264,10 @@ struct task_struct {
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu;
......@@ -1504,6 +1488,7 @@ static inline void put_task_struct(struct task_struct *t)
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
......@@ -1573,13 +1558,28 @@ static inline void sched_clock_idle_sleep_event(void)
static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
{
}
#else
#ifdef CONFIG_NO_HZ
static inline void sched_clock_tick_stop(int cpu)
{
}
static inline void sched_clock_tick_start(int cpu)
{
}
#endif
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
extern void sched_clock_init(void);
extern u64 sched_clock_cpu(int cpu);
extern void sched_clock_tick(void);
extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
#ifdef CONFIG_NO_HZ
extern void sched_clock_tick_stop(int cpu);
extern void sched_clock_tick_start(int cpu);
#endif
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
......@@ -1622,6 +1622,7 @@ extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_shares_ratelimit;
int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length,
......
......@@ -3,7 +3,7 @@
#
obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
exit.o itimer.o time.o softirq.o resource.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
......@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
......@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_SMP) += sched_cpupri.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
......
......@@ -15,6 +15,28 @@
#include <linux/stop_machine.h>
#include <linux/mutex.h>
/*
* Represents all cpu's present in the system
* In systems capable of hotplug, this map could dynamically grow
* as new cpu's are detected in the system via any platform specific
* method, such as ACPI for e.g.
*/
cpumask_t cpu_present_map __read_mostly;
EXPORT_SYMBOL(cpu_present_map);
#ifndef CONFIG_SMP
/*
* Represents all cpu's that are currently online.
*/
cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
EXPORT_SYMBOL(cpu_online_map);
cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
EXPORT_SYMBOL(cpu_possible_map);
#else /* CONFIG_SMP */
/* Serializes the updates to cpu_online_map, cpu_present_map */
static DEFINE_MUTEX(cpu_add_remove_lock);
......@@ -403,3 +425,5 @@ void __ref enable_nonboot_cpus(void)
cpu_maps_update_done();
}
#endif /* CONFIG_PM_SLEEP_SMP */
#endif /* CONFIG_SMP */
......@@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;
if (tsk->flags & PF_THREAD_BOUND) {
cpumask_t mask;
mutex_lock(&callback_mutex);
mask = cs->cpus_allowed;
mutex_unlock(&callback_mutex);
if (!cpus_equal(tsk->cpus_allowed, mask))
return -EINVAL;
}
return security_task_setscheduler(tsk, 0, NULL);
}
......@@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
int err;
mutex_lock(&callback_mutex);
guarantee_online_cpus(cs, &cpus);
set_cpus_allowed_ptr(tsk, &cpus);
err = set_cpus_allowed_ptr(tsk, &cpus);
mutex_unlock(&callback_mutex);
if (err)
return;
from = oldcs->mems_allowed;
to = cs->mems_allowed;
......
......@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
set_task_cpu(k, cpu);
k->cpus_allowed = cpumask_of_cpu(cpu);
k->rt.nr_cpus_allowed = 1;
k->flags |= PF_THREAD_BOUND;
}
EXPORT_SYMBOL(kthread_bind);
......
This diff is collapsed.
......@@ -3,6 +3,9 @@
*
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
* Updates and enhancements:
* Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
*
* Based on code by:
* Ingo Molnar <mingo@redhat.com>
* Guillaume Chazarain <guichaz@gmail.com>
......@@ -32,6 +35,11 @@
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
#define MULTI_SHIFT 15
/* Max is double, Min is 1/2 */
#define MAX_MULTI (2LL << MULTI_SHIFT)
#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
struct sched_clock_data {
/*
* Raw spinlock - this is a special case: this might be called
......@@ -40,11 +48,15 @@ struct sched_clock_data {
*/
raw_spinlock_t lock;
unsigned long prev_jiffies;
unsigned long tick_jiffies;
u64 prev_raw;
u64 tick_raw;
u64 tick_gtod;
u64 clock;
s64 multi;
#ifdef CONFIG_NO_HZ
int check_max;
#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
......@@ -71,41 +83,91 @@ void sched_clock_init(void)
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
scd->prev_jiffies = now_jiffies;
scd->tick_jiffies = now_jiffies;
scd->prev_raw = 0;
scd->tick_raw = 0;
scd->tick_gtod = ktime_now;
scd->clock = ktime_now;
scd->multi = 1 << MULTI_SHIFT;
#ifdef CONFIG_NO_HZ
scd->check_max = 1;
#endif
}
sched_clock_running = 1;
}
#ifdef CONFIG_NO_HZ
/*
* The dynamic ticks makes the delta jiffies inaccurate. This
* prevents us from checking the maximum time update.
* Disable the maximum check during stopped ticks.
*/
void sched_clock_tick_stop(int cpu)
{
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->check_max = 0;
}
void sched_clock_tick_start(int cpu)
{
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->check_max = 1;
}
static int check_max(struct sched_clock_data *scd)
{
return scd->check_max;
}
#else
static int check_max(struct sched_clock_data *scd)
{
return 1;
}
#endif /* CONFIG_NO_HZ */
/*
* update the percpu scd from the raw @now value
*
* - filter out backward motion
* - use jiffies to generate a min,max window to clip the raw values
*/
static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
{
unsigned long now_jiffies = jiffies;
long delta_jiffies = now_jiffies - scd->prev_jiffies;
long delta_jiffies = now_jiffies - scd->tick_jiffies;
u64 clock = scd->clock;
u64 min_clock, max_clock;
s64 delta = now - scd->prev_raw;
WARN_ON_ONCE(!irqs_disabled());
min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
/*
* At schedule tick the clock can be just under the gtod. We don't
* want to push it too prematurely.
*/
min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
if (min_clock > TICK_NSEC)
min_clock -= TICK_NSEC / 2;
if (unlikely(delta < 0)) {
clock++;
goto out;
}
max_clock = min_clock + TICK_NSEC;
/*
* The clock must stay within a jiffie of the gtod.
* But since we may be at the start of a jiffy or the end of one
* we add another jiffy buffer.
*/
max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
delta *= scd->multi;
delta >>= MULTI_SHIFT;
if (unlikely(clock + delta > max_clock)) {
if (unlikely(clock + delta > max_clock) && check_max(scd)) {
if (clock < max_clock)
clock = max_clock;
else
......@@ -118,9 +180,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
if (unlikely(clock < min_clock))
clock = min_clock;
scd->prev_raw = now;
scd->prev_jiffies = now_jiffies;
scd->clock = clock;
if (time)
*time = clock;
else {
scd->prev_raw = now;
scd->clock = clock;
}
}
static void lock_double_clock(struct sched_clock_data *data1,
......@@ -160,25 +225,30 @@ u64 sched_clock_cpu(int cpu)
now -= my_scd->tick_raw;
now += scd->tick_raw;
now -= my_scd->tick_gtod;
now += scd->tick_gtod;
now += my_scd->tick_gtod;
now -= scd->tick_gtod;
__raw_spin_unlock(&my_scd->lock);
__update_sched_clock(scd, now, &clock);
__raw_spin_unlock(&scd->lock);
} else {
__raw_spin_lock(&scd->lock);
__update_sched_clock(scd, now, NULL);
clock = scd->clock;
__raw_spin_unlock(&scd->lock);
}
__update_sched_clock(scd, now);
clock = scd->clock;
__raw_spin_unlock(&scd->lock);
return clock;
}
void sched_clock_tick(void)
{
struct sched_clock_data *scd = this_scd();
unsigned long now_jiffies = jiffies;
s64 mult, delta_gtod, delta_raw;
u64 now, now_gtod;
if (unlikely(!sched_clock_running))
......@@ -186,18 +256,33 @@ void sched_clock_tick(void)
WARN_ON_ONCE(!irqs_disabled());
now = sched_clock();
now_gtod = ktime_to_ns(ktime_get());
now = sched_clock();
__raw_spin_lock(&scd->lock);
__update_sched_clock(scd, now);
__update_sched_clock(scd, now, NULL);
/*
* update tick_gtod after __update_sched_clock() because that will
* already observe 1 new jiffy; adding a new tick_gtod to that would
* increase the clock 2 jiffies.
*/
delta_gtod = now_gtod - scd->tick_gtod;
delta_raw = now - scd->tick_raw;
if ((long)delta_raw > 0) {
mult = delta_gtod << MULTI_SHIFT;
do_div(mult, delta_raw);
scd->multi = mult;
if (scd->multi > MAX_MULTI)
scd->multi = MAX_MULTI;
else if (scd->multi < MIN_MULTI)
scd->multi = MIN_MULTI;
} else
scd->multi = 1 << MULTI_SHIFT;
scd->tick_raw = now;
scd->tick_gtod = now_gtod;
scd->tick_jiffies = now_jiffies;
__raw_spin_unlock(&scd->lock);
}
......@@ -227,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
__raw_spin_lock(&scd->lock);
scd->prev_raw = now;
scd->clock += delta_ns;
scd->multi = 1 << MULTI_SHIFT;
__raw_spin_unlock(&scd->lock);
touch_softlockup_watchdog();
......@@ -244,3 +330,16 @@ unsigned long long __attribute__((weak)) sched_clock(void)
{
return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
}
unsigned long long cpu_clock(int cpu)
{
unsigned long long clock;
unsigned long flags;
local_irq_save(flags);
clock = sched_clock_cpu(cpu);
local_irq_restore(flags);
return clock;
}
EXPORT_SYMBOL_GPL(cpu_clock);
/*
* kernel/sched_cpupri.c
*
* CPU priority management
*
* Copyright (C) 2007-2008 Novell
*
* Author: Gregory Haskins <ghaskins@novell.com>
*
* This code tracks the priority of each CPU so that global migration
* decisions are easy to calculate. Each CPU can be in a state as follows:
*
* (INVALID), IDLE, NORMAL, RT1, ... RT99
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
* a 2 dimensional bitmap (the first for priority class, the second for cpus
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include "sched_cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
static int convert_prio(int prio)
{
int cpupri;
if (prio == CPUPRI_INVALID)
cpupri = CPUPRI_INVALID;
else if (prio == MAX_PRIO)
cpupri = CPUPRI_IDLE;
else if (prio >= MAX_RT_PRIO)
cpupri = CPUPRI_NORMAL;
else
cpupri = MAX_RT_PRIO - prio + 1;
return cpupri;
}
#define for_each_cpupri_active(array, idx) \
for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
idx < CPUPRI_NR_PRIORITIES; \
idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
/**
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs
*
* Note: This function returns the recommended CPUs as calculated during the
* current invokation. By the time the call returns, the CPUs may have in
* fact changed priorities any number of times. While not ideal, it is not
* an issue of correctness since the normal rebalancer logic will correct
* any discrepancies created by racing against the uncertainty of the current
* priority configuration.
*
* Returns: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
cpumask_t *lowest_mask)
{
int idx = 0;
int task_pri = convert_prio(p->prio);
for_each_cpupri_active(cp->pri_active, idx) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
cpumask_t mask;
if (idx >= task_pri)
break;
cpus_and(mask, p->cpus_allowed, vec->mask);
if (cpus_empty(mask))
continue;
*lowest_mask = mask;
return 1;
}
return 0;
}
/**
* cpupri_set - update the cpu priority setting
* @cp: The cpupri context
* @cpu: The target cpu
* @pri: The priority (INVALID-RT99) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
int *currpri = &cp->cpu_to_pri[cpu];
int oldpri = *currpri;
unsigned long flags;
newpri = convert_prio(newpri);
BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
if (newpri == oldpri)
return;
/*
* If the cpu was currently mapped to a different value, we
* first need to unmap the old value
*/
if (likely(oldpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
spin_lock_irqsave(&vec->lock, flags);
vec->count--;
if (!vec->count)
clear_bit(oldpri, cp->pri_active);
cpu_clear(cpu, vec->mask);
spin_unlock_irqrestore(&vec->lock, flags);
}
if (likely(newpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
spin_lock_irqsave(&vec->lock, flags);
cpu_set(cpu, vec->mask);
vec->count++;
if (vec->count == 1)
set_bit(newpri, cp->pri_active);
spin_unlock_irqrestore(&vec->lock, flags);
}
*currpri = newpri;
}
/**
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
*
* Returns: (void)
*/
void cpupri_init(struct cpupri *cp)
{
int i;
memset(cp, 0, sizeof(*cp));
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
spin_lock_init(&vec->lock);
vec->count = 0;
cpus_clear(vec->mask);
}
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
}
#ifndef _LINUX_CPUPRI_H
#define _LINUX_CPUPRI_H
#include <linux/sched.h>
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
#define CPUPRI_INVALID -1
#define CPUPRI_IDLE 0
#define CPUPRI_NORMAL 1
/* values 2-101 are RT priorities 0-99 */
struct cpupri_vec {
spinlock_t lock;
int count;
cpumask_t mask;
};
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
long pri_active[CPUPRI_NR_PRI_WORDS];
int cpu_to_pri[NR_CPUS];
};
#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp,
struct task_struct *p, cpumask_t *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
void cpupri_init(struct cpupri *cp);
#else
#define cpupri_set(cp, cpu, pri) do { } while (0)
#define cpupri_init() do { } while (0)
#endif
#endif /* _LINUX_CPUPRI_H */
......@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
struct sched_entity *last;
unsigned long flags;
#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED)
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
#else
#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
char path[128] = "";
struct cgroup *cgroup = NULL;
struct task_group *tg = cfs_rq->tg;
......@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cgroup_path(cgroup, path, sizeof(path));
SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
#else
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
......@@ -162,11 +162,64 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
rq->bkl_count);
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
P(yld_exp_empty);
P(yld_act_empty);
P(yld_both_empty);
P(yld_count);
P(sched_switch);
P(sched_count);
P(sched_goidle);
P(ttwu_count);
P(ttwu_local);
P(bkl_count);
#undef P
#endif
SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
cfs_rq->nr_spread_over);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
#endif
#endif
}
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
char path[128] = "";
struct cgroup *cgroup = NULL;
struct task_group *tg = rt_rq->tg;
if (tg)
cgroup = tg->css.cgroup;
if (cgroup)
cgroup_path(cgroup, path, sizeof(path));
SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
#else
SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
#endif
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
#define PN(x) \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
P(rt_nr_running);
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
#undef PN
#undef P
}
static void print_cpu(struct seq_file *m, int cpu)
......@@ -208,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#undef PN
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
print_rq(m, rq, cpu);
}
......
This diff is collapsed.
SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
SCHED_FEAT(NORMALIZED_SLEEPER, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)
SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
......@@ -6,5 +7,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
SCHED_FEAT(SYNC_WAKEUPS, 1)
SCHED_FEAT(HRTICK, 1)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(NORMALIZED_SLEEPER, 1)
SCHED_FEAT(DEADLINE, 1)
SCHED_FEAT(ASYM_GRAN, 1)
SCHED_FEAT(LB_BIAS, 0)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
This diff is collapsed.
......@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.cpu_time += delta;
}
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
if (rq)
rq->rq_sched_info.run_delay += delta;
}
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val) do { var = (val); } while (0)
......@@ -126,6 +133,9 @@ static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_inc(rq, field) do { } while (0)
......@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
#endif
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
t->sched_info.last_queued = 0;
}
/*
* Called when a process is dequeued from the active array and given
* the cpu. We should note that with the exception of interactive
......@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
* active queue, thus delaying tasks in the expired queue from running;
* see scheduler_tick()).
*
* This function is only called from sched_info_arrive(), rather than
* dequeue_task(). Even though a task may be queued and dequeued multiple
* times as it is shuffled about, we're really interested in knowing how
* long it was from the *first* time it was queued to the time that it
* finally hit a cpu.
* Though we are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a cpu, we call this routine
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
*/
static inline void sched_info_dequeued(struct task_struct *t)
{
t->sched_info.last_queued = 0;
unsigned long long now = task_rq(t)->clock, delta = 0;
if (unlikely(sched_info_on()))
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
rq_sched_info_dequeued(task_rq(t), delta);
}
/*
......@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
sched_info_dequeued(t);
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
......@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
__sched_info_switch(prev, next);
}
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
#define sched_info_queued(t) do { } while (0)
#define sched_info_reset_dequeued(t) do { } while (0)
#define sched_info_dequeued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
......@@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_shares_ratelimit",
.data = &sysctl_sched_shares_ratelimit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
......
......@@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void)
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
rcu_enter_nohz();
sched_clock_tick_stop(cpu);
}
/*
......@@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void)
select_nohz_load_balancer(0);
now = ktime_get();
tick_do_update_jiffies64(now);
sched_clock_tick_start(cpu);
cpu_clear(cpu, nohz_cpu_mask);
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment