Commit 3174ffaa authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched:
  sched: rt-group: refuse unrunnable tasks
  sched: rt-group: clean up the ifdeffery
  sched: rt-group: make rt groups scheduling configurable
  sched: rt-group: interface
  sched: rt-group: deal with PI
  sched: fix incorrect irq lock usage in normalize_rt_tasks()
  sched: fair-group: separate tg->shares from task_group_lock
  hrtimer: more hrtimer_init_sleeper() fallout.
parents d7ab95f8 b68aa230
Real-Time group scheduling.
The problem space:
In order to schedule multiple groups of realtime tasks, each group must
be assigned a fixed portion of the available CPU time. Without a minimum
guarantee a realtime group can obviously fall short. A fuzzy upper limit
is of no use since it cannot be relied upon, which leaves us with just
the single fixed portion.
CPU time is divided by specifying how much time may be spent running
in a given period. Say a fixed-frame-rate realtime renderer must
deliver 25 frames a second, which yields a period of 0.04s. Now say
it also has to play some music and respond to input, leaving around
80% of the CPU for the graphics. We can then give this group a runtime
of 0.8 * 0.04s = 0.032s.
This way the graphics group will have a 0.04s period with a 0.032s runtime
limit.
Now if the audio thread needs to refill the DMA buffer every 0.005s, but
needs only about 3% CPU time to do so, it can do with a runtime of
0.03 * 0.005s = 0.00015s.
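As a quick illustration of that arithmetic, here is a small user-space
sketch (not part of the patch; the helper name is made up) that turns a
period and a CPU share into the runtime value one would configure:

#include <stdio.h>

/*
 * Illustrative only: given a period in microseconds and the fraction of
 * the CPU a group should receive, compute the runtime to configure.
 */
static long rt_runtime_for_share(long period_us, double share)
{
	return (long)(period_us * share);
}

int main(void)
{
	/* graphics: 25 fps -> 40000us period, ~80% share -> 32000us runtime */
	printf("graphics runtime: %ld us\n", rt_runtime_for_share(40000, 0.80));

	/* audio: DMA refill every 5000us, ~3% share -> 150us runtime */
	printf("audio runtime: %ld us\n", rt_runtime_for_share(5000, 0.03));

	return 0;
}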
The Interface:
system wide:
/proc/sys/kernel/sched_rt_period_us
/proc/sys/kernel/sched_rt_runtime_us
CONFIG_FAIR_USER_SCHED
/sys/kernel/uids/<uid>/cpu_rt_runtime_us
or
CONFIG_FAIR_CGROUP_SCHED
/cgroup/<cgroup>/cpu.rt_runtime_us
[ time is specified in us because the interface is s32; this gives an
operating range of ~35 minutes down to 1us ]
The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ].
A runtime of -1 specifies runtime == period, ie. no limit.
New groups get the period from /proc/sys/kernel/sched_rt_period_us and
a runtime of 0.
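For the unit handling this implies, a minimal standalone sketch, modelled
on sched_group_set_rt_runtime() further down in this patch (the function
name here is illustrative, not kernel code): values arrive in
microseconds, -1 maps to an unlimited runtime, and everything else is
kept internally in nanoseconds.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC	1000ULL
#define RUNTIME_INF	((uint64_t)~0ULL)

/* Illustrative conversion: -1 means "runtime == period", i.e. no limit. */
static uint64_t rt_runtime_us_to_ns(long rt_runtime_us)
{
	if (rt_runtime_us == -1)
		return RUNTIME_INF;

	return (uint64_t)rt_runtime_us * NSEC_PER_USEC;
}

int main(void)
{
	printf("950000us -> %llu ns\n",
	       (unsigned long long)rt_runtime_us_to_ns(950000));
	printf("-1 is unlimited: %d\n",
	       rt_runtime_us_to_ns(-1) == RUNTIME_INF);
	return 0;
}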
Settings are constrained to:
\Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
in order to keep the configuration schedulable.
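A rough user-space model of that admission test, patterned after
to_ratio() and __rt_schedulable() in the patch (16.16 fixed point; the
example groups reuse the renderer/audio numbers above and are purely
illustrative):

#include <stdint.h>
#include <stdio.h>

/* runtime/period as a 16.16 fixed-point ratio */
static uint64_t to_ratio(uint64_t period_us, uint64_t runtime_us)
{
	return (runtime_us << 16) / period_us;
}

int main(void)
{
	uint64_t global = to_ratio(1000000, 950000);	/* defaults: period 1s, runtime 0.95s */
	uint64_t groups = to_ratio(40000, 32000)	/* graphics group from the example */
			+ to_ratio(5000, 150);		/* audio group from the example */

	printf("schedulable: %s\n", groups < global ? "yes" : "no");
	return 0;
}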
...@@ -25,7 +25,7 @@ SUBSYS(ns) ...@@ -25,7 +25,7 @@ SUBSYS(ns)
/* */ /* */
#ifdef CONFIG_FAIR_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED
SUBSYS(cpu_cgroup) SUBSYS(cpu_cgroup)
#endif #endif
......
...@@ -590,7 +590,7 @@ struct user_struct { ...@@ -590,7 +590,7 @@ struct user_struct {
struct hlist_node uidhash_node; struct hlist_node uidhash_node;
uid_t uid; uid_t uid;
#ifdef CONFIG_FAIR_USER_SCHED #ifdef CONFIG_USER_SCHED
struct task_group *tg; struct task_group *tg;
#ifdef CONFIG_SYSFS #ifdef CONFIG_SYSFS
struct kobject kobj; struct kobject kobj;
...@@ -973,7 +973,7 @@ struct sched_rt_entity { ...@@ -973,7 +973,7 @@ struct sched_rt_entity {
unsigned long timeout; unsigned long timeout;
int nr_cpus_allowed; int nr_cpus_allowed;
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent; struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */ /* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq; struct rt_rq *rt_rq;
...@@ -1541,8 +1541,6 @@ extern unsigned int sysctl_sched_child_runs_first; ...@@ -1541,8 +1541,6 @@ extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_rt_period;
extern unsigned int sysctl_sched_rt_ratio;
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
extern unsigned int sysctl_sched_min_bal_int_shares; extern unsigned int sysctl_sched_min_bal_int_shares;
extern unsigned int sysctl_sched_max_bal_int_shares; extern unsigned int sysctl_sched_max_bal_int_shares;
...@@ -1552,6 +1550,8 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, ...@@ -1552,6 +1550,8 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, struct file *file, void __user *buffer, size_t *length,
loff_t *ppos); loff_t *ppos);
#endif #endif
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_compat_yield;
...@@ -2027,16 +2027,22 @@ extern int sched_mc_power_savings, sched_smt_power_savings; ...@@ -2027,16 +2027,22 @@ extern int sched_mc_power_savings, sched_smt_power_savings;
extern void normalize_rt_tasks(void); extern void normalize_rt_tasks(void);
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_GROUP_SCHED
extern struct task_group init_task_group; extern struct task_group init_task_group;
extern struct task_group *sched_create_group(void); extern struct task_group *sched_create_group(void);
extern void sched_destroy_group(struct task_group *tg); extern void sched_destroy_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk); extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern unsigned long sched_group_shares(struct task_group *tg); extern unsigned long sched_group_shares(struct task_group *tg);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
extern int sched_group_set_rt_runtime(struct task_group *tg,
long rt_runtime_us);
extern long sched_group_rt_runtime(struct task_group *tg);
#endif
#endif #endif
#ifdef CONFIG_TASK_XACCT #ifdef CONFIG_TASK_XACCT
......
...@@ -311,25 +311,36 @@ config CPUSETS ...@@ -311,25 +311,36 @@ config CPUSETS
Say N if unsure. Say N if unsure.
config FAIR_GROUP_SCHED config GROUP_SCHED
bool "Fair group CPU scheduler" bool "Group CPU scheduler"
default y default y
help help
This feature lets CPU scheduler recognize task groups and control CPU This feature lets CPU scheduler recognize task groups and control CPU
bandwidth allocation to such task groups. bandwidth allocation to such task groups.
config FAIR_GROUP_SCHED
bool "Group scheduling for SCHED_OTHER"
depends on GROUP_SCHED
default y
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
depends on EXPERIMENTAL
depends on GROUP_SCHED
default n
choice choice
depends on FAIR_GROUP_SCHED depends on GROUP_SCHED
prompt "Basis for grouping tasks" prompt "Basis for grouping tasks"
default FAIR_USER_SCHED default USER_SCHED
config FAIR_USER_SCHED config USER_SCHED
bool "user id" bool "user id"
help help
This option will choose userid as the basis for grouping This option will choose userid as the basis for grouping
tasks, thus providing equal CPU bandwidth to each user. tasks, thus providing equal CPU bandwidth to each user.
config FAIR_CGROUP_SCHED config CGROUP_SCHED
bool "Control groups" bool "Control groups"
depends on CGROUPS depends on CGROUPS
help help
......
...@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ...@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(state); set_current_state(state);
/* Setup the timer, when timeout != NULL */ /* Setup the timer, when timeout != NULL */
if (unlikely(timeout)) if (unlikely(timeout)) {
hrtimer_start(&timeout->timer, timeout->timer.expires, hrtimer_start(&timeout->timer, timeout->timer.expires,
HRTIMER_MODE_ABS); HRTIMER_MODE_ABS);
if (!hrtimer_active(&timeout->timer))
timeout->task = NULL;
}
for (;;) { for (;;) {
/* Try to acquire the lock: */ /* Try to acquire the lock: */
......
...@@ -155,7 +155,7 @@ struct rt_prio_array { ...@@ -155,7 +155,7 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO]; struct list_head queue[MAX_RT_PRIO];
}; };
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_GROUP_SCHED
#include <linux/cgroup.h> #include <linux/cgroup.h>
...@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups); ...@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups);
/* task group related information */ /* task group related information */
struct task_group { struct task_group {
#ifdef CONFIG_FAIR_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED
struct cgroup_subsys_state css; struct cgroup_subsys_state css;
#endif #endif
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */ /* schedulable entities of this group on each cpu */
struct sched_entity **se; struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */ /* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq; struct cfs_rq **cfs_rq;
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
unsigned int rt_ratio;
/* /*
* shares assigned to a task group governs how much of cpu bandwidth * shares assigned to a task group governs how much of cpu bandwidth
* is allocated to the group. The more shares a group has, the more is * is allocated to the group. The more shares a group has, the more is
...@@ -213,33 +210,46 @@ struct task_group { ...@@ -213,33 +210,46 @@ struct task_group {
* *
*/ */
unsigned long shares; unsigned long shares;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
u64 rt_runtime;
#endif
struct rcu_head rcu; struct rcu_head rcu;
struct list_head list; struct list_head list;
}; };
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */ /* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */ /* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct sched_entity *init_sched_entity_p[NR_CPUS];
static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
static struct rt_rq *init_rt_rq_p[NR_CPUS]; static struct rt_rq *init_rt_rq_p[NR_CPUS];
#endif
/* task_group_mutex serializes add/remove of task groups and also changes to /* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares. * a task group's cpu shares.
*/ */
static DEFINE_MUTEX(task_group_mutex); static DEFINE_SPINLOCK(task_group_lock);
/* doms_cur_mutex serializes access to doms_cur[] array */ /* doms_cur_mutex serializes access to doms_cur[] array */
static DEFINE_MUTEX(doms_cur_mutex); static DEFINE_MUTEX(doms_cur_mutex);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* kernel thread that runs rebalance_shares() periodically */ /* kernel thread that runs rebalance_shares() periodically */
static struct task_struct *lb_monitor_task; static struct task_struct *lb_monitor_task;
...@@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused); ...@@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused);
static void set_se_shares(struct sched_entity *se, unsigned long shares); static void set_se_shares(struct sched_entity *se, unsigned long shares);
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
#define MIN_GROUP_SHARES 2
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif
/* Default task group. /* Default task group.
* Every task in system belong to this group at bootup. * Every task in system belong to this group at bootup.
*/ */
struct task_group init_task_group = { struct task_group init_task_group = {
#ifdef CONFIG_FAIR_GROUP_SCHED
.se = init_sched_entity_p, .se = init_sched_entity_p,
.cfs_rq = init_cfs_rq_p, .cfs_rq = init_cfs_rq_p,
#endif
#ifdef CONFIG_RT_GROUP_SCHED
.rt_se = init_sched_rt_entity_p, .rt_se = init_sched_rt_entity_p,
.rt_rq = init_rt_rq_p, .rt_rq = init_rt_rq_p,
};
#ifdef CONFIG_FAIR_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif #endif
};
#define MIN_GROUP_SHARES 2
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
/* return group to which a task belongs */ /* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p) static inline struct task_group *task_group(struct task_struct *p)
{ {
struct task_group *tg; struct task_group *tg;
#ifdef CONFIG_FAIR_USER_SCHED #ifdef CONFIG_USER_SCHED
tg = p->user->tg; tg = p->user->tg;
#elif defined(CONFIG_FAIR_CGROUP_SCHED) #elif defined(CONFIG_CGROUP_SCHED)
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css); struct task_group, css);
#else #else
...@@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p) ...@@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p)
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{ {
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu]; p->se.parent = task_group(p)->se[cpu];
#endif
#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.rt_rq = task_group(p)->rt_rq[cpu];
p->rt.parent = task_group(p)->rt_se[cpu]; p->rt.parent = task_group(p)->rt_se[cpu];
} #endif
static inline void lock_task_group_list(void)
{
mutex_lock(&task_group_mutex);
}
static inline void unlock_task_group_list(void)
{
mutex_unlock(&task_group_mutex);
} }
static inline void lock_doms_cur(void) static inline void lock_doms_cur(void)
...@@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void) ...@@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void)
#else #else
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline void lock_task_group_list(void) { }
static inline void unlock_task_group_list(void) { }
static inline void lock_doms_cur(void) { } static inline void lock_doms_cur(void) { }
static inline void unlock_doms_cur(void) { } static inline void unlock_doms_cur(void) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_GROUP_SCHED */
/* CFS-related fields in a runqueue */ /* CFS-related fields in a runqueue */
struct cfs_rq { struct cfs_rq {
...@@ -363,7 +370,7 @@ struct cfs_rq { ...@@ -363,7 +370,7 @@ struct cfs_rq {
struct rt_rq { struct rt_rq {
struct rt_prio_array active; struct rt_prio_array active;
unsigned long rt_nr_running; unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
int highest_prio; /* highest queued rt task prio */ int highest_prio; /* highest queued rt task prio */
#endif #endif
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
...@@ -373,7 +380,9 @@ struct rt_rq { ...@@ -373,7 +380,9 @@ struct rt_rq {
int rt_throttled; int rt_throttled;
u64 rt_time; u64 rt_time;
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
struct rq *rq; struct rq *rq;
struct list_head leaf_rt_rq_list; struct list_head leaf_rt_rq_list;
struct task_group *tg; struct task_group *tg;
...@@ -447,6 +456,8 @@ struct rq { ...@@ -447,6 +456,8 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */ /* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list; struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list; struct list_head leaf_rt_rq_list;
#endif #endif
...@@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features = ...@@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features =
const_debug unsigned int sysctl_sched_nr_migrate = 32; const_debug unsigned int sysctl_sched_nr_migrate = 32;
/* /*
* period over which we measure -rt task cpu usage in ms. * period over which we measure -rt task cpu usage in us.
* default: 1s * default: 1s
*/ */
const_debug unsigned int sysctl_sched_rt_period = 1000; unsigned int sysctl_sched_rt_period = 1000000;
#define SCHED_RT_FRAC_SHIFT 16 /*
#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) * part of the period that we allow rt tasks to run in us.
* default: 0.95s
*/
int sysctl_sched_rt_runtime = 950000;
/* /*
* ratio of time -rt tasks may consume. * single value that denotes runtime == period, ie unlimited time.
* default: 95%
*/ */
const_debug unsigned int sysctl_sched_rt_ratio = 62259; #define RUNTIME_INF ((u64)~0ULL)
/* /*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
...@@ -4571,6 +4584,15 @@ int sched_setscheduler(struct task_struct *p, int policy, ...@@ -4571,6 +4584,15 @@ int sched_setscheduler(struct task_struct *p, int policy,
return -EPERM; return -EPERM;
} }
#ifdef CONFIG_RT_GROUP_SCHED
/*
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
return -EPERM;
#endif
retval = security_task_setscheduler(p, policy, param); retval = security_task_setscheduler(p, policy, param);
if (retval) if (retval)
return retval; return retval;
...@@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) ...@@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
/* delimiter for bitsearch: */ /* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap); __set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
rt_rq->highest_prio = MAX_RT_PRIO; rt_rq->highest_prio = MAX_RT_PRIO;
#endif #endif
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
...@@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) ...@@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->rt_time = 0; rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0; rt_rq->rt_throttled = 0;
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq; rt_rq->rq = rq;
#endif #endif
} }
...@@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, ...@@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
se->parent = NULL; se->parent = NULL;
} }
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
int cpu, int add) int cpu, int add)
...@@ -7175,7 +7200,7 @@ void __init sched_init(void) ...@@ -7175,7 +7200,7 @@ void __init sched_init(void)
init_defrootdomain(); init_defrootdomain();
#endif #endif
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_GROUP_SCHED
list_add(&init_task_group.list, &task_groups); list_add(&init_task_group.list, &task_groups);
#endif #endif
...@@ -7196,7 +7221,10 @@ void __init sched_init(void) ...@@ -7196,7 +7221,10 @@ void __init sched_init(void)
&per_cpu(init_cfs_rq, i), &per_cpu(init_cfs_rq, i),
&per_cpu(init_sched_entity, i), i, 1); &per_cpu(init_sched_entity, i), i, 1);
init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ #endif
#ifdef CONFIG_RT_GROUP_SCHED
init_task_group.rt_runtime =
sysctl_sched_rt_runtime * NSEC_PER_USEC;
INIT_LIST_HEAD(&rq->leaf_rt_rq_list); INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(rq, &init_task_group, init_tg_rt_entry(rq, &init_task_group,
&per_cpu(init_rt_rq, i), &per_cpu(init_rt_rq, i),
...@@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void) ...@@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void)
unsigned long flags; unsigned long flags;
struct rq *rq; struct rq *rq;
read_lock_irq(&tasklist_lock); read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) { do_each_thread(g, p) {
/* /*
* Only normalize user tasks: * Only normalize user tasks:
...@@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void) ...@@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void)
continue; continue;
} }
spin_lock_irqsave(&p->pi_lock, flags); spin_lock(&p->pi_lock);
rq = __task_rq_lock(p); rq = __task_rq_lock(p);
normalize_task(rq, p); normalize_task(rq, p);
__task_rq_unlock(rq); __task_rq_unlock(rq);
spin_unlock_irqrestore(&p->pi_lock, flags); spin_unlock(&p->pi_lock);
} while_each_thread(g, p); } while_each_thread(g, p);
read_unlock_irq(&tasklist_lock); read_unlock_irqrestore(&tasklist_lock, flags);
} }
#endif /* CONFIG_MAGIC_SYSRQ */ #endif /* CONFIG_MAGIC_SYSRQ */
...@@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p) ...@@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p)
#endif #endif
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_GROUP_SCHED
#ifdef CONFIG_SMP #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
/* /*
* distribute shares of all task groups among their schedulable entities, * distribute shares of all task groups among their schedulable entities,
* to reflect load distribution across cpus. * to reflect load distribution across cpus.
...@@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused) ...@@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused)
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
static void free_sched_group(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED
static void free_fair_sched_group(struct task_group *tg)
{ {
int i; int i;
...@@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg) ...@@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg)
kfree(tg->cfs_rq[i]); kfree(tg->cfs_rq[i]);
if (tg->se) if (tg->se)
kfree(tg->se[i]); kfree(tg->se[i]);
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
if (tg->rt_se)
kfree(tg->rt_se[i]);
} }
kfree(tg->cfs_rq); kfree(tg->cfs_rq);
kfree(tg->se); kfree(tg->se);
kfree(tg->rt_rq);
kfree(tg->rt_se);
kfree(tg);
} }
/* allocate runqueue etc for a new task group */ static int alloc_fair_sched_group(struct task_group *tg)
struct task_group *sched_create_group(void)
{ {
struct task_group *tg;
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
struct sched_entity *se; struct sched_entity *se;
struct rt_rq *rt_rq;
struct sched_rt_entity *rt_se;
struct rq *rq; struct rq *rq;
int i; int i;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
return ERR_PTR(-ENOMEM);
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->cfs_rq) if (!tg->cfs_rq)
goto err; goto err;
tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
if (!tg->se) if (!tg->se)
goto err; goto err;
tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->rt_rq)
goto err;
tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
if (!tg->rt_se)
goto err;
tg->shares = NICE_0_LOAD; tg->shares = NICE_0_LOAD;
tg->rt_ratio = 0; /* XXX */
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
rq = cpu_rq(i); rq = cpu_rq(i);
...@@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void) ...@@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void)
if (!se) if (!se)
goto err; goto err;
init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
}
return 1;
err:
return 0;
}
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
&cpu_rq(cpu)->leaf_cfs_rq_list);
}
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
}
#else
static inline void free_fair_sched_group(struct task_group *tg)
{
}
static inline int alloc_fair_sched_group(struct task_group *tg)
{
return 1;
}
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
}
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static void free_rt_sched_group(struct task_group *tg)
{
int i;
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
if (tg->rt_se)
kfree(tg->rt_se[i]);
}
kfree(tg->rt_rq);
kfree(tg->rt_se);
}
static int alloc_rt_sched_group(struct task_group *tg)
{
struct rt_rq *rt_rq;
struct sched_rt_entity *rt_se;
struct rq *rq;
int i;
tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->rt_rq)
goto err;
tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
if (!tg->rt_se)
goto err;
tg->rt_runtime = 0;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
rt_rq = kmalloc_node(sizeof(struct rt_rq), rt_rq = kmalloc_node(sizeof(struct rt_rq),
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
if (!rt_rq) if (!rt_rq)
...@@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void) ...@@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void)
if (!rt_se) if (!rt_se)
goto err; goto err;
init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
} }
lock_task_group_list(); return 1;
err:
return 0;
}
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
&cpu_rq(cpu)->leaf_rt_rq_list);
}
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
}
#else
static inline void free_rt_sched_group(struct task_group *tg)
{
}
static inline int alloc_rt_sched_group(struct task_group *tg)
{
return 1;
}
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
}
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
}
#endif
static void free_sched_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
kfree(tg);
}
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(void)
{
struct task_group *tg;
unsigned long flags;
int i;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
return ERR_PTR(-ENOMEM);
if (!alloc_fair_sched_group(tg))
goto err;
if (!alloc_rt_sched_group(tg))
goto err;
spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
rq = cpu_rq(i); register_fair_sched_group(tg, i);
cfs_rq = tg->cfs_rq[i]; register_rt_sched_group(tg, i);
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
rt_rq = tg->rt_rq[i];
list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
} }
list_add_rcu(&tg->list, &task_groups); list_add_rcu(&tg->list, &task_groups);
unlock_task_group_list(); spin_unlock_irqrestore(&task_group_lock, flags);
return tg; return tg;
...@@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp) ...@@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */ /* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg) void sched_destroy_group(struct task_group *tg)
{ {
struct cfs_rq *cfs_rq = NULL; unsigned long flags;
struct rt_rq *rt_rq = NULL;
int i; int i;
lock_task_group_list(); spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
cfs_rq = tg->cfs_rq[i]; unregister_fair_sched_group(tg, i);
list_del_rcu(&cfs_rq->leaf_cfs_rq_list); unregister_rt_sched_group(tg, i);
rt_rq = tg->rt_rq[i];
list_del_rcu(&rt_rq->leaf_rt_rq_list);
} }
list_del_rcu(&tg->list); list_del_rcu(&tg->list);
unlock_task_group_list(); spin_unlock_irqrestore(&task_group_lock, flags);
BUG_ON(!cfs_rq);
/* wait for possible concurrent references to cfs_rqs complete */ /* wait for possible concurrent references to cfs_rqs complete */
call_rcu(&tg->rcu, free_sched_group_rcu); call_rcu(&tg->rcu, free_sched_group_rcu);
...@@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk) ...@@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, &flags); task_rq_unlock(rq, &flags);
} }
#ifdef CONFIG_FAIR_GROUP_SCHED
/* rq->lock to be locked by caller */ /* rq->lock to be locked by caller */
static void set_se_shares(struct sched_entity *se, unsigned long shares) static void set_se_shares(struct sched_entity *se, unsigned long shares)
{ {
...@@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) ...@@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
} }
} }
static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares) int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{ {
int i; int i;
struct cfs_rq *cfs_rq; unsigned long flags;
struct rq *rq;
lock_task_group_list(); mutex_lock(&shares_mutex);
if (tg->shares == shares) if (tg->shares == shares)
goto done; goto done;
...@@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) ...@@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* load_balance_fair) from referring to this group first, * load_balance_fair) from referring to this group first,
* by taking it off the rq->leaf_cfs_rq_list on each cpu. * by taking it off the rq->leaf_cfs_rq_list on each cpu.
*/ */
for_each_possible_cpu(i) { spin_lock_irqsave(&task_group_lock, flags);
cfs_rq = tg->cfs_rq[i]; for_each_possible_cpu(i)
list_del_rcu(&cfs_rq->leaf_cfs_rq_list); unregister_fair_sched_group(tg, i);
} spin_unlock_irqrestore(&task_group_lock, flags);
/* wait for any ongoing reference to this group to finish */ /* wait for any ongoing reference to this group to finish */
synchronize_sched(); synchronize_sched();
...@@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) ...@@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* Enable load balance activity on this group, by inserting it back on * Enable load balance activity on this group, by inserting it back on
* each cpu's rq->leaf_cfs_rq_list. * each cpu's rq->leaf_cfs_rq_list.
*/ */
for_each_possible_cpu(i) { spin_lock_irqsave(&task_group_lock, flags);
rq = cpu_rq(i); for_each_possible_cpu(i)
cfs_rq = tg->cfs_rq[i]; register_fair_sched_group(tg, i);
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); spin_unlock_irqrestore(&task_group_lock, flags);
}
done: done:
unlock_task_group_list(); mutex_unlock(&shares_mutex);
return 0; return 0;
} }
...@@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg) ...@@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg)
{ {
return tg->shares; return tg->shares;
} }
#endif
#ifdef CONFIG_RT_GROUP_SCHED
/* /*
* Ensure the total rt_ratio <= sysctl_sched_rt_ratio * Ensure that the real time constraints are schedulable.
*/ */
int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) static DEFINE_MUTEX(rt_constraints_mutex);
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
return 1ULL << 16;
runtime *= (1ULL << 16);
div64_64(runtime, period);
return runtime;
}
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{ {
struct task_group *tgi; struct task_group *tgi;
unsigned long total = 0; unsigned long total = 0;
unsigned long global_ratio =
to_ratio(sysctl_sched_rt_period,
sysctl_sched_rt_runtime < 0 ?
RUNTIME_INF : sysctl_sched_rt_runtime);
rcu_read_lock(); rcu_read_lock();
list_for_each_entry_rcu(tgi, &task_groups, list) list_for_each_entry_rcu(tgi, &task_groups, list) {
total += tgi->rt_ratio; if (tgi == tg)
rcu_read_unlock(); continue;
if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) total += to_ratio(period, tgi->rt_runtime);
return -EINVAL; }
rcu_read_unlock();
tg->rt_ratio = rt_ratio; return total + to_ratio(period, runtime) < global_ratio;
return 0;
} }
unsigned long sched_group_rt_ratio(struct task_group *tg) int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{ {
return tg->rt_ratio; u64 rt_runtime, rt_period;
int err = 0;
rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
if (rt_runtime_us == -1)
rt_runtime = rt_period;
mutex_lock(&rt_constraints_mutex);
if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
err = -EINVAL;
goto unlock;
}
if (rt_runtime_us == -1)
rt_runtime = RUNTIME_INF;
tg->rt_runtime = rt_runtime;
unlock:
mutex_unlock(&rt_constraints_mutex);
return err;
} }
#endif /* CONFIG_FAIR_GROUP_SCHED */ long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;
if (tg->rt_runtime == RUNTIME_INF)
return -1;
rt_runtime_us = tg->rt_runtime;
do_div(rt_runtime_us, NSEC_PER_USEC);
return rt_runtime_us;
}
#endif
#endif /* CONFIG_GROUP_SCHED */
#ifdef CONFIG_FAIR_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED
/* return corresponding task_group object of a cgroup */ /* return corresponding task_group object of a cgroup */
static inline struct task_group *cgroup_tg(struct cgroup *cgrp) static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
...@@ -7857,9 +8037,15 @@ static int ...@@ -7857,9 +8037,15 @@ static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk) struct task_struct *tsk)
{ {
#ifdef CONFIG_RT_GROUP_SCHED
/* Don't accept realtime tasks when there is no way for them to run */
if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */ /* We don't support RT-tasks being in separate groups */
if (tsk->sched_class != &fair_sched_class) if (tsk->sched_class != &fair_sched_class)
return -EINVAL; return -EINVAL;
#endif
return 0; return 0;
} }
...@@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, ...@@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
sched_move_task(tsk); sched_move_task(tsk);
} }
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval) u64 shareval)
{ {
...@@ -7883,31 +8070,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) ...@@ -7883,31 +8070,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
return (u64) tg->shares; return (u64) tg->shares;
} }
#endif
static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, #ifdef CONFIG_RT_GROUP_SCHED
u64 rt_ratio_val) static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
struct file *file,
const char __user *userbuf,
size_t nbytes, loff_t *unused_ppos)
{ {
return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); char buffer[64];
int retval = 0;
s64 val;
char *end;
if (!nbytes)
return -EINVAL;
if (nbytes >= sizeof(buffer))
return -E2BIG;
if (copy_from_user(buffer, userbuf, nbytes))
return -EFAULT;
buffer[nbytes] = 0; /* nul-terminate */
/* strip newline if necessary */
if (nbytes && (buffer[nbytes-1] == '\n'))
buffer[nbytes-1] = 0;
val = simple_strtoll(buffer, &end, 0);
if (*end)
return -EINVAL;
/* Pass to subsystem */
retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
if (!retval)
retval = nbytes;
return retval;
} }
static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
struct file *file,
char __user *buf, size_t nbytes,
loff_t *ppos)
{ {
struct task_group *tg = cgroup_tg(cgrp); char tmp[64];
long val = sched_group_rt_runtime(cgroup_tg(cgrp));
int len = sprintf(tmp, "%ld\n", val);
return (u64) tg->rt_ratio; return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
} }
#endif
static struct cftype cpu_files[] = { static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{ {
.name = "shares", .name = "shares",
.read_uint = cpu_shares_read_uint, .read_uint = cpu_shares_read_uint,
.write_uint = cpu_shares_write_uint, .write_uint = cpu_shares_write_uint,
}, },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{ {
.name = "rt_ratio", .name = "rt_runtime_us",
.read_uint = cpu_rt_ratio_read_uint, .read = cpu_rt_runtime_read,
.write_uint = cpu_rt_ratio_write_uint, .write = cpu_rt_runtime_write,
}, },
#endif
}; };
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
...@@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { ...@@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.early_init = 1, .early_init = 1,
}; };
#endif /* CONFIG_FAIR_CGROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_CGROUP_CPUACCT #ifdef CONFIG_CGROUP_CPUACCT
......
...@@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) ...@@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
return !list_empty(&rt_se->run_list); return !list_empty(&rt_se->run_list);
} }
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{ {
if (!rt_rq->tg) if (!rt_rq->tg)
return SCHED_RT_FRAC; return RUNTIME_INF;
return rt_rq->tg->rt_ratio; return rt_rq->tg->rt_runtime;
} }
#define for_each_leaf_rt_rq(rt_rq, rq) \ #define for_each_leaf_rt_rq(rt_rq, rq) \
...@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) ...@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se); static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{ {
struct sched_rt_entity *rt_se = rt_rq->rt_se; struct sched_rt_entity *rt_se = rt_rq->rt_se;
...@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) ...@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
} }
} }
static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{ {
struct sched_rt_entity *rt_se = rt_rq->rt_se; struct sched_rt_entity *rt_se = rt_rq->rt_se;
...@@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) ...@@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
dequeue_rt_entity(rt_se); dequeue_rt_entity(rt_se);
} }
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}
static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = group_rt_rq(rt_se);
struct task_struct *p;
if (rt_rq)
return !!rt_rq->rt_nr_boosted;
p = rt_task_of(rt_se);
return p->prio != p->normal_prio;
}
#else #else
static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{ {
return sysctl_sched_rt_ratio; if (sysctl_sched_rt_runtime == -1)
return RUNTIME_INF;
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
} }
#define for_each_leaf_rt_rq(rt_rq, rq) \ #define for_each_leaf_rt_rq(rt_rq, rq) \
...@@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) ...@@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return NULL; return NULL;
} }
static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{ {
} }
static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{ {
} }
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
return rt_rq->rt_throttled;
}
#endif #endif
static inline int rt_se_prio(struct sched_rt_entity *rt_se) static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{ {
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
struct rt_rq *rt_rq = group_rt_rq(rt_se); struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq) if (rt_rq)
...@@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) ...@@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio; return rt_task_of(rt_se)->prio;
} }
static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{ {
unsigned int rt_ratio = sched_rt_ratio(rt_rq); u64 runtime = sched_rt_runtime(rt_rq);
u64 period, ratio;
if (rt_ratio == SCHED_RT_FRAC) if (runtime == RUNTIME_INF)
return 0; return 0;
if (rt_rq->rt_throttled) if (rt_rq->rt_throttled)
return 1; return rt_rq_throttled(rt_rq);
period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
if (rt_rq->rt_time > ratio) { if (rt_rq->rt_time > runtime) {
struct rq *rq = rq_of_rt_rq(rt_rq); struct rq *rq = rq_of_rt_rq(rt_rq);
rq->rt_throttled = 1; rq->rt_throttled = 1;
rt_rq->rt_throttled = 1; rt_rq->rt_throttled = 1;
sched_rt_ratio_dequeue(rt_rq); if (rt_rq_throttled(rt_rq)) {
return 1; sched_rt_rq_dequeue(rt_rq);
return 1;
}
} }
return 0; return 0;
...@@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq) ...@@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
u64 period; u64 period;
while (rq->clock > rq->rt_period_expire) { while (rq->clock > rq->rt_period_expire) {
period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
rq->rt_period_expire += period; rq->rt_period_expire += period;
for_each_leaf_rt_rq(rt_rq, rq) { for_each_leaf_rt_rq(rt_rq, rq) {
unsigned long rt_ratio = sched_rt_ratio(rt_rq); u64 runtime = sched_rt_runtime(rt_rq);
u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
rt_rq->rt_time -= min(rt_rq->rt_time, ratio); rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
if (rt_rq->rt_throttled) { if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
rt_rq->rt_throttled = 0; rt_rq->rt_throttled = 0;
sched_rt_ratio_enqueue(rt_rq); sched_rt_rq_enqueue(rt_rq);
} }
} }
...@@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq) ...@@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
cpuacct_charge(curr, delta_exec); cpuacct_charge(curr, delta_exec);
rt_rq->rt_time += delta_exec; rt_rq->rt_time += delta_exec;
/* if (sched_rt_runtime_exceeded(rt_rq))
* might make it a tad more accurate:
*
* update_sched_rt_period(rq);
*/
if (sched_rt_ratio_exceeded(rt_rq))
resched_task(curr); resched_task(curr);
} }
...@@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ...@@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{ {
WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_prio(rt_se_prio(rt_se)));
rt_rq->rt_nr_running++; rt_rq->rt_nr_running++;
#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_se_prio(rt_se) < rt_rq->highest_prio) if (rt_se_prio(rt_se) < rt_rq->highest_prio)
rt_rq->highest_prio = rt_se_prio(rt_se); rt_rq->highest_prio = rt_se_prio(rt_se);
#endif #endif
...@@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ...@@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
update_rt_migration(rq_of_rt_rq(rt_rq)); update_rt_migration(rq_of_rt_rq(rt_rq));
#endif #endif
#ifdef CONFIG_RT_GROUP_SCHED
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted++;
#endif
} }
static inline static inline
...@@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ...@@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running); WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running--; rt_rq->rt_nr_running--;
#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_rq->rt_nr_running) { if (rt_rq->rt_nr_running) {
struct rt_prio_array *array; struct rt_prio_array *array;
...@@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ...@@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
update_rt_migration(rq_of_rt_rq(rt_rq)); update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#ifdef CONFIG_RT_GROUP_SCHED
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted--;
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
#endif
} }
static void enqueue_rt_entity(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
...@@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) ...@@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
struct rt_prio_array *array = &rt_rq->active; struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se); struct rt_rq *group_rq = group_rt_rq(rt_se);
if (group_rq && group_rq->rt_throttled) if (group_rq && rt_rq_throttled(group_rq))
return; return;
list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
...@@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) ...@@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
if (unlikely(!rt_rq->rt_nr_running)) if (unlikely(!rt_rq->rt_nr_running))
return NULL; return NULL;
if (sched_rt_ratio_exceeded(rt_rq)) if (rt_rq_throttled(rt_rq))
return NULL; return NULL;
do { do {
......
...@@ -311,22 +311,6 @@ static struct ctl_table kern_table[] = { ...@@ -311,22 +311,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = &proc_dointvec, .proc_handler = &proc_dointvec,
}, },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_rt_period_ms",
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_rt_ratio",
.data = &sysctl_sched_rt_ratio,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
...@@ -346,6 +330,22 @@ static struct ctl_table kern_table[] = { ...@@ -346,6 +330,22 @@ static struct ctl_table kern_table[] = {
}, },
#endif #endif
#endif #endif
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_rt_runtime_us",
.data = &sysctl_sched_rt_runtime,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
.procname = "sched_compat_yield", .procname = "sched_compat_yield",
......
...@@ -57,7 +57,7 @@ struct user_struct root_user = { ...@@ -57,7 +57,7 @@ struct user_struct root_user = {
.uid_keyring = &root_user_keyring, .uid_keyring = &root_user_keyring,
.session_keyring = &root_session_keyring, .session_keyring = &root_session_keyring,
#endif #endif
#ifdef CONFIG_FAIR_USER_SCHED #ifdef CONFIG_USER_SCHED
.tg = &init_task_group, .tg = &init_task_group,
#endif #endif
}; };
...@@ -90,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) ...@@ -90,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
return NULL; return NULL;
} }
#ifdef CONFIG_FAIR_USER_SCHED #ifdef CONFIG_USER_SCHED
static void sched_destroy_user(struct user_struct *up) static void sched_destroy_user(struct user_struct *up)
{ {
...@@ -113,15 +113,15 @@ static void sched_switch_user(struct task_struct *p) ...@@ -113,15 +113,15 @@ static void sched_switch_user(struct task_struct *p)
sched_move_task(p); sched_move_task(p);
} }
#else /* CONFIG_FAIR_USER_SCHED */ #else /* CONFIG_USER_SCHED */
static void sched_destroy_user(struct user_struct *up) { } static void sched_destroy_user(struct user_struct *up) { }
static int sched_create_user(struct user_struct *up) { return 0; } static int sched_create_user(struct user_struct *up) { return 0; }
static void sched_switch_user(struct task_struct *p) { } static void sched_switch_user(struct task_struct *p) { }
#endif /* CONFIG_FAIR_USER_SCHED */ #endif /* CONFIG_USER_SCHED */
#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) #if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
static DEFINE_MUTEX(uids_mutex); static DEFINE_MUTEX(uids_mutex);
...@@ -137,6 +137,7 @@ static inline void uids_mutex_unlock(void) ...@@ -137,6 +137,7 @@ static inline void uids_mutex_unlock(void)
} }
/* uid directory attributes */ /* uid directory attributes */
#ifdef CONFIG_FAIR_GROUP_SCHED
static ssize_t cpu_shares_show(struct kobject *kobj, static ssize_t cpu_shares_show(struct kobject *kobj,
struct kobj_attribute *attr, struct kobj_attribute *attr,
char *buf) char *buf)
...@@ -163,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj, ...@@ -163,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
static struct kobj_attribute cpu_share_attr = static struct kobj_attribute cpu_share_attr =
__ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
}
static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t size)
{
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
unsigned long rt_runtime;
int rc;
sscanf(buf, "%lu", &rt_runtime);
rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
return (rc ? rc : size);
}
static struct kobj_attribute cpu_rt_runtime_attr =
__ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
#endif
/* default attributes per uid directory */ /* default attributes per uid directory */
static struct attribute *uids_attributes[] = { static struct attribute *uids_attributes[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
&cpu_share_attr.attr, &cpu_share_attr.attr,
#endif
#ifdef CONFIG_RT_GROUP_SCHED
&cpu_rt_runtime_attr.attr,
#endif
NULL NULL
}; };
...@@ -269,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags) ...@@ -269,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
schedule_work(&up->work); schedule_work(&up->work);
} }
#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
int uids_sysfs_init(void) { return 0; } int uids_sysfs_init(void) { return 0; }
static inline int uids_user_create(struct user_struct *up) { return 0; } static inline int uids_user_create(struct user_struct *up) { return 0; }
...@@ -373,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) ...@@ -373,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock); spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent); up = uid_hash_find(uid, hashent);
if (up) { if (up) {
/* This case is not possible when CONFIG_FAIR_USER_SCHED /* This case is not possible when CONFIG_USER_SCHED
* is defined, since we serialize alloc_uid() using * is defined, since we serialize alloc_uid() using
* uids_mutex. Hence no need to call * uids_mutex. Hence no need to call
* sched_destroy_user() or remove_user_sysfs_dir(). * sched_destroy_user() or remove_user_sysfs_dir().
......