Commit 2004cef1 authored by Linus Torvalds

Merge tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Implement the SCHED_DEADLINE server infrastructure - Daniel Bristot
   de Oliveira's last major contribution to the kernel:

     "SCHED_DEADLINE servers can help fixing starvation issues of low
      priority tasks (e.g., SCHED_OTHER) when higher priority tasks
      monopolize CPU cycles. Today we have RT Throttling; DEADLINE
      servers should be able to replace and improve that."

   (Daniel Bristot de Oliveira, Peter Zijlstra, Joel Fernandes, Youssef
   Esmat, Huang Shijie)

 - Preparatory changes for sched_ext integration:
     - Use set_next_task(.first) where required
     - Fix up set_next_task() implementations
     - Clean up DL server vs. core sched
     - Split up put_prev_task_balance()
     - Rework pick_next_task()
     - Combine the last put_prev_task() and the first set_next_task()
     - Rework dl_server
     - Add put_prev_task(.next)

   (Peter Zijlstra, with a fix by Tejun Heo)

 - Complete the EEVDF transition and refine EEVDF scheduling:
     - Implement delayed dequeue
     - Allow shorter slices to wakeup-preempt
     - Use sched_attr::sched_runtime to set request/slice suggestion
     - Document the new feature flags
     - Remove unused and duplicate-functionality fields
     - Simplify & unify pick_next_task_fair()
     - Misc debuggability enhancements

   (Peter Zijlstra, with fixes/cleanups by Dietmar Eggemann, Valentin
   Schneider and Chuyi Zhou)

 - Initialize the vruntime of a new task when it is first enqueued,
   resulting in a significant decrease in the latency of newly woken tasks
   (Zhang Qiao)

 - Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
   (K Prateek Nayak, Peter Zijlstra)

 - Clean up and clarify the usage of rt_task()
   (Qais Yousef)

 - Preempt SCHED_IDLE entities in strict cgroup hierarchies
   (Tianchen Ding)

 - Clarify the documentation of time units for deadline scheduler
   parameters (Christian Loehle)

 - Remove the HZ_BW chicken-bit feature flag introduced a year ago;
   the original change seems to be working fine (Phil Auld)

 - Misc fixes and cleanups (Chen Yu, Dan Carpenter, Huang Shijie,
   Peilin He, Qais Yousef and Vincent Guittot)
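
As a usage sketch for the "sched_attr::sched_runtime to set request/slice
suggestion" item above: a plain SCHED_OTHER task can hint a shorter slice by
passing a nanosecond value in sched_attr::sched_runtime. This is illustrative
only; the struct below is a local copy of the sched_attr layout (older glibc
ships no wrapper), the 3 ms value is arbitrary, and the kernel clamps accepted
slices to roughly 0.1 ms .. 100 ms:

/* Hedged sketch: request a ~3 ms slice for the calling SCHED_OTHER task. */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_ver0 {		/* mirrors the kernel's 48-byte sched_attr */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* slice suggestion, in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr_ver0 attr = {
		.size          = sizeof(attr),
		.sched_policy  = 0,			/* SCHED_OTHER */
		.sched_runtime = 3 * 1000 * 1000,	/* 3 ms */
	};

	if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}
	return 0;
}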

* tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  sched/cpufreq: Use NSEC_PER_MSEC for deadline task
  cpufreq/cppc: Use NSEC_PER_MSEC for deadline task
  sched/deadline: Clarify nanoseconds in uapi
  sched/deadline: Convert schedtool example to chrt
  sched/debug: Fix the runnable tasks output
  sched: Fix sched_delayed vs sched_core
  kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
  kthread: Fix task state in kthread worker if being frozen
  sched/pelt: Use rq_clock_task() for hw_pressure
  sched/fair: Move effective_cpu_util() and sched_cpu_util() in fair.c
  sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
  sched: Add put_prev_task(.next)
  sched: Rework dl_server
  sched: Combine the last put_prev_task() and the first set_next_task()
  sched: Rework pick_next_task()
  sched: Split up put_prev_task_balance()
  sched: Clean up DL server vs core sched
  sched: Fixup set_next_task() implementations
  sched: Use set_next_task(.first) where required
  sched/fair: Properly deactivate sched_delayed task upon class change
  ...
parents 509d2cd1 bc9057da
......@@ -749,21 +749,19 @@ Appendix A. Test suite
of the command line options. Please refer to rt-app documentation for more
details (`<rt-app-sources>/doc/*.json`).
The second testing application is a modification of schedtool, called
schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a
certain pid/application. schedtool-dl is available at:
https://github.com/scheduler-tools/schedtool-dl.git.
The second testing application is done using chrt which has support
for SCHED_DEADLINE.
The usage is straightforward::
# schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app
# chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app
With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation
of 10ms every 100ms (note that parameters are expressed in microseconds).
You can also use schedtool to create a reservation for an already running
of 10ms every 100ms (note that parameters are expressed in nanoseconds).
You can also use chrt to create a reservation for an already running
application, given that you know its pid::
# schedtool -E -t 10000000:100000000 my_app_pid
# chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid
Appendix B. Minimal main()
==========================
......
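
A hedged illustration of the unit change documented above: the chrt example
reserves 10 ms of runtime every 100 ms, and all values are passed in
nanoseconds (the kernel uses the deadline as the period when no period is
given). The arithmetic, spelled out:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	uint64_t runtime  = 10  * NSEC_PER_MSEC;	/* chrt -T 10000000  */
	uint64_t deadline = 100 * NSEC_PER_MSEC;	/* chrt -D 100000000 */
	uint64_t period   = deadline;			/* defaulted by the kernel */

	printf("runtime %llu ns, deadline %llu ns, period %llu ns\n",
	       (unsigned long long)runtime,
	       (unsigned long long)deadline,
	       (unsigned long long)period);
	printf("reserved bandwidth: %.1f%%\n", 100.0 * runtime / period);
	return 0;
}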
......@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void)
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
.sched_runtime = 1000000,
.sched_deadline = 10000000,
.sched_period = 10000000,
.sched_runtime = NSEC_PER_MSEC,
.sched_deadline = 10 * NSEC_PER_MSEC,
.sched_period = 10 * NSEC_PER_MSEC,
};
int ret;
......
......@@ -335,7 +335,7 @@ static inline bool six_owner_running(struct six_lock *lock)
*/
rcu_read_lock();
struct task_struct *owner = READ_ONCE(lock->owner);
bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
rcu_read_unlock();
return ret;
......
......@@ -2626,7 +2626,7 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
if (task_is_realtime(p))
if (rt_or_dl_task_policy(p))
slack_ns = 0;
else if (slack_ns == 0)
slack_ns = p->default_timer_slack_ns;
......
......@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
{
if (task->policy == SCHED_IDLE)
return IOPRIO_CLASS_IDLE;
else if (task_is_realtime(task))
else if (rt_or_dl_task_policy(task))
return IOPRIO_CLASS_RT;
else
return IOPRIO_CLASS_BE;
......
......@@ -149,8 +149,9 @@ struct user_event_mm;
* Special states are those that do not use the normal wait-loop pattern. See
* the comment with set_special_state().
*/
#define is_special_task_state(state) \
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
#define is_special_task_state(state) \
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \
TASK_DEAD | TASK_FROZEN))
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define debug_normal_state_change(state_value) \
......@@ -541,9 +542,14 @@ struct sched_entity {
struct rb_node run_node;
u64 deadline;
u64 min_vruntime;
u64 min_slice;
struct list_head group_node;
unsigned int on_rq;
unsigned char on_rq;
unsigned char sched_delayed;
unsigned char rel_deadline;
unsigned char custom_slice;
/* hole */
u64 exec_start;
u64 sum_exec_runtime;
......@@ -639,12 +645,26 @@ struct sched_dl_entity {
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
*
* @dl_server tells if this is a server entity.
*
* @dl_defer tells if this is a deferred or regular server. For
* now only defer server exists.
*
* @dl_defer_armed tells if the deferrable server is waiting
* for the replenishment timer to activate it.
*
* @dl_defer_running tells if the deferrable server is actually
* running, skipping the defer phase.
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
unsigned int dl_server : 1;
unsigned int dl_defer : 1;
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
......@@ -672,7 +692,7 @@ struct sched_dl_entity {
*/
struct rq *rq;
dl_server_has_tasks_f server_has_tasks;
dl_server_pick_f server_pick;
dl_server_pick_f server_pick_task;
#ifdef CONFIG_RT_MUTEXES
/*
......
......@@ -10,16 +10,16 @@
#include <linux/sched.h>
#define MAX_DL_PRIO 0
static inline int dl_prio(int prio)
static inline bool dl_prio(int prio)
{
if (unlikely(prio < MAX_DL_PRIO))
return 1;
return 0;
return unlikely(prio < MAX_DL_PRIO);
}
static inline int dl_task(struct task_struct *p)
/*
* Returns true if a task has a priority that belongs to DL class. PI-boosted
* tasks will return true. Use dl_policy() to ignore PI-boosted tasks.
*/
static inline bool dl_task(struct task_struct *p)
{
return dl_prio(p->prio);
}
......
......@@ -14,6 +14,7 @@
*/
#define MAX_RT_PRIO 100
#define MAX_DL_PRIO 0
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
......
......@@ -6,19 +6,40 @@
struct task_struct;
static inline int rt_prio(int prio)
static inline bool rt_prio(int prio)
{
if (unlikely(prio < MAX_RT_PRIO))
return 1;
return 0;
return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO);
}
static inline int rt_task(struct task_struct *p)
static inline bool rt_or_dl_prio(int prio)
{
return unlikely(prio < MAX_RT_PRIO);
}
/*
* Returns true if a task has a priority that belongs to RT class. PI-boosted
* tasks will return true. Use rt_policy() to ignore PI-boosted tasks.
*/
static inline bool rt_task(struct task_struct *p)
{
return rt_prio(p->prio);
}
static inline bool task_is_realtime(struct task_struct *tsk)
/*
* Returns true if a task has a priority that belongs to RT or DL classes.
* PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore
* PI-boosted tasks.
*/
static inline bool rt_or_dl_task(struct task_struct *p)
{
return rt_or_dl_prio(p->prio);
}
/*
* Returns true if a task has a policy that belongs to RT or DL classes.
* PI-boosted tasks will return false.
*/
static inline bool rt_or_dl_task_policy(struct task_struct *tsk)
{
int policy = tsk->policy;
......
......@@ -58,9 +58,9 @@
*
* This is reflected by the following fields of the sched_attr structure:
*
* @sched_deadline representative of the task's deadline
* @sched_runtime representative of the task's runtime
* @sched_period representative of the task's period
* @sched_deadline representative of the task's deadline in nanoseconds
* @sched_runtime representative of the task's runtime in nanoseconds
* @sched_period representative of the task's period in nanoseconds
*
* Given this task model, there are a multiplicity of scheduling algorithms
* and policies, that can be used to ensure all the tasks will make their
......
......@@ -72,7 +72,7 @@ bool __refrigerator(bool check_kthr_stop)
bool freeze;
raw_spin_lock_irq(&current->pi_lock);
set_current_state(TASK_FROZEN);
WRITE_ONCE(current->__state, TASK_FROZEN);
/* unstale saved_state so that __thaw_task() will wake us up */
current->saved_state = TASK_RUNNING;
raw_spin_unlock_irq(&current->pi_lock);
......
......@@ -845,8 +845,16 @@ int kthread_worker_fn(void *worker_ptr)
* event only cares about the address.
*/
trace_sched_kthread_work_execute_end(work, func);
} else if (!freezing(current))
} else if (!freezing(current)) {
schedule();
} else {
/*
* Handle the case where the current remains
* TASK_INTERRUPTIBLE. try_to_freeze() expects
* the current to be TASK_RUNNING.
*/
__set_current_state(TASK_RUNNING);
}
try_to_freeze();
cond_resched();
......
......@@ -347,7 +347,7 @@ static __always_inline int __waiter_prio(struct task_struct *task)
{
int prio = task->prio;
if (!rt_prio(prio))
if (!rt_or_dl_prio(prio))
return DEFAULT_PRIO;
return prio;
......@@ -435,7 +435,7 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
* Note that RT tasks are excluded from same priority (lateral)
* steals to prevent the introduction of an unbounded latency.
*/
if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
if (rt_or_dl_prio(waiter->tree.prio))
return false;
return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
......
......@@ -631,7 +631,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* if it is an RT task or wait in the wait queue
* for too long.
*/
if (has_handoff || (!rt_task(waiter->task) &&
if (has_handoff || (!rt_or_dl_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
......@@ -914,7 +914,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
if (owner_state != OWNER_WRITER) {
if (need_resched())
break;
if (rt_task(current) &&
if (rt_or_dl_task(current) &&
(prev_owner_state != OWNER_WRITER))
break;
}
......
......@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
int a_prio = a->task->prio;
int b_prio = b->task->prio;
if (rt_prio(a_prio) || rt_prio(b_prio)) {
if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) {
if (a_prio > b_prio)
return true;
......
......@@ -654,9 +654,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
.sched_runtime = 1000000,
.sched_deadline = 10000000,
.sched_period = 10000000,
.sched_runtime = NSEC_PER_MSEC,
.sched_deadline = 10 * NSEC_PER_MSEC,
.sched_period = 10 * NSEC_PER_MSEC,
};
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
......
......@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = {
.release = seq_release,
};
enum dl_param {
DL_RUNTIME = 0,
DL_PERIOD,
};
static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos, enum dl_param param)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
struct rq *rq = cpu_rq(cpu);
u64 runtime, period;
size_t err;
int retval;
u64 value;
err = kstrtoull_from_user(ubuf, cnt, 10, &value);
if (err)
return err;
scoped_guard (rq_lock_irqsave, rq) {
runtime = rq->fair_server.dl_runtime;
period = rq->fair_server.dl_period;
switch (param) {
case DL_RUNTIME:
if (runtime == value)
break;
runtime = value;
break;
case DL_PERIOD:
if (value == period)
break;
period = value;
break;
}
if (runtime > period ||
period > fair_server_period_max ||
period < fair_server_period_min) {
return -EINVAL;
}
if (rq->cfs.h_nr_running) {
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
if (retval)
cnt = retval;
if (!runtime)
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
if (rq->cfs.h_nr_running)
dl_server_start(&rq->fair_server);
}
*ppos += cnt;
return cnt;
}
static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
{
unsigned long cpu = (unsigned long) m->private;
struct rq *rq = cpu_rq(cpu);
u64 value;
switch (param) {
case DL_RUNTIME:
value = rq->fair_server.dl_runtime;
break;
case DL_PERIOD:
value = rq->fair_server.dl_period;
break;
}
seq_printf(m, "%llu\n", value);
return 0;
}
static ssize_t
sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
}
static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
{
return sched_fair_server_show(m, v, DL_RUNTIME);
}
static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
}
static const struct file_operations fair_server_runtime_fops = {
.open = sched_fair_server_runtime_open,
.write = sched_fair_server_runtime_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static ssize_t
sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
}
static int sched_fair_server_period_show(struct seq_file *m, void *v)
{
return sched_fair_server_show(m, v, DL_PERIOD);
}
static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_fair_server_period_show, inode->i_private);
}
static const struct file_operations fair_server_period_fops = {
.open = sched_fair_server_period_open,
.write = sched_fair_server_period_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static struct dentry *debugfs_sched;
static void debugfs_fair_server_init(void)
{
struct dentry *d_fair;
unsigned long cpu;
d_fair = debugfs_create_dir("fair_server", debugfs_sched);
if (!d_fair)
return;
for_each_possible_cpu(cpu) {
struct dentry *d_cpu;
char buf[32];
snprintf(buf, sizeof(buf), "cpu%lu", cpu);
d_cpu = debugfs_create_dir(buf, d_fair);
debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
}
}
static __init int sched_init_debug(void)
{
struct dentry __maybe_unused *numa;
......@@ -374,6 +531,8 @@ static __init int sched_init_debug(void)
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
return 0;
}
late_initcall(sched_init_debug);
......@@ -580,27 +739,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
SPLIT_NS(p->se.deadline),
p->se.custom_slice ? 'S' : ' ',
SPLIT_NS(p->se.slice),
SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf_task_group_path(m, task_group(p), " %s")
SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
......@@ -612,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
SEQ_printf(m, " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n");
SEQ_printf(m, " S task PID vruntime eligible "
"deadline slice sum-exec switches "
"prio wait-time sum-sleep sum-block"
#ifdef CONFIG_NUMA_BALANCING
" node group-id"
#endif
#ifdef CONFIG_CGROUP_SCHED
" group-path"
#endif
"\n");
SEQ_printf(m, "-------------------------------------------------------"
"------------------------------------------------------\n");
"------------------------------------------------------"
"------------------------------------------------------"
#ifdef CONFIG_NUMA_BALANCING
"--------------"
#endif
#ifdef CONFIG_CGROUP_SCHED
"--------------"
#endif
"\n");
rcu_read_lock();
for_each_process_thread(g, p) {
......@@ -641,8 +816,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, "\n");
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
root = __pick_root_entity(cfs_rq);
......@@ -669,8 +842,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
......@@ -730,9 +901,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
#ifdef CONFIG_RT_GROUP_SCHED
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
#endif
#undef PN
#undef PU
......
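
The hunk above adds per-CPU debugfs knobs for the fair server. Assuming
debugfs is mounted at /sys/kernel/debug, they should appear as
/sys/kernel/debug/sched/fair_server/cpu<N>/{runtime,period}, both in
nanoseconds, with the period constrained to roughly 100 us .. 4 s and
runtime <= period. A minimal, hedged sketch that reads and then updates the
CPU 0 runtime (the 60 ms value is arbitrary):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/sched/fair_server/cpu0/runtime";
	unsigned long long runtime = 0;
	FILE *f;

	/* Read the current fair-server runtime (nanoseconds). */
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%llu", &runtime) == 1)
		printf("cpu0 fair server runtime: %llu ns\n", runtime);
	fclose(f);

	/* Write a new runtime; it must not exceed the configured period. */
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%llu\n", 60ULL * 1000 * 1000);	/* 60 ms */
	fclose(f);
	return 0;
}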
......@@ -5,8 +5,24 @@
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
/*
* Give new tasks half a slice to ease into the competition.
*/
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
/*
* Preserve relative virtual deadline on 'migration'.
*/
SCHED_FEAT(PLACE_REL_DEADLINE, true)
/*
* Inhibit (wakeup) preemption until the current task has either matched the
* 0-lag point or until is has exhausted it's slice.
*/
SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
* current.
*/
SCHED_FEAT(PREEMPT_SHORT, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
......@@ -21,6 +37,18 @@ SCHED_FEAT(NEXT_BUDDY, false)
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
* Delay dequeueing tasks until they get selected or woken.
*
* By delaying the dequeue for non-eligible tasks, they remain in the
* competition and can burn off their negative lag. When they get selected
* they'll have positive lag by definition.
*
* DELAY_ZERO clips the lag on dequeue (or wakeup) to 0.
*/
SCHED_FEAT(DELAY_DEQUEUE, true)
SCHED_FEAT(DELAY_ZERO, true)
/*
* Allow wakeup-time preemption of the current task:
*/
......@@ -85,5 +113,3 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
SCHED_FEAT(HZ_BW, true)
......@@ -450,43 +450,35 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
dl_server_update_idle_time(rq, prev);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
next->se.exec_start = rq_clock_task(rq);
}
#ifdef CONFIG_SMP
static struct task_struct *pick_task_idle(struct rq *rq)
struct task_struct *pick_task_idle(struct rq *rq)
{
return rq->idle;
}
#endif
struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
set_next_task_idle(rq, next, true);
return next;
}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
static void
static bool
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_rq_unlock_irq(rq);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_rq_lock_irq(rq);
return true;
}
/*
......@@ -528,13 +520,12 @@ DEFINE_SCHED_CLASS(idle) = {
.wakeup_preempt = wakeup_preempt_idle,
.pick_next_task = pick_next_task_idle,
.pick_task = pick_task_idle,
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
.balance = balance_idle,
.pick_task = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
......
......@@ -68,6 +68,7 @@
#include <linux/wait_api.h>
#include <linux/wait_bit.h>
#include <linux/workqueue_api.h>
#include <linux/delayacct.h>
#include <trace/events/power.h>
#include <trace/events/sched.h>
......@@ -335,7 +336,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int dl_bw_check_overflow(int cpu);
extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
/*
* SCHED_DEADLINE supports servers (nested scheduling) with the following
* interface:
......@@ -361,7 +362,14 @@ extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick);
dl_server_pick_f pick_task);
extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p);
extern void fair_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
#ifdef CONFIG_CGROUP_SCHED
......@@ -599,17 +607,12 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 min_vruntime_fi;
#endif
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
struct rb_root_cached tasks_timeline;
/*
......@@ -619,10 +622,6 @@ struct cfs_rq {
struct sched_entity *curr;
struct sched_entity *next;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
#endif
#ifdef CONFIG_SMP
/*
* CFS load tracking
......@@ -726,13 +725,13 @@ struct rt_rq {
#endif /* CONFIG_SMP */
int rt_queued;
#ifdef CONFIG_RT_GROUP_SCHED
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned int rt_nr_boosted;
struct rq *rq;
......@@ -820,6 +819,9 @@ static inline void se_update_runnable(struct sched_entity *se)
static inline long se_runnable(struct sched_entity *se)
{
if (se->sched_delayed)
return false;
if (entity_is_task(se))
return !!se->on_rq;
else
......@@ -834,6 +836,9 @@ static inline void se_update_runnable(struct sched_entity *se) { }
static inline long se_runnable(struct sched_entity *se)
{
if (se->sched_delayed)
return false;
return !!se->on_rq;
}
......@@ -1044,6 +1049,8 @@ struct rq {
struct rt_rq rt;
struct dl_rq dl;
struct sched_dl_entity fair_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
struct list_head leaf_cfs_rq_list;
......@@ -1059,6 +1066,7 @@ struct rq {
unsigned int nr_uninterruptible;
struct task_struct __rcu *curr;
struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
......@@ -1158,7 +1166,6 @@ struct rq {
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
......@@ -1187,6 +1194,7 @@ struct rq {
/* per rq */
struct rq *core;
struct task_struct *core_pick;
struct sched_dl_entity *core_dl_server;
unsigned int core_enabled;
unsigned int core_sched_seq;
struct rb_root core_tree;
......@@ -2247,11 +2255,13 @@ extern const u32 sched_prio_to_wmult[40];
*
*/
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
#define DEQUEUE_SPECIAL 0x10
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
......@@ -2267,6 +2277,7 @@ extern const u32 sched_prio_to_wmult[40];
#endif
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
#define ENQUEUE_DELAYED 0x200
#define RETRY_TASK ((void *)-1UL)
......@@ -2285,23 +2296,31 @@ struct sched_class {
#endif
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
struct task_struct *(*pick_next_task)(struct rq *rq);
struct task_struct *(*pick_task)(struct rq *rq);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
*
* next = pick_task();
* if (next) {
* put_prev_task(prev);
* set_next_task_first(next);
* }
*/
struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
struct task_struct * (*pick_task)(struct rq *rq);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
......@@ -2345,7 +2364,7 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->curr != prev);
prev->sched_class->put_prev_task(rq, prev);
prev->sched_class->put_prev_task(rq, prev, NULL);
}
static inline void set_next_task(struct rq *rq, struct task_struct *next)
......@@ -2353,6 +2372,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
next->sched_class->set_next_task(rq, next, false);
}
static inline void
__put_prev_set_next_dl_server(struct rq *rq,
struct task_struct *prev,
struct task_struct *next)
{
prev->dl_server = NULL;
next->dl_server = rq->dl_server;
rq->dl_server = NULL;
}
static inline void put_prev_set_next_task(struct rq *rq,
struct task_struct *prev,
struct task_struct *next)
{
WARN_ON_ONCE(rq->curr != prev);
__put_prev_set_next_dl_server(rq, prev, next);
if (next == prev)
return;
prev->sched_class->put_prev_task(rq, prev, next);
next->sched_class->set_next_task(rq, next, true);
}
/*
* Helper to define a sched_class instance; each one is placed in a separate
......@@ -2408,7 +2451,7 @@ static inline bool sched_fair_runnable(struct rq *rq)
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);
extern struct task_struct *pick_task_idle(struct rq *rq);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
......@@ -2515,7 +2558,6 @@ extern void reweight_task(struct task_struct *p, const struct load_weight *lw);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
......@@ -2586,6 +2628,19 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
static inline void __block_task(struct rq *rq, struct task_struct *p)
{
WRITE_ONCE(p->on_rq, 0);
ASSERT_EXCLUSIVE_WRITER(p->on_rq);
if (p->sched_contributes_to_load)
rq->nr_uninterruptible++;
if (p->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
}
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
......@@ -3607,7 +3662,7 @@ extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *c
extern void __setscheduler_prio(struct task_struct *p, int prio);
extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
......
......@@ -41,26 +41,17 @@ static struct task_struct *pick_task_stop(struct rq *rq)
return rq->stop;
}
static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *p = pick_task_stop(rq);
if (p)
set_next_task_stop(rq, p, true);
return p;
}
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
}
static void
static bool
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
return true;
}
static void yield_task_stop(struct rq *rq)
......@@ -68,7 +59,7 @@ static void yield_task_stop(struct rq *rq)
BUG(); /* the stop task should never yield, its pointless. */
}
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
update_curr_common(rq);
}
......@@ -111,13 +102,12 @@ DEFINE_SCHED_CLASS(stop) = {
.wakeup_preempt = wakeup_preempt_stop,
.pick_next_task = pick_next_task_stop,
.pick_task = pick_task_stop,
.put_prev_task = put_prev_task_stop,
.set_next_task = set_next_task_stop,
#ifdef CONFIG_SMP
.balance = balance_stop,
.pick_task = pick_task_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
......
......@@ -57,7 +57,7 @@ static int effective_prio(struct task_struct *p)
* keep the priority unchanged. Otherwise, update priority
* to the normal priority:
*/
if (!rt_prio(p->prio))
if (!rt_or_dl_prio(p->prio))
return p->normal_prio;
return p->prio;
}
......@@ -258,107 +258,6 @@ int sched_core_idle_cpu(int cpu)
#endif
#ifdef CONFIG_SMP
/*
* This function computes an effective utilization for the given CPU, to be
* used for frequency selection given the linear relation: f = u * f_max.
*
* The scheduler tracks the following metrics:
*
* cpu_util_{cfs,rt,dl,irq}()
* cpu_bw_dl()
*
* Where the cfs,rt and dl util numbers are tracked with the same metric and
* synchronized windows and are thus directly comparable.
*
* The cfs,rt,dl utilization are the running times measured with rq->clock_task
* which excludes things like IRQ and steal-time. These latter are then accrued
* in the IRQ utilization.
*
* The DL bandwidth number OTOH is not a measured metric but a value computed
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long *min,
unsigned long *max)
{
unsigned long util, irq, scale;
struct rq *rq = cpu_rq(cpu);
scale = arch_scale_cpu_capacity(cpu);
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
* because of inaccuracies in how we track these -- see
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
if (unlikely(irq >= scale)) {
if (min)
*min = scale;
if (max)
*max = scale;
return scale;
}
if (min) {
/*
* The minimum utilization returns the highest level between:
* - the computed DL bandwidth needed with the IRQ pressure which
* steals time to the deadline task.
* - The minimum performance requirement for CFS and/or RT.
*/
*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
/*
* When an RT task is runnable and uclamp is not used, we must
* ensure that the task will run at maximum compute capacity.
*/
if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
*min = max(*min, scale);
}
/*
* Because the time spend on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*/
util = util_cfs + cpu_util_rt(rq);
util += cpu_util_dl(rq);
/*
* The maximum hint is a soft bandwidth requirement, which can be lower
* than the actual utilization because of uclamp_max requirements.
*/
if (max)
*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
if (util >= scale)
return scale;
/*
* There is still idle time; further improve the number by using the
* IRQ metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
*
* max - irq
* U' = irq + --------- * U
* max
*/
util = scale_irq_capacity(util, irq, scale);
util += irq;
return min(scale, util);
}
unsigned long sched_cpu_util(int cpu)
{
return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
#endif /* CONFIG_SMP */
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
......@@ -401,13 +300,23 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
if (dl_policy(policy))
if (dl_policy(policy)) {
__setparam_dl(p, attr);
else if (fair_policy(policy))
} else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
if (attr->sched_runtime) {
p->se.custom_slice = 1;
p->se.slice = clamp_t(u64, attr->sched_runtime,
NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
NSEC_PER_MSEC*100); /* HZ=100 / 10 */
} else {
p->se.custom_slice = 0;
p->se.slice = sysctl_sched_base_slice;
}
}
/* rt-policy tasks do not have a timerslack */
if (task_is_realtime(p)) {
if (rt_or_dl_task_policy(p)) {
p->timer_slack_ns = 0;
} else if (p->timer_slack_ns == 0) {
/* when switching back to non-rt policy, restore timerslack */
......@@ -708,7 +617,9 @@ int __sched_setscheduler(struct task_struct *p,
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
if (fair_policy(policy) && attr->sched_nice != task_nice(p))
if (fair_policy(policy) &&
(attr->sched_nice != task_nice(p) ||
(attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
......@@ -854,6 +765,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
if (p->se.custom_slice)
attr.sched_runtime = p->se.slice;
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
......@@ -1020,12 +934,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
if (task_has_dl_policy(p))
if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
else if (task_has_rt_policy(p))
} else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
else
} else {
attr->sched_nice = task_nice(p);
attr->sched_runtime = p->se.slice;
}
}
/**
......
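
One user-visible consequence of the get_params() change above: for fair
tasks, sched_getattr() now reports the current slice in
sched_attr::sched_runtime. A hedged sketch of reading it back; the local
struct mirrors the first 48 bytes of the kernel's sched_attr, the minimum
size sched_getattr() accepts:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_ver0 {		/* mirrors the kernel's 48-byte sched_attr */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* fair tasks: the slice, in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr_ver0 attr = { 0 };

	if (syscall(SYS_sched_getattr, 0 /* self */, &attr, sizeof(attr), 0)) {
		perror("sched_getattr");
		return 1;
	}
	printf("policy %u, slice %llu ns\n", attr.sched_policy,
	       (unsigned long long)attr.sched_runtime);
	return 0;
}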
......@@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
/*
* Because the rq is not a task, dl_add_task_root_domain() did not
* move the fair server bw to the rd if it already started.
* Add it now.
*/
if (rq->fair_server.dl_server)
__dl_server_attach_root(&rq->fair_server, rq);
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
......
......@@ -2557,7 +2557,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
if (task_is_realtime(current))
if (rt_or_dl_task_policy(current))
break;
if (arg2 <= 0)
current->timer_slack_ns =
......
......@@ -1977,7 +1977,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
* expiry.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
mode |= HRTIMER_MODE_HARD;
}
......
......@@ -547,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
* - wakeup_dl handles tasks belonging to sched_dl class only.
*/
if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
(wakeup_rt && !dl_task(p) && !rt_task(p)) ||
(wakeup_rt && !rt_or_dl_task(p)) ||
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
......
......@@ -418,7 +418,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
tsk = current;
if (rt_task(tsk)) {
if (rt_or_dl_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
......@@ -477,7 +477,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
else
dirty = vm_dirty_ratio * node_memory / 100;
if (rt_task(tsk))
if (rt_or_dl_task(tsk))
dirty += dirty / 4;
/*
......
......@@ -4004,7 +4004,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
*/
if (alloc_flags & ALLOC_MIN_RESERVE)
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && in_task())
} else if (unlikely(rt_or_dl_task(current)) && in_task())
alloc_flags |= ALLOC_MIN_RESERVE;
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
......