Commit 0b1777f0 authored by Tejun Heo

Merge branch 'tip/sched/core' into sched_ext/for-6.12

Pull in tip/sched/core to resolve two merge conflicts:

- 96fd6c65 ("sched: Factor out update_other_load_avgs() from __update_blocked_others()")
  5d871a63 ("sched/fair: Move effective_cpu_util() and effective_cpu_util() in fair.c")

  A simple context conflict. The former added update_other_load_avgs() in
  the same #ifdef CONFIG_SMP block that effective_cpu_util() and
  sched_cpu_util() are in, and the latter moved those functions to fair.c.
  This leaves update_other_load_avgs() more out of place. Will follow up
  with a patch to relocate it.

- 96fd6c65 ("sched: Factor out update_other_load_avgs() from __update_blocked_others()")
  84d26528 ("sched/pelt: Use rq_clock_task() for hw_pressure")

  The former factored out the body of __update_blocked_others() into
  update_other_load_avgs(). The latter changed how update_hw_load_avg() is
  called in the body. Resolved by applying the change to
  update_other_load_avgs() instead.
Signed-off-by: Tejun Heo <tj@kernel.org>
parents 513ed0c7 bc9057da
@@ -749,21 +749,19 @@ Appendix A. Test suite
 of the command line options. Please refer to rt-app documentation for more
 details (`<rt-app-sources>/doc/*.json`).
-The second testing application is a modification of schedtool, called
-schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a
-certain pid/application. schedtool-dl is available at:
-https://github.com/scheduler-tools/schedtool-dl.git.
+The second testing application is done using chrt which has support
+for SCHED_DEADLINE.
 The usage is straightforward::
-  # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app
+  # chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app
 With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation
-of 10ms every 100ms (note that parameters are expressed in microseconds).
-You can also use schedtool to create a reservation for an already running
+of 10ms every 100ms (note that parameters are expressed in nanoseconds).
+You can also use chrt to create a reservation for an already running
 application, given that you know its pid::
-  # schedtool -E -t 10000000:100000000 my_app_pid
+  # chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid
 Appendix B. Minimal main()
 ==========================
...
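
For reference, the chrt invocation shown above maps onto the sched_setattr(2)
syscall. A minimal, illustrative sketch of the same 10ms/100ms reservation set
up from C (not part of this diff; all times are in nanoseconds, in the spirit
of Appendix B's minimal main()):

    #define _GNU_SOURCE
    #include <linux/types.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    #define SCHED_DEADLINE 6

    /* Local copy of the UAPI structure, as in Appendix B. */
    struct sched_attr {
            __u32 size;
            __u32 sched_policy;
            __u64 sched_flags;
            __s32 sched_nice;
            __u32 sched_priority;
            __u64 sched_runtime;
            __u64 sched_deadline;
            __u64 sched_period;
    };

    int main(void)
    {
            struct sched_attr attr = {
                    .size           = sizeof(attr),
                    .sched_policy   = SCHED_DEADLINE,
                    .sched_runtime  =  10 * 1000 * 1000,  /*  10 ms */
                    .sched_deadline = 100 * 1000 * 1000,  /* 100 ms */
                    .sched_period   = 100 * 1000 * 1000,  /* 100 ms */
            };

            if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0))
                    perror("sched_setattr");

            for (;;)
                    ;  /* CPU hog running inside the reservation */
    }
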
@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void)
 * Fake (unused) bandwidth; workaround to "fix"
 * priority inheritance.
 */
-.sched_runtime = 1000000,
-.sched_deadline = 10000000,
-.sched_period = 10000000,
+.sched_runtime = NSEC_PER_MSEC,
+.sched_deadline = 10 * NSEC_PER_MSEC,
+.sched_period = 10 * NSEC_PER_MSEC,
 };
 int ret;
...
@@ -58,9 +58,9 @@
 *
 * This is reflected by the following fields of the sched_attr structure:
 *
- * @sched_deadline representative of the task's deadline
- * @sched_runtime representative of the task's runtime
- * @sched_period representative of the task's period
+ * @sched_deadline representative of the task's deadline in nanoseconds
+ * @sched_runtime representative of the task's runtime in nanoseconds
+ * @sched_period representative of the task's period in nanoseconds
 *
 * Given this task model, there are a multiplicity of scheduling algorithms
 * and policies, that can be used to ensure all the tasks will make their
...
@@ -845,8 +845,16 @@ int kthread_worker_fn(void *worker_ptr)
 * event only cares about the address.
 */
 trace_sched_kthread_work_execute_end(work, func);
-} else if (!freezing(current))
+} else if (!freezing(current)) {
 schedule();
+} else {
+/*
+ * Handle the case where the current remains
+ * TASK_INTERRUPTIBLE. try_to_freeze() expects
+ * the current to be TASK_RUNNING.
+ */
+__set_current_state(TASK_RUNNING);
+}
 try_to_freeze();
 cond_resched();
...
@@ -267,6 +267,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
 void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 {
+if (p->se.sched_delayed)
+	return;
 rq->core->core_task_seq++;
 if (!p->core_cookie)
@@ -277,6 +280,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 {
+if (p->se.sched_delayed)
+	return;
 rq->core->core_task_seq++;
 if (sched_core_enqueued(p)) {
@@ -6477,19 +6483,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 * Constants for the sched_mode argument of __schedule().
 *
 * The mode argument allows RT enabled kernels to differentiate a
- * preemption from blocking on an 'sleeping' spin/rwlock. Note that
- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
- * optimize the AND operation out and just check for zero.
+ * preemption from blocking on an 'sleeping' spin/rwlock.
 */
-#define SM_NONE 0x0
-#define SM_PREEMPT 0x1
-#define SM_RTLOCK_WAIT 0x2
-#ifndef CONFIG_PREEMPT_RT
-# define SM_MASK_PREEMPT (~0U)
-#else
-# define SM_MASK_PREEMPT SM_PREEMPT
-#endif
+#define SM_IDLE (-1)
+#define SM_NONE 0
+#define SM_PREEMPT 1
+#define SM_RTLOCK_WAIT 2
 /*
 * __schedule() is the main scheduler function.
@@ -6530,9 +6529,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 *
 * WARNING: must be called with preemption disabled!
 */
-static void __sched notrace __schedule(unsigned int sched_mode)
+static void __sched notrace __schedule(int sched_mode)
 {
 struct task_struct *prev, *next;
+/*
+ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+ * as a preemption by schedule_debug() and RCU.
+ */
+bool preempt = sched_mode > SM_NONE;
 unsigned long *switch_count;
 unsigned long prev_state;
 struct rq_flags rf;
@@ -6543,13 +6547,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 rq = cpu_rq(cpu);
 prev = rq->curr;
-schedule_debug(prev, !!sched_mode);
+schedule_debug(prev, preempt);
 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
 hrtick_clear(rq);
 local_irq_disable();
-rcu_note_context_switch(!!sched_mode);
+rcu_note_context_switch(preempt);
 /*
 * Make sure that signal_pending_state()->signal_pending() below
@@ -6578,12 +6582,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 switch_count = &prev->nivcsw;
+/* Task state changes only considers SM_PREEMPT as preemption */
+preempt = sched_mode == SM_PREEMPT;
 /*
 * We must load prev->state once (task_struct::state is volatile), such
 * that we form a control dependency vs deactivate_task() below.
 */
 prev_state = READ_ONCE(prev->__state);
-if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+if (sched_mode == SM_IDLE) {
+	if (!rq->nr_running) {
+		next = prev;
+		goto picked;
+	}
+} else if (!preempt && prev_state) {
 if (signal_pending_state(prev_state, prev)) {
 WRITE_ONCE(prev->__state, TASK_RUNNING);
 } else {
@@ -6614,6 +6626,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 }
 next = pick_next_task(rq, prev, &rf);
+picked:
 clear_tsk_need_resched(prev);
 clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
@@ -6655,7 +6668,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 psi_account_irqtime(rq, prev, next);
 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
-trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+trace_sched_switch(preempt, prev, next, prev_state);
 /* Also unlocks the rq: */
 rq = context_switch(rq, prev, next, &rf);
@@ -6731,7 +6744,7 @@ static void sched_update_worker(struct task_struct *tsk)
 }
 }
-static __always_inline void __schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(int sched_mode)
 {
 do {
 preempt_disable();
@@ -6776,7 +6789,7 @@ void __sched schedule_idle(void)
 */
 WARN_ON_ONCE(current->__state);
 do {
-__schedule(SM_NONE);
+__schedule(SM_IDLE);
 } while (need_resched());
 }
...
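
The SM_* rework in the hunks above drops the SM_MASK_PREEMPT bit test in
favour of plain integer comparisons: anything greater than SM_NONE is treated
as a preemption by schedule_debug() and RCU, while only SM_PREEMPT counts for
the task-state handling. A small illustrative sketch (not kernel code) of how
the new constants fall out:

    #include <stdbool.h>
    #include <stdio.h>

    /* Same values as the new constants used by __schedule(). */
    #define SM_IDLE        (-1)
    #define SM_NONE        0
    #define SM_PREEMPT     1
    #define SM_RTLOCK_WAIT 2

    int main(void)
    {
            for (int mode = SM_IDLE; mode <= SM_RTLOCK_WAIT; mode++) {
                    /* What schedule_debug() and RCU are told. */
                    bool preempt_for_rcu = mode > SM_NONE;
                    /* What the prev->__state handling considers a preemption. */
                    bool preempt_for_state = (mode == SM_PREEMPT);

                    printf("mode=%2d  rcu/debug=%d  task-state=%d\n",
                           mode, preempt_for_rcu, preempt_for_state);
            }
            return 0;
    }
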
@@ -662,9 +662,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 * Fake (unused) bandwidth; workaround to "fix"
 * priority inheritance.
 */
-.sched_runtime = 1000000,
-.sched_deadline = 10000000,
-.sched_period = 10000000,
+.sched_runtime = NSEC_PER_MSEC,
+.sched_deadline = 10 * NSEC_PER_MSEC,
+.sched_period = 10 * NSEC_PER_MSEC,
 };
 struct cpufreq_policy *policy = sg_policy->policy;
 int ret;
...
@@ -739,7 +739,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 else
 SEQ_printf(m, " %c", task_state_to_char(p));
-SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
+SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
 p->comm, task_pid_nr(p),
 SPLIT_NS(p->se.vruntime),
 entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
@@ -750,17 +750,16 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 (long long)(p->nvcsw + p->nivcsw),
 p->prio);
-SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
+SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
 SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
-SPLIT_NS(p->se.sum_exec_runtime),
 SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
 SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
 #ifdef CONFIG_NUMA_BALANCING
 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
 #endif
 #ifdef CONFIG_CGROUP_SCHED
 SEQ_printf_task_group_path(m, task_group(p), " %s")
 #endif
 SEQ_printf(m, "\n");
@@ -772,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 SEQ_printf(m, "\n");
 SEQ_printf(m, "runnable tasks:\n");
-SEQ_printf(m, " S task PID tree-key switches prio"
-" wait-time sum-exec sum-sleep\n");
+SEQ_printf(m, " S task PID vruntime eligible "
+"deadline slice sum-exec switches "
+"prio wait-time sum-sleep sum-block"
+#ifdef CONFIG_NUMA_BALANCING
+" node group-id"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+" group-path"
+#endif
+"\n");
 SEQ_printf(m, "-------------------------------------------------------"
-"------------------------------------------------------\n");
+"------------------------------------------------------"
+"------------------------------------------------------"
+#ifdef CONFIG_NUMA_BALANCING
+"--------------"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+"--------------"
+#endif
+"\n");
 rcu_read_lock();
 for_each_process_thread(g, p) {
...
@@ -6949,18 +6949,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 int rq_h_nr_running = rq->cfs.h_nr_running;
 u64 slice = 0;
+if (flags & ENQUEUE_DELAYED) {
+	requeue_delayed_entity(se);
+	return;
+}
 /*
 * The code below (indirectly) updates schedutil which looks at
 * the cfs_rq utilization to select a frequency.
 * Let's add the task's estimated utilization to the cfs_rq's
 * estimated utilization, before we update schedutil.
 */
-util_est_enqueue(&rq->cfs, p);
-if (flags & ENQUEUE_DELAYED) {
-	requeue_delayed_entity(se);
-	return;
-}
+if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+	util_est_enqueue(&rq->cfs, p);
 /*
 * If in_iowait is set, the code below may not trigger any cpufreq
@@ -7178,7 +7179,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
-util_est_dequeue(&rq->cfs, p);
+if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+	util_est_dequeue(&rq->cfs, p);
 if (dequeue_entities(rq, &p->se, flags) < 0) {
 util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
@@ -8085,6 +8087,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 return cpu_util(cpu, p, -1, 0);
 }
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ *   cpu_util_{cfs,rt,dl,irq}()
+ *   cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+				 unsigned long *min,
+				 unsigned long *max)
+{
+unsigned long util, irq, scale;
+struct rq *rq = cpu_rq(cpu);
+scale = arch_scale_cpu_capacity(cpu);
+/*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+irq = cpu_util_irq(rq);
+if (unlikely(irq >= scale)) {
+	if (min)
+		*min = scale;
+	if (max)
+		*max = scale;
+	return scale;
+}
+if (min) {
+	/*
+	 * The minimum utilization returns the highest level between:
+	 * - the computed DL bandwidth needed with the IRQ pressure which
+	 *   steals time to the deadline task.
+	 * - The minimum performance requirement for CFS and/or RT.
+	 */
+	*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+	/*
+	 * When an RT task is runnable and uclamp is not used, we must
+	 * ensure that the task will run at maximum compute capacity.
+	 */
+	if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+		*min = max(*min, scale);
+}
+/*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+util = util_cfs + cpu_util_rt(rq);
+util += cpu_util_dl(rq);
+/*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+if (max)
+	*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+if (util >= scale)
+	return scale;
+/*
+ * There is still idle time; further improve the number by using the
+ * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ *              max - irq
+ *   U' = irq + --------- * U
+ *                 max
+ */
+util = scale_irq_capacity(util, irq, scale);
+util += irq;
+return min(scale, util);
+}
+unsigned long sched_cpu_util(int cpu)
+{
+return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
 /*
 * energy_env - Utilization landscape for energy estimation.
 * @task_busy_time: Utilization contribution by the task for which we test the
...
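
The comment block in the relocated effective_cpu_util() above encodes the IRQ
scaling step U' = irq + ((max - irq) / max) * U. A standalone illustrative
sketch with made-up numbers (not kernel code; scale_irq() here is a stand-in
for the kernel's scale_irq_capacity()):

    #include <stdio.h>

    /* Stand-in for scale_irq_capacity(): scale util by the non-IRQ fraction. */
    static unsigned long scale_irq(unsigned long util, unsigned long irq,
                                   unsigned long max)
    {
            util *= (max - irq);
            util /= max;
            return util;
    }

    int main(void)
    {
            unsigned long max  = 1024;  /* arch_scale_cpu_capacity() */
            unsigned long irq  = 128;   /* cpu_util_irq()            */
            unsigned long util = 512;   /* cfs + rt + dl utilization */

            /* U' = irq + (max - irq) / max * U = 128 + 896/1024 * 512 = 576 */
            unsigned long uprime = scale_irq(util, irq, max) + irq;

            printf("U' = %lu (clamped to at most %lu by the caller)\n",
                   uprime, max);
            return 0;
    }
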
@@ -272,110 +272,12 @@ bool update_other_load_avgs(struct rq *rq)
 lockdep_assert_rq_held(rq);
-/* hw_pressure doesn't care about invariance */
 return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
-update_hw_load_avg(now, rq, hw_pressure) |
+update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
 update_irq_load_avg(rq, 0);
 }
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- *   cpu_util_{cfs,rt,dl,irq}()
- *   cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the IRQ utilization.
- *
- * The DL bandwidth number OTOH is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-				 unsigned long *min,
-				 unsigned long *max)
-{
-unsigned long util, irq, scale;
-struct rq *rq = cpu_rq(cpu);
-scale = arch_scale_cpu_capacity(cpu);
-/*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
-irq = cpu_util_irq(rq);
-if (unlikely(irq >= scale)) {
-	if (min)
-		*min = scale;
-	if (max)
-		*max = scale;
-	return scale;
-}
-if (min) {
-	/*
-	 * The minimum utilization returns the highest level between:
-	 * - the computed DL bandwidth needed with the IRQ pressure which
-	 *   steals time to the deadline task.
-	 * - The minimum performance requirement for CFS and/or RT.
-	 */
-	*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-	/*
-	 * When an RT task is runnable and uclamp is not used, we must
-	 * ensure that the task will run at maximum compute capacity.
-	 */
-	if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
-		*min = max(*min, scale);
-}
-/*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- */
-util = util_cfs + cpu_util_rt(rq);
-util += cpu_util_dl(rq);
-/*
- * The maximum hint is a soft bandwidth requirement, which can be lower
- * than the actual utilization because of uclamp_max requirements.
- */
-if (max)
-	*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-if (util >= scale)
-	return scale;
-/*
- * There is still idle time; further improve the number by using the
- * IRQ metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- *              max - irq
- *   U' = irq + --------- * U
- *                 max
- */
-util = scale_irq_capacity(util, irq, scale);
-util += irq;
-return min(scale, util);
-}
-unsigned long sched_cpu_util(int cpu)
-{
-return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
-}
 #endif /* CONFIG_SMP */
 /**
...