Commit dad1c12e authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Remove the unused per rq load array and all its infrastructure, by
   Dietmar Eggemann.

 - Add utilization clamping support by Patrick Bellasi. This is a
   refinement of the energy aware scheduling framework with support for
   boosting of interactive and capping of background workloads: to make
   sure critical GUI threads get maximum frequency ASAP, and to make
   sure background processing doesn't unnecessarily move the cpufreq
   governor to higher frequencies and less energy-efficient CPU modes.

 - Add the bare minimum of tracepoints required for LISA EAS regression
   testing, by Qais Yousef - which allows automated testing of various
   power management features, including energy aware scheduling.

 - Restructure the former tsk_nr_cpus_allowed() facility that the -rt
   kernel used to modify the scheduler's CPU affinity logic such as
   migrate_disable() - introduce the task->cpus_ptr value instead of
   taking the address of &task->cpus_allowed directly - by Sebastian
   Andrzej Siewior.

 - Misc optimizations, fixes, cleanups and small enhancements - see the
   Git log for details.

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  sched/uclamp: Add uclamp support to energy_compute()
  sched/uclamp: Add uclamp_util_with()
  sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks
  sched/uclamp: Set default clamps for RT tasks
  sched/uclamp: Reset uclamp values on RESET_ON_FORK
  sched/uclamp: Extend sched_setattr() to support utilization clamping
  sched/core: Allow sched_setattr() to use the current policy
  sched/uclamp: Add system default clamps
  sched/uclamp: Enforce last task's UCLAMP_MAX
  sched/uclamp: Add bucket local max tracking
  sched/uclamp: Add CPU's clamp buckets refcounting
  sched/fair: Rename weighted_cpuload() to cpu_runnable_load()
  sched/debug: Export the newly added tracepoints
  sched/debug: Add sched_overutilized tracepoint
  sched/debug: Add new tracepoint to track PELT at se level
  sched/debug: Add new tracepoints to track PELT at rq level
  sched/debug: Add a new sched_trace_*() helper functions
  sched/autogroup: Make autogroup_path() always available
  sched/wait: Deduplicate code with do-while
  sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity()
  ...
parents 090bc5a2 af24bde8
......@@ -20,7 +20,8 @@ void calc_runnable_avg_yN_inv(void)
int i;
unsigned int x;
printf("static const u32 runnable_avg_yN_inv[] = {");
/* To silence -Wunused-but-set-variable warnings. */
printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {");
for (i = 0; i < HALFLIFE; i++) {
x = ((1UL<<32)-1)*pow(y, i);
......
......@@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu)
topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity);
pr_info("CPU%u: update cpu_capacity %lu\n",
cpu, topology_get_cpu_scale(NULL, cpu));
cpu, topology_get_cpu_scale(cpu));
}
#else
......
......@@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
ti->cpu = cpu;
p->stack = ti;
p->state = TASK_UNINTERRUPTIBLE;
cpumask_set_cpu(cpu, &p->cpus_allowed);
cpumask_set_cpu(cpu, &p->cpus_mask);
INIT_LIST_HEAD(&p->tasks);
p->parent = p->real_parent = p->group_leader = p;
INIT_LIST_HEAD(&p->children);
......
......@@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
* inline to try to keep the overhead down. If we have been forced to run on
* a "CPU" with an FPU because of a previous high level of FP computation,
* but did not actually use the FPU during the most recent time-slice (CU1
* isn't set), we undo the restriction on cpus_allowed.
* isn't set), we undo the restriction on cpus_mask.
*
* We're not calling set_cpus_allowed() here, because we have no need to
* force prompt migration - we're already switching the current CPU to a
......@@ -57,7 +57,7 @@ do { \
test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
(!(KSTK_STATUS(prev) & ST0_CU1))) { \
clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
prev->cpus_allowed = prev->thread.user_cpus_allowed; \
prev->cpus_mask = prev->thread.user_cpus_allowed; \
} \
next->thread.emulated_fp = 0; \
} while(0)
......
......@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
if (retval)
goto out_unlock;
cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
cpumask_and(&mask, &allowed, cpu_active_mask);
out_unlock:
......
......@@ -891,12 +891,12 @@ static void mt_ase_fp_affinity(void)
* restricted the allowed set to exclude any CPUs with FPUs,
* we'll skip the procedure.
*/
if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
cpumask_t tmask;
current->thread.user_cpus_allowed
= current->cpus_allowed;
cpumask_and(&tmask, &current->cpus_allowed,
= current->cpus_mask;
cpumask_and(&tmask, &current->cpus_mask,
&mt_fpu_cpumask);
set_cpus_allowed_ptr(current, &tmask);
set_thread_flag(TIF_FPUBOUND);
......
......@@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
* runqueue. The context will be rescheduled on the proper node
* if it is timesliced or preempted.
*/
cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
/* Save the current cpu id for spu interrupt routing. */
ctx->last_ran = raw_smp_processor_id();
......
......@@ -1503,7 +1503,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
* may be scheduled elsewhere and invalidate entries in the
* pseudo-locked region.
*/
if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
mutex_unlock(&rdtgroup_mutex);
return -EINVAL;
}
......
......@@ -43,7 +43,7 @@ static ssize_t cpu_capacity_show(struct device *dev,
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id));
return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
}
static void update_topology_flags_workfn(struct work_struct *work);
......@@ -116,7 +116,7 @@ void topology_normalize_cpu_scale(void)
/ capacity_scale;
topology_set_cpu_scale(cpu, capacity);
pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
cpu, topology_get_cpu_scale(NULL, cpu));
cpu, topology_get_cpu_scale(cpu));
}
}
......@@ -185,7 +185,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
for_each_cpu(cpu, policy->related_cpus) {
raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) *
raw_capacity[cpu] = topology_get_cpu_scale(cpu) *
policy->cpuinfo.max_freq / 1000UL;
capacity_scale = max(raw_capacity[cpu], capacity_scale);
}
......
......@@ -1038,7 +1038,7 @@ int hfi1_get_proc_affinity(int node)
struct hfi1_affinity_node *entry;
cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
const struct cpumask *node_mask,
*proc_mask = &current->cpus_allowed;
*proc_mask = current->cpus_ptr;
struct hfi1_affinity_node_list *affinity = &node_affinity;
struct cpu_mask_set *set = &affinity->proc;
......@@ -1046,7 +1046,7 @@ int hfi1_get_proc_affinity(int node)
* check whether process/context affinity has already
* been set
*/
if (cpumask_weight(proc_mask) == 1) {
if (current->nr_cpus_allowed == 1) {
hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
current->pid, current->comm,
cpumask_pr_args(proc_mask));
......@@ -1057,7 +1057,7 @@ int hfi1_get_proc_affinity(int node)
cpu = cpumask_first(proc_mask);
cpumask_set_cpu(cpu, &set->used);
goto done;
} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
current->pid, current->comm,
cpumask_pr_args(proc_mask));
......
......@@ -869,14 +869,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
{
struct sdma_rht_node *rht_node;
struct sdma_engine *sde = NULL;
const struct cpumask *current_mask = &current->cpus_allowed;
unsigned long cpu_id;
/*
* To ensure that always the same sdma engine(s) will be
* selected make sure the process is pinned to this CPU only.
*/
if (cpumask_weight(current_mask) != 1)
if (current->nr_cpus_allowed != 1)
goto out;
cpu_id = smp_processor_id();
......
......@@ -1142,7 +1142,7 @@ static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt)
static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
{
struct qib_filedata *fd = fp->private_data;
const unsigned int weight = cpumask_weight(&current->cpus_allowed);
const unsigned int weight = current->nr_cpus_allowed;
const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
int local_cpu;
......@@ -1623,9 +1623,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
ret = find_free_ctxt(i_minor - 1, fp, uinfo);
else {
int unit;
const unsigned int cpu = cpumask_first(&current->cpus_allowed);
const unsigned int weight =
cpumask_weight(&current->cpus_allowed);
const unsigned int cpu = cpumask_first(current->cpus_ptr);
const unsigned int weight = current->nr_cpus_allowed;
if (weight == 1 && !test_bit(cpu, qib_cpulist))
if (!find_hca(cpu, &unit) && unit >= 0)
......
......@@ -381,9 +381,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Cpus_allowed:\t%*pb\n",
cpumask_pr_args(&task->cpus_allowed));
cpumask_pr_args(task->cpus_ptr));
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
cpumask_pr_args(&task->cpus_allowed));
cpumask_pr_args(task->cpus_ptr));
}
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
......
......@@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_scale);
struct sched_domain;
static inline
unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
unsigned long topology_get_cpu_scale(int cpu)
{
return per_cpu(cpu_scale, cpu);
}
......
......@@ -89,7 +89,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
* like schedutil.
*/
cpu = cpumask_first(to_cpumask(pd->cpus));
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(cpu);
cs = &pd->table[pd->nr_cap_states - 1];
freq = map_util_freq(max_util, cs->frequency, scale_cpu);
......
......@@ -220,4 +220,38 @@ int __order_base_2(unsigned long n)
ilog2((n) - 1) + 1) : \
__order_base_2(n) \
)
/*
 * Runtime helper behind bits_per(): returns the number of bits needed
 * to represent @n. Values below 2 are reported as needing one bit.
 */
static inline __attribute__((const))
int __bits_per(unsigned long n)
{
	int order;

	if (n < 2)
		return 1;

	order = order_base_2(n);

	/* An exact power of two needs one bit more than order_base_2() reports. */
	return is_power_of_2(n) ? order + 1 : order;
}
/**
* bits_per - calculate the number of bits required for the argument
* @n: parameter
*
* This is constant-capable and can be used for compile time
* initializations, e.g. bitfields. Non-constant arguments are handed
* to __bits_per().
*
* The first few values calculated by this routine:
* bits_per(0) = 1
* bits_per(1) = 1
* bits_per(2) = 2
* bits_per(3) = 2
* bits_per(4) = 3
* ... and so on.
*/
#define bits_per(n) \
( \
__builtin_constant_p(n) ? ( \
((n) == 0 || (n) == 1) \
? 1 : ilog2(n) + 1 \
) : \
__bits_per(n) \
)
#endif /* _LINUX_LOG2_H */
......@@ -35,6 +35,7 @@ struct audit_context;
struct backing_dev_info;
struct bio_list;
struct blk_plug;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
......@@ -47,8 +48,9 @@ struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct capture_control;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_param;
struct seq_file;
......@@ -281,6 +283,18 @@ struct vtime {
u64 gtime;
};
/*
* Utilization clamp constraints.
* @UCLAMP_MIN: Minimum utilization
* @UCLAMP_MAX: Maximum utilization
* @UCLAMP_CNT: Utilization clamp constraints count
*
* UCLAMP_CNT is not a constraint itself: it is the number of clamp IDs
* and is used as an array size (e.g. uclamp_req[UCLAMP_CNT] and
* uclamp[UCLAMP_CNT] in struct task_struct).
*/
enum uclamp_id {
UCLAMP_MIN = 0,
UCLAMP_MAX,
UCLAMP_CNT
};
struct sched_info {
#ifdef CONFIG_SCHED_INFO
/* Cumulative counters: */
......@@ -312,6 +326,10 @@ struct sched_info {
# define SCHED_FIXEDPOINT_SHIFT 10
# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
/* Increase resolution of cpu_capacity calculations */
# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
struct load_weight {
unsigned long weight;
u32 inv_weight;
......@@ -560,6 +578,41 @@ struct sched_dl_entity {
struct hrtimer inactive_timer;
};
#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
/*
* Utilization clamp for a scheduling entity
* @value: clamp value "assigned" to a se
* @bucket_id: bucket index corresponding to the "assigned" value
* @active: the se is currently refcounted in a rq's bucket
* @user_defined: the requested clamp value comes from user-space
*
* The bucket_id is the index of the clamp bucket matching the clamp value
* which is pre-computed and stored to avoid expensive integer divisions from
* the fast path.
*
* The active bit is set whenever a task has got an "effective" value assigned,
* which can be different from the clamp value "requested" from user-space.
* This allows to know a task is refcounted in the rq's bucket corresponding
* to the "effective" bucket_id.
*
* The user_defined bit is set whenever a task has got a task-specific clamp
* value requested from userspace, i.e. the system defaults apply to this task
* just as a restriction. This allows to relax default clamps when a less
* restrictive task-specific value has been requested, thus allowing to
* implement a "nice" semantic. For example, a task running with a 20%
* default boost can still drop its own boosting to 0%.
*/
struct uclamp_se {
/* Sized by bits_per() so that SCHED_CAPACITY_SCALE itself fits */
unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
/* Sized by bits_per() from the configured number of clamp buckets */
unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
/* Single-bit flags; see the @active and @user_defined notes above */
unsigned int active : 1;
unsigned int user_defined : 1;
};
#endif /* CONFIG_UCLAMP_TASK */
union rcu_special {
struct {
u8 blocked;
......@@ -640,6 +693,13 @@ struct task_struct {
#endif
struct sched_dl_entity dl;
#ifdef CONFIG_UCLAMP_TASK
/* Clamp values requested for a scheduling entity */
struct uclamp_se uclamp_req[UCLAMP_CNT];
/* Effective clamp values used for a scheduling entity */
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
......@@ -651,7 +711,8 @@ struct task_struct {
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t cpus_mask;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
......@@ -1399,7 +1460,7 @@ extern struct pid *cad_pid;
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
......@@ -1915,4 +1976,16 @@ static inline void rseq_syscall(struct pt_regs *regs)
#endif
const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
int sched_trace_rq_cpu(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
#endif
......@@ -6,14 +6,6 @@
* This is the interface between the scheduler and nohz/dynticks:
*/
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void cpu_load_update_nohz_start(void);
extern void cpu_load_update_nohz_stop(void);
#else
static inline void cpu_load_update_nohz_start(void) { }
static inline void cpu_load_update_nohz_stop(void) { }
#endif
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu);
extern int get_nohz_timer_target(void);
......
......@@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
#ifdef CONFIG_UCLAMP_TASK
extern unsigned int sysctl_sched_uclamp_util_min;
extern unsigned int sysctl_sched_uclamp_util_max;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
#endif
......@@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
#ifdef CONFIG_UCLAMP_TASK
extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
#endif
extern int sysctl_numa_balancing(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
......
......@@ -6,12 +6,6 @@
#include <linux/sched/idle.h>
/*
* Increase resolution of cpu_capacity calculations
*/
#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
/*
* sched-domains (multiprocessor balancing) declarations:
*/
......@@ -84,11 +78,6 @@ struct sched_domain {
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx;
unsigned int idle_idx;
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
......@@ -201,14 +190,6 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
# define SD_INIT_NAME(type)
#endif
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
#else /* CONFIG_SMP */
struct sched_domain_attr;
......@@ -224,16 +205,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
return true;
}
#endif /* !CONFIG_SMP */
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
unsigned long arch_scale_cpu_capacity(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
#endif /* !CONFIG_SMP */
static inline int task_node(const struct task_struct *p)
{
return cpu_to_node(task_cpu(p));
......
......@@ -594,6 +594,37 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
TP_printk("cpu=%d", __entry->cpu)
);
/*
* Following tracepoints are not exported in tracefs and provide hooking
* mechanisms only for testing and debugging purposes.
*
* Postfixed with _tp to make them easily identifiable in the code.
*/
DECLARE_TRACE(pelt_cfs_tp,
TP_PROTO(struct cfs_rq *cfs_rq),
TP_ARGS(cfs_rq));
DECLARE_TRACE(pelt_rt_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
DECLARE_TRACE(pelt_dl_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
DECLARE_TRACE(pelt_irq_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
DECLARE_TRACE(pelt_se_tp,
TP_PROTO(struct sched_entity *se),
TP_ARGS(se));
DECLARE_TRACE(sched_overutilized_tp,
TP_PROTO(struct root_domain *rd, bool overutilized),
TP_ARGS(rd, overutilized));
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
......
......@@ -51,9 +51,21 @@
#define SCHED_FLAG_RESET_ON_FORK 0x01
#define SCHED_FLAG_RECLAIM 0x02
#define SCHED_FLAG_DL_OVERRUN 0x04
#define SCHED_FLAG_KEEP_POLICY 0x08
#define SCHED_FLAG_KEEP_PARAMS 0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
SCHED_FLAG_KEEP_PARAMS)
#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \
SCHED_FLAG_UTIL_CLAMP_MAX)
#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \
SCHED_FLAG_RECLAIM | \
SCHED_FLAG_DL_OVERRUN)
SCHED_FLAG_DL_OVERRUN | \
SCHED_FLAG_KEEP_ALL | \
SCHED_FLAG_UTIL_CLAMP)
#endif /* _UAPI_LINUX_SCHED_H */
......@@ -9,6 +9,7 @@ struct sched_param {
};
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
/*
* Extended scheduling parameters data structure.
......@@ -21,8 +22,33 @@ struct sched_param {
* the tasks may be useful for a wide variety of application fields, e.g.,
* multimedia, streaming, automation and control, and many others.
*
* This variant (sched_attr) is meant at describing a so-called
* sporadic time-constrained task. In such model a task is specified by:
* This variant (sched_attr) allows to define additional attributes to
* improve the scheduler knowledge about task requirements.
*
* Scheduling Class Attributes
* ===========================
*
* A subset of sched_attr attributes specifies the
* scheduling policy and relative POSIX attributes:
*
* @size size of the structure, for fwd/bwd compat.
*
* @sched_policy task's scheduling policy
* @sched_nice task's nice value (SCHED_NORMAL/BATCH)
* @sched_priority task's static priority (SCHED_FIFO/RR)
*
* Certain more advanced scheduling features can be controlled by a
* predefined set of flags via the attribute:
*
* @sched_flags for customizing the scheduler behaviour
*
* Sporadic Time-Constrained Task Attributes
* =========================================
*
* A subset of sched_attr attributes allows to describe a so-called
* sporadic time-constrained task.
*
* In such a model a task is specified by:
* - the activation period or minimum instance inter-arrival time;
* - the maximum (or average, depending on the actual scheduling
* discipline) computation time of all instances, a.k.a. runtime;
......@@ -34,14 +60,8 @@ struct sched_param {
* than the runtime and must be completed by time instant t equal to
* the instance activation time + the deadline.
*
* This is reflected by the actual fields of the sched_attr structure:
* This is reflected by the following fields of the sched_attr structure:
*
* @size size of the structure, for fwd/bwd compat.
*
* @sched_policy task's scheduling policy
* @sched_flags for customizing the scheduler behaviour
* @sched_nice task's nice value (SCHED_NORMAL/BATCH)
* @sched_priority task's static priority (SCHED_FIFO/RR)
* @sched_deadline representative of the task's deadline
* @sched_runtime representative of the task's runtime
* @sched_period representative of the task's period
......@@ -53,6 +73,29 @@ struct sched_param {
* As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
* only user of this new interface. More information about the algorithm
* available in the scheduling class file or in Documentation/.
*
* Task Utilization Attributes
* ===========================
*
* A subset of sched_attr attributes allows to specify the utilization
* expected for a task. These attributes allow to inform the scheduler about
* the utilization boundaries within which it should schedule the task. These
* boundaries are valuable hints to support scheduler decisions on both task
* placement and frequency selection.
*
* @sched_util_min represents the minimum utilization
* @sched_util_max represents the maximum utilization
*
* Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It
* represents the percentage of CPU time used by a task when running at the
* maximum frequency on the highest capacity CPU of the system. For example, a
* 20% utilization task is a task running for 2ms every 10ms at maximum
* frequency.
*
* A task with a min utilization value bigger than 0 is more likely scheduled
* on a CPU with a capacity big enough to fit the specified value.
* A task with a max utilization value smaller than 1024 is more likely
* scheduled on a CPU with no more capacity than the specified value.
*/
struct sched_attr {
__u32 size;
......@@ -70,6 +113,11 @@ struct sched_attr {
__u64 sched_runtime;
__u64 sched_deadline;
__u64 sched_period;
/* Utilization hints */
__u32 sched_util_min;
__u32 sched_util_max;
};
#endif /* _UAPI_LINUX_SCHED_TYPES_H */
......@@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK
config GENERIC_SCHED_CLOCK
bool
menu "Scheduler features"
config UCLAMP_TASK
bool "Enable utilization clamping for RT/FAIR tasks"
depends on CPU_FREQ_GOV_SCHEDUTIL
help
This feature enables the scheduler to track the clamped utilization
of each CPU based on RUNNABLE tasks scheduled on that CPU.
With this option, the user can specify the min and max CPU
utilization allowed for RUNNABLE tasks. The max utilization defines
the maximum frequency a task should use while the min utilization
defines the minimum frequency it should use.
Both min and max utilization clamp values are hints to the scheduler,
aiming at improving its frequency selection policy, but they do not
enforce or grant any specific bandwidth for tasks.
If in doubt, say N.
config UCLAMP_BUCKETS_COUNT
int "Number of supported utilization clamp buckets"
range 5 20
default 5
depends on UCLAMP_TASK
help
Defines the number of clamp buckets to use. The range of each bucket
will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
number of clamp buckets the finer their granularity and the higher
the precision of clamping aggregation and tracking at run-time.
For example, with the minimum configuration value we will have 5
clamp buckets tracking 20% utilization each. A 25% boosted task will
be refcounted in the [20..39]% bucket and will set the bucket clamp
effective value to 25%.
If a second 30% boosted task should be co-scheduled on the same CPU,
that task will be refcounted in the same bucket of the first task and
it will boost the bucket clamp effective value to 30%.
The clamp effective value of a bucket is reset to its nominal value
(20% in the example above) when there are no more tasks refcounted in
that bucket.
An additional boost/capping margin can be added to some tasks. In the
example above the 25% task will be boosted to 30% until it exits the
CPU. If that should be considered not acceptable on certain systems,
it's always possible to reduce the margin by increasing the number of
clamp buckets to trade off used memory for run-time tracking
precision.
If in doubt, use the default value.
endmenu
#
# For architectures that want to enable the support for NUMA-affine scheduler
# balancing logic:
......
......@@ -72,7 +72,8 @@ struct task_struct init_task
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
.cpus_allowed = CPU_MASK_ALL,
.cpus_ptr = &init_task.cpus_mask,
.cpus_mask = CPU_MASK_ALL,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
......
......@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
if (task_css_is_root(task, cpuset_cgrp_id))
return;
set_cpus_allowed_ptr(task, &current->cpus_allowed);
set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
}
......
......@@ -898,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
/*
* One for us, one for whoever does the "release_task()" (usually
......
......@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
* All CPUs of a domain must have the same micro-architecture
* since they all share the same table.
*/
cap = arch_scale_cpu_capacity(NULL, cpu);
cap = arch_scale_cpu_capacity(cpu);
if (prev_cap && prev_cap != cap) {
pr_err("CPUs of %*pbl must have the same capacity\n",
cpumask_pr_args(span));
......
......@@ -259,7 +259,6 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
}
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SCHED_DEBUG
int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
if (!task_group_is_autogroup(tg))
......@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif
This diff is collapsed.
......@@ -120,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
......
......@@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type)
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type,
struct task_struct *p)
{
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
}
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
......@@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*
* CFS and RT utilization can be boosted or capped, depending on
* utilization clamp constraints requested by currently RUNNABLE
* tasks.
* When there are no CFS RUNNABLE tasks, clamps are released and
* frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs;
util += cpu_util_rt(rq);
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
util = uclamp_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
......@@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util = cpu_util_cfs(rq);
unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}
/**
......
......@@ -94,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
* We have to ensure that we have at least one bit
......
......@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online CPU:
*/
cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
if (cpu >= nr_cpu_ids) {
/*
* Failed to find any suitable CPU.
......@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
&curr->dl);
} else {
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
scaled_delta_exec = cap_scale(delta_exec, scale_freq);
scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
......@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_allowed))
cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
}
......@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
!cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
......
......@@ -233,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
*tablep = NULL;
}
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
umode_t mode, proc_handler *proc_handler,
bool load_idx)
umode_t mode, proc_handler *proc_handler)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
if (load_idx) {
entry->extra1 = &min_load_idx;
entry->extra2 = &max_load_idx;
}
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
struct ctl_table *table = sd_alloc_ctl_entry(14);
struct ctl_table *table = sd_alloc_ctl_entry(9);
if (table == NULL)
return NULL;
set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
return table;
}
......@@ -653,8 +639,6 @@ do { \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
P(nr_running);
SEQ_printf(m, " .%-30s: %lu\n", "load",
rq->load.weight);
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
......@@ -662,11 +646,6 @@ do { \
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
PN(clock_task);
P(cpu_load[0]);
P(cpu_load[1]);
P(cpu_load[2]);
P(cpu_load[3]);
P(cpu_load[4]);
#undef P
#undef PN
......
This diff is collapsed.
......@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, false)
/*
* Decrement CPU capacity based on time not spent running tasks
......
......@@ -28,6 +28,8 @@
#include "sched.h"
#include "pelt.h"
#include <trace/events/sched.h>
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
......@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
trace_pelt_se_tp(se);
return 1;
}
......@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
cfs_se_util_change(&se->avg);
trace_pelt_se_tp(se);
return 1;
}
......@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1, 1);
trace_pelt_cfs_tp(cfs_rq);
return 1;
}
......@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
running)) {
___update_load_avg(&rq->avg_rt, 1, 1);
trace_pelt_rt_tp(rq);
return 1;
}
......@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
running)) {
___update_load_avg(&rq->avg_dl, 1, 1);
trace_pelt_dl_tp(rq);
return 1;
}
......@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
* reflect the real amount of computation
*/
running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
/*
* We know the time that has been used by interrupt since last update
......@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running)
1,
1);
if (ret)
if (ret) {
___update_load_avg(&rq->avg_irq, 1, 1);
trace_pelt_irq_tp(rq);
}
return ret;
}
......
......@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
* Scale the elapsed time to reflect the real amount of
* computation
*/
delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
rq->clock_pelt += delta;
......
......@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_allowed))
cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
......@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
!cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
......@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
};
#ifdef CONFIG_RT_GROUP_SCHED
......
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
static const u32 runnable_avg_yN_inv[] = {
static const u32 runnable_avg_yN_inv[] __maybe_unused = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
......
......@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
#else
static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
......@@ -344,8 +338,10 @@ struct cfs_bandwidth {
u64 runtime_expires;
int expires_seq;
short idle;
short period_active;
u8 idle;
u8 period_active;
u8 distribute_running;
u8 slack_started;
struct hrtimer period_timer;
struct hrtimer slack_timer;
struct list_head throttled_cfs_rq;
......@@ -354,8 +350,6 @@ struct cfs_bandwidth {
int nr_periods;
int nr_throttled;
u64 throttled_time;
bool distribute_running;
#endif
};
......@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
/*
* struct uclamp_bucket - Utilization clamp bucket
* @value: utilization clamp value for tasks on this clamp bucket
* @tasks: number of RUNNABLE tasks on this clamp bucket
*
* Keep track of how many tasks are RUNNABLE for a given utilization
* clamp value.
*/
struct uclamp_bucket {
/* Clamp value for this bucket; bit-field of bits_per(SCHED_CAPACITY_SCALE) bits. */
unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
/* RUNNABLE task count; occupies the remaining bits so both fields total BITS_PER_LONG. */
unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
};
/*
* struct uclamp_rq - rq's utilization clamp
* @value: currently active clamp values for a rq
* @bucket: utilization clamp buckets affecting a rq
*
* Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
* A clamp value is affecting a rq when there is at least one task RUNNABLE
* (or actually running) with that value.
*
* There are up to UCLAMP_CNT possible different clamp values, currently there
* are only two: minimum utilization and maximum utilization.
*
* All utilization clamping values are MAX aggregated, since:
* - for util_min: we want to run the CPU at least at the max of the minimum
* utilization required by its currently RUNNABLE tasks.
* - for util_max: we want to allow the CPU to run up to the max of the
* maximum utilization allowed by its currently RUNNABLE tasks.
*
* Since on each system we expect only a limited number of different
* utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
* the metrics required to compute all the per-rq utilization clamp values.
*/
struct uclamp_rq {
/* Currently active clamp value for the rq. */
unsigned int value;
/* One bucket per tracked clamp value, UCLAMP_BUCKETS entries in total. */
struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
#endif /* CONFIG_UCLAMP_TASK */
/*
* This is the main, per-CPU runqueue data structure.
*
......@@ -818,8 +854,6 @@ struct rq {
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
......@@ -830,11 +864,16 @@ struct rq {
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
/* capture load from *all* tasks on this CPU: */
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
/* Utilization clamp values based on CPU's RUNNABLE tasks */
struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
unsigned int uclamp_flags;
#define UCLAMP_FLAG_IDLE 0x01
#endif
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
......@@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40];
struct sched_class {
const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
......@@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
/*
 * uclamp_util_with - clamp a utilization value with rq and task clamps
 * @rq:   runqueue whose UCLAMP_MIN/UCLAMP_MAX values are read
 * @util: utilization value to clamp
 * @p:    optional task; when non-NULL its effective clamps are
 *        max-aggregated with the rq's clamp values
 *
 * Returns @util restricted to the [min, max] clamp range, or the min
 * clamp itself when the aggregated range is empty or inverted.
 */
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
			      struct task_struct *p)
{
	unsigned int util_lo = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
	unsigned int util_hi = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);

	if (p) {
		unsigned int task_clamp;

		/* Fold the task's effective clamps into the rq-wide ones. */
		task_clamp = uclamp_eff_value(p, UCLAMP_MIN);
		util_lo = max(util_lo, task_clamp);

		task_clamp = uclamp_eff_value(p, UCLAMP_MAX);
		util_hi = max(util_hi, task_clamp);
	}

	/*
	 * Both bounds are MAX aggregated over RUNNABLE tasks that may carry
	 * _different_ clamps, so the lower bound can end up at or above the
	 * upper one. Resolve such an inversion here, in favour of the lower
	 * bound, at the point where the clamps are applied.
	 */
	if (unlikely(util_lo >= util_hi))
		return util_lo;

	return clamp(util, util_lo, util_hi);
}
/*
 * Clamp @util using @rq's clamp values only: forwards to
 * uclamp_util_with() with a NULL task.
 */
static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
{
return uclamp_util_with(rq, util, NULL);
}
#else /* CONFIG_UCLAMP_TASK */
/* !CONFIG_UCLAMP_TASK stub: no clamping is applied, @util is returned unchanged. */
static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
struct task_struct *p)
{
return util;
}
/* !CONFIG_UCLAMP_TASK stub: no clamping is applied, @util is returned unchanged. */
static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
{
return util;
}
#endif /* CONFIG_UCLAMP_TASK */
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
......@@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu)
}
#endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/**
* enum schedutil_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
......@@ -2253,15 +2337,11 @@ enum schedutil_type {
ENERGY_UTIL,
};
unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type);
static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
{
unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
}
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type,
struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
......@@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type,
struct task_struct *p)
{
return cfs;
return 0;
}
#endif
#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
......
......@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
.imbalance_pct = 125,
.cache_nice_tries = 0,
.busy_idx = 0,
.idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
......@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
sd->busy_idx = 3;
sd->idle_idx = 2;
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
......@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
} else {
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
}
/*
......@@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level
unsigned long cap;
/* Is there any asymmetry? */
cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
for_each_cpu(i, cpu_map) {
if (arch_scale_cpu_capacity(NULL, i) != cap) {
if (arch_scale_cpu_capacity(i) != cap) {
asym = true;
break;
}
......@@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level
* to everyone.
*/
for_each_cpu(i, cpu_map) {
unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
unsigned long max_capacity = arch_scale_cpu_capacity(i);
int tl_id = 0;
for_each_sd_topology(tl) {
......@@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level
for_each_cpu_and(j, tl->mask(i), cpu_map) {
unsigned long capacity;
capacity = arch_scale_cpu_capacity(NULL, j);
capacity = arch_scale_cpu_capacity(j);
if (capacity <= max_capacity)
continue;
......
......@@ -118,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);
spin_lock_irqsave(&wq_head->lock, flags);
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
while (bookmark.flags & WQ_FLAG_BOOKMARK) {
do {
spin_lock_irqsave(&wq_head->lock, flags);
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
} while (bookmark.flags & WQ_FLAG_BOOKMARK);
}
/**
......
......@@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rr_handler,
},
#ifdef CONFIG_UCLAMP_TASK
{
.procname = "sched_util_clamp_min",
.data = &sysctl_sched_uclamp_util_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
{
.procname = "sched_util_clamp_max",
.data = &sysctl_sched_uclamp_util_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
......
......@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
cpu_load_update_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
......@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
cpu_load_update_nohz_stop();
/*
* Clear the timer idle flag, so we avoid IPIs on remote queueing and
......
......@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
* of this thread, then stop migrating for the duration
* of the current test.
*/
if (!cpumask_equal(current_mask, &current->cpus_allowed))
if (!cpumask_equal(current_mask, current->cpus_ptr))
goto disable;
get_online_cpus();
......
......@@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
* Kernel threads bound to a single CPU can safely use
* smp_processor_id():
*/
if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
goto out;
/*
......
......@@ -34,7 +34,7 @@ static void simple_thread_func(int cnt)
/* Silly tracepoints */
trace_foo_bar("hello", cnt, array, random_strings[len],
&current->cpus_allowed);
current->cpus_ptr);
trace_foo_with_template_simple("HELLO", cnt);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment