Commit edaa5ddf authored by Linus Torvalds

Merge tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - reorganize and clean up the SD* flag definitions and add a number of
   sanity checks; the new checks caught quite a few bugs and
   inconsistencies, resulting in a follow-up set of fixes

 - rseq updates, including the new MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
   membarrier command

 - add a new tracepoint to improve CPU capacity tracking

 - improve overloaded SMP system load-balancing behavior

 - tweak SMT balancing

 - energy-aware scheduling updates

 - NUMA balancing improvements

 - deadline scheduler fixes and improvements

 - CPU isolation fixes

 - misc cleanups, simplifications and smaller optimizations

* tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (42 commits)
  sched/deadline: Unthrottle PI boosted threads while enqueuing
  sched/debug: Add new tracepoint to track cpu_capacity
  sched/fair: Tweak pick_next_entity()
  rseq/selftests: Test MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  rseq/selftests,x86_64: Add rseq_offset_deref_addv()
  rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  sched/fair: Use dst group while checking imbalance for NUMA balancer
  sched/fair: Reduce busy load balance interval
  sched/fair: Minimize concurrent LBs between domain level
  sched/fair: Reduce minimal imbalance threshold
  sched/fair: Relax constraint on task's load during load balance
  sched/fair: Remove the force parameter of update_tg_load_avg()
  sched/fair: Fix wrong cpu selecting from isolated domain
  sched: Remove unused inline function uclamp_bucket_base_value()
  sched/rt: Disable RT_RUNTIME_SHARE by default
  sched/deadline: Fix stale throttling on de-/boosted tasks
  sched/numa: Use runnable_avg to classify node
  sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL
  MAINTAINERS: Add myself as SCHED_DEADLINE reviewer
  sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h
  ...
parents 13cb7349 feff2e65
...@@ -15407,6 +15407,7 @@ R: Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL) ...@@ -15407,6 +15407,7 @@ R: Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR) R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH) R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING) R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
L: linux-kernel@vger.kernel.org L: linux-kernel@vger.kernel.org
S: Maintained S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
......
...@@ -177,15 +177,6 @@ static inline void parse_dt_topology(void) {} ...@@ -177,15 +177,6 @@ static inline void parse_dt_topology(void) {}
static inline void update_cpu_capacity(unsigned int cpuid) {} static inline void update_cpu_capacity(unsigned int cpuid) {}
#endif #endif
/*
* The current assumption is that we can power gate each core independently.
* This will be superseded by DT binding once available.
*/
const struct cpumask *cpu_corepower_mask(int cpu)
{
return &cpu_topology[cpu].thread_sibling;
}
/* /*
* store_cpu_topology is called at boot when only one cpu is running * store_cpu_topology is called at boot when only one cpu is running
* and with the mutex cpu_hotplug.lock locked, when several cpus have booted, * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
...@@ -241,20 +232,6 @@ void store_cpu_topology(unsigned int cpuid) ...@@ -241,20 +232,6 @@ void store_cpu_topology(unsigned int cpuid)
update_siblings_masks(cpuid); update_siblings_masks(cpuid);
} }
static inline int cpu_corepower_flags(void)
{
return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
}
static struct sched_domain_topology_level arm_topology[] = {
#ifdef CONFIG_SCHED_MC
{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
/* /*
* init_cpu_topology is called at boot when only one cpu is running * init_cpu_topology is called at boot when only one cpu is running
* which prevent simultaneous write access to cpu_topology array * which prevent simultaneous write access to cpu_topology array
...@@ -265,7 +242,4 @@ void __init init_cpu_topology(void) ...@@ -265,7 +242,4 @@ void __init init_cpu_topology(void)
smp_wmb(); smp_wmb();
parse_dt_topology(); parse_dt_topology();
/* Set scheduler topology descriptor */
set_sched_topology(arm_topology);
} }
...@@ -1491,9 +1491,10 @@ extern struct pid *cad_pid; ...@@ -1491,9 +1491,10 @@ extern struct pid *cad_pid;
/* /*
* Per process flags * Per process flags
*/ */
#define PF_VCPU 0x00000001 /* I'm a virtual CPU */
#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_EXITING 0x00000004 /* Getting shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
#define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */
...@@ -1517,7 +1518,6 @@ extern struct pid *cad_pid; ...@@ -1517,7 +1518,6 @@ extern struct pid *cad_pid;
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
...@@ -2046,6 +2046,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); ...@@ -2046,6 +2046,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
int sched_trace_rq_cpu(struct rq *rq); int sched_trace_rq_cpu(struct rq *rq);
int sched_trace_rq_cpu_capacity(struct rq *rq);
int sched_trace_rq_nr_running(struct rq *rq); int sched_trace_rq_nr_running(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd); const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
......
...@@ -348,10 +348,13 @@ enum { ...@@ -348,10 +348,13 @@ enum {
MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7),
}; };
enum { enum {
MEMBARRIER_FLAG_SYNC_CORE = (1U << 0), MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
MEMBARRIER_FLAG_RSEQ = (1U << 1),
}; };
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sched-domains (multiprocessor balancing) flag declarations.
*/
#ifndef SD_FLAG
# error "Incorrect import of SD flags definitions"
#endif
/*
* Hierarchical metaflags
*
* SHARED_CHILD: These flags are meant to be set from the base domain upwards.
* If a domain has this flag set, all of its children should have it set. This
* is usually because the flag describes some shared resource (all CPUs in that
* domain share the same resource), or because they are tied to a scheduling
* behaviour that we want to disable at some point in the hierarchy for
* scalability reasons.
*
* In those cases it doesn't make sense to have the flag set for a domain but
* not have it in (some of) its children: sched domains ALWAYS span their child
* domains, so operations done with parent domains will cover CPUs in the lower
* child domains.
*
*
* SHARED_PARENT: These flags are meant to be set from the highest domain
* downwards. If a domain has this flag set, all of its parents should have it
* set. This is usually for topology properties that start to appear above a
* certain level (e.g. domain starts spanning CPUs outside of the base CPU's
* socket).
*/
#define SDF_SHARED_CHILD 0x1
#define SDF_SHARED_PARENT 0x2
/*
* Behavioural metaflags
*
* NEEDS_GROUPS: These flags are only relevant if the domain they are set on has
* more than one group. This is usually for balancing flags (load balancing
* involves equalizing a metric between groups), or for flags describing some
* shared resource (which would be shared between groups).
*/
#define SDF_NEEDS_GROUPS 0x4
/*
* Balance when about to become idle
*
* SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Balance on exec
*
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Balance on fork, clone
*
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Balance on wakeup
*
* SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Consider waking task on waking CPU.
*
* SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
*/
SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
/*
* Domain members have different CPU capacities
*
* SHARED_PARENT: Set from the topmost domain down to the first domain where
* asymmetry is detected.
* NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
*/
SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Domain members share CPU capacity (i.e. SMT)
*
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
* CPU capacity.
* NEEDS_GROUPS: Capacity is shared between groups.
*/
SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Domain members share CPU package resources (i.e. caches)
*
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
* the same cache(s).
* NEEDS_GROUPS: Caches are shared between groups.
*/
SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Only a single load balancing instance
*
* SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
* different level upwards, but it doesn't change that if a
* domain has this flag set, then all of its parents need to have
* it too (otherwise the serialization doesn't make sense).
* NEEDS_GROUPS: No point in preserving domain if it has a single group.
*/
SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Place busy tasks earlier in the domain
*
* SHARED_CHILD: Usually set on the SMT level. Technically could be set further
* up, but currently assumed to be set from the base domain
* upwards (see update_top_cache_domain()).
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Prefer to place tasks in a sibling domain
*
* Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
* flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
*
* NEEDS_GROUPS: Load balancing flag.
*/
SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
/*
* sched_groups of this level overlap
*
* SHARED_PARENT: Set for all NUMA levels above NODE.
* NEEDS_GROUPS: Overlaps can only exist with more than one group.
*/
SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Cross-node balancing
*
* SHARED_PARENT: Set for all NUMA levels above NODE.
* NEEDS_GROUPS: No point in preserving domain if it has a single group.
*/
SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
...@@ -11,20 +11,29 @@ ...@@ -11,20 +11,29 @@
*/ */
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */ /* Generate SD flag indexes */
#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */ #define SD_FLAG(name, mflags) __##name,
#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */ enum {
#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */ #include <linux/sched/sd_flags.h>
#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */ __SD_FLAG_CNT,
#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */ };
#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */ #undef SD_FLAG
#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */ /* Generate SD flag bits */
#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */ #define SD_FLAG(name, mflags) name = 1 << __##name,
#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */ enum {
#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */ #include <linux/sched/sd_flags.h>
#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */ };
#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */ #undef SD_FLAG
#define SD_NUMA 0x2000 /* cross-node balancing */
#ifdef CONFIG_SCHED_DEBUG
struct sd_flag_debug {
unsigned int meta_flags;
char *name;
};
extern const struct sd_flag_debug sd_flag_debug[];
#endif
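
For reference, here is roughly what the two includes above expand to, shown for only the first two declarations from sd_flags.h; an illustrative sketch of the X-macro pattern, not the full generated list:

/* First inclusion: dense per-flag indexes. */
enum {
        __SD_BALANCE_NEWIDLE,
        __SD_BALANCE_EXEC,
        /* ... one __SD_* index per SD_FLAG() declaration ... */
        __SD_FLAG_CNT,
};

/* Second inclusion: the flag values themselves. */
enum {
        SD_BALANCE_NEWIDLE = 1 << __SD_BALANCE_NEWIDLE,
        SD_BALANCE_EXEC    = 1 << __SD_BALANCE_EXEC,
        /* ... */
};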
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void) static inline int cpu_smt_flags(void)
......
...@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename, ...@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
const char __user *const __user *argv, const char __user *const __user *argv,
const char __user *const __user *envp, int flags); const char __user *const __user *envp, int flags);
asmlinkage long sys_userfaultfd(int flags); asmlinkage long sys_userfaultfd(int flags);
asmlinkage long sys_membarrier(int cmd, int flags); asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in, asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
int fd_out, loff_t __user *off_out, int fd_out, loff_t __user *off_out,
......
...@@ -630,6 +630,10 @@ DECLARE_TRACE(pelt_se_tp, ...@@ -630,6 +630,10 @@ DECLARE_TRACE(pelt_se_tp,
TP_PROTO(struct sched_entity *se), TP_PROTO(struct sched_entity *se),
TP_ARGS(se)); TP_ARGS(se));
DECLARE_TRACE(sched_cpu_capacity_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
DECLARE_TRACE(sched_overutilized_tp, DECLARE_TRACE(sched_overutilized_tp,
TP_PROTO(struct root_domain *rd, bool overutilized), TP_PROTO(struct root_domain *rd, bool overutilized),
TP_ARGS(rd, overutilized)); TP_ARGS(rd, overutilized));
......
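
A minimal sketch of how an out-of-tree module might hook the new bare tracepoint, assuming the usual register/unregister helpers generated by DECLARE_TRACE and the exported sched_trace_rq_*() accessors added elsewhere in this series; the module and probe names are illustrative only:

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/events/sched.h>

/* Probe: only touches rq through the exported accessor helpers. */
static void capacity_probe(void *data, struct rq *rq)
{
        trace_printk("cpu=%d capacity=%d\n",
                     sched_trace_rq_cpu(rq),
                     sched_trace_rq_cpu_capacity(rq));
}

static int __init capacity_probe_init(void)
{
        return register_trace_sched_cpu_capacity_tp(capacity_probe, NULL);
}

static void __exit capacity_probe_exit(void)
{
        unregister_trace_sched_cpu_capacity_tp(capacity_probe, NULL);
        tracepoint_synchronize_unregister();
}

module_init(capacity_probe_init);
module_exit(capacity_probe_exit);
MODULE_LICENSE("GPL");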
...@@ -114,6 +114,26 @@ ...@@ -114,6 +114,26 @@
* If this command is not implemented by an * If this command is not implemented by an
* architecture, -EINVAL is returned. * architecture, -EINVAL is returned.
* Returns 0 on success. * Returns 0 on success.
* @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
* Ensure the caller thread, upon return from
* system call, that all its running thread
* siblings have any currently running rseq
* critical sections restarted if @flags
* parameter is 0; if @flags parameter is
* MEMBARRIER_CMD_FLAG_CPU,
* then this operation is performed only
* on CPU indicated by @cpu_id. If this command is
* not implemented by an architecture, -EINVAL
* is returned. A process needs to register its
* intent to use the private expedited rseq
* command prior to using it, otherwise
* this command returns -EPERM.
* @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
* Register the process intent to use
* MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ.
* If this command is not implemented by an
* architecture, -EINVAL is returned.
* Returns 0 on success.
* @MEMBARRIER_CMD_SHARED: * @MEMBARRIER_CMD_SHARED:
* Alias to MEMBARRIER_CMD_GLOBAL. Provided for * Alias to MEMBARRIER_CMD_GLOBAL. Provided for
* header backward compatibility. * header backward compatibility.
...@@ -131,9 +151,15 @@ enum membarrier_cmd { ...@@ -131,9 +151,15 @@ enum membarrier_cmd {
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5), MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6), MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),
MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7),
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8),
/* Alias for header backward compatibility. */ /* Alias for header backward compatibility. */
MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL, MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
}; };
enum membarrier_cmd_flag {
MEMBARRIER_CMD_FLAG_CPU = (1 << 0),
};
#endif /* _UAPI_LINUX_MEMBARRIER_H */ #endif /* _UAPI_LINUX_MEMBARRIER_H */
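
For illustration, a userspace caller of the new command could look roughly like this, mirroring the sys_membarrier() wrapper used by the selftests; it assumes uapi headers that already carry the RSEQ commands, error handling is trimmed, and CPU 3 is an arbitrary example:

#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_membarrier(int cmd, unsigned int flags, int cpu_id)
{
        return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
        /* Register intent first; otherwise the expedited command returns -EPERM. */
        if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0))
                return 1;

        /* Restart rseq critical sections on all running thread siblings... */
        if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, 0, 0))
                return 1;

        /* ...or only on the sibling currently running on CPU 3. */
        if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
                           MEMBARRIER_CMD_FLAG_CPU, 3))
                return 1;

        return 0;
}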
...@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); ...@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
...@@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) ...@@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
return clamp_value / UCLAMP_BUCKET_DELTA; return clamp_value / UCLAMP_BUCKET_DELTA;
} }
static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
{
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
static inline unsigned int uclamp_none(enum uclamp_id clamp_id) static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{ {
if (clamp_id == UCLAMP_MIN) if (clamp_id == UCLAMP_MIN)
...@@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void) ...@@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk) static inline void sched_submit_work(struct task_struct *tsk)
{ {
unsigned int task_flags;
if (!tsk->state) if (!tsk->state)
return; return;
task_flags = tsk->flags;
/* /*
* If a worker went to sleep, notify and ask workqueue whether * If a worker went to sleep, notify and ask workqueue whether
* it wants to wake up a task to maintain concurrency. * it wants to wake up a task to maintain concurrency.
...@@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk) ...@@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
* in the possible wakeup of a kworker and because wq_worker_sleeping() * in the possible wakeup of a kworker and because wq_worker_sleeping()
* requires it. * requires it.
*/ */
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable(); preempt_disable();
if (tsk->flags & PF_WQ_WORKER) if (task_flags & PF_WQ_WORKER)
wq_worker_sleeping(tsk); wq_worker_sleeping(tsk);
else else
io_wq_worker_sleeping(tsk); io_wq_worker_sleeping(tsk);
......
...@@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) ...@@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*/ */
if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl; pi_se = &pi_task->dl;
/*
* Because of delays in the detection of the overrun of a
* thread's runtime, it might be the case that a thread
* goes to sleep in a rt mutex with negative runtime. As
* a consequence, the thread will be throttled.
*
* While waiting for the mutex, this thread can also be
* boosted via PI, resulting in a thread that is throttled
* and boosted at the same time.
*
* In this case, the boost overrides the throttle.
*/
if (p->dl.dl_throttled) {
/*
* The replenish timer needs to be canceled. No
* problem if it fires concurrently: boosted threads
* are ignored in dl_task_timer().
*/
hrtimer_try_to_cancel(&p->dl.dl_timer);
p->dl.dl_throttled = 0;
}
} else if (!dl_prio(p->normal_prio)) { } else if (!dl_prio(p->normal_prio)) {
/* /*
* Special case in which we have a !SCHED_DEADLINE task * Special case in which we have a !SCHED_DEADLINE task that is going
* that is going to be deboosted, but exceeds its * to be deboosted, but exceeds its runtime while doing so. No point in
* runtime while doing so. No point in replenishing * replenishing it, as it's going to return back to its original
* it, as it's going to return back to its original * scheduling class after this. If it has been throttled, we need to
* scheduling class after this. * clear the flag, otherwise the task may wake up as throttled after
* being boosted again with no means to replenish the runtime and clear
* the throttle.
*/ */
p->dl.dl_throttled = 0;
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
return; return;
} }
......
...@@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry, ...@@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry,
entry->proc_handler = proc_handler; entry->proc_handler = proc_handler;
} }
static int sd_ctl_doflags(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long flags = *(unsigned long *)table->data;
size_t data_size = 0;
size_t len = 0;
char *tmp;
int idx;
if (write)
return 0;
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
char *name = sd_flag_debug[idx].name;
/* Name plus whitespace */
data_size += strlen(name) + 1;
}
if (*ppos > data_size) {
*lenp = 0;
return 0;
}
tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
if (!tmp)
return -ENOMEM;
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
char *name = sd_flag_debug[idx].name;
len += snprintf(tmp + len, strlen(name) + 2, "%s ", name);
}
tmp += *ppos;
len -= *ppos;
if (len > *lenp)
len = *lenp;
if (len)
memcpy(buffer, tmp, len);
if (len < *lenp) {
((char *)buffer)[len] = '\n';
len++;
}
*lenp = len;
*ppos += len;
kfree(tmp);
return 0;
}
static struct ctl_table * static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd) sd_alloc_ctl_domain_table(struct sched_domain *sd)
{ {
...@@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) ...@@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */ /* &table[8] is terminator */
......
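
The user-visible effect of sd_ctl_doflags() is that the read-only "flags" file now renders symbolic flag names instead of a raw integer. A minimal reader sketch, assuming CONFIG_SCHED_DEBUG and the usual /proc/sys/kernel/sched_domain/cpu0/domain0/flags path:

#include <stdio.h>

int main(void)
{
        char buf[256];
        FILE *f = fopen("/proc/sys/kernel/sched_domain/cpu0/domain0/flags", "r");

        if (!f)
                return 1;
        if (fgets(buf, sizeof(buf), f))
                /* Prints e.g. "SD_BALANCE_NEWIDLE SD_BALANCE_EXEC ..."
                 * instead of the old opaque integer value. */
                printf("%s", buf);
        fclose(f);
        return 0;
}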
...@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se) ...@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
void post_init_entity_util_avg(struct task_struct *p) void post_init_entity_util_avg(struct task_struct *p)
{ {
} }
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{ {
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
...@@ -1504,6 +1504,7 @@ enum numa_type { ...@@ -1504,6 +1504,7 @@ enum numa_type {
/* Cached statistics for all CPUs within a node */ /* Cached statistics for all CPUs within a node */
struct numa_stats { struct numa_stats {
unsigned long load; unsigned long load;
unsigned long runnable;
unsigned long util; unsigned long util;
/* Total compute capacity of CPUs on a node */ /* Total compute capacity of CPUs on a node */
unsigned long compute_capacity; unsigned long compute_capacity;
...@@ -1547,19 +1548,22 @@ struct task_numa_env { ...@@ -1547,19 +1548,22 @@ struct task_numa_env {
}; };
static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu); static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); static inline long adjust_numa_imbalance(int imbalance, int nr_running);
static inline enum static inline enum
numa_type numa_classify(unsigned int imbalance_pct, numa_type numa_classify(unsigned int imbalance_pct,
struct numa_stats *ns) struct numa_stats *ns)
{ {
if ((ns->nr_running > ns->weight) && if ((ns->nr_running > ns->weight) &&
((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
return node_overloaded; return node_overloaded;
if ((ns->nr_running < ns->weight) || if ((ns->nr_running < ns->weight) ||
((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
return node_has_spare; return node_has_spare;
return node_fully_busy; return node_fully_busy;
...@@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env, ...@@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env,
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq); ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util(cpu); ns->util += cpu_util(cpu);
ns->nr_running += rq->cfs.h_nr_running; ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu); ns->compute_capacity += capacity_of(cpu);
...@@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, ...@@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1; src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1; dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running); imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, src_running); imbalance = adjust_numa_imbalance(imbalance, dst_running);
/* Use idle CPU if there is no imbalance */ /* Use idle CPU if there is no imbalance */
if (!imbalance) { if (!imbalance) {
...@@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, ...@@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
/* commit outstanding execution time */ /* commit outstanding execution time */
if (cfs_rq->curr == se) if (cfs_rq->curr == se)
update_curr(cfs_rq); update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight);
} }
dequeue_load_avg(cfs_rq, se); dequeue_load_avg(cfs_rq, se);
...@@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, ...@@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se); enqueue_load_avg(cfs_rq, se);
if (se->on_rq) if (se->on_rq)
account_entity_enqueue(cfs_rq, se); update_load_add(&cfs_rq->load, se->load.weight);
} }
...@@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) ...@@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
/** /**
* update_tg_load_avg - update the tg's load avg * update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed * @cfs_rq: the cfs_rq whose avg changed
* @force: update regardless of how small the difference
* *
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance * However, because tg->load_avg is a global value there are performance
...@@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) ...@@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* *
* Updating tg's load_avg is necessary before update_cfs_share(). * Updating tg's load_avg is necessary before update_cfs_share().
*/ */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{ {
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
...@@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) ...@@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
if (cfs_rq->tg == &root_task_group) if (cfs_rq->tg == &root_task_group)
return; return;
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg); atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
} }
...@@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) ...@@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
#else /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
static inline int propagate_entity_load_avg(struct sched_entity *se) static inline int propagate_entity_load_avg(struct sched_entity *se)
{ {
...@@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s ...@@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* IOW we're enqueueing a task on a new CPU. * IOW we're enqueueing a task on a new CPU.
*/ */
attach_entity_load_avg(cfs_rq, se); attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0); update_tg_load_avg(cfs_rq);
} else if (decayed) { } else if (decayed) {
cfs_rq_util_change(cfs_rq, 0); cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG) if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq, 0); update_tg_load_avg(cfs_rq);
} }
} }
...@@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) ...@@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = second; se = second;
} }
/* if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
* Prefer last buddy, try to return the CPU to a preempted task. /*
*/ * Someone really wants this to run. If it's not unfair, run it.
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) */
se = cfs_rq->last;
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
se = cfs_rq->next; se = cfs_rq->next;
} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/
se = cfs_rq->last;
}
clear_buddies(cfs_rq, se); clear_buddies(cfs_rq, se);
...@@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int ...@@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/* /*
* Scan the local SMT mask for idle CPUs. * Scan the local SMT mask for idle CPUs.
*/ */
static int select_idle_smt(struct task_struct *p, int target) static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{ {
int cpu; int cpu;
...@@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target) ...@@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1; return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) { for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr)) if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue; continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu; return cpu;
...@@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s ...@@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1; return -1;
} }
static inline int select_idle_smt(struct task_struct *p, int target) static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{ {
return -1; return -1;
} }
...@@ -6274,7 +6279,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) ...@@ -6274,7 +6279,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits) if ((unsigned)i < nr_cpumask_bits)
return i; return i;
i = select_idle_smt(p, target); i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits) if ((unsigned)i < nr_cpumask_bits)
return i; return i;
...@@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) ...@@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
util = cpu_util_next(cpu, p, cpu); util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu); cpu_cap = capacity_of(cpu);
spare_cap = cpu_cap - util; spare_cap = cpu_cap;
lsub_positive(&spare_cap, util);
/* /*
* Skip CPUs that cannot satisfy the capacity request. * Skip CPUs that cannot satisfy the capacity request.
...@@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env) ...@@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (unlikely(task_has_idle_policy(p))) if (unlikely(task_has_idle_policy(p)))
return 0; return 0;
/* SMT siblings share cache */
if (env->sd->flags & SD_SHARE_CPUCAPACITY)
return 0;
/* /*
* Buddy candidates are cache hot: * Buddy candidates are cache hot:
*/ */
...@@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env) ...@@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env)
* scheduler fails to find a good waiting task to * scheduler fails to find a good waiting task to
* migrate. * migrate.
*/ */
if (load/2 > env->imbalance &&
env->sd->nr_balance_failed <= env->sd->cache_nice_tries) if ((load >> env->sd->nr_balance_failed) > env->imbalance)
goto next; goto next;
env->imbalance -= load; env->imbalance -= load;
...@@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) ...@@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
struct sched_entity *se; struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0); update_tg_load_avg(cfs_rq);
if (cfs_rq == &rq->cfs) if (cfs_rq == &rq->cfs)
decayed = true; decayed = true;
...@@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) ...@@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
capacity = 1; capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity; cpu_rq(cpu)->cpu_capacity = capacity;
trace_sched_cpu_capacity_tp(cpu_rq(cpu));
sdg->sgc->capacity = capacity; sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity; sdg->sgc->min_capacity = capacity;
sdg->sgc->max_capacity = capacity; sdg->sgc->max_capacity = capacity;
...@@ -8957,7 +8969,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -8957,7 +8969,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
} }
} }
static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{ {
unsigned int imbalance_min; unsigned int imbalance_min;
...@@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) ...@@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
* tasks that remain local when the source domain is almost idle. * tasks that remain local when the source domain is almost idle.
*/ */
imbalance_min = 2; imbalance_min = 2;
if (src_nr_running <= imbalance_min) if (nr_running <= imbalance_min)
return 0; return 0;
return imbalance; return imbalance;
...@@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) ...@@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
/* scale ms to jiffies */ /* scale ms to jiffies */
interval = msecs_to_jiffies(interval); interval = msecs_to_jiffies(interval);
/*
* Reduce likelihood of busy balancing at higher domains racing with
* balancing at lower domains by preventing their balancing periods
* from being multiples of each other.
*/
if (cpu_busy)
interval -= 1;
interval = clamp(interval, 1UL, max_load_balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval);
return interval; return interval;
...@@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) ...@@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
/* Catch up with the cfs_rq and remove our load when we leave */ /* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0); update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se); detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false); update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se); propagate_entity_cfs_rq(se);
} }
...@@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) ...@@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */ /* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se); attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false); update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se); propagate_entity_cfs_rq(se);
} }
...@@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq) ...@@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq)
} }
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
int sched_trace_rq_cpu_capacity(struct rq *rq)
{
return rq ?
#ifdef CONFIG_SMP
rq->cpu_capacity
#else
SCHED_CAPACITY_SCALE
#endif
: -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd) const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{ {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
......
...@@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) ...@@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true) SCHED_FEAT(RT_PUSH_IPI, true)
#endif #endif
SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false) SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(ATTACH_AGE_LOAD, true)
......
...@@ -18,6 +18,14 @@ ...@@ -18,6 +18,14 @@
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif #endif
#ifdef CONFIG_RSEQ
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
#endif
#define MEMBARRIER_CMD_BITMASK \ #define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
...@@ -30,6 +38,11 @@ static void ipi_mb(void *info) ...@@ -30,6 +38,11 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */ smp_mb(); /* IPIs should be serializing but paranoid. */
} }
static void ipi_rseq(void *info)
{
rseq_preempt(current);
}
static void ipi_sync_rq_state(void *info) static void ipi_sync_rq_state(void *info)
{ {
struct mm_struct *mm = (struct mm_struct *) info; struct mm_struct *mm = (struct mm_struct *) info;
...@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void) ...@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
return 0; return 0;
} }
static int membarrier_private_expedited(int flags) static int membarrier_private_expedited(int flags, int cpu_id)
{ {
int cpu;
cpumask_var_t tmpmask; cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
smp_call_func_t ipi_func = ipi_mb;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL; return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) & if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM; return -EPERM;
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
return -EPERM;
ipi_func = ipi_rseq;
} else { } else {
WARN_ON_ONCE(flags);
if (!(atomic_read(&mm->membarrier_state) & if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM; return -EPERM;
...@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags) ...@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
*/ */
smp_mb(); /* system call entry is not a mb. */ smp_mb(); /* system call entry is not a mb. */
if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM; return -ENOMEM;
cpus_read_lock(); cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) { if (cpu_id >= 0) {
struct task_struct *p; struct task_struct *p;
/* if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
* Skipping the current CPU is OK even through we can be goto out;
* migrated at any point. The current CPU, at the point if (cpu_id == raw_smp_processor_id())
* where we read raw_smp_processor_id(), is ensured to goto out;
* be in program order with respect to the caller rcu_read_lock();
* thread. Therefore, we can skip this CPU from the p = rcu_dereference(cpu_rq(cpu_id)->curr);
* iteration. if (!p || p->mm != mm) {
*/ rcu_read_unlock();
if (cpu == raw_smp_processor_id()) goto out;
continue; }
p = rcu_dereference(cpu_rq(cpu)->curr); rcu_read_unlock();
if (p && p->mm == mm) } else {
__cpumask_set_cpu(cpu, tmpmask); int cpu;
rcu_read_lock();
for_each_online_cpu(cpu) {
struct task_struct *p;
/*
* Skipping the current CPU is OK even through we can be
* migrated at any point. The current CPU, at the point
* where we read raw_smp_processor_id(), is ensured to
* be in program order with respect to the caller
* thread. Therefore, we can skip this CPU from the
* iteration.
*/
if (cpu == raw_smp_processor_id())
continue;
p = rcu_dereference(cpu_rq(cpu)->curr);
if (p && p->mm == mm)
__cpumask_set_cpu(cpu, tmpmask);
}
rcu_read_unlock();
} }
rcu_read_unlock();
preempt_disable(); preempt_disable();
smp_call_function_many(tmpmask, ipi_mb, NULL, 1); if (cpu_id >= 0)
smp_call_function_single(cpu_id, ipi_func, NULL, 1);
else
smp_call_function_many(tmpmask, ipi_func, NULL, 1);
preempt_enable(); preempt_enable();
free_cpumask_var(tmpmask); out:
if (cpu_id < 0)
free_cpumask_var(tmpmask);
cpus_read_unlock(); cpus_read_unlock();
/* /*
...@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags) ...@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
ret; ret;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL; return -EINVAL;
ready_state = ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
} else {
WARN_ON_ONCE(flags);
} }
/* /*
...@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags) ...@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
return 0; return 0;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) if (flags & MEMBARRIER_FLAG_SYNC_CORE)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
if (flags & MEMBARRIER_FLAG_RSEQ)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
atomic_or(set_state, &mm->membarrier_state); atomic_or(set_state, &mm->membarrier_state);
ret = sync_runqueues_membarrier_state(mm); ret = sync_runqueues_membarrier_state(mm);
if (ret) if (ret)
...@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags) ...@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)
/** /**
* sys_membarrier - issue memory barriers on a set of threads * sys_membarrier - issue memory barriers on a set of threads
* @cmd: Takes command values defined in enum membarrier_cmd. * @cmd: Takes command values defined in enum membarrier_cmd.
* @flags: Currently needs to be 0. For future extensions. * @flags: Currently needs to be 0 for all commands other than
* MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
* case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
* contains the CPU on which to interrupt (= restart)
* the RSEQ critical section.
* @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
* RSEQ CS should be interrupted (@cmd must be
* MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
* *
* If this system call is not implemented, -ENOSYS is returned. If the * If this system call is not implemented, -ENOSYS is returned. If the
* command specified does not exist, not available on the running * command specified does not exist, not available on the running
...@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags) ...@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
* smp_mb() X O O * smp_mb() X O O
* sys_membarrier() O O O * sys_membarrier() O O O
*/ */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{ {
if (unlikely(flags)) switch (cmd) {
return -EINVAL; case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
return -EINVAL;
break;
default:
if (unlikely(flags))
return -EINVAL;
}
if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
cpu_id = -1;
switch (cmd) { switch (cmd) {
case MEMBARRIER_CMD_QUERY: case MEMBARRIER_CMD_QUERY:
{ {
...@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) ...@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
return membarrier_register_global_expedited(); return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED: case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
return membarrier_private_expedited(0); return membarrier_private_expedited(0, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
return membarrier_register_private_expedited(0); return membarrier_register_private_expedited(0);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
default: default:
return -EINVAL; return -EINVAL;
} }
......
...@@ -25,10 +25,18 @@ static inline bool sched_debug(void) ...@@ -25,10 +25,18 @@ static inline bool sched_debug(void)
return sched_debug_enabled; return sched_debug_enabled;
} }
#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
const struct sd_flag_debug sd_flag_debug[] = {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask) struct cpumask *groupmask)
{ {
struct sched_group *group = sd->groups; struct sched_group *group = sd->groups;
unsigned long flags = sd->flags;
unsigned int idx;
cpumask_clear(groupmask); cpumask_clear(groupmask);
...@@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ...@@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
} }
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
unsigned int flag = BIT(idx);
unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
!(sd->child->flags & flag))
printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
sd_flag_debug[idx].name);
if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
!(sd->parent->flags & flag))
printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
sd_flag_debug[idx].name);
}
printk(KERN_DEBUG "%*s groups:", level + 1, ""); printk(KERN_DEBUG "%*s groups:", level + 1, "");
do { do {
if (!group) { if (!group) {
...@@ -137,22 +160,22 @@ static inline bool sched_debug(void) ...@@ -137,22 +160,22 @@ static inline bool sched_debug(void)
} }
#endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_SCHED_DEBUG */
/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
#include <linux/sched/sd_flags.h>
0;
#undef SD_FLAG
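
Expanded, SD_DEGENERATE_GROUPS_MASK is just an OR of every flag whose declaration carries SDF_NEEDS_GROUPS; a truncated sketch of the expansion:

static const unsigned int SD_DEGENERATE_GROUPS_MASK =
        (SD_BALANCE_NEWIDLE * 1) |      /* has SDF_NEEDS_GROUPS */
        (SD_BALANCE_EXEC    * 1) |
        (SD_BALANCE_FORK    * 1) |
        (SD_BALANCE_WAKE    * 1) |
        (SD_WAKE_AFFINE     * 0) |      /* no SDF_NEEDS_GROUPS, term drops out */
        /* ... remaining SD_FLAG() declarations ... */
        0;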
static int sd_degenerate(struct sched_domain *sd) static int sd_degenerate(struct sched_domain *sd)
{ {
if (cpumask_weight(sched_domain_span(sd)) == 1) if (cpumask_weight(sched_domain_span(sd)) == 1)
return 1; return 1;
/* Following flags need at least 2 groups */ /* Following flags need at least 2 groups */
if (sd->flags & (SD_BALANCE_NEWIDLE | if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
SD_BALANCE_FORK | (sd->groups != sd->groups->next))
SD_BALANCE_EXEC | return 0;
SD_SHARE_CPUCAPACITY |
SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
/* Following flags don't use groups */ /* Following flags don't use groups */
if (sd->flags & (SD_WAKE_AFFINE)) if (sd->flags & (SD_WAKE_AFFINE))
...@@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) ...@@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 0; return 0;
/* Flags needing groups don't count if only 1 group in parent */ /* Flags needing groups don't count if only 1 group in parent */
if (parent->groups == parent->groups->next) { if (parent->groups == parent->groups->next)
pflags &= ~(SD_BALANCE_NEWIDLE | pflags &= ~SD_DEGENERATE_GROUPS_MASK;
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_ASYM_CPUCAPACITY |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
if (~cflags & pflags) if (~cflags & pflags)
return 0; return 0;
...@@ -1292,7 +1306,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ...@@ -1292,7 +1306,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
* SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches * SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies * SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
* *
* Odd one out, which beside describing the topology has a quirk also * Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it: * prescribes the desired behaviour that goes along with it:
...@@ -1303,8 +1316,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ...@@ -1303,8 +1316,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
(SD_SHARE_CPUCAPACITY | \ (SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \ SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \ SD_NUMA | \
SD_ASYM_PACKING | \ SD_ASYM_PACKING)
SD_SHARE_POWERDOMAIN)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
...@@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl,
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
+ .busy_factor = 16,
+ .imbalance_pct = 117,
.cache_nice_tries = 0,
...@@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
- int dflags = 0;
sd = NULL;
for_each_sd_topology(tl) {
+ int dflags = 0;
if (tl == tl_asym) {
dflags |= SD_ASYM_CPUCAPACITY;
has_asym = true;
......
// SPDX-License-Identifier: LGPL-2.1
#define _GNU_SOURCE
#include <assert.h>
#include <linux/membarrier.h>
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
...@@ -1131,6 +1133,220 @@ static int set_signal_handler(void)
return ret;
}
struct test_membarrier_thread_args {
int stop;
intptr_t percpu_list_ptr;
};
/* Worker threads modify data in their "active" percpu lists. */
void *test_membarrier_worker_thread(void *arg)
{
struct test_membarrier_thread_args *args =
(struct test_membarrier_thread_args *)arg;
const int iters = opt_reps;
int i;
if (rseq_register_current_thread()) {
fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
errno, strerror(errno));
abort();
}
/* Wait for initialization. */
while (!atomic_load(&args->percpu_list_ptr)) {}
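/*
 * Each iteration increments the counter in the head node of the
 * currently "active" per-cpu list for whichever CPU this thread runs
 * on; the rseq critical section returns non-zero when it is aborted
 * (preemption, signal, or CPU mismatch), in which case the CPU is
 * re-read and the operation retried.
 */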
for (i = 0; i < iters; ++i) {
int ret;
do {
int cpu = rseq_cpu_start();
ret = rseq_offset_deref_addv(&args->percpu_list_ptr,
sizeof(struct percpu_list_entry) * cpu, 1, cpu);
} while (rseq_unlikely(ret));
}
if (rseq_unregister_current_thread()) {
fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
errno, strerror(errno));
abort();
}
return NULL;
}
void test_membarrier_init_percpu_list(struct percpu_list *list)
{
int i;
memset(list, 0, sizeof(*list));
for (i = 0; i < CPU_SETSIZE; i++) {
struct percpu_list_node *node;
node = malloc(sizeof(*node));
assert(node);
node->data = 0;
node->next = NULL;
list->c[i].head = node;
}
}
void test_membarrier_free_percpu_list(struct percpu_list *list)
{
int i;
for (i = 0; i < CPU_SETSIZE; i++)
free(list->c[i].head);
}
static int sys_membarrier(int cmd, int flags, int cpu_id)
{
return syscall(__NR_membarrier, cmd, flags, cpu_id);
}
/*
* The manager thread swaps per-cpu lists that worker threads see,
* and validates that there are no unexpected modifications.
*/
void *test_membarrier_manager_thread(void *arg)
{
struct test_membarrier_thread_args *args =
(struct test_membarrier_thread_args *)arg;
struct percpu_list list_a, list_b;
intptr_t expect_a = 0, expect_b = 0;
int cpu_a = 0, cpu_b = 0;
if (rseq_register_current_thread()) {
fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
errno, strerror(errno));
abort();
}
/* Init lists. */
test_membarrier_init_percpu_list(&list_a);
test_membarrier_init_percpu_list(&list_b);
atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
while (!atomic_load(&args->stop)) {
/* list_a is "active". */
cpu_a = rand() % CPU_SETSIZE;
/*
* As list_b is "inactive", we should never see changes
* to list_b.
*/
if (expect_b != atomic_load(&list_b.c[cpu_b].head->data)) {
fprintf(stderr, "Membarrier test failed\n");
abort();
}
/* Make list_b "active". */
atomic_store(&args->percpu_list_ptr, (intptr_t)&list_b);
if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
MEMBARRIER_CMD_FLAG_CPU, cpu_a) &&
errno != ENXIO /* missing CPU */) {
perror("sys_membarrier");
abort();
}
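/*
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ restarts any rseq critical
 * section currently running on cpu_a, so a worker that loaded the old
 * (list_a) pointer before the swap above can no longer commit an
 * increment to list_a on cpu_a once sys_membarrier() has returned.
 */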
/*
* Cpu A should now only modify list_b, so the values
* in list_a should be stable.
*/
expect_a = atomic_load(&list_a.c[cpu_a].head->data);
cpu_b = rand() % CPU_SETSIZE;
/*
* As list_a is "inactive", we should never see changes
* to list_a.
*/
if (expect_a != atomic_load(&list_a.c[cpu_a].head->data)) {
fprintf(stderr, "Membarrier test failed\n");
abort();
}
/* Make list_a "active". */
atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
MEMBARRIER_CMD_FLAG_CPU, cpu_b) &&
errno != ENXIO /* missing CPU */) {
perror("sys_membarrier");
abort();
}
/* Remember a value from list_b. */
expect_b = atomic_load(&list_b.c[cpu_b].head->data);
}
test_membarrier_free_percpu_list(&list_a);
test_membarrier_free_percpu_list(&list_b);
if (rseq_unregister_current_thread()) {
fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
errno, strerror(errno));
abort();
}
return NULL;
}
/* Test the MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ membarrier command. */
#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
void test_membarrier(void)
{
const int num_threads = opt_threads;
struct test_membarrier_thread_args thread_args;
pthread_t worker_threads[num_threads];
pthread_t manager_thread;
int i, ret;
if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
perror("sys_membarrier");
abort();
}
thread_args.stop = 0;
thread_args.percpu_list_ptr = 0;
ret = pthread_create(&manager_thread, NULL,
test_membarrier_manager_thread, &thread_args);
if (ret) {
errno = ret;
perror("pthread_create");
abort();
}
for (i = 0; i < num_threads; i++) {
ret = pthread_create(&worker_threads[i], NULL,
test_membarrier_worker_thread, &thread_args);
if (ret) {
errno = ret;
perror("pthread_create");
abort();
}
}
for (i = 0; i < num_threads; i++) {
ret = pthread_join(worker_threads[i], NULL);
if (ret) {
errno = ret;
perror("pthread_join");
abort();
}
}
atomic_store(&thread_args.stop, 1);
ret = pthread_join(manager_thread, NULL);
if (ret) {
errno = ret;
perror("pthread_join");
abort();
}
}
#else /* RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV */
void test_membarrier(void)
{
fprintf(stderr, "rseq_offset_deref_addv is not implemented on this architecture. "
"Skipping membarrier test.\n");
}
#endif
static void show_usage(int argc, char **argv)
{
printf("Usage : %s <OPTIONS>\n",
...@@ -1153,7 +1369,7 @@ static void show_usage(int argc, char **argv)
printf(" [-r N] Number of repetitions per thread (default 5000)\n");
printf(" [-d] Disable rseq system call (no initialization)\n");
printf(" [-D M] Disable rseq for each M threads\n");
- printf(" [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+ printf(" [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
printf(" [-M] Push into buffer and memcpy buffer with memory barriers.\n");
printf(" [-v] Verbose output.\n");
printf(" [-h] Show this help.\n");
...@@ -1268,6 +1484,7 @@ int main(int argc, char **argv)
case 'i':
case 'b':
case 'm':
case 'r':
break;
default:
show_usage(argc, argv);
...@@ -1320,6 +1537,10 @@ int main(int argc, char **argv)
printf_verbose("counter increment\n");
test_percpu_inc();
break;
case 'r':
printf_verbose("membarrier\n");
test_membarrier();
break;
}
if (!opt_disable_rseq && rseq_unregister_current_thread())
abort();
......
...@@ -279,6 +279,63 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
#endif
}
#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
/*
 * pval = *(*ptr + off);
 * *pval += inc;
 */
static inline __attribute__((always_inline))
int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
{
RSEQ_INJECT_C(9)
__asm__ __volatile__ goto (
RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
/* Start rseq by storing table entry pointer into rseq_cs. */
RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
RSEQ_INJECT_ASM(3)
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
#endif
/* rbx = *ptr (the current base pointer) */
"movq %[ptr], %%rbx\n\t"
/* rbx = *ptr + off */
"addq %[off], %%rbx\n\t"
/* rcx = pval = *(*ptr + off) */
"movq (%%rbx), %%rcx\n\t"
/* commit: *pval += inc */
"addq %[inc], (%%rcx)\n\t"
"2:\n\t"
RSEQ_INJECT_ASM(4)
RSEQ_ASM_DEFINE_ABORT(4, "", abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
[rseq_abi] "r" (&__rseq_abi),
/* final store input */
[ptr] "m" (*ptr),
[off] "er" (off),
[inc] "er" (inc)
: "memory", "cc", "rax", "rbx", "rcx"
RSEQ_INJECT_CLOBBER
: abort
#ifdef RSEQ_COMPARE_TWICE
, error1
#endif
);
return 0;
abort:
RSEQ_INJECT_FAILED
return -1;
#ifdef RSEQ_COMPARE_TWICE
error1:
rseq_bug("cpu_id comparison failed");
#endif
}
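For readers who do not speak x86 assembly, the standalone sketch below is a plain-C model of what this critical section commits. It is an illustration only: the helper name and the demo data layout are invented, and it deliberately ignores the cpu check and restart behaviour that make the real helper safe against preemption and migration.

#include <stdint.h>
#include <stdio.h>

/* Non-restartable C equivalent of the rseq_offset_deref_addv() effect. */
static void offset_deref_addv_plain(intptr_t *ptr, long off, intptr_t inc)
{
	intptr_t base = *ptr;				/* the published base pointer */
	intptr_t *pval = *(intptr_t **)(base + off);	/* pointer stored at base + off */
	*pval += inc;					/* increment what it points to */
}

int main(void)
{
	intptr_t counters[2] = { 0, 0 };
	intptr_t *slots[2] = { &counters[0], &counters[1] };	/* "per-cpu" slots */
	intptr_t list_ptr = (intptr_t)slots;			/* what the manager publishes */

	/* Same call shape as the worker thread: the offset selects slot 1. */
	offset_deref_addv_plain(&list_ptr, 1 * sizeof(slots[0]), 5);
	printf("%ld %ld\n", (long)counters[0], (long)counters[1]);	/* prints: 0 5 */
	return 0;
}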
static inline __attribute__((always_inline))
int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
intptr_t *v2, intptr_t newv2,
......
...@@ -15,6 +15,7 @@ TEST_LIST=(
"-T m"
"-T m -M"
"-T i"
"-T r"
)
TEST_NAME=(
...@@ -25,6 +26,7 @@ TEST_NAME=(
"memcpy"
"memcpy with barrier"
"increment"
"membarrier"
)
IFS="$OLDIFS"
......