Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull more scheduler updates from Ingo Molnar: "Second round of scheduler changes: - try-to-wakeup and IPI reduction speedups, from Andy Lutomirski - continued power scheduling cleanups and refactorings, from Nicolas Pitre - misc fixes and enhancements" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/deadline: Delete extraneous extern for to_ratio() sched/idle: Optimize try-to-wake-up IPI sched/idle: Simplify wake_up_idle_cpu() sched/idle: Clear polling before descheduling the idle thread sched, trace: Add a tracepoint for IPI-less remote wakeups cpuidle: Set polling in poll_idle sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...) sched: Rename capacity related flags sched: Final power vs. capacity cleanups sched: Remove remaining dubious usage of "power" sched: Let 'struct sched_group_power' care about CPU capacity sched/fair: Disambiguate existing/remaining "capacity" usage sched/fair: Change "has_capacity" to "has_free_capacity" sched/fair: Remove "power" from 'struct numa_stats' sched: Fix signedness bug in yield_to() sched/fair: Use time_after() in record_wakee() sched/balancing: Reduce the rate of needless idle load balancing sched/fair: Fix unlocked reads of some cfs_b->quota/period

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull more scheduler updates from Ingo Molnar: "Second round of scheduler changes: - try-to-wakeup and IPI reduction speedups, from Andy Lutomirski - continued power scheduling cleanups and refactorings, from Nicolas Pitre - misc fixes and enhancements" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/deadline: Delete extraneous extern for to_ratio() sched/idle: Optimize try-to-wake-up IPI sched/idle: Simplify wake_up_idle_cpu() sched/idle: Clear polling before descheduling the idle thread sched, trace: Add a tracepoint for IPI-less remote wakeups cpuidle: Set polling in poll_idle sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...) sched: Rename capacity related flags sched: Final power vs. capacity cleanups sched: Remove remaining dubious usage of "power" sched: Let 'struct sched_group_power' care about CPU capacity sched/fair: Disambiguate existing/remaining "capacity" usage sched/fair: Change "has_capacity" to "has_free_capacity" sched/fair: Remove "power" from 'struct numa_stats' sched: Fix signedness bug in yield_to() sched/fair: Use time_after() in record_wakee() sched/balancing: Reduce the rate of needless idle load balancing sched/fair: Fix unlocked reads of some cfs_b->quota/period
b2e09f63 · Linus Torvalds · 3737a127 · 535560d8 · b2e09f63 · b2e09f63
Commit b2e09f63 authored Jun 12, 2014 by Linus Torvalds
14 changed files
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -26,30 +26,30 @@
 #include <asm/topology.h>

 /*
- * cpu power scale management
+ * cpu capacity scale management
 */

 /*
- * cpu power table
+ * cpu capacity table
 * This per cpu data structure describes the relative capacity of each core.
 * On a heteregenous system, cores don't have the same computation capacity
- * and we reflect that difference in the cpu_power field so the scheduler can
- * take this difference into account during load balance. A per cpu structure
- * is preferred because each CPU updates its own cpu_power field during the
- * load balance except for idle cores. One idle core is selected to run the
- * rebalance_domains for all idle cores and the cpu_power can be updated
- * during this sequence.
+ * and we reflect that difference in the cpu_capacity field so the scheduler
+ * can take this difference into account during load balance. A per cpu
+ * structure is preferred because each CPU updates its own cpu_capacity field
+ * during the load balance except for idle cores. One idle core is selected
+ * to run the rebalance_domains for all idle cores and the cpu_capacity can be
+ * updated during this sequence.
 */
 static DEFINE_PER_CPU(unsigned long, cpu_scale);

-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 {
 	return per_cpu(cpu_scale, cpu);
 }

-static void set_power_scale(unsigned int cpu, unsigned long power)
+static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
 {
-	per_cpu(cpu_scale, cpu) = power;
+	per_cpu(cpu_scale, cpu) = capacity;
 }

 #ifdef CONFIG_OF
@@ -62,11 +62,11 @@ struct cpu_efficiency {
 * Table of relative efficiency of each processors
 * The efficiency value must fit in 20bit and the final
 * cpu_scale value must be in the range
- *   0 < cpu_scale < 3*SCHED_POWER_SCALE/2
+ *   0 < cpu_scale < 3*SCHED_CAPACITY_SCALE/2
 * in order to return at most 1 when DIV_ROUND_CLOSEST
 * is used to compute the capacity of a CPU.
 * Processors that are not defined in the table,
- * use the default SCHED_POWER_SCALE value for cpu_scale.
+ * use the default SCHED_CAPACITY_SCALE value for cpu_scale.
 */
 static const struct cpu_efficiency table_efficiency[] = {
 	{"arm,cortex-a15", 3891},
@@ -83,9 +83,9 @@ static unsigned long middle_capacity = 1;
 * Iterate all CPUs' descriptor in DT and compute the efficiency
 * (as per table_efficiency). Also calculate a middle efficiency
 * as close as possible to  (max{eff_i} - min{eff_i}) / 2
- * This is later used to scale the cpu_power field such that an
- * 'average' CPU is of middle power. Also see the comments near
- * table_efficiency[] and update_cpu_power().
+ * This is later used to scale the cpu_capacity field such that an
+ * 'average' CPU is of middle capacity. Also see the comments near
+ * table_efficiency[] and update_cpu_capacity().
 */
 static void __init parse_dt_topology(void)
 {
@@ -141,15 +141,15 @@ static void __init parse_dt_topology(void)
 	 * cpu_scale because all CPUs have the same capacity. Otherwise, we
 	 * compute a middle_capacity factor that will ensure that the capacity
 	 * of an 'average' CPU of the system will be as close as possible to
-	 * SCHED_POWER_SCALE, which is the default value, but with the
+	 * SCHED_CAPACITY_SCALE, which is the default value, but with the
 	 * constraint explained near table_efficiency[].
 	 */
 	if (4*max_capacity < (3*(max_capacity + min_capacity)))
 		middle_capacity = (min_capacity + max_capacity)
-				>> (SCHED_POWER_SHIFT+1);
+				>> (SCHED_CAPACITY_SHIFT+1);
 	else
 		middle_capacity = ((max_capacity / 3)
-				>> (SCHED_POWER_SHIFT-1)) + 1;
+				>> (SCHED_CAPACITY_SHIFT-1)) + 1;

 }

@@ -158,20 +158,20 @@ static void __init parse_dt_topology(void)
 * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
 * function returns directly for SMP system.
 */
-static void update_cpu_power(unsigned int cpu)
+static void update_cpu_capacity(unsigned int cpu)
 {
 	if (!cpu_capacity(cpu))
 		return;

-	set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+	set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);

-	printk(KERN_INFO "CPU%u: update cpu_power %lu\n",
-		cpu, arch_scale_freq_power(NULL, cpu));
+	printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n",
+		cpu, arch_scale_freq_capacity(NULL, cpu));
 }

 #else
 static inline void parse_dt_topology(void) {}
-static inline void update_cpu_power(unsigned int cpuid) {}
+static inline void update_cpu_capacity(unsigned int cpuid) {}
 #endif

 /*
@@ -267,7 +267,7 @@ void store_cpu_topology(unsigned int cpuid)

 	update_siblings_masks(cpuid);

-	update_cpu_power(cpuid);
+	update_cpu_capacity(cpuid);

 	printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
 		cpuid, cpu_topology[cpuid].thread_id,
@@ -297,7 +297,7 @@ void __init init_cpu_topology(void)
 {
 	unsigned int cpu;

-	/* init core mask and power*/
+	/* init core mask and capacity */
 	for_each_possible_cpu(cpu) {
 		struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]);

@@ -307,7 +307,7 @@ void __init init_cpu_topology(void)
 		cpumask_clear(&cpu_topo->core_sibling);
 		cpumask_clear(&cpu_topo->thread_sibling);

-		set_power_scale(cpu, SCHED_POWER_SCALE);
+		set_capacity_scale(cpu, SCHED_CAPACITY_SCALE);
 	}
 	smp_wmb();


--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -749,7 +749,7 @@ int setup_profiling_timer(unsigned int multiplier)
 /* cpumask of CPUs with asymetric SMT dependancy */
 static const int powerpc_smt_flags(void)
 {
-	int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;

 	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
 		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");

--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -187,8 +187,11 @@ static int poll_idle(struct cpuidle_device *dev,

 	t1 = ktime_get();
 	local_irq_enable();
-	while (!need_resched())
-		cpu_relax();
+	if (!current_set_polling_and_test()) {
+		while (!need_resched())
+			cpu_relax();
+	}
+	current_clr_polling();

 	t2 = ktime_get();
 	diff = ktime_to_us(ktime_sub(t2, t1));

--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -586,7 +586,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);

 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-bool kvm_vcpu_yield_to(struct kvm_vcpu *target);
+int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -847,10 +847,10 @@ enum cpu_idle_type {
 };

 /*
- * Increase resolution of cpu_power calculations
+ * Increase resolution of cpu_capacity calculations
 */
-#define SCHED_POWER_SHIFT	10
-#define SCHED_POWER_SCALE	(1L << SCHED_POWER_SHIFT)
+#define SCHED_CAPACITY_SHIFT	10
+#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)

 /*
 * sched-domains (multiprocessor balancing) declarations:
@@ -862,7 +862,7 @@ enum cpu_idle_type {
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
+#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu power */
 #define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
@@ -874,7 +874,7 @@ enum cpu_idle_type {
 #ifdef CONFIG_SCHED_SMT
 static inline const int cpu_smt_flags(void)
 {
-	return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
 }
 #endif

@@ -1006,7 +1006,7 @@ typedef const int (*sched_domain_flags_f)(void);
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
-	struct sched_group_power **__percpu sgp;
+	struct sched_group_capacity **__percpu sgc;
 };

 struct sched_domain_topology_level {
@@ -2173,7 +2173,7 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 #endif

-extern bool yield_to(struct task_struct *p, bool preempt);
+extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
 /**

--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -530,6 +530,26 @@ TRACE_EVENT(sched_swap_numa,
 			__entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
 			__entry->dst_cpu, __entry->dst_nid)
 );
+
+/*
+ * Tracepoint for waking a polling cpu without an IPI.
+ */
+TRACE_EVENT(sched_wake_idle_without_ipi,
+
+	TP_PROTO(int cpu),
+
+	TP_ARGS(cpu),
+
+	TP_STRUCT__entry(
+		__field(	int,	cpu	)
+	),
+
+	TP_fast_assign(
+		__entry->cpu	= cpu;
+	),
+
+	TP_printk("cpu=%d", __entry->cpu)
+);
 #endif /* _TRACE_SCHED_H */

 /* This part must be outside protection */

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
 	dl_b->dl_runtime = runtime;
 }

-extern unsigned long to_ratio(u64 period, u64 runtime);
-
 void init_dl_bw(struct dl_bw *dl_b)
 {
 	raw_spin_lock_init(&dl_b->lock);

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
 SCHED_FEAT(WAKEUP_PREEMPTION, true)

 /*
- * Use arch dependent cpu power functions
+ * Use arch dependent cpu capacity functions
 */
-SCHED_FEAT(ARCH_POWER, true)
+SCHED_FEAT(ARCH_CAPACITY, true)

 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)

 /*
- * Decrement CPU power based on time not spent running tasks
+ * Decrement CPU capacity based on time not spent running tasks
 */
-SCHED_FEAT(NONTASK_POWER, true)
+SCHED_FEAT(NONTASK_CAPACITY, true)

 /*
 * Queue remote wakeups on the target CPU and process them

--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@

 #include <trace/events/power.h>

+#include "sched.h"
+
 static int __read_mostly cpu_idle_force_poll;

 void cpu_idle_poll_ctrl(bool enable)
@@ -67,6 +69,10 @@ void __weak arch_cpu_idle(void)
 * cpuidle_idle_call - the main idle function
 *
 * NOTE: no locks or semaphores should be used here
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set.  If it ever stops polling, it
+ * must clear the polling bit.
 */
 static void cpuidle_idle_call(void)
 {
@@ -175,10 +181,22 @@ static void cpuidle_idle_call(void)

 /*
 * Generic idle loop implementation
+ *
+ * Called with polling cleared.
 */
 static void cpu_idle_loop(void)
 {
 	while (1) {
+		/*
+		 * If the arch has a polling bit, we maintain an invariant:
+		 *
+		 * Our polling bit is clear if we're not scheduled (i.e. if
+		 * rq->curr != rq->idle).  This means that, if rq->idle has
+		 * the polling bit set, then setting need_resched is
+		 * guaranteed to cause the cpu to reschedule.
+		 */
+
+		__current_set_polling();
 		tick_nohz_idle_enter();

 		while (!need_resched()) {
@@ -218,6 +236,17 @@ static void cpu_idle_loop(void)
 		 */
 		preempt_set_need_resched();
 		tick_nohz_idle_exit();
+		__current_clr_polling();
+
+		/*
+		 * We promise to call sched_ttwu_pending and reschedule
+		 * if need_resched is set while polling is set.  That
+		 * means that clearing polling needs to be visible
+		 * before doing these things.
+		 */
+		smp_mb__after_atomic();
+
+		sched_ttwu_pending();
 		schedule_preempt_disabled();
 	}
 }
@@ -239,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 	 */
 	boot_init_stack_canary();
 #endif
-	__current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
 }
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -918,7 +918,6 @@ static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_rt_entity *rt_se = &curr->rt;
-	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	u64 delta_exec;

 	if (curr->sched_class != &rt_sched_class)
@@ -943,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
 		return;

 	for_each_sched_rt_entity(rt_se) {
-		rt_rq = rt_rq_of_se(rt_se);
+		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

 		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 			raw_spin_lock(&rt_rq->rt_runtime_lock);

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -567,7 +567,7 @@ struct rq {
 	struct root_domain *rd;
 	struct sched_domain *sd;

-	unsigned long cpu_power;
+	unsigned long cpu_capacity;

 	unsigned char idle_balance;
 	/* For active balancing */
@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);

 #ifdef CONFIG_SMP

+extern void sched_ttwu_pending(void);
+
 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
 			      lockdep_is_held(&sched_domains_mutex))
@@ -728,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);

-struct sched_group_power {
+struct sched_group_capacity {
 	atomic_t ref;
 	/*
-	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
-	 * single CPU.
+	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+	 * for a single CPU.
 	 */
-	unsigned int power, power_orig;
+	unsigned int capacity, capacity_orig;
 	unsigned long next_update;
-	int imbalance; /* XXX unrelated to power but shared group state */
+	int imbalance; /* XXX unrelated to capacity but shared group state */
 	/*
 	 * Number of busy cpus in this group.
 	 */
@@ -750,7 +752,7 @@ struct sched_group {
 	atomic_t ref;

 	unsigned int group_weight;
-	struct sched_group_power *sgp;
+	struct sched_group_capacity *sgc;

 	/*
 	 * The CPUs this group covers.
@@ -773,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
 */
 static inline struct cpumask *sched_group_mask(struct sched_group *sg)
 {
-	return to_cpumask(sg->sgp->cpumask);
+	return to_cpumask(sg->sgc->cpumask);
 }

 /**
@@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)

 extern int group_balance_cpu(struct sched_group *sg);

+#else
+
+static inline void sched_ttwu_pending(void) { }
+
 #endif /* CONFIG_SMP */

 #include "stats.h"
@@ -1167,7 +1173,7 @@ extern const struct sched_class idle_sched_class;

 #ifdef CONFIG_SMP

-extern void update_group_power(struct sched_domain *sd, int cpu);
+extern void update_group_capacity(struct sched_domain *sd, int cpu);

 extern void trigger_load_balance(struct rq *rq);


--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1714,11 +1714,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 #endif /* !CONFIG_S390 */

-bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
+int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
 	struct pid *pid;
 	struct task_struct *task = NULL;
-	bool ret = false;
+	int ret = 0;

 	rcu_read_lock();
 	pid = rcu_dereference(target->pid);