Merge branch 'pm-cpufreq-governor' into pm-cpufreq

a5acbfbd · Rafael J. Wysocki · edd4a893 · adaf9fcd · a5acbfbd · a5acbfbd
Commit a5acbfbd authored Mar 10, 2016 by Rafael J. Wysocki
17 changed files
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -19,6 +19,7 @@ config CPU_FREQ
 if CPU_FREQ
 config CPU_FREQ_GOV_COMMON
+	select IRQ_WORK
 	bool
 config CPU_FREQ_BOOST_SW

--- a/drivers/cpufreq/amd_freq_sensitivity.c
+++ b/drivers/cpufreq/amd_freq_sensitivity.c
@@ -21,7 +21,7 @@
 #include <asm/msr.h>
 #include <asm/cpufeature.h>
-#include "cpufreq_governor.h"
+#include "cpufreq_ondemand.h"
 #define MSR_AMD64_FREQ_SENSITIVITY_ACTUAL	0xc0010080
 #define MSR_AMD64_FREQ_SENSITIVITY_REFERENCE	0xc0010081
@@ -45,10 +45,10 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy,
 	long d_actual, d_reference;
 	struct msr actual, reference;
 	struct cpu_data_t *data = &per_cpu(cpu_data, policy->cpu);
-	struct dbs_data *od_data = policy->governor_data;
+	struct policy_dbs_info *policy_dbs = policy->governor_data;
+	struct dbs_data *od_data = policy_dbs->dbs_data;
 	struct od_dbs_tuners *od_tuners = od_data->tuners;
-	struct od_cpu_dbs_info_s *od_info =
+	struct od_policy_dbs_info *od_info = to_dbs_info(policy_dbs);
-		od_data->cdata->get_cpu_dbs_info_s(policy->cpu);
 	if (!od_info->freq_table)
 		return freq_next;

--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
--- a/drivers/cpufreq/cpufreq_ondemand.h
+++ b/drivers/cpufreq/cpufreq_ondemand.h
+/*
+ * Header file for CPUFreq ondemand governor and related code.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include "cpufreq_governor.h"
+struct od_policy_dbs_info {
+	struct policy_dbs_info policy_dbs;
+	struct cpufreq_frequency_table *freq_table;
+	unsigned int freq_lo;
+	unsigned int freq_lo_delay_us;
+	unsigned int freq_hi_delay_us;
+	unsigned int sample_type:1;
+};
+static inline struct od_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs)
+{
+	return container_of(policy_dbs, struct od_policy_dbs_info, policy_dbs);
+}
+struct od_dbs_tuners {
+	unsigned int powersave_bias;
+};
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -71,7 +71,7 @@ struct sample {
 	u64 mperf;
 	u64 tsc;
 	int freq;
-	ktime_t time;
+	u64 time;
 };
 struct pstate_data {
@@ -103,13 +103,13 @@ struct _pid {
 struct cpudata {
 	int cpu;
-	struct timer_list timer;
+	struct update_util_data update_util;
 	struct pstate_data pstate;
 	struct vid_data vid;
 	struct _pid pid;
-	ktime_t last_sample_time;
+	u64	last_sample_time;
 	u64	prev_aperf;
 	u64	prev_mperf;
 	u64	prev_tsc;
@@ -120,6 +120,7 @@ struct cpudata {
 static struct cpudata **all_cpu_data;
 struct pstate_adjust_policy {
 	int sample_rate_ms;
+	s64 sample_rate_ns;
 	int deadband;
 	int setpoint;
 	int p_gain_pct;
@@ -718,7 +719,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate)
 	if (limits->no_turbo && !limits->turbo_disabled)
 		val |= (u64)1 << 32;
-	wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val);
+	wrmsrl(MSR_IA32_PERF_CTL, val);
 }
 static int knl_get_turbo_pstate(void)
@@ -889,7 +890,7 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu)
 	sample->core_pct_busy = (int32_t)core_pct;
 }
-static inline void intel_pstate_sample(struct cpudata *cpu)
+static inline void intel_pstate_sample(struct cpudata *cpu, u64 time)
 {
 	u64 aperf, mperf;
 	unsigned long flags;
@@ -906,7 +907,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
 	local_irq_restore(flags);
 	cpu->last_sample_time = cpu->sample.time;
-	cpu->sample.time = ktime_get();
+	cpu->sample.time = time;
 	cpu->sample.aperf = aperf;
 	cpu->sample.mperf = mperf;
 	cpu->sample.tsc =  tsc;
@@ -921,22 +922,6 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
 	cpu->prev_tsc = tsc;
 }
-static inline void intel_hwp_set_sample_time(struct cpudata *cpu)
-{
-	int delay;
-	delay = msecs_to_jiffies(50);
-	mod_timer_pinned(&cpu->timer, jiffies + delay);
-}
-static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
-{
-	int delay;
-	delay = msecs_to_jiffies(pid_params.sample_rate_ms);
-	mod_timer_pinned(&cpu->timer, jiffies + delay);
-}
 static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
 {
 	struct sample *sample = &cpu->sample;
@@ -976,8 +961,7 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
 static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 {
 	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
-	s64 duration_us;
+	u64 duration_ns;
-	u32 sample_time;
 	/*
 	 * core_busy is the ratio of actual performance to max
@@ -996,18 +980,16 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 	core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
 	/*
-	 * Since we have a deferred timer, it will not fire unless
+	 * Since our utilization update callback will not run unless we are
-	 * we are in C0.  So, determine if the actual elapsed time
+	 * in C0, check if the actual elapsed time is significantly greater (3x)
-	 * is significantly greater (3x) than our sample interval.  If it
+	 * than our sample interval.  If it is, then we were idle for a long
-	 * is, then we were idle for a long enough period of time
+	 * enough period of time to adjust our busyness.
-	 * to adjust our busyness.
 	 */
-	sample_time = pid_params.sample_rate_ms  * USEC_PER_MSEC;
+	duration_ns = cpu->sample.time - cpu->last_sample_time;
-	duration_us = ktime_us_delta(cpu->sample.time,
+	if ((s64)duration_ns > pid_params.sample_rate_ns * 3
-				     cpu->last_sample_time);
+	    && cpu->last_sample_time > 0) {
-	if (duration_us > sample_time * 3) {
+		sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns),
-		sample_ratio = div_fp(int_tofp(sample_time),
+				      int_tofp(duration_ns));
-				      int_tofp(duration_us));
 		core_busy = mul_fp(core_busy, sample_ratio);
 	}
@@ -1037,23 +1019,17 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
 		sample->freq);
 }
-static void intel_hwp_timer_func(unsigned long __data)
+static void intel_pstate_update_util(struct update_util_data *data, u64 time,
-{
+				     unsigned long util, unsigned long max)
-	struct cpudata *cpu = (struct cpudata *) __data;
-	intel_pstate_sample(cpu);
-	intel_hwp_set_sample_time(cpu);
-}
-static void intel_pstate_timer_func(unsigned long __data)
 {
-	struct cpudata *cpu = (struct cpudata *) __data;
+	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
+	u64 delta_ns = time - cpu->sample.time;
-	intel_pstate_sample(cpu);
+	if ((s64)delta_ns >= pid_params.sample_rate_ns) {
+		intel_pstate_sample(cpu, time);
+		if (!hwp_active)
 			intel_pstate_adjust_busy_pstate(cpu);
+	}
-	intel_pstate_set_sample_time(cpu);
 }
 #define ICPU(model, policy) \
@@ -1101,24 +1077,19 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
 	cpu->cpu = cpunum;
-	if (hwp_active)
+	if (hwp_active) {
 		intel_pstate_hwp_enable(cpu);
+		pid_params.sample_rate_ms = 50;
+		pid_params.sample_rate_ns = 50 * NSEC_PER_MSEC;
+	}
 	intel_pstate_get_cpu_pstates(cpu);
-	init_timer_deferrable(&cpu->timer);
-	cpu->timer.data = (unsigned long)cpu;
-	cpu->timer.expires = jiffies + HZ/100;
-	if (!hwp_active)
-		cpu->timer.function = intel_pstate_timer_func;
-	else
-		cpu->timer.function = intel_hwp_timer_func;
 	intel_pstate_busy_pid_reset(cpu);
-	intel_pstate_sample(cpu);
+	intel_pstate_sample(cpu, 0);
-	add_timer_on(&cpu->timer, cpunum);
+	cpu->update_util.func = intel_pstate_update_util;
+	cpufreq_set_update_util_data(cpunum, &cpu->update_util);
 	pr_debug("intel_pstate: controlling: cpu %d\n", cpunum);
@@ -1202,7 +1173,9 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
 	pr_debug("intel_pstate: CPU %d exiting\n", cpu_num);
-	del_timer_sync(&all_cpu_data[cpu_num]->timer);
+	cpufreq_set_update_util_data(cpu_num, NULL);
+	synchronize_sched();
 	if (hwp_active)
 		return;
@@ -1266,6 +1239,7 @@ static int intel_pstate_msrs_not_valid(void)
 static void copy_pid_params(struct pstate_adjust_policy *policy)
 {
 	pid_params.sample_rate_ms = policy->sample_rate_ms;
+	pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
 	pid_params.p_gain_pct = policy->p_gain_pct;
 	pid_params.i_gain_pct = policy->i_gain_pct;
 	pid_params.d_gain_pct = policy->d_gain_pct;
@@ -1467,7 +1441,8 @@ static int __init intel_pstate_init(void)
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		if (all_cpu_data[cpu]) {
-			del_timer_sync(&all_cpu_data[cpu]->timer);
+			cpufreq_set_update_util_data(cpu, NULL);
+			synchronize_sched();
 			kfree(all_cpu_data[cpu]);
 		}
 	}

--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -80,7 +80,6 @@ struct cpufreq_policy {
 	unsigned int		last_policy; /* policy before unplug */
 	struct cpufreq_governor	*governor; /* see below */
 	void			*governor_data;
-	bool			governor_enabled; /* governor start/stop flag */
 	char			last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
 	struct work_struct	update; /* if update_policy() needs to be
@@ -100,10 +99,6 @@ struct cpufreq_policy {
 	 * - Any routine that will write to the policy structure and/or may take away
 	 *   the policy altogether (eg. CPU hotplug), will hold this lock in write
 	 *   mode before doing so.
-	 *
-	 * Additional rules:
-	 * - Lock should not be held across
-	 *     __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
 	 */
 	struct rw_semaphore	rwsem;

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3207,4 +3207,13 @@ static inline unsigned long rlimit_max(unsigned int limit)
 	return task_rlimit_max(current, limit);
 }
+#ifdef CONFIG_CPU_FREQ
+struct update_util_data {
+	void (*func)(struct update_util_data *data,
+		     u64 time, unsigned long util, unsigned long max);
+};
+void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
+#endif /* CONFIG_CPU_FREQ */
 #endif
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include "sched.h"
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+/**
+ * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ *
+ * Set and publish the update_util_data pointer for the given CPU.  That pointer
+ * points to a struct update_util_data object containing a callback function
+ * to call from cpufreq_update_util().  That function will be called from an RCU
+ * read-side critical section, so it must not sleep.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_set_update_util_data(int cpu, struct update_util_data *data)
+{
+	if (WARN_ON(data && !data->func))
+		return;
+	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq)
 	if (!dl_task(curr) || !on_dl_rq(dl_se))
 		return;
+	/* Kick cpufreq (see the comment in linux/cpufreq.h). */
+	if (cpu_of(rq) == smp_processor_id())
+		cpufreq_trigger_update(rq_clock(rq));
 	/*
 	 * Consumed budget is computed considering the time as
 	 * observed by schedulable tasks (excluding time spent

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 now = cfs_rq_clock_task(cfs_rq);
-	int cpu = cpu_of(rq_of(cfs_rq));
+	struct rq *rq = rq_of(cfs_rq);
+	int cpu = cpu_of(rq);
 	/*
 	 * Track task load average for carrying it to new CPU after migrated, and
@@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
 		update_tg_load_avg(cfs_rq, 0);
+	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+		unsigned long max = rq->cpu_capacity_orig;
+		/*
+		 * There are a few boundary cases this might miss but it should
+		 * get called often enough that that should (hopefully) not be
+		 * a real problem -- added to that it only calls on the local
+		 * CPU, so if we enqueue remotely we'll miss an update, but
+		 * the next tick/schedule should update.
+		 *
+		 * It will not get called when we go idle, because the idle
+		 * thread is a different class (!fair), nor will the utilization
+		 * number include things like RT tasks.
+		 *
+		 * As is, the util number is not freq-invariant (we'd have to
+		 * implement arch_scale_freq_capacity() for that).
+		 *
+		 * See cpu_util().
+		 */
+		cpufreq_update_util(rq_clock(rq),
+				    min(cfs_rq->avg.util_avg, max), max);
+	}
 }
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq)
 	if (curr->sched_class != &rt_sched_class)
 		return;
+	/* Kick cpufreq (see the comment in linux/cpufreq.h). */
+	if (cpu_of(rq) == smp_processor_id())
+		cpufreq_trigger_update(rq_clock(rq));
 	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
 	if (unlikely((s64)delta_exec <= 0))
 		return;

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1738,3 +1738,51 @@ static inline u64 irq_time_read(int cpu)
 }
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @time: Current time.
+ * @util: Current utilization.
+ * @max: Utilization ceiling.
+ *
+ * This function is called by the scheduler on every invocation of
+ * update_load_avg() on the CPU whose utilization is being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ */
+static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
+{
+       struct update_util_data *data;
+       data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+       if (data)
+               data->func(data, time, util, max);
+}
+/**
+ * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
+ * @time: Current time.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid.  Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_trigger_update(u64 time)
+{
+	cpufreq_update_util(time, ULONG_MAX, 0);
+}
+#else
+static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
+static inline void cpufreq_trigger_update(u64 time) {}
+#endif /* CONFIG_CPU_FREQ */