Commit 4ed39004 authored by Rafael J. Wysocki's avatar Rafael J. Wysocki

Merge branch 'pm-cpufreq'

* pm-cpufreq: (94 commits)
  intel_pstate: Do not skip samples partially
  intel_pstate: Remove freq calculation from intel_pstate_calc_busy()
  intel_pstate: Move intel_pstate_calc_busy() into get_target_pstate_use_performance()
  intel_pstate: Optimize calculation for max/min_perf_adj
  intel_pstate: Remove extra conversions in pid calculation
  cpufreq: Move scheduler-related code to the sched directory
  Revert "cpufreq: postfix policy directory with the first CPU in related_cpus"
  cpufreq: Reduce cpufreq_update_util() overhead a bit
  cpufreq: Select IRQ_WORK if CPU_FREQ_GOV_COMMON is set
  cpufreq: Remove 'policy->governor_enabled'
  cpufreq: Rename __cpufreq_governor() to cpufreq_governor()
  cpufreq: Relocate handle_update() to kill its declaration
  cpufreq: governor: Drop unnecessary checks from show() and store()
  cpufreq: governor: Fix race in dbs_update_util_handler()
  cpufreq: governor: Make gov_set_update_util() static
  cpufreq: governor: Narrow down the dbs_data_mutex coverage
  cpufreq: governor: Make dbs_data_mutex static
  cpufreq: governor: Relocate definitions of tuners structures
  cpufreq: governor: Move per-CPU data to the common code
  cpufreq: governor: Make governor private data per-policy
  ...
parents b5d5fad9 4fec7ad5
......@@ -25,7 +25,7 @@ callback, so cpufreq core can't request a transition to a specific frequency.
The driver provides minimum and maximum frequency limits and callbacks to set a
policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
The cpufreq core can request the driver to operate in any of the two policies:
"performance: and "powersave". The driver decides which frequency to use based
"performance" and "powersave". The driver decides which frequency to use based
on the above policy selection considering minimum and maximum frequency limits.
The Intel P-State driver falls under the latter category, which implements the
......
......@@ -19,6 +19,7 @@ config CPU_FREQ
if CPU_FREQ
config CPU_FREQ_GOV_COMMON
select IRQ_WORK
bool
config CPU_FREQ_BOOST_SW
......
......@@ -70,6 +70,8 @@ struct acpi_cpufreq_data {
unsigned int cpu_feature;
unsigned int acpi_perf_cpu;
cpumask_var_t freqdomain_cpus;
void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val);
u32 (*cpu_freq_read)(struct acpi_pct_register *reg);
};
/* acpi_perf_data is a pointer to percpu data. */
......@@ -243,125 +245,119 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
}
}
struct msr_addr {
u32 reg;
};
u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
{
u32 val, dummy;
struct io_addr {
u16 port;
u8 bit_width;
};
rdmsr(MSR_IA32_PERF_CTL, val, dummy);
return val;
}
void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
{
u32 lo, hi;
rdmsr(MSR_IA32_PERF_CTL, lo, hi);
lo = (lo & ~INTEL_MSR_RANGE) | (val & INTEL_MSR_RANGE);
wrmsr(MSR_IA32_PERF_CTL, lo, hi);
}
u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
{
u32 val, dummy;
rdmsr(MSR_AMD_PERF_CTL, val, dummy);
return val;
}
void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val)
{
wrmsr(MSR_AMD_PERF_CTL, val, 0);
}
u32 cpu_freq_read_io(struct acpi_pct_register *reg)
{
u32 val;
acpi_os_read_port(reg->address, &val, reg->bit_width);
return val;
}
void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val)
{
acpi_os_write_port(reg->address, val, reg->bit_width);
}
struct drv_cmd {
unsigned int type;
const struct cpumask *mask;
union {
struct msr_addr msr;
struct io_addr io;
} addr;
struct acpi_pct_register *reg;
u32 val;
union {
void (*write)(struct acpi_pct_register *reg, u32 val);
u32 (*read)(struct acpi_pct_register *reg);
} func;
};
/* Called via smp_call_function_single(), on the target CPU */
static void do_drv_read(void *_cmd)
{
struct drv_cmd *cmd = _cmd;
u32 h;
switch (cmd->type) {
case SYSTEM_INTEL_MSR_CAPABLE:
case SYSTEM_AMD_MSR_CAPABLE:
rdmsr(cmd->addr.msr.reg, cmd->val, h);
break;
case SYSTEM_IO_CAPABLE:
acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
&cmd->val,
(u32)cmd->addr.io.bit_width);
break;
default:
break;
}
cmd->val = cmd->func.read(cmd->reg);
}
/* Called via smp_call_function_many(), on the target CPUs */
static void do_drv_write(void *_cmd)
static u32 drv_read(struct acpi_cpufreq_data *data, const struct cpumask *mask)
{
struct drv_cmd *cmd = _cmd;
u32 lo, hi;
struct acpi_processor_performance *perf = to_perf_data(data);
struct drv_cmd cmd = {
.reg = &perf->control_register,
.func.read = data->cpu_freq_read,
};
int err;
switch (cmd->type) {
case SYSTEM_INTEL_MSR_CAPABLE:
rdmsr(cmd->addr.msr.reg, lo, hi);
lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
wrmsr(cmd->addr.msr.reg, lo, hi);
break;
case SYSTEM_AMD_MSR_CAPABLE:
wrmsr(cmd->addr.msr.reg, cmd->val, 0);
break;
case SYSTEM_IO_CAPABLE:
acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
cmd->val,
(u32)cmd->addr.io.bit_width);
break;
default:
break;
}
err = smp_call_function_any(mask, do_drv_read, &cmd, 1);
WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
return cmd.val;
}
static void drv_read(struct drv_cmd *cmd)
/* Called via smp_call_function_many(), on the target CPUs */
static void do_drv_write(void *_cmd)
{
int err;
cmd->val = 0;
struct drv_cmd *cmd = _cmd;
err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
cmd->func.write(cmd->reg, cmd->val);
}
static void drv_write(struct drv_cmd *cmd)
static void drv_write(struct acpi_cpufreq_data *data,
const struct cpumask *mask, u32 val)
{
struct acpi_processor_performance *perf = to_perf_data(data);
struct drv_cmd cmd = {
.reg = &perf->control_register,
.val = val,
.func.write = data->cpu_freq_write,
};
int this_cpu;
this_cpu = get_cpu();
if (cpumask_test_cpu(this_cpu, cmd->mask))
do_drv_write(cmd);
smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
if (cpumask_test_cpu(this_cpu, mask))
do_drv_write(&cmd);
smp_call_function_many(mask, do_drv_write, &cmd, 1);
put_cpu();
}
static u32
get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data)
static u32 get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data)
{
struct acpi_processor_performance *perf;
struct drv_cmd cmd;
u32 val;
if (unlikely(cpumask_empty(mask)))
return 0;
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
break;
case SYSTEM_AMD_MSR_CAPABLE:
cmd.type = SYSTEM_AMD_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_AMD_PERF_CTL;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
perf = to_perf_data(data);
cmd.addr.io.port = perf->control_register.address;
cmd.addr.io.bit_width = perf->control_register.bit_width;
break;
default:
return 0;
}
cmd.mask = mask;
drv_read(&cmd);
val = drv_read(data, mask);
pr_debug("get_cur_val = %u\n", cmd.val);
pr_debug("get_cur_val = %u\n", val);
return cmd.val;
return val;
}
static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
......@@ -416,7 +412,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
{
struct acpi_cpufreq_data *data = policy->driver_data;
struct acpi_processor_performance *perf;
struct drv_cmd cmd;
const struct cpumask *mask;
unsigned int next_perf_state = 0; /* Index into perf table */
int result = 0;
......@@ -434,42 +430,21 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
} else {
pr_debug("Already at target state (P%d)\n",
next_perf_state);
goto out;
}
return 0;
}
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
cmd.val = (u32) perf->states[next_perf_state].control;
break;
case SYSTEM_AMD_MSR_CAPABLE:
cmd.type = SYSTEM_AMD_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_AMD_PERF_CTL;
cmd.val = (u32) perf->states[next_perf_state].control;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
cmd.addr.io.port = perf->control_register.address;
cmd.addr.io.bit_width = perf->control_register.bit_width;
cmd.val = (u32) perf->states[next_perf_state].control;
break;
default:
result = -ENODEV;
goto out;
}
/* cpufreq holds the hotplug lock, so we are safe from here on */
if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
cmd.mask = policy->cpus;
else
cmd.mask = cpumask_of(policy->cpu);
/*
* The core won't allow CPUs to go away until the governor has been
* stopped, so we can rely on the stability of policy->cpus.
*/
mask = policy->shared_type == CPUFREQ_SHARED_TYPE_ANY ?
cpumask_of(policy->cpu) : policy->cpus;
drv_write(&cmd);
drv_write(data, mask, perf->states[next_perf_state].control);
if (acpi_pstate_strict) {
if (!check_freqs(cmd.mask, data->freq_table[index].frequency,
if (!check_freqs(mask, data->freq_table[index].frequency,
data)) {
pr_debug("acpi_cpufreq_target failed (%d)\n",
policy->cpu);
......@@ -480,7 +455,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
if (!result)
perf->state = next_perf_state;
out:
return result;
}
......@@ -740,15 +714,21 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
}
pr_debug("SYSTEM IO addr space\n");
data->cpu_feature = SYSTEM_IO_CAPABLE;
data->cpu_freq_read = cpu_freq_read_io;
data->cpu_freq_write = cpu_freq_write_io;
break;
case ACPI_ADR_SPACE_FIXED_HARDWARE:
pr_debug("HARDWARE addr space\n");
if (check_est_cpu(cpu)) {
data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
data->cpu_freq_read = cpu_freq_read_intel;
data->cpu_freq_write = cpu_freq_write_intel;
break;
}
if (check_amd_hwpstate_cpu(cpu)) {
data->cpu_feature = SYSTEM_AMD_MSR_CAPABLE;
data->cpu_freq_read = cpu_freq_read_amd;
data->cpu_freq_write = cpu_freq_write_amd;
break;
}
result = -ENODEV;
......
......@@ -21,7 +21,7 @@
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include "cpufreq_governor.h"
#include "cpufreq_ondemand.h"
#define MSR_AMD64_FREQ_SENSITIVITY_ACTUAL 0xc0010080
#define MSR_AMD64_FREQ_SENSITIVITY_REFERENCE 0xc0010081
......@@ -45,10 +45,10 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy,
long d_actual, d_reference;
struct msr actual, reference;
struct cpu_data_t *data = &per_cpu(cpu_data, policy->cpu);
struct dbs_data *od_data = policy->governor_data;
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *od_data = policy_dbs->dbs_data;
struct od_dbs_tuners *od_tuners = od_data->tuners;
struct od_cpu_dbs_info_s *od_info =
od_data->cdata->get_cpu_dbs_info_s(policy->cpu);
struct od_policy_dbs_info *od_info = to_dbs_info(policy_dbs);
if (!od_info->freq_table)
return freq_next;
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/*
* Header file for CPUFreq ondemand governor and related code.
*
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include "cpufreq_governor.h"
struct od_policy_dbs_info {
struct policy_dbs_info policy_dbs;
struct cpufreq_frequency_table *freq_table;
unsigned int freq_lo;
unsigned int freq_lo_delay_us;
unsigned int freq_hi_delay_us;
unsigned int sample_type:1;
};
static inline struct od_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs)
{
return container_of(policy_dbs, struct od_policy_dbs_info, policy_dbs);
}
struct od_dbs_tuners {
unsigned int powersave_bias;
};
......@@ -33,10 +33,7 @@ static int cpufreq_governor_performance(struct cpufreq_policy *policy,
return 0;
}
#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE
static
#endif
struct cpufreq_governor cpufreq_gov_performance = {
static struct cpufreq_governor cpufreq_gov_performance = {
.name = "performance",
.governor = cpufreq_governor_performance,
.owner = THIS_MODULE,
......@@ -52,6 +49,19 @@ static void __exit cpufreq_gov_performance_exit(void)
cpufreq_unregister_governor(&cpufreq_gov_performance);
}
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &cpufreq_gov_performance;
}
#endif
#ifndef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE
struct cpufreq_governor *cpufreq_fallback_governor(void)
{
return &cpufreq_gov_performance;
}
#endif
MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
MODULE_DESCRIPTION("CPUfreq policy governor 'performance'");
MODULE_LICENSE("GPL");
......
......@@ -33,10 +33,7 @@ static int cpufreq_governor_powersave(struct cpufreq_policy *policy,
return 0;
}
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE
static
#endif
struct cpufreq_governor cpufreq_gov_powersave = {
static struct cpufreq_governor cpufreq_gov_powersave = {
.name = "powersave",
.governor = cpufreq_governor_powersave,
.owner = THIS_MODULE,
......@@ -57,6 +54,11 @@ MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'");
MODULE_LICENSE("GPL");
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &cpufreq_gov_powersave;
}
fs_initcall(cpufreq_gov_powersave_init);
#else
module_init(cpufreq_gov_powersave_init);
......
......@@ -89,10 +89,7 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy,
return rc;
}
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE
static
#endif
struct cpufreq_governor cpufreq_gov_userspace = {
static struct cpufreq_governor cpufreq_gov_userspace = {
.name = "userspace",
.governor = cpufreq_governor_userspace,
.store_setspeed = cpufreq_set,
......@@ -116,6 +113,11 @@ MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'");
MODULE_LICENSE("GPL");
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &cpufreq_gov_userspace;
}
fs_initcall(cpufreq_gov_userspace_init);
#else
module_init(cpufreq_gov_userspace_init);
......
This diff is collapsed.
......@@ -28,6 +28,8 @@
#include <linux/of.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <trace/events/power.h>
#include <asm/cputhreads.h>
#include <asm/firmware.h>
......@@ -42,13 +44,24 @@
static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
static bool rebooting, throttled, occ_reset;
static unsigned int *core_to_chip_map;
static const char * const throttle_reason[] = {
"No throttling",
"Power Cap",
"Processor Over Temperature",
"Power Supply Failure",
"Over Current",
"OCC Reset"
};
static struct chip {
unsigned int id;
bool throttled;
bool restore;
u8 throttle_reason;
cpumask_t mask;
struct work_struct throttle;
bool restore;
} *chips;
static int nr_chips;
......@@ -312,13 +325,14 @@ static inline unsigned int get_nominal_index(void)
static void powernv_cpufreq_throttle_check(void *data)
{
unsigned int cpu = smp_processor_id();
unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)];
unsigned long pmsr;
int pmsr_pmax, i;
pmsr = get_pmspr(SPRN_PMSR);
for (i = 0; i < nr_chips; i++)
if (chips[i].id == cpu_to_chip_id(cpu))
if (chips[i].id == chip_id)
break;
/* Check for Pmax Capping */
......@@ -328,17 +342,17 @@ static void powernv_cpufreq_throttle_check(void *data)
goto next;
chips[i].throttled = true;
if (pmsr_pmax < powernv_pstate_info.nominal)
pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
cpu, chips[i].id, pmsr_pmax,
powernv_pstate_info.nominal);
else
pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n",
cpu, chips[i].id, pmsr_pmax,
powernv_pstate_info.max);
trace_powernv_throttle(chips[i].id,
throttle_reason[chips[i].throttle_reason],
pmsr_pmax);
} else if (chips[i].throttled) {
chips[i].throttled = false;
pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu,
chips[i].id, pmsr_pmax);
trace_powernv_throttle(chips[i].id,
throttle_reason[chips[i].throttle_reason],
pmsr_pmax);
}
/* Check if Psafe_mode_active is set in PMSR. */
......@@ -356,7 +370,7 @@ static void powernv_cpufreq_throttle_check(void *data)
if (throttled) {
pr_info("PMSR = %16lx\n", pmsr);
pr_crit("CPU Frequency could be throttled\n");
pr_warn("CPU Frequency could be throttled\n");
}
}
......@@ -423,18 +437,19 @@ void powernv_cpufreq_work_fn(struct work_struct *work)
{
struct chip *chip = container_of(work, struct chip, throttle);
unsigned int cpu;
cpumask_var_t mask;
cpumask_t mask;
smp_call_function_any(&chip->mask,
get_online_cpus();
cpumask_and(&mask, &chip->mask, cpu_online_mask);
smp_call_function_any(&mask,
powernv_cpufreq_throttle_check, NULL, 0);
if (!chip->restore)
return;
goto out;
chip->restore = false;
cpumask_copy(mask, &chip->mask);
for_each_cpu_and(cpu, mask, cpu_online_mask) {
int index, tcpu;
for_each_cpu(cpu, &mask) {
int index;
struct cpufreq_policy policy;
cpufreq_get_policy(&policy, cpu);
......@@ -442,20 +457,12 @@ void powernv_cpufreq_work_fn(struct work_struct *work)
policy.cur,
CPUFREQ_RELATION_C, &index);
powernv_cpufreq_target_index(&policy, index);
for_each_cpu(tcpu, policy.cpus)
cpumask_clear_cpu(tcpu, mask);
cpumask_andnot(&mask, &mask, policy.cpus);
}
out:
put_online_cpus();
}
static char throttle_reason[][30] = {
"No throttling",
"Power Cap",
"Processor Over Temperature",
"Power Supply Failure",
"Over Current",
"OCC Reset"
};
static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
unsigned long msg_type, void *_msg)
{
......@@ -481,7 +488,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
*/
if (!throttled) {
throttled = true;
pr_crit("CPU frequency is throttled for duration\n");
pr_warn("CPU frequency is throttled for duration\n");
}
break;
......@@ -505,24 +512,19 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
return 0;
}
if (omsg.throttle_status &&
for (i = 0; i < nr_chips; i++)
if (chips[i].id == omsg.chip)
break;
if (omsg.throttle_status >= 0 &&
omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS)
pr_info("OCC: Chip %u Pmax reduced due to %s\n",
(unsigned int)omsg.chip,
throttle_reason[omsg.throttle_status]);
else if (!omsg.throttle_status)
pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip,
throttle_reason[omsg.throttle_status]);
else
return 0;
chips[i].throttle_reason = omsg.throttle_status;
for (i = 0; i < nr_chips; i++)
if (chips[i].id == omsg.chip) {
if (!omsg.throttle_status)
chips[i].restore = true;
schedule_work(&chips[i].throttle);
}
}
return 0;
}
......@@ -556,29 +558,54 @@ static int init_chip_info(void)
unsigned int chip[256];
unsigned int cpu, i;
unsigned int prev_chip_id = UINT_MAX;
cpumask_t cpu_mask;
int ret = -ENOMEM;
core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int),
GFP_KERNEL);
if (!core_to_chip_map)
goto out;
for_each_possible_cpu(cpu) {
cpumask_copy(&cpu_mask, cpu_possible_mask);
for_each_cpu(cpu, &cpu_mask) {
unsigned int id = cpu_to_chip_id(cpu);
if (prev_chip_id != id) {
prev_chip_id = id;
chip[nr_chips++] = id;
}
core_to_chip_map[cpu_core_index_of_thread(cpu)] = id;
cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu));
}
chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL);
chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
if (!chips)
return -ENOMEM;
goto free_chip_map;
for (i = 0; i < nr_chips; i++) {
chips[i].id = chip[i];
chips[i].throttled = false;
cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
chips[i].restore = false;
}
return 0;
free_chip_map:
kfree(core_to_chip_map);
out:
return ret;
}
static inline void clean_chip_info(void)
{
kfree(chips);
kfree(core_to_chip_map);
}
static inline void unregister_all_notifiers(void)
{
opal_message_notifier_unregister(OPAL_MSG_OCC,
&powernv_cpufreq_opal_nb);
unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
}
static int __init powernv_cpufreq_init(void)
......@@ -591,28 +618,35 @@ static int __init powernv_cpufreq_init(void)
/* Discover pstates from device tree and init */
rc = init_powernv_pstates();
if (rc) {
pr_info("powernv-cpufreq disabled. System does not support PState control\n");
return rc;
}
if (rc)
goto out;
/* Populate chip info */
rc = init_chip_info();
if (rc)
return rc;
goto out;
register_reboot_notifier(&powernv_cpufreq_reboot_nb);
opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);
return cpufreq_register_driver(&powernv_cpufreq_driver);
rc = cpufreq_register_driver(&powernv_cpufreq_driver);
if (!rc)
return 0;
pr_info("Failed to register the cpufreq driver (%d)\n", rc);
unregister_all_notifiers();
clean_chip_info();
out:
pr_info("Platform driver disabled. System does not support PState control\n");
return rc;
}
module_init(powernv_cpufreq_init);
static void __exit powernv_cpufreq_exit(void)
{
unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
opal_message_notifier_unregister(OPAL_MSG_OCC,
&powernv_cpufreq_opal_nb);
cpufreq_unregister_driver(&powernv_cpufreq_driver);
unregister_all_notifiers();
clean_chip_info();
}
module_exit(powernv_cpufreq_exit);
......
......@@ -80,7 +80,6 @@ struct cpufreq_policy {
unsigned int last_policy; /* policy before unplug */
struct cpufreq_governor *governor; /* see below */
void *governor_data;
bool governor_enabled; /* governor start/stop flag */
char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
struct work_struct update; /* if update_policy() needs to be
......@@ -100,10 +99,6 @@ struct cpufreq_policy {
* - Any routine that will write to the policy structure and/or may take away
* the policy altogether (eg. CPU hotplug), will hold this lock in write
* mode before doing so.
*
* Additional rules:
* - Lock should not be held across
* __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
*/
struct rw_semaphore rwsem;
......@@ -464,29 +459,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
int cpufreq_register_governor(struct cpufreq_governor *governor);
void cpufreq_unregister_governor(struct cpufreq_governor *governor);
/* CPUFREQ DEFAULT GOVERNOR */
/*
* Performance governor is fallback governor if any other gov failed to auto
* load due latency restrictions
*/
#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
extern struct cpufreq_governor cpufreq_gov_performance;
#endif
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_performance)
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE)
extern struct cpufreq_governor cpufreq_gov_powersave;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_powersave)
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE)
extern struct cpufreq_governor cpufreq_gov_userspace;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_userspace)
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND)
extern struct cpufreq_governor cpufreq_gov_ondemand;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_ondemand)
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
extern struct cpufreq_governor cpufreq_gov_conservative;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative)
#endif
struct cpufreq_governor *cpufreq_default_governor(void);
struct cpufreq_governor *cpufreq_fallback_governor(void);
/*********************************************************************
* FREQUENCY TABLE HELPERS *
......@@ -525,16 +499,6 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
}
#endif
static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos)
{
while ((*pos)->frequency != CPUFREQ_TABLE_END)
if ((*pos)->frequency != CPUFREQ_ENTRY_INVALID)
return true;
else
(*pos)++;
return false;
}
/*
* cpufreq_for_each_entry - iterate over a cpufreq_frequency_table
* @pos: the cpufreq_frequency_table * to use as a loop cursor.
......@@ -552,7 +516,10 @@ static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos)
*/
#define cpufreq_for_each_valid_entry(pos, table) \
for (pos = table; cpufreq_next_valid(&pos); pos++)
for (pos = table; pos->frequency != CPUFREQ_TABLE_END; pos++) \
if (pos->frequency == CPUFREQ_ENTRY_INVALID) \
continue; \
else
int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table);
......
......@@ -3207,4 +3207,13 @@ static inline unsigned long rlimit_max(unsigned int limit)
return task_rlimit_max(current, limit);
}
#ifdef CONFIG_CPU_FREQ
struct update_util_data {
void (*func)(struct update_util_data *data,
u64 time, unsigned long util, unsigned long max);
};
void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
#endif /* CONFIG_CPU_FREQ */
#endif
......@@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle,
TP_ARGS(state, cpu_id)
);
TRACE_EVENT(powernv_throttle,
TP_PROTO(int chip_id, const char *reason, int pmax),
TP_ARGS(chip_id, reason, pmax),
TP_STRUCT__entry(
__field(int, chip_id)
__string(reason, reason)
__field(int, pmax)
),
TP_fast_assign(
__entry->chip_id = chip_id;
__assign_str(reason, reason);
__entry->pmax = pmax;
),
TP_printk("Chip %d Pmax %d %s", __entry->chip_id,
__entry->pmax, __get_str(reason))
);
TRACE_EVENT(pstate_sample,
TP_PROTO(u32 core_busy,
......
......@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
/*
* Scheduler code and data structures related to cpufreq.
*
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
/**
* cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer.
* @cpu: The CPU to set the pointer for.
* @data: New pointer value.
*
* Set and publish the update_util_data pointer for the given CPU. That pointer
* points to a struct update_util_data object containing a callback function
* to call from cpufreq_update_util(). That function will be called from an RCU
* read-side critical section, so it must not sleep.
*
* Callers must use RCU-sched callbacks to free any memory that might be
* accessed via the old update_util_data pointer or invoke synchronize_sched()
* right after this function to avoid use-after-free.
*/
void cpufreq_set_update_util_data(int cpu, struct update_util_data *data)
{
if (WARN_ON(data && !data->func))
return;
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
}
EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
......@@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq)
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
/* Kick cpufreq (see the comment in linux/cpufreq.h). */
if (cpu_of(rq) == smp_processor_id())
cpufreq_trigger_update(rq_clock(rq));
/*
* Consumed budget is computed considering the time as
* observed by schedulable tasks (excluding time spent
......
......@@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
int cpu = cpu_of(rq_of(cfs_rq));
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
/*
* Track task load average for carrying it to new CPU after migrated, and
......@@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
update_tg_load_avg(cfs_rq, 0);
if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
unsigned long max = rq->cpu_capacity_orig;
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
* a real problem -- added to that it only calls on the local
* CPU, so if we enqueue remotely we'll miss an update, but
* the next tick/schedule should update.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
* number include things like RT tasks.
*
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
* See cpu_util().
*/
cpufreq_update_util(rq_clock(rq),
min(cfs_rq->avg.util_avg, max), max);
}
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
......
......@@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq)
if (curr->sched_class != &rt_sched_class)
return;
/* Kick cpufreq (see the comment in linux/cpufreq.h). */
if (cpu_of(rq) == smp_processor_id())
cpufreq_trigger_update(rq_clock(rq));
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
......
......@@ -1738,3 +1738,51 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
/**
* cpufreq_update_util - Take a note about CPU utilization changes.
* @time: Current time.
* @util: Current utilization.
* @max: Utilization ceiling.
*
* This function is called by the scheduler on every invocation of
* update_load_avg() on the CPU whose utilization is being updated.
*
* It can only be called from RCU-sched read-side critical sections.
*/
static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
{
struct update_util_data *data;
data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
if (data)
data->func(data, time, util, max);
}
/**
* cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
* @time: Current time.
*
* The way cpufreq is currently arranged requires it to evaluate the CPU
* performance state (frequency/voltage) on a regular basis to prevent it from
* being stuck in a completely inadequate performance level for too long.
* That is not guaranteed to happen if the updates are only triggered from CFS,
* though, because they may not be coming in if RT or deadline tasks are active
* all the time (or there are RT and DL tasks only).
*
* As a workaround for that issue, this function is called by the RT and DL
* sched classes to trigger extra cpufreq updates to prevent it from stalling,
* but that really is a band-aid. Going forward it should be replaced with
* solutions targeted more specifically at RT and DL tasks.
*/
static inline void cpufreq_trigger_update(u64 time)
{
cpufreq_update_util(time, ULONG_MAX, 0);
}
#else
static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
static inline void cpufreq_trigger_update(u64 time) {}
#endif /* CONFIG_CPU_FREQ */
......@@ -15,4 +15,5 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment