Commit 7e71a133 authored by Rafael J. Wysocki

Merge branches 'pm-cpuidle', 'pm-core' and 'pm-sleep'

Merge cpuidle updates, PM core updates and changes related to system
sleep handling for 6.3-rc1:

 - Make the TEO cpuidle governor check CPU utilization in order to refine
   idle state selection (Kajetan Puchalski).

 - Make Kconfig select the haltpoll cpuidle governor when the haltpoll
   cpuidle driver is selected and replace a default_idle() call in that
   driver with arch_cpu_idle(), which allows MWAIT to be used (Li
   RongQing).

 - Add Emerald Rapids Xeon support to the intel_idle driver (Artem
   Bityutskiy).

 - Add ARCH_SUSPEND_POSSIBLE dependencies for ARMv4 cpuidle drivers to
   avoid randconfig build failures (Arnd Bergmann).

 - Make kobj_type structures used in the cpuidle sysfs interface
   constant (Thomas Weißschuh).

 - Make the cpuidle driver registration code update microsecond values
   of idle state parameters in accordance with their nanosecond values
   if they are provided (Rafael Wysocki).

 - Make the PSCI cpuidle driver prevent topology CPUs from being
   suspended on PREEMPT_RT (Krzysztof Kozlowski).

 - Document that pm_runtime_force_suspend() cannot be used with
   DPM_FLAG_SMART_SUSPEND (Richard Fitzgerald).

 - Add EXPORT macros for exporting PM functions from drivers (Richard
   Fitzgerald).

 - Drop "select SRCU" from system sleep Kconfig (Paul E. McKenney).

 - Remove /** from non-kernel-doc comments in hibernation code (Randy
   Dunlap).

* pm-cpuidle:
  cpuidle: psci: Do not suspend topology CPUs on PREEMPT_RT
  cpuidle: driver: Update microsecond values of state parameters as needed
  cpuidle: sysfs: make kobj_type structures constant
  cpuidle: add ARCH_SUSPEND_POSSIBLE dependencies
  intel_idle: add Emerald Rapids Xeon support
  cpuidle-haltpoll: Replace default_idle() with arch_cpu_idle()
  cpuidle-haltpoll: select haltpoll governor
  cpuidle: teo: Introduce util-awareness
  cpuidle: teo: Optionally skip polling states in teo_find_shallower_state()

* pm-core:
  PM: Add EXPORT macros for exporting PM functions
  PM: runtime: Document that force_suspend() is incompatible with SMART_SUSPEND

* pm-sleep:
  PM: sleep: Remove "select SRCU"
  PM: hibernate: swap: don't use /** for non-kernel-doc comments
......@@ -721,6 +721,7 @@ void arch_cpu_idle(void)
{
x86_idle();
}
EXPORT_SYMBOL_GPL(arch_cpu_idle);
/*
* We use this if we don't have any better idle routine..
......
......@@ -1864,6 +1864,10 @@ static bool pm_runtime_need_not_resume(struct device *dev)
* sure the device is put into low power state and it should only be used during
* system-wide PM transitions to sleep states. It assumes that the analogous
* pm_runtime_force_resume() will be used to resume the device.
*
* Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent
* state where this function has called the ->runtime_suspend callback but the
* PM core marks the driver as runtime active.
*/
int pm_runtime_force_suspend(struct device *dev)
{
......
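To make the documented constraint concrete: a driver that uses pm_runtime_force_suspend() and pm_runtime_force_resume() as its system sleep callbacks simply leaves DPM_FLAG_SMART_SUSPEND unset on the device. A minimal sketch follows; the foo_* names and empty callbacks are invented for illustration and are not part of the patch.

#include <linux/device.h>
#include <linux/pm.h>
#include <linux/pm_runtime.h>

static int foo_runtime_suspend(struct device *dev)
{
	/* Hypothetical: put the device into its low-power state. */
	return 0;
}

static int foo_runtime_resume(struct device *dev)
{
	/* Hypothetical: bring the device back to full power. */
	return 0;
}

static const struct dev_pm_ops foo_pm_ops = {
	SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume)
	/* Note: DPM_FLAG_SMART_SUSPEND is deliberately not set on the device. */
};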
......@@ -74,6 +74,7 @@ endmenu
config HALTPOLL_CPUIDLE
tristate "Halt poll cpuidle driver"
depends on X86 && KVM_GUEST
select CPU_IDLE_GOV_HALTPOLL
default y
help
This option enables halt poll cpuidle driver, which allows to poll
......
......@@ -24,6 +24,14 @@ config ARM_PSCI_CPUIDLE
It provides an idle driver that is capable of detecting and
managing idle states through the PSCI firmware interface.
The driver has limitations when used with PREEMPT_RT:
- If the idle states are described with the non-hierarchical layout,
all idle states are still available.
- If the idle states are described with the hierarchical layout,
only the idle states defined per CPU are available, but not the ones
being shared among a group of CPUs (aka cluster idle states).
config ARM_PSCI_CPUIDLE_DOMAIN
bool "PSCI CPU idle Domain"
depends on ARM_PSCI_CPUIDLE
......@@ -102,6 +110,7 @@ config ARM_MVEBU_V7_CPUIDLE
config ARM_TEGRA_CPUIDLE
bool "CPU Idle Driver for NVIDIA Tegra SoCs"
depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU
depends on ARCH_SUSPEND_POSSIBLE
select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
select ARM_CPU_SUSPEND
help
......@@ -110,6 +119,7 @@ config ARM_TEGRA_CPUIDLE
config ARM_QCOM_SPM_CPUIDLE
bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)"
depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU
depends on ARCH_SUSPEND_POSSIBLE
select ARM_CPU_SUSPEND
select CPU_IDLE_MULTIPLE_DRIVERS
select DT_IDLE_STATES
......
......@@ -32,7 +32,7 @@ static int default_enter_idle(struct cpuidle_device *dev,
local_irq_enable();
return index;
}
-default_idle();
+arch_cpu_idle();
return index;
}
......
......@@ -64,8 +64,11 @@ static int psci_pd_init(struct device_node *np, bool use_osi)
pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN;
-/* Allow power off when OSI has been successfully enabled. */
-if (use_osi)
+/*
+* Allow power off when OSI has been successfully enabled.
+* PREEMPT_RT is not yet ready to enter domain idle states.
+*/
+if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT))
pd->power_off = psci_pd_power_off;
else
pd->flags |= GENPD_FLAG_ALWAYS_ON;
......
......@@ -231,6 +231,9 @@ static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv,
if (!psci_has_osi_support())
return 0;
if (IS_ENABLED(CONFIG_PREEMPT_RT))
return 0;
data->dev = psci_dt_attach_cpu(cpu);
if (IS_ERR_OR_NULL(data->dev))
return PTR_ERR_OR_ZERO(data->dev);
......
......@@ -183,11 +183,15 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
s->target_residency_ns = s->target_residency * NSEC_PER_USEC;
else if (s->target_residency_ns < 0)
s->target_residency_ns = 0;
else
s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC);
if (s->exit_latency > 0)
s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC;
else if (s->exit_latency_ns < 0)
s->exit_latency_ns = 0;
else
s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC);
}
}
......
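Concretely, after this change a driver may fill in only the nanosecond variants of an idle state's parameters and let cpuidle_register_driver() derive the microsecond ones (the reverse direction was already handled). A hypothetical state-table sketch, with invented names and values:

#include <linux/cpuidle.h>

static int foo_enter_idle(struct cpuidle_device *dev,
			  struct cpuidle_driver *drv, int index)
{
	/* Hypothetical: enter the platform's shallow idle state here. */
	return index;
}

static struct cpuidle_driver foo_idle_driver = {
	.name		= "foo_idle",
	.states		= {
		{
			.name			= "FOO-C1",
			.desc			= "hypothetical shallow state",
			/* Only the ns fields are set; registration back-fills the us ones. */
			.exit_latency_ns	= 1500,	/* -> exit_latency = 1 us */
			.target_residency_ns	= 4000,	/* -> target_residency = 4 us */
			.enter			= foo_enter_idle,
		},
	},
	.state_count	= 1,
};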
......@@ -2,8 +2,13 @@
/*
* Timer events oriented CPU idle governor
*
* TEO governor:
* Copyright (C) 2018 - 2021 Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*
* Util-awareness mechanism:
* Copyright (C) 2022 Arm Ltd.
* Author: Kajetan Puchalski <kajetan.puchalski@arm.com>
*/
/**
......@@ -99,14 +104,55 @@
* select the given idle state instead of the candidate one.
*
* 3. By default, select the candidate state.
*
* Util-awareness mechanism:
*
* The idea behind the util-awareness extension is that there are two distinct
* scenarios for the CPU which should result in two different approaches to idle
* state selection - utilized and not utilized.
*
* In this case, 'utilized' means that the average runqueue util of the CPU is
* above a certain threshold.
*
* When the CPU is utilized while going into idle, more likely than not it will
* be woken up to do more work soon and so a shallower idle state should be
* selected to minimise latency and maximise performance. When the CPU is not
* being utilized, the usual metrics-based approach to selecting the deepest
* available idle state should be preferred to take advantage of the power
* saving.
*
* In order to achieve this, the governor uses a utilization threshold.
* The threshold is computed per-CPU as a percentage of the CPU's capacity
* by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%)
* seems to be getting the best results.
*
* Before selecting the next idle state, the governor compares the current CPU
* util to the precomputed util threshold. If it's below, it defaults to the
* TEO metrics mechanism. If it's above, the closest shallower idle state will
* be selected instead, as long as it is not a polling state.
*/
#include <linux/cpuidle.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/topology.h>
#include <linux/tick.h>
/*
* The number of bits to shift the CPU's capacity by in order to determine
* the utilized threshold.
*
* 6 was chosen based on testing as the number that achieved the best balance
* of power and performance on average.
*
* The resulting threshold is high enough to not be triggered by background
* noise and low enough to react quickly when activity starts to ramp up.
*/
#define UTIL_THRESHOLD_SHIFT 6
/*
* The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
* is used for decreasing metrics on a regular basis.
......@@ -137,9 +183,11 @@ struct teo_bin {
* @time_span_ns: Time between idle state selection and post-wakeup update.
* @sleep_length_ns: Time till the closest timer event (at the selection time).
* @state_bins: Idle state data bins for this CPU.
-* @total: Grand total of the "intercepts" and "hits" mertics for all bins.
+* @total: Grand total of the "intercepts" and "hits" metrics for all bins.
* @next_recent_idx: Index of the next @recent_idx entry to update.
* @recent_idx: Indices of bins corresponding to recent "intercepts".
* @util_threshold: Threshold above which the CPU is considered utilized
* @utilized: Whether the last sleep on the CPU happened while utilized
*/
struct teo_cpu {
s64 time_span_ns;
......@@ -148,10 +196,29 @@ struct teo_cpu {
unsigned int total;
int next_recent_idx;
int recent_idx[NR_RECENT];
unsigned long util_threshold;
bool utilized;
};
static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
/**
* teo_cpu_is_utilized - Check if the CPU's util is above the threshold
* @cpu: Target CPU
* @cpu_data: Governor CPU data for the target CPU
*/
#ifdef CONFIG_SMP
static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
{
return sched_cpu_util(cpu) > cpu_data->util_threshold;
}
#else
static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
{
return false;
}
#endif
/**
* teo_update - Update CPU metrics after wakeup.
* @drv: cpuidle driver containing state data.
......@@ -258,15 +325,17 @@ static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv)
* @dev: Target CPU.
* @state_idx: Index of the capping idle state.
* @duration_ns: Idle duration value to match.
* @no_poll: Don't consider polling states.
*/
static int teo_find_shallower_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int state_idx,
-s64 duration_ns)
+s64 duration_ns, bool no_poll)
{
int i;
for (i = state_idx - 1; i >= 0; i--) {
-if (dev->states_usage[i].disable)
+if (dev->states_usage[i].disable ||
+(no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
continue;
state_idx = i;
......@@ -321,6 +390,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
goto end;
}
cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data);
/*
* If the CPU is being utilized over the threshold and there are only 2
* states to choose from, the metrics need not be considered, so choose
* the shallowest non-polling state and exit.
*/
if (drv->state_count < 3 && cpu_data->utilized) {
for (i = 0; i < drv->state_count; ++i) {
if (!dev->states_usage[i].disable &&
!(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) {
idx = i;
goto end;
}
}
}
/*
* Find the deepest idle state whose target residency does not exceed
* the current sleep length and the deepest idle state not deeper than
......@@ -452,6 +537,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
if (idx > constraint_idx)
idx = constraint_idx;
/*
* If the CPU is being utilized over the threshold, choose a shallower
* non-polling state to improve latency
*/
if (cpu_data->utilized)
idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true);
end:
/*
* Don't stop the tick if the selected state is a polling one or if the
......@@ -469,7 +561,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
if (idx > idx0 &&
drv->states[idx].target_residency_ns > delta_tick)
-idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
+idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
}
return idx;
......@@ -508,9 +600,11 @@ static int teo_enable_device(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
{
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu);
int i;
memset(cpu_data, 0, sizeof(*cpu_data));
cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT;
for (i = 0; i < NR_RECENT; i++)
cpu_data->recent_idx[i] = -1;
......
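For a rough sense of the numbers: with the default shift of 6, a CPU whose arch_scale_cpu_capacity() is 1024 (a common maximum value, used here purely as an illustration) ends up with a threshold of 16, so it is treated as utilized once its runqueue util exceeds about 1.56% of its capacity. A sketch that mirrors the per-CPU computation done in teo_enable_device() and the check in teo_cpu_is_utilized():

#include <linux/sched.h>

/* Illustration only; capacity value invented for the example. */
static bool example_cpu_is_utilized(int cpu)
{
	unsigned long max_capacity = 1024;		/* arch_scale_cpu_capacity(cpu) */
	unsigned long threshold = max_capacity >> 6;	/* 1024 >> 6 == 16, ~1.56% */

	return sched_cpu_util(cpu) > threshold;		/* CONFIG_SMP builds only */
}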
......@@ -200,7 +200,7 @@ static void cpuidle_sysfs_release(struct kobject *kobj)
complete(&kdev->kobj_unregister);
}
-static struct kobj_type ktype_cpuidle = {
+static const struct kobj_type ktype_cpuidle = {
.sysfs_ops = &cpuidle_sysfs_ops,
.release = cpuidle_sysfs_release,
};
......@@ -447,7 +447,7 @@ static void cpuidle_state_sysfs_release(struct kobject *kobj)
complete(&state_obj->kobj_unregister);
}
-static struct kobj_type ktype_state_cpuidle = {
+static const struct kobj_type ktype_state_cpuidle = {
.sysfs_ops = &cpuidle_state_sysfs_ops,
.default_groups = cpuidle_state_default_groups,
.release = cpuidle_state_sysfs_release,
......@@ -594,7 +594,7 @@ static struct attribute *cpuidle_driver_default_attrs[] = {
};
ATTRIBUTE_GROUPS(cpuidle_driver_default);
-static struct kobj_type ktype_driver_cpuidle = {
+static const struct kobj_type ktype_driver_cpuidle = {
.sysfs_ops = &cpuidle_driver_sysfs_ops,
.default_groups = cpuidle_driver_default_groups,
.release = cpuidle_driver_sysfs_release,
......
......@@ -1430,6 +1430,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &idle_cpu_adl_n),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &idle_cpu_spr),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt),
......@@ -1862,6 +1863,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
skx_idle_state_table_update();
break;
case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_EMERALDRAPIDS_X:
spr_idle_state_table_update();
break;
case INTEL_FAM6_ALDERLAKE:
......
......@@ -379,9 +379,13 @@ const struct dev_pm_ops name = { \
const struct dev_pm_ops name; \
__EXPORT_SYMBOL(name, sec, ns); \
const struct dev_pm_ops name
#define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name)
#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, ns)
#else
#define _EXPORT_DEV_PM_OPS(name, sec, ns) \
static __maybe_unused const struct dev_pm_ops __static_##name
#define EXPORT_PM_FN_GPL(name)
#define EXPORT_PM_FN_NS_GPL(name, ns)
#endif
#define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "")
......
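The new macros follow the existing pattern in this header: under CONFIG_PM, EXPORT_PM_FN_GPL() and EXPORT_PM_FN_NS_GPL() expand to EXPORT_SYMBOL_GPL() and EXPORT_SYMBOL_NS_GPL(); without CONFIG_PM they expand to nothing, so the symbol is only exported in PM-enabled builds. A hypothetical usage sketch (the foo_* name is invented):

#include <linux/device.h>
#include <linux/pm.h>

/* A PM helper that a driver library wants to export only when CONFIG_PM is set. */
int foo_lib_runtime_suspend(struct device *dev)
{
	/* Hypothetical device-specific suspend work. */
	return 0;
}
EXPORT_PM_FN_GPL(foo_lib_runtime_suspend);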
......@@ -118,7 +118,6 @@ config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
select PM
-select SRCU
config PM_SLEEP_SMP
def_bool y
......
......@@ -581,7 +581,7 @@ static int save_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for CRC32.
*/
struct crc_data {
......@@ -596,7 +596,7 @@ struct crc_data {
unsigned char *unc[LZO_THREADS]; /* uncompressed data */
};
-/**
+/*
* CRC32 update function that runs in its own thread.
*/
static int crc32_threadfn(void *data)
......@@ -623,7 +623,7 @@ static int crc32_threadfn(void *data)
}
return 0;
}
-/**
+/*
* Structure used for LZO data compression.
*/
struct cmp_data {
......@@ -640,7 +640,7 @@ struct cmp_data {
unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
};
-/**
+/*
* Compression function that runs in its own thread.
*/
static int lzo_compress_threadfn(void *data)
......@@ -948,9 +948,9 @@ int swsusp_write(unsigned int flags)
return error;
}
-/**
+/*
* The following functions allow us to read data using a swap map
-* in a file-alike way
+* in a file-like way.
*/
static void release_swap_reader(struct swap_map_handle *handle)
......@@ -1107,7 +1107,7 @@ static int load_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for LZO data decompression.
*/
struct dec_data {
......@@ -1123,7 +1123,7 @@ struct dec_data {
unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
};
-/**
+/*
* Decompression function that runs in its own thread.
*/
static int lzo_decompress_threadfn(void *data)
......