Commit 3527d3e9 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - another round of rq-clock handling debugging, robustization and
     fixes

   - PELT accounting improvements

   - CPU hotplug related ->cpus_allowed affinity handling fixes all
     around the tree

   - ... plus misc fixes, cleanups and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (35 commits)
  sched/x86: Update reschedule warning text
  crypto: N2 - Replace racy task affinity logic
  cpufreq/sparc-us2e: Replace racy task affinity logic
  cpufreq/sparc-us3: Replace racy task affinity logic
  cpufreq/sh: Replace racy task affinity logic
  cpufreq/ia64: Replace racy task affinity logic
  ACPI/processor: Replace racy task affinity logic
  ACPI/processor: Fix error handling in __acpi_processor_start()
  sparc/sysfs: Replace racy task affinity logic
  powerpc/smp: Replace open coded task affinity logic
  ia64/sn/hwperf: Replace racy task affinity logic
  ia64/salinfo: Replace racy task affinity logic
  workqueue: Provide work_on_cpu_safe()
  ia64/topology: Remove cpus_allowed manipulation
  sched/fair: Move the PELT constants into a generated header
  sched/fair: Increase PELT accuracy for small tasks
  sched/fair: Fix comments
  sched/Documentation: Add 'sched-pelt' tool
  sched/fair: Fix corner case in __accumulate_sum()
  sched/core: Remove 'task' parameter and rename tsk_restore_flags() to current_restore_flags()
  ...
parents 3711c94f 21173d0b
/*
* The following program is used to generate the constants for
* computing sched averages.
*
* ==============================================================
* C program (compile with -lm)
* ==============================================================
*/
#include <math.h>
#include <stdio.h>
#define HALFLIFE 32
#define SHIFT 32
/* Per-period decay factor: y^HALFLIFE == 0.5; initialized once in main(). */
double y;

/*
 * Emit the runnable_avg_yN_inv[] table: y^i scaled up to a 32-bit
 * fixed-point fraction, for each period i in [0, HALFLIFE).
 */
void calc_runnable_avg_yN_inv(void)
{
	unsigned int val;
	int idx;

	printf("static const u32 runnable_avg_yN_inv[] = {");
	for (idx = 0; idx < HALFLIFE; idx++) {
		/* Scale y^idx by (2^32 - 1) to get the table entry. */
		val = ((1UL<<32)-1)*pow(y, idx);
		/* Six entries per output line. */
		if (idx % 6 == 0)
			printf("\n\t");
		printf("0x%8x, ", val);
	}
	printf("\n};\n\n");
}
/* Running decayed sum; a full period contributes 1024. */
int sum = 1024;

/*
 * Emit the runnable_avg_yN_sum[] table: the decayed sum of full
 * 1024-unit period contributions after i periods, i in [1, HALFLIFE].
 * Note: the call in main() is currently commented out.
 * NOTE(review): sum is an int, so each assignment truncates the
 * double-precision intermediate — the emitted values depend on that
 * truncation order; do not reorder these expressions.
 */
void calc_runnable_avg_yN_sum(void)
{
int i;
printf("static const u32 runnable_avg_yN_sum[] = {\n\t 0,");
for (i = 1; i <= HALFLIFE; i++) {
/* First period only decays the seed; later periods decay and add a new 1024*y. */
if (i == 1)
sum *= y;
else
sum = sum*y + 1024*y;
/* Eleven entries per output line. */
if (i % 11 == 0)
printf("\n\t");
printf("%5d,", sum);
}
printf("\n};\n\n");
}
/* Number of periods until the running max converges (set below). */
int n = -1;
/* first period */
long max = 1024;

/*
 * Iterate max = max*y + 1024 in the same 32-bit fixed-point arithmetic
 * the kernel uses until the value stops changing, then emit the
 * LOAD_AVG_PERIOD and LOAD_AVG_MAX constants.
 */
void calc_converged_max(void)
{
	long y_inv = ((1UL<<32)-1)*y;
	long prev = 0;

	while (1) {
		/* Skip the decay on the very first pass (n starts at -1). */
		if (n > -1)
			max = ((max*y_inv)>>SHIFT) + 1024;
		/*
		 * This is the same as:
		 * max = max*y + 1024;
		 */
		if (prev == max)
			break;
		prev = max;
		n++;
	}
	n--;
	printf("#define LOAD_AVG_PERIOD %d\n", HALFLIFE);
	printf("#define LOAD_AVG_MAX %ld\n", max);
	// printf("#define LOAD_AVG_MAX_N %d\n\n", n);
}
/*
 * Emit the __accumulated_sum_N32[] table: the accumulated sum halved
 * and re-seeded once per additional HALFLIFE-period block, for
 * n/HALFLIFE+1 blocks. Requires calc_converged_max() to have set n.
 * Note: the call in main() is currently commented out.
 */
void calc_accumulated_sum_32(void)
{
	int acc = sum;
	int blocks = n/HALFLIFE+1;
	int k;

	printf("static const u32 __accumulated_sum_N32[] = {\n\t 0,");
	for (k = 1; k <= blocks; k++) {
		/* Each later block halves the previous accumulation and adds sum. */
		if (k > 1)
			acc = acc/2 + sum;
		/* Six entries per output line. */
		if (k % 6 == 0)
			printf("\n\t");
		printf("%6d,", acc);
	}
	printf("\n};\n\n");
}
/*
 * Generate the PELT constants used by the scheduler's load-tracking
 * code and print them to stdout as a C header fragment.
 *
 * Fix: a hosted-environment main() must return int (C11 5.1.2.2.1);
 * the original declared it void, which is non-conforming.
 */
int main(void)
{
	printf("/* Generated by Documentation/scheduler/sched-pelt; do not modify. */\n\n");

	/* y^HALFLIFE == 0.5: the per-period decay factor used by all tables. */
	y = pow(0.5, 1/(double)HALFLIFE);

	calc_runnable_avg_yN_inv();
	// calc_runnable_avg_yN_sum();
	calc_converged_max();
	// calc_accumulated_sum_32();

	return 0;
}
...@@ -179,14 +179,14 @@ struct salinfo_platform_oemdata_parms { ...@@ -179,14 +179,14 @@ struct salinfo_platform_oemdata_parms {
const u8 *efi_guid; const u8 *efi_guid;
u8 **oemdata; u8 **oemdata;
u64 *oemdata_size; u64 *oemdata_size;
int ret;
}; };
static void static long
salinfo_platform_oemdata_cpu(void *context) salinfo_platform_oemdata_cpu(void *context)
{ {
struct salinfo_platform_oemdata_parms *parms = context; struct salinfo_platform_oemdata_parms *parms = context;
parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
} }
static void static void
...@@ -380,16 +380,7 @@ salinfo_log_release(struct inode *inode, struct file *file) ...@@ -380,16 +380,7 @@ salinfo_log_release(struct inode *inode, struct file *file)
return 0; return 0;
} }
static void static long
call_on_cpu(int cpu, void (*fn)(void *), void *arg)
{
cpumask_t save_cpus_allowed = current->cpus_allowed;
set_cpus_allowed_ptr(current, cpumask_of(cpu));
(*fn)(arg);
set_cpus_allowed_ptr(current, &save_cpus_allowed);
}
static void
salinfo_log_read_cpu(void *context) salinfo_log_read_cpu(void *context)
{ {
struct salinfo_data *data = context; struct salinfo_data *data = context;
...@@ -399,6 +390,7 @@ salinfo_log_read_cpu(void *context) ...@@ -399,6 +390,7 @@ salinfo_log_read_cpu(void *context)
/* Clear corrected errors as they are read from SAL */ /* Clear corrected errors as they are read from SAL */
if (rh->severity == sal_log_severity_corrected) if (rh->severity == sal_log_severity_corrected)
ia64_sal_clear_state_info(data->type); ia64_sal_clear_state_info(data->type);
return 0;
} }
static void static void
...@@ -430,7 +422,7 @@ salinfo_log_new_read(int cpu, struct salinfo_data *data) ...@@ -430,7 +422,7 @@ salinfo_log_new_read(int cpu, struct salinfo_data *data)
spin_unlock_irqrestore(&data_saved_lock, flags); spin_unlock_irqrestore(&data_saved_lock, flags);
if (!data->saved_num) if (!data->saved_num)
call_on_cpu(cpu, salinfo_log_read_cpu, data); work_on_cpu_safe(cpu, salinfo_log_read_cpu, data);
if (!data->log_size) { if (!data->log_size) {
data->state = STATE_NO_DATA; data->state = STATE_NO_DATA;
cpumask_clear_cpu(cpu, &data->cpu_event); cpumask_clear_cpu(cpu, &data->cpu_event);
...@@ -459,11 +451,13 @@ salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *p ...@@ -459,11 +451,13 @@ salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *p
return simple_read_from_buffer(buffer, count, ppos, buf, bufsize); return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
} }
static void static long
salinfo_log_clear_cpu(void *context) salinfo_log_clear_cpu(void *context)
{ {
struct salinfo_data *data = context; struct salinfo_data *data = context;
ia64_sal_clear_state_info(data->type); ia64_sal_clear_state_info(data->type);
return 0;
} }
static int static int
...@@ -486,7 +480,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu) ...@@ -486,7 +480,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu)
rh = (sal_log_record_header_t *)(data->log_buffer); rh = (sal_log_record_header_t *)(data->log_buffer);
/* Corrected errors have already been cleared from SAL */ /* Corrected errors have already been cleared from SAL */
if (rh->severity != sal_log_severity_corrected) if (rh->severity != sal_log_severity_corrected)
call_on_cpu(cpu, salinfo_log_clear_cpu, data); work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data);
/* clearing a record may make a new record visible */ /* clearing a record may make a new record visible */
salinfo_log_new_read(cpu, data); salinfo_log_new_read(cpu, data);
if (data->state == STATE_LOG_RECORD) { if (data->state == STATE_LOG_RECORD) {
...@@ -531,9 +525,8 @@ salinfo_log_write(struct file *file, const char __user *buffer, size_t count, lo ...@@ -531,9 +525,8 @@ salinfo_log_write(struct file *file, const char __user *buffer, size_t count, lo
.oemdata = &data->oemdata, .oemdata = &data->oemdata,
.oemdata_size = &data->oemdata_size .oemdata_size = &data->oemdata_size
}; };
call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms); count = work_on_cpu_safe(cpu, salinfo_platform_oemdata_cpu,
if (parms.ret) &parms);
count = parms.ret;
} else } else
data->oemdata_size = 0; data->oemdata_size = 0;
} else } else
......
...@@ -355,18 +355,12 @@ static int cache_add_dev(unsigned int cpu) ...@@ -355,18 +355,12 @@ static int cache_add_dev(unsigned int cpu)
unsigned long i, j; unsigned long i, j;
struct cache_info *this_object; struct cache_info *this_object;
int retval = 0; int retval = 0;
cpumask_t oldmask;
if (all_cpu_cache_info[cpu].kobj.parent) if (all_cpu_cache_info[cpu].kobj.parent)
return 0; return 0;
oldmask = current->cpus_allowed;
retval = set_cpus_allowed_ptr(current, cpumask_of(cpu));
if (unlikely(retval))
return retval;
retval = cpu_cache_sysfs_init(cpu); retval = cpu_cache_sysfs_init(cpu);
set_cpus_allowed_ptr(current, &oldmask);
if (unlikely(retval < 0)) if (unlikely(retval < 0))
return retval; return retval;
......
...@@ -598,12 +598,17 @@ static void sn_hwperf_call_sal(void *info) ...@@ -598,12 +598,17 @@ static void sn_hwperf_call_sal(void *info)
op_info->ret = r; op_info->ret = r;
} }
static long sn_hwperf_call_sal_work(void *info)
{
sn_hwperf_call_sal(info);
return 0;
}
static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
{ {
u32 cpu; u32 cpu;
u32 use_ipi; u32 use_ipi;
int r = 0; int r = 0;
cpumask_t save_allowed;
cpu = (op_info->a->arg & SN_HWPERF_ARG_CPU_MASK) >> 32; cpu = (op_info->a->arg & SN_HWPERF_ARG_CPU_MASK) >> 32;
use_ipi = op_info->a->arg & SN_HWPERF_ARG_USE_IPI_MASK; use_ipi = op_info->a->arg & SN_HWPERF_ARG_USE_IPI_MASK;
...@@ -629,13 +634,9 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) ...@@ -629,13 +634,9 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
/* use an interprocessor interrupt to call SAL */ /* use an interprocessor interrupt to call SAL */
smp_call_function_single(cpu, sn_hwperf_call_sal, smp_call_function_single(cpu, sn_hwperf_call_sal,
op_info, 1); op_info, 1);
} } else {
else { /* Call on the target CPU */
/* migrate the task before calling SAL */ work_on_cpu_safe(cpu, sn_hwperf_call_sal_work, op_info);
save_allowed = current->cpus_allowed;
set_cpus_allowed_ptr(current, cpumask_of(cpu));
sn_hwperf_call_sal(op_info);
set_cpus_allowed_ptr(current, &save_allowed);
} }
} }
r = op_info->ret; r = op_info->ret;
......
...@@ -787,24 +787,21 @@ static struct sched_domain_topology_level powerpc_topology[] = { ...@@ -787,24 +787,21 @@ static struct sched_domain_topology_level powerpc_topology[] = {
{ NULL, }, { NULL, },
}; };
void __init smp_cpus_done(unsigned int max_cpus) static __init long smp_setup_cpu_workfn(void *data __always_unused)
{ {
cpumask_var_t old_mask; smp_ops->setup_cpu(boot_cpuid);
return 0;
}
/* We want the setup_cpu() here to be called from CPU 0, but our void __init smp_cpus_done(unsigned int max_cpus)
* init thread may have been "borrowed" by another CPU in the meantime {
* se we pin us down to CPU 0 for a short while /*
* We want the setup_cpu() here to be called on the boot CPU, but
* init might run on any CPU, so make sure it's invoked on the boot
* CPU.
*/ */
alloc_cpumask_var(&old_mask, GFP_NOWAIT);
cpumask_copy(old_mask, &current->cpus_allowed);
set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid));
if (smp_ops && smp_ops->setup_cpu) if (smp_ops && smp_ops->setup_cpu)
smp_ops->setup_cpu(boot_cpuid); work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
set_cpus_allowed_ptr(current, old_mask);
free_cpumask_var(old_mask);
if (smp_ops && smp_ops->bringup_done) if (smp_ops && smp_ops->bringup_done)
smp_ops->bringup_done(); smp_ops->bringup_done();
...@@ -812,7 +809,6 @@ void __init smp_cpus_done(unsigned int max_cpus) ...@@ -812,7 +809,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
dump_numa_cpu_topology(); dump_numa_cpu_topology();
set_sched_topology(powerpc_topology); set_sched_topology(powerpc_topology);
} }
#ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU
......
...@@ -98,27 +98,7 @@ static struct attribute_group mmu_stat_group = { ...@@ -98,27 +98,7 @@ static struct attribute_group mmu_stat_group = {
.name = "mmu_stats", .name = "mmu_stats",
}; };
/* XXX convert to rusty's on_one_cpu */ static long read_mmustat_enable(void *data __maybe_unused)
static unsigned long run_on_cpu(unsigned long cpu,
unsigned long (*func)(unsigned long),
unsigned long arg)
{
cpumask_t old_affinity;
unsigned long ret;
cpumask_copy(&old_affinity, &current->cpus_allowed);
/* should return -EINVAL to userspace */
if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
return 0;
ret = func(arg);
set_cpus_allowed_ptr(current, &old_affinity);
return ret;
}
static unsigned long read_mmustat_enable(unsigned long junk)
{ {
unsigned long ra = 0; unsigned long ra = 0;
...@@ -127,11 +107,11 @@ static unsigned long read_mmustat_enable(unsigned long junk) ...@@ -127,11 +107,11 @@ static unsigned long read_mmustat_enable(unsigned long junk)
return ra != 0; return ra != 0;
} }
static unsigned long write_mmustat_enable(unsigned long val) static long write_mmustat_enable(void *data)
{ {
unsigned long ra, orig_ra; unsigned long ra, orig_ra, *val = data;
if (val) if (*val)
ra = __pa(&per_cpu(mmu_stats, smp_processor_id())); ra = __pa(&per_cpu(mmu_stats, smp_processor_id()));
else else
ra = 0UL; ra = 0UL;
...@@ -142,7 +122,8 @@ static unsigned long write_mmustat_enable(unsigned long val) ...@@ -142,7 +122,8 @@ static unsigned long write_mmustat_enable(unsigned long val)
static ssize_t show_mmustat_enable(struct device *s, static ssize_t show_mmustat_enable(struct device *s,
struct device_attribute *attr, char *buf) struct device_attribute *attr, char *buf)
{ {
unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0); long val = work_on_cpu(s->id, read_mmustat_enable, NULL);
return sprintf(buf, "%lx\n", val); return sprintf(buf, "%lx\n", val);
} }
...@@ -150,13 +131,15 @@ static ssize_t store_mmustat_enable(struct device *s, ...@@ -150,13 +131,15 @@ static ssize_t store_mmustat_enable(struct device *s,
struct device_attribute *attr, const char *buf, struct device_attribute *attr, const char *buf,
size_t count) size_t count)
{ {
unsigned long val, err; unsigned long val;
int ret = sscanf(buf, "%lu", &val); long err;
int ret;
ret = sscanf(buf, "%lu", &val);
if (ret != 1) if (ret != 1)
return -EINVAL; return -EINVAL;
err = run_on_cpu(s->id, write_mmustat_enable, val); err = work_on_cpu(s->id, write_mmustat_enable, &val);
if (err) if (err)
return -EIO; return -EIO;
......
...@@ -124,7 +124,7 @@ static bool smp_no_nmi_ipi = false; ...@@ -124,7 +124,7 @@ static bool smp_no_nmi_ipi = false;
static void native_smp_send_reschedule(int cpu) static void native_smp_send_reschedule(int cpu)
{ {
if (unlikely(cpu_is_offline(cpu))) { if (unlikely(cpu_is_offline(cpu))) {
WARN_ON(1); WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
return; return;
} }
apic->send_IPI(cpu, RESCHEDULE_VECTOR); apic->send_IPI(cpu, RESCHEDULE_VECTOR);
......
...@@ -251,6 +251,9 @@ static int __acpi_processor_start(struct acpi_device *device) ...@@ -251,6 +251,9 @@ static int __acpi_processor_start(struct acpi_device *device)
if (ACPI_SUCCESS(status)) if (ACPI_SUCCESS(status))
return 0; return 0;
result = -ENODEV;
acpi_pss_perf_exit(pr, device);
err_power_exit: err_power_exit:
acpi_processor_power_exit(pr); acpi_processor_power_exit(pr);
return result; return result;
...@@ -259,11 +262,16 @@ static int __acpi_processor_start(struct acpi_device *device) ...@@ -259,11 +262,16 @@ static int __acpi_processor_start(struct acpi_device *device)
static int acpi_processor_start(struct device *dev) static int acpi_processor_start(struct device *dev)
{ {
struct acpi_device *device = ACPI_COMPANION(dev); struct acpi_device *device = ACPI_COMPANION(dev);
int ret;
if (!device) if (!device)
return -ENODEV; return -ENODEV;
return __acpi_processor_start(device); /* Protect against concurrent CPU hotplug operations */
get_online_cpus();
ret = __acpi_processor_start(device);
put_online_cpus();
return ret;
} }
static int acpi_processor_stop(struct device *dev) static int acpi_processor_stop(struct device *dev)
......
...@@ -62,8 +62,8 @@ struct acpi_processor_throttling_arg { ...@@ -62,8 +62,8 @@ struct acpi_processor_throttling_arg {
#define THROTTLING_POSTCHANGE (2) #define THROTTLING_POSTCHANGE (2)
static int acpi_processor_get_throttling(struct acpi_processor *pr); static int acpi_processor_get_throttling(struct acpi_processor *pr);
int acpi_processor_set_throttling(struct acpi_processor *pr, static int __acpi_processor_set_throttling(struct acpi_processor *pr,
int state, bool force); int state, bool force, bool direct);
static int acpi_processor_update_tsd_coord(void) static int acpi_processor_update_tsd_coord(void)
{ {
...@@ -891,7 +891,8 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr) ...@@ -891,7 +891,8 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr)
ACPI_DEBUG_PRINT((ACPI_DB_INFO, ACPI_DEBUG_PRINT((ACPI_DB_INFO,
"Invalid throttling state, reset\n")); "Invalid throttling state, reset\n"));
state = 0; state = 0;
ret = acpi_processor_set_throttling(pr, state, true); ret = __acpi_processor_set_throttling(pr, state, true,
true);
if (ret) if (ret)
return ret; return ret;
} }
...@@ -901,36 +902,31 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr) ...@@ -901,36 +902,31 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr)
return 0; return 0;
} }
static int acpi_processor_get_throttling(struct acpi_processor *pr) static long __acpi_processor_get_throttling(void *data)
{ {
cpumask_var_t saved_mask; struct acpi_processor *pr = data;
int ret;
return pr->throttling.acpi_processor_get_throttling(pr);
}
static int acpi_processor_get_throttling(struct acpi_processor *pr)
{
if (!pr) if (!pr)
return -EINVAL; return -EINVAL;
if (!pr->flags.throttling) if (!pr->flags.throttling)
return -ENODEV; return -ENODEV;
if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL))
return -ENOMEM;
/* /*
* Migrate task to the cpu pointed by pr. * This is either called from the CPU hotplug callback of
* processor_driver or via the ACPI probe function. In the latter
* case the CPU is not guaranteed to be online. Both call sites are
* protected against CPU hotplug.
*/ */
cpumask_copy(saved_mask, &current->cpus_allowed); if (!cpu_online(pr->id))
/* FIXME: use work_on_cpu() */
if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) {
/* Can't migrate to the target pr->id CPU. Exit */
free_cpumask_var(saved_mask);
return -ENODEV; return -ENODEV;
}
ret = pr->throttling.acpi_processor_get_throttling(pr);
/* restore the previous state */
set_cpus_allowed_ptr(current, saved_mask);
free_cpumask_var(saved_mask);
return ret; return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr);
} }
static int acpi_processor_get_fadt_info(struct acpi_processor *pr) static int acpi_processor_get_fadt_info(struct acpi_processor *pr)
...@@ -1080,8 +1076,15 @@ static long acpi_processor_throttling_fn(void *data) ...@@ -1080,8 +1076,15 @@ static long acpi_processor_throttling_fn(void *data)
arg->target_state, arg->force); arg->target_state, arg->force);
} }
int acpi_processor_set_throttling(struct acpi_processor *pr, static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct)
int state, bool force) {
if (direct)
return fn(arg);
return work_on_cpu(cpu, fn, arg);
}
static int __acpi_processor_set_throttling(struct acpi_processor *pr,
int state, bool force, bool direct)
{ {
int ret = 0; int ret = 0;
unsigned int i; unsigned int i;
...@@ -1130,7 +1133,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, ...@@ -1130,7 +1133,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
arg.pr = pr; arg.pr = pr;
arg.target_state = state; arg.target_state = state;
arg.force = force; arg.force = force;
ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, &arg); ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, &arg,
direct);
} else { } else {
/* /*
* When the T-state coordination is SW_ALL or HW_ALL, * When the T-state coordination is SW_ALL or HW_ALL,
...@@ -1163,8 +1167,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, ...@@ -1163,8 +1167,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
arg.pr = match_pr; arg.pr = match_pr;
arg.target_state = state; arg.target_state = state;
arg.force = force; arg.force = force;
ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, ret = call_on_cpu(pr->id, acpi_processor_throttling_fn,
&arg); &arg, direct);
} }
} }
/* /*
...@@ -1182,6 +1186,12 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, ...@@ -1182,6 +1186,12 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
return ret; return ret;
} }
int acpi_processor_set_throttling(struct acpi_processor *pr, int state,
bool force)
{
return __acpi_processor_set_throttling(pr, state, force, false);
}
int acpi_processor_get_throttling_info(struct acpi_processor *pr) int acpi_processor_get_throttling_info(struct acpi_processor *pr)
{ {
int result = 0; int result = 0;
......
...@@ -381,7 +381,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, ...@@ -381,7 +381,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
*sent += result; *sent += result;
} while (msg_data_left(&msg)); } while (msg_data_left(&msg));
tsk_restore_flags(current, pflags, PF_MEMALLOC); current_restore_flags(pflags, PF_MEMALLOC);
return result; return result;
} }
......
...@@ -34,6 +34,11 @@ struct cpufreq_acpi_io { ...@@ -34,6 +34,11 @@ struct cpufreq_acpi_io {
unsigned int resume; unsigned int resume;
}; };
struct cpufreq_acpi_req {
unsigned int cpu;
unsigned int state;
};
static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS]; static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS];
static struct cpufreq_driver acpi_cpufreq_driver; static struct cpufreq_driver acpi_cpufreq_driver;
...@@ -83,8 +88,7 @@ processor_get_pstate ( ...@@ -83,8 +88,7 @@ processor_get_pstate (
static unsigned static unsigned
extract_clock ( extract_clock (
struct cpufreq_acpi_io *data, struct cpufreq_acpi_io *data,
unsigned value, unsigned value)
unsigned int cpu)
{ {
unsigned long i; unsigned long i;
...@@ -98,60 +102,43 @@ extract_clock ( ...@@ -98,60 +102,43 @@ extract_clock (
} }
static unsigned int static long
processor_get_freq ( processor_get_freq (
struct cpufreq_acpi_io *data, void *arg)
unsigned int cpu)
{ {
int ret = 0; struct cpufreq_acpi_req *req = arg;
u32 value = 0; unsigned int cpu = req->cpu;
cpumask_t saved_mask; struct cpufreq_acpi_io *data = acpi_io_data[cpu];
unsigned long clock_freq; u32 value;
int ret;
pr_debug("processor_get_freq\n"); pr_debug("processor_get_freq\n");
saved_mask = current->cpus_allowed;
set_cpus_allowed_ptr(current, cpumask_of(cpu));
if (smp_processor_id() != cpu) if (smp_processor_id() != cpu)
goto migrate_end; return -EAGAIN;
/* processor_get_pstate gets the instantaneous frequency */ /* processor_get_pstate gets the instantaneous frequency */
ret = processor_get_pstate(&value); ret = processor_get_pstate(&value);
if (ret) { if (ret) {
set_cpus_allowed_ptr(current, &saved_mask);
pr_warn("get performance failed with error %d\n", ret); pr_warn("get performance failed with error %d\n", ret);
ret = 0; return ret;
goto migrate_end;
} }
clock_freq = extract_clock(data, value, cpu); return 1000 * extract_clock(data, value);
ret = (clock_freq*1000);
migrate_end:
set_cpus_allowed_ptr(current, &saved_mask);
return ret;
} }
static int static long
processor_set_freq ( processor_set_freq (
struct cpufreq_acpi_io *data, void *arg)
struct cpufreq_policy *policy,
int state)
{ {
int ret = 0; struct cpufreq_acpi_req *req = arg;
u32 value = 0; unsigned int cpu = req->cpu;
cpumask_t saved_mask; struct cpufreq_acpi_io *data = acpi_io_data[cpu];
int retval; int ret, state = req->state;
u32 value;
pr_debug("processor_set_freq\n"); pr_debug("processor_set_freq\n");
if (smp_processor_id() != cpu)
saved_mask = current->cpus_allowed; return -EAGAIN;
set_cpus_allowed_ptr(current, cpumask_of(policy->cpu));
if (smp_processor_id() != policy->cpu) {
retval = -EAGAIN;
goto migrate_end;
}
if (state == data->acpi_data.state) { if (state == data->acpi_data.state) {
if (unlikely(data->resume)) { if (unlikely(data->resume)) {
...@@ -159,8 +146,7 @@ processor_set_freq ( ...@@ -159,8 +146,7 @@ processor_set_freq (
data->resume = 0; data->resume = 0;
} else { } else {
pr_debug("Already at target state (P%d)\n", state); pr_debug("Already at target state (P%d)\n", state);
retval = 0; return 0;
goto migrate_end;
} }
} }
...@@ -171,7 +157,6 @@ processor_set_freq ( ...@@ -171,7 +157,6 @@ processor_set_freq (
* First we write the target state's 'control' value to the * First we write the target state's 'control' value to the
* control_register. * control_register.
*/ */
value = (u32) data->acpi_data.states[state].control; value = (u32) data->acpi_data.states[state].control;
pr_debug("Transitioning to state: 0x%08x\n", value); pr_debug("Transitioning to state: 0x%08x\n", value);
...@@ -179,17 +164,11 @@ processor_set_freq ( ...@@ -179,17 +164,11 @@ processor_set_freq (
ret = processor_set_pstate(value); ret = processor_set_pstate(value);
if (ret) { if (ret) {
pr_warn("Transition failed with error %d\n", ret); pr_warn("Transition failed with error %d\n", ret);
retval = -ENODEV; return -ENODEV;
goto migrate_end;
} }
data->acpi_data.state = state; data->acpi_data.state = state;
return 0;
retval = 0;
migrate_end:
set_cpus_allowed_ptr(current, &saved_mask);
return (retval);
} }
...@@ -197,11 +176,13 @@ static unsigned int ...@@ -197,11 +176,13 @@ static unsigned int
acpi_cpufreq_get ( acpi_cpufreq_get (
unsigned int cpu) unsigned int cpu)
{ {
struct cpufreq_acpi_io *data = acpi_io_data[cpu]; struct cpufreq_acpi_req req;
long ret;
pr_debug("acpi_cpufreq_get\n"); req.cpu = cpu;
ret = work_on_cpu(cpu, processor_get_freq, &req);
return processor_get_freq(data, cpu); return ret > 0 ? (unsigned int) ret : 0;
} }
...@@ -210,7 +191,12 @@ acpi_cpufreq_target ( ...@@ -210,7 +191,12 @@ acpi_cpufreq_target (
struct cpufreq_policy *policy, struct cpufreq_policy *policy,
unsigned int index) unsigned int index)
{ {
return processor_set_freq(acpi_io_data[policy->cpu], policy, index); struct cpufreq_acpi_req req;
req.cpu = policy->cpu;
req.state = index;
return work_on_cpu(req.cpu, processor_set_freq, &req);
} }
static int static int
......
...@@ -30,54 +30,63 @@ ...@@ -30,54 +30,63 @@
static DEFINE_PER_CPU(struct clk, sh_cpuclk); static DEFINE_PER_CPU(struct clk, sh_cpuclk);
struct cpufreq_target {
struct cpufreq_policy *policy;
unsigned int freq;
};
static unsigned int sh_cpufreq_get(unsigned int cpu) static unsigned int sh_cpufreq_get(unsigned int cpu)
{ {
return (clk_get_rate(&per_cpu(sh_cpuclk, cpu)) + 500) / 1000; return (clk_get_rate(&per_cpu(sh_cpuclk, cpu)) + 500) / 1000;
} }
/* static long __sh_cpufreq_target(void *arg)
* Here we notify other drivers of the proposed change and the final change.
*/
static int sh_cpufreq_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{ {
unsigned int cpu = policy->cpu; struct cpufreq_target *target = arg;
struct cpufreq_policy *policy = target->policy;
int cpu = policy->cpu;
struct clk *cpuclk = &per_cpu(sh_cpuclk, cpu); struct clk *cpuclk = &per_cpu(sh_cpuclk, cpu);
cpumask_t cpus_allowed;
struct cpufreq_freqs freqs; struct cpufreq_freqs freqs;
struct device *dev; struct device *dev;
long freq; long freq;
cpus_allowed = current->cpus_allowed; if (smp_processor_id() != cpu)
set_cpus_allowed_ptr(current, cpumask_of(cpu)); return -ENODEV;
BUG_ON(smp_processor_id() != cpu);
dev = get_cpu_device(cpu); dev = get_cpu_device(cpu);
/* Convert target_freq from kHz to Hz */ /* Convert target_freq from kHz to Hz */
freq = clk_round_rate(cpuclk, target_freq * 1000); freq = clk_round_rate(cpuclk, target->freq * 1000);
if (freq < (policy->min * 1000) || freq > (policy->max * 1000)) if (freq < (policy->min * 1000) || freq > (policy->max * 1000))
return -EINVAL; return -EINVAL;
dev_dbg(dev, "requested frequency %u Hz\n", target_freq * 1000); dev_dbg(dev, "requested frequency %u Hz\n", target->freq * 1000);
freqs.old = sh_cpufreq_get(cpu); freqs.old = sh_cpufreq_get(cpu);
freqs.new = (freq + 500) / 1000; freqs.new = (freq + 500) / 1000;
freqs.flags = 0; freqs.flags = 0;
cpufreq_freq_transition_begin(policy, &freqs); cpufreq_freq_transition_begin(target->policy, &freqs);
set_cpus_allowed_ptr(current, &cpus_allowed);
clk_set_rate(cpuclk, freq); clk_set_rate(cpuclk, freq);
cpufreq_freq_transition_end(policy, &freqs, 0); cpufreq_freq_transition_end(target->policy, &freqs, 0);
dev_dbg(dev, "set frequency %lu Hz\n", freq); dev_dbg(dev, "set frequency %lu Hz\n", freq);
return 0; return 0;
} }
/*
* Here we notify other drivers of the proposed change and the final change.
*/
static int sh_cpufreq_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{
struct cpufreq_target data = { .policy = policy, .freq = target_freq };
return work_on_cpu(policy->cpu, __sh_cpufreq_target, &data);
}
static int sh_cpufreq_verify(struct cpufreq_policy *policy) static int sh_cpufreq_verify(struct cpufreq_policy *policy)
{ {
struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu); struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu);
......
...@@ -118,10 +118,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits, ...@@ -118,10 +118,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits,
unsigned long clock_tick, unsigned long clock_tick,
unsigned long old_divisor, unsigned long divisor) unsigned long old_divisor, unsigned long divisor)
{ {
unsigned long flags;
local_irq_save(flags);
estar &= ~ESTAR_MODE_DIV_MASK; estar &= ~ESTAR_MODE_DIV_MASK;
/* This is based upon the state transition diagram in the IIe manual. */ /* This is based upon the state transition diagram in the IIe manual. */
...@@ -152,8 +148,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits, ...@@ -152,8 +148,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits,
} else { } else {
BUG(); BUG();
} }
local_irq_restore(flags);
} }
static unsigned long index_to_estar_mode(unsigned int index) static unsigned long index_to_estar_mode(unsigned int index)
...@@ -229,48 +223,51 @@ static unsigned long estar_to_divisor(unsigned long estar) ...@@ -229,48 +223,51 @@ static unsigned long estar_to_divisor(unsigned long estar)
return ret; return ret;
} }
static void __us2e_freq_get(void *arg)
{
unsigned long *estar = arg;
*estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
}
static unsigned int us2e_freq_get(unsigned int cpu) static unsigned int us2e_freq_get(unsigned int cpu)
{ {
cpumask_t cpus_allowed;
unsigned long clock_tick, estar; unsigned long clock_tick, estar;
cpumask_copy(&cpus_allowed, &current->cpus_allowed);
set_cpus_allowed_ptr(current, cpumask_of(cpu));
clock_tick = sparc64_get_clock_tick(cpu) / 1000; clock_tick = sparc64_get_clock_tick(cpu) / 1000;
estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR); if (smp_call_function_single(cpu, __us2e_freq_get, &estar, 1))
return 0;
set_cpus_allowed_ptr(current, &cpus_allowed);
return clock_tick / estar_to_divisor(estar); return clock_tick / estar_to_divisor(estar);
} }
static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index) static void __us2e_freq_target(void *arg)
{ {
unsigned int cpu = policy->cpu; unsigned int cpu = smp_processor_id();
unsigned int *index = arg;
unsigned long new_bits, new_freq; unsigned long new_bits, new_freq;
unsigned long clock_tick, divisor, old_divisor, estar; unsigned long clock_tick, divisor, old_divisor, estar;
cpumask_t cpus_allowed;
cpumask_copy(&cpus_allowed, &current->cpus_allowed);
set_cpus_allowed_ptr(current, cpumask_of(cpu));
new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000; new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000;
new_bits = index_to_estar_mode(index); new_bits = index_to_estar_mode(*index);
divisor = index_to_divisor(index); divisor = index_to_divisor(*index);
new_freq /= divisor; new_freq /= divisor;
estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR); estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
old_divisor = estar_to_divisor(estar); old_divisor = estar_to_divisor(estar);
if (old_divisor != divisor) if (old_divisor != divisor) {
us2e_transition(estar, new_bits, clock_tick * 1000, us2e_transition(estar, new_bits, clock_tick * 1000,
old_divisor, divisor); old_divisor, divisor);
}
}
set_cpus_allowed_ptr(current, &cpus_allowed); static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
{
unsigned int cpu = policy->cpu;
return 0; return smp_call_function_single(cpu, __us2e_freq_target, &index, 1);
} }
static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy) static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy)
......
...@@ -35,22 +35,28 @@ static struct us3_freq_percpu_info *us3_freq_table; ...@@ -35,22 +35,28 @@ static struct us3_freq_percpu_info *us3_freq_table;
#define SAFARI_CFG_DIV_32 0x0000000080000000UL #define SAFARI_CFG_DIV_32 0x0000000080000000UL
#define SAFARI_CFG_DIV_MASK 0x00000000C0000000UL #define SAFARI_CFG_DIV_MASK 0x00000000C0000000UL
static unsigned long read_safari_cfg(void) static void read_safari_cfg(void *arg)
{ {
unsigned long ret; unsigned long ret, *val = arg;
__asm__ __volatile__("ldxa [%%g0] %1, %0" __asm__ __volatile__("ldxa [%%g0] %1, %0"
: "=&r" (ret) : "=&r" (ret)
: "i" (ASI_SAFARI_CONFIG)); : "i" (ASI_SAFARI_CONFIG));
return ret; *val = ret;
} }
static void write_safari_cfg(unsigned long val) static void update_safari_cfg(void *arg)
{ {
unsigned long reg, *new_bits = arg;
read_safari_cfg(&reg);
reg &= ~SAFARI_CFG_DIV_MASK;
reg |= *new_bits;
__asm__ __volatile__("stxa %0, [%%g0] %1\n\t" __asm__ __volatile__("stxa %0, [%%g0] %1\n\t"
"membar #Sync" "membar #Sync"
: /* no outputs */ : /* no outputs */
: "r" (val), "i" (ASI_SAFARI_CONFIG) : "r" (reg), "i" (ASI_SAFARI_CONFIG)
: "memory"); : "memory");
} }
...@@ -78,29 +84,17 @@ static unsigned long get_current_freq(unsigned int cpu, unsigned long safari_cfg ...@@ -78,29 +84,17 @@ static unsigned long get_current_freq(unsigned int cpu, unsigned long safari_cfg
static unsigned int us3_freq_get(unsigned int cpu) static unsigned int us3_freq_get(unsigned int cpu)
{ {
cpumask_t cpus_allowed;
unsigned long reg; unsigned long reg;
unsigned int ret;
cpumask_copy(&cpus_allowed, &current->cpus_allowed);
set_cpus_allowed_ptr(current, cpumask_of(cpu));
reg = read_safari_cfg();
ret = get_current_freq(cpu, reg);
set_cpus_allowed_ptr(current, &cpus_allowed);
return ret; if (smp_call_function_single(cpu, read_safari_cfg, &reg, 1))
return 0;
return get_current_freq(cpu, reg);
} }
static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index) static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
{ {
unsigned int cpu = policy->cpu; unsigned int cpu = policy->cpu;
unsigned long new_bits, new_freq, reg; unsigned long new_bits, new_freq;
cpumask_t cpus_allowed;
cpumask_copy(&cpus_allowed, &current->cpus_allowed);
set_cpus_allowed_ptr(current, cpumask_of(cpu));
new_freq = sparc64_get_clock_tick(cpu) / 1000; new_freq = sparc64_get_clock_tick(cpu) / 1000;
switch (index) { switch (index) {
...@@ -121,15 +115,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index) ...@@ -121,15 +115,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
BUG(); BUG();
} }
reg = read_safari_cfg(); return smp_call_function_single(cpu, update_safari_cfg, &new_bits, 1);
reg &= ~SAFARI_CFG_DIV_MASK;
reg |= new_bits;
write_safari_cfg(reg);
set_cpus_allowed_ptr(current, &cpus_allowed);
return 0;
} }
static int __init us3_freq_cpu_init(struct cpufreq_policy *policy) static int __init us3_freq_cpu_init(struct cpufreq_policy *policy)
......
...@@ -65,6 +65,11 @@ struct spu_queue { ...@@ -65,6 +65,11 @@ struct spu_queue {
struct list_head list; struct list_head list;
}; };
struct spu_qreg {
struct spu_queue *queue;
unsigned long type;
};
static struct spu_queue **cpu_to_cwq; static struct spu_queue **cpu_to_cwq;
static struct spu_queue **cpu_to_mau; static struct spu_queue **cpu_to_mau;
...@@ -1631,31 +1636,27 @@ static void queue_cache_destroy(void) ...@@ -1631,31 +1636,27 @@ static void queue_cache_destroy(void)
kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]); kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]);
} }
static int spu_queue_register(struct spu_queue *p, unsigned long q_type) static long spu_queue_register_workfn(void *arg)
{ {
cpumask_var_t old_allowed; struct spu_qreg *qr = arg;
struct spu_queue *p = qr->queue;
unsigned long q_type = qr->type;
unsigned long hv_ret; unsigned long hv_ret;
if (cpumask_empty(&p->sharing))
return -EINVAL;
if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
return -ENOMEM;
cpumask_copy(old_allowed, &current->cpus_allowed);
set_cpus_allowed_ptr(current, &p->sharing);
hv_ret = sun4v_ncs_qconf(q_type, __pa(p->q), hv_ret = sun4v_ncs_qconf(q_type, __pa(p->q),
CWQ_NUM_ENTRIES, &p->qhandle); CWQ_NUM_ENTRIES, &p->qhandle);
if (!hv_ret) if (!hv_ret)
sun4v_ncs_sethead_marker(p->qhandle, 0); sun4v_ncs_sethead_marker(p->qhandle, 0);
set_cpus_allowed_ptr(current, old_allowed); return hv_ret ? -EINVAL : 0;
}
free_cpumask_var(old_allowed); static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
{
int cpu = cpumask_any_and(&p->sharing, cpu_online_mask);
struct spu_qreg qr = { .queue = p, .type = q_type };
return (hv_ret ? -EINVAL : 0); return work_on_cpu_safe(cpu, spu_queue_register_workfn, &qr);
} }
static int spu_queue_setup(struct spu_queue *p) static int spu_queue_setup(struct spu_queue *p)
......
...@@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task) ...@@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
rc = 0; rc = 0;
} }
tsk_restore_flags(current, pflags, PF_MEMALLOC); current_restore_flags(pflags, PF_MEMALLOC);
return rc; return rc;
} }
......
...@@ -1004,7 +1004,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, ...@@ -1004,7 +1004,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
else else
err = nfserrno(host_err); err = nfserrno(host_err);
if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); current_restore_flags(pflags, PF_LESS_THROTTLE);
return err; return err;
} }
......
...@@ -1290,10 +1290,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting) ...@@ -1290,10 +1290,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
TASK_PFA_SET(LMK_WAITING, lmk_waiting) TASK_PFA_SET(LMK_WAITING, lmk_waiting)
static inline void static inline void
tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) current_restore_flags(unsigned long orig_flags, unsigned long flags)
{ {
task->flags &= ~flags; current->flags &= ~flags;
task->flags |= orig_flags & flags; current->flags |= orig_flags & flags;
} }
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
......
...@@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) ...@@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{ {
return fn(arg); return fn(arg);
} }
static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
return fn(arg);
}
#else #else
long work_on_cpu(int cpu, long (*fn)(void *), void *arg); long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER #ifdef CONFIG_FREEZER
......
...@@ -85,21 +85,6 @@ int sysctl_sched_rt_runtime = 950000; ...@@ -85,21 +85,6 @@ int sysctl_sched_rt_runtime = 950000;
/* CPUs with isolated domains */ /* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map; cpumask_var_t cpu_isolated_map;
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
static struct rq *this_rq_lock(void)
__acquires(rq->lock)
{
struct rq *rq;
local_irq_disable();
rq = this_rq();
raw_spin_lock(&rq->lock);
return rq;
}
/* /*
* __task_rq_lock - lock the rq @p resides on. * __task_rq_lock - lock the rq @p resides on.
*/ */
...@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq) ...@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
return; return;
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
if (sched_feat(WARN_DOUBLE_CLOCK))
SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED; rq->clock_update_flags |= RQCF_UPDATED;
#endif #endif
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0) if (delta < 0)
return; return;
...@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq) ...@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
static enum hrtimer_restart hrtick(struct hrtimer *timer) static enum hrtimer_restart hrtick(struct hrtimer *timer)
{ {
struct rq *rq = container_of(timer, struct rq, hrtick_timer); struct rq *rq = container_of(timer, struct rq, hrtick_timer);
struct rq_flags rf;
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
raw_spin_lock(&rq->lock); rq_lock(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1); rq->curr->sched_class->task_tick(rq, rq->curr, 1);
raw_spin_unlock(&rq->lock); rq_unlock(rq, &rf);
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
} }
...@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq) ...@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
static void __hrtick_start(void *arg) static void __hrtick_start(void *arg)
{ {
struct rq *rq = arg; struct rq *rq = arg;
struct rq_flags rf;
raw_spin_lock(&rq->lock); rq_lock(rq, &rf);
__hrtick_restart(rq); __hrtick_restart(rq);
rq->hrtick_csd_pending = 0; rq->hrtick_csd_pending = 0;
raw_spin_unlock(&rq->lock); rq_unlock(rq, &rf);
} }
/* /*
...@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p) ...@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{ {
update_rq_clock(rq); if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & ENQUEUE_RESTORE)) if (!(flags & ENQUEUE_RESTORE))
sched_info_queued(rq, p); sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags); p->sched_class->enqueue_task(rq, p, flags);
} }
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{ {
update_rq_clock(rq); if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE)) if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(rq, p); sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags); p->sched_class->dequeue_task(rq, p, flags);
} }
...@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) ...@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* *
* Returns (locked) new rq. Old rq's lock is released. * Returns (locked) new rq. Old rq's lock is released.
*/ */
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{ {
lockdep_assert_held(&rq->lock); lockdep_assert_held(&rq->lock);
p->on_rq = TASK_ON_RQ_MIGRATING; p->on_rq = TASK_ON_RQ_MIGRATING;
dequeue_task(rq, p, 0); dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu); set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock); rq_unlock(rq, rf);
rq = cpu_rq(new_cpu); rq = cpu_rq(new_cpu);
raw_spin_lock(&rq->lock); rq_lock(rq, rf);
BUG_ON(task_cpu(p) != new_cpu); BUG_ON(task_cpu(p) != new_cpu);
enqueue_task(rq, p, 0); enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED; p->on_rq = TASK_ON_RQ_QUEUED;
...@@ -980,7 +977,8 @@ struct migration_arg { ...@@ -980,7 +977,8 @@ struct migration_arg {
* So we race with normal scheduler movements, but that's OK, as long * So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU. * as the task is no longer on this CPU.
*/ */
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{ {
if (unlikely(!cpu_active(dest_cpu))) if (unlikely(!cpu_active(dest_cpu)))
return rq; return rq;
...@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ ...@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return rq; return rq;
rq = move_queued_task(rq, p, dest_cpu); update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq; return rq;
} }
...@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data) ...@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data; struct migration_arg *arg = data;
struct task_struct *p = arg->task; struct task_struct *p = arg->task;
struct rq *rq = this_rq(); struct rq *rq = this_rq();
struct rq_flags rf;
/* /*
* The original target CPU might have gone down and we might * The original target CPU might have gone down and we might
...@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data) ...@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
sched_ttwu_pending(); sched_ttwu_pending();
raw_spin_lock(&p->pi_lock); raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock); rq_lock(rq, &rf);
/* /*
* If task_rq(p) != rq, it cannot be migrated here, because we're * If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
...@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data) ...@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
*/ */
if (task_rq(p) == rq) { if (task_rq(p) == rq) {
if (task_on_rq_queued(p)) if (task_on_rq_queued(p))
rq = __migrate_task(rq, p, arg->dest_cpu); rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
else else
p->wake_cpu = arg->dest_cpu; p->wake_cpu = arg->dest_cpu;
} }
raw_spin_unlock(&rq->lock); rq_unlock(rq, &rf);
raw_spin_unlock(&p->pi_lock); raw_spin_unlock(&p->pi_lock);
local_irq_enable(); local_irq_enable();
...@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ...@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* holding rq->lock. * holding rq->lock.
*/ */
lockdep_assert_held(&rq->lock); lockdep_assert_held(&rq->lock);
dequeue_task(rq, p, DEQUEUE_SAVE); dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
} }
if (running) if (running)
put_prev_task(rq, p); put_prev_task(rq, p);
...@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ...@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask); p->sched_class->set_cpus_allowed(p, new_mask);
if (queued) if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE); enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running) if (running)
set_curr_task(rq, p); set_curr_task(rq, p);
} }
...@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, ...@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately * OK, since we're going to drop the lock immediately
* afterwards anyway. * afterwards anyway.
*/ */
rq_unpin_lock(rq, &rf); rq = move_queued_task(rq, &rf, p, dest_cpu);
rq = move_queued_task(rq, p, dest_cpu);
rq_repin_lock(rq, &rf);
} }
out: out:
task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf);
...@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) ...@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
{ {
if (task_on_rq_queued(p)) { if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq; struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
src_rq = task_rq(p); src_rq = task_rq(p);
dst_rq = cpu_rq(cpu); dst_rq = cpu_rq(cpu);
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
p->on_rq = TASK_ON_RQ_MIGRATING; p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0); deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu); set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0); activate_task(dst_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED; p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0); check_preempt_curr(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
} else { } else {
/* /*
* Task isn't running anymore; make it appear like we migrated * Task isn't running anymore; make it appear like we migrated
...@@ -1680,7 +1686,7 @@ static void ...@@ -1680,7 +1686,7 @@ static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf) struct rq_flags *rf)
{ {
int en_flags = ENQUEUE_WAKEUP; int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
lockdep_assert_held(&rq->lock); lockdep_assert_held(&rq->lock);
...@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void) ...@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
struct rq *rq = this_rq(); struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list); struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p; struct task_struct *p;
unsigned long flags;
struct rq_flags rf; struct rq_flags rf;
if (!llist) if (!llist)
return; return;
raw_spin_lock_irqsave(&rq->lock, flags); rq_lock_irqsave(rq, &rf);
rq_pin_lock(rq, &rf); update_rq_clock(rq);
while (llist) { while (llist) {
int wake_flags = 0; int wake_flags = 0;
...@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void) ...@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, wake_flags, &rf); ttwu_do_activate(rq, p, wake_flags, &rf);
} }
rq_unpin_lock(rq, &rf); rq_unlock_irqrestore(rq, &rf);
raw_spin_unlock_irqrestore(&rq->lock, flags);
} }
void scheduler_ipi(void) void scheduler_ipi(void)
...@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ...@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
void wake_up_if_idle(int cpu) void wake_up_if_idle(int cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
unsigned long flags; struct rq_flags rf;
rcu_read_lock(); rcu_read_lock();
...@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu) ...@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
if (set_nr_if_polling(rq->idle)) { if (set_nr_if_polling(rq->idle)) {
trace_sched_wake_idle_without_ipi(cpu); trace_sched_wake_idle_without_ipi(cpu);
} else { } else {
raw_spin_lock_irqsave(&rq->lock, flags); rq_lock_irqsave(rq, &rf);
if (is_idle_task(rq->curr)) if (is_idle_task(rq->curr))
smp_send_reschedule(cpu); smp_send_reschedule(cpu);
/* Else CPU is not idle, do nothing here: */ /* Else CPU is not idle, do nothing here: */
raw_spin_unlock_irqrestore(&rq->lock, flags); rq_unlock_irqrestore(rq, &rf);
} }
out: out:
...@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ...@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
} }
#endif #endif
raw_spin_lock(&rq->lock); rq_lock(rq, &rf);
rq_pin_lock(rq, &rf); update_rq_clock(rq);
ttwu_do_activate(rq, p, wake_flags, &rf); ttwu_do_activate(rq, p, wake_flags, &rf);
rq_unpin_lock(rq, &rf); rq_unlock(rq, &rf);
raw_spin_unlock(&rq->lock);
} }
/* /*
...@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) ...@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
* disabled avoiding further scheduler activity on it and we've * disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task. * not yet picked a replacement task.
*/ */
rq_unpin_lock(rq, rf); rq_unlock(rq, rf);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock); raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock); rq_relock(rq, rf);
rq_repin_lock(rq, rf);
} }
if (!(p->state & TASK_NORMAL)) if (!(p->state & TASK_NORMAL))
...@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) ...@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
delayacct_blkio_end(); delayacct_blkio_end();
atomic_dec(&rq->nr_iowait); atomic_dec(&rq->nr_iowait);
} }
ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
} }
ttwu_do_wakeup(rq, p, 0, rf); ttwu_do_wakeup(rq, p, 0, rf);
...@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p) ...@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq); update_rq_clock(rq);
post_init_entity_util_avg(&p->se); post_init_entity_util_avg(&p->se);
activate_task(rq, p, 0); activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED; p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p); trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK); check_preempt_curr(rq, p, WF_FORK);
...@@ -3093,15 +3094,18 @@ void scheduler_tick(void) ...@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
int cpu = smp_processor_id(); int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr; struct task_struct *curr = rq->curr;
struct rq_flags rf;
sched_clock_tick(); sched_clock_tick();
raw_spin_lock(&rq->lock); rq_lock(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0); curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq); cpu_load_update_active(rq);
calc_global_load_tick(rq); calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
rq_unlock(rq, &rf);
perf_event_task_tick(); perf_event_task_tick();
...@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt) ...@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
* done by the caller to avoid the race with signal_wake_up(). * done by the caller to avoid the race with signal_wake_up().
*/ */
smp_mb__before_spinlock(); smp_mb__before_spinlock();
raw_spin_lock(&rq->lock); rq_lock(rq, &rf);
rq_pin_lock(rq, &rf);
/* Promote REQ to ACT */ /* Promote REQ to ACT */
rq->clock_update_flags <<= 1; rq->clock_update_flags <<= 1;
update_rq_clock(rq);
switch_count = &prev->nivcsw; switch_count = &prev->nivcsw;
if (!preempt && prev->state) { if (!preempt && prev->state) {
if (unlikely(signal_pending_state(prev->state, prev))) { if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING; prev->state = TASK_RUNNING;
} else { } else {
deactivate_task(rq, prev, DEQUEUE_SLEEP); deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
prev->on_rq = 0; prev->on_rq = 0;
if (prev->in_iowait) { if (prev->in_iowait) {
...@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt) ...@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
switch_count = &prev->nvcsw; switch_count = &prev->nvcsw;
} }
if (task_on_rq_queued(prev))
update_rq_clock(rq);
next = pick_next_task(rq, prev, &rf); next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev); clear_tsk_need_resched(prev);
clear_preempt_need_resched(); clear_preempt_need_resched();
...@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt) ...@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
rq = context_switch(rq, prev, next, &rf); rq = context_switch(rq, prev, next, &rf);
} else { } else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unpin_lock(rq, &rf); rq_unlock_irq(rq, &rf);
raw_spin_unlock_irq(&rq->lock);
} }
balance_callback(rq); balance_callback(rq);
...@@ -3684,7 +3684,8 @@ EXPORT_SYMBOL(default_wake_function); ...@@ -3684,7 +3684,8 @@ EXPORT_SYMBOL(default_wake_function);
*/ */
void rt_mutex_setprio(struct task_struct *p, int prio) void rt_mutex_setprio(struct task_struct *p, int prio)
{ {
int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; int oldprio, queued, running, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class; const struct sched_class *prev_class;
struct rq_flags rf; struct rq_flags rf;
struct rq *rq; struct rq *rq;
...@@ -3805,7 +3806,7 @@ void set_user_nice(struct task_struct *p, long nice) ...@@ -3805,7 +3806,7 @@ void set_user_nice(struct task_struct *p, long nice)
queued = task_on_rq_queued(p); queued = task_on_rq_queued(p);
running = task_current(rq, p); running = task_current(rq, p);
if (queued) if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE); dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running) if (running)
put_prev_task(rq, p); put_prev_task(rq, p);
...@@ -3816,7 +3817,7 @@ void set_user_nice(struct task_struct *p, long nice) ...@@ -3816,7 +3817,7 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio; delta = p->prio - old_prio;
if (queued) { if (queued) {
enqueue_task(rq, p, ENQUEUE_RESTORE); enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
/* /*
* If the task increased its priority or is running and * If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU: * lowered its priority, then reschedule its CPU:
...@@ -4126,7 +4127,7 @@ static int __sched_setscheduler(struct task_struct *p, ...@@ -4126,7 +4127,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class; const struct sched_class *prev_class;
struct rq_flags rf; struct rq_flags rf;
int reset_on_fork; int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq; struct rq *rq;
/* May grab non-irq protected spin_locks: */ /* May grab non-irq protected spin_locks: */
...@@ -4923,7 +4924,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ...@@ -4923,7 +4924,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*/ */
SYSCALL_DEFINE0(sched_yield) SYSCALL_DEFINE0(sched_yield)
{ {
struct rq *rq = this_rq_lock(); struct rq_flags rf;
struct rq *rq;
local_irq_disable();
rq = this_rq();
rq_lock(rq, &rf);
schedstat_inc(rq->yld_count); schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq); current->sched_class->yield_task(rq);
...@@ -4932,9 +4938,8 @@ SYSCALL_DEFINE0(sched_yield) ...@@ -4932,9 +4938,8 @@ SYSCALL_DEFINE0(sched_yield)
* Since we are going to call schedule() anyway, there's * Since we are going to call schedule() anyway, there's
* no need to preempt or enable interrupts: * no need to preempt or enable interrupts:
*/ */
__release(rq->lock); preempt_disable();
spin_release(&rq->lock.dep_map, 1, _THIS_IP_); rq_unlock(rq, &rf);
do_raw_spin_unlock(&rq->lock);
sched_preempt_enable_no_resched(); sched_preempt_enable_no_resched();
schedule(); schedule();
...@@ -5514,7 +5519,7 @@ void sched_setnuma(struct task_struct *p, int nid) ...@@ -5514,7 +5519,7 @@ void sched_setnuma(struct task_struct *p, int nid)
p->numa_preferred_nid = nid; p->numa_preferred_nid = nid;
if (queued) if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE); enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running) if (running)
set_curr_task(rq, p); set_curr_task(rq, p);
task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf);
...@@ -5579,11 +5584,11 @@ static struct task_struct fake_task = { ...@@ -5579,11 +5584,11 @@ static struct task_struct fake_task = {
* there's no concurrency possible, we hold the required locks anyway * there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts. * because of lock validation efforts.
*/ */
static void migrate_tasks(struct rq *dead_rq) static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
{ {
struct rq *rq = dead_rq; struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop; struct task_struct *next, *stop = rq->stop;
struct rq_flags rf; struct rq_flags orf = *rf;
int dest_cpu; int dest_cpu;
/* /*
...@@ -5602,9 +5607,7 @@ static void migrate_tasks(struct rq *dead_rq) ...@@ -5602,9 +5607,7 @@ static void migrate_tasks(struct rq *dead_rq)
* class method both need to have an up-to-date * class method both need to have an up-to-date
* value of rq->clock[_task] * value of rq->clock[_task]
*/ */
rq_pin_lock(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
rq_unpin_lock(rq, &rf);
for (;;) { for (;;) {
/* /*
...@@ -5617,8 +5620,7 @@ static void migrate_tasks(struct rq *dead_rq) ...@@ -5617,8 +5620,7 @@ static void migrate_tasks(struct rq *dead_rq)
/* /*
* pick_next_task() assumes pinned rq->lock: * pick_next_task() assumes pinned rq->lock:
*/ */
rq_repin_lock(rq, &rf); next = pick_next_task(rq, &fake_task, rf);
next = pick_next_task(rq, &fake_task, &rf);
BUG_ON(!next); BUG_ON(!next);
next->sched_class->put_prev_task(rq, next); next->sched_class->put_prev_task(rq, next);
...@@ -5631,10 +5633,9 @@ static void migrate_tasks(struct rq *dead_rq) ...@@ -5631,10 +5633,9 @@ static void migrate_tasks(struct rq *dead_rq)
* because !cpu_active at this point, which means load-balance * because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine. * will not interfere. Also, stop-machine.
*/ */
rq_unpin_lock(rq, &rf); rq_unlock(rq, rf);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&next->pi_lock); raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock); rq_relock(rq, rf);
/* /*
* Since we're inside stop-machine, _nothing_ should have * Since we're inside stop-machine, _nothing_ should have
...@@ -5648,12 +5649,12 @@ static void migrate_tasks(struct rq *dead_rq) ...@@ -5648,12 +5649,12 @@ static void migrate_tasks(struct rq *dead_rq)
/* Find suitable destination for @next, with force if needed. */ /* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next); dest_cpu = select_fallback_rq(dead_rq->cpu, next);
rq = __migrate_task(rq, rf, next, dest_cpu);
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) { if (rq != dead_rq) {
raw_spin_unlock(&rq->lock); rq_unlock(rq, rf);
rq = dead_rq; rq = dead_rq;
raw_spin_lock(&rq->lock); *rf = orf;
rq_relock(rq, rf);
} }
raw_spin_unlock(&next->pi_lock); raw_spin_unlock(&next->pi_lock);
} }
...@@ -5766,7 +5767,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) ...@@ -5766,7 +5767,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
int sched_cpu_activate(unsigned int cpu) int sched_cpu_activate(unsigned int cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
unsigned long flags; struct rq_flags rf;
set_cpu_active(cpu, true); set_cpu_active(cpu, true);
...@@ -5784,12 +5785,12 @@ int sched_cpu_activate(unsigned int cpu) ...@@ -5784,12 +5785,12 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains. * domains.
*/ */
raw_spin_lock_irqsave(&rq->lock, flags); rq_lock_irqsave(rq, &rf);
if (rq->rd) { if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq); set_rq_online(rq);
} }
raw_spin_unlock_irqrestore(&rq->lock, flags); rq_unlock_irqrestore(rq, &rf);
update_max_interval(); update_max_interval();
...@@ -5847,18 +5848,20 @@ int sched_cpu_starting(unsigned int cpu) ...@@ -5847,18 +5848,20 @@ int sched_cpu_starting(unsigned int cpu)
int sched_cpu_dying(unsigned int cpu) int sched_cpu_dying(unsigned int cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
unsigned long flags; struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */ /* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending(); sched_ttwu_pending();
raw_spin_lock_irqsave(&rq->lock, flags);
rq_lock_irqsave(rq, &rf);
if (rq->rd) { if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq); set_rq_offline(rq);
} }
migrate_tasks(rq); migrate_tasks(rq, &rf);
BUG_ON(rq->nr_running != 1); BUG_ON(rq->nr_running != 1);
raw_spin_unlock_irqrestore(&rq->lock, flags); rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq); calc_load_migrate(rq);
update_max_interval(); update_max_interval();
nohz_balance_exit_idle(cpu); nohz_balance_exit_idle(cpu);
...@@ -6412,7 +6415,8 @@ static void sched_change_group(struct task_struct *tsk, int type) ...@@ -6412,7 +6415,8 @@ static void sched_change_group(struct task_struct *tsk, int type)
*/ */
void sched_move_task(struct task_struct *tsk) void sched_move_task(struct task_struct *tsk)
{ {
int queued, running; int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq_flags rf; struct rq_flags rf;
struct rq *rq; struct rq *rq;
...@@ -6423,14 +6427,14 @@ void sched_move_task(struct task_struct *tsk) ...@@ -6423,14 +6427,14 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk); queued = task_on_rq_queued(tsk);
if (queued) if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); dequeue_task(rq, tsk, queue_flags);
if (running) if (running)
put_prev_task(rq, tsk); put_prev_task(rq, tsk);
sched_change_group(tsk, TASK_MOVE_GROUP); sched_change_group(tsk, TASK_MOVE_GROUP);
if (queued) if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); enqueue_task(rq, tsk, queue_flags);
if (running) if (running)
set_curr_task(rq, tsk); set_curr_task(rq, tsk);
...@@ -7008,14 +7012,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) ...@@ -7008,14 +7012,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
for_each_online_cpu(i) { for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq; struct rq *rq = cfs_rq->rq;
struct rq_flags rf;
raw_spin_lock_irq(&rq->lock); rq_lock_irq(rq, &rf);
cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0; cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled) if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq); unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock); rq_unlock_irq(rq, &rf);
} }
if (runtime_was_enabled && !runtime_enabled) if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec(); cfs_bandwidth_usage_dec();
......
...@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
} }
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#include "sched-pelt.h"
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p); static unsigned long task_h_load(struct task_struct *p);
/*
* We choose a half-life close to 1 scheduling period.
* Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
* dependent on this value.
*/
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
/* Give new sched_entity start runnable values to heavy its load in infant time */ /* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se) void init_entity_runnable_average(struct sched_entity *se)
{ {
...@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se) ...@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se)
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
0x85aac367, 0x82cd8698,
};
/*
* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
* over-estimates when re-combining.
*/
static const u32 runnable_avg_yN_sum[] = {
0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};
/*
* Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
* lower integers. See Documentation/scheduler/sched-avg.txt how these
* were generated:
*/
static const u32 __accumulated_sum_N32[] = {
0, 23371, 35056, 40899, 43820, 45281,
46011, 46376, 46559, 46650, 46696, 46719,
};
/* /*
* Approximate: * Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period) * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/ */
static __always_inline u64 decay_load(u64 val, u64 n) static u64 decay_load(u64 val, u64 n)
{ {
unsigned int local_n; unsigned int local_n;
if (!n) if (unlikely(n > LOAD_AVG_PERIOD * 63))
return val;
else if (unlikely(n > LOAD_AVG_PERIOD * 63))
return 0; return 0;
/* after bounds checking we can collapse to 32-bit */ /* after bounds checking we can collapse to 32-bit */
...@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n) ...@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n)
return val; return val;
} }
static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
u32 c1, c2, c3 = d3; /* y^0 == 1 */
/*
* c1 = d1 y^p
*/
c1 = decay_load((u64)d1, periods);
/*
* p-1
* c2 = 1024 \Sum y^n
* n=1
*
* inf inf
* = 1024 ( \Sum y^n - \Sum y^n - y^0 )
* n=0 n=p
*/
c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
return c1 + c2 + c3;
}
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/* /*
* For updates fully spanning n periods, the contribution to runnable * Accumulate the three separate parts of the sum; d1 the remainder
* average will be: \Sum 1024*y^n * of the last (incomplete) period, d2 the span of full periods and d3
* the remainder of the (incomplete) current period.
*
* d1 d2 d3
* ^ ^ ^
* | | |
* |<->|<----------------->|<--->|
* ... |---x---|------| ... |------|-----x (now)
*
* p-1
* u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
* n=1
* *
* We can compute this reasonably efficiently by combining: * = u y^p + (Step 1)
* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} *
* p-1
* d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
* n=1
*/ */
static u32 __compute_runnable_contrib(u64 n) static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{ {
u32 contrib = 0; unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
if (likely(n <= LOAD_AVG_PERIOD)) scale_freq = arch_scale_freq_capacity(NULL, cpu);
return runnable_avg_yN_sum[n]; scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
else if (unlikely(n >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ delta += sa->period_contrib;
contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; periods = delta / 1024; /* A period is 1024us (~1ms) */
n %= LOAD_AVG_PERIOD;
contrib = decay_load(contrib, n);
return contrib + runnable_avg_yN_sum[n];
}
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /*
* Step 1: decay old *_sum if we crossed period boundaries.
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
if (cfs_rq) {
cfs_rq->runnable_load_sum =
decay_load(cfs_rq->runnable_load_sum, periods);
}
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
* Step 2
*/
delta %= 1024;
contrib = __accumulate_pelt_segments(periods,
1024 - sa->period_contrib, delta);
}
sa->period_contrib = delta;
contrib = cap_scale(contrib, scale_freq);
if (weight) {
sa->load_sum += weight * contrib;
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * contrib;
}
if (running)
sa->util_sum += contrib * scale_cpu;
return periods;
}
/* /*
* We can represent the historical contribution to runnable average as the * We can represent the historical contribution to runnable average as the
...@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n) ...@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n)
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/ */
static __always_inline int static __always_inline int
__update_load_avg(u64 now, int cpu, struct sched_avg *sa, ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq) unsigned long weight, int running, struct cfs_rq *cfs_rq)
{ {
u64 delta, scaled_delta, periods; u64 delta;
u32 contrib;
unsigned int delta_w, scaled_delta_w, decayed = 0;
unsigned long scale_freq, scale_cpu;
delta = now - sa->last_update_time; delta = now - sa->last_update_time;
/* /*
...@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, ...@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
delta >>= 10; delta >>= 10;
if (!delta) if (!delta)
return 0; return 0;
sa->last_update_time = now;
scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->period_contrib;
if (delta + delta_w >= 1024) {
decayed = 1;
/* how much left for next period will start over, we don't know yet */ sa->last_update_time += delta << 10;
sa->period_contrib = 0;
/* /*
* Now that we know we're crossing a period boundary, figure * Now we know we crossed measurement unit boundaries. The *_avg
* out how much from delta we need to complete the current * accrues by two steps:
* period and accrue it. *
*/ * Step 1: accumulate *_sum since last_update_time. If we haven't
delta_w = 1024 - delta_w; * crossed period boundaries, finish.
scaled_delta_w = cap_scale(delta_w, scale_freq); */
if (weight) { if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
sa->load_sum += weight * scaled_delta_w; return 0;
if (cfs_rq) {
cfs_rq->runnable_load_sum +=
weight * scaled_delta_w;
}
}
if (running)
sa->util_sum += scaled_delta_w * scale_cpu;
delta -= delta_w;
/* Figure out how many additional periods this update spans */
periods = delta / 1024;
delta %= 1024;
sa->load_sum = decay_load(sa->load_sum, periods + 1); /*
if (cfs_rq) { * Step 2: update *_avg.
cfs_rq->runnable_load_sum = */
decay_load(cfs_rq->runnable_load_sum, periods + 1); sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
} if (cfs_rq) {
sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); cfs_rq->runnable_load_avg =
div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
contrib = __compute_runnable_contrib(periods);
contrib = cap_scale(contrib, scale_freq);
if (weight) {
sa->load_sum += weight * contrib;
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * contrib;
}
if (running)
sa->util_sum += contrib * scale_cpu;
} }
sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
/* Remainder of delta accrued against u_0` */ return 1;
scaled_delta = cap_scale(delta, scale_freq); }
if (weight) {
sa->load_sum += weight * scaled_delta;
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * scaled_delta;
}
if (running)
sa->util_sum += scaled_delta * scale_cpu;
sa->period_contrib += delta; static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
}
if (decayed) { static int
sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
if (cfs_rq) { {
cfs_rq->runnable_load_avg = return ___update_load_avg(now, cpu, &se->avg,
div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); se->on_rq * scale_load_down(se->load.weight),
} cfs_rq->curr == se, NULL);
sa->util_avg = sa->util_sum / LOAD_AVG_MAX; }
}
return decayed; static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
return ___update_load_avg(now, cpu, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
cfs_rq->curr != NULL, cfs_rq);
} }
/* /*
...@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) ...@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
void set_task_rq_fair(struct sched_entity *se, void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next) struct cfs_rq *prev, struct cfs_rq *next)
{ {
u64 p_last_update_time;
u64 n_last_update_time;
if (!sched_feat(ATTACH_AGE_LOAD)) if (!sched_feat(ATTACH_AGE_LOAD))
return; return;
...@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se, ...@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se,
* time. This will result in the wakee task is less decayed, but giving * time. This will result in the wakee task is less decayed, but giving
* the wakee more load sounds not bad. * the wakee more load sounds not bad.
*/ */
if (se->avg.last_update_time && prev) { if (!(se->avg.last_update_time && prev))
u64 p_last_update_time; return;
u64 n_last_update_time;
#ifndef CONFIG_64BIT #ifndef CONFIG_64BIT
{
u64 p_last_update_time_copy; u64 p_last_update_time_copy;
u64 n_last_update_time_copy; u64 n_last_update_time_copy;
...@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se, ...@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se,
} while (p_last_update_time != p_last_update_time_copy || } while (p_last_update_time != p_last_update_time_copy ||
n_last_update_time != n_last_update_time_copy); n_last_update_time != n_last_update_time_copy);
}
#else #else
p_last_update_time = prev->avg.last_update_time; p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time; n_last_update_time = next->avg.last_update_time;
#endif #endif
__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
&se->avg, 0, 0, NULL); se->avg.last_update_time = n_last_update_time;
se->avg.last_update_time = n_last_update_time;
}
} }
/* Take into account change of utilization of a child task group */ /* Take into account change of utilization of a child task group */
...@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) ...@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 1; return 1;
} }
/*
* Check if we need to update the load and the utilization of a blocked
* group_entity:
*/
static inline bool skip_blocked_update(struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
/*
* If sched_entity still have not zero load or utilization, we have to
* decay it:
*/
if (se->avg.load_avg || se->avg.util_avg)
return false;
/*
* If there is a pending propagation, we have to update the load and
* the utilization of the sched_entity:
*/
if (gcfs_rq->propagate_avg)
return false;
/*
* Otherwise, the load and the utilization of the sched_entity is
* already zero and there is no pending propagation, so it will be a
* waste of time to try to decay it:
*/
return true;
}
#else /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
...@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) ...@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
set_tg_cfs_propagate(cfs_rq); set_tg_cfs_propagate(cfs_rq);
} }
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
#ifndef CONFIG_64BIT #ifndef CONFIG_64BIT
smp_wmb(); smp_wmb();
...@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) ...@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
* Track task load average for carrying it to new CPU after migrated, and * Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration * track group sched_entity load average for task_h_load calc in migration
*/ */
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg(now, cpu, &se->avg, __update_load_avg_se(now, cpu, cfs_rq, se);
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
}
decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
decayed |= propagate_entity_load_avg(se); decayed |= propagate_entity_load_avg(se);
...@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se) ...@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se)
u64 last_update_time; u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq); last_update_time = cfs_rq_last_update_time(cfs_rq);
__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
} }
/* /*
...@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, ...@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) { throttled_list) {
struct rq *rq = rq_of(cfs_rq); struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
raw_spin_lock(&rq->lock); rq_lock(rq, &rf);
if (!cfs_rq_throttled(cfs_rq)) if (!cfs_rq_throttled(cfs_rq))
goto next; goto next;
...@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, ...@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq); unthrottle_cfs_rq(cfs_rq);
next: next:
raw_spin_unlock(&rq->lock); rq_unlock(rq, &rf);
if (!remaining) if (!remaining)
break; break;
...@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void) ...@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void)
unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long curr_jiffies = READ_ONCE(jiffies);
struct rq *this_rq = this_rq(); struct rq *this_rq = this_rq();
unsigned long load; unsigned long load;
struct rq_flags rf;
if (curr_jiffies == this_rq->last_load_update_tick) if (curr_jiffies == this_rq->last_load_update_tick)
return; return;
load = weighted_cpuload(cpu_of(this_rq)); load = weighted_cpuload(cpu_of(this_rq));
raw_spin_lock(&this_rq->lock); rq_lock(this_rq, &rf);
update_rq_clock(this_rq); update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load); cpu_load_update_nohz(this_rq, curr_jiffies, load);
raw_spin_unlock(&this_rq->lock); rq_unlock(this_rq, &rf);
} }
#else /* !CONFIG_NO_HZ_COMMON */ #else /* !CONFIG_NO_HZ_COMMON */
static inline void cpu_load_update_nohz(struct rq *this_rq, static inline void cpu_load_update_nohz(struct rq *this_rq,
...@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) ...@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
lockdep_assert_held(&env->src_rq->lock); lockdep_assert_held(&env->src_rq->lock);
p->on_rq = TASK_ON_RQ_MIGRATING; p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, 0); deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu); set_task_cpu(p, env->dst_cpu);
} }
...@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) ...@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock); lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq); BUG_ON(task_rq(p) != rq);
activate_task(rq, p, 0); activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED; p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0); check_preempt_curr(rq, p, 0);
} }
...@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p) ...@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
*/ */
static void attach_one_task(struct rq *rq, struct task_struct *p) static void attach_one_task(struct rq *rq, struct task_struct *p)
{ {
raw_spin_lock(&rq->lock); struct rq_flags rf;
rq_lock(rq, &rf);
update_rq_clock(rq);
attach_task(rq, p); attach_task(rq, p);
raw_spin_unlock(&rq->lock); rq_unlock(rq, &rf);
} }
/* /*
...@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env) ...@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env)
{ {
struct list_head *tasks = &env->tasks; struct list_head *tasks = &env->tasks;
struct task_struct *p; struct task_struct *p;
struct rq_flags rf;
raw_spin_lock(&env->dst_rq->lock); rq_lock(env->dst_rq, &rf);
update_rq_clock(env->dst_rq);
while (!list_empty(tasks)) { while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node); p = list_first_entry(tasks, struct task_struct, se.group_node);
...@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env) ...@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p); attach_task(env->dst_rq, p);
} }
raw_spin_unlock(&env->dst_rq->lock); rq_unlock(env->dst_rq, &rf);
} }
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
...@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu) ...@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
unsigned long flags; struct rq_flags rf;
raw_spin_lock_irqsave(&rq->lock, flags); rq_lock_irqsave(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
/* /*
...@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu) ...@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu)
* list_add_leaf_cfs_rq() for details. * list_add_leaf_cfs_rq() for details.
*/ */
for_each_leaf_cfs_rq(rq, cfs_rq) { for_each_leaf_cfs_rq(rq, cfs_rq) {
struct sched_entity *se;
/* throttled entities do not contribute to load */ /* throttled entities do not contribute to load */
if (throttled_hierarchy(cfs_rq)) if (throttled_hierarchy(cfs_rq))
continue; continue;
...@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu) ...@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0); update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent */ /* Propagate pending load changes to the parent, if any: */
if (cfs_rq->tg->se[cpu]) se = cfs_rq->tg->se[cpu];
update_load_avg(cfs_rq->tg->se[cpu], 0); if (se && !skip_blocked_update(se))
update_load_avg(se, 0);
} }
raw_spin_unlock_irqrestore(&rq->lock, flags); rq_unlock_irqrestore(rq, &rf);
} }
/* /*
...@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu) ...@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs; struct cfs_rq *cfs_rq = &rq->cfs;
unsigned long flags; struct rq_flags rf;
raw_spin_lock_irqsave(&rq->lock, flags); rq_lock_irqsave(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags); rq_unlock_irqrestore(rq, &rf);
} }
static unsigned long task_h_load(struct task_struct *p) static unsigned long task_h_load(struct task_struct *p)
...@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
{ {
struct sched_domain *child = env->sd->child; struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups; struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs; struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0; int load_idx, prefer_sibling = 0;
bool overload = false; bool overload = false;
...@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
if (local_group) { if (local_group) {
sds->local = sg; sds->local = sg;
sgs = &sds->local_stat; sgs = local;
if (env->idle != CPU_NEWLY_IDLE || if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgc->next_update)) time_after_eq(jiffies, sg->sgc->next_update))
...@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* the tasks on the system). * the tasks on the system).
*/ */
if (prefer_sibling && sds->local && if (prefer_sibling && sds->local &&
group_has_capacity(env, &sds->local_stat) && group_has_capacity(env, local) &&
(sgs->sum_nr_running > 1)) { (sgs->sum_nr_running > local->sum_nr_running + 1)) {
sgs->group_no_capacity = 1; sgs->group_no_capacity = 1;
sgs->group_type = group_classify(sg, sgs); sgs->group_type = group_classify(sg, sgs);
} }
...@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd_parent = sd->parent; struct sched_domain *sd_parent = sd->parent;
struct sched_group *group; struct sched_group *group;
struct rq *busiest; struct rq *busiest;
unsigned long flags; struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = { struct lb_env env = {
...@@ -8105,7 +8139,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -8105,7 +8139,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance: more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags); rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest); update_rq_clock(busiest);
/* /*
...@@ -8122,14 +8156,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -8122,14 +8156,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* See task_rq_lock() family for the details. * See task_rq_lock() family for the details.
*/ */
raw_spin_unlock(&busiest->lock); rq_unlock(busiest, &rf);
if (cur_ld_moved) { if (cur_ld_moved) {
attach_tasks(&env); attach_tasks(&env);
ld_moved += cur_ld_moved; ld_moved += cur_ld_moved;
} }
local_irq_restore(flags); local_irq_restore(rf.flags);
if (env.flags & LBF_NEED_BREAK) { if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK; env.flags &= ~LBF_NEED_BREAK;
...@@ -8207,6 +8241,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -8207,6 +8241,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
sd->nr_balance_failed++; sd->nr_balance_failed++;
if (need_active_balance(&env)) { if (need_active_balance(&env)) {
unsigned long flags;
raw_spin_lock_irqsave(&busiest->lock, flags); raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop, /* don't kick the active_load_balance_cpu_stop,
...@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data) ...@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data)
struct rq *target_rq = cpu_rq(target_cpu); struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd; struct sched_domain *sd;
struct task_struct *p = NULL; struct task_struct *p = NULL;
struct rq_flags rf;
raw_spin_lock_irq(&busiest_rq->lock); rq_lock_irq(busiest_rq, &rf);
/* make sure the requested cpu hasn't gone down in the meantime */ /* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() || if (unlikely(busiest_cpu != smp_processor_id() ||
...@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data) ...@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data)
rcu_read_unlock(); rcu_read_unlock();
out_unlock: out_unlock:
busiest_rq->active_balance = 0; busiest_rq->active_balance = 0;
raw_spin_unlock(&busiest_rq->lock); rq_unlock(busiest_rq, &rf);
if (p) if (p)
attach_one_task(target_rq, p); attach_one_task(target_rq, p);
...@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) ...@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* do the balance. * do the balance.
*/ */
if (time_after_eq(jiffies, rq->next_balance)) { if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock); struct rq_flags rf;
rq_lock_irq(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
cpu_load_update_idle(rq); cpu_load_update_idle(rq);
raw_spin_unlock_irq(&rq->lock); rq_unlock_irq(rq, &rf);
rebalance_domains(rq, CPU_IDLE); rebalance_domains(rq, CPU_IDLE);
} }
...@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p) ...@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p)
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr; struct sched_entity *se = &p->se, *curr;
struct rq *rq = this_rq(); struct rq *rq = this_rq();
struct rq_flags rf;
raw_spin_lock(&rq->lock); rq_lock(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
cfs_rq = task_cfs_rq(current); cfs_rq = task_cfs_rq(current);
...@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p) ...@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p)
} }
se->vruntime -= cfs_rq->min_vruntime; se->vruntime -= cfs_rq->min_vruntime;
raw_spin_unlock(&rq->lock); rq_unlock(rq, &rf);
} }
/* /*
...@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex); ...@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares) int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{ {
int i; int i;
unsigned long flags;
/* /*
* We can't change the weight of the root cgroup. * We can't change the weight of the root cgroup.
...@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) ...@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
tg->shares = shares; tg->shares = shares;
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i); struct rq *rq = cpu_rq(i);
struct sched_entity *se; struct sched_entity *se = tg->se[i];
struct rq_flags rf;
se = tg->se[i];
/* Propagate contribution to hierarchy */ /* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags); rq_lock_irqsave(rq, &rf);
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq); update_rq_clock(rq);
for_each_sched_entity(se) { for_each_sched_entity(se) {
update_load_avg(se, UPDATE_TG); update_load_avg(se, UPDATE_TG);
update_cfs_shares(se); update_cfs_shares(se);
} }
raw_spin_unlock_irqrestore(&rq->lock, flags); rq_unlock_irqrestore(rq, &rf);
} }
done: done:
......
...@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true) ...@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/ */
SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_AVG_CPU, false)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
* in a single rq->lock section. Default disabled because the
* annotations are not complete.
*/
SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
#ifdef HAVE_RT_PUSH_IPI #ifdef HAVE_RT_PUSH_IPI
/* /*
* In order to avoid a thundering herd attack of CPUs that are * In order to avoid a thundering herd attack of CPUs that are
......
...@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq) ...@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
#define RT_PUSH_IPI_EXECUTING 1 #define RT_PUSH_IPI_EXECUTING 1
#define RT_PUSH_IPI_RESTART 2 #define RT_PUSH_IPI_RESTART 2
/*
* When a high priority task schedules out from a CPU and a lower priority
* task is scheduled in, a check is made to see if there's any RT tasks
* on other CPUs that are waiting to run because a higher priority RT task
* is currently running on its CPU. In this case, the CPU with multiple RT
* tasks queued on it (overloaded) needs to be notified that a CPU has opened
* up that may be able to run one of its non-running queued RT tasks.
*
* On large CPU boxes, there's the case that several CPUs could schedule
* a lower priority task at the same time, in which case it will look for
* any overloaded CPUs that it could pull a task from. To do this, the runqueue
* lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
* for a single overloaded CPU's runqueue lock can produce a large latency.
* (This has actually been observed on large boxes running cyclictest).
* Instead of taking the runqueue lock of the overloaded CPU, each of the
* CPUs that scheduled a lower priority task simply sends an IPI to the
* overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
* lots of contention. The overloaded CPU will look to push its non-running
* RT task off, and if it does, it can then ignore the other IPIs coming
* in, and just pass those IPIs off to any other overloaded CPU.
*
* When a CPU schedules a lower priority task, it only sends an IPI to
* the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
* as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
* RT overloaded tasks, would cause 100 IPIs to go out at once.
*
* The overloaded RT CPU, when receiving an IPI, will try to push off its
* overloaded RT tasks and then send an IPI to the next CPU that has
* overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
* have completed. Just because a CPU may have pushed off its own overloaded
* RT task does not mean it should stop sending the IPI around to other
* overloaded CPUs. There may be another RT task waiting to run on one of
* those CPUs that are of higher priority than the one that was just
* pushed.
*
* An optimization that could possibly be made is to make a CPU array similar
* to the cpupri array mask of all running RT tasks, but for the overloaded
* case, then the IPI could be sent to only the CPU with the highest priority
* RT task waiting, and that CPU could send off further IPIs to the CPU with
* the next highest waiting task. Since the overloaded case is much less likely
* to happen, the complexity of this implementation may not be worth it.
* Instead, just send an IPI around to all overloaded CPUs.
*
* The rq->rt.push_flags holds the status of the IPI that is going around.
* A run queue can only send out a single IPI at a time. The possible flags
* for rq->rt.push_flags are:
*
* (None or zero): No IPI is going around for the current rq
* RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
* RT_PUSH_IPI_RESTART: The priority of the running task for the rq
* has changed, and the IPI should restart
* circulating the overloaded CPUs again.
*
* rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
* before sending to the next CPU.
*
* Instead of having all CPUs that schedule a lower priority task send
* an IPI to the same "first" CPU in the RT overload mask, they send it
* to the next overloaded CPU after their own CPU. This helps distribute
* the work when there's more than one overloaded CPU and multiple CPUs
* scheduling in lower priority tasks.
*
* When a rq schedules a lower priority task than what was currently
* running, the next CPU with overloaded RT tasks is examined first.
* That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
* priority task, it will send an IPI first to CPU 5, then CPU 5 will
* send to CPU 1 if it is still overloaded. CPU 1 will clear the
* rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
*
* The first CPU to notice IPI_RESTART is set, will clear that flag and then
* send an IPI to the next overloaded CPU after the rq->cpu and not the next
* CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
* schedules a lower priority task, and the IPI_RESTART gets set while the
* handling is being done on CPU 5, it will clear the flag and send it back to
* CPU 4 instead of CPU 1.
*
* Note, the above logic can be disabled by turning off the sched_feature
* RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
* taken by the CPU requesting a pull and the waiting RT task will be pulled
* by that CPU. This may be fine for machines with few CPUs.
*/
static void tell_cpu_to_push(struct rq *rq) static void tell_cpu_to_push(struct rq *rq)
{ {
int cpu; int cpu;
......
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
0x85aac367, 0x82cd8698,
};
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742
...@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40]; ...@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */
#define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02 #define ENQUEUE_RESTORE 0x02
#define ENQUEUE_MOVE 0x04 #define ENQUEUE_MOVE 0x04
#define ENQUEUE_NOCLOCK 0x08
#define ENQUEUE_HEAD 0x08 #define ENQUEUE_HEAD 0x10
#define ENQUEUE_REPLENISH 0x10 #define ENQUEUE_REPLENISH 0x20
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define ENQUEUE_MIGRATED 0x20 #define ENQUEUE_MIGRATED 0x40
#else #else
#define ENQUEUE_MIGRATED 0x00 #define ENQUEUE_MIGRATED 0x00
#endif #endif
...@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { } ...@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { }
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock); __acquires(rq->lock);
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock) __acquires(p->pi_lock)
__acquires(rq->lock); __acquires(rq->lock);
...@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ...@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
} }
static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_lock_irqsave(&rq->lock, rf->flags);
rq_pin_lock(rq, rf);
}
static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_lock_irq(&rq->lock);
rq_pin_lock(rq, rf);
}
static inline void
rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_lock(&rq->lock);
rq_pin_lock(rq, rf);
}
static inline void
rq_relock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_lock(&rq->lock);
rq_repin_lock(rq, rf);
}
static inline void
rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
}
static inline void
rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
raw_spin_unlock_irq(&rq->lock);
}
static inline void
rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
raw_spin_unlock(&rq->lock);
}
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT #ifdef CONFIG_PREEMPT
......
...@@ -309,7 +309,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) ...@@ -309,7 +309,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
account_irq_exit_time(current); account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET); __local_bh_enable(SOFTIRQ_OFFSET);
WARN_ON_ONCE(in_interrupt()); WARN_ON_ONCE(in_interrupt());
tsk_restore_flags(current, old_flags, PF_MEMALLOC); current_restore_flags(old_flags, PF_MEMALLOC);
} }
asmlinkage __visible void do_softirq(void) asmlinkage __visible void do_softirq(void)
......
...@@ -4734,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) ...@@ -4734,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
return wfc.ret; return wfc.ret;
} }
EXPORT_SYMBOL_GPL(work_on_cpu); EXPORT_SYMBOL_GPL(work_on_cpu);
/**
* work_on_cpu_safe - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function argument
*
* Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
* any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
long ret = -ENODEV;
get_online_cpus();
if (cpu_online(cpu))
ret = work_on_cpu(cpu, fn, arg);
put_online_cpus();
return ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_safe);
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER #ifdef CONFIG_FREEZER
......
...@@ -4243,7 +4243,7 @@ static int __netif_receive_skb(struct sk_buff *skb) ...@@ -4243,7 +4243,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
*/ */
current->flags |= PF_MEMALLOC; current->flags |= PF_MEMALLOC;
ret = __netif_receive_skb_core(skb, true); ret = __netif_receive_skb_core(skb, true);
tsk_restore_flags(current, pflags, PF_MEMALLOC); current_restore_flags(pflags, PF_MEMALLOC);
} else } else
ret = __netif_receive_skb_core(skb, false); ret = __netif_receive_skb_core(skb, false);
......
...@@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
current->flags |= PF_MEMALLOC; current->flags |= PF_MEMALLOC;
ret = sk->sk_backlog_rcv(sk, skb); ret = sk->sk_backlog_rcv(sk, skb);
tsk_restore_flags(current, pflags, PF_MEMALLOC); current_restore_flags(pflags, PF_MEMALLOC);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment