Commit 586b222d authored by Linus Torvalds

Merge tag 'sched-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Allow unprivileged PSI poll()ing

 - Fix performance regression introduced by mm_cid

 - Reduce livepatch stalls by adding livepatch task switching to
   cond_resched(). This resolves livepatching busy-loop stalls with
   certain CPU-bound kthreads

 - Improve sched_move_task() performance on autogroup configs

 - On core-scheduling CPUs, avoid selecting throttled tasks to run

 - Misc cleanups, fixes and improvements

* tag 'sched-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/clock: Fix local_clock() before sched_clock_init()
  sched/rt: Fix bad task migration for rt tasks
  sched: Fix performance regression introduced by mm_cid
  sched/core: Make sched_dynamic_mutex static
  sched/psi: Allow unprivileged polling of N*2s period
  sched/psi: Extract update_triggers side effect
  sched/psi: Rename existing poll members in preparation
  sched/psi: Rearrange polling code in preparation
  sched/fair: Fix inaccurate tally of ttwu_move_affine
  vhost: Fix livepatch timeouts in vhost_worker()
  livepatch,sched: Add livepatch task switching to cond_resched()
  livepatch: Skip task_call_func() for current task
  livepatch: Convert stack entries array to percpu
  sched: Interleave cfs bandwidth timers for improved single thread performance at low utilization
  sched/core: Reduce cost of sched_move_task when config autogroup
  sched/core: Avoid selecting the task that is throttled to run when core-sched enable
  sched/topology: Make sched_energy_mutex,update static
parents 7c339778 f31dcb15
......@@ -105,6 +105,10 @@ prevent overly frequent polling. Max limit is chosen as a high enough number
after which monitors are most likely not needed and psi averages can be used
instead.
Unprivileged users can also create monitors, with the only limitation that the
window size must be a multiple of 2s, in order to prevent excessive resource
usage.
When activated, psi monitor stays active for at least the duration of one
tracking window to avoid repeated activations/deactivations when system is
bouncing in and out of the stall state.
......
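
The new unprivileged mode uses the same trigger interface that Documentation/accounting/psi.rst already describes for privileged monitors; the only difference is the multiple-of-2s window. A minimal userspace sketch along those lines (illustrative, not part of this patch), registering a memory-pressure trigger for 150ms of "some" stall within a 2s window and waiting for events:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* 150ms of "some" memory stall within a 2s window (2s satisfies the
	 * multiple-of-2s rule for unprivileged users). */
	const char trig[] = "some 150000 2000000";
	struct pollfd fds;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0) {
		perror("open /proc/pressure/memory");
		return 1;
	}
	if (write(fd, trig, strlen(trig) + 1) < 0) {
		perror("write trigger");
		return 1;
	}
	fds.fd = fd;
	fds.events = POLLPRI;	/* PSI trigger events are delivered as POLLPRI */

	while (poll(&fds, 1, -1) > 0) {
		if (fds.revents & POLLERR) {
			fprintf(stderr, "trigger destroyed\n");
			break;
		}
		if (fds.revents & POLLPRI)
			printf("memory pressure event\n");
	}
	close(fd);
	return 0;
}
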
......@@ -361,8 +361,7 @@ static int vhost_worker(void *data)
kcov_remote_start_common(worker->kcov_handle);
work->fn(work);
kcov_remote_stop();
if (need_resched())
schedule();
cond_resched();
}
}
......
......@@ -13,6 +13,7 @@
#include <linux/ftrace.h>
#include <linux/completion.h>
#include <linux/list.h>
#include <linux/livepatch_sched.h>
#if IS_ENABLED(CONFIG_LIVEPATCH)
......
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_LIVEPATCH_SCHED_H_
#define _LINUX_LIVEPATCH_SCHED_H_
#include <linux/jump_label.h>
#include <linux/static_call_types.h>
#ifdef CONFIG_LIVEPATCH
void __klp_sched_try_switch(void);
#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
static __always_inline void klp_sched_try_switch(void)
{
if (static_branch_unlikely(&klp_sched_try_switch_key))
__klp_sched_try_switch();
}
#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
#else /* !CONFIG_LIVEPATCH */
static inline void klp_sched_try_switch(void) {}
static inline void __klp_sched_try_switch(void) {}
#endif /* CONFIG_LIVEPATCH */
#endif /* _LINUX_LIVEPATCH_SCHED_H_ */
......@@ -573,6 +573,13 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
#ifdef CONFIG_SCHED_MM_CID
struct mm_cid {
u64 time;
int cid;
};
#endif
struct kioctx_table;
struct mm_struct {
struct {
......@@ -623,15 +630,19 @@ struct mm_struct {
atomic_t mm_count;
#ifdef CONFIG_SCHED_MM_CID
/**
* @cid_lock: Protect cid bitmap updates vs lookups.
* @pcpu_cid: Per-cpu current cid.
*
* Prevent situations where updates to the cid bitmap happen
* concurrently with lookups. Those can lead to situations
* where a lookup cannot find a free bit simply because it was
* unlucky enough to load, non-atomically, bitmap words as they
* were being concurrently updated by the updaters.
* Keep track of the currently allocated mm_cid for each cpu.
* The per-cpu mm_cid values are serialized by their respective
* runqueue locks.
*/
raw_spinlock_t cid_lock;
struct mm_cid __percpu *pcpu_cid;
/*
* @mm_cid_next_scan: Next mm_cid scan (in jiffies).
*
* When the next mm_cid scan is due (in jiffies).
*/
unsigned long mm_cid_next_scan;
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
......@@ -899,6 +910,37 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
}
#ifdef CONFIG_SCHED_MM_CID
enum mm_cid_state {
MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
MM_CID_LAZY_PUT = (1U << 31),
};
static inline bool mm_cid_is_unset(int cid)
{
return cid == MM_CID_UNSET;
}
static inline bool mm_cid_is_lazy_put(int cid)
{
return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
}
static inline bool mm_cid_is_valid(int cid)
{
return !(cid & MM_CID_LAZY_PUT);
}
static inline int mm_cid_set_lazy_put(int cid)
{
return cid | MM_CID_LAZY_PUT;
}
static inline int mm_cid_clear_lazy_put(int cid)
{
return cid & ~MM_CID_LAZY_PUT;
}
/* Accessor for struct mm_struct's cidmask. */
static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
{
......@@ -912,16 +954,40 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
static inline void mm_init_cid(struct mm_struct *mm)
{
raw_spin_lock_init(&mm->cid_lock);
int i;
for_each_possible_cpu(i) {
struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
pcpu_cid->cid = MM_CID_UNSET;
pcpu_cid->time = 0;
}
cpumask_clear(mm_cidmask(mm));
}
static inline int mm_alloc_cid(struct mm_struct *mm)
{
mm->pcpu_cid = alloc_percpu(struct mm_cid);
if (!mm->pcpu_cid)
return -ENOMEM;
mm_init_cid(mm);
return 0;
}
static inline void mm_destroy_cid(struct mm_struct *mm)
{
free_percpu(mm->pcpu_cid);
mm->pcpu_cid = NULL;
}
static inline unsigned int mm_cid_size(void)
{
return cpumask_size();
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm) { }
static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
static inline void mm_destroy_cid(struct mm_struct *mm) { }
static inline unsigned int mm_cid_size(void)
{
return 0;
......
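
The helpers above encode three states in one int: a valid cid has bit 31 clear, a lazy-put cid has bit 31 set, and MM_CID_UNSET (-1U) has all bits set, so the unset state inherently carries the lazy_put flag (as its comment notes). A tiny standalone sketch of that encoding (the macro names mirror the patch; the assertions are purely illustrative):

#include <assert.h>
#include <stdio.h>

#define MM_CID_UNSET	(-1U)
#define MM_CID_LAZY_PUT	(1U << 31)

int main(void)
{
	int cid = 3;				/* a valid, allocated cid */
	int lazy = cid | MM_CID_LAZY_PUT;	/* the same cid flagged for lazy release */

	assert(!(cid & MM_CID_LAZY_PUT));		/* valid: bit 31 clear */
	assert(lazy & MM_CID_LAZY_PUT);			/* lazy-put: bit 31 set */
	assert((int)MM_CID_UNSET & MM_CID_LAZY_PUT);	/* unset also carries the lazy_put bit */
	assert((int)(lazy & ~MM_CID_LAZY_PUT) == cid);	/* clearing the flag recovers the cid */
	printf("cid=%d lazy=%#x\n", cid, (unsigned int)lazy);
	return 0;
}
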
......@@ -24,7 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
struct psi_trigger *psi_trigger_create(struct psi_group *group,
char *buf, enum psi_res res);
char *buf, enum psi_res res, struct file *file);
void psi_trigger_destroy(struct psi_trigger *t);
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
......
......@@ -151,6 +151,9 @@ struct psi_trigger {
/* Deferred event(s) from previous ratelimit window */
bool pending_event;
/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
enum psi_aggregators aggregator;
};
struct psi_group {
......@@ -171,30 +174,34 @@ struct psi_group {
/* Aggregator work control */
struct delayed_work avgs_work;
/* Unprivileged triggers against N*PSI_FREQ windows */
struct list_head avg_triggers;
u32 avg_nr_triggers[NR_PSI_STATES - 1];
/* Total stall times and sampled pressure averages */
u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
unsigned long avg[NR_PSI_STATES - 1][3];
/* Monitor work control */
struct task_struct __rcu *poll_task;
struct timer_list poll_timer;
wait_queue_head_t poll_wait;
atomic_t poll_wakeup;
atomic_t poll_scheduled;
/* Monitor RT polling work control */
struct task_struct __rcu *rtpoll_task;
struct timer_list rtpoll_timer;
wait_queue_head_t rtpoll_wait;
atomic_t rtpoll_wakeup;
atomic_t rtpoll_scheduled;
/* Protects data used by the monitor */
struct mutex trigger_lock;
/* Configured polling triggers */
struct list_head triggers;
u32 nr_triggers[NR_PSI_STATES - 1];
u32 poll_states;
u64 poll_min_period;
/* Total stall times at the start of monitor activation */
u64 polling_total[NR_PSI_STATES - 1];
u64 polling_next_update;
u64 polling_until;
struct mutex rtpoll_trigger_lock;
/* Configured RT polling triggers */
struct list_head rtpoll_triggers;
u32 rtpoll_nr_triggers[NR_PSI_STATES - 1];
u32 rtpoll_states;
u64 rtpoll_min_period;
/* Total stall times at the start of RT polling monitor activation */
u64 rtpoll_total[NR_PSI_STATES - 1];
u64 rtpoll_next_update;
u64 rtpoll_until;
};
#else /* CONFIG_PSI */
......
......@@ -36,6 +36,7 @@
#include <linux/seqlock.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
#include <linux/livepatch_sched.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
......@@ -1313,7 +1314,10 @@ struct task_struct {
#ifdef CONFIG_SCHED_MM_CID
int mm_cid; /* Current cid in mm */
int last_mm_cid; /* Most recent cid in mm */
int migrate_from_cpu;
int mm_cid_active; /* Whether cid bitmap is active */
struct callback_head cid_work;
#endif
struct tlbflush_unmap_batch tlb_ubc;
......@@ -2067,6 +2071,9 @@ extern int __cond_resched(void);
#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
void sched_dynamic_klp_enable(void);
void sched_dynamic_klp_disable(void);
DECLARE_STATIC_CALL(cond_resched, __cond_resched);
static __always_inline int _cond_resched(void)
......@@ -2075,6 +2082,7 @@ static __always_inline int _cond_resched(void)
}
#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
extern int dynamic_cond_resched(void);
static __always_inline int _cond_resched(void)
......@@ -2082,20 +2090,25 @@ static __always_inline int _cond_resched(void)
return dynamic_cond_resched();
}
#else
#else /* !CONFIG_PREEMPTION */
static inline int _cond_resched(void)
{
klp_sched_try_switch();
return __cond_resched();
}
#endif /* CONFIG_PREEMPT_DYNAMIC */
#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
#else
#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
static inline int _cond_resched(void) { return 0; }
static inline int _cond_resched(void)
{
klp_sched_try_switch();
return 0;
}
#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
#define cond_resched() ({ \
__might_resched(__FILE__, __LINE__, 0); \
......
......@@ -37,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm)
atomic_inc(&mm->mm_count);
}
static inline void smp_mb__after_mmgrab(void)
{
smp_mb__after_atomic();
}
extern void __mmdrop(struct mm_struct *mm);
static inline void mmdrop(struct mm_struct *mm)
......
......@@ -3771,7 +3771,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res);
new = psi_trigger_create(psi, buf, res, of->file);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
......
......@@ -924,6 +924,7 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
percpu_counter_destroy(&mm->rss_stat[i]);
......@@ -1188,7 +1189,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
tsk->migrate_from_cpu = -1;
#endif
return tsk;
......@@ -1296,18 +1299,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
if (mm_alloc_cid(mm))
goto fail_cid;
for (i = 0; i < NR_MM_COUNTERS; i++)
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
mm_init_cid(mm);
return mm;
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
......
......@@ -33,6 +33,7 @@
*
* - klp_ftrace_handler()
* - klp_update_patch_state()
* - __klp_sched_try_switch()
*/
DEFINE_MUTEX(klp_mutex);
......
......@@ -9,11 +9,14 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
#include <linux/static_call.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
#define MAX_STACK_ENTRIES 100
DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
#define STACK_ERR_BUF_SIZE 128
#define SIGNALS_TIMEOUT 15
......@@ -24,6 +27,25 @@ static int klp_target_state = KLP_UNDEFINED;
static unsigned int klp_signals_cnt;
/*
* When a livepatch is in progress, enable klp stack checking in
* cond_resched(). This helps CPU-bound kthreads get patched.
*/
#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define klp_cond_resched_enable() sched_dynamic_klp_enable()
#define klp_cond_resched_disable() sched_dynamic_klp_disable()
#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
EXPORT_SYMBOL(klp_sched_try_switch_key);
#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
......@@ -172,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
* barrier (smp_rmb) for two cases:
*
* 1) Enforce the order of the TIF_PATCH_PENDING read and the
* klp_target_state read. The corresponding write barrier is in
* klp_init_transition().
* klp_target_state read. The corresponding write barriers are in
* klp_init_transition() and klp_reverse_transition().
*
* 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
* of func->transition, if klp_ftrace_handler() is called later on
......@@ -240,12 +262,15 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
*/
static int klp_check_stack(struct task_struct *task, const char **oldname)
{
static unsigned long entries[MAX_STACK_ENTRIES];
unsigned long *entries = this_cpu_ptr(klp_stack_entries);
struct klp_object *obj;
struct klp_func *func;
int ret, nr_entries;
ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
/* Protect 'klp_stack_entries' */
lockdep_assert_preemption_disabled();
ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
if (ret < 0)
return -EINVAL;
nr_entries = ret;
......@@ -307,7 +332,11 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
ret = task_call_func(task, klp_check_and_switch_task, &old_name);
if (task == current)
ret = klp_check_and_switch_task(current, &old_name);
else
ret = task_call_func(task, klp_check_and_switch_task, &old_name);
switch (ret) {
case 0: /* success */
break;
......@@ -334,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task)
return !ret;
}
void __klp_sched_try_switch(void)
{
if (likely(!klp_patch_pending(current)))
return;
/*
* This function is called from cond_resched() which is called in many
* places throughout the kernel. Using the klp_mutex here might
* deadlock.
*
* Instead, disable preemption to prevent racing with other callers of
* klp_try_switch_task(). Thanks to task_call_func() they won't be
* able to switch this task while it's running.
*/
preempt_disable();
/*
* Make sure current didn't get patched between the above check and
* preempt_disable().
*/
if (unlikely(!klp_patch_pending(current)))
goto out;
/*
* Enforce the order of the TIF_PATCH_PENDING read above and the
* klp_target_state read in klp_try_switch_task(). The corresponding
* write barriers are in klp_init_transition() and
* klp_reverse_transition().
*/
smp_rmb();
klp_try_switch_task(current);
out:
preempt_enable();
}
EXPORT_SYMBOL(__klp_sched_try_switch);
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
* Kthreads with TIF_PATCH_PENDING set are woken up.
......@@ -440,7 +507,8 @@ void klp_try_complete_transition(void)
return;
}
/* we're done, now cleanup the data structures */
/* Done! Now cleanup the data structures. */
klp_cond_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
......@@ -492,6 +560,8 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
klp_cond_resched_enable();
klp_signals_cnt = 0;
}
......@@ -547,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
* see a func in transition with a task->patch_state of KLP_UNDEFINED.
*
* Also enforce the order of the klp_target_state write and future
* TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
* set a task->patch_state to KLP_UNDEFINED.
* TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
* __klp_sched_try_switch() don't set a task->patch_state to
* KLP_UNDEFINED.
*/
smp_wmb();
......@@ -584,14 +655,10 @@ void klp_reverse_transition(void)
klp_target_state == KLP_PATCHED ? "patching to unpatching" :
"unpatching to patching");
klp_transition_patch->enabled = !klp_transition_patch->enabled;
klp_target_state = !klp_target_state;
/*
* Clear all TIF_PATCH_PENDING flags to prevent races caused by
* klp_update_patch_state() running in parallel with
* klp_start_transition().
* klp_update_patch_state() or __klp_sched_try_switch() running in
* parallel with the reverse transition.
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task)
......@@ -601,9 +668,28 @@ void klp_reverse_transition(void)
for_each_possible_cpu(cpu)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
/* Let any remaining calls to klp_update_patch_state() complete */
/*
* Make sure all existing invocations of klp_update_patch_state() and
* __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
* starting the reverse transition.
*/
klp_synchronize_transition();
/*
* All patching has stopped, now re-initialize the global variables to
* prepare for the reverse transition.
*/
klp_transition_patch->enabled = !klp_transition_patch->enabled;
klp_target_state = !klp_target_state;
/*
* Enforce the order of the klp_target_state write and the
* TIF_PATCH_PENDING writes in klp_start_transition() to ensure
* klp_update_patch_state() and __klp_sched_try_switch() don't set
* task->patch_state to the wrong value.
*/
smp_wmb();
klp_start_transition();
}
......@@ -617,9 +703,9 @@ void klp_copy_process(struct task_struct *child)
* the task flag up to date with the parent here.
*
* The operation is serialized against all klp_*_transition()
* operations by the tasklist_lock. The only exception is
* klp_update_patch_state(current), but we cannot race with
* that because we are current.
* operations by the tasklist_lock. The only exceptions are
* klp_update_patch_state(current) and __klp_sched_try_switch(), but we
* cannot race with them because we are current.
*/
if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
set_tsk_thread_flag(child, TIF_PATCH_PENDING);
......
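
The ordering comments above all describe the same publish/consume pairing: the transition code writes klp_target_state (and the per-task patch_state) before setting TIF_PATCH_PENDING, separated by smp_wmb(), while klp_update_patch_state() and __klp_sched_try_switch() check TIF_PATCH_PENDING and only then read klp_target_state, separated by smp_rmb(). A generic standalone sketch of that pairing in C11 atomics (illustrative only; the kernel uses its own barrier primitives and per-task flags, not these names):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int target_state;		/* stands in for klp_target_state */
static atomic_int patch_pending;	/* stands in for TIF_PATCH_PENDING */

static void *transition_side(void *arg)
{
	target_state = 1;				/* publish the state first... */
	atomic_store_explicit(&patch_pending, 1,	/* ...then raise the flag */
			      memory_order_release);	/* plays the role of smp_wmb() */
	return NULL;
}

static void *task_side(void *arg)
{
	while (!atomic_load_explicit(&patch_pending,		/* observe the flag... */
				     memory_order_acquire))	/* ...acquire plays the role of smp_rmb() */
		;
	/* Guaranteed to see the state that was written before the flag was set. */
	printf("target_state = %d\n", target_state);
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&r, NULL, task_side, NULL);
	pthread_create(&w, NULL, transition_side, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}
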
......@@ -300,6 +300,9 @@ noinstr u64 local_clock(void)
if (static_branch_likely(&__sched_clock_stable))
return sched_clock() + __sched_clock_offset;
if (!static_branch_likely(&sched_clock_running))
return sched_clock();
preempt_disable_notrace();
clock = sched_clock_local(this_scd());
preempt_enable_notrace();
......
......@@ -2246,6 +2246,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
task_on_cpu(rq, task) ||
!dl_task(task) ||
is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
......@@ -2704,6 +2705,13 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
#endif
}
#ifdef CONFIG_SCHED_CORE
static int task_is_throttled_dl(struct task_struct *p, int cpu)
{
return p->dl.dl_throttled;
}
#endif
DEFINE_SCHED_CLASS(dl) = {
.enqueue_task = enqueue_task_dl,
......@@ -2736,6 +2744,9 @@ DEFINE_SCHED_CLASS(dl) = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
#ifdef CONFIG_SCHED_CORE
.task_is_throttled = task_is_throttled_dl,
#endif
};
/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
......
......@@ -6016,6 +6016,10 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
/* Add a random offset so that timers interleave */
hrtimer_set_expires(&cfs_b->period_timer,
get_random_u32_below(cfs_b->period));
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->slack_started = false;
......@@ -6671,7 +6675,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->stats.nr_wakeups_affine_attempts);
if (target == nr_cpumask_bits)
if (target != this_cpu)
return prev_cpu;
schedstat_inc(sd->ttwu_move_affine);
......@@ -12033,6 +12037,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
return delta > 0;
}
static int task_is_throttled_fair(struct task_struct *p, int cpu)
{
struct cfs_rq *cfs_rq;
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq = task_group(p)->cfs_rq[cpu];
#else
cfs_rq = &cpu_rq(cpu)->cfs;
#endif
return throttled_hierarchy(cfs_rq);
}
#else
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif
......@@ -12659,6 +12675,10 @@ DEFINE_SCHED_CLASS(fair) = {
.task_change_group = task_change_group_fair,
#endif
#ifdef CONFIG_SCHED_CORE
.task_is_throttled = task_is_throttled_fair,
#endif
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
......
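
The random offset added to period_timer above matters because, without it, every cfs_bandwidth period timer programmed with the same period tends to expire in the same tick, so all of the distribute/unthrottle work lands at once. A small standalone sketch of the effect (illustrative; rand() stands in for get_random_u32_below() and milliseconds for the hrtimer resolution):

#include <stdio.h>
#include <stdlib.h>

#define NTIMERS	8
#define PERIOD	100		/* ms, like the default cfs period */
#define HORIZON	1000		/* simulate one second */

static int worst_burst(const int *phase)
{
	int hits[HORIZON] = { 0 };
	int i, t, max = 0;

	for (i = 0; i < NTIMERS; i++)
		for (t = phase[i]; t < HORIZON; t += PERIOD)
			hits[t]++;			/* timers expiring in the same ms */
	for (t = 0; t < HORIZON; t++)
		if (hits[t] > max)
			max = hits[t];
	return max;
}

int main(void)
{
	int aligned[NTIMERS] = { 0 };	/* all timers start in phase */
	int spread[NTIMERS];
	int i;

	srand(1);
	for (i = 0; i < NTIMERS; i++)
		spread[i] = rand() % PERIOD;	/* random initial offset below the period */

	printf("worst per-ms burst, aligned phases: %d timers\n", worst_burst(aligned));
	printf("worst per-ms burst, random phases:  %d timers\n", worst_burst(spread));
	return 0;
}
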
......@@ -2000,11 +2000,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* the mean time, task could have
* migrated already or had its affinity changed.
* Also make sure that it wasn't scheduled on its rq.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
task_on_cpu(rq, task) ||
!rt_task(task) ||
is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
......@@ -2677,6 +2681,21 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
#ifdef CONFIG_SCHED_CORE
static int task_is_throttled_rt(struct task_struct *p, int cpu)
{
struct rt_rq *rt_rq;
#ifdef CONFIG_RT_GROUP_SCHED
rt_rq = task_group(p)->rt_rq[cpu];
#else
rt_rq = &cpu_rq(cpu)->rt;
#endif
return rt_rq_throttled(rt_rq);
}
#endif
DEFINE_SCHED_CLASS(rt) = {
.enqueue_task = enqueue_task_rt,
......@@ -2710,6 +2729,10 @@ DEFINE_SCHED_CLASS(rt) = {
.update_curr = update_curr_rt,
#ifdef CONFIG_SCHED_CORE
.task_is_throttled = task_is_throttled_rt,
#endif
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
......
......@@ -2224,6 +2224,10 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p);
#endif
#ifdef CONFIG_SCHED_CORE
int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
......@@ -3249,61 +3253,238 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
}
#ifdef CONFIG_SCHED_MM_CID
static inline int __mm_cid_get(struct mm_struct *mm)
#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
#define MM_CID_SCAN_DELAY 100 /* 100ms */
extern raw_spinlock_t cid_lock;
extern int use_cid_lock;
extern void sched_mm_cid_migrate_from(struct task_struct *t);
extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
extern void init_sched_mm_cid(struct task_struct *t);
static inline void __mm_cid_put(struct mm_struct *mm, int cid)
{
if (cid < 0)
return;
cpumask_clear_cpu(cid, mm_cidmask(mm));
}
/*
* The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
* the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
* be held to transition to other states.
*
* State transitions synchronized with cmpxchg or try_cmpxchg need to be
* consistent across cpus, which prevents use of this_cpu_cmpxchg.
*/
static inline void mm_cid_put_lazy(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
int cid;
lockdep_assert_irqs_disabled();
cid = __this_cpu_read(pcpu_cid->cid);
if (!mm_cid_is_lazy_put(cid) ||
!try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
return;
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
{
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
int cid, res;
lockdep_assert_irqs_disabled();
cid = __this_cpu_read(pcpu_cid->cid);
for (;;) {
if (mm_cid_is_unset(cid))
return MM_CID_UNSET;
/*
* Attempt transition from valid or lazy-put to unset.
*/
res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
if (res == cid)
break;
cid = res;
}
return cid;
}
static inline void mm_cid_put(struct mm_struct *mm)
{
int cid;
lockdep_assert_irqs_disabled();
cid = mm_cid_pcpu_unset(mm);
if (cid == MM_CID_UNSET)
return;
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
static inline int __mm_cid_try_get(struct mm_struct *mm)
{
struct cpumask *cpumask;
int cid;
cpumask = mm_cidmask(mm);
cid = cpumask_first_zero(cpumask);
if (cid >= nr_cpu_ids)
/*
* Retry finding first zero bit if the mask is temporarily
* filled. This only happens during concurrent remote-clear
* which owns a cid without holding a rq lock.
*/
for (;;) {
cid = cpumask_first_zero(cpumask);
if (cid < nr_cpu_ids)
break;
cpu_relax();
}
if (cpumask_test_and_set_cpu(cid, cpumask))
return -1;
__cpumask_set_cpu(cid, cpumask);
return cid;
}
static inline void mm_cid_put(struct mm_struct *mm, int cid)
/*
* Save a snapshot of the current runqueue time of this cpu
* with the per-cpu cid value, allowing to estimate how recently it was used.
*/
static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
{
struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
lockdep_assert_rq_held(rq);
WRITE_ONCE(pcpu_cid->time, rq->clock);
}
static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
lockdep_assert_irqs_disabled();
if (cid < 0)
return;
raw_spin_lock(&mm->cid_lock);
__cpumask_clear_cpu(cid, mm_cidmask(mm));
raw_spin_unlock(&mm->cid_lock);
int cid;
/*
* All allocations (even those using the cid_lock) are lock-free. If
* use_cid_lock is set, hold the cid_lock to perform cid allocation to
* guarantee forward progress.
*/
if (!READ_ONCE(use_cid_lock)) {
cid = __mm_cid_try_get(mm);
if (cid >= 0)
goto end;
raw_spin_lock(&cid_lock);
} else {
raw_spin_lock(&cid_lock);
cid = __mm_cid_try_get(mm);
if (cid >= 0)
goto unlock;
}
/*
* cid concurrently allocated. Retry while forcing following
* allocations to use the cid_lock to ensure forward progress.
*/
WRITE_ONCE(use_cid_lock, 1);
/*
* Set use_cid_lock before allocation. Only care about program order
* because this is only required for forward progress.
*/
barrier();
/*
* Retry until it succeeds. It is guaranteed to eventually succeed once
* all newcoming allocations observe the use_cid_lock flag set.
*/
do {
cid = __mm_cid_try_get(mm);
cpu_relax();
} while (cid < 0);
/*
* Allocate before clearing use_cid_lock. Only care about
* program order because this is for forward progress.
*/
barrier();
WRITE_ONCE(use_cid_lock, 0);
unlock:
raw_spin_unlock(&cid_lock);
end:
mm_cid_snapshot_time(rq, mm);
return cid;
}
static inline int mm_cid_get(struct mm_struct *mm)
static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
int ret;
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
struct cpumask *cpumask;
int cid;
lockdep_assert_irqs_disabled();
raw_spin_lock(&mm->cid_lock);
ret = __mm_cid_get(mm);
raw_spin_unlock(&mm->cid_lock);
return ret;
lockdep_assert_rq_held(rq);
cpumask = mm_cidmask(mm);
cid = __this_cpu_read(pcpu_cid->cid);
if (mm_cid_is_valid(cid)) {
mm_cid_snapshot_time(rq, mm);
return cid;
}
if (mm_cid_is_lazy_put(cid)) {
if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
cid = __mm_cid_get(rq, mm);
__this_cpu_write(pcpu_cid->cid, cid);
return cid;
}
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
static inline void switch_mm_cid(struct rq *rq,
struct task_struct *prev,
struct task_struct *next)
{
/*
* Provide a memory barrier between rq->curr store and load of
* {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
*
* Should be adapted if context_switch() is modified.
*/
if (!next->mm) { // to kernel
/*
* user -> kernel transition does not guarantee a barrier, but
* we can use the fact that it performs an atomic operation in
* mmgrab().
*/
if (prev->mm) // from user
smp_mb__after_mmgrab();
/*
* kernel -> kernel transition does not change rq->curr->mm
* state. It stays NULL.
*/
} else { // to user
/*
* kernel -> user transition does not provide a barrier
* between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
* Provide it here.
*/
if (!prev->mm) // from kernel
smp_mb();
/*
* user -> user transition guarantees a memory barrier through
* switch_mm() when current->mm changes. If current->mm is
* unchanged, no barrier is needed.
*/
}
if (prev->mm_cid_active) {
if (next->mm_cid_active && next->mm == prev->mm) {
/*
* Context switch between threads in same mm, hand over
* the mm_cid from prev to next.
*/
next->mm_cid = prev->mm_cid;
prev->mm_cid = -1;
return;
}
mm_cid_put(prev->mm, prev->mm_cid);
mm_cid_snapshot_time(rq, prev->mm);
mm_cid_put_lazy(prev);
prev->mm_cid = -1;
}
if (next->mm_cid_active)
next->mm_cid = mm_cid_get(next->mm);
next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
}
#else
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif
#endif /* _KERNEL_SCHED_SCHED_H */
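
The __mm_cid_get() comments above describe a two-mode allocator: the common path takes a cid lock-free from the mm's cpumask, and only when an allocation keeps losing races is use_cid_lock set so that every allocator serializes on cid_lock until the backlog drains, which guarantees forward progress. A simplified standalone model of that shape (illustrative names, C11 atomics and a pthread mutex instead of the kernel primitives, and without the lazy-put/remote-clear handling):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_IDS 4

static atomic_uint bitmap;		/* one bit per allocated id */
static atomic_int use_lock;		/* set while lock-free callers keep losing races */
static pthread_mutex_t fallback_lock = PTHREAD_MUTEX_INITIALIZER;

static int try_get(void)		/* roughly __mm_cid_try_get() */
{
	unsigned int old = atomic_load(&bitmap);
	int id;

	for (id = 0; id < NR_IDS; id++)
		if (!(old & (1u << id)))
			break;
	if (id == NR_IDS)
		return -1;		/* mask temporarily full */
	if (atomic_compare_exchange_strong(&bitmap, &old, old | (1u << id)))
		return id;
	return -1;			/* lost the race, caller decides what to do */
}

static int get_id(void)			/* roughly __mm_cid_get() */
{
	int id;

	if (!atomic_load(&use_lock)) {
		id = try_get();		/* fast path: lock-free */
		if (id >= 0)
			return id;
	}
	/* Contended: force all allocators through the lock until this one succeeds.
	 * In the kernel the retry is guaranteed to finish because a concurrent
	 * owner eventually releases its cid. */
	pthread_mutex_lock(&fallback_lock);
	atomic_store(&use_lock, 1);
	do {
		id = try_get();
	} while (id < 0);
	atomic_store(&use_lock, 0);
	pthread_mutex_unlock(&fallback_lock);
	return id;
}

int main(void)
{
	int a = get_id(), b = get_id(), c = get_id();

	printf("got ids %d %d %d\n", a, b, c);
	return 0;
}
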
......@@ -209,8 +209,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
static unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;
static DEFINE_MUTEX(sched_energy_mutex);
static bool sched_energy_update;
void rebuild_sched_domains_energy(void)
{
......