Commit 12fa97c6 authored by Peter Zijlstra

Merge branch 'sched/migrate-disable'

parents b6d37a76 c777d847
......@@ -382,9 +382,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Cpus_allowed:\t%*pb\n",
cpumask_pr_args(task->cpus_ptr));
cpumask_pr_args(&task->cpus_mask));
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
cpumask_pr_args(task->cpus_ptr));
cpumask_pr_args(&task->cpus_mask));
}
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
......
......@@ -152,6 +152,7 @@ enum cpuhp_state {
CPUHP_AP_ONLINE,
CPUHP_TEARDOWN_CPU,
CPUHP_AP_ONLINE_IDLE,
CPUHP_AP_SCHED_WAIT_EMPTY,
CPUHP_AP_SMPBOOT_THREADS,
CPUHP_AP_X86_VDSO_VMA_ONLINE,
CPUHP_AP_IRQ_AFFINITY_ONLINE,
......
......@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
return cpumask_next_and(-1, src1p, src2p);
}
static inline int cpumask_any_distribute(const struct cpumask *srcp)
{
return cpumask_first(srcp);
}
#define for_each_cpu(cpu, mask) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_not(cpu, mask) \
......@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);
int cpumask_any_and_distribute(const struct cpumask *src1p,
const struct cpumask *src2p);
int cpumask_any_distribute(const struct cpumask *srcp);
/**
* for_each_cpu - iterate over every cpu in a mask
......
......@@ -322,6 +322,73 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
#endif
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
/*
* Migrate-Disable and why it is undesired.
*
 * When a preempted task becomes eligible to run under the ideal model (IOW it
* becomes one of the M highest priority tasks), it might still have to wait
* for the preemptee's migrate_disable() section to complete. Thereby suffering
* a reduction in bandwidth in the exact duration of the migrate_disable()
* section.
*
* Per this argument, the change from preempt_disable() to migrate_disable()
* gets us:
*
 * - a higher priority task gains reduced wake-up latency; with preempt_disable()
 *   it would have had to wait for the lower priority task.
 *
 * - a lower priority task, which under preempt_disable() could've instantly
 *   migrated away when another CPU becomes available, is now constrained
 *   by the ability to push the higher priority task away, which might itself be
 *   in a migrate_disable() section, reducing its available bandwidth.
*
* IOW it trades latency / moves the interference term, but it stays in the
* system, and as long as it remains unbounded, the system is not fully
* deterministic.
*
*
* The reason we have it anyway.
*
 * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
 * number of primitives to become preemptible, it also allows them to be
 * migrated. This turns out to break a bunch of per-cpu usage. To this end,
 * all these primitives employ migrate_disable() to restore this implicit
 * assumption.
*
* This is a 'temporary' work-around at best. The correct solution is getting
* rid of the above assumptions and reworking the code to employ explicit
* per-cpu locking or short preempt-disable regions.
*
* The end goal must be to get rid of migrate_disable(), alternatively we need
 * a schedulability theory that does not depend on arbitrary migration.
*
*
* Notes on the implementation.
*
* The implementation is particularly tricky since existing code patterns
 * dictate that neither migrate_disable() nor migrate_enable() is allowed to block.
* This means that it cannot use cpus_read_lock() to serialize against hotplug,
* nor can it easily migrate itself into a pending affinity mask change on
* migrate_enable().
*
*
 * Note: even non-work-conserving schedulers like semi-partitioned depend on
* migration, so migrate_disable() is not only a problem for
* work-conserving schedulers.
*
*/
extern void migrate_disable(void);
extern void migrate_enable(void);
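/*
 * Minimal usage sketch (the function name below is hypothetical, not part
 * of this commit): within the section the task may still be preempted on
 * PREEMPT_RT, but it cannot be migrated, so smp_processor_id() and any
 * per-CPU pointers obtained here remain valid until migrate_enable().
 */
static void example_cpu_pinned_section(void)
{
	int cpu;

	migrate_disable();
	cpu = smp_processor_id();	/* stable for the whole section */
	/* ... work that must keep running on @cpu ... */
	migrate_enable();
}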
#elif defined(CONFIG_PREEMPT_RT)
static inline void migrate_disable(void) { }
static inline void migrate_enable(void) { }
#else /* !CONFIG_PREEMPT_RT */
/**
* migrate_disable - Prevent migration of the current task
*
......@@ -352,4 +419,6 @@ static __always_inline void migrate_enable(void)
preempt_enable();
}
#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
#endif /* __LINUX_PREEMPT_H */
......@@ -714,6 +714,11 @@ struct task_struct {
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t cpus_mask;
void *migration_pending;
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
unsigned short migration_disabled;
#endif
unsigned short migration_flags;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
......
......@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
extern int sched_cpu_deactivate(unsigned int cpu);
#ifdef CONFIG_HOTPLUG_CPU
extern int sched_cpu_wait_empty(unsigned int cpu);
extern int sched_cpu_dying(unsigned int cpu);
#else
# define sched_cpu_wait_empty NULL
# define sched_cpu_dying NULL
#endif
......
......@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
struct cpu_stop_work {
struct list_head list; /* cpu_stopper->works */
cpu_stop_fn_t fn;
unsigned long caller;
void *arg;
struct cpu_stop_done *done;
};
......@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
void stop_machine_unpark(int cpu);
void stop_machine_yield(const struct cpumask *cpumask);
extern void print_stop_info(const char *log_lvl, struct task_struct *task);
#else /* CONFIG_SMP */
#include <linux/workqueue.h>
......@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
return false;
}
static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
#endif /* CONFIG_SMP */
/*
......
......@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.name = "ap:online",
},
/*
* Handled on controll processor until the plugged processor manages
* Handled on control processor until the plugged processor manages
* this itself.
*/
[CPUHP_TEARDOWN_CPU] = {
......@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.teardown.single = takedown_cpu,
.cant_stop = true,
},
[CPUHP_AP_SCHED_WAIT_EMPTY] = {
.name = "sched:waitempty",
.startup.single = NULL,
.teardown.single = sched_cpu_wait_empty,
},
/* Handle smpboot threads park/unpark */
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",
......
......@@ -1696,6 +1696,80 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT_RT
static void
__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask,
u32 flags);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
if (likely(!p->migration_disabled))
return;
if (p->cpus_ptr != &p->cpus_mask)
return;
/*
* Violates locking rules! see comment in __do_set_cpus_allowed().
*/
__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
}
void migrate_disable(void)
{
struct task_struct *p = current;
if (p->migration_disabled) {
p->migration_disabled++;
return;
}
preempt_disable();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
{
struct task_struct *p = current;
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
preempt_disable();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
/*
* Mustn't clear migration_disabled() until cpus_ptr points back at the
* regular cpus_mask, otherwise things that race (eg.
* select_fallback_rq) get confused.
*/
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return rq->nr_pinned;
}
#endif
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
......@@ -1705,7 +1779,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (is_per_cpu_kthread(p))
if (is_per_cpu_kthread(p) || is_migration_disabled(p))
return cpu_online(cpu);
return cpu_active(cpu);
......@@ -1750,8 +1824,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
}
struct migration_arg {
struct task_struct *task;
int dest_cpu;
struct task_struct *task;
int dest_cpu;
struct set_affinity_pending *pending;
};
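/*
 * Usage summary for the structure below (the full protocol is described in
 * the comment above affine_move_task() further down):
 *
 * @refs:      one reference per waiter in affine_move_task(), plus one for
 *             the stopper work queued on the migrate_enable() path.
 * @done:      completed once the requested affinity change has taken
 *             effect, or once a later request has superseded it.
 * @stop_work: pre-allocated stopper work used when the outermost
 *             migrate_enable() (or a chasing stopper) performs the move.
 * @arg:       pre-set migration_arg handed to migration_cpu_stop().
 */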
struct set_affinity_pending {
refcount_t refs;
struct completion done;
struct cpu_stop_work stop_work;
struct migration_arg arg;
};
/*
......@@ -1783,16 +1865,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
bool complete = false;
struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
local_irq_disable();
local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
......@@ -1802,21 +1887,126 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
if (is_migration_disabled(p))
goto out;
if (pending) {
p->migration_pending = NULL;
complete = true;
}
/* migrate_enable() -- we must not race against SCA */
if (dest_cpu < 0) {
/*
* When this was migrate_enable() but we no longer
* have a @pending, a concurrent SCA 'fixed' things
* and we should be valid again. Nothing to do.
*/
if (!pending) {
WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
goto out;
}
dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
if (task_on_rq_queued(p))
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
rq = __migrate_task(rq, &rf, p, dest_cpu);
else
p->wake_cpu = arg->dest_cpu;
p->wake_cpu = dest_cpu;
} else if (dest_cpu < 0) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
* point we're a regular task again and not current anymore.
*
* A !PREEMPT kernel has a giant hole here, which makes it far
* more likely.
*/
/*
* When this was migrate_enable() but we no longer have an
* @pending, a concurrent SCA 'fixed' things and we should be
* valid again. Nothing to do.
*/
if (!pending) {
WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
goto out;
}
/*
* When migrate_enable() hits a rq mis-match we can't reliably
* determine is_migration_disabled() and so have to chase after
* it.
*/
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
rq_unlock(rq, &rf);
raw_spin_unlock(&p->pi_lock);
out:
task_rq_unlock(rq, p, &rf);
if (complete)
complete_all(&pending->done);
/* For pending->{arg,stop_work} */
pending = arg->pending;
if (pending && refcount_dec_and_test(&pending->refs))
wake_up_var(&pending->refs);
local_irq_enable();
return 0;
}
int push_cpu_stop(void *arg)
{
struct rq *lowest_rq = NULL, *rq = this_rq();
struct task_struct *p = arg;
raw_spin_lock_irq(&p->pi_lock);
raw_spin_lock(&rq->lock);
if (task_rq(p) != rq)
goto out_unlock;
if (is_migration_disabled(p)) {
p->migration_flags |= MDF_PUSH;
goto out_unlock;
}
p->migration_flags &= ~MDF_PUSH;
if (p->sched_class->find_lock_rq)
lowest_rq = p->sched_class->find_lock_rq(p, rq);
if (!lowest_rq)
goto out_unlock;
// XXX validate p is still the highest prio task
if (task_rq(p) == rq) {
deactivate_task(rq, p, 0);
set_task_cpu(p, lowest_rq->cpu);
activate_task(lowest_rq, p, 0);
resched_curr(lowest_rq);
}
double_unlock_balance(rq, lowest_rq);
out_unlock:
rq->push_busy = false;
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
return 0;
}
......@@ -1824,18 +2014,39 @@ static int migration_cpu_stop(void *data)
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
p->cpus_ptr = new_mask;
return;
}
cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
static void
__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
struct rq *rq = task_rq(p);
bool queued, running;
lockdep_assert_held(&p->pi_lock);
/*
* This here violates the locking rules for affinity, since we're only
* supposed to change these variables while holding both rq->lock and
* p->pi_lock.
*
* HOWEVER, it magically works, because ttwu() is the only code that
* accesses these variables under p->pi_lock and only does so after
* smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
* before finish_task().
*
* XXX do further audits, this smells like something putrid.
*/
if (flags & SCA_MIGRATE_DISABLE)
SCHED_WARN_ON(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
......@@ -1851,7 +2062,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (running)
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, new_mask);
p->sched_class->set_cpus_allowed(p, new_mask, flags);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
......@@ -1859,6 +2070,208 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
set_next_task(rq, p);
}
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
__do_set_cpus_allowed(p, new_mask, 0);
}
/*
* This function is wildly self concurrent; here be dragons.
*
*
* When given a valid mask, __set_cpus_allowed_ptr() must block until the
* designated task is enqueued on an allowed CPU. If that task is currently
* running, we have to kick it out using the CPU stopper.
*
* Migrate-Disable comes along and tramples all over our nice sandcastle.
* Consider:
*
* Initial conditions: P0->cpus_mask = [0, 1]
*
* P0@CPU0 P1
*
* migrate_disable();
* <preempted>
* set_cpus_allowed_ptr(P0, [1]);
*
* P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
* its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
* This means we need the following scheme:
*
* P0@CPU0 P1
*
* migrate_disable();
* <preempted>
* set_cpus_allowed_ptr(P0, [1]);
* <blocks>
* <resumes>
* migrate_enable();
* __set_cpus_allowed_ptr();
* <wakes local stopper>
* `--> <woken on migration completion>
*
* Now the fun stuff: there may be several P1-like tasks, i.e. multiple
* concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
* task p are serialized by p->pi_lock, which we can leverage: the one that
* should come into effect at the end of the Migrate-Disable region is the last
* one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
* but we still need to properly signal those waiting tasks at the appropriate
* moment.
*
* This is implemented using struct set_affinity_pending. The first
* __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
 * set up an instance of that struct and install it on the targeted task_struct.
* Any and all further callers will reuse that instance. Those then wait for
* a completion signaled at the tail of the CPU stopper callback (1), triggered
* on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
*
*
* (1) In the cases covered above. There is one more where the completion is
* signaled within affine_move_task() itself: when a subsequent affinity request
* cancels the need for an active migration. Consider:
*
* Initial conditions: P0->cpus_mask = [0, 1]
*
* P0@CPU0 P1 P2
*
* migrate_disable();
* <preempted>
* set_cpus_allowed_ptr(P0, [1]);
* <blocks>
* set_cpus_allowed_ptr(P0, [0, 1]);
* <signal completion>
* <awakes>
*
* Note that the above is safe vs a concurrent migrate_enable(), as any
* pending affinity completion is preceded by an uninstallation of
* p->migration_pending done with p->pi_lock held.
*/
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
int dest_cpu, unsigned int flags)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
struct migration_arg arg = {
.task = p,
.dest_cpu = dest_cpu,
};
bool complete = false;
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
struct task_struct *push_task = NULL;
if ((flags & SCA_MIGRATE_ENABLE) &&
(p->migration_flags & MDF_PUSH) && !rq->push_busy) {
rq->push_busy = true;
push_task = get_task_struct(p);
}
pending = p->migration_pending;
if (pending) {
refcount_inc(&pending->refs);
p->migration_pending = NULL;
complete = true;
}
task_rq_unlock(rq, p, rf);
if (push_task) {
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
p, &rq->push_work);
}
if (complete)
goto do_complete;
return 0;
}
if (!(flags & SCA_MIGRATE_ENABLE)) {
/* serialized by p->pi_lock */
if (!p->migration_pending) {
/* Install the request */
refcount_set(&my_pending.refs, 1);
init_completion(&my_pending.done);
p->migration_pending = &my_pending;
} else {
pending = p->migration_pending;
refcount_inc(&pending->refs);
}
}
pending = p->migration_pending;
/*
* - !MIGRATE_ENABLE:
* we'll have installed a pending if there wasn't one already.
*
* - MIGRATE_ENABLE:
* we're here because the current CPU isn't matching anymore,
* the only way that can happen is because of a concurrent
* set_cpus_allowed_ptr() call, which should then still be
* pending completion.
*
* Either way, we really should have a @pending here.
*/
if (WARN_ON_ONCE(!pending)) {
task_rq_unlock(rq, p, rf);
return -EINVAL;
}
if (flags & SCA_MIGRATE_ENABLE) {
refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
p->migration_flags &= ~MDF_PUSH;
task_rq_unlock(rq, p, rf);
pending->arg = (struct migration_arg) {
.task = p,
.dest_cpu = -1,
.pending = pending,
};
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
if (task_running(rq, p) || p->state == TASK_WAKING) {
/*
* Lessen races (and headaches) by delegating
* is_migration_disabled(p) checks to the stopper, which will
* run on the same CPU as said p.
*/
task_rq_unlock(rq, p, rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
} else {
if (!is_migration_disabled(p)) {
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);
p->migration_pending = NULL;
complete = true;
}
task_rq_unlock(rq, p, rf);
do_complete:
if (complete)
complete_all(&pending->done);
}
wait_for_completion(&pending->done);
if (refcount_dec_and_test(&pending->refs))
wake_up_var(&pending->refs);
/*
* Block the original owner of &pending until all subsequent callers
* have seen the completion and decremented the refcount
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
return 0;
}
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
......@@ -1869,7 +2282,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* call is not atomic; no spinlocks may be held.
*/
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check)
const struct cpumask *new_mask,
u32 flags)
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
unsigned int dest_cpu;
......@@ -1880,9 +2294,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
if (p->flags & PF_KTHREAD) {
if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs
* Kernel threads are allowed on online && !active CPUs.
*
* Specifically, migration_disabled() tasks must not fail the
* cpumask_any_and_distribute() pick below, esp. so on
* SCA_MIGRATE_ENABLE, otherwise we'll not call
* set_cpus_allowed_common() and actually reset p->cpus_ptr.
*/
cpu_valid_mask = cpu_online_mask;
}
......@@ -1891,13 +2310,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
if (check && (p->flags & PF_NO_SETAFFINITY)) {
if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
if (!(flags & SCA_MIGRATE_ENABLE)) {
if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
!cpumask_test_cpu(task_cpu(p), new_mask))) {
ret = -EBUSY;
goto out;
}
}
/*
* Picking a ~random cpu helps in cases where we are changing affinity
......@@ -1910,7 +2338,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
do_set_cpus_allowed(p, new_mask);
__do_set_cpus_allowed(p, new_mask, flags);
if (p->flags & PF_KTHREAD) {
/*
......@@ -1922,23 +2350,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
p->nr_cpus_allowed != 1);
}
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
return affine_move_task(rq, p, &rf, dest_cpu, flags);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
return 0;
} else if (task_on_rq_queued(p)) {
/*
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
rq = move_queued_task(rq, &rf, p, dest_cpu);
}
out:
task_rq_unlock(rq, p, &rf);
......@@ -1947,7 +2360,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
return __set_cpus_allowed_ptr(p, new_mask, false);
return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
......@@ -1988,6 +2401,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
*/
WARN_ON_ONCE(!cpu_online(new_cpu));
WARN_ON_ONCE(is_migration_disabled(p));
#endif
trace_sched_migrate_task(p, new_cpu);
......@@ -2318,6 +2733,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
/*
* XXX When called from select_task_rq() we only
* hold p->pi_lock and again violate locking order.
*
* More yuck to audit.
*/
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
break;
......@@ -2352,7 +2773,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1)
if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
cpu = cpumask_any(p->cpus_ptr);
......@@ -2375,6 +2796,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
static struct lock_class_key stop_pi_lock;
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
struct task_struct *old_stop = cpu_rq(cpu)->stop;
......@@ -2390,6 +2812,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
stop->sched_class = &stop_sched_class;
/*
* The PI code calls rt_mutex_setprio() with ->pi_lock held to
* adjust the effective priority of a task. As a result,
* rt_mutex_setprio() can trigger (RT) balancing operations,
* which can then trigger wakeups of the stop thread to push
* around the current task.
*
* The stop task itself will never be part of the PI-chain, it
* never blocks, therefore that ->pi_lock recursion is safe.
* Tell lockdep about this by placing the stop->pi_lock in its
* own class.
*/
lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
}
cpu_rq(cpu)->stop = stop;
......@@ -2406,13 +2842,25 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
#else
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check)
const struct cpumask *new_mask,
u32 flags)
{
return set_cpus_allowed_ptr(p, new_mask);
}
#endif /* CONFIG_SMP */
#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return false;
}
#endif
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
......@@ -3098,6 +3546,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
}
......@@ -3485,6 +3934,90 @@ static inline void finish_task(struct task_struct *prev)
#endif
}
#ifdef CONFIG_SMP
static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
{
void (*func)(struct rq *rq);
struct callback_head *next;
lockdep_assert_held(&rq->lock);
while (head) {
func = (void (*)(struct rq *))head->func;
next = head->next;
head->next = NULL;
head = next;
func(rq);
}
}
static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
{
struct callback_head *head = rq->balance_callback;
lockdep_assert_held(&rq->lock);
if (head) {
rq->balance_callback = NULL;
rq->balance_flags &= ~BALANCE_WORK;
}
return head;
}
static void __balance_callbacks(struct rq *rq)
{
do_balance_callbacks(rq, splice_balance_callbacks(rq));
}
static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
{
unsigned long flags;
if (unlikely(head)) {
raw_spin_lock_irqsave(&rq->lock, flags);
do_balance_callbacks(rq, head);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
}
static void balance_push(struct rq *rq);
static inline void balance_switch(struct rq *rq)
{
if (likely(!rq->balance_flags))
return;
if (rq->balance_flags & BALANCE_PUSH) {
balance_push(rq);
return;
}
__balance_callbacks(rq);
}
#else
static inline void __balance_callbacks(struct rq *rq)
{
}
static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
{
return NULL;
}
static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
{
}
static inline void balance_switch(struct rq *rq)
{
}
#endif
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
......@@ -3510,6 +4043,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
balance_switch(rq);
raw_spin_unlock_irq(&rq->lock);
}
......@@ -3651,43 +4185,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
return rq;
}
#ifdef CONFIG_SMP
/* rq->lock is NOT held, but preemption is disabled */
static void __balance_callback(struct rq *rq)
{
struct callback_head *head, *next;
void (*func)(struct rq *rq);
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
head = rq->balance_callback;
rq->balance_callback = NULL;
while (head) {
func = (void (*)(struct rq *))head->func;
next = head->next;
head->next = NULL;
head = next;
func(rq);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
static inline void balance_callback(struct rq *rq)
{
if (unlikely(rq->balance_callback))
__balance_callback(rq);
}
#else
static inline void balance_callback(struct rq *rq)
{
}
#endif
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
......@@ -3707,7 +4204,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
rq = finish_task_switch(prev);
balance_callback(rq);
preempt_enable();
if (current->set_child_tid)
......@@ -4515,6 +5011,7 @@ static void __sched notrace __schedule(bool preempt)
*/
++*switch_count;
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(preempt, prev, next);
......@@ -4523,10 +5020,11 @@ static void __sched notrace __schedule(bool preempt)
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
}
balance_callback(rq);
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_unlock_irq(&rq->lock);
}
}
void __noreturn do_task_dead(void)
......@@ -4937,9 +5435,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
__task_rq_unlock(rq, &rf);
balance_callback(rq);
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_unlock(&rq->lock);
preempt_enable();
}
#else
......@@ -5213,6 +5713,7 @@ static int __sched_setscheduler(struct task_struct *p,
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
const struct sched_class *prev_class;
struct callback_head *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
......@@ -5451,6 +5952,7 @@ static int __sched_setscheduler(struct task_struct *p,
/* Avoid rq from going away on us: */
preempt_disable();
head = splice_balance_callbacks(rq);
task_rq_unlock(rq, p, &rf);
if (pi) {
......@@ -5459,7 +5961,7 @@ static int __sched_setscheduler(struct task_struct *p,
}
/* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
balance_callbacks(rq, head);
preempt_enable();
return 0;
......@@ -5954,7 +6456,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
retval = __set_cpus_allowed_ptr(p, new_mask, true);
retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
......@@ -6443,6 +6945,7 @@ void sched_show_task(struct task_struct *p)
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
......@@ -6533,7 +7036,7 @@ void init_idle(struct task_struct *idle, int cpu)
*
* And since this is boot we can forgo the serialization.
*/
set_cpus_allowed_common(idle, cpumask_of(cpu));
set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
#endif
/*
* We're having a chicken and egg problem, even though we are
......@@ -6684,119 +7187,126 @@ void idle_task_exit(void)
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
/*
* Since this CPU is going 'away' for a while, fold any nr_active delta
* we might have. Assumes we're called after migrate_tasks() so that the
* nr_active count is stable. We need to take the teardown thread which
* is calling this into account, so we hand in adjust = 1 to the load
* calculation.
*
* Also see the comment "Global load-average calculations".
*/
static void calc_load_migrate(struct rq *rq)
static int __balance_push_cpu_stop(void *arg)
{
long delta = calc_load_fold_active(rq, 1);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
}
struct task_struct *p = arg;
struct rq *rq = this_rq();
struct rq_flags rf;
int cpu;
static struct task_struct *__pick_migrate_task(struct rq *rq)
{
const struct sched_class *class;
struct task_struct *next;
raw_spin_lock_irq(&p->pi_lock);
rq_lock(rq, &rf);
for_each_class(class) {
next = class->pick_next_task(rq);
if (next) {
next->sched_class->put_prev_task(rq, next);
return next;
}
update_rq_clock(rq);
if (task_rq(p) == rq && task_on_rq_queued(p)) {
cpu = select_fallback_rq(rq->cpu, p);
rq = __migrate_task(rq, &rf, p, cpu);
}
/* The idle class should always have a runnable task */
BUG();
rq_unlock(rq, &rf);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
return 0;
}
static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
*
 * Called with rq->lock held even though we're in stop_machine() and
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
* Ensure we only run per-cpu kthreads once the CPU goes !active.
*/
static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
static void balance_push(struct rq *rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
struct rq_flags orf = *rf;
int dest_cpu;
struct task_struct *push_task = rq->curr;
lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
/*
* Fudge the rq selection such that the below task selection loop
* doesn't get stuck on the currently eligible stop task.
*
* We're currently inside stop_machine() and the rq is either stuck
* in the stop_machine_cpu_stop() loop, or we're executing this code,
* either way we should never end up calling schedule() until we're
* done here.
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*/
rq->stop = NULL;
if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
/*
* If this is the idle task on the outgoing CPU try to wake
* up the hotplug control thread which might wait for the
* last task to vanish. The rcuwait_active() check is
* accurate here because the waiter is pinned on this CPU
 * and obviously can't be running in parallel.
*
* On RT kernels this also has to check whether there are
* pinned and scheduled out tasks on the runqueue. They
* need to leave the migrate disabled section first.
*/
if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
rcuwait_active(&rq->hotplug_wait)) {
raw_spin_unlock(&rq->lock);
rcuwait_wake_up(&rq->hotplug_wait);
raw_spin_lock(&rq->lock);
}
return;
}
get_task_struct(push_task);
/*
* put_prev_task() and pick_next_task() sched
* class method both need to have an up-to-date
* value of rq->clock[_task]
 * Temporarily drop rq->lock such that we can wake up the stop task.
* Both preemption and IRQs are still disabled.
*/
update_rq_clock(rq);
raw_spin_unlock(&rq->lock);
stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
this_cpu_ptr(&push_work));
/*
* At this point need_resched() is true and we'll take the loop in
* schedule(). The next pick is obviously going to be the stop task
* which is_per_cpu_kthread() and will push this task away.
*/
raw_spin_lock(&rq->lock);
}
for (;;) {
/*
* There's this thread running, bail when that's the only
* remaining thread:
*/
if (rq->nr_running == 1)
break;
static void balance_push_set(int cpu, bool on)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
next = __pick_migrate_task(rq);
rq_lock_irqsave(rq, &rf);
if (on)
rq->balance_flags |= BALANCE_PUSH;
else
rq->balance_flags &= ~BALANCE_PUSH;
rq_unlock_irqrestore(rq, &rf);
}
/*
* Rules for changing task_struct::cpus_mask are holding
* both pi_lock and rq->lock, such that holding either
* stabilizes the mask.
*
* Drop rq->lock is not quite as disastrous as it usually is
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
rq_unlock(rq, rf);
raw_spin_lock(&next->pi_lock);
rq_relock(rq, rf);
/*
 * Invoked from a CPU's hotplug control thread after the CPU has been marked
* inactive. All tasks which are not per CPU kernel threads are either
* pushed off this CPU now via balance_push() or placed on a different CPU
* during wakeup. Wait until the CPU is quiescent.
*/
static void balance_hotplug_wait(void)
{
struct rq *rq = this_rq();
/*
* Since we're inside stop-machine, _nothing_ should have
* changed the task, WARN if weird stuff happened, because in
* that case the above rq->lock drop is a fail too.
*/
if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
raw_spin_unlock(&next->pi_lock);
continue;
}
rcuwait_wait_event(&rq->hotplug_wait,
rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
TASK_UNINTERRUPTIBLE);
}
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
rq = __migrate_task(rq, rf, next, dest_cpu);
if (rq != dead_rq) {
rq_unlock(rq, rf);
rq = dead_rq;
*rf = orf;
rq_relock(rq, rf);
}
raw_spin_unlock(&next->pi_lock);
}
#else
rq->stop = stop;
static inline void balance_push(struct rq *rq)
{
}
static inline void balance_push_set(int cpu, bool on)
{
}
static inline void balance_hotplug_wait(void)
{
}
#endif /* CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
......@@ -6882,6 +7392,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
balance_push_set(cpu, false);
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
......@@ -6917,6 +7429,8 @@ int sched_cpu_activate(unsigned int cpu)
int sched_cpu_deactivate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
int ret;
set_cpu_active(cpu, false);
......@@ -6929,6 +7443,16 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
balance_push_set(cpu, true);
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
update_rq_clock(rq);
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
rq_unlock_irqrestore(rq, &rf);
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
......@@ -6942,6 +7466,7 @@ int sched_cpu_deactivate(unsigned int cpu)
ret = cpuset_cpu_inactive(cpu);
if (ret) {
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
return ret;
}
......@@ -6965,6 +7490,41 @@ int sched_cpu_starting(unsigned int cpu)
}
#ifdef CONFIG_HOTPLUG_CPU
/*
* Invoked immediately before the stopper thread is invoked to bring the
* CPU down completely. At this point all per CPU kthreads except the
* hotplug thread (current) and the stopper thread (inactive) have been
* either parked or have been unbound from the outgoing CPU. Ensure that
* any of those which might be on the way out are gone.
*
* If after this point a bound task is being woken on this CPU then the
 * responsible hotplug callback has failed to do its job.
* sched_cpu_dying() will catch it with the appropriate fireworks.
*/
int sched_cpu_wait_empty(unsigned int cpu)
{
balance_hotplug_wait();
return 0;
}
/*
* Since this CPU is going 'away' for a while, fold any nr_active delta we
* might have. Called from the CPU stopper task after ensuring that the
* stopper is the last running task on the CPU, so nr_active count is
* stable. We need to take the teardown thread which is calling this into
* account, so we hand in adjust = 1 to the load calculation.
*
* Also see the comment "Global load-average calculations".
*/
static void calc_load_migrate(struct rq *rq)
{
long delta = calc_load_fold_active(rq, 1);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
}
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
......@@ -6974,12 +7534,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
migrate_tasks(rq, &rf);
BUG_ON(rq->nr_running != 1);
BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
......@@ -7186,6 +7741,9 @@ void __init sched_init(void)
rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
#endif
#ifdef CONFIG_HOTPLUG_CPU
rcuwait_init(&rq->hotplug_wait);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
......
......@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
unsigned long cap, max_cap = 0;
int cpu, max_cpu = -1;
......@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
......
......@@ -97,11 +97,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
return 0;
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
return 0;
if (lowest_mask) {
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
/*
* We have to ensure that we have at least one bit
......
......@@ -559,7 +559,7 @@ static int push_dl_task(struct rq *rq);
static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
{
return dl_task(prev);
return rq->online && dl_task(prev);
}
static DEFINE_PER_CPU(struct callback_head, dl_push_head);
......@@ -1931,7 +1931,7 @@ static void task_fork_dl(struct task_struct *p)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, p->cpus_ptr))
cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
return 0;
}
......@@ -2021,8 +2021,8 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
best_cpu = cpumask_first_and(later_mask,
sched_domain_span(sd));
best_cpu = cpumask_any_and_distribute(later_mask,
sched_domain_span(sd));
/*
* Last chance: if a CPU being in both later_mask
* and current sd span is valid, that becomes our
......@@ -2044,7 +2044,7 @@ static int find_later_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
cpu = cpumask_any(later_mask);
cpu = cpumask_any_distribute(later_mask);
if (cpu < nr_cpu_ids)
return cpu;
......@@ -2081,7 +2081,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
......@@ -2148,6 +2148,9 @@ static int push_dl_task(struct rq *rq)
return 0;
retry:
if (is_migration_disabled(next_task))
return 0;
if (WARN_ON(next_task == rq->curr))
return 0;
......@@ -2225,7 +2228,7 @@ static void push_dl_tasks(struct rq *rq)
static void pull_dl_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
struct task_struct *p;
struct task_struct *p, *push_task;
bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
......@@ -2255,6 +2258,7 @@ static void pull_dl_task(struct rq *this_rq)
continue;
/* Might drop this_rq->lock */
push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
......@@ -2286,17 +2290,27 @@ static void pull_dl_task(struct rq *this_rq)
src_rq->curr->dl.deadline))
goto skip;
resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
resched = true;
}
/* Is there any other task even earlier? */
}
skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
raw_spin_unlock(&this_rq->lock);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
raw_spin_lock(&this_rq->lock);
}
}
if (resched)
......@@ -2320,7 +2334,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
}
static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
const struct cpumask *new_mask,
u32 flags)
{
struct root_domain *src_rd;
struct rq *rq;
......@@ -2349,7 +2364,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
set_cpus_allowed_common(p, new_mask);
set_cpus_allowed_common(p, new_mask, flags);
}
/* Assumes rq->lock is held */
......@@ -2542,6 +2557,7 @@ DEFINE_SCHED_CLASS(dl) = {
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
.find_lock_rq = find_lock_later_rq,
#endif
.task_tick = task_tick_dl,
......
......@@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
return rq->rt.highest_prio.curr > prev->prio;
return rq->online && rq->rt.highest_prio.curr > prev->prio;
}
static inline int rt_overloaded(struct rq *rq)
......@@ -1660,7 +1660,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, p->cpus_ptr))
cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
return 0;
......@@ -1754,8 +1754,8 @@ static int find_lowest_rq(struct task_struct *task)
return this_cpu;
}
best_cpu = cpumask_first_and(lowest_mask,
sched_domain_span(sd));
best_cpu = cpumask_any_and_distribute(lowest_mask,
sched_domain_span(sd));
if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
......@@ -1772,7 +1772,7 @@ static int find_lowest_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
cpu = cpumask_any(lowest_mask);
cpu = cpumask_any_distribute(lowest_mask);
if (cpu < nr_cpu_ids)
return cpu;
......@@ -1813,7 +1813,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
......@@ -1861,7 +1861,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
* running task can migrate over to a CPU that is running a task
* of lesser priority.
*/
static int push_rt_task(struct rq *rq)
static int push_rt_task(struct rq *rq, bool pull)
{
struct task_struct *next_task;
struct rq *lowest_rq;
......@@ -1875,6 +1875,34 @@ static int push_rt_task(struct rq *rq)
return 0;
retry:
if (is_migration_disabled(next_task)) {
struct task_struct *push_task = NULL;
int cpu;
if (!pull || rq->push_busy)
return 0;
cpu = find_lowest_rq(rq->curr);
if (cpu == -1 || cpu == rq->cpu)
return 0;
/*
 * Given we found a CPU with lower priority than @next_task,
 * it should be running. However we cannot migrate @next_task
 * to this other CPU; instead attempt to push away the task
 * currently running on this CPU.
*/
push_task = get_push_task(rq);
if (push_task) {
raw_spin_unlock(&rq->lock);
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
push_task, &rq->push_work);
raw_spin_lock(&rq->lock);
}
return 0;
}
if (WARN_ON(next_task == rq->curr))
return 0;
......@@ -1929,12 +1957,10 @@ static int push_rt_task(struct rq *rq)
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
ret = 1;
resched_curr(lowest_rq);
ret = 1;
double_unlock_balance(rq, lowest_rq);
out:
put_task_struct(next_task);
......@@ -1944,7 +1970,7 @@ static int push_rt_task(struct rq *rq)
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
while (push_rt_task(rq))
while (push_rt_task(rq, false))
;
}
......@@ -2097,7 +2123,8 @@ void rto_push_irq_work_func(struct irq_work *work)
*/
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
push_rt_tasks(rq);
while (push_rt_task(rq, true))
;
raw_spin_unlock(&rq->lock);
}
......@@ -2122,7 +2149,7 @@ static void pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
struct task_struct *p;
struct task_struct *p, *push_task;
struct rq *src_rq;
int rt_overload_count = rt_overloaded(this_rq);
......@@ -2169,6 +2196,7 @@ static void pull_rt_task(struct rq *this_rq)
* double_lock_balance, and another CPU could
* alter this_rq
*/
push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
......@@ -2196,11 +2224,14 @@ static void pull_rt_task(struct rq *this_rq)
if (p->prio < src_rq->curr->prio)
goto skip;
resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
resched = true;
}
/*
* We continue with the search, just in
* case there's an even higher prio task
......@@ -2210,6 +2241,13 @@ static void pull_rt_task(struct rq *this_rq)
}
skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
raw_spin_unlock(&this_rq->lock);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
raw_spin_lock(&this_rq->lock);
}
}
if (resched)
......@@ -2451,6 +2489,7 @@ DEFINE_SCHED_CLASS(rt) = {
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
.find_lock_rq = find_lock_lowest_rq,
#endif
.task_tick = task_tick_rt,
......
......@@ -975,6 +975,7 @@ struct rq {
unsigned long cpu_capacity_orig;
struct callback_head *balance_callback;
unsigned char balance_flags;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
......@@ -1005,6 +1006,10 @@ struct rq {
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
#ifdef CONFIG_HOTPLUG_CPU
struct rcuwait hotplug_wait;
#endif
#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
......@@ -1050,6 +1055,12 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
unsigned int nr_pinned;
#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
......@@ -1077,6 +1088,16 @@ static inline int cpu_of(struct rq *rq)
#endif
}
#define MDF_PUSH 0x01
static inline bool is_migration_disabled(struct task_struct *p)
{
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
return p->migration_disabled;
#else
return false;
#endif
}
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
......@@ -1223,6 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
#endif
#ifdef CONFIG_SMP
SCHED_WARN_ON(rq->balance_callback);
#endif
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
......@@ -1384,6 +1408,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SMP
#define BALANCE_WORK 0x01
#define BALANCE_PUSH 0x02
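/*
 * BALANCE_WORK: callbacks are queued on rq->balance_callback and will be
 *               run by __balance_callbacks() / balance_switch().
 * BALANCE_PUSH: the CPU is on its way out; balance_switch() runs
 *               balance_push() instead of the queued callbacks, and
 *               queue_balance_callback() refuses new work.
 */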
static inline void
queue_balance_callback(struct rq *rq,
struct callback_head *head,
......@@ -1391,12 +1418,13 @@ queue_balance_callback(struct rq *rq,
{
lockdep_assert_held(&rq->lock);
if (unlikely(head->next))
if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
return;
head->func = (void (*)(struct callback_head *))func;
head->next = rq->balance_callback;
rq->balance_callback = head;
rq->balance_flags |= BALANCE_WORK;
}
#define rcu_dereference_check_sched_domain(p) \
......@@ -1804,10 +1832,13 @@ struct sched_class {
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
const struct cpumask *newmask,
u32 flags);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
......@@ -1905,13 +1936,35 @@ static inline bool sched_fair_runnable(struct rq *rq)
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
#define SCA_MIGRATE_ENABLE 0x04
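/*
 * SCA_CHECK:           honour PF_NO_SETAFFINITY (sched_setaffinity() path).
 * SCA_MIGRATE_DISABLE: migrate_disable_switch() narrowing p->cpus_ptr to
 *                      the current CPU.
 * SCA_MIGRATE_ENABLE:  migrate_enable() pointing p->cpus_ptr back at
 *                      &p->cpus_mask.
 */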
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
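/*
 * Grab a reference to rq->curr for push_cpu_stop() and mark the rq as busy
 * pushing; returns NULL if a push is already in flight or the task cannot
 * run anywhere else anyway.
 */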
static inline struct task_struct *get_push_task(struct rq *rq)
{
struct task_struct *p = rq->curr;
lockdep_assert_held(&rq->lock);
if (rq->push_busy)
return NULL;
if (p->nr_cpus_allowed == 1)
return NULL;
rq->push_busy = true;
return get_task_struct(p);
}
extern int push_cpu_stop(void *arg);
#endif
......
......@@ -42,11 +42,27 @@ struct cpu_stopper {
struct list_head works; /* list of pending works */
struct cpu_stop_work stop_work; /* for stop_cpus */
unsigned long caller;
cpu_stop_fn_t fn;
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false;
void print_stop_info(const char *log_lvl, struct task_struct *task)
{
/*
* If @task is a stopper task, it cannot migrate and task_cpu() is
* stable.
*/
struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task));
if (task != stopper->thread)
return;
printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
}
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
static bool stop_cpus_in_progress;
......@@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
struct cpu_stop_done done;
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work))
......@@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
work1 = work2 = (struct cpu_stop_work){
.fn = multi_cpu_stop,
.arg = &msdata,
.done = &done
.done = &done,
.caller = _RET_IP_,
};
cpu_stop_init_done(&done, 2);
......@@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
return cpu_stop_queue_work(cpu, work_buf);
}
......@@ -487,6 +504,8 @@ static void cpu_stopper_thread(unsigned int cpu)
int ret;
/* cpu stop callbacks must not sleep, make in_atomic() == T */
stopper->caller = work->caller;
stopper->fn = fn;
preempt_count_inc();
ret = fn(arg);
if (done) {
......@@ -495,6 +514,8 @@ static void cpu_stopper_thread(unsigned int cpu)
cpu_stop_signal_done(done);
}
preempt_count_dec();
stopper->fn = NULL;
stopper->caller = 0;
WARN_ONCE(preempt_count(),
"cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
goto repeat;
......
......@@ -4908,6 +4908,10 @@ static void unbind_workers(int cpu)
pool->flags |= POOL_DISASSOCIATED;
raw_spin_unlock_irq(&pool->lock);
for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
mutex_unlock(&wq_pool_attach_mutex);
/*
......
......@@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p,
return next;
}
EXPORT_SYMBOL(cpumask_any_and_distribute);
int cpumask_any_distribute(const struct cpumask *srcp)
{
int next, prev;
/* NOTE: our first selection will skip 0. */
prev = __this_cpu_read(distribute_cpu_mask_prev);
next = cpumask_next(prev, srcp);
if (next >= nr_cpu_ids)
next = cpumask_first(srcp);
if (next < nr_cpu_ids)
__this_cpu_write(distribute_cpu_mask_prev, next);
return next;
}
EXPORT_SYMBOL(cpumask_any_distribute);
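/*
 * Behaviour sketch (hypothetical values, assuming no other *_distribute()
 * call updates the per-CPU prev in between): with srcp = {0,2,4} and prev
 * still at its initial 0, successive calls on one CPU return 2, 4, 0, 2, ...
 * -- the pick rotates through the mask instead of always being
 * cpumask_first(), and the very first call skips CPU 0 as the NOTE above says.
 */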
......@@ -12,6 +12,7 @@
#include <linux/atomic.h>
#include <linux/kexec.h>
#include <linux/utsname.h>
#include <linux/stop_machine.h>
static char dump_stack_arch_desc_str[128];
......@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl)
log_lvl, dump_stack_arch_desc_str);
print_worker_info(log_lvl, current);
print_stop_info(log_lvl, current);
}
/**
......
......@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
if (current->nr_cpus_allowed == 1)
goto out;
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
if (current->migration_disabled)
goto out;
#endif
/*
* It is valid to assume CPU-locality during early bootup:
*/
......