Commit f5fa3630 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Various scheduler fixes all over the place: three SCHED_DL fixes,
  three sched/numa fixes, two generic race fixes and a comment fix"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/dl: Fix preemption checks
  sched: Update comments for CLONE_NEWNS
  sched: stop the unbound recursion in preempt_schedule_context()
  sched/fair: Fix division by zero sysctl_numa_balancing_scan_size
  sched/fair: Care divide error in update_task_scan_period()
  sched/numa: Fix unsafe get_task_struct() in task_numa_assign()
  sched/deadline: Fix races between rt_mutex_setprio() and dl_task_timer()
  sched/deadline: Don't replenish from a !SCHED_DEADLINE entity
  sched: Fix race between task_group and sched_task_group
parents 5656b408 f3a7e1a9
......@@ -105,6 +105,7 @@ static __always_inline bool should_resched(void)
# ifdef CONFIG_CONTEXT_TRACKING
extern asmlinkage void ___preempt_schedule_context(void);
# define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
extern asmlinkage void preempt_schedule_context(void);
# endif
#endif
......
......@@ -13,7 +13,7 @@
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New namespace group? */
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
......
......@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
}
NOKPROBE_SYMBOL(context_tracking_user_enter);
#ifdef CONFIG_PREEMPT
/**
* preempt_schedule_context - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
* infrastructure itself. But as tracing can happen in areas coming
* from userspace or just about to enter userspace, a preempt enable
* can occur before user_exit() is called. This will cause the scheduler
* to be called when the system is still in usermode.
*
* To prevent this, the preempt_enable_notrace will use this function
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
asmlinkage __visible void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;
if (likely(!preemptible()))
return;
/*
* Need to disable preemption in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
* an infinite recursion.
*/
preempt_disable_notrace();
prev_ctx = exception_enter();
preempt_enable_no_resched_notrace();
preempt_schedule();
preempt_disable_notrace();
exception_exit(prev_ctx);
preempt_enable_notrace();
}
EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_PREEMPT */
/**
* context_tracking_user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel.
......
......@@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_CONTEXT_TRACKING
/**
* preempt_schedule_context - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
* infrastructure itself. But as tracing can happen in areas coming
* from userspace or just about to enter userspace, a preempt enable
* can occur before user_exit() is called. This will cause the scheduler
* to be called when the system is still in usermode.
*
* To prevent this, the preempt_enable_notrace will use this function
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
asmlinkage __visible void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;
if (likely(!preemptible()))
return;
do {
__preempt_count_add(PREEMPT_ACTIVE);
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
* an infinite recursion.
*/
prev_ctx = exception_enter();
__schedule();
exception_exit(prev_ctx);
__preempt_count_sub(PREEMPT_ACTIVE);
barrier();
} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_CONTEXT_TRACKING */
#endif /* CONFIG_PREEMPT */
/*
......@@ -7833,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
......@@ -8205,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
......
......@@ -518,12 +518,20 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
}
/*
* We need to take care of a possible races here. In fact, the
* task might have changed its scheduling policy to something
* different from SCHED_DEADLINE or changed its reservation
* parameters (through sched_setattr()).
* We need to take care of several possible races here:
*
* - the task might have changed its scheduling policy
* to something different than SCHED_DEADLINE
* - the task might have changed its reservation parameters
* (through sched_setattr())
* - the task might have been boosted by someone else and
* might be in the boosting/deboosting path
*
* In all this cases we bail out, as the task is already
* in the runqueue or is going to be enqueued back anyway.
*/
if (!dl_task(p) || dl_se->dl_new)
if (!dl_task(p) || dl_se->dl_new ||
dl_se->dl_boosted || !dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
......@@ -532,7 +540,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
dl_se->dl_yielded = 0;
if (task_on_rq_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
......@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
pi_se = &pi_task->dl;
} else if (!dl_prio(p->normal_prio)) {
/*
* Special case in which we have a !SCHED_DEADLINE task
* that is going to be deboosted, but exceedes its
* runtime while doing so. No point in replenishing
* it, as it's going to return back to its original
* scheduling class after this.
*/
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
return;
}
/*
* If p is throttled, we do nothing. In fact, if it exhausted
......@@ -1607,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
/* Only reschedule if pushing failed */
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && task_has_dl_policy(rq->curr))
if (check_resched) {
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
}
}
}
......
......@@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
if (scan_size < MAX_SCAN_WINDOW)
windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
......@@ -1164,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env,
long moveimp = imp;
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
if (cur->pid == 0) /* idle */
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
* No need to move the exiting task, and this ensures that ->curr
* wasn't reaped and thus get_task_struct() in task_numa_assign()
* is safe under RCU read lock.
* Note that rcu_read_lock() itself can't protect from the final
* put_task_struct() after the last schedule().
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
raw_spin_unlock_irq(&dst_rq->lock);
/*
* "imp" is the fault differential for the source task between the
......@@ -1520,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p,
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
......
......@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_numa_balancing_scan_size,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
},
{
.procname = "numa_balancing",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment