Merge tag 'sched-urgent-2021-05-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner: "A set of scheduler updates: - Prevent PSI state corruption when schedule() races with cgroup move. A recent commit combined two PSI callbacks to reduce the number of cgroup tree updates, but missed that schedule() can drop rq::lock for load balancing, which opens the race window for cgroup_move_task() which then observes half updated state. The fix is to solely use task::ps_flags instead of looking at the potentially mismatching scheduler state - Prevent an out-of-bounds access in uclamp caused bu a rounding division which can lead to an off-by-one error exceeding the buckets array size. - Prevent unfairness caused by missing load decay when a task is attached to a cfs runqueue. The old load of the task was attached to the runqueue and never removed. Fix it by enforcing the load update through the hierarchy for unthrottled run queue instances. - A documentation fix fot the 'sched_verbose' command line option" * tag 'sched-urgent-2021-05-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/fair: Fix unfairness caused by missing load decay sched: Fix out-of-bound access in uclamp psi: Fix psi state corruption when schedule() races with cgroup move sched,doc: sched_debug_verbose cmdline should be sched_verbose

Merge tag 'sched-urgent-2021-05-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner: "A set of scheduler updates: - Prevent PSI state corruption when schedule() races with cgroup move. A recent commit combined two PSI callbacks to reduce the number of cgroup tree updates, but missed that schedule() can drop rq::lock for load balancing, which opens the race window for cgroup_move_task() which then observes half updated state. The fix is to solely use task::ps_flags instead of looking at the potentially mismatching scheduler state - Prevent an out-of-bounds access in uclamp caused bu a rounding division which can lead to an off-by-one error exceeding the buckets array size. - Prevent unfairness caused by missing load decay when a task is attached to a cfs runqueue. The old load of the task was attached to the runqueue and never removed. Fix it by enforcing the load update through the hierarchy for unthrottled run queue instances. - A documentation fix fot the 'sched_verbose' command line option" * tag 'sched-urgent-2021-05-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/fair: Fix unfairness caused by missing load decay sched: Fix out-of-bound access in uclamp psi: Fix psi state corruption when schedule() races with cgroup move sched,doc: sched_debug_verbose cmdline should be sched_verbose
9819f682 · Linus Torvalds · 732a27a0 · 0258bdfa · 9819f682 · 9819f682
Commit 9819f682 authored May 09, 2021 by Linus Torvalds
4 changed files
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -74,7 +74,7 @@ for a given topology level by creating a sched_domain_topology_level array and
 calling set_sched_topology() with this array as the parameter.

 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
+CONFIG_SCHED_DEBUG and adding 'sched_verbose' to your cmdline. If you
 forgot to tweak your cmdline, you can also flip the
 /sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
 the sched domains which should catch most possible errors (described above). It

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -938,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);

 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
 {
-	return clamp_value / UCLAMP_BUCKET_DELTA;
+	return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
 }

 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10878,16 +10878,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq;

+	list_add_leaf_cfs_rq(cfs_rq_of(se));
+
 	/* Start to propagate at parent */
 	se = se->parent;

 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);

-		if (cfs_rq_throttled(cfs_rq))
-			break;
-
+		if (!cfs_rq_throttled(cfs_rq)){
 			update_load_avg(cfs_rq, se, UPDATE_TG);
+			list_add_leaf_cfs_rq(cfs_rq);
+			continue;
+		}
+
+		if (list_add_leaf_cfs_rq(cfs_rq))
+			break;
 	}
 }
 #else

--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -972,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
 */
 void cgroup_move_task(struct task_struct *task, struct css_set *to)
 {
-	unsigned int task_flags = 0;
+	unsigned int task_flags;
 	struct rq_flags rf;
 	struct rq *rq;

@@ -987,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)

 	rq = task_rq_lock(task, &rf);

-	if (task_on_rq_queued(task)) {
-		task_flags = TSK_RUNNING;
-		if (task_current(rq, task))
-			task_flags |= TSK_ONCPU;
-	} else if (task->in_iowait)
-		task_flags = TSK_IOWAIT;
-
-	if (task->in_memstall)
-		task_flags |= TSK_MEMSTALL;
+	/*
+	 * We may race with schedule() dropping the rq lock between
+	 * deactivating prev and switching to next. Because the psi
+	 * updates from the deactivation are deferred to the switch
+	 * callback to save cgroup tree updates, the task's scheduling
+	 * state here is not coherent with its psi state:
+	 *
+	 * schedule()                   cgroup_move_task()
+	 *   rq_lock()
+	 *   deactivate_task()
+	 *     p->on_rq = 0
+	 *     psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+	 *   pick_next_task()
+	 *     rq_unlock()
+	 *                                rq_lock()
+	 *                                psi_task_change() // old cgroup
+	 *                                task->cgroups = to
+	 *                                psi_task_change() // new cgroup
+	 *                                rq_unlock()
+	 *     rq_lock()
+	 *   psi_sched_switch() // does deferred updates in new cgroup
+	 *
+	 * Don't rely on the scheduling state. Use psi_flags instead.
+	 */
+	task_flags = task->psi_flags;

 	if (task_flags)
 		psi_task_change(task, task_flags, 0);