Commit 71dbdde7 authored by Johannes Weiner, committed by Peter Zijlstra

sched/psi: Remove NR_ONCPU task accounting

We put all fields updated by the scheduler in the first cacheline of
struct psi_group_cpu for performance.

Since we want to add another PSI_IRQ_FULL state to track IRQ/SOFTIRQ pressure,
we need to reclaim space first. This patch removes the NR_ONCPU task accounting
in struct psi_group_cpu, using one bit in state_mask to track it instead.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Tested-by: Chengming Zhou <zhouchengming@bytedance.com>
Link: https://lore.kernel.org/r/20220825164111.29534-7-zhouchengming@bytedance.com
parent 65176f59
...@@ -15,13 +15,6 @@ enum psi_task_count { ...@@ -15,13 +15,6 @@ enum psi_task_count {
NR_IOWAIT, NR_IOWAIT,
NR_MEMSTALL, NR_MEMSTALL,
NR_RUNNING, NR_RUNNING,
/*
* This can't have values other than 0 or 1 and could be
* implemented as a bit flag. But for now we still have room
* in the first cacheline of psi_group_cpu, and this way we
* don't have to special case any state tracking for it.
*/
NR_ONCPU,
/* /*
* For IO and CPU stalls the presence of running/oncpu tasks * For IO and CPU stalls the presence of running/oncpu tasks
* in the domain means a partial rather than a full stall. * in the domain means a partial rather than a full stall.
...@@ -32,16 +25,18 @@ enum psi_task_count { ...@@ -32,16 +25,18 @@ enum psi_task_count {
* threads and memstall ones. * threads and memstall ones.
*/ */
NR_MEMSTALL_RUNNING, NR_MEMSTALL_RUNNING,
NR_PSI_TASK_COUNTS = 5, NR_PSI_TASK_COUNTS = 4,
}; };
/* Task state bitmasks */ /* Task state bitmasks */
#define TSK_IOWAIT (1 << NR_IOWAIT) #define TSK_IOWAIT (1 << NR_IOWAIT)
#define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING) #define TSK_RUNNING (1 << NR_RUNNING)
#define TSK_ONCPU (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) #define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
/* Only one task can be scheduled, no corresponding task count */
#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS)
/* Resources that workloads could be stalled on */ /* Resources that workloads could be stalled on */
enum psi_res { enum psi_res {
PSI_IO, PSI_IO,
...@@ -68,6 +63,9 @@ enum psi_states { ...@@ -68,6 +63,9 @@ enum psi_states {
NR_PSI_STATES = 7, NR_PSI_STATES = 7,
}; };
/* Use one bit in the state mask to track TSK_ONCPU */
#define PSI_ONCPU (1 << NR_PSI_STATES)
enum psi_aggregators { enum psi_aggregators {
PSI_AVGS = 0, PSI_AVGS = 0,
PSI_POLL, PSI_POLL,
......
...@@ -212,7 +212,7 @@ void __init psi_init(void) ...@@ -212,7 +212,7 @@ void __init psi_init(void)
group_init(&psi_system); group_init(&psi_system);
} }
static bool test_state(unsigned int *tasks, enum psi_states state) static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{ {
switch (state) { switch (state) {
case PSI_IO_SOME: case PSI_IO_SOME:
...@@ -225,9 +225,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state) ...@@ -225,9 +225,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return unlikely(tasks[NR_MEMSTALL] && return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME: case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); return unlikely(tasks[NR_RUNNING] > oncpu);
case PSI_CPU_FULL: case PSI_CPU_FULL:
return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); return unlikely(tasks[NR_RUNNING] && !oncpu);
case PSI_NONIDLE: case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING]; tasks[NR_RUNNING];
...@@ -689,9 +689,9 @@ static void psi_group_change(struct psi_group *group, int cpu, ...@@ -689,9 +689,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
bool wake_clock) bool wake_clock)
{ {
struct psi_group_cpu *groupc; struct psi_group_cpu *groupc;
u32 state_mask = 0;
unsigned int t, m; unsigned int t, m;
enum psi_states s; enum psi_states s;
u32 state_mask;
groupc = per_cpu_ptr(group->pcpu, cpu); groupc = per_cpu_ptr(group->pcpu, cpu);
...@@ -707,17 +707,36 @@ static void psi_group_change(struct psi_group *group, int cpu, ...@@ -707,17 +707,36 @@ static void psi_group_change(struct psi_group *group, int cpu,
record_times(groupc, now); record_times(groupc, now);
/*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
* the state mask. Clear, set, or carry the current state if
* no changes are requested.
*/
if (unlikely(clear & TSK_ONCPU)) {
state_mask = 0;
clear &= ~TSK_ONCPU;
} else if (unlikely(set & TSK_ONCPU)) {
state_mask = PSI_ONCPU;
set &= ~TSK_ONCPU;
} else {
state_mask = groupc->state_mask & PSI_ONCPU;
}
/*
* The rest of the state mask is calculated based on the task
* counts. Update those first, then construct the mask.
*/
for (t = 0, m = clear; m; m &= ~(1 << t), t++) { for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t))) if (!(m & (1 << t)))
continue; continue;
if (groupc->tasks[t]) { if (groupc->tasks[t]) {
groupc->tasks[t]--; groupc->tasks[t]--;
} else if (!psi_bug) { } else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0], cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2], groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], groupc->tasks[4], groupc->tasks[3], clear, set);
clear, set);
psi_bug = 1; psi_bug = 1;
} }
} }
...@@ -726,9 +745,8 @@ static void psi_group_change(struct psi_group *group, int cpu, ...@@ -726,9 +745,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t)) if (set & (1 << t))
groupc->tasks[t]++; groupc->tasks[t]++;
/* Calculate state mask representing active states */
for (s = 0; s < NR_PSI_STATES; s++) { for (s = 0; s < NR_PSI_STATES; s++) {
if (test_state(groupc->tasks, s)) if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
state_mask |= (1 << s); state_mask |= (1 << s);
} }
...@@ -740,7 +758,7 @@ static void psi_group_change(struct psi_group *group, int cpu, ...@@ -740,7 +758,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
* task in a cgroup is in_memstall, the corresponding groupc * task in a cgroup is in_memstall, the corresponding groupc
* on that cpu is in PSI_MEM_FULL state. * on that cpu is in PSI_MEM_FULL state.
*/ */
if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
state_mask |= (1 << PSI_MEM_FULL); state_mask |= (1 << PSI_MEM_FULL);
groupc->state_mask = state_mask; groupc->state_mask = state_mask;
...@@ -828,7 +846,8 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, ...@@ -828,7 +846,8 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
*/ */
iter = NULL; iter = NULL;
while ((group = iterate_groups(next, &iter))) { while ((group = iterate_groups(next, &iter))) {
if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
PSI_ONCPU) {
common = group; common = group;
break; break;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment