Commit bd9a3dba authored by Linus Torvalds

Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull PSI updates from Ingo Molnar:

 - Various performance optimizations, resulting in a 4%-9% speedup in
   the mmtests/config-scheduler-perfpipe micro-benchmark.

 - New interface to turn PSI on/off at the per-cgroup level.

* tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/psi: Per-cgroup PSI accounting disable/re-enable interface
  sched/psi: Cache parent psi_group to speed up group iteration
  sched/psi: Consolidate cgroup_psi()
  sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
  sched/psi: Remove NR_ONCPU task accounting
  sched/psi: Optimize task switch inside shared cgroups again
  sched/psi: Move private helpers to sched/stats.h
  sched/psi: Save percpu memory when !psi_cgroups_enabled
  sched/psi: Don't create cgroup PSI files when psi_disabled
  sched/psi: Fix periodic aggregation shut off
parents 1df046ab 34f26a15
@@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
	killing cgroups is a process directed operation, i.e. it affects
	the whole thread-group.

+  cgroup.pressure
+	A read-write single value file. The allowed values are "0" and "1";
+	the default is "1".
+
+	Writing "0" to the file disables cgroup PSI accounting.
+	Writing "1" to the file re-enables cgroup PSI accounting.
+
+	This control attribute is not hierarchical: disabling or enabling
+	PSI accounting in a cgroup neither affects PSI accounting in its
+	descendants nor requires enablement to be passed down from the root.
+
+	The reason this control attribute exists is that PSI accounts
+	stalls for each cgroup separately and aggregates them at every
+	level of the hierarchy. This can add non-negligible overhead for
+	some workloads in a deep hierarchy, in which case this attribute
+	can be used to disable PSI accounting in the non-leaf cgroups.
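
For illustration only (not part of this patch): a minimal userspace sketch in C that toggles the new control file. It assumes cgroup2 is mounted at /sys/fs/cgroup and that a child cgroup named "workload" already exists.

	/* toggle_psi.c - disable or re-enable PSI accounting for one cgroup */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static int set_cgroup_psi(const char *cgroup_path, int enable)
	{
		char path[256];
		int fd;

		snprintf(path, sizeof(path), "%s/cgroup.pressure", cgroup_path);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		/* "0" disables, "1" re-enables accounting in this cgroup only */
		if (write(fd, enable ? "1" : "0", 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		return set_cgroup_psi("/sys/fs/cgroup/workload", 0);
	}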
+  irq.pressure
+	A read-write nested-keyed file.
+
+	Shows pressure stall information for IRQ/SOFTIRQ. See
+	:ref:`Documentation/accounting/psi.rst <psi>` for details.
+
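
Likewise illustrative (not from the patch): reading a cgroup's irq.pressure. The output format matches the other pressure files, except that only a "full" line is reported; the cgroup path below is hypothetical.

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/sys/fs/cgroup/workload/irq.pressure", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* e.g. "full avg10=0.00 avg60=0.00 avg300=0.00 total=0" */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}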
Controllers
===========
...
@@ -428,6 +428,9 @@ struct cgroup {
	struct cgroup_file procs_file;	/* handle for "cgroup.procs" */
	struct cgroup_file events_file;	/* handle for "cgroup.events" */

+	/* handles for "{cpu,memory,io,irq}.pressure" */
+	struct cgroup_file psi_files[NR_PSI_RESOURCES];
+
	/*
	 * The bitmask of subsystems enabled on the child cgroups.
	 * ->subtree_control is the one configured through
...
@@ -682,11 +682,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
	pr_cont_kernfs_path(cgrp->kn);
}

-static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
-{
-	return cgrp->psi;
-}
-
bool cgroup_psi_enabled(void);

static inline void cgroup_init_kthreadd(void)
...
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/cgroup-defs.h>
+#include <linux/cgroup.h>

struct seq_file;
struct css_set;
@@ -18,10 +19,6 @@ extern struct psi_group psi_system;

void psi_init(void);

-void psi_task_change(struct task_struct *task, int clear, int set);
-void psi_task_switch(struct task_struct *prev, struct task_struct *next,
-		     bool sleep);
-
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);

@@ -34,9 +31,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
			  poll_table *wait);

#ifdef CONFIG_CGROUPS
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+	return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+}
+
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
+void psi_cgroup_restart(struct psi_group *group);
#endif

#else /* CONFIG_PSI */
@@ -58,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
	rcu_assign_pointer(p->cgroups, to);
}
+static inline void psi_cgroup_restart(struct psi_group *group) {}

#endif

#endif /* CONFIG_PSI */
...
@@ -15,13 +15,6 @@ enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
-	/*
-	 * This can't have values other than 0 or 1 and could be
-	 * implemented as a bit flag. But for now we still have room
-	 * in the first cacheline of psi_group_cpu, and this way we
-	 * don't have to special case any state tracking for it.
-	 */
-	NR_ONCPU,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
@@ -32,22 +25,27 @@ enum psi_task_count {
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
-	NR_PSI_TASK_COUNTS = 5,
+	NR_PSI_TASK_COUNTS = 4,
};

/* Task state bitmasks */
#define TSK_IOWAIT	(1 << NR_IOWAIT)
#define TSK_MEMSTALL	(1 << NR_MEMSTALL)
#define TSK_RUNNING	(1 << NR_RUNNING)
-#define TSK_ONCPU	(1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)

+/* Only one task can be scheduled, no corresponding task count */
+#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)
+
/* Resources that workloads could be stalled on */
enum psi_res {
	PSI_IO,
	PSI_MEM,
	PSI_CPU,
-	NR_PSI_RESOURCES = 3,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	PSI_IRQ,
+#endif
+	NR_PSI_RESOURCES,
};

/*
@@ -63,11 +61,17 @@ enum psi_states {
	PSI_MEM_FULL,
	PSI_CPU_SOME,
	PSI_CPU_FULL,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	PSI_IRQ_FULL,
+#endif
	/* Only per-CPU, to weigh the CPU in the global average: */
	PSI_NONIDLE,
-	NR_PSI_STATES = 7,
+	NR_PSI_STATES,
};

+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU	(1 << NR_PSI_STATES)
+
enum psi_aggregators {
	PSI_AVGS = 0,
	PSI_POLL,
@@ -147,6 +151,9 @@ struct psi_trigger {
};

struct psi_group {
+	struct psi_group *parent;
+	bool enabled;
+
	/* Protects data used by the aggregator */
	struct mutex avgs_lock;

@@ -188,6 +195,8 @@ struct psi_group {

#else /* CONFIG_PSI */

+#define NR_PSI_RESOURCES	0
+
struct psi_group { };

#endif /* CONFIG_PSI */
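
To make the new bit layout concrete, a standalone C11 sketch (mirroring the enums above; not kernel code): TSK_ONCPU now sits one bit above the four remaining task counts instead of owning a tasks[] slot, just as PSI_ONCPU sits one bit above the last psi_states entry.

	#include <assert.h>

	enum psi_task_count {
		NR_IOWAIT,
		NR_MEMSTALL,
		NR_RUNNING,
		NR_MEMSTALL_RUNNING,
		NR_PSI_TASK_COUNTS,	/* == 4 after this series */
	};

	/* flag only - there is no tasks[] counter behind it anymore */
	#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)

	/* the flag must sit above every task-count bit */
	static_assert((TSK_ONCPU & ((1 << NR_PSI_TASK_COUNTS) - 1)) == 0,
		      "TSK_ONCPU collides with a task-count bit");

	int main(void)
	{
		return 0;
	}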
...
@@ -3698,27 +3698,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+	struct psi_group *psi = cgroup_psi(cgrp);

	return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+	struct psi_group *psi = cgroup_psi(cgrp);

	return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+	struct psi_group *psi = cgroup_psi(cgrp);

	return psi_show(seq, psi, PSI_CPU);
}

-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
-				     size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, enum psi_res res)
{
	struct cgroup_file_ctx *ctx = of->priv;
	struct psi_trigger *new;
@@ -3738,7 +3738,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
		return -EBUSY;
	}

-	psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+	psi = cgroup_psi(cgrp);
	new = psi_trigger_create(psi, buf, res);
	if (IS_ERR(new)) {
		cgroup_put(cgrp);
@@ -3755,21 +3755,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes,
					  loff_t off)
{
-	return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+	return pressure_write(of, buf, nbytes, PSI_IO);
}

static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
-	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+	return pressure_write(of, buf, nbytes, PSI_MEM);
}

static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
					 char *buf, size_t nbytes,
					 loff_t off)
{
-	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+	return pressure_write(of, buf, nbytes, PSI_CPU);
}

+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup_psi(cgrp);
+
+	return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+					 char *buf, size_t nbytes,
+					 loff_t off)
+{
+	return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup_psi(cgrp);
+
+	seq_printf(seq, "%d\n", psi->enabled);
+
+	return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+				     char *buf, size_t nbytes,
+				     loff_t off)
+{
+	ssize_t ret;
+	int enable;
+	struct cgroup *cgrp;
+	struct psi_group *psi;
+
+	ret = kstrtoint(strstrip(buf), 0, &enable);
+	if (ret)
+		return ret;
+
+	if (enable < 0 || enable > 1)
+		return -ERANGE;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENOENT;
+
+	psi = cgroup_psi(cgrp);
+	if (psi->enabled != enable) {
+		int i;
+
+		/* show or hide {cpu,memory,io,irq}.pressure files */
+		for (i = 0; i < NR_PSI_RESOURCES; i++)
+			cgroup_file_show(&cgrp->psi_files[i], enable);
+
+		psi->enabled = enable;
+		if (enable)
+			psi_cgroup_restart(psi);
+	}
+
+	cgroup_kn_unlock(of->kn);
+
+	return nbytes;
+}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3789,6 +3854,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)

bool cgroup_psi_enabled(void)
{
+	if (static_branch_likely(&psi_disabled))
+		return false;
+
	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
@@ -5175,6 +5243,7 @@ static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
	{
		.name = "io.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
		.seq_show = cgroup_io_pressure_show,
		.write = cgroup_io_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -5182,6 +5251,7 @@ static struct cftype cgroup_psi_files[] = {
	},
	{
		.name = "memory.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
		.seq_show = cgroup_memory_pressure_show,
		.write = cgroup_memory_pressure_write,
		.poll = cgroup_pressure_poll,
@@ -5189,11 +5259,27 @@ static struct cftype cgroup_psi_files[] = {
	},
	{
		.name = "cpu.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
		.seq_show = cgroup_cpu_pressure_show,
		.write = cgroup_cpu_pressure_write,
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	{
+		.name = "irq.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+		.seq_show = cgroup_irq_pressure_show,
+		.write = cgroup_irq_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif
+	{
+		.name = "cgroup.pressure",
+		.seq_show = cgroup_pressure_show,
+		.write = cgroup_pressure_write,
+	},
#endif /* CONFIG_PSI */
	{ }	/* terminate */
};
...
@@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
+	psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
...
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
{
	int cpu;

+	group->enabled = true;
	for_each_possible_cpu(cpu)
		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
	group->avg_last_update = sched_clock();
@@ -201,6 +202,7 @@ void __init psi_init(void)
{
	if (!psi_enable) {
		static_branch_enable(&psi_disabled);
+		static_branch_disable(&psi_cgroups_enabled);
		return;
	}
@@ -211,7 +213,7 @@ void __init psi_init(void)
	group_init(&psi_system);
}

-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
	switch (state) {
	case PSI_IO_SOME:
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
		return unlikely(tasks[NR_MEMSTALL] &&
			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
	case PSI_CPU_SOME:
-		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] > oncpu);
	case PSI_CPU_FULL:
-		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] && !oncpu);
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
@@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
			     bool wake_clock)
{
	struct psi_group_cpu *groupc;
-	u32 state_mask = 0;
	unsigned int t, m;
	enum psi_states s;
+	u32 state_mask;

	groupc = per_cpu_ptr(group->pcpu, cpu);

	/*
-	 * First we assess the aggregate resource states this CPU's
-	 * tasks have been in since the last change, and account any
-	 * SOME and FULL time these may have resulted in.
-	 *
-	 * Then we update the task counts according to the state
+	 * First we update the task counts according to the state
	 * change requested through the @clear and @set bits.
+	 *
+	 * Then, if cgroup PSI stats accounting is enabled, we assess
+	 * the aggregate resource states this CPU's tasks have been
+	 * in since the last change, and account any SOME and FULL
+	 * time these may have resulted in.
	 */
	write_seqcount_begin(&groupc->seq);

-	record_times(groupc, now);
+	/*
+	 * Start with TSK_ONCPU, which doesn't have a corresponding
+	 * task count - it's just a boolean flag directly encoded in
+	 * the state mask. Clear, set, or carry the current state if
+	 * no changes are requested.
+	 */
+	if (unlikely(clear & TSK_ONCPU)) {
+		state_mask = 0;
+		clear &= ~TSK_ONCPU;
+	} else if (unlikely(set & TSK_ONCPU)) {
+		state_mask = PSI_ONCPU;
+		set &= ~TSK_ONCPU;
+	} else {
+		state_mask = groupc->state_mask & PSI_ONCPU;
+	}

+	/*
+	 * The rest of the state mask is calculated based on the task
+	 * counts. Update those first, then construct the mask.
+	 */
	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
		if (!(m & (1 << t)))
			continue;
		if (groupc->tasks[t]) {
			groupc->tasks[t]--;
		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], groupc->tasks[4],
-					clear, set);
+					groupc->tasks[3], clear, set);
			psi_bug = 1;
		}
	}
@@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
		if (set & (1 << t))
			groupc->tasks[t]++;

-	/* Calculate state mask representing active states */
+	if (!group->enabled) {
+		/*
+		 * On the first group change after disabling PSI, conclude
+		 * the current state and flush its time. This is unlikely
+		 * to matter to the user, but aggregation (get_recent_times)
+		 * may have already incorporated the live state into times_prev;
+		 * avoid a delta sample underflow when PSI is later re-enabled.
+		 */
+		if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+			record_times(groupc, now);
+
+		groupc->state_mask = state_mask;
+
+		write_seqcount_end(&groupc->seq);
+		return;
+	}
+
	for (s = 0; s < NR_PSI_STATES; s++) {
-		if (test_state(groupc->tasks, s))
+		if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
			state_mask |= (1 << s);
	}
@@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
	 * task in a cgroup is in_memstall, the corresponding groupc
	 * on that cpu is in PSI_MEM_FULL state.
	 */
-	if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+	if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
		state_mask |= (1 << PSI_MEM_FULL);

+	record_times(groupc, now);
+
	groupc->state_mask = state_mask;

	write_seqcount_end(&groupc->seq);
@@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}

-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
{
-	if (*iter == &psi_system)
-		return NULL;
-
#ifdef CONFIG_CGROUPS
-	if (static_branch_likely(&psi_cgroups_enabled)) {
-		struct cgroup *cgroup = NULL;
-
-		if (!*iter)
-			cgroup = task->cgroups->dfl_cgrp;
-		else
-			cgroup = cgroup_parent(*iter);
-
-		if (cgroup && cgroup_parent(cgroup)) {
-			*iter = cgroup;
-			return cgroup_psi(cgroup);
-		}
-	}
+	if (static_branch_likely(&psi_cgroups_enabled))
+		return cgroup_psi(task_dfl_cgroup(task));
#endif
-	*iter = &psi_system;
	return &psi_system;
}
@@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
	int cpu = task_cpu(task);
	struct psi_group *group;
-	bool wake_clock = true;
-	void *iter = NULL;
	u64 now;

	if (!task->pid)
@@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
	psi_flags_change(task, clear, set);

	now = cpu_clock(cpu);
-	/*
-	 * Periodic aggregation shuts off if there is a period of no
-	 * task changes, so we wake it back up if necessary. However,
-	 * don't do this if the task change is the aggregation worker
-	 * itself going to sleep, or we'll ping-pong forever.
-	 */
-	if (unlikely((clear & TSK_RUNNING) &&
-		     (task->flags & PF_WQ_WORKER) &&
-		     wq_worker_last_func(task) == psi_avgs_work))
-		wake_clock = false;

-	while ((group = iterate_groups(task, &iter)))
-		psi_group_change(group, cpu, clear, set, now, wake_clock);
+	group = task_psi_group(task);
+	do {
+		psi_group_change(group, cpu, clear, set, now, true);
+	} while ((group = group->parent));
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
	struct psi_group *group, *common = NULL;
	int cpu = task_cpu(prev);
-	void *iter;
	u64 now = cpu_clock(cpu);

	if (next->pid) {
-		bool identical_state;
-
		psi_flags_change(next, 0, TSK_ONCPU);
		/*
-		 * When switching between tasks that have an identical
-		 * runtime state, the cgroup that contains both tasks
-		 * we reach the first common ancestor. Iterate @next's
-		 * ancestors only until we encounter @prev's ONCPU.
+		 * Set TSK_ONCPU on @next's cgroups. If @next shares any
+		 * ancestors with @prev, those will already have @prev's
+		 * TSK_ONCPU bit set, and we can stop the iteration there.
		 */
-		identical_state = prev->psi_flags == next->psi_flags;
-		iter = NULL;
-		while ((group = iterate_groups(next, &iter))) {
-			if (identical_state &&
-			    per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+		group = task_psi_group(next);
+		do {
+			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+			    PSI_ONCPU) {
				common = group;
				break;
			}

			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
-		}
+		} while ((group = group->parent));
	}

	if (prev->pid) {
		int clear = TSK_ONCPU, set = 0;
+		bool wake_clock = true;

		/*
		 * When we're going to sleep, psi_dequeue() lets us
@@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
			clear |= TSK_MEMSTALL_RUNNING;
			if (prev->in_iowait)
				set |= TSK_IOWAIT;
+
+			/*
+			 * Periodic aggregation shuts off if there is a period of no
+			 * task changes, so we wake it back up if necessary. However,
+			 * don't do this if the task change is the aggregation worker
+			 * itself going to sleep, or we'll ping-pong forever.
+			 */
+			if (unlikely((prev->flags & PF_WQ_WORKER) &&
+				     wq_worker_last_func(prev) == psi_avgs_work))
+				wake_clock = false;
		}

		psi_flags_change(prev, clear, set);

-		iter = NULL;
-		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, clear, set, now, true);
+		group = task_psi_group(prev);
+		do {
+			if (group == common)
+				break;
+			psi_group_change(group, cpu, clear, set, now, wake_clock);
+		} while ((group = group->parent));

		/*
-		 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
-		 * with dequeuing too, finish that for the rest of the hierarchy.
+		 * TSK_ONCPU is handled up to the common ancestor. If there are
+		 * any other differences between the two tasks (e.g. prev goes
+		 * to sleep, or only one task is memstall), finish propagating
+		 * those differences all the way up to the root.
		 */
-		if (sleep) {
+		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
			clear &= ~TSK_ONCPU;
-			for (; group; group = iterate_groups(prev, &iter))
-				psi_group_change(group, cpu, clear, set, now, true);
+			for (; group; group = group->parent)
+				psi_group_change(group, cpu, clear, set, now, wake_clock);
		}
	}
}
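
The do/while loops above replace the old iterate_groups() walk. A userland model of the idea (simplified types, hypothetical names; not kernel code): since psi_cgroup_alloc() now caches a parent pointer in every psi_group, the task-to-root walk becomes a plain linked-list traversal, visiting the leaf first and psi_system last.

	#include <stddef.h>
	#include <stdio.h>

	struct psi_group {
		struct psi_group *parent;	/* cached at group creation */
		const char *name;
	};

	/* stand-in for the kernel's psi_group_change() */
	static void psi_group_change(struct psi_group *g)
	{
		printf("update %s\n", g->name);
	}

	int main(void)
	{
		struct psi_group root = { NULL, "psi_system" };
		struct psi_group mid  = { &root, "parent.slice" };
		struct psi_group leaf = { &mid,  "workload" };
		struct psi_group *group = &leaf;

		do {
			psi_group_change(group);	/* leaf first, root last */
		} while ((group = group->parent));

		return 0;
	}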
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+	int cpu = task_cpu(task);
+	struct psi_group *group;
+	struct psi_group_cpu *groupc;
+	u64 now;
+
+	if (!task->pid)
+		return;
+
+	now = cpu_clock(cpu);
+
+	group = task_psi_group(task);
+	do {
+		if (!group->enabled)
+			continue;
+
+		groupc = per_cpu_ptr(group->pcpu, cpu);
+
+		write_seqcount_begin(&groupc->seq);
+
+		record_times(groupc, now);
+		groupc->times[PSI_IRQ_FULL] += delta;
+
+		write_seqcount_end(&groupc->seq);
+
+		if (group->poll_states & (1 << PSI_IRQ_FULL))
+			psi_schedule_poll_work(group, 1);
+	} while ((group = group->parent));
+}
+#endif
/**
 * psi_memstall_enter - mark the beginning of a memory stall section
 * @flags: flags to handle nested sections
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);

#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
-	if (static_branch_likely(&psi_disabled))
+	if (!static_branch_likely(&psi_cgroups_enabled))
		return 0;

	cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
		return -ENOMEM;
	}
	group_init(cgroup->psi);
+	cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
	return 0;
}

void psi_cgroup_free(struct cgroup *cgroup)
{
-	if (static_branch_likely(&psi_disabled))
+	if (!static_branch_likely(&psi_cgroups_enabled))
		return;

	cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
	struct rq_flags rf;
	struct rq *rq;

-	if (static_branch_likely(&psi_disabled)) {
+	if (!static_branch_likely(&psi_cgroups_enabled)) {
		/*
		 * Lame to do this here, but the scheduler cannot be locked
		 * from the outside, so we move cgroups from inside sched/.
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
	task_rq_unlock(rq, task, &rf);
}
+void psi_cgroup_restart(struct psi_group *group)
+{
+	int cpu;
+
+	/*
+	 * Clearing psi_group->enabled does not actually stop per-CPU
+	 * task accounting in each psi_group_cpu; it only stops the
+	 * test_state() loop, record_times() and the averaging worker.
+	 * See psi_group_change() for details.
+	 *
+	 * When cgroup PSI is disabled, this function has nothing to sync
+	 * since the cgroup pressure files are hidden and each per-CPU
+	 * psi_group_cpu sees !psi_group->enabled and only does task
+	 * accounting.
+	 *
+	 * When cgroup PSI is re-enabled, this function uses
+	 * psi_group_change() to get the correct state mask from the
+	 * test_state() loop over tasks[] and to restart
+	 * groupc->state_start from now. .clear = .set = 0 is used
+	 * because no task state actually changed.
+	 */
+	if (!group->enabled)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct rq_flags rf;
+		u64 now;
+
+		rq_lock_irq(rq, &rf);
+		now = cpu_clock(cpu);
+		psi_group_change(group, cpu, 0, 0, now, true);
+		rq_unlock_irq(rq, &rf);
+	}
+}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
+	bool only_full = false;
	int full;
	u64 now;
@@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
		group->avg_next_update = update_averages(group, now);
	mutex_unlock(&group->avgs_lock);

-	for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	only_full = res == PSI_IRQ;
+#endif
+
+	for (full = 0; full < 2 - only_full; full++) {
		unsigned long avg[3] = { 0, };
		u64 total = 0;
		int w;
@@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
		}

		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
-			   full ? "full" : "some",
+			   full || only_full ? "full" : "some",
			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
	else
		return ERR_PTR(-EINVAL);

+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+		return ERR_PTR(-EINVAL);
+#endif
+
	if (state >= PSI_NONIDLE)
		return ERR_PTR(-EINVAL);
@@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
	.proc_release	= psi_fop_release,
};

+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+	return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+			     size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+	.proc_open	= psi_irq_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_write	= psi_irq_write,
+	.proc_poll	= psi_fop_poll,
+	.proc_release	= psi_fop_release,
+};
+#endif
+
static int __init psi_proc_init(void)
{
	if (psi_enable) {
@@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
		proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
		proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
		proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+		proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
	}
	return 0;
}
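
Illustrative only (not part of the patch): arming a trigger on the new /proc/pressure/irq file uses the existing PSI trigger interface from Documentation/accounting/psi.rst; note that psi_trigger_create() above accepts only "full" triggers for IRQ pressure. The threshold and window values below are arbitrary examples.

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* notify when IRQ time stalls 150ms within any 1s window */
		const char trig[] = "full 150000 1000000";
		struct pollfd pfd;
		int fd = open("/proc/pressure/irq", O_RDWR | O_NONBLOCK);

		if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
			perror("trigger setup");
			return 1;
		}
		pfd.fd = fd;
		pfd.events = POLLPRI;
		if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI))
			printf("IRQ pressure threshold breached\n");
		close(fd);
		return 0;
	}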
...
@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
}

#ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+		     bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
/*
 * PSI tracks state that persists across sleeps, such as iowaits and
 * memory stalls. As a result, it has to distinguish between sleeps,
@@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
				    struct task_struct *next,
				    bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */

#ifdef CONFIG_SCHED_INFO
...