Commit f06cc667 authored by Peter Zijlstra

perf: Optimize perf_cgroup_switch()

Namhyung reported that bd275681 ("perf: Rewrite core context handling")
regresses context switch overhead when perf-cgroup is in use together
with 'slow' PMUs like uncore.

Specifically, perf_cgroup_switch()'s perf_ctx_disable() /
ctx_sched_out() etc. all iterate the full list of active PMUs for
that CPU, even if they don't have cgroup events.

Previously there was cgrp_cpuctx_list which linked the relevant PMUs
together, but that got lost in the rework. Instead of re-introducing
a similar list, let the perf_event_pmu_context iteration skip those
that do not have cgroup events. This avoids growing multiple versions
of the perf_event_pmu_context iteration.

Measured performance (on a slightly different patch):

Before)

  $ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
  # Running 'sched/pipe' benchmark:
  # Executed 10000 pipe operations between two processes

       Total time: 0.901 [sec]

        90.128700 usecs/op
            11095 ops/sec

After)

  $ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
  # Running 'sched/pipe' benchmark:
  # Executed 10000 pipe operations between two processes

       Total time: 0.065 [sec]

         6.560100 usecs/op
           152436 ops/sec

Fixes: bd275681 ("perf: Rewrite core context handling")
Reported-by: Namhyung Kim <namhyung@kernel.org>
Debugged-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231009210425.GC6307@noisy.programming.kicks-ass.net
parent 8f4156d5
...@@ -878,6 +878,7 @@ struct perf_event_pmu_context { ...@@ -878,6 +878,7 @@ struct perf_event_pmu_context {
unsigned int embedded : 1; unsigned int embedded : 1;
unsigned int nr_events; unsigned int nr_events;
unsigned int nr_cgroups;
atomic_t refcount; /* event <-> epc */ atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head; struct rcu_head rcu_head;
......
...@@ -375,6 +375,7 @@ enum event_type_t { ...@@ -375,6 +375,7 @@ enum event_type_t {
EVENT_TIME = 0x4, EVENT_TIME = 0x4,
/* see ctx_resched() for details */ /* see ctx_resched() for details */
EVENT_CPU = 0x8, EVENT_CPU = 0x8,
EVENT_CGROUP = 0x10,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
}; };
...@@ -684,20 +685,26 @@ do { \ ...@@ -684,20 +685,26 @@ do { \
___p; \ ___p; \
}) })
static void perf_ctx_disable(struct perf_event_context *ctx) static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{ {
struct perf_event_pmu_context *pmu_ctx; struct perf_event_pmu_context *pmu_ctx;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
perf_pmu_disable(pmu_ctx->pmu); perf_pmu_disable(pmu_ctx->pmu);
}
} }
static void perf_ctx_enable(struct perf_event_context *ctx) static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{ {
struct perf_event_pmu_context *pmu_ctx; struct perf_event_pmu_context *pmu_ctx;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
perf_pmu_enable(pmu_ctx->pmu); perf_pmu_enable(pmu_ctx->pmu);
}
} }
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
...@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task) ...@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
return; return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx); perf_ctx_disable(&cpuctx->ctx, true);
ctx_sched_out(&cpuctx->ctx, EVENT_ALL); ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
/* /*
* must not be done before ctxswout due * must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in * to update_cgrp_time_from_cpuctx() in
...@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task) ...@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in() * perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around * to not have to pass task around
*/ */
ctx_sched_in(&cpuctx->ctx, EVENT_ALL); ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx); perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx); perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
} }
...@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct ...@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
if (!is_cgroup_event(event)) if (!is_cgroup_event(event))
return; return;
event->pmu_ctx->nr_cgroups++;
/* /*
* Because cgroup events are always per-cpu events, * Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx. * @ctx == &cpuctx->ctx.
...@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c ...@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (!is_cgroup_event(event)) if (!is_cgroup_event(event))
return; return;
event->pmu_ctx->nr_cgroups--;
/* /*
* Because cgroup events are always per-cpu events, * Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx. * @ctx == &cpuctx->ctx.
...@@ -2677,9 +2688,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, ...@@ -2677,9 +2688,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL; event_type &= EVENT_ALL;
perf_ctx_disable(&cpuctx->ctx); perf_ctx_disable(&cpuctx->ctx, false);
if (task_ctx) { if (task_ctx) {
perf_ctx_disable(task_ctx); perf_ctx_disable(task_ctx, false);
task_ctx_sched_out(task_ctx, event_type); task_ctx_sched_out(task_ctx, event_type);
} }
...@@ -2697,9 +2708,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, ...@@ -2697,9 +2708,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
perf_event_sched_in(cpuctx, task_ctx); perf_event_sched_in(cpuctx, task_ctx);
perf_ctx_enable(&cpuctx->ctx); perf_ctx_enable(&cpuctx->ctx, false);
if (task_ctx) if (task_ctx)
perf_ctx_enable(task_ctx); perf_ctx_enable(task_ctx, false);
} }
void perf_pmu_resched(struct pmu *pmu) void perf_pmu_resched(struct pmu *pmu)
...@@ -3244,6 +3255,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) ...@@ -3244,6 +3255,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx; struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active; int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock); lockdep_assert_held(&ctx->lock);
...@@ -3290,8 +3304,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) ...@@ -3290,8 +3304,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
is_active ^= ctx->is_active; /* changed bits */ is_active ^= ctx->is_active; /* changed bits */
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
__pmu_ctx_sched_out(pmu_ctx, is_active); __pmu_ctx_sched_out(pmu_ctx, is_active);
}
} }
/* /*
...@@ -3482,7 +3499,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) ...@@ -3482,7 +3499,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) { if (context_equiv(ctx, next_ctx)) {
perf_ctx_disable(ctx); perf_ctx_disable(ctx, false);
/* PMIs are disabled; ctx->nr_pending is stable. */ /* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) || if (local_read(&ctx->nr_pending) ||
...@@ -3502,7 +3519,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) ...@@ -3502,7 +3519,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_sched_task_cb(ctx, false); perf_ctx_sched_task_cb(ctx, false);
perf_event_swap_task_ctx_data(ctx, next_ctx); perf_event_swap_task_ctx_data(ctx, next_ctx);
perf_ctx_enable(ctx); perf_ctx_enable(ctx, false);
/* /*
* RCU_INIT_POINTER here is safe because we've not * RCU_INIT_POINTER here is safe because we've not
...@@ -3526,13 +3543,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) ...@@ -3526,13 +3543,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
if (do_switch) { if (do_switch) {
raw_spin_lock(&ctx->lock); raw_spin_lock(&ctx->lock);
perf_ctx_disable(ctx); perf_ctx_disable(ctx, false);
inside_switch: inside_switch:
perf_ctx_sched_task_cb(ctx, false); perf_ctx_sched_task_cb(ctx, false);
task_ctx_sched_out(ctx, EVENT_ALL); task_ctx_sched_out(ctx, EVENT_ALL);
perf_ctx_enable(ctx); perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock); raw_spin_unlock(&ctx->lock);
} }
} }
...@@ -3818,47 +3835,32 @@ static int merge_sched_in(struct perf_event *event, void *data) ...@@ -3818,47 +3835,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0; return 0;
} }
static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu) static void pmu_groups_sched_in(struct perf_event_context *ctx,
struct perf_event_groups *groups,
struct pmu *pmu)
{ {
struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1; int can_add_hw = 1;
visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
if (pmu) { merge_sched_in, &can_add_hw);
visit_groups_merge(ctx, &ctx->pinned_groups,
smp_processor_id(), pmu,
merge_sched_in, &can_add_hw);
} else {
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
can_add_hw = 1;
visit_groups_merge(ctx, &ctx->pinned_groups,
smp_processor_id(), pmu_ctx->pmu,
merge_sched_in, &can_add_hw);
}
}
} }
static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu) static void ctx_groups_sched_in(struct perf_event_context *ctx,
struct perf_event_groups *groups,
bool cgroup)
{ {
struct perf_event_pmu_context *pmu_ctx; struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
if (pmu) { list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
visit_groups_merge(ctx, &ctx->flexible_groups, if (cgroup && !pmu_ctx->nr_cgroups)
smp_processor_id(), pmu, continue;
merge_sched_in, &can_add_hw); pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
} else {
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
can_add_hw = 1;
visit_groups_merge(ctx, &ctx->flexible_groups,
smp_processor_id(), pmu_ctx->pmu,
merge_sched_in, &can_add_hw);
}
} }
} }
static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
struct pmu *pmu)
{ {
ctx_flexible_sched_in(ctx, pmu); pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
} }
static void static void
...@@ -3866,6 +3868,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) ...@@ -3866,6 +3868,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{ {
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active; int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock); lockdep_assert_held(&ctx->lock);
...@@ -3898,11 +3903,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) ...@@ -3898,11 +3903,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* in order to give them the best chance of going on. * in order to give them the best chance of going on.
*/ */
if (is_active & EVENT_PINNED) if (is_active & EVENT_PINNED)
ctx_pinned_sched_in(ctx, NULL); ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
/* Then walk through the lower prio flexible groups */ /* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE) if (is_active & EVENT_FLEXIBLE)
ctx_flexible_sched_in(ctx, NULL); ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
} }
static void perf_event_context_sched_in(struct task_struct *task) static void perf_event_context_sched_in(struct task_struct *task)
...@@ -3917,11 +3922,11 @@ static void perf_event_context_sched_in(struct task_struct *task) ...@@ -3917,11 +3922,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (cpuctx->task_ctx == ctx) { if (cpuctx->task_ctx == ctx) {
perf_ctx_lock(cpuctx, ctx); perf_ctx_lock(cpuctx, ctx);
perf_ctx_disable(ctx); perf_ctx_disable(ctx, false);
perf_ctx_sched_task_cb(ctx, true); perf_ctx_sched_task_cb(ctx, true);
perf_ctx_enable(ctx); perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx); perf_ctx_unlock(cpuctx, ctx);
goto rcu_unlock; goto rcu_unlock;
} }
...@@ -3934,7 +3939,7 @@ static void perf_event_context_sched_in(struct task_struct *task) ...@@ -3934,7 +3939,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (!ctx->nr_events) if (!ctx->nr_events)
goto unlock; goto unlock;
perf_ctx_disable(ctx); perf_ctx_disable(ctx, false);
/* /*
* We want to keep the following priority order: * We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned, * cpu pinned (that don't need to move), task pinned,
...@@ -3944,7 +3949,7 @@ static void perf_event_context_sched_in(struct task_struct *task) ...@@ -3944,7 +3949,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
* events, no need to flip the cpuctx's events around. * events, no need to flip the cpuctx's events around.
*/ */
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx); perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
} }
...@@ -3953,9 +3958,9 @@ static void perf_event_context_sched_in(struct task_struct *task) ...@@ -3953,9 +3958,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_sched_task_cb(cpuctx->task_ctx, true); perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
perf_ctx_enable(&cpuctx->ctx); perf_ctx_enable(&cpuctx->ctx, false);
perf_ctx_enable(ctx); perf_ctx_enable(ctx, false);
unlock: unlock:
perf_ctx_unlock(cpuctx, ctx); perf_ctx_unlock(cpuctx, ctx);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment