Commit bd275681 authored by Peter Zijlstra's avatar Peter Zijlstra

perf: Rewrite core context handling

There have been various issues and limitations with the way perf uses
(task) contexts to track events. Most notable is the single hardware
PMU task context, which has resulted in a number of yucky things (both
proposed and merged).

Notably:
 - HW breakpoint PMU
 - ARM big.little PMU / Intel ADL PMU
 - Intel Branch Monitoring PMU
 - AMD IBS PMU
 - S390 cpum_cf PMU
 - PowerPC trace_imc PMU

*Current design:*

Currently we have a per task and per cpu perf_event_contexts:

  task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
       ^                                 |    ^     |           ^
       `---------------------------------'    |     `--> pmu ---'
                                              v           ^
                                         perf_event ------'

Each task has an array of pointers to a perf_event_context. Each
perf_event_context has a direct relation to a PMU and a group of
events for that PMU. The task related perf_event_context's have a
pointer back to that task.

Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which
includes a perf_event_context, which again has a direct relation to
that PMU, and a group of events for that PMU.

The perf_cpu_context also tracks which task context is currently
associated with that CPU and includes a few other things like the
hrtimer for rotation etc.

Each perf_event is then associated with its PMU and one
perf_event_context.

*Proposed design:*

New design proposed by this patch reduce to a single task context and
a single CPU context but adds some intermediate data-structures:

  task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
       ^                           |   ^ ^
       `---------------------------'   | |
                                       | |    perf_cpu_pmu_context <--.
                                       | `----.    ^                  |
                                       |      |    |                  |
                                       |      v    v                  |
                                       | ,--> perf_event_pmu_context  |
                                       | |                            |
                                       | |                            |
                                       v v                            |
                                  perf_event ---> pmu ----------------'

With the new design, perf_event_context will hold all events for all
pmus in the (respective pinned/flexible) rbtrees. This can be achieved
by adding pmu to rbtree key:

  {cpu, pmu, cgroup, group_index}

Each perf_event_context carries a list of perf_event_pmu_context which
is used to hold per-pmu-per-context state. For example, it keeps track
of currently active events for that pmu, a pmu specific task_ctx_data,
a flag to tell whether rotation is required or not etc.

Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu
state like hrtimer details to drive the event rotation, a pointer to
perf_event_pmu_context of currently running task and some other
ancillary information.

Each perf_event is associated to it's pmu, perf_event_context and
perf_event_pmu_context.

Further optimizations to current implementation are possible. For
example, ctx_resched() can be optimized to reschedule only single pmu
events.

Much thanks to Ravi for picking this up and pushing it towards
completion.
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: default avatarRavi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: default avatarRavi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com
parent 247f34f7
......@@ -806,10 +806,14 @@ static void armv8pmu_disable_event(struct perf_event *event)
static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
struct perf_event_context *task_ctx =
this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
struct perf_event_context *ctx;
int nr_user = 0;
if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
ctx = perf_cpu_task_ctx();
if (ctx)
nr_user = ctx->nr_user;
if (sysctl_perf_user_access && nr_user)
armv8pmu_enable_user_access(cpu_pmu);
else
armv8pmu_disable_user_access();
......@@ -1019,10 +1023,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
return 0;
}
static int armv8pmu_filter_match(struct perf_event *event)
static bool armv8pmu_filter(struct pmu *pmu, int cpu)
{
unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;
return evtype != ARMV8_PMUV3_PERFCTR_CHAIN;
struct arm_pmu *armpmu = to_arm_pmu(pmu);
return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus);
}
static void armv8pmu_reset(void *info)
......@@ -1253,7 +1257,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
cpu_pmu->stop = armv8pmu_stop;
cpu_pmu->reset = armv8pmu_reset;
cpu_pmu->set_event_filter = armv8pmu_set_event_filter;
cpu_pmu->filter_match = armv8pmu_filter_match;
cpu_pmu->filter = armv8pmu_filter;
cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx;
......
......@@ -132,7 +132,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {}
static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
static void pmao_restore_workaround(bool ebb) { }
#endif /* CONFIG_PPC32 */
......@@ -424,7 +424,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
cpuhw->bhrb_context = event->ctx;
}
cpuhw->bhrb_users++;
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
}
static void power_pmu_bhrb_disable(struct perf_event *event)
......@@ -436,7 +436,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
WARN_ON_ONCE(!cpuhw->bhrb_users);
cpuhw->bhrb_users--;
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
if (!cpuhw->disabled && !cpuhw->bhrb_users) {
/* BHRB cannot be turned off when other
......@@ -451,7 +451,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
/* Called from ctxsw to prevent one process's branch entries to
* mingle with the other process's entries during context switch.
*/
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
if (!ppmu->bhrb_nr)
return;
......
......@@ -379,7 +379,7 @@ static int paicrypt_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event CRYPTO_ALL is allowed.
*/
static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in)
static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, clear values.
......
......@@ -471,7 +471,7 @@ static int paiext_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event NNPA_ALL is allowed.
*/
static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in)
static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, clear values.
......
......@@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void)
* On ctxswin, sched_in = true, called after the PMU has started
* On ctxswout, sched_in = false, called before the PMU is stopped
*/
void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
......
......@@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = reg->reg;
}
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
amd_pmu_lbr_reset();
......@@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
}
void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
......
......@@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
/*
* This one is magic, it will get called even when PMU init fails (because
* there is no PMU), in which case it should simply return NULL.
......@@ -2031,6 +2033,7 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
static_call_update(x86_pmu_filter, x86_pmu.filter);
}
static void _x86_pmu_read(struct perf_event *event)
......@@ -2052,23 +2055,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
pr_info("... event mask: %016Lx\n", intel_ctrl);
}
/*
* The generic code is not hybrid friendly. The hybrid_pmu->pmu
* of the first registered PMU is unconditionally assigned to
* each possible cpuctx->ctx.pmu.
* Update the correct hybrid PMU to the cpuctx->ctx.pmu.
*/
void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
{
struct perf_cpu_context *cpuctx;
if (!pmu->pmu_cpu_context)
return;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->ctx.pmu = pmu;
}
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
......@@ -2195,9 +2181,6 @@ static int __init init_hw_perf_events(void)
(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
if (err)
break;
if (cpu_type == hybrid_pmu->cpu_type)
x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
}
if (i < x86_pmu.num_hybrid_pmus) {
......@@ -2646,15 +2629,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
}
static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
}
void perf_check_microcode(void)
......@@ -2689,12 +2672,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event)
return 0;
}
static int x86_pmu_filter_match(struct perf_event *event)
static bool x86_pmu_filter(struct pmu *pmu, int cpu)
{
if (x86_pmu.filter_match)
return x86_pmu.filter_match(event);
bool ret = false;
return 1;
static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
return ret;
}
static struct pmu pmu = {
......@@ -2725,7 +2709,7 @@ static struct pmu pmu = {
.aux_output_match = x86_pmu_aux_output_match,
.filter_match = x86_pmu_filter_match,
.filter = x86_pmu_filter,
};
void arch_perf_update_userpage(struct perf_event *event,
......
......@@ -4536,8 +4536,6 @@ static bool init_hybrid_pmu(int cpu)
cpumask_set_cpu(cpu, &pmu->supported_cpus);
cpuc->pmu = &pmu->pmu;
x86_pmu_update_cpu_context(&pmu->pmu, cpu);
return true;
}
......@@ -4671,17 +4669,17 @@ static void intel_pmu_cpu_dead(int cpu)
cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
bool sched_in)
{
intel_pmu_pebs_sched_task(ctx, sched_in);
intel_pmu_lbr_sched_task(ctx, sched_in);
intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
}
static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
intel_pmu_lbr_swap_task_ctx(prev, next);
intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
}
static int intel_pmu_check_period(struct perf_event *event, u64 value)
......@@ -4705,12 +4703,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event)
return is_intel_pt_event(event);
}
static int intel_pmu_filter_match(struct perf_event *event)
static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
{
struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
unsigned int cpu = smp_processor_id();
struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);
return cpumask_test_cpu(cpu, &pmu->supported_cpus);
*ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
......@@ -6412,7 +6409,7 @@ __init int intel_pmu_init(void)
static_call_update(intel_pmu_set_topdown_event_period,
&adl_set_topdown_event_period);
x86_pmu.filter_match = intel_pmu_filter_match;
x86_pmu.filter = intel_pmu_filter;
x86_pmu.get_event_constraints = adl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
x86_pmu.limit_period = spr_limit_period;
......
......@@ -1059,7 +1059,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
}
void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
......@@ -1167,7 +1167,7 @@ static void
pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct perf_event *event, bool add)
{
struct pmu *pmu = event->ctx->pmu;
struct pmu *pmu = event->pmu;
/*
* Make sure we get updated with the first PEBS
* event. It will trigger also during removal, but
......
......@@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx)
cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
}
void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
void *prev_ctx_data, *next_ctx_data;
swap(prev->task_ctx_data, next->task_ctx_data);
swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
/*
* Architecture specific synchronization makes sense in
* case both prev->task_ctx_data and next->task_ctx_data
* Architecture specific synchronization makes sense in case
* both prev_epc->task_ctx_data and next_epc->task_ctx_data
* pointers are allocated.
*/
prev_ctx_data = next->task_ctx_data;
next_ctx_data = prev->task_ctx_data;
prev_ctx_data = next_epc->task_ctx_data;
next_ctx_data = prev_epc->task_ctx_data;
if (!prev_ctx_data || !next_ctx_data)
return;
......@@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
task_context_opt(next_ctx_data)->lbr_callstack_users);
}
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
void *task_ctx;
......@@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
* the task was scheduled out, restore the stack. Otherwise flush
* the LBR stack.
*/
task_ctx = ctx ? ctx->task_ctx_data : NULL;
task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
if (task_ctx) {
if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
......@@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = event->hw.branch_reg.reg;
if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data)
task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++;
if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data)
task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++;
/*
* Request pmu::sched_task() callback, which will fire inside the
......@@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
}
......@@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event)
return;
if (branch_user_callstack(cpuc->br_sel) &&
event->ctx->task_ctx_data)
task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--;
event->pmu_ctx->task_ctx_data)
task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--;
if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
cpuc->lbr_select = 0;
......@@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
}
static inline bool vlbr_exclude_host(void)
......
......@@ -811,7 +811,7 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
void (*sched_task)(struct perf_event_context *ctx,
void (*sched_task)(struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
......@@ -894,12 +894,12 @@ struct x86_pmu {
int num_topdown_events;
/*
* perf task context (i.e. struct perf_event_context::task_ctx_data)
* perf task context (i.e. struct perf_event_pmu_context::task_ctx_data)
* switch helper to bridge calls from perf/core to perf/x86.
* See struct pmu::swap_task_ctx() usage for examples;
*/
void (*swap_task_ctx)(struct perf_event_context *prev,
struct perf_event_context *next);
void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc);
/*
* AMD bits
......@@ -925,7 +925,7 @@ struct x86_pmu {
int (*aux_output_match) (struct perf_event *event);
int (*filter_match)(struct perf_event *event);
void (*filter)(struct pmu *pmu, int cpu, bool *ret);
/*
* Hybrid support
*
......@@ -1180,8 +1180,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs);
void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
u64 intel_ctrl);
void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu);
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
......@@ -1306,7 +1304,7 @@ void amd_pmu_lbr_reset(void);
void amd_pmu_lbr_read(void);
void amd_pmu_lbr_add(struct perf_event *event);
void amd_pmu_lbr_del(struct perf_event *event);
void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
......@@ -1330,7 +1328,7 @@ static inline void amd_pmu_brs_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
cpuc->lbr_users++;
/*
* No need to reset BRS because it is reset
......@@ -1345,10 +1343,10 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
}
void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in);
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
#else
static inline int amd_brs_init(void)
{
......@@ -1373,7 +1371,7 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
{
}
static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
}
......@@ -1533,7 +1531,7 @@ void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void intel_pmu_auto_reload_read(struct perf_event *event);
......@@ -1541,10 +1539,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
void intel_ds_init(void);
void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next);
void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc);
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
u64 lbr_from_signext_quirk_wr(u64 val);
......
......@@ -550,15 +550,14 @@ static void armpmu_disable(struct pmu *pmu)
* microarchitecture, and aren't suitable for another. Thus, only match CPUs of
* the same microarchitecture.
*/
static int armpmu_filter_match(struct perf_event *event)
static bool armpmu_filter(struct pmu *pmu, int cpu)
{
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
unsigned int cpu = smp_processor_id();
int ret;
struct arm_pmu *armpmu = to_arm_pmu(pmu);
bool ret;
ret = cpumask_test_cpu(cpu, &armpmu->supported_cpus);
if (ret && armpmu->filter_match)
return armpmu->filter_match(event);
if (ret && armpmu->filter)
return armpmu->filter(pmu, cpu);
return ret;
}
......@@ -885,14 +884,13 @@ static struct arm_pmu *__armpmu_alloc(gfp_t flags)
.start = armpmu_start,
.stop = armpmu_stop,
.read = armpmu_read,
.filter_match = armpmu_filter_match,
.filter = armpmu_filter,
.attr_groups = pmu->attr_groups,
/*
* This is a CPU PMU potentially in a heterogeneous
* configuration (e.g. big.LITTLE). This is not an uncore PMU,
* and we have taken ctx sharing into account (e.g. with our
* pmu::filter_match callback and pmu::event_init group
* validation).
* pmu::filter callback and pmu::event_init group validation).
*/
.capabilities = PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS,
};
......
......@@ -100,7 +100,7 @@ struct arm_pmu {
void (*stop)(struct arm_pmu *);
void (*reset)(void *);
int (*map_event)(struct perf_event *event);
int (*filter_match)(struct perf_event *event);
bool (*filter)(struct pmu *pmu, int cpu);
int num_events;
bool secure_access; /* 32-bit ARM only */
#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40
......
......@@ -266,6 +266,7 @@ struct hw_perf_event {
};
struct perf_event;
struct perf_event_pmu_context;
/*
* Common implementation detail of pmu::{start,commit,cancel}_txn
......@@ -308,7 +309,7 @@ struct pmu {
int capabilities;
int __percpu *pmu_disable_count;
struct perf_cpu_context __percpu *pmu_cpu_context;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
int task_ctx_nr;
int hrtimer_interval_ms;
......@@ -443,7 +444,7 @@ struct pmu {
/*
* context-switches callback
*/
void (*sched_task) (struct perf_event_context *ctx,
void (*sched_task) (struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
......@@ -457,8 +458,8 @@ struct pmu {
* implementation and Perf core context switch handling callbacks for usage
* examples.
*/
void (*swap_task_ctx) (struct perf_event_context *prev,
struct perf_event_context *next);
void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc);
/* optional */
/*
......@@ -522,9 +523,10 @@ struct pmu {
/* optional */
/*
* Filter events for PMU-specific reasons.
* Skip programming this PMU on the given CPU. Typically needed for
* big.LITTLE things.
*/
int (*filter_match) (struct perf_event *event); /* optional */
bool (*filter) (struct pmu *pmu, int cpu); /* optional */
/*
* Check period value for PERF_EVENT_IOC_PERIOD ioctl.
......@@ -695,6 +697,11 @@ struct perf_event {
int group_caps;
struct perf_event *group_leader;
/*
* event->pmu will always point to pmu in which this event belongs.
* Whereas event->pmu_ctx->pmu may point to other pmu when group of
* different pmu events is created.
*/
struct pmu *pmu;
void *pmu_private;
......@@ -720,6 +727,12 @@ struct perf_event {
struct hw_perf_event hw;
struct perf_event_context *ctx;
/*
* event->pmu_ctx points to perf_event_pmu_context in which the event
* is added. This pmu_ctx can be of other pmu for sw event when that
* sw event is part of a group which also contains non-sw events.
*/
struct perf_event_pmu_context *pmu_ctx;
atomic_long_t refcount;
/*
......@@ -812,19 +825,69 @@ struct perf_event {
#endif /* CONFIG_PERF_EVENTS */
};
/*
* ,-----------------------[1:n]----------------------.
* V V
* perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
* ^ ^ | |
* `--------[1:n]---------' `-[n:1]-> pmu <-[1:n]-'
*
*
* struct perf_event_pmu_context lifetime is refcount based and RCU freed
* (similar to perf_event_context). Locking is as if it were a member of
* perf_event_context; specifically:
*
* modification, both: ctx->mutex && ctx->lock
* reading, either: ctx->mutex || ctx->lock
*
* There is one exception to this; namely put_pmu_ctx() isn't always called
* with ctx->mutex held; this means that as long as we can guarantee the epc
* has events the above rules hold.
*
* Specificially, sys_perf_event_open()'s group_leader case depends on
* ctx->mutex pinning the configuration. Since we hold a reference on
* group_leader (through the filedesc) it can't go away, therefore it's
* associated pmu_ctx must exist and cannot change due to ctx->mutex.
*/
struct perf_event_pmu_context {
struct pmu *pmu;
struct perf_event_context *ctx;
struct list_head pmu_ctx_entry;
struct list_head pinned_active;
struct list_head flexible_active;
/* Used to avoid freeing per-cpu perf_event_pmu_context */
unsigned int embedded : 1;
unsigned int nr_events;
atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head;
void *task_ctx_data; /* pmu specific data */
/*
* Set when one or more (plausibly active) event can't be scheduled
* due to pmu overcommit or pmu constraints, except tolerant to
* events not necessary to be active due to scheduling constraints,
* such as cgroups.
*/
int rotate_necessary;
};
struct perf_event_groups {
struct rb_root tree;
u64 index;
};
/**
* struct perf_event_context - event context structure
*
* Used as a container for task events and CPU events as well:
*/
struct perf_event_context {
struct pmu *pmu;
/*
* Protect the states of the events in the list,
* nr_active, and the list:
......@@ -837,27 +900,21 @@ struct perf_event_context {
*/
struct mutex mutex;
struct list_head active_ctx_list;
struct list_head pmu_ctx_list;
struct perf_event_groups pinned_groups;
struct perf_event_groups flexible_groups;
struct list_head event_list;
struct list_head pinned_active;
struct list_head flexible_active;
int nr_events;
int nr_active;
int nr_user;
int is_active;
int nr_task_data;
int nr_stat;
int nr_freq;
int rotate_disable;
/*
* Set when nr_events != nr_active, except tolerant to events not
* necessary to be active due to scheduling constraints, such as cgroups.
*/
int rotate_necessary;
refcount_t refcount;
refcount_t refcount; /* event <-> ctx */
struct task_struct *task;
/*
......@@ -878,7 +935,6 @@ struct perf_event_context {
#ifdef CONFIG_CGROUP_PERF
int nr_cgroups; /* cgroup evts */
#endif
void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
/*
......@@ -896,12 +952,13 @@ struct perf_event_context {
*/
#define PERF_NR_CONTEXTS 4
/**
* struct perf_cpu_context - per cpu event context structure
*/
struct perf_cpu_context {
struct perf_event_context ctx;
struct perf_event_context *task_ctx;
struct perf_cpu_pmu_context {
struct perf_event_pmu_context epc;
struct perf_event_pmu_context *task_epc;
struct list_head sched_cb_entry;
int sched_cb_usage;
int active_oncpu;
int exclusive;
......@@ -909,16 +966,20 @@ struct perf_cpu_context {
struct hrtimer hrtimer;
ktime_t hrtimer_interval;
unsigned int hrtimer_active;
};
/**
* struct perf_event_cpu_context - per cpu event context structure
*/
struct perf_cpu_context {
struct perf_event_context ctx;
struct perf_event_context *task_ctx;
int online;
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
struct list_head cgrp_cpuctx_entry;
#endif
struct list_head sched_cb_entry;
int sched_cb_usage;
int online;
/*
* Per-CPU storage for iterators used in visit_groups_merge. The default
* storage is of size 2 to hold the CPU and any CPU event iterators.
......@@ -982,6 +1043,8 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
#ifdef CONFIG_PERF_EVENTS
extern struct perf_event_context *perf_cpu_task_ctx(void);
extern void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
......@@ -1187,7 +1250,7 @@ static inline int is_software_event(struct perf_event *event)
*/
static inline int in_software_context(struct perf_event *event)
{
return event->ctx->pmu->task_ctx_nr == perf_sw_context;
return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}
static inline int is_exclusive_pmu(struct pmu *pmu)
......
......@@ -1243,7 +1243,7 @@ struct task_struct {
unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment