Commit 76e1d904 authored by Frederic Weisbecker

perf: Store active software events in a hashlist

Each time a software event triggers, we need to walk through
the entire list of events from the current cpu and task contexts
to retrieve a running perf event that matches.
We also need to check that a matching perf event is actually counting.

This walk is wasteful and makes the event fast path scale
poorly as the number of events running on the same
contexts grows.

To solve this, we store the running perf events in a hashlist so
that we get immediate access to them, keyed by their type:event_id,
when they trigger.

v2: - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic
      maths along the way)
    - Only allocate hlist for online cpus, but keep track of the
      refcount on offline possible cpus too, so that we allocate it
      if needed when it becomes online.
    - Drop the kref use as it's not adapted to our tricks anymore.

v3: - Fix bad refcount check (address instead of value). Thanks to
      Eric Dumazet who spotted this.
    - While exiting cpu, move the hlist release out of the IPI path
      to lock the hlist mutex sanely.
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
parent c0555642
...@@ -589,6 +589,14 @@ enum perf_group_flag { ...@@ -589,6 +589,14 @@ enum perf_group_flag {
PERF_GROUP_SOFTWARE = 0x1, PERF_GROUP_SOFTWARE = 0x1,
}; };
/* Number of hash bits used to index the software-event hash list. */
#define SWEVENT_HLIST_BITS 8
/* Number of buckets in the hash list: 2^SWEVENT_HLIST_BITS. */
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
/*
 * Hash table of software events.
 *
 * @heads:    the hash buckets; a bucket is selected with the value
 *            returned by swevent_hash() for a (type, event_id) pair.
 * @rcu_head: callback handle passed to call_rcu() so that the table is
 *            freed by swevent_hlist_release_rcu() rather than directly.
 */
struct swevent_hlist {
struct hlist_head heads[SWEVENT_HLIST_SIZE];
struct rcu_head rcu_head;
};
/** /**
* struct perf_event - performance event kernel representation: * struct perf_event - performance event kernel representation:
*/ */
...@@ -597,6 +605,7 @@ struct perf_event { ...@@ -597,6 +605,7 @@ struct perf_event {
struct list_head group_entry; struct list_head group_entry;
struct list_head event_entry; struct list_head event_entry;
struct list_head sibling_list; struct list_head sibling_list;
struct hlist_node hlist_entry;
int nr_siblings; int nr_siblings;
int group_flags; int group_flags;
struct perf_event *group_leader; struct perf_event *group_leader;
...@@ -744,6 +753,9 @@ struct perf_cpu_context { ...@@ -744,6 +753,9 @@ struct perf_cpu_context {
int active_oncpu; int active_oncpu;
int max_pertask; int max_pertask;
int exclusive; int exclusive;
struct swevent_hlist *swevent_hlist;
struct mutex hlist_mutex;
int hlist_refcount;
/* /*
* Recursion avoidance: * Recursion avoidance:
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <linux/file.h> #include <linux/file.h>
#include <linux/poll.h> #include <linux/poll.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/hash.h>
#include <linux/sysfs.h> #include <linux/sysfs.h>
#include <linux/dcache.h> #include <linux/dcache.h>
#include <linux/percpu.h> #include <linux/percpu.h>
...@@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, ...@@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
perf_swevent_overflow(event, 0, nmi, data, regs); perf_swevent_overflow(event, 0, nmi, data, regs);
} }
/*
 * Decide whether a software event must account the occurrences that
 * hit it right now.
 *
 * Returns 1 if the event is ACTIVE, or if it is INACTIVE while its
 * context is inactive too. Returns 0 in every other situation.
 */
static int perf_swevent_is_counting(struct perf_event *event)
{
	switch (event->state) {
	case PERF_EVENT_STATE_ACTIVE:
		/* Running event: always counting. */
		return 1;

	case PERF_EVENT_STATE_INACTIVE:
		/*
		 * Inactive event in an active context: it belongs to a
		 * group that could not be scheduled on the 'pmu', so it
		 * is not counting.
		 *
		 * Inactive event in an inactive context: the task is
		 * scheduled out and we still count what happens to it,
		 * e.g. migration events.
		 */
		return event->ctx->is_active ? 0 : 1;

	default:
		/* Off or error state: not counting. */
		return 0;
	}
}
static int perf_tp_event_match(struct perf_event *event, static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data); struct perf_sample_data *data);
...@@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event, ...@@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event,
struct perf_sample_data *data, struct perf_sample_data *data,
struct pt_regs *regs) struct pt_regs *regs)
{ {
if (event->cpu != -1 && event->cpu != smp_processor_id())
return 0;
if (!perf_swevent_is_counting(event))
return 0;
if (event->attr.type != type) if (event->attr.type != type)
return 0; return 0;
...@@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event, ...@@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event,
return 1; return 1;
} }
static void perf_swevent_ctx_event(struct perf_event_context *ctx, static inline u64 swevent_hash(u64 type, u32 event_id)
enum perf_type_id type, {
u32 event_id, u64 nr, int nmi, u64 val = event_id | (type << 32);
struct perf_sample_data *data,
struct pt_regs *regs) return hash_64(val, SWEVENT_HLIST_BITS);
}
/*
 * Return the hash bucket that holds the software events matching
 * (type, event_id) in the given cpu context, or NULL when the context
 * has no hash list installed.
 *
 * The table pointer is fetched with rcu_dereference().
 * NOTE(review): callers are presumably inside an RCU read-side section
 * or otherwise keep the table alive -- confirm at each call site.
 */
static struct hlist_head *
find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
{
	struct swevent_hlist *table = rcu_dereference(ctx->swevent_hlist);

	if (!table)
		return NULL;

	return &table->heads[swevent_hash(type, event_id)];
}
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
u64 nr, int nmi,
struct perf_sample_data *data,
struct pt_regs *regs)
{ {
struct perf_cpu_context *cpuctx;
struct perf_event *event; struct perf_event *event;
struct hlist_node *node;
struct hlist_head *head;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { cpuctx = &__get_cpu_var(perf_cpu_context);
rcu_read_lock();
head = find_swevent_head(cpuctx, type, event_id);
if (!head)
goto end;
hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs)) if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_add(event, nr, nmi, data, regs); perf_swevent_add(event, nr, nmi, data, regs);
} }
end:
rcu_read_unlock();
} }
int perf_swevent_get_recursion_context(void) int perf_swevent_get_recursion_context(void)
...@@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx) ...@@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx)
} }
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
/*
 * Dispatch one software event occurrence to the matching events of the
 * current cpu context, then to those of the current task context when
 * the task has one. Both walks happen inside a single RCU read-side
 * critical section.
 */
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
			     u64 nr, int nmi,
			     struct perf_sample_data *data,
			     struct pt_regs *regs)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *task_ctx;

	rcu_read_lock();

	/* Per-cpu events first. */
	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
			       nr, nmi, data, regs);

	/*
	 * Then the task events; it doesn't really matter which of the
	 * child contexts the events end up in.
	 */
	task_ctx = rcu_dereference(current->perf_event_ctxp);
	if (task_ctx)
		perf_swevent_ctx_event(task_ctx, type, event_id,
				       nr, nmi, data, regs);

	rcu_read_unlock();
}
void __perf_sw_event(u32 event_id, u64 nr, int nmi, void __perf_sw_event(u32 event_id, u64 nr, int nmi,
struct pt_regs *regs, u64 addr) struct pt_regs *regs, u64 addr)
...@@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event) ...@@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event)
static int perf_swevent_enable(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event)
{ {
struct hw_perf_event *hwc = &event->hw; struct hw_perf_event *hwc = &event->hw;
struct perf_cpu_context *cpuctx;
struct hlist_head *head;
cpuctx = &__get_cpu_var(perf_cpu_context);
if (hwc->sample_period) { if (hwc->sample_period) {
hwc->last_period = hwc->sample_period; hwc->last_period = hwc->sample_period;
perf_swevent_set_period(event); perf_swevent_set_period(event);
} }
head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
if (WARN_ON_ONCE(!head))
return -EINVAL;
hlist_add_head_rcu(&event->hlist_entry, head);
return 0; return 0;
} }
static void perf_swevent_disable(struct perf_event *event) static void perf_swevent_disable(struct perf_event *event)
{ {
hlist_del_rcu(&event->hlist_entry);
} }
static const struct pmu perf_ops_generic = { static const struct pmu perf_ops_generic = {
...@@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event, ...@@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event,
return 0; return 0;
} }
/*
 * RCU callback: free the swevent_hlist that embeds @rcu_head.
 * Queued through call_rcu() by swevent_hlist_release().
 */
static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
{
	kfree(container_of(rcu_head, struct swevent_hlist, rcu_head));
}
/*
 * Detach the hash list from @cpuctx, if there is one, and hand it to
 * call_rcu() so it is freed by swevent_hlist_release_rcu().
 *
 * The callers visible in this file take cpuctx->hlist_mutex around
 * this call.
 */
static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
{
	struct swevent_hlist *old = cpuctx->swevent_hlist;

	if (old) {
		/* Unpublish first so new lookups see NULL, then defer the free. */
		rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
		call_rcu(&old->rcu_head, swevent_hlist_release_rcu);
	}
}
/*
 * Drop one reference on the hash list of @cpu. When the count reaches
 * zero the list is released. @event is not used here.
 */
static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
{
	struct perf_cpu_context *pcpu = &per_cpu(perf_cpu_context, cpu);

	mutex_lock(&pcpu->hlist_mutex);

	pcpu->hlist_refcount--;
	if (pcpu->hlist_refcount == 0)
		swevent_hlist_release(pcpu);

	mutex_unlock(&pcpu->hlist_mutex);
}
/*
 * Drop the hash list references held by @event: on its own cpu for a
 * cpu-bound event, on every possible cpu when event->cpu is -1.
 */
static void swevent_hlist_put(struct perf_event *event)
{
	int cpu;

	if (event->cpu == -1) {
		for_each_possible_cpu(cpu)
			swevent_hlist_put_cpu(event, cpu);
	} else {
		swevent_hlist_put_cpu(event, event->cpu);
	}
}
/*
 * Take one reference on the hash list of @cpu.
 *
 * If the cpu is online and has no hash list yet, one is allocated and
 * published. For an offline cpu only the refcount is bumped.
 *
 * Returns 0 on success, or -ENOMEM if the allocation fails, in which
 * case the refcount is left untouched. @event is not used here.
 */
static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
{
	struct perf_cpu_context *pcpu = &per_cpu(perf_cpu_context, cpu);

	mutex_lock(&pcpu->hlist_mutex);

	if (!pcpu->swevent_hlist && cpu_online(cpu)) {
		struct swevent_hlist *fresh;

		fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
		if (!fresh) {
			mutex_unlock(&pcpu->hlist_mutex);
			return -ENOMEM;
		}
		rcu_assign_pointer(pcpu->swevent_hlist, fresh);
	}
	pcpu->hlist_refcount++;

	mutex_unlock(&pcpu->hlist_mutex);
	return 0;
}
/*
 * Take the hash list references needed by @event: on its own cpu for a
 * cpu-bound event, on every possible cpu when event->cpu is -1.
 *
 * In the all-cpus case the walk runs between get_online_cpus() and
 * put_online_cpus(). If one cpu fails, the references already taken on
 * the cpus visited before it are dropped again.
 *
 * Returns 0 on success or the error from swevent_hlist_get_cpu().
 */
static int swevent_hlist_get(struct perf_event *event)
{
	int ret = 0;
	int cpu, undo_cpu;

	if (event->cpu != -1)
		return swevent_hlist_get_cpu(event, event->cpu);

	get_online_cpus();

	for_each_possible_cpu(cpu) {
		ret = swevent_hlist_get_cpu(event, cpu);
		if (ret)
			break;	/* 'cpu' now names the cpu that failed */
	}

	if (ret) {
		/* Roll back every cpu that succeeded before the failing one. */
		for_each_possible_cpu(undo_cpu) {
			if (undo_cpu == cpu)
				break;
			swevent_hlist_put_cpu(event, undo_cpu);
		}
	}

	put_online_cpus();
	return ret;
}
static void tp_perf_event_destroy(struct perf_event *event) static void tp_perf_event_destroy(struct perf_event *event)
{ {
perf_trace_disable(event->attr.config); perf_trace_disable(event->attr.config);
swevent_hlist_put(event);
} }
static const struct pmu *tp_perf_event_init(struct perf_event *event) static const struct pmu *tp_perf_event_init(struct perf_event *event)
{ {
int err;
/* /*
* Raw tracepoint data is a severe data leak, only allow root to * Raw tracepoint data is a severe data leak, only allow root to
* have these. * have these.
...@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) ...@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
return NULL; return NULL;
event->destroy = tp_perf_event_destroy; event->destroy = tp_perf_event_destroy;
err = swevent_hlist_get(event);
if (err) {
perf_trace_disable(event->attr.config);
return ERR_PTR(err);
}
return &perf_ops_generic; return &perf_ops_generic;
} }
...@@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event) ...@@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent); WARN_ON(event->parent);
atomic_dec(&perf_swevent_enabled[event_id]); atomic_dec(&perf_swevent_enabled[event_id]);
swevent_hlist_put(event);
} }
static const struct pmu *sw_perf_event_init(struct perf_event *event) static const struct pmu *sw_perf_event_init(struct perf_event *event)
...@@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) ...@@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_ALIGNMENT_FAULTS:
case PERF_COUNT_SW_EMULATION_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS:
if (!event->parent) { if (!event->parent) {
int err;
err = swevent_hlist_get(event);
if (err)
return ERR_PTR(err);
atomic_inc(&perf_swevent_enabled[event_id]); atomic_inc(&perf_swevent_enabled[event_id]);
event->destroy = sw_perf_event_destroy; event->destroy = sw_perf_event_destroy;
} }
...@@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void) ...@@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
cpuctx = &per_cpu(perf_cpu_context, cpu); cpuctx = &per_cpu(perf_cpu_context, cpu);
mutex_init(&cpuctx->hlist_mutex);
__perf_event_init_context(&cpuctx->ctx, NULL); __perf_event_init_context(&cpuctx->ctx, NULL);
} }
} }
...@@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu) ...@@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
spin_lock(&perf_resource_lock); spin_lock(&perf_resource_lock);
cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
spin_unlock(&perf_resource_lock); spin_unlock(&perf_resource_lock);
mutex_lock(&cpuctx->hlist_mutex);
if (cpuctx->hlist_refcount > 0) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
WARN_ON_ONCE(!hlist);
rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
}
mutex_unlock(&cpuctx->hlist_mutex);
} }
#ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU
...@@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu) ...@@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu)
struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
struct perf_event_context *ctx = &cpuctx->ctx; struct perf_event_context *ctx = &cpuctx->ctx;
mutex_lock(&cpuctx->hlist_mutex);
swevent_hlist_release(cpuctx);
mutex_unlock(&cpuctx->hlist_mutex);
mutex_lock(&ctx->mutex); mutex_lock(&ctx->mutex);
smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
mutex_unlock(&ctx->mutex); mutex_unlock(&ctx->mutex);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment