Commit cfa0bd52 authored by Ingo Molnar

Merge tag 'perf-core-for-mingo-2' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements from Arnaldo Carvalho de Melo:

User visible changes:

  - Support handling complete branch stacks as histograms (Andi Kleen)

Infrastructure changes:

  - Prep work for supporting per-pkg and snapshot counters in 'perf stat' (Jiri Olsa)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parents e460bfdc 09a6a1b0
@@ -159,7 +159,7 @@ OPTIONS
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
--g [type,min[,limit],order[,key]]::
+-g [type,min[,limit],order[,key][,branch]]::
 --call-graph::
         Display call chains using type, min percent threshold, optional print
         limit and order.
@@ -177,6 +177,11 @@ OPTIONS
         - function: compare on functions
         - address: compare on individual code addresses
 
+        branch can be:
+        - branch: include last branch information in callgraph
+          when available. Usually more convenient to use --branch-history
+          for this.
+
         Default: fractal,0.5,callee,function.
 
 --children::
@@ -266,6 +271,11 @@ OPTIONS
         branch stacks and it will automatically switch to the branch view mode,
         unless --no-branch-stack is used.
 
+--branch-history::
+        Add the addresses of sampled taken branches to the callstack.
+        This allows to examine the path the program took to each sample.
+        The data collection must have used -b (or -j) and -g.
+
 --objdump=<path>::
         Path to objdump binary.
......
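A usage illustration (not part of the patch): the new option assumes data recorded with both branch stacks and call graphs enabled, e.g. 'perf record -g -b -- <workload>' where <workload> is only a placeholder, after which 'perf report --branch-history' folds the sampled taken branches into the call stack view. Without -b (or a -j filter) and -g at record time there is no branch/callchain data for the report to combine.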
@@ -226,8 +226,9 @@ static int report__setup_sample_type(struct report *rep)
                         return -EINVAL;
                 }
                 if (symbol_conf.use_callchain) {
-                        ui__error("Selected -g but no callchain data. Did "
-                                  "you call 'perf record' without -g?\n");
+                        ui__error("Selected -g or --branch-history but no "
+                                  "callchain data. Did\n"
+                                  "you call 'perf record' without -g?\n");
                         return -1;
                 }
         } else if (!rep->dont_use_callchains &&
@@ -575,6 +576,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
         struct stat st;
         bool has_br_stack = false;
         int branch_mode = -1;
+        bool branch_call_mode = false;
         char callchain_default_opt[] = "fractal,0.5,callee";
         const char * const report_usage[] = {
                 "perf report [<options>]",
@@ -637,8 +639,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                    "regex filter to identify parent, see: '--sort parent'"),
         OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
                     "Only display entries with parent-match"),
-        OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
-                     "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
+        OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order[,branch]",
+                     "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address), add branches. "
                      "Default: fractal,0.5,callee,function", &report_parse_callchain_opt, callchain_default_opt),
         OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain,
                     "Accumulate callchains of children and show total overhead as well"),
@@ -684,7 +686,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
         OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
                     "Show event group information together"),
         OPT_CALLBACK_NOOPT('b', "branch-stack", &branch_mode, "",
-                    "use branch records for histogram filling", parse_branch_mode),
+                    "use branch records for per branch histogram filling",
+                    parse_branch_mode),
+        OPT_BOOLEAN(0, "branch-history", &branch_call_mode,
+                    "add last branch records to call history"),
         OPT_STRING(0, "objdump", &objdump_path, "path",
                    "objdump binary to use for disassembly and annotations"),
         OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
@@ -745,10 +750,24 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
         has_br_stack = perf_header__has_feat(&session->header,
                                              HEADER_BRANCH_STACK);
 
-        if ((branch_mode == -1 && has_br_stack) || branch_mode == 1) {
+        /*
+         * Branch mode is a tristate:
+         * -1 means default, so decide based on the file having branch data.
+         * 0/1 means the user chose a mode.
+         */
+        if (((branch_mode == -1 && has_br_stack) || branch_mode == 1) &&
+            branch_call_mode == -1) {
                 sort__mode = SORT_MODE__BRANCH;
                 symbol_conf.cumulate_callchain = false;
         }
+        if (branch_call_mode) {
+                callchain_param.key = CCKEY_ADDRESS;
+                callchain_param.branch_callstack = 1;
+                symbol_conf.use_callchain = true;
+                callchain_register_param(&callchain_param);
+                if (sort_order == NULL)
+                        sort_order = "srcline,symbol,dso";
+        }
 
         if (report.mem_mode) {
                 if (sort__mode == SORT_MODE__BRANCH) {
......
@@ -388,20 +388,102 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
                 update_stats(&runtime_itlb_cache_stats[0], count[0]);
 }
 
+static void zero_per_pkg(struct perf_evsel *counter)
+{
+        if (counter->per_pkg_mask)
+                memset(counter->per_pkg_mask, 0, MAX_NR_CPUS);
+}
+
+static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip)
+{
+        unsigned long *mask = counter->per_pkg_mask;
+        struct cpu_map *cpus = perf_evsel__cpus(counter);
+        int s;
+
+        *skip = false;
+
+        if (!counter->per_pkg)
+                return 0;
+
+        if (cpu_map__empty(cpus))
+                return 0;
+
+        if (!mask) {
+                mask = zalloc(MAX_NR_CPUS);
+                if (!mask)
+                        return -ENOMEM;
+
+                counter->per_pkg_mask = mask;
+        }
+
+        s = cpu_map__get_socket(cpus, cpu);
+        if (s < 0)
+                return -1;
+
+        *skip = test_and_set_bit(s, mask) == 1;
+        return 0;
+}
+
+static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused,
+                   struct perf_counts_values *count)
+{
+        struct perf_counts_values *aggr = &evsel->counts->aggr;
+        static struct perf_counts_values zero;
+        bool skip = false;
+
+        if (check_per_pkg(evsel, cpu, &skip)) {
+                pr_err("failed to read per-pkg counter\n");
+                return -1;
+        }
+
+        if (skip)
+                count = &zero;
+
+        switch (aggr_mode) {
+        case AGGR_CORE:
+        case AGGR_SOCKET:
+        case AGGR_NONE:
+                if (!evsel->snapshot)
+                        perf_evsel__compute_deltas(evsel, cpu, count);
+                perf_counts_values__scale(count, scale, NULL);
+                evsel->counts->cpu[cpu] = *count;
+                update_shadow_stats(evsel, count->values);
+                break;
+        case AGGR_GLOBAL:
+                aggr->val += count->val;
+                if (scale) {
+                        aggr->ena += count->ena;
+                        aggr->run += count->run;
+                }
+        default:
+                break;
+        }
+
+        return 0;
+}
+
+static int read_counter(struct perf_evsel *counter);
+
 /*
  * Read out the results of a single counter:
  * aggregate counts across CPUs in system-wide mode
  */
 static int read_counter_aggr(struct perf_evsel *counter)
 {
+        struct perf_counts_values *aggr = &counter->counts->aggr;
         struct perf_stat *ps = counter->priv;
         u64 *count = counter->counts->aggr.values;
         int i;
 
-        if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter),
-                               thread_map__nr(evsel_list->threads), scale) < 0)
+        aggr->val = aggr->ena = aggr->run = 0;
+
+        if (read_counter(counter))
                 return -1;
 
+        if (!counter->snapshot)
+                perf_evsel__compute_deltas(counter, -1, aggr);
+        perf_counts_values__scale(aggr, scale, &counter->counts->scaled);
+
         for (i = 0; i < 3; i++)
                 update_stats(&ps->res_stats[i], count[i]);
@@ -424,16 +506,21 @@ static int read_counter_aggr(struct perf_evsel *counter)
  */
 static int read_counter(struct perf_evsel *counter)
 {
-        u64 *count;
-        int cpu;
+        int nthreads = thread_map__nr(evsel_list->threads);
+        int ncpus = perf_evsel__nr_cpus(counter);
+        int cpu, thread;
 
-        for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-                if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
-                        return -1;
+        if (counter->system_wide)
+                nthreads = 1;
 
-                count = counter->counts->cpu[cpu].values;
+        if (counter->per_pkg)
+                zero_per_pkg(counter);
 
-                update_shadow_stats(counter, count);
+        for (thread = 0; thread < nthreads; thread++) {
+                for (cpu = 0; cpu < ncpus; cpu++) {
+                        if (perf_evsel__read_cb(counter, cpu, thread, read_cb))
+                                return -1;
+                }
         }
 
         return 0;
......
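The check_per_pkg() helper above counts a per-package event only once per socket: a bitmask keyed on the socket id is set with test_and_set_bit(), and every later CPU on the same package is skipped. Below is a minimal user-space sketch of that de-duplication idea; it is not perf code, and the socket map, counter values, and names are invented purely for illustration.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define MAX_PKGS 64

    static bool pkg_seen[MAX_PKGS];

    /* return true if this cpu's package has already contributed a value */
    static bool pkg_already_counted(int socket)
    {
            bool seen = pkg_seen[socket];

            pkg_seen[socket] = true;
            return seen;
    }

    int main(void)
    {
            /* hypothetical 8-cpu, 2-socket machine; values are made up */
            int socket_of[8] = { 0, 0, 0, 0, 1, 1, 1, 1 };
            unsigned long long per_cpu_val[8] = { 10, 11, 12, 13, 20, 21, 22, 23 };
            unsigned long long total = 0;
            int cpu;

            memset(pkg_seen, 0, sizeof(pkg_seen));
            for (cpu = 0; cpu < 8; cpu++) {
                    /* like check_per_pkg(): skip all but the first cpu per socket */
                    if (!pkg_already_counted(socket_of[cpu]))
                            total += per_cpu_val[cpu];
            }
            printf("per-pkg total: %llu\n", total); /* prints 30 (cpu 0 + cpu 4) */
            return 0;
    }

In the patch itself the skipped readings are replaced by a zero-valued perf_counts_values, so the rest of the aggregation path stays unchanged.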
@@ -149,6 +149,10 @@ static int parse_callchain_sort_key(const char *value)
                 callchain_param.key = CCKEY_ADDRESS;
                 return 0;
         }
+        if (!strncmp(value, "branch", strlen(value))) {
+                callchain_param.branch_callstack = 1;
+                return 0;
+        }
 
         return -1;
 }
......
@@ -63,6 +63,7 @@ struct callchain_param {
         sort_chain_func_t       sort;
         enum chain_order        order;
         enum chain_key          key;
+        bool                    branch_callstack;
 };
 
 extern struct callchain_param callchain_param;
......
@@ -954,40 +954,6 @@ int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
         return 0;
 }
 
-int __perf_evsel__read(struct perf_evsel *evsel,
-                       int ncpus, int nthreads, bool scale)
-{
-        size_t nv = scale ? 3 : 1;
-        int cpu, thread;
-        struct perf_counts_values *aggr = &evsel->counts->aggr, count;
-
-        if (evsel->system_wide)
-                nthreads = 1;
-
-        aggr->val = aggr->ena = aggr->run = 0;
-
-        for (cpu = 0; cpu < ncpus; cpu++) {
-                for (thread = 0; thread < nthreads; thread++) {
-                        if (FD(evsel, cpu, thread) < 0)
-                                continue;
-
-                        if (readn(FD(evsel, cpu, thread),
-                                  &count, nv * sizeof(u64)) < 0)
-                                return -errno;
-
-                        aggr->val += count.val;
-                        if (scale) {
-                                aggr->ena += count.ena;
-                                aggr->run += count.run;
-                        }
-                }
-        }
-
-        perf_evsel__compute_deltas(evsel, -1, aggr);
-
-        perf_counts_values__scale(aggr, scale, &evsel->counts->scaled);
-
-        return 0;
-}
-
 static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread)
 {
         struct perf_evsel *leader = evsel->leader;
......
@@ -93,6 +93,7 @@ struct perf_evsel {
         bool                    system_wide;
         bool                    tracking;
         bool                    per_pkg;
+        unsigned long           *per_pkg_mask;
         /* parse modifier helper */
         int                     exclude_GH;
         int                     nr_members;
@@ -271,35 +272,6 @@ static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel,
         return __perf_evsel__read_on_cpu(evsel, cpu, thread, true);
 }
 
-int __perf_evsel__read(struct perf_evsel *evsel, int ncpus, int nthreads,
-                       bool scale);
-
-/**
- * perf_evsel__read - Read the aggregate results on all CPUs
- *
- * @evsel - event selector to read value
- * @ncpus - Number of cpus affected, from zero
- * @nthreads - Number of threads affected, from zero
- */
-static inline int perf_evsel__read(struct perf_evsel *evsel,
-                                   int ncpus, int nthreads)
-{
-        return __perf_evsel__read(evsel, ncpus, nthreads, false);
-}
-
-/**
- * perf_evsel__read_scaled - Read the aggregate results on all CPUs, scaled
- *
- * @evsel - event selector to read value
- * @ncpus - Number of cpus affected, from zero
- * @nthreads - Number of threads affected, from zero
- */
-static inline int perf_evsel__read_scaled(struct perf_evsel *evsel,
-                                          int ncpus, int nthreads)
-{
-        return __perf_evsel__read(evsel, ncpus, nthreads, true);
-}
-
 int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
                              struct perf_sample *sample);
......
@@ -12,6 +12,7 @@
 #include <stdbool.h>
 #include <symbol/kallsyms.h>
 #include "unwind.h"
+#include "linux/hash.h"
 
 static void dsos__init(struct dsos *dsos)
 {
@@ -1391,7 +1392,11 @@ static int add_callchain_ip(struct thread *thread,
         al.filtered = 0;
         al.sym = NULL;
-        thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
-                                   ip, &al);
+        if (cpumode == -1)
+                thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
+                                                   ip, &al);
+        else
+                thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
+                                           ip, &al);
         if (al.sym != NULL) {
                 if (sort__has_parent && !*parent &&
@@ -1427,8 +1432,50 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
         return bi;
 }
 
+#define CHASHSZ 127
+#define CHASHBITS 7
+#define NO_ENTRY 0xff
+
+#define PERF_MAX_BRANCH_DEPTH 127
+
+/* Remove loops. */
+static int remove_loops(struct branch_entry *l, int nr)
+{
+        int i, j, off;
+        unsigned char chash[CHASHSZ];
+
+        memset(chash, NO_ENTRY, sizeof(chash));
+
+        BUG_ON(PERF_MAX_BRANCH_DEPTH > 255);
+
+        for (i = 0; i < nr; i++) {
+                int h = hash_64(l[i].from, CHASHBITS) % CHASHSZ;
+
+                /* no collision handling for now */
+                if (chash[h] == NO_ENTRY) {
+                        chash[h] = i;
+                } else if (l[chash[h]].from == l[i].from) {
+                        bool is_loop = true;
+                        /* check if it is a real loop */
+                        off = 0;
+                        for (j = chash[h]; j < i && i + off < nr; j++, off++)
+                                if (l[j].from != l[i + off].from) {
+                                        is_loop = false;
+                                        break;
+                                }
+                        if (is_loop) {
+                                memmove(l + i, l + i + off,
+                                        (nr - (i + off)) * sizeof(*l));
+                                nr -= off;
+                        }
+                }
+        }
+        return nr;
+}
+
 static int thread__resolve_callchain_sample(struct thread *thread,
                                             struct ip_callchain *chain,
+                                            struct branch_stack *branch,
                                             struct symbol **parent,
                                             struct addr_location *root_al,
                                             int max_stack)
@@ -1438,22 +1485,82 @@ static int thread__resolve_callchain_sample(struct thread *thread,
         int i;
         int j;
         int err;
-        int skip_idx __maybe_unused;
+        int skip_idx = -1;
+        int first_call = 0;
+
+        /*
+         * Based on DWARF debug information, some architectures skip
+         * a callchain entry saved by the kernel.
+         */
+        if (chain->nr < PERF_MAX_STACK_DEPTH)
+                skip_idx = arch_skip_callchain_idx(thread, chain);
 
         callchain_cursor_reset(&callchain_cursor);
 
+        /*
+         * Add branches to call stack for easier browsing. This gives
+         * more context for a sample than just the callers.
+         *
+         * This uses individual histograms of paths compared to the
+         * aggregated histograms the normal LBR mode uses.
+         *
+         * Limitations for now:
+         * - No extra filters
+         * - No annotations (should annotate somehow)
+         */
+
+        if (branch && callchain_param.branch_callstack) {
+                int nr = min(max_stack, (int)branch->nr);
+                struct branch_entry be[nr];
+
+                if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
+                        pr_warning("corrupted branch chain. skipping...\n");
+                        goto check_calls;
+                }
+
+                for (i = 0; i < nr; i++) {
+                        if (callchain_param.order == ORDER_CALLEE) {
+                                be[i] = branch->entries[i];
+                                /*
+                                 * Check for overlap into the callchain.
+                                 * The return address is one off compared to
+                                 * the branch entry. To adjust for this
+                                 * assume the calling instruction is not longer
+                                 * than 8 bytes.
+                                 */
+                                if (i == skip_idx ||
+                                    chain->ips[first_call] >= PERF_CONTEXT_MAX)
+                                        first_call++;
+                                else if (be[i].from < chain->ips[first_call] &&
+                                    be[i].from >= chain->ips[first_call] - 8)
+                                        first_call++;
+                        } else
+                                be[i] = branch->entries[branch->nr - i - 1];
+                }
+
+                nr = remove_loops(be, nr);
+
+                for (i = 0; i < nr; i++) {
+                        err = add_callchain_ip(thread, parent, root_al,
+                                               -1, be[i].to);
+                        if (!err)
+                                err = add_callchain_ip(thread, parent, root_al,
+                                                       -1, be[i].from);
+                        if (err == -EINVAL)
+                                break;
+                        if (err)
+                                return err;
+                }
+                chain_nr -= nr;
+        }
+
+check_calls:
         if (chain->nr > PERF_MAX_STACK_DEPTH) {
                 pr_warning("corrupted callchain. skipping...\n");
                 return 0;
         }
 
-        /*
-         * Based on DWARF debug information, some architectures skip
-         * a callchain entry saved by the kernel.
-         */
-        skip_idx = arch_skip_callchain_idx(thread, chain);
-
-        for (i = 0; i < chain_nr; i++) {
+        for (i = first_call; i < chain_nr; i++) {
                 u64 ip;
 
                 if (callchain_param.order == ORDER_CALLEE)
@@ -1517,6 +1624,7 @@ int thread__resolve_callchain(struct thread *thread,
                               int max_stack)
 {
         int ret = thread__resolve_callchain_sample(thread, sample->callchain,
+                                                   sample->branch_stack,
                                                    parent, root_al, max_stack);
         if (ret)
                 return ret;
......
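The remove_loops() pass added above keeps a hot loop from flooding the synthesized call stack: when the same branch source address shows up again and the entries in between repeat, the extra iterations are dropped before the branches are turned into callchain entries. The sketch below is a stand-alone copy of that idea with a simplified hash (the real code uses hash_64()) and invented addresses, just to show the collapse on a concrete input.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct branch_entry {
            unsigned long long from;
            unsigned long long to;
    };

    #define CHASHSZ  127
    #define NO_ENTRY 0xff

    static int remove_loops(struct branch_entry *l, int nr)
    {
            int i, j, off;
            unsigned char chash[CHASHSZ];

            memset(chash, NO_ENTRY, sizeof(chash));

            for (i = 0; i < nr; i++) {
                    /* toy hash: low bits of the branch source address */
                    int h = (int)(l[i].from % CHASHSZ);

                    if (chash[h] == NO_ENTRY) {
                            chash[h] = i;
                    } else if (l[chash[h]].from == l[i].from) {
                            bool is_loop = true;

                            /* check that the entries in between really repeat */
                            off = 0;
                            for (j = chash[h]; j < i && i + off < nr; j++, off++)
                                    if (l[j].from != l[i + off].from) {
                                            is_loop = false;
                                            break;
                                    }
                            if (is_loop) {
                                    memmove(l + i, l + i + off,
                                            (nr - (i + off)) * sizeof(*l));
                                    nr -= off;
                            }
                    }
            }
            return nr;
    }

    int main(void)
    {
            /* two iterations of the same loop body (0x100 -> 0x200 -> 0x300),
             * followed by the branch that leaves the loop */
            struct branch_entry lbr[] = {
                    { 0x100, 0x200 }, { 0x200, 0x300 }, { 0x300, 0x100 },
                    { 0x100, 0x200 }, { 0x200, 0x300 }, { 0x300, 0x100 },
                    { 0x100, 0x400 },
            };
            int nr = (int)(sizeof(lbr) / sizeof(lbr[0]));

            nr = remove_loops(lbr, nr);
            printf("entries after loop removal: %d\n", nr); /* 4 instead of 7 */
            return 0;
    }

In the patch, the surviving entries are then handed to add_callchain_ip() (branch target first, then source) ahead of the regular callchain entries.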
@@ -102,7 +102,8 @@ struct symbol_conf {
                         demangle,
                         demangle_kernel,
                         filter_relative,
-                        show_hist_headers;
+                        show_hist_headers,
+                        branch_callstack;
         const char      *vmlinux_name,
                         *kallsyms_name,
                         *source_prefix,
......