Commit aab667ca authored by K Prateek Nayak's avatar K Prateek Nayak Committed by Arnaldo Carvalho de Melo

perf stat: Add "--per-cache" aggregation option and document it

This patch adds support for "--per-cache" option for aggregation at a
particular cache level and documents the same.

Following is the output of 'perf stat' with aggregation at L3 for the
event "ls_dmnd_fills_from_sys.ext_cache_remote" on a dual socket 3rd
Generation EPYC Processor (2 x 64C/128T - 16 LLCs) when running
hackbench pinned to 4 LLCs:

  $ sudo perf stat --per-cache=L3 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \
    taskset -c 0-15,64-79,128-143,192-207 \
    perf bench sched messaging -p -t -l 100000 -g 8

  ...

   Performance counter stats for 'system wide':

  S0-D0-L3-ID0             16          9,500,803      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID8             16          6,338,099      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID16            16            355,005      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID24            16             22,067      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID32            16             16,321      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID40            16             11,619      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID48            16              4,238      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID56            16             31,158      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID64            16         28,242,452      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID72            16         22,906,973      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID80            16             72,898      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID88            16             56,907      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID96            16             20,456      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID104           16             40,913      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID112           16             78,113      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID120           16             37,897      ls_dmnd_fills_from_sys.ext_cache_remote

Also support 'perf stat record' and 'perf stat report' with the ability
to specify a different cache level to aggregate data at when running
'perf stat report'.

  $ sudo perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \
    taskset -c 0-15,64-79,128-143,192-207 \
    perf bench sched messaging -p -t -l 100000 -g 8

  ...

   Performance counter stats for 'system wide':

  S0-D0-L2-ID0              2          1,442,061      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID1              2          1,548,994      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID2              2          1,553,557      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID3              2          1,420,122      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID4              2          1,465,461      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID5              2          1,455,153      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID6              2          1,595,237      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID7              2          1,499,321      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID8              2          1,919,025      ls_dmnd_fills_from_sys.ext_cache_remote
  ...
  S1-D1-L2-ID127            2             21,295      ls_dmnd_fills_from_sys.ext_cache_remote

  $ sudo perf stat report --per-cache=L3

   Performance counter stats for 'perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote --\
                                  taskset -c 0-15,64-79,128-143,192-207 \
                                  perf bench sched messaging -p -t -l 100000 -g 8':

  S0-D0-L3-ID0             16         11,979,906      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID8             16         14,257,202      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID16            16            377,484      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID24            16             27,224      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID32            16             26,816      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID40            16             14,461      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID48            16             10,499      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID56            16             53,817      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID64            16         27,361,987      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID72            16         37,299,024      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID80            16             84,125      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID88            16             64,561      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID96            16             13,403      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID104           16             20,138      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID112           16             93,220      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID120           16             35,465      ls_dmnd_fills_from_sys.ext_cache_remote

On the above system, the domain covered by S0-D0-L3-ID0 contains
S0-D0-L2-ID0 to S0-D0-L2-ID7, the corresponding count for L3-ID0 is
equal to the sum of counts for L2-ID0 to L2-ID7.

Add documentation for the newly introduced "--per-cache" option.
Suggested-by: default avatarGautham Shenoy <gautham.shenoy@amd.com>
Signed-off-by: default avatarK Prateek Nayak <kprateek.nayak@amd.com>
Acked-by: default avatarIan Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Wen Pu <puwen@hygon.cn>
Link: https://lore.kernel.org/r/20230517172745.5833-5-kprateek.nayak@amd.comSigned-off-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
parent 4b87406a
...@@ -308,6 +308,14 @@ use --per-die in addition to -a. (system-wide). The output includes the ...@@ -308,6 +308,14 @@ use --per-die in addition to -a. (system-wide). The output includes the
die number and the number of online processors on that die. This is die number and the number of online processors on that die. This is
useful to gauge the amount of aggregation. useful to gauge the amount of aggregation.
--per-cache::
Aggregate counts per cache instance for system-wide mode measurements. By
default, the aggregation happens for the cache level at the highest index
in the system. To specify a particular level, mention the cache level
alongside the option in the format [Ll][1-9][0-9]*. For example:
Using option "--per-cache=l3" or "--per-cache=L3" will aggregate the
information at the boundary of the level 3 cache in the system.
--per-core:: --per-core::
Aggregate counts per physical processor for system-wide mode measurements. This Aggregate counts per physical processor for system-wide mode measurements. This
is a useful mode to detect imbalance between physical cores. To enable this mode, is a useful mode to detect imbalance between physical cores. To enable this mode,
...@@ -379,6 +387,14 @@ Aggregate counts per processor socket for system-wide mode measurements. ...@@ -379,6 +387,14 @@ Aggregate counts per processor socket for system-wide mode measurements.
--per-die:: --per-die::
Aggregate counts per processor die for system-wide mode measurements. Aggregate counts per processor die for system-wide mode measurements.
--per-cache::
Aggregate counts per cache instance for system-wide mode measurements. By
default, the aggregation happens for the cache level at the highest index
in the system. To specify a particular level, mention the cache level
alongside the option in the format [Ll][1-9][0-9]*. For example: Using
option "--per-cache=l3" or "--per-cache=L3" will aggregate the
information at the boundary of the level 3 cache in the system.
--per-core:: --per-core::
Aggregate counts per physical processor for system-wide mode measurements. Aggregate counts per physical processor for system-wide mode measurements.
......
...@@ -1113,6 +1113,55 @@ static int parse_cputype(const struct option *opt, ...@@ -1113,6 +1113,55 @@ static int parse_cputype(const struct option *opt,
return 0; return 0;
} }
static int parse_cache_level(const struct option *opt,
const char *str,
int unset __maybe_unused)
{
int level;
u32 *aggr_mode = (u32 *)opt->value;
u32 *aggr_level = (u32 *)opt->data;
/*
* If no string is specified, aggregate based on the topology of
* Last Level Cache (LLC). Since the LLC level can change from
* architecture to architecture, set level greater than
* MAX_CACHE_LVL which will be interpreted as LLC.
*/
if (str == NULL) {
level = MAX_CACHE_LVL + 1;
goto out;
}
/*
* The format to specify cache level is LX or lX where X is the
* cache level.
*/
if (strlen(str) != 2 || (str[0] != 'l' && str[0] != 'L')) {
pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
MAX_CACHE_LVL,
MAX_CACHE_LVL);
return -EINVAL;
}
level = atoi(&str[1]);
if (level < 1) {
pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
MAX_CACHE_LVL,
MAX_CACHE_LVL);
return -EINVAL;
}
if (level > MAX_CACHE_LVL) {
pr_err("perf only supports max cache level of %d.\n"
"Consider increasing MAX_CACHE_LVL\n", MAX_CACHE_LVL);
return -EINVAL;
}
out:
*aggr_mode = AGGR_CACHE;
*aggr_level = level;
return 0;
}
static struct option stat_options[] = { static struct option stat_options[] = {
OPT_BOOLEAN('T', "transaction", &transaction_run, OPT_BOOLEAN('T', "transaction", &transaction_run,
"hardware transaction statistics"), "hardware transaction statistics"),
...@@ -1190,6 +1239,9 @@ static struct option stat_options[] = { ...@@ -1190,6 +1239,9 @@ static struct option stat_options[] = {
"aggregate counts per processor socket", AGGR_SOCKET), "aggregate counts per processor socket", AGGR_SOCKET),
OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode, OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode,
"aggregate counts per processor die", AGGR_DIE), "aggregate counts per processor die", AGGR_DIE),
OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level,
"cache level", "aggregate count at this cache level (Default: LLC)",
parse_cache_level),
OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode, OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode,
"aggregate counts per physical processor core", AGGR_CORE), "aggregate counts per physical processor core", AGGR_CORE),
OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode, OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
...@@ -2346,6 +2398,10 @@ static int __cmd_report(int argc, const char **argv) ...@@ -2346,6 +2398,10 @@ static int __cmd_report(int argc, const char **argv)
"aggregate counts per processor socket", AGGR_SOCKET), "aggregate counts per processor socket", AGGR_SOCKET),
OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode, OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode,
"aggregate counts per processor die", AGGR_DIE), "aggregate counts per processor die", AGGR_DIE),
OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level,
"cache level",
"aggregate count at this cache level (Default: LLC)",
parse_cache_level),
OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode, OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode,
"aggregate counts per physical processor core", AGGR_CORE), "aggregate counts per physical processor core", AGGR_CORE),
OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode, OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment