Commit 0ef9ec43 authored by Brenden Blanco's avatar Brenden Blanco Committed by GitHub

Merge pull request #770 from palmtenor/pmu_api

Add basic support for BPF perf event 
parents 315998d8 2f3cdbf0
.TH llcstat 8 "2015-08-18" "USER COMMANDS"
.SH NAME
llcstat \- Trace cache references and cache misses. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B llcstat [\-h] [\-c SAMPLE_PERIOD] [duration]
.SH DESCRIPTION
llcstat traces cache references and cache misses system-side, and summarizes
them by PID and CPU. These events have different meanings on different
architecture. For x86-64, they mean misses and references to LLC.
This can be useful to locate and debug performance issues
caused by cache hit rate.
This works by sampling corresponding events defined in uapi/linux/perf_event.h,
namely PERF_COUNT_HW_CACHE_REFERENCES and PERF_COUNT_HW_CACHE_MISSES, using
BPF perf event tracing. Upon each sampled event, the attached BPF program
records the PID and CPU ID on which the event happened, and stores it in table.
This makes use of a Linux 4.9 feature (BPF_PROG_TYPE_PERF_EVENT).
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
.TP
\-h
Print usage message.
.TP
\-c SAMPLE_PERIOD
Sample one in this many cache reference and cache miss events.
.TP
duration
Duration to trace, in seconds.
.SH EXAMPLES
.TP
Sample one in 100 events, trace for 20 seconds:
#
.B llcstat -c 100 20
.SH FIELDS
.TP
PID
Process ID
.TP
NAME
Process name
.TP
CPU
CPU ID
.TP
REFERENCE
Number of cache reference events
.TP
MISS
Number of cache miss events
.TP
HIT%
Cache hit ratio
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Teng Qin
...@@ -95,6 +95,7 @@ enum bpf_prog_type { ...@@ -95,6 +95,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
}; };
#define BPF_PSEUDO_MAP_FD 1 #define BPF_PSEUDO_MAP_FD 1
......
...@@ -570,3 +570,60 @@ cleanup: ...@@ -570,3 +570,60 @@ cleanup:
close(sock); close(sock);
return ret; return ret;
} }
int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
uint64_t sample_period, uint64_t sample_freq,
pid_t pid, int cpu, int group_fd) {
if (ev_type != PERF_TYPE_HARDWARE && ev_type != PERF_TYPE_SOFTWARE) {
fprintf(stderr, "Unsupported perf event type\n");
return -1;
}
if ((ev_type == PERF_TYPE_HARDWARE && ev_config >= PERF_COUNT_HW_MAX) ||
(ev_type == PERF_TYPE_SOFTWARE && ev_config >= PERF_COUNT_SW_MAX)) {
fprintf(stderr, "Invalid perf event config\n");
return -1;
}
if (!((sample_period > 0) ^ (sample_freq > 0))) {
fprintf(
stderr, "Exactly one of sample_period / sample_freq should be set\n"
);
return -1;
}
struct perf_event_attr attr = {};
attr.type = ev_type;
attr.config = ev_config;
attr.inherit = 1;
if (sample_freq > 0) {
attr.freq = 1;
attr.sample_freq = sample_freq;
} else {
attr.sample_period = sample_period;
}
int fd = syscall(
__NR_perf_event_open, &attr, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC
);
if (fd < 0) {
perror("perf_event_open failed");
return -1;
}
if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, progfd) != 0) {
perror("ioctl(PERF_EVENT_IOC_SET_BPF) failed");
close(fd);
return -1;
}
if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
perror("ioctl(PERF_EVENT_IOC_ENABLE) failed");
close(fd);
return -1;
}
return fd;
}
int bpf_detach_perf_event(uint32_t ev_type, uint32_t ev_config) {
// Right now, there is nothing to do, but it's a good idea to encourage
// callers to detach anything they attach.
return 0;
}
...@@ -64,6 +64,13 @@ void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, ...@@ -64,6 +64,13 @@ void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid,
/* attached a prog expressed by progfd to the device specified in dev_name */ /* attached a prog expressed by progfd to the device specified in dev_name */
int bpf_attach_xdp(const char *dev_name, int progfd); int bpf_attach_xdp(const char *dev_name, int progfd);
// attach a prog expressed by progfd to run on a specific perf event, with
// certain sample period or sample frequency
int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
uint64_t sample_period, uint64_t sample_freq,
pid_t pid, int cpu, int group_fd);
int bpf_detach_perf_event(uint32_t ev_type, uint32_t ev_config);
#define LOG_BUF_SIZE 65536 #define LOG_BUF_SIZE 65536
// Put non-static/inline functions in their own section with this prefix + // Put non-static/inline functions in their own section with this prefix +
......
...@@ -62,13 +62,47 @@ class SymbolCache(object): ...@@ -62,13 +62,47 @@ class SymbolCache(object):
return -1 return -1
return addr.value return addr.value
class PerfType:
# From perf_type_id in uapi/linux/perf_event.h
HARDWARE = 0
SOFTWARE = 1
class PerfHWConfig:
# From perf_hw_id in uapi/linux/perf_event.h
CPU_CYCLES = 0
INSTRUCTIONS = 1
CACHE_REFERENCES = 2
CACHE_MISSES = 3
BRANCH_INSTRUCTIONS = 4
BRANCH_MISSES = 5
BUS_CYCLES = 6
STALLED_CYCLES_FRONTEND = 7
STALLED_CYCLES_BACKEND = 8
REF_CPU_CYCLES = 9
class PerfSWConfig:
# From perf_sw_id in uapi/linux/perf_event.h
CPU_CLOCK = 0
TASK_CLOCK = 1
PAGE_FAULTS = 2
CONTEXT_SWITCHES = 3
CPU_MIGRATIONS = 4
PAGE_FAULTS_MIN = 5
PAGE_FAULTS_MAJ = 6
ALIGNMENT_FAULTS = 7
EMULATION_FAULTS = 8
DUMMY = 9
BPF_OUTPUT = 10
class BPF(object): class BPF(object):
# From bpf_prog_type in uapi/linux/bpf.h
SOCKET_FILTER = 1 SOCKET_FILTER = 1
KPROBE = 2 KPROBE = 2
SCHED_CLS = 3 SCHED_CLS = 3
SCHED_ACT = 4 SCHED_ACT = 4
TRACEPOINT = 5 TRACEPOINT = 5
XDP = 6 XDP = 6
PERF_EVENT = 7
_probe_repl = re.compile("[^a-zA-Z0-9_]") _probe_repl = re.compile("[^a-zA-Z0-9_]")
_sym_caches = {} _sym_caches = {}
...@@ -168,6 +202,7 @@ class BPF(object): ...@@ -168,6 +202,7 @@ class BPF(object):
self.open_kprobes = {} self.open_kprobes = {}
self.open_uprobes = {} self.open_uprobes = {}
self.open_tracepoints = {} self.open_tracepoints = {}
self.open_perf_events = {}
self.tracefile = None self.tracefile = None
atexit.register(self.cleanup) atexit.register(self.cleanup)
...@@ -608,6 +643,41 @@ class BPF(object): ...@@ -608,6 +643,41 @@ class BPF(object):
raise Exception("Failed to detach BPF from tracepoint") raise Exception("Failed to detach BPF from tracepoint")
del self.open_tracepoints[tp] del self.open_tracepoints[tp]
def _attach_perf_event(self, progfd, ev_type, ev_config,
sample_period, sample_freq, pid, cpu, group_fd):
res = lib.bpf_attach_perf_event(progfd, ev_type, ev_config,
sample_period, sample_freq, pid, cpu, group_fd)
if res < 0:
raise Exception("Failed to attach BPF to perf event")
return res
def attach_perf_event(self, ev_type=-1, ev_config=-1, fn_name="",
sample_period=0, sample_freq=0, pid=-1, cpu=-1, group_fd=-1):
fn = self.load_func(fn_name, BPF.PERF_EVENT)
res = {}
if cpu >= 0:
res[cpu] = self._attach_perf_event(fn.fd, ev_type, ev_config,
sample_period, sample_freq, pid, cpu, group_fd)
else:
for i in range(0, multiprocessing.cpu_count()):
res[i] = self._attach_perf_event(fn.fd, ev_type, ev_config,
sample_period, sample_freq, pid, i, group_fd)
self.open_perf_events[(ev_type, ev_config)] = res
def detach_perf_event(self, ev_type=-1, ev_config=-1):
try:
fds = self.open_perf_events[(ev_type, ev_config)]
except KeyError:
raise Exception("Perf event type {} config {} not attached".format(
ev_type, ev_config))
for fd in fds.values():
os.close(fd)
res = lib.bpf_detach_perf_event(ev_type, ev_config)
if res < 0:
raise Exception("Failed to detach BPF from perf event")
del self.open_perf_events[(ev_type, ev_config)]
return res
def _add_uprobe(self, name, probe): def _add_uprobe(self, name, probe):
global _num_open_probes global _num_open_probes
self.open_uprobes[name] = probe self.open_uprobes[name] = probe
...@@ -975,6 +1045,8 @@ class BPF(object): ...@@ -975,6 +1045,8 @@ class BPF(object):
(tp_category, tp_name) = k.split(':') (tp_category, tp_name) = k.split(':')
lib.bpf_detach_tracepoint(tp_category, tp_name) lib.bpf_detach_tracepoint(tp_category, tp_name)
self.open_tracepoints.clear() self.open_tracepoints.clear()
for (ev_type, ev_config) in list(self.open_perf_events.keys()):
self.detach_perf_event(ev_type, ev_config)
if self.tracefile: if self.tracefile:
self.tracefile.close() self.tracefile.close()
......
...@@ -113,6 +113,12 @@ lib.perf_reader_fd.argtypes = [ct.c_void_p] ...@@ -113,6 +113,12 @@ lib.perf_reader_fd.argtypes = [ct.c_void_p]
lib.bpf_attach_xdp.restype = ct.c_int; lib.bpf_attach_xdp.restype = ct.c_int;
lib.bpf_attach_xdp.argtypes = [ct.c_char_p, ct.c_int] lib.bpf_attach_xdp.argtypes = [ct.c_char_p, ct.c_int]
lib.bpf_attach_perf_event.restype = ct.c_int;
lib.bpf_attach_perf_event.argtype = [ct.c_int, ct.c_uint, ct.c_uint, ct.c_ulonglong, ct.c_ulonglong,
ct.c_int, ct.c_int, ct.c_int]
lib.bpf_detach_perf_event.restype = ct.c_int;
lib.bpf_detach_perf_event.argtype = [ct.c_uint, ct.c_uint]
# bcc symbol helpers # bcc symbol helpers
class bcc_symbol(ct.Structure): class bcc_symbol(ct.Structure):
_fields_ = [ _fields_ = [
......
#!/usr/bin/python
#
# llcstat.py Summarize cache references and cache misses by PID.
# Cache reference and cache miss are corresponding events defined in
# uapi/linux/perf_event.h, it varies to different architecture.
# On x86-64, they mean LLC references and LLC misses.
#
# For Linux, uses BCC, eBPF. Embedded C.
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
#
# Copyright (c) 2016 Facebook, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 19-Oct-2016 Teng Qin Created this.
from __future__ import print_function
import argparse
from bcc import BPF, PerfType, PerfHWConfig
import signal
from time import sleep
parser = argparse.ArgumentParser(
description="Summarize cache references and misses by PID",
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument(
"-c", "--sample_period", type=int, default=100,
help="Sample one in this many number of cache reference / miss events")
parser.add_argument(
"duration", nargs="?", default=10, help="Duration, in seconds, to run")
args = parser.parse_args()
# load BPF program
b = BPF(text="""
#include <linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
struct key_t {
int cpu;
int pid;
char name[TASK_COMM_LEN];
};
BPF_HASH(ref_count, struct key_t);
BPF_HASH(miss_count, struct key_t);
static inline __attribute__((always_inline)) void get_key(struct key_t* key) {
key->cpu = bpf_get_smp_processor_id();
key->pid = bpf_get_current_pid_tgid();
bpf_get_current_comm(&(key->name), sizeof(key->name));
}
int on_cache_miss(struct bpf_perf_event_data *ctx) {
struct key_t key = {};
get_key(&key);
u64 zero = 0, *val;
val = miss_count.lookup_or_init(&key, &zero);
(*val) += ctx->sample_period;
return 0;
}
int on_cache_ref(struct bpf_perf_event_data *ctx) {
struct key_t key = {};
get_key(&key);
u64 zero = 0, *val;
val = ref_count.lookup_or_init(&key, &zero);
(*val) += ctx->sample_period;
return 0;
}
""")
b.attach_perf_event(
ev_type=PerfType.HARDWARE, ev_config=PerfHWConfig.CACHE_MISSES,
fn_name="on_cache_miss", sample_period=args.sample_period)
b.attach_perf_event(
ev_type=PerfType.HARDWARE, ev_config=PerfHWConfig.CACHE_REFERENCES,
fn_name="on_cache_ref", sample_period=args.sample_period)
print("Running for {} seconds or hit Ctrl-C to end.".format(args.duration))
try:
sleep(float(args.duration))
except KeyboardInterrupt:
signal.signal(signal.SIGINT, lambda signal, frame: print())
miss_count = {}
for (k, v) in b.get_table('miss_count').items():
miss_count[(k.pid, k.cpu, k.name)] = v.value
print('PID NAME CPU REFERENCE MISS HIT%')
tot_ref = 0
tot_miss = 0
for (k, v) in b.get_table('ref_count').items():
try:
miss = miss_count[(k.pid, k.cpu, k.name)]
except KeyError:
miss = 0
tot_ref += v.value
tot_miss += miss
# This happens on some PIDs due to missed counts caused by sampling
hit = (v.value - miss) if (v.value >= miss) else 0
print('{:<8d} {:<16s} {:<4d} {:>12d} {:>12d} {:>6.2f}%'.format(
k.pid, k.name, k.cpu, v.value, miss,
(float(hit) / float(v.value)) * 100.0))
print('Total References: {} Total Misses: {} Hit Rate: {:.2f}%'.format(
tot_ref, tot_miss, (float(tot_ref - tot_miss) / float(tot_ref)) * 100.0))
Demonstrations of llcstat.
llcstat traces cache reference and cache miss events system-wide, and summarizes
them by PID and CPU.
These events, defined in uapi/linux/perf_event.h, have different meanings on
different architecture. For x86-64, they mean misses and references to LLC.
Example output:
# ./llcstat.py 20 -c 5000
Running for 20 seconds or hit Ctrl-C to end.
PID NAME CPU REFERENCE MISS HIT%
0 swapper/15 15 3515000 640000 81.79%
238 migration/38 38 5000 0 100.00%
4512 ntpd 11 5000 0 100.00%
150867 ipmitool 3 25000 5000 80.00%
150895 lscpu 17 280000 25000 91.07%
151807 ipmitool 15 15000 5000 66.67%
150757 awk 2 15000 5000 66.67%
151213 chef-client 5 1770000 240000 86.44%
151822 scribe-dispatch 12 15000 0 100.00%
123386 mysqld 5 5000 0 100.00%
[...]
Total References: 518920000 Total Misses: 90265000 Hit Rate: 82.61%
This shows each PID's cache hit rate during the 20 seconds run period.
USAGE message:
# ./llcstat.py --help
usage: llcstat.py [-h] [-c SAMPLE_PERIOD] [duration]
Summarize cache references and misses by PID
positional arguments:
duration Duration, in seconds, to run
optional arguments:
-h, --help show this help message and exit
-c SAMPLE_PERIOD, --sample_period SAMPLE_PERIOD
Sample one in this many number of cache reference
and miss events
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment