Commit d0daf6a4 authored by Brenden Blanco

Add perf_output support for high rate events

This adds support for the bpf_perf_event_output command. This is
intended for per-process events from bpf to userspace at high rate. The
events from the bpf program can be completely customized.
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
parent 33d00037
#!/usr/bin/env python
# Copyright (c) PLUMgrid, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
# This is an example of tracing an event and printing custom fields.
# run in project examples directory with:
# sudo ./trace_fields.py
import atexit
from bcc import BPF
import ctypes
# Number of perf_output events delivered to userspace through cb().
counter = 0


def cb(foo, data, size):
    """Perf buffer callback: count one delivered event.

    All three arguments are ignored; only the module-level `counter`
    is updated.
    """
    global counter
    counter += 1


# BPF program: on every sys_write, emit a timestamp through the "events"
# perf array (index 0) and bump slot 0 of the "counters" array, so the
# number of emitted events can be compared with the number received.
prog = """
BPF_PERF_ARRAY(events, 2);
BPF_TABLE("array", int, u64, counters, 10);
int kprobe__sys_write(void *ctx) {
struct {
u64 ts;
} data = {bpf_ktime_get_ns()};
if (events.perf_output(ctx, 0, &data, sizeof(data)) < 0)
bpf_trace_printk("perf_output failed\\n");
int zero = 0;
u64 *val = counters.lookup(&zero);
if (val) lock_xadd(val, 1);
return 0;
}
"""

b = BPF(text=prog)
b["events"].open_perf_buffer(0, cb, None)


@atexit.register
def print_counter():
    """At exit, print the userspace count next to the in-kernel count."""
    in_kernel = b["counters"][ctypes.c_int(0)].value
    print("counter = %d vs %d" % (counter, in_kernel))


# Poll the open perf readers forever; cb() runs for each delivered event.
while True:
    b.kprobe_poll()
......@@ -42,6 +42,19 @@ struct _name##_table_t { \
__attribute__((section("maps/" _table_type))) \
struct _name##_table_t _name
/* Declare a table named _name in the "maps/perf_array" section.
 * The generated struct has an int key and a u32 leaf, and exposes two
 * method-style members, perf_read and perf_output, that BPF programs call
 * as _name.perf_read(...) / _name.perf_output(...).
 * _max_entries sizes the data[] member. */
#define BPF_PERF_ARRAY(_name, _max_entries) \
struct _name##_table_t { \
int key; \
u32 leaf; \
/* counter = map.perf_read(index) */ \
u64 (*perf_read) (int); \
/* map.perf_output(ctx, index, data, data_size) */ \
int (*perf_output) (void *, int, void *, u32); \
u32 data[_max_entries]; \
}; \
__attribute__((section("maps/perf_array"))) \
struct _name##_table_t _name
#define BPF_HASH1(_name) \
BPF_TABLE("hash", u64, u64, _name, 10240)
#define BPF_HASH2(_name, _key_type) \
......@@ -117,6 +130,16 @@ static int (*bpf_skb_get_tunnel_key)(void *ctx, void *to, u32 size, u64 flags) =
(void *) BPF_FUNC_skb_get_tunnel_key;
static int (*bpf_skb_set_tunnel_key)(void *ctx, void *from, u32 size, u64 flags) =
(void *) BPF_FUNC_skb_set_tunnel_key;
static int (*bpf_perf_event_read)(void *map, u32 index) =
(void *) BPF_FUNC_perf_event_read;
#endif
/* Helpers only declared when building against kernel headers >= 4.4.0. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0)
/* bpf_redirect(ifindex, flags) */
static int (*bpf_redirect)(int ifindex, u32 flags) =
(void *) BPF_FUNC_redirect;
/* bpf_get_route_realm(ctx) */
static u32 (*bpf_get_route_realm)(void *ctx) =
(void *) BPF_FUNC_get_route_realm;
/* bpf_perf_event_output(ctx, map, index, data, size): send `size` bytes at
 * `data` to the perf event stored at `index` of the perf array `map`. */
static int (*bpf_perf_event_output)(void *ctx, void *map, u32 index, void *data, u32 size) =
(void *) BPF_FUNC_perf_event_output;
#endif
/* llvm builtin functions that eBPF C program may use to
......
......@@ -332,6 +332,13 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
}
txt += "typeof(" + name + ".leaf) *_leaf = " + lookup + ", &_key); ";
txt += "if (_leaf) (*_leaf)++; })";
} else if (memb_name == "perf_output") {
string name = Ref->getDecl()->getName();
string arg0 = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
Call->getArg(0)->getLocEnd()));
string args_other = rewriter_.getRewrittenText(SourceRange(Call->getArg(1)->getLocStart(),
Call->getArg(3)->getLocEnd()));
txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + "), " + args_other + ")";
} else {
if (memb_name == "lookup") {
prefix = "bpf_map_lookup_elem";
......@@ -345,6 +352,9 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
} else if (memb_name == "call") {
prefix = "bpf_tail_call_";
suffix = ")";
} else if (memb_name == "perf_read") {
prefix = "bpf_perf_event_read";
suffix = ")";
} else {
C.getDiagnostics().Report(Call->getLocStart(), diag::err_expected)
<< "valid bpf_table operation";
......@@ -482,6 +492,13 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
}
const RecordDecl *RD = R->getDecl()->getDefinition();
int major = 0, minor = 0;
struct utsname un;
if (uname(&un) == 0) {
// release format: <major>.<minor>.<revision>[-<othertag>]
sscanf(un.release, "%d.%d.", &major, &minor);
}
TableDesc table;
table.name = Decl->getName();
......@@ -519,20 +536,20 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
diag_.Report(Decl->getLocStart(), diag_id) << table.leaf_desc;
}
} else if (A->getName() == "maps/prog") {
struct utsname un;
if (uname(&un) == 0) {
int major = 0, minor = 0;
// release format: <major>.<minor>.<revision>[-<othertag>]
sscanf(un.release, "%d.%d.", &major, &minor);
if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,2,0))
map_type = BPF_MAP_TYPE_PROG_ARRAY;
} else if (A->getName() == "maps/perf_array") {
if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,3,0))
map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
}
if (map_type == BPF_MAP_TYPE_UNSPEC) {
C.getDiagnostics().Report(Decl->getLocStart(), diag::err_expected)
<< "kernel supporting maps/prog";
unsigned diag_id = C.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error,
"unsupported map type: %0");
C.getDiagnostics().Report(Decl->getLocStart(), diag_id) << A->getName();
return false;
}
}
table.type = map_type;
table.fd = bpf_create_map(map_type, table.key_size, table.leaf_size, table.max_entries);
if (table.fd < 0) {
......
......@@ -178,8 +178,8 @@ int bpf_attach_socket(int sock, int prog) {
static int bpf_attach_tracing_event(int progfd, const char *event_path,
struct perf_reader *reader, int pid, int cpu, int group_fd) {
int efd = -1, rc = -1, pfd = -1;
ssize_t bytes = -1;
int efd = -1, rc = -1, pfd;
ssize_t bytes;
char buf[256];
struct perf_event_attr attr = {};
......@@ -206,8 +206,9 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
perror("perf_event_open");
goto cleanup;
}
perf_reader_set_fd(reader, pfd);
if (perf_reader_mmap(reader, pfd, attr.sample_type) < 0)
if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
goto cleanup;
if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
......@@ -219,14 +220,11 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
goto cleanup;
}
rc = pfd;
pfd = -1;
rc = 0;
cleanup:
if (efd >= 0)
close(efd);
if (pfd >= 0)
close(pfd);
return rc;
}
......@@ -239,7 +237,7 @@ void * bpf_attach_kprobe(int progfd, const char *event,
char buf[256];
struct perf_reader *reader = NULL;
reader = perf_reader_new(-1, 8, cb, cb_cookie);
reader = perf_reader_new(cb, NULL, cb_cookie);
if (!reader)
goto cleanup;
......@@ -292,3 +290,40 @@ cleanup:
return rc;
}
// Open a software perf event (PERF_COUNT_SW_BPF_OUTPUT, raw samples) and
// wrap it in a perf_reader whose raw_cb receives each sample's raw payload.
//
// raw_cb:    callback stored in the reader for raw samples
// cb_cookie: opaque pointer stored in the reader alongside the callback
//
// Returns the new reader (as void *) on success, or NULL on any failure;
// on failure the partially set up reader is released via perf_reader_free().
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie) {
  struct perf_event_attr attr = {};
  struct perf_reader *reader;
  int pfd;

  reader = perf_reader_new(NULL, raw_cb, cb_cookie);
  if (!reader)
    return NULL;

  attr.config = PERF_COUNT_SW_BPF_OUTPUT;
  attr.type = PERF_TYPE_SOFTWARE;
  attr.sample_type = PERF_SAMPLE_RAW;

  // pid = -1, cpu = 0, group_fd = -1
  pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    perror("perf_event_open");
    goto error;
  }
  // the reader must know its fd before perf_reader_mmap() is called
  perf_reader_set_fd(reader, pfd);

  if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
    goto error;

  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    goto error;
  }

  return reader;

error:
  perf_reader_free(reader);
  return NULL;
}
......@@ -26,8 +26,11 @@
#include "libbpf.h"
#include "perf_reader.h"
int perf_reader_page_cnt = 8;
struct perf_reader {
perf_reader_cb cb;
perf_reader_raw_cb raw_cb;
void *cb_cookie; // to be returned in the cb
void *buf; // for keeping segmented data
size_t buf_size;
......@@ -35,18 +38,20 @@ struct perf_reader {
int page_size;
int page_cnt;
int fd;
uint32_t type;
uint64_t sample_type;
};
struct perf_reader * perf_reader_new(int fd, int page_cnt, perf_reader_cb cb, void *cb_cookie) {
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie) {
struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
if (!reader)
return NULL;
reader->cb = cb;
reader->raw_cb = raw_cb;
reader->cb_cookie = cb_cookie;
reader->fd = fd;
reader->fd = -1;
reader->page_size = getpagesize();
reader->page_cnt = page_cnt;
reader->page_cnt = perf_reader_page_cnt;
return reader;
}
......@@ -61,18 +66,20 @@ void perf_reader_free(void *ptr) {
}
}
int perf_reader_mmap(struct perf_reader *reader, int fd, uint64_t sample_type) {
int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type) {
int mmap_size = reader->page_size * (reader->page_cnt + 1);
if (!reader->cb)
return 0;
if (reader->fd < 0) {
fprintf(stderr, "%s: reader fd is not set\n", __FUNCTION__);
return -1;
}
reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE , MAP_SHARED, fd, 0);
reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE , MAP_SHARED, reader->fd, 0);
if (reader->base == MAP_FAILED) {
perror("mmap");
return -1;
}
reader->fd = fd;
reader->type = type;
reader->sample_type = sample_type;
return 0;
......@@ -90,7 +97,7 @@ struct perf_sample_trace_kprobe {
uint64_t ip;
};
static void sample_parse(struct perf_reader *reader, void *data, int size) {
static void parse_tracepoint(struct perf_reader *reader, void *data, int size) {
uint8_t *ptr = data;
struct perf_event_header *header = (void *)data;
......@@ -153,6 +160,40 @@ static void sample_parse(struct perf_reader *reader, void *data, int size) {
reader->cb(reader->cb_cookie, tk ? tk->common.pid : -1, num_callchain, callchain);
}
// Parse one PERF_RECORD_SAMPLE coming from a software perf event and pass
// its raw payload to reader->raw_cb.
//
// reader: reader whose sample_type describes the record layout
// data:   start of the record (perf_event_header followed by sample fields)
// size:   total record size in bytes, header included
//
// Malformed records are reported on stderr and dropped; the callback is
// only invoked when a raw payload was actually present and well formed.
static void parse_sw(struct perf_reader *reader, void *data, int size) {
  uint8_t *ptr = data;
  uint8_t *end = (uint8_t *)data + size;
  struct perf_event_header *header = (void *)data;
  struct {
    uint32_t size;
    char data[0];
  } *raw = NULL;

  ptr += sizeof(*header);
  if (ptr > end) {
    fprintf(stderr, "%s: corrupt sample header\n", __FUNCTION__);
    return;
  }

  if (reader->sample_type & PERF_SAMPLE_RAW) {
    size_t remaining = (size_t)(end - ptr);
    raw = (void *)ptr;
    // check that the length field itself lies inside the record before
    // reading it, then that the payload it announces fits as well
    if (remaining < sizeof(raw->size) ||
        raw->size > remaining - sizeof(raw->size)) {
      fprintf(stderr, "%s: corrupt raw sample\n", __FUNCTION__);
      return;
    }
    ptr += sizeof(raw->size) + raw->size;
  }

  // sanity check
  if (ptr != end) {
    fprintf(stderr, "%s: extra data at end of sample\n", __FUNCTION__);
    return;
  }

  // raw is still NULL when PERF_SAMPLE_RAW was not requested; there is
  // nothing to deliver in that case
  if (raw && reader->raw_cb)
    reader->raw_cb(reader->cb_cookie, raw->data, raw->size);
}
static uint64_t read_data_head(struct perf_event_mmap_page *perf_header) {
uint64_t data_head = *((volatile uint64_t *)&perf_header->data_head);
asm volatile("" ::: "memory");
......@@ -194,12 +235,16 @@ static void event_read(struct perf_reader *reader) {
ptr = reader->buf;
}
if (e->type == PERF_RECORD_LOST)
if (e->type == PERF_RECORD_LOST) {
fprintf(stderr, "Lost %lu samples\n", *(uint64_t *)(ptr + sizeof(*e)));
else if (e->type == PERF_RECORD_SAMPLE)
sample_parse(reader, ptr, e->size);
else
} else if (e->type == PERF_RECORD_SAMPLE) {
if (reader->type == PERF_TYPE_TRACEPOINT)
parse_tracepoint(reader, ptr, e->size);
else if (reader->type == PERF_TYPE_SOFTWARE)
parse_sw(reader, ptr, e->size);
} else {
fprintf(stderr, "%s: unknown sample type %d\n", __FUNCTION__, e->type);
}
write_data_tail(perf_header, perf_header->data_tail + e->size);
}
......@@ -223,3 +268,10 @@ int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout)
return 0;
}
// Record `fd` as the file descriptor associated with `reader`.
void perf_reader_set_fd(struct perf_reader *reader, int fd) {
reader->fd = fd;
}
// Return the file descriptor currently stored in `reader`.
int perf_reader_fd(struct perf_reader *reader) {
return reader->fd;
}
......@@ -16,7 +16,9 @@
struct perf_reader;
struct perf_reader * perf_reader_new(int fd, int page_cnt, perf_reader_cb cb, void *cb_cookie);
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie);
void perf_reader_free(void *ptr);
int perf_reader_mmap(struct perf_reader *reader, int fd, unsigned long sample_type);
int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type);
int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
int perf_reader_fd(struct perf_reader *reader);
void perf_reader_set_fd(struct perf_reader *reader, int fd);
......@@ -42,6 +42,7 @@ int bpf_open_raw_sock(const char *name);
typedef void (*perf_reader_cb)(void *cb_cookie, int pid, uint64_t callchain_num,
void *callchain);
typedef void (*perf_reader_raw_cb)(void *cb_cookie, void *raw, int raw_size);
void * bpf_attach_kprobe(int progfd, const char *event, const char *event_desc,
int pid, int cpu, int group_fd, perf_reader_cb cb,
......
......@@ -89,14 +89,19 @@ lib.bpf_prog_load.argtypes = [ct.c_int, ct.c_void_p, ct.c_size_t,
# ctypes prototypes for the kprobe / perf reader entry points of the C library.
lib.bpf_attach_kprobe.restype = ct.c_void_p
# kprobe callback: (cookie, pid, number of callchain entries, callchain)
_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_int,
        ct.c_ulonglong, ct.POINTER(ct.c_ulonglong))
# raw perf buffer callback: (cookie, raw data pointer, raw data size)
_RAW_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_void_p, ct.c_int)
lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_int,
        ct.c_int, ct.c_int, _CB_TYPE, ct.py_object]
lib.bpf_detach_kprobe.restype = ct.c_int
lib.bpf_detach_kprobe.argtypes = [ct.c_char_p]
lib.bpf_open_perf_buffer.restype = ct.c_void_p
lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object]
lib.perf_reader_poll.restype = ct.c_int
lib.perf_reader_poll.argtypes = [ct.c_int, ct.POINTER(ct.c_void_p), ct.c_int]
lib.perf_reader_free.restype = None
lib.perf_reader_free.argtypes = [ct.c_void_p]
# use the ctypes type, like every other binding here, rather than the
# Python builtin `int`
lib.perf_reader_fd.restype = ct.c_int
lib.perf_reader_fd.argtypes = [ct.c_void_p]
open_kprobes = {}
tracefile = None
......@@ -111,6 +116,7 @@ stars_max = 40
def cleanup_kprobes():
    """Free every reader in `open_kprobes` and then empty the registry.

    Entries keyed by a string are kprobes and are additionally detached
    by name; entries with any other key type are only freed.
    """
    for name, reader in open_kprobes.items():
        lib.perf_reader_free(reader)
        if not isinstance(name, str):
            continue
        lib.bpf_detach_kprobe(("-:kprobes/%s" % name).encode("ascii"))
    open_kprobes.clear()
......@@ -126,6 +132,7 @@ class BPF(object):
HASH = 1
ARRAY = 2
PROG_ARRAY = 3
PERF_EVENT_ARRAY = 4
class Function(object):
def __init__(self, bpf, name, fd):
......@@ -178,6 +185,21 @@ class BPF(object):
raise Exception("Could not scanf leaf")
return leaf
def open_perf_buffer(self, key, cb, cookie):
    """Open a perf buffer and store its fd in this table at `key`.

    Args:
        key: table index to bind the buffer to; also used (with id(self))
            as the key under which the reader is tracked in open_kprobes.
        cb: Python callable invoked as cb(cookie, data, size) per event.
        cookie: object handed back to `cb` as its first argument.

    Raises:
        Exception: if the perf buffer could not be opened. Errors raised
            while storing the fd in the table are re-raised after the
            reader has been freed.
    """
    # The C library keeps raw pointers to the callback thunk and to the
    # cookie, so both must stay referenced from Python for as long as the
    # reader exists; otherwise they may be collected while still in use.
    fn = _RAW_CB_TYPE(cb)
    reader = lib.bpf_open_perf_buffer(fn, ct.cast(id(cookie), ct.py_object))
    if not reader:
        raise Exception("Could not open perf buffer")
    try:
        fd = lib.perf_reader_fd(reader)
        self[self.Key(key)] = self.Leaf(fd)
    except Exception:
        # do not leak the reader when the table update fails
        lib.perf_reader_free(reader)
        raise
    refs = getattr(self, "_perf_cb_refs", None)
    if refs is None:
        refs = self._perf_cb_refs = {}
    refs[key] = (fn, cookie)
    open_kprobes[(id(self), key)] = reader

def close_perf_buffer(self, key):
    """Free the perf reader opened for `key`, if there is one."""
    reader = open_kprobes.get((id(self), key))
    if reader:
        lib.perf_reader_free(reader)
        del(open_kprobes[(id(self), key)])
        # the reader is gone, so the callback and cookie may be released
        getattr(self, "_perf_cb_refs", {}).pop(key, None)
def __getitem__(self, key):
key_p = ct.pointer(key)
leaf = self.Leaf()
......@@ -208,7 +230,7 @@ class BPF(object):
ttype = lib.bpf_table_type_id(self.bpf.module, self.map_id)
# Deleting from array type maps does not have an effect, so
# zero out the entry instead.
if ttype in (BPF.ARRAY, BPF.PROG_ARRAY):
if ttype in (BPF.ARRAY, BPF.PROG_ARRAY, BPF.PERF_EVENT_ARRAY):
leaf = self.Leaf()
leaf_p = ct.pointer(leaf)
res = lib.bpf_update_elem(self.map_fd,
......@@ -216,6 +238,8 @@ class BPF(object):
ct.cast(leaf_p, ct.c_void_p), 0)
if res < 0:
raise Exception("Could not clear item")
if ttype == BPF.PERF_EVENT_ARRAY:
self.close_perf_buffer(key)
else:
res = lib.bpf_delete_elem(self.map_fd,
ct.cast(key_p, ct.c_void_p))
......@@ -792,5 +816,5 @@ class BPF(object):
try:
lib.perf_reader_poll(len(open_kprobes), readers, timeout)
except KeyboardInterrupt:
pass
exit()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment