Commit 8207d10e authored by Brenden Blanco

Add ability to consume perf events in python

This adds the ability to consume perf events in libbpf/python using the
ring buffer. For now, this is the only way to get access to the function
call graph. Only kernel functions are supported.

It does this by introducing a new set of libbpf helper functions that
can open the perf fd, mmap it, and poll over the events as they are
submitted by the kernel. This allows for faster event processing than
trace_printks, but this has not been tested.

The functionality is disabled by default, the user can enable it by
passing a non-empty cb parameter into the BPF constructor. That cb
function will be invoked for each event that is read from the buffer.
Buffers are per-fd, so each event is distinct as well as separated from
other processes that may be running simultaneously.

The initial test case uses this functionality to build a histogram of
events keyed by the callchain.
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
parent b262e26d
......@@ -35,7 +35,7 @@ endif()
# tell the shared library where it is being installed so it can find shared header files
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBCC_INSTALL_PREFIX='\"${CMAKE_INSTALL_PREFIX}\"'")
add_library(bcc SHARED bpf_common.cc bpf_module.cc libbpf.c)
add_library(bcc SHARED bpf_common.cc bpf_module.cc libbpf.c perf_reader.c)
set_target_properties(bcc PROPERTIES VERSION ${REVISION_LAST} SOVERSION 0)
# BPF is still experimental otherwise it should be available
......
......@@ -33,6 +33,7 @@
#include <unistd.h>
#include "libbpf.h"
#include "perf_reader.h"
// TODO: remove these defines when linux-libc-dev exports them properly
......@@ -175,8 +176,8 @@ int bpf_attach_socket(int sock, int prog) {
return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
}
static int bpf_attach_tracing_event(int progfd, const char *event_path, pid_t pid, int cpu, int group_fd)
{
static int bpf_attach_tracing_event(int progfd, const char *event_path,
struct perf_reader *reader, int pid, int cpu, int group_fd) {
int efd = -1, rc = -1, pfd = -1;
ssize_t bytes = -1;
char buf[256];
......@@ -197,7 +198,7 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path, pid_t pi
buf[bytes] = '\0';
attr.config = strtol(buf, NULL, 0);
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
attr.sample_period = 1;
attr.wakeup_events = 1;
pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC);
......@@ -205,6 +206,10 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path, pid_t pi
perror("perf_event_open");
goto cleanup;
}
if (perf_reader_mmap(reader, pfd, attr.sample_type) < 0)
goto cleanup;
if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
perror("ioctl(PERF_EVENT_IOC_SET_BPF)");
goto cleanup;
......@@ -226,11 +231,17 @@ cleanup:
return rc;
}
int bpf_attach_kprobe(int progfd, const char *event,
const char *event_desc, pid_t pid,
int cpu, int group_fd) {
void * bpf_attach_kprobe(int progfd, const char *event,
const char *event_desc, pid_t pid,
int cpu, int group_fd, perf_reader_cb cb,
void *cb_cookie) {
int rc = -1, kfd = -1;
char buf[256];
struct perf_reader *reader = NULL;
reader = perf_reader_new(-1, 8, cb, cb_cookie);
if (!reader)
goto cleanup;
kfd = open("/sys/kernel/debug/tracing/kprobe_events", O_WRONLY | O_APPEND, 0);
if (kfd < 0) {
......@@ -246,13 +257,17 @@ int bpf_attach_kprobe(int progfd, const char *event,
}
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/kprobes/%s", event);
rc = bpf_attach_tracing_event(progfd, buf, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/);
rc = bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd);
cleanup:
if (kfd >= 0)
close(kfd);
if (reader && rc < 0) {
perf_reader_free(reader);
reader = NULL;
}
return rc;
return reader;
}
int bpf_detach_kprobe(const char *event_desc) {
......
/*
* Copyright (c) 2015 PLUMgrid, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <linux/perf_event.h>
#include <poll.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include "libbpf.h"
#include "perf_reader.h"
// State for one perf event reader: the user callback, the mmap'ed region for
// the perf fd, and a scratch buffer for samples that wrap around the ring.
struct perf_reader {
// called by sample_parse() for each sample; when NULL, perf_reader_mmap()
// returns without mapping anything
perf_reader_cb cb;
void *cb_cookie; // to be returned in the cb
void *buf; // for keeping segmented data
// presumably the allocated size of buf in bytes -- TODO confirm
size_t buf_size;
// start of the mapping made by perf_reader_mmap(): one page holding the
// struct perf_event_mmap_page, followed by page_cnt pages of ring data
void *base;
// set from getpagesize() in perf_reader_new()
int page_size;
// number of ring data pages; the mapping is page_cnt + 1 pages long
int page_cnt;
// perf event fd; polled by perf_reader_poll(), closed by perf_reader_free()
int fd;
// PERF_SAMPLE_* bits describing which fields each sample record carries
uint64_t sample_type;
};
/*
 * Allocate a zero-initialised reader and record its configuration.
 *
 * fd        - perf event fd to associate with the reader (may be -1 and set
 *             later by perf_reader_mmap())
 * page_cnt  - number of ring data pages to map
 * cb        - callback invoked for every parsed sample
 * cb_cookie - opaque pointer handed back to cb
 *
 * Returns the new reader, or NULL when the allocation fails. Nothing is
 * mapped here; release the reader with perf_reader_free().
 */
struct perf_reader * perf_reader_new(int fd, int page_cnt, perf_reader_cb cb, void *cb_cookie) {
  struct perf_reader *r = calloc(1, sizeof(*r));

  if (r != NULL) {
    r->cb = cb;
    r->cb_cookie = cb_cookie;
    r->fd = fd;
    r->page_size = getpagesize();
    r->page_cnt = page_cnt;
  }
  return r;
}
/*
 * Release everything owned by a reader: the ring mapping, the perf fd, the
 * scratch buffer and the reader itself. Passing NULL is a no-op.
 *
 * A reader may be freed before it was ever mapped (it is created with
 * fd == -1 and base == NULL), so the mapping and the fd are only released
 * when they are actually valid.
 */
void perf_reader_free(void *ptr) {
  struct perf_reader *reader = ptr;

  if (!reader)
    return;
  // base is NULL when never mapped, and may be MAP_FAILED after a failed mmap
  if (reader->base && reader->base != MAP_FAILED)
    munmap(reader->base, (size_t)reader->page_size * (reader->page_cnt + 1));
  if (reader->fd >= 0)
    close(reader->fd);
  free(reader->buf);
  free(reader);
}
/*
 * Map the perf ring buffer of `fd` into the reader.
 *
 * The mapping covers one header page plus reader->page_cnt data pages. When
 * the reader has no callback nothing is mapped and the fd is not recorded.
 *
 * Returns 0 on success (or when there is no callback), -1 if mmap() fails.
 * On failure the reader is left untouched, so reader->base never holds
 * MAP_FAILED and perf_reader_free() stays safe to call.
 */
int perf_reader_mmap(struct perf_reader *reader, int fd, uint64_t sample_type) {
  size_t mmap_size;
  void *base;

  if (!reader->cb)
    return 0;
  // compute in size_t so large page counts cannot overflow int
  mmap_size = (size_t)reader->page_size * (reader->page_cnt + 1);
  base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (base == MAP_FAILED) {
    perror("mmap");
    return -1;
  }
  reader->base = base;
  reader->fd = fd;
  reader->sample_type = sample_type;
  return 0;
}
// Leading fields of the raw (PERF_SAMPLE_RAW) payload of a trace sample.
// NOTE(review): presumably mirrors the kernel's common trace entry header --
// confirm against the tracing event format. Only pid is read by sample_parse().
struct perf_sample_trace_common {
uint16_t id;
uint8_t flags;
uint8_t preempt_count;
int pid;
};
// Raw sample payload for a kprobe event: the common trace header followed by
// the instruction pointer.
struct perf_sample_trace_kprobe {
struct perf_sample_trace_common common;
uint64_t ip;
};
static void sample_parse(struct perf_reader *reader, void *data, int size) {
uint8_t *ptr = data;
struct perf_event_header *header = (void *)data;
struct perf_sample_trace_kprobe *tk = NULL;
uint64_t *callchain = NULL;
uint64_t num_callchain = 0;
ptr += sizeof(*header);
if (ptr > (uint8_t *)data + size) {
fprintf(stderr, "%s: corrupt sample header\n", __FUNCTION__);
return;
}
if (reader->sample_type & PERF_SAMPLE_CALLCHAIN) {
struct {
uint64_t nr;
uint64_t ips[0];
} *cc = (void *)ptr;
ptr += sizeof(cc->nr) + sizeof(*cc->ips) * cc->nr;
// size sanity check
if (ptr > (uint8_t *)data + size) {
fprintf(stderr, "%s: corrupt callchain sample\n", __FUNCTION__);
return;
}
int i;
// don't include magic numbers in the call chain
for (i = 0; i < cc->nr; ++i) {
if (cc->ips[i] == PERF_CONTEXT_USER)
break;
if (cc->ips[i] >= PERF_CONTEXT_MAX)
continue;
if (!callchain)
callchain = &cc->ips[i];
++num_callchain;
}
}
// for kprobes, raw samples just include the common data structure and the
// instruction pointer
if (reader->sample_type & PERF_SAMPLE_RAW) {
struct {
uint32_t size;
char data[0];
} *raw = (void *)ptr;
ptr += sizeof(raw->size) + raw->size;
if (ptr > (uint8_t *)data + size) {
fprintf(stderr, "%s: corrupt raw sample\n", __FUNCTION__);
return;
}
tk = (void *)raw->data;
}
// sanity check
if (ptr != (uint8_t *)data + size) {
fprintf(stderr, "%s: extra data at end of sample\n", __FUNCTION__);
return;
}
// call out to the user with the parsed data
if (reader->cb)
reader->cb(reader->cb_cookie, tk ? tk->common.pid : -1, num_callchain, callchain);
}
/*
 * Load the kernel's producer position (data_head) from the perf mmap header.
 *
 * The load goes through a volatile pointer so it is performed exactly once,
 * and is followed by an acquire fence so that the subsequent reads of ring
 * data cannot be reordered before it. The perf ABI asks for a read barrier
 * here; an empty asm statement only constrains the compiler, which is not
 * enough on weakly ordered CPUs.
 */
static uint64_t read_data_head(struct perf_event_mmap_page *perf_header) {
  volatile struct perf_event_mmap_page *shared = perf_header;
  uint64_t data_head = shared->data_head;

  __atomic_thread_fence(__ATOMIC_ACQUIRE);
  return data_head;
}
/*
 * Publish the consumer position (data_tail) to the kernel.
 *
 * A full fence is issued first so that all reads of the ring data complete
 * before the kernel can observe the new tail and reuse that space. The perf
 * ABI asks for a memory barrier here; an empty asm statement only constrains
 * the compiler. The store goes through a volatile pointer so it is emitted
 * as written.
 */
static void write_data_tail(struct perf_event_mmap_page *perf_header, uint64_t data_tail) {
  volatile struct perf_event_mmap_page *shared = perf_header;

  __atomic_thread_fence(__ATOMIC_SEQ_CST);
  shared->data_tail = data_tail;
}
/*
 * Drain every pending record from the reader's ring buffer.
 *
 * Layout of the mapping: one page holding struct perf_event_mmap_page,
 * followed by page_size * page_cnt bytes of ring data. Records between
 * data_tail and data_head are consumed one at a time:
 *   - PERF_RECORD_SAMPLE is passed to sample_parse()
 *   - PERF_RECORD_LOST reports the number of dropped samples on stderr
 *   - anything else is reported as unknown
 * After each record data_tail is advanced so the kernel can reuse the space.
 *
 * A record that wraps around the end of the ring is first copied into
 * reader->buf (grown on demand) so that it can be parsed contiguously.
 */
static void event_read(struct perf_reader *reader) {
  struct perf_event_mmap_page *perf_header = reader->base;
  uint64_t buffer_size = (uint64_t)reader->page_size * reader->page_cnt;
  uint64_t data_head;
  uint8_t *base = (uint8_t *)reader->base + reader->page_size;
  uint8_t *sentinel = base + buffer_size;

  // Consume all the events on this ring, calling the cb function for each one.
  for (data_head = read_data_head(perf_header); perf_header->data_tail != data_head;
       data_head = read_data_head(perf_header)) {
    uint64_t data_tail = perf_header->data_tail;
    uint8_t *begin = base + data_tail % buffer_size;
    // event header is u64, won't wrap
    struct perf_event_header *e = (void *)begin;
    size_t event_size = e->size;
    uint8_t *end = base + (data_tail + event_size) % buffer_size;
    uint8_t *ptr = begin;

    // A record shorter than its own header would never advance the tail;
    // drop whatever is pending instead of spinning forever.
    if (event_size < sizeof(*e)) {
      fprintf(stderr, "%s: corrupt event size %zu\n", __FUNCTION__, event_size);
      write_data_tail(perf_header, data_head);
      break;
    }

    if (end < begin) {
      // perf event wraps around the ring, make a contiguous copy
      size_t len = sentinel - begin;

      if (reader->buf_size < event_size) {
        // keep the old buffer valid if the allocation fails
        void *grown = realloc(reader->buf, event_size);
        if (!grown) {
          fprintf(stderr, "%s: out of memory, dropping event\n", __FUNCTION__);
          write_data_tail(perf_header, data_tail + event_size);
          continue;
        }
        reader->buf = grown;
        reader->buf_size = event_size;
      }
      memcpy(reader->buf, begin, len);
      memcpy((uint8_t *)reader->buf + len, base, event_size - len);
      ptr = reader->buf;
    }

    if (e->type == PERF_RECORD_LOST) {
      // payload is { u64 id; u64 lost; }: the count is the second field
      uint64_t lost = 0;
      if (event_size >= sizeof(*e) + 2 * sizeof(uint64_t))
        memcpy(&lost, ptr + sizeof(*e) + sizeof(uint64_t), sizeof(lost));
      fprintf(stderr, "Lost %llu samples\n", (unsigned long long)lost);
    } else if (e->type == PERF_RECORD_SAMPLE) {
      sample_parse(reader, ptr, (int)event_size);
    } else {
      fprintf(stderr, "%s: unknown sample type %d\n", __FUNCTION__, e->type);
    }

    write_data_tail(perf_header, data_tail + event_size);
  }
}
/*
 * Block until at least one of the readers has data, then drain every reader
 * whose fd is readable.
 *
 * num_readers - number of entries in `readers`
 * readers     - array of readers to poll
 *
 * A pollfd entry is built for every reader; a single-entry array would let
 * poll() and the loop below run past its end whenever num_readers > 1.
 * Readers whose fd is negative are ignored by poll().
 *
 * Returns 0, or -1 if the pollfd array cannot be allocated. With no readers
 * the function returns 0 immediately instead of blocking forever.
 */
int perf_reader_poll(int num_readers, struct perf_reader **readers) {
  struct pollfd *pfds;
  int i;

  if (num_readers <= 0 || !readers)
    return 0;

  pfds = calloc((size_t)num_readers, sizeof(*pfds));
  if (!pfds) {
    perror("calloc");
    return -1;
  }
  for (i = 0; i < num_readers; ++i) {
    pfds[i].fd = readers[i]->fd;
    pfds[i].events = POLLIN;
  }

  if (poll(pfds, (nfds_t)num_readers, -1) > 0) {
    for (i = 0; i < num_readers; ++i) {
      if (pfds[i].revents & POLLIN)
        event_read(readers[i]);
    }
  }

  free(pfds);
  return 0;
}
/*
* Copyright (c) 2015 PLUMgrid, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Opaque reader handle; the layout is private to perf_reader.c.
struct perf_reader;
// Allocate a reader for `page_cnt` ring data pages. `cb` is invoked with
// `cb_cookie` for each parsed sample. Returns NULL on allocation failure.
struct perf_reader * perf_reader_new(int fd, int page_cnt, perf_reader_cb cb, void *cb_cookie);
// Unmap the ring, close the reader's fd and free the reader. NULL is ignored.
void perf_reader_free(void *ptr);
// Map the ring buffer of perf event `fd` into the reader and remember the
// PERF_SAMPLE_* bits used to parse samples. Returns 0 on success, -1 when
// mmap fails.
// NOTE(review): the definition in perf_reader.c declares sample_type as
// uint64_t; the two agree only where unsigned long is 64 bits wide.
int perf_reader_mmap(struct perf_reader *reader, int fd, unsigned long sample_type);
// Poll the readers' fds and dispatch pending events to their callbacks.
int perf_reader_poll(int num_readers, struct perf_reader **readers);
......@@ -40,7 +40,12 @@ int bpf_attach_socket(int sockfd, int progfd);
/* create RAW socket and bind to interface 'name' */
int bpf_open_raw_sock(const char *name);
int bpf_attach_kprobe(int progfd, const char *event, const char *event_desc, int pid, int cpu, int group_fd);
typedef void (*perf_reader_cb)(void *cb_cookie, int pid, uint64_t callchain_num,
void *callchain);
void * bpf_attach_kprobe(int progfd, const char *event, const char *event_desc,
int pid, int cpu, int group_fd, perf_reader_cb cb,
void *cb_cookie);
int bpf_detach_kprobe(const char *event_desc);
#define LOG_BUF_SIZE 65536
......
......@@ -86,11 +86,17 @@ lib.bpf_attach_socket.argtypes = [ct.c_int, ct.c_int]
lib.bpf_prog_load.restype = ct.c_int
lib.bpf_prog_load.argtypes = [ct.c_int, ct.c_void_p, ct.c_size_t,
ct.c_char_p, ct.c_uint, ct.c_char_p, ct.c_uint]
lib.bpf_attach_kprobe.restype = ct.c_int
lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_char_p,
ct.c_char_p, ct.c_int, ct.c_int, ct.c_int]
lib.bpf_attach_kprobe.restype = ct.c_void_p
_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_int,
ct.c_ulonglong, ct.POINTER(ct.c_ulonglong))
lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_int,
ct.c_int, ct.c_int, _CB_TYPE, ct.py_object]
lib.bpf_detach_kprobe.restype = ct.c_int
lib.bpf_detach_kprobe.argtypes = [ct.c_char_p]
lib.perf_reader_poll.restype = ct.c_int
lib.perf_reader_poll.argtypes = [ct.c_int, ct.POINTER(ct.c_void_p)]
lib.perf_reader_free.restype = None
lib.perf_reader_free.argtypes = [ct.c_void_p]
open_kprobes = {}
tracefile = None
......@@ -104,9 +110,10 @@ stars_max = 40
@atexit.register
def cleanup_kprobes():
for k, v in open_kprobes.items():
os.close(v)
lib.perf_reader_free(v)
desc = "-:kprobes/%s" % k
lib.bpf_detach_kprobe(desc.encode("ascii"))
open_kprobes.clear()
if tracefile:
tracefile.close()
......@@ -344,7 +351,7 @@ class BPF(object):
raise Exception("Could not find file %s" % filename)
return filename
def __init__(self, src_file="", hdr_file="", text=None, debug=0):
def __init__(self, src_file="", hdr_file="", text=None, cb=None, debug=0):
"""Create a a new BPF module with the given source code.
Note:
......@@ -360,6 +367,8 @@ class BPF(object):
0x2: print BPF bytecode to stderr
"""
self._reader_cb_impl = _CB_TYPE(BPF._reader_cb)
self._user_cb = cb
self.debug = debug
self.funcs = {}
self.tables = {}
......@@ -502,6 +511,11 @@ class BPF(object):
def __iter__(self):
return self.tables.__iter__()
def _reader_cb(self, pid, callchain_num, callchain):
if self._user_cb:
cc = tuple(callchain[i] for i in range(0, callchain_num))
self._user_cb(pid, cc)
@staticmethod
def attach_raw_socket(fn, dev):
if not isinstance(fn, BPF.Function):
......@@ -528,7 +542,7 @@ class BPF(object):
(line != "\n" and line not in blacklist)]
def attach_kprobe(self, event="", fn_name="", event_re="",
pid=0, cpu=-1, group_fd=-1):
pid=-1, cpu=0, group_fd=-1):
# allow the caller to glob multiple functions together
if event_re:
......@@ -544,8 +558,10 @@ class BPF(object):
ev_name = "p_" + event.replace("+", "_").replace(".", "_")
desc = "p:kprobes/%s %s" % (ev_name, event)
res = lib.bpf_attach_kprobe(fn.fd, ev_name.encode("ascii"),
desc.encode("ascii"), pid, cpu, group_fd)
if res < 0:
desc.encode("ascii"), pid, cpu, group_fd,
self._reader_cb_impl, ct.cast(id(self), ct.py_object))
res = ct.cast(res, ct.c_void_p)
if res == None:
raise Exception("Failed to attach BPF to kprobe")
open_kprobes[ev_name] = res
return self
......@@ -579,8 +595,10 @@ class BPF(object):
ev_name = "r_" + event.replace("+", "_").replace(".", "_")
desc = "r:kprobes/%s %s" % (ev_name, event)
res = lib.bpf_attach_kprobe(fn.fd, ev_name.encode("ascii"),
desc.encode("ascii"), pid, cpu, group_fd)
if res < 0:
desc.encode("ascii"), pid, cpu, group_fd,
self._reader_cb_impl, ct.cast(id(self), ct.py_object))
res = ct.cast(res, ct.c_void_p)
if res == None:
raise Exception("Failed to attach BPF to kprobe")
open_kprobes[ev_name] = res
return self
......@@ -751,3 +769,18 @@ class BPF(object):
event_re is used while attaching and detaching probes
"""
return len(open_kprobes)
def kprobe_poll(self):
    """kprobe_poll(self)

    Poll the ring buffers of all open kprobes. Each entry that is read is
    reported through the cb() that was given to the BPF constructor. A
    KeyboardInterrupt raised while polling is swallowed.
    """
    count = len(open_kprobes)
    readers = (ct.c_void_p * count)()
    for idx, reader in enumerate(open_kprobes.values()):
        readers[idx] = reader
    try:
        lib.perf_reader_poll(count, readers)
    except KeyboardInterrupt:
        pass
......@@ -42,3 +42,5 @@ add_test(NAME py_test_clang WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_clang sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_clang.py)
add_test(NAME py_test_histogram WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_histogram sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_histogram.py)
add_test(NAME py_test_callchain WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_callchain sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_callchain.py)
#!/usr/bin/env python
# Copyright (c) PLUMgrid, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
from bcc import BPF
import time
from unittest import main, TestCase
class TestCallchain(TestCase):
    """Exercise the perf-event callback path with a kprobe on finish_task_switch."""

    def test_callchain1(self):
        # histogram of sample counts keyed by call chain (a tuple of addresses)
        histogram = {}

        def cb(pid, callchain):
            histogram[callchain] = histogram.get(callchain, 0) + 1

        b = BPF(text="""
#include <linux/ptrace.h>
int kprobe__finish_task_switch(struct pt_regs *ctx) {
return 1;
}
""", cb=cb)

        # poll for roughly one second
        deadline = time.time() + 1
        while time.time() < deadline:
            b.kprobe_poll()

        for chain, count in histogram.items():
            syms = [b.ksym(addr) for addr in chain]
            print("%-08d:" % count, syms)


if __name__ == "__main__":
    main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment