Commit 5f5687e4 authored by Mark Drayton, committed by mark

Make perf ring buffer size configurable

As discussed in #966, this PR makes the size of the ring buffer used to send
data to userspace configurable. It changes the Python, Lua and C++ APIs to
expose this knob.

It also defaults the buffer size to a larger value (64 pages per CPU, an 8x
increase) for several tools which produce a lot of output, as well as making it
configurable in `trace` via a `-b` flag.
parent 02884a02
......@@ -865,9 +865,9 @@ These are equivalent.
### 2. open_perf_buffer()
Syntax: ```table.open_perf_buffers(callback)```
Syntax: ```table.open_perf_buffer(callback, page_cnt=N)```
This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space.
This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space. The size of each per-CPU perf ring buffer can be specified, in pages, via the ```page_cnt``` parameter; the value must be a power of two and defaults to 8.
Example:
......
......@@ -24,7 +24,7 @@ return function(BPF)
print("%-9s %-6d %s" % {os.date("%H:%M:%S"), tonumber(event.pid), ffi.string(event.str)})
end
b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }")
b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }", nil)
print("%-9s %-6s %s" % {"TIME", "PID", "COMMAND"})
b:kprobe_poll_loop()
......
......@@ -392,11 +392,14 @@ StatusTuple BPF::detach_perf_event(uint32_t ev_type, uint32_t ev_config) {
}
StatusTuple BPF::open_perf_buffer(const std::string& name,
perf_reader_raw_cb cb, void* cb_cookie) {
perf_reader_raw_cb cb, void* cb_cookie,
int page_cnt) {
if (perf_buffers_.find(name) == perf_buffers_.end())
perf_buffers_[name] = new BPFPerfBuffer(bpf_module_.get(), name);
if ((page_cnt & (page_cnt - 1)) != 0)
return StatusTuple(-1, "open_perf_buffer page_cnt must be a power of two");
auto table = perf_buffers_[name];
TRY2(table->open_all_cpu(cb, cb_cookie));
TRY2(table->open_all_cpu(cb, cb_cookie, page_cnt));
return StatusTuple(0);
}
......
......@@ -27,6 +27,8 @@
#include "compat/linux/bpf.h"
#include "libbpf.h"
static const int DEFAULT_PERF_BUFFER_PAGE_CNT = 8;
namespace ebpf {
struct open_probe_t {
......@@ -96,7 +98,8 @@ public:
}
StatusTuple open_perf_buffer(const std::string& name, perf_reader_raw_cb cb,
void* cb_cookie = nullptr);
void* cb_cookie = nullptr,
int page_cnt = DEFAULT_PERF_BUFFER_PAGE_CNT);
StatusTuple close_perf_buffer(const std::string& name);
void poll_perf_buffer(const std::string& name, int timeout = -1);
......
......@@ -67,11 +67,11 @@ std::vector<std::string> BPFStackTable::get_stack_symbol(int stack_id,
}
StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu,
void* cb_cookie) {
void* cb_cookie, int page_cnt) {
if (cpu_readers_.find(cpu) != cpu_readers_.end())
return StatusTuple(-1, "Perf buffer already open on CPU %d", cpu);
auto reader =
static_cast<perf_reader*>(bpf_open_perf_buffer(cb, cb_cookie, -1, cpu));
static_cast<perf_reader*>(bpf_open_perf_buffer(cb, cb_cookie, -1, cpu, page_cnt));
if (reader == nullptr)
return StatusTuple(-1, "Unable to construct perf reader");
int reader_fd = perf_reader_fd(reader);
......@@ -86,12 +86,12 @@ StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu,
}
StatusTuple BPFPerfBuffer::open_all_cpu(perf_reader_raw_cb cb,
void* cb_cookie) {
void* cb_cookie, int page_cnt) {
if (cpu_readers_.size() != 0 || readers_.size() != 0)
return StatusTuple(-1, "Previously opened perf buffer not cleaned");
for (int i: get_online_cpus()) {
auto res = open_on_cpu(cb, i, cb_cookie);
auto res = open_on_cpu(cb, i, cb_cookie, page_cnt);
if (res.code() != 0) {
TRY2(close_all_cpu());
return res;
......
......@@ -126,12 +126,14 @@ public:
: BPFTableBase<int, int>(bpf_module, name) {}
~BPFPerfBuffer();
StatusTuple open_all_cpu(perf_reader_raw_cb cb, void* cb_cookie);
StatusTuple open_all_cpu(perf_reader_raw_cb cb, void* cb_cookie,
int page_cnt);
StatusTuple close_all_cpu();
void poll(int timeout);
private:
StatusTuple open_on_cpu(perf_reader_raw_cb cb, int cpu, void* cb_cookie);
StatusTuple open_on_cpu(perf_reader_raw_cb cb, int cpu, void* cb_cookie,
int page_cnt);
StatusTuple close_on_cpu(int cpu);
std::map<int, perf_reader*> cpu_readers_;
......
......@@ -65,6 +65,8 @@
#define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif
static int probe_perf_reader_page_cnt = 8;
static __u64 ptr_to_u64(void *ptr)
{
return (__u64) (unsigned long) ptr;
......@@ -351,7 +353,7 @@ void * bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, con
int n;
snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
reader = perf_reader_new(cb, NULL, cb_cookie);
reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
if (!reader)
goto error;
......@@ -411,7 +413,7 @@ void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, con
int n;
snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
reader = perf_reader_new(cb, NULL, cb_cookie);
reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
if (!reader)
goto error;
......@@ -493,7 +495,7 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category,
char buf[256];
struct perf_reader *reader = NULL;
reader = perf_reader_new(cb, NULL, cb_cookie);
reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
if (!reader)
goto error;
......@@ -515,12 +517,13 @@ int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
return 0;
}
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu) {
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid,
int cpu, int page_cnt) {
int pfd;
struct perf_event_attr attr = {};
struct perf_reader *reader = NULL;
reader = perf_reader_new(NULL, raw_cb, cb_cookie);
reader = perf_reader_new(NULL, raw_cb, cb_cookie, page_cnt);
if (!reader)
goto error;
......
......@@ -68,7 +68,8 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category,
int group_fd, perf_reader_cb cb, void *cb_cookie);
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name);
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu);
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid,
int cpu, int page_cnt);
/* attached a prog expressed by progfd to the device specified in dev_name */
int bpf_attach_xdp(const char *dev_name, int progfd);
......
......@@ -26,8 +26,6 @@
#include "libbpf.h"
#include "perf_reader.h"
int perf_reader_page_cnt = 8;
struct perf_reader {
perf_reader_cb cb;
perf_reader_raw_cb raw_cb;
......@@ -42,7 +40,8 @@ struct perf_reader {
uint64_t sample_type;
};
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie) {
struct perf_reader * perf_reader_new(perf_reader_cb cb,
perf_reader_raw_cb raw_cb, void *cb_cookie, int page_cnt) {
struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
if (!reader)
return NULL;
......@@ -51,7 +50,7 @@ struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_c
reader->cb_cookie = cb_cookie;
reader->fd = -1;
reader->page_size = getpagesize();
reader->page_cnt = perf_reader_page_cnt;
reader->page_cnt = page_cnt;
return reader;
}
......
......@@ -25,7 +25,8 @@ extern "C" {
struct perf_reader;
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie);
struct perf_reader * perf_reader_new(perf_reader_cb cb,
perf_reader_raw_cb raw_cb, void *cb_cookie, int page_cnt);
void perf_reader_free(void *ptr);
int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type);
int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
......
......@@ -54,7 +54,7 @@ void * bpf_attach_uprobe(int progfd, int attach_type, const char *ev_name,
int bpf_detach_uprobe(const char *ev_name);
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu);
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu, int page_cnt);
]]
ffi.cdef[[
......
......@@ -243,13 +243,14 @@ local function _perf_id(id, cpu)
return string.format("bcc:perf_event_array:%d:%d", tonumber(id), cpu or 0)
end
function PerfEventArray:_open_perf_buffer(cpu, callback, ctype)
function PerfEventArray:_open_perf_buffer(cpu, callback, ctype, page_cnt)
local _cb = ffi.cast("perf_reader_raw_cb",
function (cookie, data, size)
callback(cpu, ctype(data)[0])
end)
local reader = libbcc.bpf_open_perf_buffer(_cb, nil, -1, cpu)
-- default to 8 pages per buffer
local reader = libbcc.bpf_open_perf_buffer(_cb, nil, -1, cpu, page_cnt or 8)
assert(reader, "failed to open perf buffer")
local fd = libbcc.perf_reader_fd(reader)
......@@ -258,11 +259,11 @@ function PerfEventArray:_open_perf_buffer(cpu, callback, ctype)
self._callbacks[cpu] = _cb
end
function PerfEventArray:open_perf_buffer(callback, data_type, ...)
function PerfEventArray:open_perf_buffer(callback, data_type, data_params, page_cnt)
assert(data_type, "a data type is needed for callback conversion")
local ctype = ffi.typeof(data_type.."*", ...)
local ctype = ffi.typeof(data_type.."*", unpack(data_params or {}))
for i = 0, Posix.cpu_count() - 1 do
self:_open_perf_buffer(i, callback, ctype)
self:_open_perf_buffer(i, callback, ctype, page_cnt)
end
end
......
......@@ -102,7 +102,7 @@ lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_i
lib.bpf_detach_tracepoint.restype = ct.c_int
lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p]
lib.bpf_open_perf_buffer.restype = ct.c_void_p
lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int]
lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int]
lib.bpf_open_perf_event.restype = ct.c_int
lib.bpf_open_perf_event.argtypes = [ct.c_uint, ct.c_ulonglong, ct.c_int, ct.c_int]
lib.perf_reader_poll.restype = ct.c_int
......
......@@ -507,20 +507,25 @@ class PerfEventArray(ArrayBase):
super(PerfEventArray, self).__delitem__(key)
self.close_perf_buffer(key)
def open_perf_buffer(self, callback):
def open_perf_buffer(self, callback, page_cnt=8):
"""open_perf_buffers(callback)
Opens a set of per-cpu ring buffer to receive custom perf event
data from the bpf program. The callback will be invoked for each
event submitted from the kernel, up to millions per second.
event submitted from the kernel, up to millions per second. Use
page_cnt to change the size of the per-cpu ring buffer. The value
must be a power of two and defaults to 8.
"""
if page_cnt & (page_cnt - 1) != 0:
raise Exception("Perf buffer page_cnt must be a power of two")
for i in get_online_cpus():
self._open_perf_buffer(i, callback)
self._open_perf_buffer(i, callback, page_cnt)
def _open_perf_buffer(self, cpu, callback):
def _open_perf_buffer(self, cpu, callback, page_cnt):
fn = _RAW_CB_TYPE(lambda _, data, size: callback(cpu, data, size))
reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu)
reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu, page_cnt)
if not reader:
raise Exception("Could not open perf buffer")
fd = lib.perf_reader_fd(reader)
......
......@@ -175,9 +175,9 @@ return function(BPF, utils)
uint64_t sector;
uint64_t len;
uint64_t ts;
char disk_name[%d];
char name[%d];
char disk_name[$];
char name[$];
}
]] % {DISK_NAME_LEN, TASK_COMM_LEN})
]], {DISK_NAME_LEN, TASK_COMM_LEN}, 64)
bpf:kprobe_poll_loop()
end
......@@ -182,6 +182,6 @@ def print_event(cpu, data, size):
start_ts = 1
# loop with callback to print_event
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -343,6 +343,6 @@ else:
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -205,7 +205,7 @@ slept = float(0)
trigger = int(0.8 * (1000000000 / frequency))
# read events
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
# allow some buffering by calling sleep(), to reduce the context switch
# rate and lower overhead.
......
......@@ -131,7 +131,7 @@ print("Tracing database queries for pids %s slower than %d ms..." %
(', '.join(map(str, args.pids)), args.threshold))
print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
bpf["events"].open_perf_buffer(print_event)
bpf["events"].open_perf_buffer(print_event, page_cnt=64)
while True:
bpf.kprobe_poll()
......@@ -153,6 +153,6 @@ def print_event(cpu, data, size):
# header
print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE"))
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -337,6 +337,6 @@ else:
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -243,6 +243,6 @@ def print_event(cpu, data, size):
time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
event.sz, ms, name))
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -128,6 +128,6 @@ def print_event(cpu, data, size):
event.pid, float(event.delta) / 1000000, event.query))
# loop with callback to print_event
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -178,6 +178,6 @@ def print_event(cpu, data, size):
event.comm, fd_s, err, event.fname))
# loop with callback to print_event
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -102,6 +102,6 @@ return function(BPF, utils)
bpf:get_table("events"):open_perf_buffer(print_event,
"struct { uint64_t stack_id; uint32_t pid; char comm[$]; }",
TASK_COMM_LEN)
{TASK_COMM_LEN})
bpf:kprobe_poll_loop()
end
......@@ -159,6 +159,6 @@ def print_event(cpu, data, size):
fd_s, err, event.fname))
# loop with callback to print_event
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -354,7 +354,7 @@ print(header_string % ("PID", "COMM",
start_ts = 0
# read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event)
b["ipv6_events"].open_perf_buffer(print_ipv6_event)
b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -29,6 +29,7 @@ class Probe(object):
use_localtime = True
tgid = -1
pid = -1
page_cnt = None
@classmethod
def configure(cls, args):
......@@ -38,6 +39,7 @@ class Probe(object):
cls.first_ts = BPF.monotonic_time()
cls.tgid = args.tgid or -1
cls.pid = args.pid or -1
cls.page_cnt = args.buffer_pages
def __init__(self, probe, string_size, kernel_stack, user_stack):
self.usdt = None
......@@ -510,7 +512,8 @@ BPF_PERF_OUTPUT(%s);
self._attach_u(bpf)
self.python_struct = self._generate_python_data_decl()
callback = partial(self.print_event, bpf)
bpf[self.events_name].open_perf_buffer(callback)
bpf[self.events_name].open_perf_buffer(callback,
page_cnt=self.page_cnt)
def _attach_k(self, bpf):
if self.probe_type == "r":
......@@ -543,6 +546,7 @@ BPF_PERF_OUTPUT(%s);
pid=Probe.tgid)
class Tool(object):
DEFAULT_PERF_BUFFER_PAGES = 64
examples = """
EXAMPLES:
......@@ -577,6 +581,10 @@ trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
"functions and print trace messages.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=Tool.examples)
parser.add_argument("-b", "--buffer-pages", type=int,
default=Tool.DEFAULT_PERF_BUFFER_PAGES,
help="number of pages to use for perf_events ring buffer "
"(default: %(default)d)")
# we'll refer to the userspace concepts of "pid" and "tid" by
# their kernel names -- tgid and pid -- inside the script
parser.add_argument("-p", "--pid", type=int, metavar="PID",
......
......@@ -201,11 +201,28 @@ In this example, we traced the "ls ~" command as it was opening its shared
libraries and then accessing the /home/vagrant directory listing.
Lastly, if a high-frequency event is traced you may overflow the perf ring
buffer. This shows as "Lost N samples":
# trace sys_open
5087 5087 pgrep sys_open
5087 5087 pgrep sys_open
5087 5087 pgrep sys_open
5087 5087 pgrep sys_open
5087 5087 pgrep sys_open
Lost 764896 samples
Lost 764896 samples
Lost 764896 samples
The perf ring buffer size can be changed with -b. The value is the size of each
per-CPU buffer, measured in pages. It must be a power of two and defaults to
64 pages.
USAGE message:
# trace -h
usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S]
[-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
usage: trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE]
[-S] [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
probe [probe ...]
Attach to functions and print trace messages.
......@@ -215,6 +232,9 @@ positional arguments:
optional arguments:
-h, --help show this help message and exit
-b BUFFER_PAGES, --buffer-pages BUFFER_PAGES
number of pages to use for perf_events ring buffer
(default: 64)
-p PID, --pid PID id of the process to trace (optional)
-L TID, --tid TID id of the thread to trace (optional)
-v, --verbose print resulting BPF program code before executing
......@@ -224,7 +244,7 @@ optional arguments:
-M MAX_EVENTS, --max-events MAX_EVENTS
number of events to print before quitting
-t, --timestamp print timestamp column (offset from trace start)
-T, --time print time column
-T, --time print time column
-K, --kernel-stack output kernel stack trace
-U, --user-stack output user stack trace
-I header, --include header
......@@ -247,9 +267,9 @@ trace 'c:malloc "size = %d", arg1'
Trace malloc calls and print the size being allocated
trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
Trace the write() call from libc to monitor writes to STDOUT
trace 'r::__kmalloc (retval == 0) "kmalloc failed!"
trace 'r::__kmalloc (retval == 0) "kmalloc failed!"'
Trace returns from __kmalloc which returned a null pointer
trace 'r:c:malloc (retval) "allocated = %x", retval
trace 'r:c:malloc (retval) "allocated = %x", retval'
Trace returns from malloc and print non-NULL allocated buffers
trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
Trace the block_rq_complete kernel tracepoint and print # of tx sectors
......
......@@ -293,6 +293,6 @@ else:
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
......@@ -297,6 +297,6 @@ else:
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events
b["events"].open_perf_buffer(print_event)
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment