Commit 5f5687e4 authored by Mark Drayton, committed by mark

Make perf ring buffer size configurable

As discussed in #966, this PR makes the size of the ring buffer used to send
data to userspace configurable. It changes the Python, Lua and C++ APIs to
expose this knob.

It also defaults the buffer size to a larger value (64 pages per CPU, an 8x
increase) for several tools which produce a lot of output, as well as making it
configurable in `trace` via a `-b` flag.
parent 02884a02
...@@ -865,9 +865,9 @@ These are equivalent. ...@@ -865,9 +865,9 @@ These are equivalent.
### 2. open_perf_buffer() ### 2. open_perf_buffer()
Syntax: ```table.open_perf_buffers(callback)``` Syntax: ```table.open_perf_buffers(callback, page_cnt=N)```
This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space. This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space. The size of the perf ring buffer can be specified via the ```page_cnt``` parameter, which must be a power of two number of pages and defaults to 8.
Example: Example:
......
...@@ -24,7 +24,7 @@ return function(BPF) ...@@ -24,7 +24,7 @@ return function(BPF)
print("%-9s %-6d %s" % {os.date("%H:%M:%S"), tonumber(event.pid), ffi.string(event.str)}) print("%-9s %-6d %s" % {os.date("%H:%M:%S"), tonumber(event.pid), ffi.string(event.str)})
end end
b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }") b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }", nil)
print("%-9s %-6s %s" % {"TIME", "PID", "COMMAND"}) print("%-9s %-6s %s" % {"TIME", "PID", "COMMAND"})
b:kprobe_poll_loop() b:kprobe_poll_loop()
......
...@@ -392,11 +392,14 @@ StatusTuple BPF::detach_perf_event(uint32_t ev_type, uint32_t ev_config) { ...@@ -392,11 +392,14 @@ StatusTuple BPF::detach_perf_event(uint32_t ev_type, uint32_t ev_config) {
} }
StatusTuple BPF::open_perf_buffer(const std::string& name, StatusTuple BPF::open_perf_buffer(const std::string& name,
perf_reader_raw_cb cb, void* cb_cookie) { perf_reader_raw_cb cb, void* cb_cookie,
int page_cnt) {
if (perf_buffers_.find(name) == perf_buffers_.end()) if (perf_buffers_.find(name) == perf_buffers_.end())
perf_buffers_[name] = new BPFPerfBuffer(bpf_module_.get(), name); perf_buffers_[name] = new BPFPerfBuffer(bpf_module_.get(), name);
if ((page_cnt & (page_cnt - 1)) != 0)
return StatusTuple(-1, "open_perf_buffer page_cnt must be a power of two");
auto table = perf_buffers_[name]; auto table = perf_buffers_[name];
TRY2(table->open_all_cpu(cb, cb_cookie)); TRY2(table->open_all_cpu(cb, cb_cookie, page_cnt));
return StatusTuple(0); return StatusTuple(0);
} }
......
...@@ -27,6 +27,8 @@ ...@@ -27,6 +27,8 @@
#include "compat/linux/bpf.h" #include "compat/linux/bpf.h"
#include "libbpf.h" #include "libbpf.h"
static const int DEFAULT_PERF_BUFFER_PAGE_CNT = 8;
namespace ebpf { namespace ebpf {
struct open_probe_t { struct open_probe_t {
...@@ -96,7 +98,8 @@ public: ...@@ -96,7 +98,8 @@ public:
} }
StatusTuple open_perf_buffer(const std::string& name, perf_reader_raw_cb cb, StatusTuple open_perf_buffer(const std::string& name, perf_reader_raw_cb cb,
void* cb_cookie = nullptr); void* cb_cookie = nullptr,
int page_cnt = DEFAULT_PERF_BUFFER_PAGE_CNT);
StatusTuple close_perf_buffer(const std::string& name); StatusTuple close_perf_buffer(const std::string& name);
void poll_perf_buffer(const std::string& name, int timeout = -1); void poll_perf_buffer(const std::string& name, int timeout = -1);
......
...@@ -67,11 +67,11 @@ std::vector<std::string> BPFStackTable::get_stack_symbol(int stack_id, ...@@ -67,11 +67,11 @@ std::vector<std::string> BPFStackTable::get_stack_symbol(int stack_id,
} }
StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu, StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu,
void* cb_cookie) { void* cb_cookie, int page_cnt) {
if (cpu_readers_.find(cpu) != cpu_readers_.end()) if (cpu_readers_.find(cpu) != cpu_readers_.end())
return StatusTuple(-1, "Perf buffer already open on CPU %d", cpu); return StatusTuple(-1, "Perf buffer already open on CPU %d", cpu);
auto reader = auto reader =
static_cast<perf_reader*>(bpf_open_perf_buffer(cb, cb_cookie, -1, cpu)); static_cast<perf_reader*>(bpf_open_perf_buffer(cb, cb_cookie, -1, cpu, page_cnt));
if (reader == nullptr) if (reader == nullptr)
return StatusTuple(-1, "Unable to construct perf reader"); return StatusTuple(-1, "Unable to construct perf reader");
int reader_fd = perf_reader_fd(reader); int reader_fd = perf_reader_fd(reader);
...@@ -86,12 +86,12 @@ StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu, ...@@ -86,12 +86,12 @@ StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu,
} }
StatusTuple BPFPerfBuffer::open_all_cpu(perf_reader_raw_cb cb, StatusTuple BPFPerfBuffer::open_all_cpu(perf_reader_raw_cb cb,
void* cb_cookie) { void* cb_cookie, int page_cnt) {
if (cpu_readers_.size() != 0 || readers_.size() != 0) if (cpu_readers_.size() != 0 || readers_.size() != 0)
return StatusTuple(-1, "Previously opened perf buffer not cleaned"); return StatusTuple(-1, "Previously opened perf buffer not cleaned");
for (int i: get_online_cpus()) { for (int i: get_online_cpus()) {
auto res = open_on_cpu(cb, i, cb_cookie); auto res = open_on_cpu(cb, i, cb_cookie, page_cnt);
if (res.code() != 0) { if (res.code() != 0) {
TRY2(close_all_cpu()); TRY2(close_all_cpu());
return res; return res;
......
...@@ -126,12 +126,14 @@ public: ...@@ -126,12 +126,14 @@ public:
: BPFTableBase<int, int>(bpf_module, name) {} : BPFTableBase<int, int>(bpf_module, name) {}
~BPFPerfBuffer(); ~BPFPerfBuffer();
StatusTuple open_all_cpu(perf_reader_raw_cb cb, void* cb_cookie); StatusTuple open_all_cpu(perf_reader_raw_cb cb, void* cb_cookie,
int page_cnt);
StatusTuple close_all_cpu(); StatusTuple close_all_cpu();
void poll(int timeout); void poll(int timeout);
private: private:
StatusTuple open_on_cpu(perf_reader_raw_cb cb, int cpu, void* cb_cookie); StatusTuple open_on_cpu(perf_reader_raw_cb cb, int cpu, void* cb_cookie,
int page_cnt);
StatusTuple close_on_cpu(int cpu); StatusTuple close_on_cpu(int cpu);
std::map<int, perf_reader*> cpu_readers_; std::map<int, perf_reader*> cpu_readers_;
......
...@@ -65,6 +65,8 @@ ...@@ -65,6 +65,8 @@
#define PERF_FLAG_FD_CLOEXEC (1UL << 3) #define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif #endif
static int probe_perf_reader_page_cnt = 8;
static __u64 ptr_to_u64(void *ptr) static __u64 ptr_to_u64(void *ptr)
{ {
return (__u64) (unsigned long) ptr; return (__u64) (unsigned long) ptr;
...@@ -351,7 +353,7 @@ void * bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, con ...@@ -351,7 +353,7 @@ void * bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, con
int n; int n;
snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid()); snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
reader = perf_reader_new(cb, NULL, cb_cookie); reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
if (!reader) if (!reader)
goto error; goto error;
...@@ -411,7 +413,7 @@ void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, con ...@@ -411,7 +413,7 @@ void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, con
int n; int n;
snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid()); snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
reader = perf_reader_new(cb, NULL, cb_cookie); reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
if (!reader) if (!reader)
goto error; goto error;
...@@ -493,7 +495,7 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category, ...@@ -493,7 +495,7 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category,
char buf[256]; char buf[256];
struct perf_reader *reader = NULL; struct perf_reader *reader = NULL;
reader = perf_reader_new(cb, NULL, cb_cookie); reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
if (!reader) if (!reader)
goto error; goto error;
...@@ -515,12 +517,13 @@ int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) { ...@@ -515,12 +517,13 @@ int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
return 0; return 0;
} }
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu) { void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid,
int cpu, int page_cnt) {
int pfd; int pfd;
struct perf_event_attr attr = {}; struct perf_event_attr attr = {};
struct perf_reader *reader = NULL; struct perf_reader *reader = NULL;
reader = perf_reader_new(NULL, raw_cb, cb_cookie); reader = perf_reader_new(NULL, raw_cb, cb_cookie, page_cnt);
if (!reader) if (!reader)
goto error; goto error;
......
...@@ -68,7 +68,8 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category, ...@@ -68,7 +68,8 @@ void * bpf_attach_tracepoint(int progfd, const char *tp_category,
int group_fd, perf_reader_cb cb, void *cb_cookie); int group_fd, perf_reader_cb cb, void *cb_cookie);
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name); int bpf_detach_tracepoint(const char *tp_category, const char *tp_name);
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu); void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid,
int cpu, int page_cnt);
/* attached a prog expressed by progfd to the device specified in dev_name */ /* attached a prog expressed by progfd to the device specified in dev_name */
int bpf_attach_xdp(const char *dev_name, int progfd); int bpf_attach_xdp(const char *dev_name, int progfd);
......
...@@ -26,8 +26,6 @@ ...@@ -26,8 +26,6 @@
#include "libbpf.h" #include "libbpf.h"
#include "perf_reader.h" #include "perf_reader.h"
int perf_reader_page_cnt = 8;
struct perf_reader { struct perf_reader {
perf_reader_cb cb; perf_reader_cb cb;
perf_reader_raw_cb raw_cb; perf_reader_raw_cb raw_cb;
...@@ -42,7 +40,8 @@ struct perf_reader { ...@@ -42,7 +40,8 @@ struct perf_reader {
uint64_t sample_type; uint64_t sample_type;
}; };
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie) { struct perf_reader * perf_reader_new(perf_reader_cb cb,
perf_reader_raw_cb raw_cb, void *cb_cookie, int page_cnt) {
struct perf_reader *reader = calloc(1, sizeof(struct perf_reader)); struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
if (!reader) if (!reader)
return NULL; return NULL;
...@@ -51,7 +50,7 @@ struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_c ...@@ -51,7 +50,7 @@ struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_c
reader->cb_cookie = cb_cookie; reader->cb_cookie = cb_cookie;
reader->fd = -1; reader->fd = -1;
reader->page_size = getpagesize(); reader->page_size = getpagesize();
reader->page_cnt = perf_reader_page_cnt; reader->page_cnt = page_cnt;
return reader; return reader;
} }
......
...@@ -25,7 +25,8 @@ extern "C" { ...@@ -25,7 +25,8 @@ extern "C" {
struct perf_reader; struct perf_reader;
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie); struct perf_reader * perf_reader_new(perf_reader_cb cb,
perf_reader_raw_cb raw_cb, void *cb_cookie, int page_cnt);
void perf_reader_free(void *ptr); void perf_reader_free(void *ptr);
int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type); int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type);
int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout); int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
......
...@@ -54,7 +54,7 @@ void * bpf_attach_uprobe(int progfd, int attach_type, const char *ev_name, ...@@ -54,7 +54,7 @@ void * bpf_attach_uprobe(int progfd, int attach_type, const char *ev_name,
int bpf_detach_uprobe(const char *ev_name); int bpf_detach_uprobe(const char *ev_name);
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu); void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu, int page_cnt);
]] ]]
ffi.cdef[[ ffi.cdef[[
......
...@@ -243,13 +243,14 @@ local function _perf_id(id, cpu) ...@@ -243,13 +243,14 @@ local function _perf_id(id, cpu)
return string.format("bcc:perf_event_array:%d:%d", tonumber(id), cpu or 0) return string.format("bcc:perf_event_array:%d:%d", tonumber(id), cpu or 0)
end end
function PerfEventArray:_open_perf_buffer(cpu, callback, ctype) function PerfEventArray:_open_perf_buffer(cpu, callback, ctype, page_cnt)
local _cb = ffi.cast("perf_reader_raw_cb", local _cb = ffi.cast("perf_reader_raw_cb",
function (cookie, data, size) function (cookie, data, size)
callback(cpu, ctype(data)[0]) callback(cpu, ctype(data)[0])
end) end)
local reader = libbcc.bpf_open_perf_buffer(_cb, nil, -1, cpu) -- default to 8 pages per buffer
local reader = libbcc.bpf_open_perf_buffer(_cb, nil, -1, cpu, page_cnt or 8)
assert(reader, "failed to open perf buffer") assert(reader, "failed to open perf buffer")
local fd = libbcc.perf_reader_fd(reader) local fd = libbcc.perf_reader_fd(reader)
...@@ -258,11 +259,11 @@ function PerfEventArray:_open_perf_buffer(cpu, callback, ctype) ...@@ -258,11 +259,11 @@ function PerfEventArray:_open_perf_buffer(cpu, callback, ctype)
self._callbacks[cpu] = _cb self._callbacks[cpu] = _cb
end end
function PerfEventArray:open_perf_buffer(callback, data_type, ...) function PerfEventArray:open_perf_buffer(callback, data_type, data_params, page_cnt)
assert(data_type, "a data type is needed for callback conversion") assert(data_type, "a data type is needed for callback conversion")
local ctype = ffi.typeof(data_type.."*", ...) local ctype = ffi.typeof(data_type.."*", unpack(data_params or {}))
for i = 0, Posix.cpu_count() - 1 do for i = 0, Posix.cpu_count() - 1 do
self:_open_perf_buffer(i, callback, ctype) self:_open_perf_buffer(i, callback, ctype, page_cnt)
end end
end end
......
...@@ -102,7 +102,7 @@ lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_i ...@@ -102,7 +102,7 @@ lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_i
lib.bpf_detach_tracepoint.restype = ct.c_int lib.bpf_detach_tracepoint.restype = ct.c_int
lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p] lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p]
lib.bpf_open_perf_buffer.restype = ct.c_void_p lib.bpf_open_perf_buffer.restype = ct.c_void_p
lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int] lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int]
lib.bpf_open_perf_event.restype = ct.c_int lib.bpf_open_perf_event.restype = ct.c_int
lib.bpf_open_perf_event.argtypes = [ct.c_uint, ct.c_ulonglong, ct.c_int, ct.c_int] lib.bpf_open_perf_event.argtypes = [ct.c_uint, ct.c_ulonglong, ct.c_int, ct.c_int]
lib.perf_reader_poll.restype = ct.c_int lib.perf_reader_poll.restype = ct.c_int
......
...@@ -507,20 +507,25 @@ class PerfEventArray(ArrayBase): ...@@ -507,20 +507,25 @@ class PerfEventArray(ArrayBase):
super(PerfEventArray, self).__delitem__(key) super(PerfEventArray, self).__delitem__(key)
self.close_perf_buffer(key) self.close_perf_buffer(key)
def open_perf_buffer(self, callback): def open_perf_buffer(self, callback, page_cnt=8):
"""open_perf_buffers(callback) """open_perf_buffers(callback)
Opens a set of per-cpu ring buffer to receive custom perf event Opens a set of per-cpu ring buffer to receive custom perf event
data from the bpf program. The callback will be invoked for each data from the bpf program. The callback will be invoked for each
event submitted from the kernel, up to millions per second. event submitted from the kernel, up to millions per second. Use
page_cnt to change the size of the per-cpu ring buffer. The value
must be a power of two and defaults to 8.
""" """
if page_cnt & (page_cnt - 1) != 0:
raise Exception("Perf buffer page_cnt must be a power of two")
for i in get_online_cpus(): for i in get_online_cpus():
self._open_perf_buffer(i, callback) self._open_perf_buffer(i, callback, page_cnt)
def _open_perf_buffer(self, cpu, callback): def _open_perf_buffer(self, cpu, callback, page_cnt):
fn = _RAW_CB_TYPE(lambda _, data, size: callback(cpu, data, size)) fn = _RAW_CB_TYPE(lambda _, data, size: callback(cpu, data, size))
reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu) reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu, page_cnt)
if not reader: if not reader:
raise Exception("Could not open perf buffer") raise Exception("Could not open perf buffer")
fd = lib.perf_reader_fd(reader) fd = lib.perf_reader_fd(reader)
......
...@@ -175,9 +175,9 @@ return function(BPF, utils) ...@@ -175,9 +175,9 @@ return function(BPF, utils)
uint64_t sector; uint64_t sector;
uint64_t len; uint64_t len;
uint64_t ts; uint64_t ts;
char disk_name[%d]; char disk_name[$];
char name[%d]; char name[$];
} }
]] % {DISK_NAME_LEN, TASK_COMM_LEN}) ]], {DISK_NAME_LEN, TASK_COMM_LEN}, 64)
bpf:kprobe_poll_loop() bpf:kprobe_poll_loop()
end end
...@@ -182,6 +182,6 @@ def print_event(cpu, data, size): ...@@ -182,6 +182,6 @@ def print_event(cpu, data, size):
start_ts = 1 start_ts = 1
# loop with callback to print_event # loop with callback to print_event
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -343,6 +343,6 @@ else: ...@@ -343,6 +343,6 @@ else:
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events # read events
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -205,7 +205,7 @@ slept = float(0) ...@@ -205,7 +205,7 @@ slept = float(0)
trigger = int(0.8 * (1000000000 / frequency)) trigger = int(0.8 * (1000000000 / frequency))
# read events # read events
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
# allow some buffering by calling sleep(), to reduce the context switch # allow some buffering by calling sleep(), to reduce the context switch
# rate and lower overhead. # rate and lower overhead.
......
...@@ -131,7 +131,7 @@ print("Tracing database queries for pids %s slower than %d ms..." % ...@@ -131,7 +131,7 @@ print("Tracing database queries for pids %s slower than %d ms..." %
(', '.join(map(str, args.pids)), args.threshold)) (', '.join(map(str, args.pids)), args.threshold))
print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY")) print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
bpf["events"].open_perf_buffer(print_event) bpf["events"].open_perf_buffer(print_event, page_cnt=64)
while True: while True:
bpf.kprobe_poll() bpf.kprobe_poll()
...@@ -153,6 +153,6 @@ def print_event(cpu, data, size): ...@@ -153,6 +153,6 @@ def print_event(cpu, data, size):
# header # header
print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE")) print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE"))
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -337,6 +337,6 @@ else: ...@@ -337,6 +337,6 @@ else:
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events # read events
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -243,6 +243,6 @@ def print_event(cpu, data, size): ...@@ -243,6 +243,6 @@ def print_event(cpu, data, size):
time.time() - start_ts, event.comm, event.pid, mode_s[event.mode], time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
event.sz, ms, name)) event.sz, ms, name))
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -128,6 +128,6 @@ def print_event(cpu, data, size): ...@@ -128,6 +128,6 @@ def print_event(cpu, data, size):
event.pid, float(event.delta) / 1000000, event.query)) event.pid, float(event.delta) / 1000000, event.query))
# loop with callback to print_event # loop with callback to print_event
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -178,6 +178,6 @@ def print_event(cpu, data, size): ...@@ -178,6 +178,6 @@ def print_event(cpu, data, size):
event.comm, fd_s, err, event.fname)) event.comm, fd_s, err, event.fname))
# loop with callback to print_event # loop with callback to print_event
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -102,6 +102,6 @@ return function(BPF, utils) ...@@ -102,6 +102,6 @@ return function(BPF, utils)
bpf:get_table("events"):open_perf_buffer(print_event, bpf:get_table("events"):open_perf_buffer(print_event,
"struct { uint64_t stack_id; uint32_t pid; char comm[$]; }", "struct { uint64_t stack_id; uint32_t pid; char comm[$]; }",
TASK_COMM_LEN) {TASK_COMM_LEN})
bpf:kprobe_poll_loop() bpf:kprobe_poll_loop()
end end
...@@ -159,6 +159,6 @@ def print_event(cpu, data, size): ...@@ -159,6 +159,6 @@ def print_event(cpu, data, size):
fd_s, err, event.fname)) fd_s, err, event.fname))
# loop with callback to print_event # loop with callback to print_event
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -354,7 +354,7 @@ print(header_string % ("PID", "COMM", ...@@ -354,7 +354,7 @@ print(header_string % ("PID", "COMM",
start_ts = 0 start_ts = 0
# read events # read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event) b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
b["ipv6_events"].open_perf_buffer(print_ipv6_event) b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -29,6 +29,7 @@ class Probe(object): ...@@ -29,6 +29,7 @@ class Probe(object):
use_localtime = True use_localtime = True
tgid = -1 tgid = -1
pid = -1 pid = -1
page_cnt = None
@classmethod @classmethod
def configure(cls, args): def configure(cls, args):
...@@ -38,6 +39,7 @@ class Probe(object): ...@@ -38,6 +39,7 @@ class Probe(object):
cls.first_ts = BPF.monotonic_time() cls.first_ts = BPF.monotonic_time()
cls.tgid = args.tgid or -1 cls.tgid = args.tgid or -1
cls.pid = args.pid or -1 cls.pid = args.pid or -1
cls.page_cnt = args.buffer_pages
def __init__(self, probe, string_size, kernel_stack, user_stack): def __init__(self, probe, string_size, kernel_stack, user_stack):
self.usdt = None self.usdt = None
...@@ -510,7 +512,8 @@ BPF_PERF_OUTPUT(%s); ...@@ -510,7 +512,8 @@ BPF_PERF_OUTPUT(%s);
self._attach_u(bpf) self._attach_u(bpf)
self.python_struct = self._generate_python_data_decl() self.python_struct = self._generate_python_data_decl()
callback = partial(self.print_event, bpf) callback = partial(self.print_event, bpf)
bpf[self.events_name].open_perf_buffer(callback) bpf[self.events_name].open_perf_buffer(callback,
page_cnt=self.page_cnt)
def _attach_k(self, bpf): def _attach_k(self, bpf):
if self.probe_type == "r": if self.probe_type == "r":
...@@ -543,6 +546,7 @@ BPF_PERF_OUTPUT(%s); ...@@ -543,6 +546,7 @@ BPF_PERF_OUTPUT(%s);
pid=Probe.tgid) pid=Probe.tgid)
class Tool(object): class Tool(object):
DEFAULT_PERF_BUFFER_PAGES = 64
examples = """ examples = """
EXAMPLES: EXAMPLES:
...@@ -577,6 +581,10 @@ trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec' ...@@ -577,6 +581,10 @@ trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
"functions and print trace messages.", "functions and print trace messages.",
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=Tool.examples) epilog=Tool.examples)
parser.add_argument("-b", "--buffer-pages", type=int,
default=Tool.DEFAULT_PERF_BUFFER_PAGES,
help="number of pages to use for perf_events ring buffer "
"(default: %(default)d)")
# we'll refer to the userspace concepts of "pid" and "tid" by # we'll refer to the userspace concepts of "pid" and "tid" by
# their kernel names -- tgid and pid -- inside the script # their kernel names -- tgid and pid -- inside the script
parser.add_argument("-p", "--pid", type=int, metavar="PID", parser.add_argument("-p", "--pid", type=int, metavar="PID",
......
...@@ -201,11 +201,28 @@ In this example, we traced the "ls ~" command as it was opening its shared ...@@ -201,11 +201,28 @@ In this example, we traced the "ls ~" command as it was opening its shared
libraries and then accessing the /home/vagrant directory listing. libraries and then accessing the /home/vagrant directory listing.
Lastly, if a high-frequency event is traced you may overflow the perf ring
buffer. This shows as "Lost N samples":
# trace sys_open
5087 5087 pgrep sys_open
5087 5087 pgrep sys_open
5087 5087 pgrep sys_open
5087 5087 pgrep sys_open
5087 5087 pgrep sys_open
Lost 764896 samples
Lost 764896 samples
Lost 764896 samples
The perf ring buffer size can be changed with -b. The size applies to each per-CPU
buffer and is measured in pages. The value must be a power of two and defaults to
64 pages.
USAGE message: USAGE message:
# trace -h usage: trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE]
usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] [-S] [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
[-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
probe [probe ...] probe [probe ...]
Attach to functions and print trace messages. Attach to functions and print trace messages.
...@@ -215,6 +232,9 @@ positional arguments: ...@@ -215,6 +232,9 @@ positional arguments:
optional arguments: optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
-b BUFFER_PAGES, --buffer-pages BUFFER_PAGES
number of pages to use for perf_events ring buffer
(default: 64)
-p PID, --pid PID id of the process to trace (optional) -p PID, --pid PID id of the process to trace (optional)
-L TID, --tid TID id of the thread to trace (optional) -L TID, --tid TID id of the thread to trace (optional)
-v, --verbose print resulting BPF program code before executing -v, --verbose print resulting BPF program code before executing
...@@ -224,7 +244,7 @@ optional arguments: ...@@ -224,7 +244,7 @@ optional arguments:
-M MAX_EVENTS, --max-events MAX_EVENTS -M MAX_EVENTS, --max-events MAX_EVENTS
number of events to print before quitting number of events to print before quitting
-t, --timestamp print timestamp column (offset from trace start) -t, --timestamp print timestamp column (offset from trace start)
-T, --time print time column -T, --time print time column
-K, --kernel-stack output kernel stack trace -K, --kernel-stack output kernel stack trace
-U, --user-stack output user stack trace -U, --user-stack output user stack trace
-I header, --include header -I header, --include header
...@@ -247,9 +267,9 @@ trace 'c:malloc "size = %d", arg1' ...@@ -247,9 +267,9 @@ trace 'c:malloc "size = %d", arg1'
Trace malloc calls and print the size being allocated Trace malloc calls and print the size being allocated
trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3' trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
Trace the write() call from libc to monitor writes to STDOUT Trace the write() call from libc to monitor writes to STDOUT
trace 'r::__kmalloc (retval == 0) "kmalloc failed!" trace 'r::__kmalloc (retval == 0) "kmalloc failed!"'
Trace returns from __kmalloc which returned a null pointer Trace returns from __kmalloc which returned a null pointer
trace 'r:c:malloc (retval) "allocated = %x", retval trace 'r:c:malloc (retval) "allocated = %x", retval'
Trace returns from malloc and print non-NULL allocated buffers Trace returns from malloc and print non-NULL allocated buffers
trace 't:block:block_rq_complete "sectors=%d", args->nr_sector' trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
Trace the block_rq_complete kernel tracepoint and print # of tx sectors Trace the block_rq_complete kernel tracepoint and print # of tx sectors
......
...@@ -293,6 +293,6 @@ else: ...@@ -293,6 +293,6 @@ else:
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events # read events
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -297,6 +297,6 @@ else: ...@@ -297,6 +297,6 @@ else:
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events # read events
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1: while 1:
b.kprobe_poll() b.kprobe_poll()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment