some bcc examples and tools

48fbc3ea · Brendan Gregg · 5484badb · 48fbc3ea · 48fbc3ea · 48fbc3ea
Commit 48fbc3ea authored Aug 18, 2015 by Brendan Gregg
24 changed files
--- a/examples/bitehist.c
+++ b/examples/bitehist.c
+/*
+ * bitehist.c	Block I/O size histogram.
+ *		For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Based on eBPF sample tracex2 by Alexi Starovoitov.
+ * Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 15-Aug-2015	Brendan Gregg	Created this.
+ */
+#include <uapi/linux/ptrace.h>
+#include "../../linux-4.2-rc5/include/linux/blkdev.h"
+BPF_TABLE("array", int, u64, dist, 64);
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32 + 1;
+	else
+		return log2(v) + 1;
+}
+int do_request(struct pt_regs *ctx, struct request *req)
+{
+	int index = log2l(req->__data_len / 1024);
+	u64 *leaf = dist.lookup(&index);
+	if (leaf) (*leaf)++;
+	return 0;
+}
--- a/examples/bitehist.py
+++ b/examples/bitehist.py
+#!/usr/bin/python
+#
+# bitehist.py	Block I/O size histogram.
+#		For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of using a histogram to show a distribution.
+#
+# USAGE: bitehist.py [interval [count]]
+#
+# The default interval is 5 seconds. A Ctrl-C will print the partially
+# gathered histogram then exit.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Aug-2015	Brendan Gregg	Created this.
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep
+from sys import argv
+def usage():
+	print("USAGE: %s [interval [count]]" % argv[0])
+	exit()
+# arguments
+interval = 5
+count = -1
+if len(argv) > 1:
+	try:
+		interval = int(argv[1])
+		if interval == 0:
+			raise
+		if len(argv) > 2:
+			count = int(argv[2])
+	except:	# also catches -h, --help
+		usage()
+# load BPF program
+b = BPF(src_file = "bitehist.c")
+BPF.attach_kprobe(b.load_func("do_request", BPF.KPROBE), "blk_start_request")
+dist = b.get_table("dist", c_int, c_ulonglong)
+dist_max = 64
+# header
+print("Tracing... Hit Ctrl-C to end.")
+last = {}
+for i in range(1, dist_max + 1):
+	last[i] = 0
+# functions
+stars_max = 38
+def stars(val, val_max, width):
+	i = 0
+	text = ""
+	while (1):
+		if (i > (width * val / val_max) - 1) or (i > width - 1):
+			break
+		text += "*"
+		i += 1
+	if val > val_max:
+		text = text[:-1] + "+"
+	return text
+def print_log2_hist(d, val_type):
+	idx_max = -1
+	val_max = 0
+	for i in range(1, dist_max + 1):
+		try:
+			val = dist[c_int(i)].value - last[i]
+			if (val > 0):
+				idx_max = i
+			if (val > val_max):
+				val_max = val
+		except:
+			break
+	if idx_max > 0:
+		print("     %-15s : count     distribution" % val_type);
+	for i in range(1, idx_max + 1):
+		low = (1 << i) >> 1
+		high = (1 << i) - 1
+		if (low == high):
+			low -= 1
+		try:
+			val = dist[c_int(i)].value - last[i]
+			print("%8d -> %-8d : %-8d |%-*s|" % (low, high, val,
+			    stars_max, stars(val, val_max, stars_max)))
+			last[i] = dist[c_int(i)].value
+		except:
+			break
+# output
+loop = 0
+do_exit = 0
+while (1):
+	if count > 0:
+		loop += 1
+		if loop > count:
+			exit()
+	try:
+		sleep(interval)
+	except KeyboardInterrupt:
+		pass; do_exit = 1
+	print
+	print_log2_hist(dist, "kbytes")
+	if do_exit:
+		exit()
--- a/examples/bitehist_example.txt
+++ b/examples/bitehist_example.txt
+Demonstrations of bitehist.py, the Linux eBPF/bcc version.
+This prints a power-of-2 histogram to show the block I/O size distribution.
+By default, a summary is printed every five seconds:
+# ./bitehist.py
+Tracing... Hit Ctrl-C to end.
+     kbytes          : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 26       |*************                         |
+       8 -> 15       : 3        |*                                     |
+      16 -> 31       : 5        |**                                    |
+      32 -> 63       : 6        |***                                   |
+      64 -> 127      : 7        |***                                   |
+     128 -> 255      : 75       |**************************************|
+     kbytes          : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 83       |**************************************|
+       8 -> 15       : 2        |                                      |
+      16 -> 31       : 6        |**                                    |
+      32 -> 63       : 6        |**                                    |
+      64 -> 127      : 5        |**                                    |
+     128 -> 255      : 21       |*********                             |
+^C
+     kbytes          : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 8        |**************************************|
+The first output shows a bimodal distribution. The largest mode of 75 I/O were
+between 128 and 255 Kbytes in size, and another mode of 26 I/O were between 4
+and 7 Kbytes in size. 
+The next output summary shows the workload is doing more 4 - 7 Kbyte I/O.
+The final output is partial, showing what was measured until Ctrl-C was hit.
+For an output interval of one second, and three summaries:
+# ./bitehist.py 1 3
+Tracing... Hit Ctrl-C to end.
+     kbytes          : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 4        |**************************************|
+     kbytes          : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 5        |**************************************|
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 1        |*******                               |
+     kbytes          : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 4        |**************************************|
+Full usage:
+# ./bitehist.py -h
+USAGE: ./bitehist.py [interval [count]]
--- a/examples/disksnoop.c
+++ b/examples/disksnoop.c
+/*
+ * disksnoop.c	Trace block device I/O: basic version of iosnoop.
+ *		For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Copyright (c) 2015 Brendan Gregg.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 11-Aug-2015	Brendan Gregg	Created this.
+ */
+#include <uapi/linux/ptrace.h>
+#include "../../linux-4.2-rc5/include/linux/blkdev.h"
+struct key_t {
+	struct request *req;
+};
+BPF_TABLE("hash", struct key_t, u64, start, 10240);
+int do_request(struct pt_regs *ctx, struct request *req) {
+	struct key_t key = {};
+	u64 ts;
+	// stash start timestamp by request ptr
+	ts = bpf_ktime_get_ns();
+	key.req = req;
+	start.update(&key, &ts);
+	return 0;
+}
+int do_completion(struct pt_regs *ctx, struct request *req) {
+	struct key_t key = {};
+	u64 *tsp, delta;
+	key.req = req;
+	tsp = start.lookup(&key);
+	if (tsp != 0) {
+		delta = bpf_ktime_get_ns() - *tsp;
+		bpf_trace_printk("%d %x %d\n", req->__data_len,
+		    req->cmd_flags, delta / 1000);
+		start.delete(&key);
+	}
+	return 0;
+}
--- a/examples/disksnoop.py
+++ b/examples/disksnoop.py
+#!/usr/bin/python
+#
+# disksnoop.py	Trace block device I/O: basic version of iosnoop.
+#		For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of tracing latency.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 11-Aug-2015	Brendan Gregg	Created this.
+from bpf import BPF
+import sys
+REQ_WRITE = 1		# from include/linux/blk_types.h
+# load BPF program
+b = BPF(src_file="disksnoop.c")
+BPF.attach_kprobe(b.load_func("do_request", BPF.KPROBE), "blk_start_request")
+BPF.attach_kprobe(b.load_func("do_completion", BPF.KPROBE), "blk_update_request")
+# header
+print "%-18s %-2s %-7s %8s" % ("TIME(s)", "T", "BYTES", "LAT(ms)")
+# open trace pipe
+try:
+	trace = open("/sys/kernel/debug/tracing/trace_pipe", "r")
+except:
+	print >> sys.stderr, "ERROR: opening trace_pipe"
+	exit(1)
+# format output
+while 1:
+	try:
+		line = trace.readline().rstrip()
+	except KeyboardInterrupt:
+		pass; exit()
+	prolog, time_s, colon, bytes_s, flags_s, us_s = \
+		line.rsplit(" ", 5)
+	time_s = time_s[:-1]	# strip trailing ":"
+	flags = int(flags_s, 16)
+	if flags & REQ_WRITE:
+		type_s = "W"
+	elif bytes_s == "0":	# see blk_fill_rwbs() for logic
+		type_s = "M"
+	else:
+		type_s = "R"
+	ms = float(int(us_s, 10)) / 1000
+	print "%-18s %-2s %-7s %8.2f" % (time_s, type_s, bytes_s, ms)
--- a/examples/disksnoop_example.txt
+++ b/examples/disksnoop_example.txt
+Demonstrations of disksnoop.py, the Linux eBPF/bcc version.
+This traces block I/O, a prints a line to summarize each I/O completed:
+# ./disksnoop.py 
+TIME(s)            T  BYTES    LAT(ms)
+16458043.435457    W  4096        2.73
+16458043.435981    W  4096        3.24
+16458043.436012    W  4096        3.13
+16458043.437326    W  4096        4.44
+16458044.126545    R  4096       42.82
+16458044.129872    R  4096        3.24
+16458044.130705    R  4096        0.73
+16458044.142813    R  4096       12.01
+16458044.147302    R  4096        4.33
+16458044.148117    R  4096        0.71
+16458044.148950    R  4096        0.70
+16458044.164332    R  4096       15.29
+16458044.168003    R  4096        3.58
+16458044.171676    R  4096        3.59
+16458044.172453    R  4096        0.72
+16458044.173213    R  4096        0.71
+16458044.173989    R  4096        0.72
+16458044.174739    R  4096        0.70
+16458044.190334    R  4096       15.52
+16458044.196608    R  4096        6.17
+16458044.203091    R  4096        6.35
+The output includes a basic timestamp (in seconds), the type of I/O (W == write,
+R == read, M == metadata), the size of the I/O in bytes, and the latency (or
+duration) of the I/O in milliseconds.
+The latency is measured from I/O request to the device, to the device
+completion. This excludes latency spent queued in the OS.
+Most of the I/O in this example were 0.7 and 4 milliseconds in duration. There
+was an outlier of 42.82 milliseconds, a read which followed many writes (the
+high latency may have been caused by the writes still being serviced on the
+storage device).
--- a/examples/vfsreadlat.c
+++ b/examples/vfsreadlat.c
+/*
+ * vfsreadlat.c		VFS read latency distribution.
+ *			For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Based on eBPF sample tracex2 by Alexi Starovoitov.
+ * Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 15-Aug-2015	Brendan Gregg	Created this.
+ */
+#include <uapi/linux/ptrace.h>
+struct key_t {
+	u32 pid;
+};
+BPF_TABLE("hash", struct key_t, u64, start, 10240);
+BPF_TABLE("array", int, u64, dist, 64);
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32 + 1;
+	else
+		return log2(v) + 1;
+}
+int do_entry(struct pt_regs *ctx)
+{
+	struct key_t key = {};
+	u64 ts, *val, zero = 0;
+	key.pid = bpf_get_current_pid_tgid();
+	ts = bpf_ktime_get_ns();
+	start.update(&key, &ts);
+	return 0;
+}
+int do_return(struct pt_regs *ctx)
+{
+	struct key_t key = {};
+	u64 *tsp, delta;
+	key.pid = bpf_get_current_pid_tgid();
+	tsp = start.lookup(&key);
+	if (tsp != 0) {
+		delta = bpf_ktime_get_ns() - *tsp;
+		int index = log2l(delta / 1000);
+		u64 *leaf = dist.lookup(&index);
+		if (leaf) (*leaf)++;
+		start.delete(&key);
+	}
+	return 0;
+}
--- a/examples/vfsreadlat.py
+++ b/examples/vfsreadlat.py
+#!/usr/bin/python
+#
+# vfsreadlat.py		VFS read latency distribution.
+#			For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of a function latency distribution histogram.
+#
+# USAGE: vfsreadlat.py [interval [count]]
+#
+# The default interval is 5 seconds. A Ctrl-C will print the partially
+# gathered histogram then exit.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Aug-2015	Brendan Gregg	Created this.
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep
+from sys import argv
+def usage():
+	print("USAGE: %s [interval [count]]" % argv[0])
+	exit()
+# arguments
+interval = 5
+count = -1
+if len(argv) > 1:
+	try:
+		interval = int(argv[1])
+		if interval == 0:
+			raise
+		if len(argv) > 2:
+			count = int(argv[2])
+	except:	# also catches -h, --help
+		usage()
+# load BPF program
+b = BPF(src_file = "vfsreadlat.c")
+BPF.attach_kprobe(b.load_func("do_entry", BPF.KPROBE), "vfs_read")
+BPF.attach_kretprobe(b.load_func("do_return", BPF.KPROBE), "vfs_read")
+dist = b.get_table("dist", c_int, c_ulonglong)
+dist_max = 64
+# header
+print("Tracing... Hit Ctrl-C to end.")
+last = {}
+for i in range(1, dist_max + 1):
+	last[i] = 0
+# functions
+stars_max = 38
+def stars(val, val_max, width):
+	i = 0
+	text = ""
+	while (1):
+		if (i > (width * val / val_max) - 1) or (i > width - 1):
+			break
+		text += "*"
+		i += 1
+	if val > val_max:
+		text = text[:-1] + "+"
+	return text
+def print_log2_hist(d, val_type):
+	idx_max = -1
+	val_max = 0
+	for i in range(1, dist_max + 1):
+		try:
+			val = dist[c_int(i)].value - last[i]
+			if (val > 0):
+				idx_max = i
+			if (val > val_max):
+				val_max = val
+		except:
+			break
+	if idx_max > 0:
+		print("     %-15s : count     distribution" % val_type);
+	for i in range(1, idx_max + 1):
+		low = (1 << i) >> 1
+		high = (1 << i) - 1
+		if (low == high):
+			low -= 1
+		try:
+			val = dist[c_int(i)].value - last[i]
+			print("%8d -> %-8d : %-8d |%-*s|" % (low, high, val,
+			    stars_max, stars(val, val_max, stars_max)))
+			last[i] = dist[c_int(i)].value
+		except:
+			break
+# output
+loop = 0
+do_exit = 0
+while (1):
+	if count > 0:
+		loop += 1
+		if loop > count:
+			exit()
+	try:
+		sleep(interval)
+	except KeyboardInterrupt:
+		pass; do_exit = 1
+	print
+	print_log2_hist(dist, "usecs")
+	if do_exit:
+		exit()
--- a/examples/vfsreadlat_example.txt
+++ b/examples/vfsreadlat_example.txt
+Demonstrations of vfsreadlat.py, the Linux eBPF/bcc version.
+This example traces the latency of vfs_read (time from call to return), printing 
+it as a histogram distribution. By default, output is every five seconds:
+# ./vfsreadlat.py 
+Tracing... Hit Ctrl-C to end.
+     usecs           : count     distribution
+       0 -> 1        : 4457     |*************************************+|
+       2 -> 3        : 447      |***                                   |
+       4 -> 7        : 2059     |*****************                     |
+       8 -> 15       : 1179     |**********                            |
+      16 -> 31       : 63       |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 2        |                                      |
+     128 -> 255      : 0        |                                      |
+     256 -> 511      : 3        |                                      |
+     512 -> 1023     : 1        |                                      |
+    1024 -> 2047     : 3        |                                      |
+    2048 -> 4095     : 2        |                                      |
+    4096 -> 8191     : 0        |                                      |
+    8192 -> 16383    : 0        |                                      |
+   16384 -> 32767    : 0        |                                      |
+   32768 -> 65535    : 0        |                                      |
+   65536 -> 131071   : 4        |                                      |
+  131072 -> 262143   : 2        |                                      |
+  262144 -> 524287   : 0        |                                      |
+  524288 -> 1048575  : 4        |                                      |
+^C
+     usecs           : count     distribution
+       0 -> 1        : 241      |*************************************+|
+       2 -> 3        : 17       |**                                    |
+       4 -> 7        : 2        |                                      |
+       8 -> 15       : 4        |                                      |
+      16 -> 31       : 2        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 0        |                                      |
+     256 -> 511      : 1        |                                      |
+     512 -> 1023     : 1        |                                      |
+    1024 -> 2047     : 0        |                                      |
+    2048 -> 4095     : 1        |                                      |
+    4096 -> 8191     : 0        |                                      |
+    8192 -> 16383    : 0        |                                      |
+   16384 -> 32767    : 0        |                                      |
+   32768 -> 65535    : 0        |                                      |
+   65536 -> 131071   : 0        |                                      |
+  131072 -> 262143   : 0        |                                      |
+  262144 -> 524287   : 0        |                                      |
+  524288 -> 1048575  : 1        |                                      |
+These examples show outliers in the 524 - 1048 milliseconds range. Since
+vfs_read() will catch many types of events, this could be anything including
+keystroke latency on ssh sessions. Further drilling with bcc will be necessary
+to identify more details.
+Full usage:
+# ./vfsreadlat.py -h
+USAGE: ./vfsreadlat.py [interval [count]]
--- a/man/man8/pidpersec.8
+++ b/man/man8/pidpersec.8
+.TH pidpersec 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+pidpersec \- Count new processes (via fork()). Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B pidpersec
+.SH DESCRIPTION
+pidpersec shows how many new processes were created each second. There
+can be performance issues caused by many short-lived processes, which may not
+be visible in sampling tools like top(1). pidpersec provides one way to
+investigate this behavior.
+This works by tracing the kernel sched_fork() function using dynamic tracing,
+and will need updating to match any changes to this function.
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Count new processes created each second:
+#
+.B pidpersec
+.SH OVERHEAD
+This traces the kernel fork function, and maintains an in-kernel count which is
+read asynchronously from user-space. As the rate of this is generally expected to
+be low (<< 1000/s), the overhead is also expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+top(1)
--- a/man/man8/syncsnoop.8
+++ b/man/man8/syncsnoop.8
+.TH syncsnoop 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+syncsnoop \- Trace sync() syscall. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B syncsnoop
+.SH DESCRIPTION
+syncsnoop traces calls to sync(), which flushes file system buffers to
+storage devices. These calls can cause performance perturbations, and it can
+be useful to know if they are happening and how frequently.
+This works by tracing the kernel sys_sync() function using dynamic tracing, and
+will need updating to match any changes to this function.
+This program is also a basic example of eBPF/bcc.
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Trace calls to sync():
+#
+.B syncsnoop
+.SH FIELDS
+.TP
+TIME(s)
+Time of the call, in seconds.
+.TP
+CALL
+Call traced.
+.SH OVERHEAD
+This traces the kernel sync function and prints output for each event. As the
+rate of this is generally expected to be low (<< 100/s), the overhead is also
+expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+iostat(1)
--- a/man/man8/vfscount.8
+++ b/man/man8/vfscount.8
+.TH vfscount 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+vfscount \- Count some common VFS calls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B vfscount
+.SH DESCRIPTION
+This counts common VFS calls. This can be useful for general workload
+characterization of these operations.
+This works by tracing some kernel vfs functions using dynamic tracing, and will
+need updating to match any changes to these functions. Edit the script to
+customize and add functions to trace, which is easy to do.
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Count some VFS calls until Ctrl-C is hit:
+#
+.B vfscount
+.SH FIELDS
+.TP
+ADDR
+Address of the instruction pointer that was traced (only useful if the FUNC column is suspicious and you would like to double check the translation).
+.TP
+FUNC
+Kernel function name
+.TP
+COUNT
+Number of calls while tracing
+.SH OVERHEAD
+This traces various kernel vfs functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of VFS operations can
+be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+vfsstat(8)
--- a/man/man8/vfsstat.8
+++ b/man/man8/vfsstat.8
+.TH vfsstat 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+vfsstat \- Statistics for some common VFS calls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B vfsstat
+[interval [count]]
+.SH DESCRIPTION
+This traces some common VFS calls and prints per-second summaries. This can
+be useful for general workload characterization, and looking for patterns
+in operation usage over time.
+This works by tracing some kernel vfs functions using dynamic tracing, and will
+need updating to match any changes to these functions. Edit the script to
+customize which functions are traced. Also see vfscount, which is more
+easily customized to trace multiple functions.
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Print summaries each second:
+#
+.B vfsstat
+.TP
+Print output every five seconds, three times:
+#
+.B vfsstat 5 3
+.SH FIELDS
+.TP
+READ/s
+Number of vfs_read() calls as a per-second average.
+.TP
+WRITE/s
+Number of vfs_write() calls as a per-second average.
+.TP
+CREATE/s
+Number of vfs_create() calls as a per-second average.
+.TP
+OPEN/s
+Number of vfs_open() calls as a per-second average.
+.TP
+FSYNC/s
+Number of vfs_fsync() calls as a per-second average.
+.SH OVERHEAD
+This traces various kernel vfs functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of VFS operations can
+be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+vfscount(8)
--- a/tools/pidpersec
+++ b/tools/pidpersec
+#!/usr/bin/python
+#
+# pidpersec	Count new processes (via fork).
+#		For Linux, uses BCC, eBPF. See .c file.
+#
+# USAGE: pidpersec
+#
+# Written as a basic example of counting an event.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 11-Aug-2015	Brendan Gregg	Created this.
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep, strftime
+# load BPF program
+b = BPF(src_file = "pidpersec.c")
+BPF.attach_kprobe(b.load_func("do_count", BPF.KPROBE), "sched_fork")
+stats = b.get_table("stats", c_int, c_ulonglong)
+# stat indexes
+S_COUNT = 1
+# header
+print "Tracing... Ctrl-C to end."
+# output
+last = 0
+while (1):
+	try:
+		sleep(1)
+	except KeyboardInterrupt:
+		pass; exit()
+	print "%s: PIDs/sec: %d" % (strftime("%H:%M:%S"),
+	    (stats[c_int(S_COUNT)].value - last))
+	last = stats[c_int(S_COUNT)].value
--- a/tools/pidpersec.c
+++ b/tools/pidpersec.c
+/*
+ * pidpersec.c	Count new processes (via fork).
+ *		For Linux, uses BCC, eBPF. See the Python front-end.
+ *
+ * USAGE: pidpersec.py
+ *
+ * Copyright (c) 2015 Brendan Gregg.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 11-Aug-2015	Brendan Gregg	Created this.
+ */
+#include <uapi/linux/ptrace.h>
+enum stat_types {
+	S_COUNT = 1,
+	S_MAXSTAT
+};
+BPF_TABLE("array", int, u64, stats, S_MAXSTAT + 1);
+void stats_increment(int key) {
+	u64 *leaf = stats.lookup(&key);
+	if (leaf) (*leaf)++;
+}
+void do_count(struct pt_regs *ctx) { stats_increment(S_COUNT); }
--- a/tools/pidpersec_example.txt
+++ b/tools/pidpersec_example.txt
+Demonstrations of pidpersec, the Linux eBPF/bcc version.
+This shows the number of new processes created per second, measured by tracing
+the kernel fork() routine:
+# ./pidpersec
+Tracing... Ctrl-C to end.
+18:33:06: PIDs/sec: 4
+18:33:07: PIDs/sec: 5
+18:33:08: PIDs/sec: 4
+18:33:09: PIDs/sec: 4
+18:33:10: PIDs/sec: 21
+18:33:11: PIDs/sec: 5
+18:33:12: PIDs/sec: 4
+18:33:13: PIDs/sec: 4
+Each second there are four new processes (this happens to be caused by a
+launcher script that is retrying in a loop, and encountering errors).
+At 18:33:10, I typed "man ls" in another server session, which caused an
+increase in the number of new processes as the necessary commands were run.
--- a/tools/syncsnoop
+++ b/tools/syncsnoop
+#!/usr/bin/python
+#
+# syncsnoop	Trace sync() syscall.
+#		For Linux, uses BCC, eBPF. Embedded C.
+#
+# Written as a basic example of BCC trace & reformat. See
+# examples/hello_world.py for a BCC trace with default output example.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Aug-2015	Brendan Gregg	Created this.
+from bpf import BPF
+import sys
+# load BPF program
+b = BPF(text = """
+int do_sync(void *ctx) {
+	bpf_trace_printk("sync()\\n");
+	return 0;
+};
+""")
+BPF.attach_kprobe(b.load_func("do_sync", BPF.KPROBE), "sys_sync")
+# header
+print "%-18s %s" % ("TIME(s)", "CALL")
+# open trace pipe
+try:
+	trace = open("/sys/kernel/debug/tracing/trace_pipe", "r")
+except:
+	print >> sys.stderr, "ERROR: opening trace_pipe"
+	exit(1)
+# format output
+while 1:
+	try:
+		line = trace.readline().rstrip()
+	except KeyboardInterrupt:
+		pass; exit()
+	prolog, time_s, colon, word = line.rsplit(" ", 3)
+	time_s = time_s[:-1]	# strip trailing ":"
+	print "%-18s %s" % (time_s, word)
--- a/tools/syncsnoop_example.txt
+++ b/tools/syncsnoop_example.txt
+Demonstrations of syncsnoop, the Linux eBPF/bcc version.
+This program traces calls to the kernel sync() routine, with basic timestamps:
+# ./syncsnoop 
+TIME(s)            CALL
+16458148.611952    sync()
+16458151.533709    sync()
+^C
+While tracing, the "sync" command was executed in another server session.
+This can be useful to identify that sync() is being called, and its frequency.
--- a/tools/vfscount
+++ b/tools/vfscount
+#!/usr/bin/python
+#
+# vfscount	Count some VFS calls.
+#		For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of counting functions.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Aug-2015	Brendan Gregg	Created this.
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep, strftime
+from sys import stderr
+# kernel symbol translation
+ksym_addrs = []			# addresses for binary search
+ksym_names = []			# same index as ksym_addrs
+def load_kallsyms():
+	symfile = "/proc/kallsyms"
+	try:
+		syms = open(symfile, "r")
+	except:
+		print >> stderr, "ERROR: reading " + symfile
+		exit()
+	line = syms.readline()
+	for line in iter(syms):
+		cols = line.split()
+		name = cols[2]
+		if name[:4] != "vfs_":	# perf optimization
+			continue
+		addr = int(cols[0], 16)
+		ksym_addrs.append(addr)
+		ksym_names.append(name)
+	syms.close()
+def ksym(addr):
+	start = -1
+	end = len(ksym_addrs)
+	while end != start + 1:
+		mid = (start + end) / 2
+		if addr < ksym_addrs[mid]:
+			end = mid
+		else:
+			start = mid
+	if start == -1:
+		return "[unknown]"
+	return ksym_names[start]
+load_kallsyms()
+# load BPF program
+b = BPF(src_file = "vfscount.c")
+fn = b.load_func("do_count", BPF.KPROBE)
+BPF.attach_kprobe(fn, "vfs_read")
+BPF.attach_kprobe(fn, "vfs_write")
+BPF.attach_kprobe(fn, "vfs_fsync")
+BPF.attach_kprobe(fn, "vfs_open")
+BPF.attach_kprobe(fn, "vfs_create")
+counts = b.get_table("counts")
+# header
+print "Tracing... Ctrl-C to end."
+# output
+try:
+	sleep(99999999)
+except KeyboardInterrupt:
+	pass
+print "\n%-16s %-12s %8s" % ("ADDR", "FUNC", "COUNT")
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+	print "%-16x %-12s %8d" % (k.ip, ksym(k.ip), v.value)
--- a/tools/vfscount.c
+++ b/tools/vfscount.c
+/*
+ * vfscount.c	Count some VFS calls.
+ *		For Linux, uses BCC, eBPF. See the Python front-end.
+ *
+ * USAGE: vfscount.py
+ *
+ * Copyright (c) 2015 Brendan Gregg.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 14-Aug-2015	Brendan Gregg	Created this.
+ */
+#include <uapi/linux/ptrace.h>
+struct key_t {
+	u64 ip;
+};
+BPF_TABLE("hash", struct key_t, u64, counts, 256);
+int do_count(struct pt_regs *ctx) {
+	struct key_t key = {};
+	u64 zero = 0, *val;
+	key.ip = ctx->ip;
+	val = counts.lookup_or_init(&key, &zero);
+	(*val)++;
+	return 0;
+}
--- a/tools/vfscount_example.txt
+++ b/tools/vfscount_example.txt
+Demonstrations of vfscount, the Linux eBPF/bcc version.
+This counts VFS calls, by tracing various kernel calls beginning with "vfs_"
+(edit the script to customize):
+# ./vfscount 
+Tracing... Ctrl-C to end.
+^C
+ADDR             FUNC            COUNT
+ffffffff811f2cc1 vfs_create         24
+ffffffff811e71c1 vfs_write         203
+ffffffff811e6061 vfs_open          765
+ffffffff811e7091 vfs_read         1852
+This can be useful for workload characterization, to see what types of
+operations are in use.
--- a/tools/vfsstat
+++ b/tools/vfsstat
+#!/usr/bin/python
+#
+# vfsstat	Count some VFS calls.
+#		For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of counting multiple events as a stat tool.
+#
+# USAGE: vfsstat [interval [count]]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Aug-2015	Brendan Gregg	Created this.
+from __future__ import print_function
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep, strftime
+from sys import argv
+def usage():
+	print("USAGE: %s [interval [count]]" % argv[0])
+	exit()
+# arguments
+interval = 1
+count = -1
+if len(argv) > 1:
+	try:
+		interval = int(argv[1])
+		if interval == 0:
+			raise
+		if len(argv) > 2:
+			count = int(argv[2])
+	except:	# also catches -h, --help
+		usage()
+# load BPF program
+b = BPF(src_file = "vfsstat.c")
+BPF.attach_kprobe(b.load_func("do_read", BPF.KPROBE), "vfs_read")
+BPF.attach_kprobe(b.load_func("do_write", BPF.KPROBE), "vfs_write")
+BPF.attach_kprobe(b.load_func("do_fsync", BPF.KPROBE), "vfs_fsync")
+BPF.attach_kprobe(b.load_func("do_open", BPF.KPROBE), "vfs_open")
+BPF.attach_kprobe(b.load_func("do_create", BPF.KPROBE), "vfs_create")
+stats = b.get_table("stats", c_int, c_ulonglong)
+# stat column labels and indexes
+stat_types = {
+	"READ" : 1,
+	"WRITE" : 2,
+	"FSYNC" : 3,
+	"OPEN" : 4,
+	"CREATE" : 5
+}
+# header
+print("%-8s  " % "TIME", end="")
+last = {}
+for stype in stat_types.keys():
+	print(" %8s" % (stype + "/s"), end="")
+	idx = stat_types[stype]
+	last[idx] = 0
+print("")
+# output
+i = 0
+while (1):
+	if count > 0:
+		i += 1
+		if i > count:
+			exit()
+	try:
+		sleep(interval)
+	except KeyboardInterrupt:
+		pass; exit()
+	print("%-8s: " % strftime("%H:%M:%S"), end="")
+	# print each statistic as a column
+	for stype in stat_types.keys():
+		idx = stat_types[stype]
+		try:
+			delta = stats[c_int(idx)].value - last[idx]
+			print(" %8d" % (delta / interval), end="")
+			last[idx] = stats[c_int(idx)].value
+		except:
+			print(" %8d" % 0, end="")
+	print("")
--- a/tools/vfsstat.c
+++ b/tools/vfsstat.c
+/*
+ * vfsstat.c	Count some VFS calls.
+ *		For Linux, uses BCC, eBPF. See the Python front-end.
+ *
+ * USAGE: vfsstat.py [interval [count]]
+ *
+ * Copyright (c) 2015 Brendan Gregg.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 14-Aug-2015	Brendan Gregg	Created this.
+ */
+#include <uapi/linux/ptrace.h>
+enum stat_types {
+	S_READ = 1,
+	S_WRITE,
+	S_FSYNC,
+	S_OPEN,
+	S_CREATE,
+	S_MAXSTAT
+};
+BPF_TABLE("array", int, u64, stats, S_MAXSTAT + 1);
+void stats_increment(int key) {
+	u64 *leaf = stats.lookup(&key);
+	if (leaf) (*leaf)++;
+}
+void do_read(struct pt_regs *ctx) { stats_increment(S_READ); }
+void do_write(struct pt_regs *ctx) { stats_increment(S_WRITE); }
+void do_fsync(struct pt_regs *ctx) { stats_increment(S_FSYNC); }
+void do_open(struct pt_regs *ctx) { stats_increment(S_OPEN); }
+void do_create(struct pt_regs *ctx) { stats_increment(S_CREATE); }
--- a/tools/vfsstat_example.txt
+++ b/tools/vfsstat_example.txt
+Demonstrations of vfsstat, the Linux eBPF/bcc version.
+This traces some common VFS calls and prints per-second summaries. By default,
+the output interval is one second:
+# ./vfsstat 
+TIME         READ/s  WRITE/s CREATE/s   OPEN/s  FSYNC/s
+18:35:32:       231       12        4       98        0
+18:35:33:       274       13        4      106        0
+18:35:34:       586       86        4      251        0
+18:35:35:       241       15        4       99        0
+18:35:36:       232       10        4       98        0
+18:35:37:       244       10        4      107        0
+18:35:38:       235       13        4       97        0
+18:35:39:      6749     2633        4     1446        0
+18:35:40:       277       31        4      115        0
+18:35:41:       238       16        6      102        0
+18:35:42:       284       50        8      114        0
+^C
+Here we are using an output interval of five seconds, and printing three output
+lines:
+# ./vfsstat 5 3
+TIME         READ/s  WRITE/s CREATE/s   OPEN/s  FSYNC/s
+18:35:55:       238        8        3      101        0
+18:36:00:       962      233        4      247        0
+18:36:05:       241        8        3      100        0
+Full usage:
+# ./vfsstat -h
+USAGE: ./vfsstat [interval [count]]