Commit 5cd2e0bc authored by Sasha Goldshtein's avatar Sasha Goldshtein

Merge remote-tracking branch 'upstream/master'

parents dc974939 788f18d2
...@@ -67,6 +67,7 @@ Tools: ...@@ -67,6 +67,7 @@ Tools:
- tools/[bashreadline](tools/bashreadline.py): Print entered bash commands system wide. [Examples](tools/bashreadline_example.txt). - tools/[bashreadline](tools/bashreadline.py): Print entered bash commands system wide. [Examples](tools/bashreadline_example.txt).
- tools/[biolatency](tools/biolatency.py): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt). - tools/[biolatency](tools/biolatency.py): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt).
- tools/[biotop](tools/biotop.py): Top for disks: Summarize block device I/O by process. [Examples](tools/biotop_example.txt).
- tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt). - tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt).
- tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt). - tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt).
- tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt). - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt).
......
.TH biotop 8 "2016-02-06" "USER COMMANDS"
.SH NAME
biotop \- Block device (disk) I/O by process top.
.SH SYNOPSIS
.B biotop [\-h] [\-C] [\-r MAXROWS] [interval] [count]
.SH DESCRIPTION
This is top for disks.
This traces block device I/O (disk I/O), and prints a per-process summary every
interval (by default, 1 second). The summary is sorted on the top disk
consumers by throughput (Kbytes). The PID and process name shown are measured
from when the I/O was first created, which usually identifies the responsible
process.
For efficiency, this uses in-kernel eBPF maps to cache process details (PID and
comm) by I/O request, as well as a starting timestamp for calculating I/O
latency, and the final summary.
This works by tracing various kernel blk_*() functions using dynamic tracing,
and will need updating to match any changes to these functions.
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH EXAMPLES
.TP
Summarize block device I/O by process, 1 second screen refresh:
#
.B biotop
.TP
Don't clear the screen:
#
.B biotop -C
.TP
5 second summaries, 10 times only:
#
.B biotop 5 10
.SH FIELDS
.TP
loadavg:
The contents of /proc/loadavg
.TP
PID
Cached process ID, if present. This usually identifies (but isn't guaranteed to
identify) the responsible process for the I/O.
.TP
COMM
Cached process name, if present. This usually identifies (but isn't guaranteed
to identify) the responsible process for the I/O.
.TP
D
Direction: R == read, W == write.
.TP
MAJ
Major device number.
.TP
MIN
Minor device number.
.TP
DISK
Disk device name.
.TP
I/O
Number of I/O during the interval.
.TP
Kbytes
Total Kbytes for these I/O, during the interval.
.TP
AVGms
Average time for the I/O (latency) from the issue to the device, to its
completion, in milliseconds.
.SH OVERHEAD
Since block device I/O usually has a relatively low frequency (< 10,000/s),
the overhead for this tool is expected to be low or negligible. For high IOPS
storage systems, test and quantify before use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH SEE ALSO
biosnoop(8), biolatency(8), iostat(1)
...@@ -148,6 +148,19 @@ static void parse_type(IRBuilder<> &B, vector<Value *> *args, string *fmt, ...@@ -148,6 +148,19 @@ static void parse_type(IRBuilder<> &B, vector<Value *> *args, string *fmt,
*fmt += " "; *fmt += " ";
} }
*fmt += "}"; *fmt += "}";
} else if (ArrayType *at = dyn_cast<ArrayType>(type)) {
*fmt += "[ ";
for (size_t i = 0; i < at->getNumElements(); ++i) {
parse_type(B, args, fmt, at->getElementType(), B.CreateStructGEP(type, out, i), is_writer);
*fmt += " ";
}
*fmt += "]";
} else if (PointerType *pt = dyn_cast<PointerType>(type)) {
*fmt += "0xl";
if (is_writer)
*fmt += "x";
else
*fmt += "i";
} else if (IntegerType *it = dyn_cast<IntegerType>(type)) { } else if (IntegerType *it = dyn_cast<IntegerType>(type)) {
if (is_writer) if (is_writer)
*fmt += "0x"; *fmt += "0x";
......
...@@ -286,6 +286,30 @@ class BPF(object): ...@@ -286,6 +286,30 @@ class BPF(object):
if res < 0: if res < 0:
raise KeyError raise KeyError
# override the MutableMapping's implementation of these since they
# don't handle KeyError nicely
def itervalues(self):
    """Yield the value stored under every key of this mapping.

    A map entry may be deleted between discovering its key and fetching
    its value; such entries are skipped instead of raising KeyError.
    """
    for k in self:
        try:
            current = self[k]
        except KeyError:
            # the entry disappeared after its key was listed
            continue
        yield current
def iteritems(self):
    """Yield a (key, value) pair for every key of this mapping.

    Keys whose lookup raises KeyError (the entry was removed after the
    key was discovered) are silently skipped.
    """
    for k in self:
        try:
            pair = (k, self[k])
        except KeyError:
            # the entry disappeared after its key was listed
            continue
        yield pair
def items(self):
    """Return a list of the (key, value) pairs produced by iteritems()."""
    return list(self.iteritems())
def values(self):
    """Return a list of the values produced by itervalues()."""
    return list(self.itervalues())
def clear(self): def clear(self):
# default clear uses popitem, which can race with the bpf prog # default clear uses popitem, which can race with the bpf prog
for k in self.keys(): for k in self.keys():
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
# Licensed under the Apache License, Version 2.0 (the "License") # Licensed under the Apache License, Version 2.0 (the "License")
from bcc import BPF from bcc import BPF
import ctypes
from unittest import main, TestCase from unittest import main, TestCase
class TestClang(TestCase): class TestClang(TestCase):
...@@ -89,6 +90,22 @@ int foo(void *ctx) { ...@@ -89,6 +90,22 @@ int foo(void *ctx) {
self.assertEqual(l.s.a, 5) self.assertEqual(l.s.a, 5)
self.assertEqual(l.s.b, 6) self.assertEqual(l.s.b, 6)
def test_sscanf_array(self):
    # A leaf holding an array member must format as "[ ... ]" and parse
    # back to the same element values.
    prog = """
BPF_TABLE("hash", int, struct { u32 a[3]; u32 b; }, stats, 10);
"""
    bpf_prog = BPF(text=prog, debug=0)
    table = bpf_prog.get_table("stats")

    key_repr = table.key_sprintf(table.Key(2))
    self.assertEqual(key_repr, b"0x2")

    triple = (ctypes.c_uint * 3)(1, 2, 3)
    leaf_repr = table.leaf_sprintf(table.Leaf(triple, 4))
    self.assertEqual(leaf_repr, b"{ [ 0x1 0x2 0x3 ] 0x4 }")

    # round-trip: scanning the printed form restores every field
    parsed = table.leaf_scanf(leaf_repr)
    for idx, expected in enumerate((1, 2, 3)):
        self.assertEqual(parsed.a[idx], expected)
    self.assertEqual(parsed.b, 4)
def test_iosnoop(self): def test_iosnoop(self):
text = """ text = """
#include <linux/blkdev.h> #include <linux/blkdev.h>
......
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# biotop block device (disk) I/O by process.
# For Linux, uses BCC, eBPF.
#
# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 06-Feb-2016 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
import signal
from subprocess import call
# arguments
examples = """examples:
./biotop # block device I/O top, 1 second refresh
./biotop -C # don't clear the screen
./biotop 5 # 5 second summaries
./biotop 5 10 # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    epilog=examples,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Block device (disk) I/O by process")
parser.add_argument("-C", "--noclear",
    help="don't clear the screen", action="store_true")
parser.add_argument("-r", "--maxrows",
    help="maximum rows to print, default 20", default=20)
parser.add_argument("interval",
    help="output interval, in seconds", nargs="?", default=1)
parser.add_argument("count",
    help="number of outputs", nargs="?", default=99999999)
args = parser.parse_args()

# user-supplied values arrive as strings; the defaults are already ints
interval, countdown, maxrows = (
    int(args.interval), int(args.count), int(args.maxrows))
# store_true yields a bool, so the screen is cleared unless -C was given
clear = not args.noclear

# linux stats
loadavg, diskstats = "/proc/loadavg", "/proc/diskstats"
# signal handler
def signal_ignore(signal, frame):
    """Signal handler that only emits a newline; both arguments are unused."""
    print("")
# load BPF program
#
# The embedded C program keeps three hash maps, all keyed as shown below:
#   start    : struct request * -> timestamp (ns) taken in trace_req_start
#   whobyreq : struct request * -> who_t (pid and comm) saved by
#              trace_pid_start
#   counts   : info_t (pid, type, major, minor, name) -> val_t
#              (bytes, us, io), the per-interval summary read from Python
#
# trace_req_completion computes the latency from the saved timestamp, builds
# the info_t key (pid and name stay zeroed when no who_t was cached), adds
# the request to the matching val_t, then deletes both per-request entries.
#
# bpf_get_current_pid_tgid() packs the tgid into the upper 32 bits and the
# thread id into the lower 32 bits. The PID column is meant to be the process
# ID, so the value is shifted before being stored in the u32 field; a plain
# assignment would silently keep the thread id instead.
b = BPF(text="""
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
// for saving process info by request
struct who_t {
u32 pid;
char name[TASK_COMM_LEN];
};
// the key for the output summary
struct info_t {
u32 pid;
int type;
int major;
int minor;
char name[TASK_COMM_LEN];
};
// the value of the output summary
struct val_t {
u64 bytes;
u64 us;
u32 io;
};
BPF_HASH(start, struct request *);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);
// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
struct who_t who = {};
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
// upper 32 bits hold the process ID (tgid); lower 32 bits are the thread ID
who.pid = bpf_get_current_pid_tgid() >> 32;
whobyreq.update(&req, &who);
}
return 0;
}
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
u64 ts;
ts = bpf_ktime_get_ns();
start.update(&req, &ts);
return 0;
}
// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
u64 *tsp;
// fetch timestamp and calculate delta
tsp = start.lookup(&req);
if (tsp == 0) {
return 0; // missed tracing issue
}
struct who_t *whop;
struct val_t *valp, zero = {};
u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
// setup info_t key
struct info_t info = {};
info.major = req->rq_disk->major;
info.minor = req->rq_disk->first_minor;
info.type = req->cmd_flags & REQ_WRITE;
whop = whobyreq.lookup(&req);
if (whop == 0) {
// missed pid who, save stats as pid 0
valp = counts.lookup_or_init(&info, &zero);
} else {
info.pid = whop->pid;
__builtin_memcpy(&info.name, whop->name, sizeof(info.name));
valp = counts.lookup_or_init(&info, &zero);
}
// save stats
valp->us += delta_us;
valp->bytes += req->__data_len;
valp->io++;
start.delete(&req);
whobyreq.delete(&req);
return 0;
}
""", debug=0)
# attach each kernel function to its probe, in this order; the two
# request-start functions share the same timestamp probe
for _event, _handler in (
        ("blk_account_io_start", "trace_pid_start"),
        ("blk_start_request", "trace_req_start"),
        ("blk_mq_start_request", "trace_req_start"),
        ("blk_account_io_completion", "trace_req_completion")):
    b.attach_kprobe(event=_event, fn_name=_handler)
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
# cache disk major,minor -> diskname
# the first three whitespace-separated fields of each diskstats row are
# used as major, minor and device name
disklookup = {}
with open(diskstats) as stats:
    for fields in (row.split() for row in stats):
        disklookup["%s,%s" % (fields[0], fields[1])] = fields[2]
# output
# Each pass: wait one interval, print the header, print up to maxrows
# summary rows ordered by bytes (largest first), then empty the counts map
# so the next pass starts from zero.
exiting = 0
while True:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # still print the interval in progress before detaching below
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output
    counts = b.get_table("counts")
    line = 0
    for k, v in sorted(counts.items(),
                       key=lambda entry: entry[1].bytes, reverse=True):
        # lookup disk; devices absent from the cached diskstats show as "?"
        diskname = disklookup.get(str(k.major) + "," + str(k.minor), "?")

        # print line
        avg_ms = (float(v.us) / 1000) / v.io
        # decode the comm so Python 3 does not print it as b'...', and use
        # floor division so Kbytes stays an integer on Python 3 as well
        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'),
            "W" if k.type else "R", k.major, k.minor, diskname, v.io,
            v.bytes // 1024, avg_ms))

        line += 1
        if line >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()
Demonstrations of biotop, the Linux eBPF/bcc version.
Short for block device I/O top, biotop summarizes which processes are
performing disk I/O. It's top for disks. Sample output:
# ./biotop
Tracing... Output every 1 secs. Hit Ctrl-C to end
08:04:11 loadavg: 1.48 0.87 0.45 1/287 14547
PID COMM D MAJ MIN DISK I/O Kbytes AVGms
14501 cksum R 202 1 xvda1 361 28832 3.39
6961 dd R 202 1 xvda1 1628 13024 0.59
13855 dd R 202 1 xvda1 1627 13016 0.59
326 jbd2/xvda1-8 W 202 1 xvda1 3 168 3.00
1880 supervise W 202 1 xvda1 2 8 6.71
1873 supervise W 202 1 xvda1 2 8 2.51
1871 supervise W 202 1 xvda1 2 8 1.57
1876 supervise W 202 1 xvda1 2 8 1.22
1892 supervise W 202 1 xvda1 2 8 0.62
1878 supervise W 202 1 xvda1 2 8 0.78
1886 supervise W 202 1 xvda1 2 8 1.30
1894 supervise W 202 1 xvda1 2 8 3.46
1869 supervise W 202 1 xvda1 2 8 0.73
1888 supervise W 202 1 xvda1 2 8 1.48
By default the screen refreshes every 1 second, and shows the top 20 disk
consumers, sorted on total Kbytes. The first line printed is the header,
which has the time and then the contents of /proc/loadavg.
For the interval summarized by the output above, the "cksum" command performed
361 disk reads to the "xvda1" device, for a total of 28832 Kbytes, with an
average I/O time of 3.39 ms. Two "dd" processes were also reading from the
same disk, with a higher I/O rate and lower latency. While the average I/O
size is not printed, it can be determined by dividing the Kbytes column by
the I/O column.
The columns through to Kbytes show the workload applied. The final column,
AVGms, shows resulting performance. Other bcc tools can be used to get more
details when needed: biolatency and biosnoop.
Many years ago I created the original "iotop", and later regretted not calling
it diskiotop or blockiotop, as "io" alone is ambiguous. This time it is biotop.
The -C option can be used to prevent the screen from clearing (my preference).
Here's using it with a 5 second interval:
# ./biotop -C 5
Tracing... Output every 5 secs. Hit Ctrl-C to end
08:09:44 loadavg: 0.42 0.44 0.39 2/282 22115
PID COMM D MAJ MIN DISK I/O Kbytes AVGms
22069 dd R 202 1 xvda1 5993 47976 0.33
326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.67
1866 svscan R 202 1 xvda1 33 132 1.24
1880 supervise W 202 1 xvda1 10 40 0.56
1873 supervise W 202 1 xvda1 10 40 0.79
1871 supervise W 202 1 xvda1 10 40 0.78
1876 supervise W 202 1 xvda1 10 40 0.68
1892 supervise W 202 1 xvda1 10 40 0.71
1878 supervise W 202 1 xvda1 10 40 0.65
1886 supervise W 202 1 xvda1 10 40 0.78
1894 supervise W 202 1 xvda1 10 40 0.80
1869 supervise W 202 1 xvda1 10 40 0.91
1888 supervise W 202 1 xvda1 10 40 0.63
22069 bash R 202 1 xvda1 1 16 19.94
9251 kworker/u16:2 W 202 16 xvdb 2 8 0.13
08:09:49 loadavg: 0.47 0.44 0.39 1/282 22231
PID COMM D MAJ MIN DISK I/O Kbytes AVGms
22069 dd R 202 1 xvda1 13450 107600 0.35
22199 cksum R 202 1 xvda1 941 45548 4.63
326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.93
24467 kworker/0:2 W 202 16 xvdb 1 64 0.28
1880 supervise W 202 1 xvda1 10 40 0.81
1873 supervise W 202 1 xvda1 10 40 0.81
1871 supervise W 202 1 xvda1 10 40 1.03
1876 supervise W 202 1 xvda1 10 40 0.76
1892 supervise W 202 1 xvda1 10 40 0.74
1878 supervise W 202 1 xvda1 10 40 0.94
1886 supervise W 202 1 xvda1 10 40 0.76
1894 supervise W 202 1 xvda1 10 40 0.69
1869 supervise W 202 1 xvda1 10 40 0.72
1888 supervise W 202 1 xvda1 10 40 1.70
22199 bash R 202 1 xvda1 2 20 0.35
482 xfsaild/md0 W 202 16 xvdb 5 13 0.27
482 xfsaild/md0 W 202 32 xvdc 2 8 0.33
31331 pickup R 202 1 xvda1 1 4 0.31
08:09:54 loadavg: 0.51 0.45 0.39 2/282 22346
PID COMM D MAJ MIN DISK I/O Kbytes AVGms
22069 dd R 202 1 xvda1 14689 117512 0.32
326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.33
1880 supervise W 202 1 xvda1 10 40 0.65
1873 supervise W 202 1 xvda1 10 40 1.08
1871 supervise W 202 1 xvda1 10 40 0.66
1876 supervise W 202 1 xvda1 10 40 0.79
1892 supervise W 202 1 xvda1 10 40 0.67
1878 supervise W 202 1 xvda1 10 40 0.66
1886 supervise W 202 1 xvda1 10 40 1.02
1894 supervise W 202 1 xvda1 10 40 0.88
1869 supervise W 202 1 xvda1 10 40 0.89
1888 supervise W 202 1 xvda1 10 40 1.25
08:09:59 loadavg: 0.55 0.46 0.40 2/282 22461
PID COMM D MAJ MIN DISK I/O Kbytes AVGms
22069 dd R 202 1 xvda1 14442 115536 0.33
326 jbd2/xvda1-8 W 202 1 xvda1 3 168 3.46
1880 supervise W 202 1 xvda1 10 40 0.87
1873 supervise W 202 1 xvda1 10 40 0.87
1871 supervise W 202 1 xvda1 10 40 0.78
1876 supervise W 202 1 xvda1 10 40 0.86
1892 supervise W 202 1 xvda1 10 40 0.89
1878 supervise W 202 1 xvda1 10 40 0.87
1886 supervise W 202 1 xvda1 10 40 0.86
1894 supervise W 202 1 xvda1 10 40 1.06
1869 supervise W 202 1 xvda1 10 40 1.12
1888 supervise W 202 1 xvda1 10 40 0.98
08:10:04 loadavg: 0.59 0.47 0.40 3/282 22576
PID COMM D MAJ MIN DISK I/O Kbytes AVGms
22069 dd R 202 1 xvda1 14179 113432 0.34
326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.39
1880 supervise W 202 1 xvda1 10 40 0.81
1873 supervise W 202 1 xvda1 10 40 1.02
1871 supervise W 202 1 xvda1 10 40 1.15
1876 supervise W 202 1 xvda1 10 40 1.10
1892 supervise W 202 1 xvda1 10 40 0.77
1878 supervise W 202 1 xvda1 10 40 0.72
1886 supervise W 202 1 xvda1 10 40 0.81
1894 supervise W 202 1 xvda1 10 40 0.86
1869 supervise W 202 1 xvda1 10 40 0.83
1888 supervise W 202 1 xvda1 10 40 0.79
24467 kworker/0:2 R 202 32 xvdc 3 12 0.26
1056 cron R 202 1 xvda1 2 8 0.30
24467 kworker/0:2 R 202 16 xvdb 1 4 0.23
08:10:09 loadavg: 0.54 0.46 0.40 2/281 22668
PID COMM D MAJ MIN DISK I/O Kbytes AVGms
22069 dd R 202 1 xvda1 250 2000 0.34
326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.40
1880 supervise W 202 1 xvda1 8 32 0.93
1873 supervise W 202 1 xvda1 8 32 0.76
1871 supervise W 202 1 xvda1 8 32 0.60
1876 supervise W 202 1 xvda1 8 32 0.61
1892 supervise W 202 1 xvda1 8 32 0.68
1878 supervise W 202 1 xvda1 8 32 0.90
1886 supervise W 202 1 xvda1 8 32 0.57
1894 supervise W 202 1 xvda1 8 32 0.97
1869 supervise W 202 1 xvda1 8 32 0.69
1888 supervise W 202 1 xvda1 8 32 0.67
This shows another "dd" command reading from xvda1. On this system, various
"supervise" processes each do 2 disk writes (8 Kbytes) per second, every second
(they are creating and updating "status" files).
USAGE message:
# ./biotop.py -h
usage: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
Block device (disk) I/O by process
positional arguments:
interval output interval, in seconds
count number of outputs
optional arguments:
-h, --help show this help message and exit
-C, --noclear don't clear the screen
-r MAXROWS, --maxrows MAXROWS
maximum rows to print, default 20
examples:
./biotop # block device I/O top, 1 second refresh
./biotop -C # don't clear the screen
./biotop 5 # 5 second summaries
./biotop 5 10 # 5 second summaries, 10 times only
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment