Merge remote-tracking branch 'upstream/master'

5cd2e0bc · Sasha Goldshtein · dc974939 · 788f18d2 · 5cd2e0bc · 5cd2e0bc
Commit 5cd2e0bc authored Feb 07, 2016 by Sasha Goldshtein
7 changed files
--- a/README.md
+++ b/README.md
@@ -67,6 +67,7 @@ Tools:
 - tools/[bashreadline](tools/bashreadline.py): Print entered bash commands system wide. [Examples](tools/bashreadline_example.txt).
 - tools/[biolatency](tools/biolatency.py): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt).
+- tools/[biotop](tools/biotop.py): Top for disks: Summarize block device I/O by process. [Examples](tools/biotop_example.txt).
 - tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt).
 - tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt).
 - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt).

--- a/man/man8/biotop.8
+++ b/man/man8/biotop.8
+.TH biotop 8  "2016-02-06" "USER COMMANDS"
+.SH NAME
+biotop \- Block device (disk) I/O by process top.
+.SH SYNOPSIS
+.B biotop [\-h] [\-C] [\-r MAXROWS] [interval] [count]
+.SH DESCRIPTION
+This is top for disks. 
+This traces block device I/O (disk I/O), and prints a per-process summary every
+interval (by default, 1 second). The summary is sorted on the top disk
+consumers by throughput (Kbytes). The PID and process name shown are measured
+from when the I/O was first created, which usually identifies the responsible
+process.
+For efficiency, this uses in-kernel eBPF maps to cache process details (PID and
+comm) by I/O request, as well as a starting timestamp for calculating I/O
+latency, and the final summary.
+This works by tracing various kernel blk_*() functions using dynamic tracing,
+and will need updating to match any changes to these functions.
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Summarize block device I/O by process, 1 second screen refresh:
+#
+.B biotop
+.TP
+Don't clear the screen:
+#
+.B biotop -C
+.TP
+5 second summaries, 10 times only:
+#
+.B biotop 5 10
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+PID
+Cached process ID, if present. This usually (but isn't guaranteed) to identify
+the responsible process for the I/O.
+.TP
+COMM
+Cached process name, if present. This usually (but isn't guaranteed) to identify
+the responsible process for the I/O.
+.TP
+D
+Direction: R == read, W == write.
+.TP
+MAJ
+Major device number.
+.TP
+MIN
+Minor device number.
+.TP
+DISK
+Disk device name.
+.TP
+I/O
+Number of I/O during the interval.
+.TP
+Kbytes
+Total Kbytes for these I/O, during the interval.
+.TP
+AVGms
+Average time for the I/O (latency) from the issue to the device, to its
+completion, in milliseconds.
+.SH OVERHEAD
+Since block device I/O usually has a relatively low frequency (< 10,000/s),
+the overhead for this tool is expected to be low or negligible. For high IOPS
+storage systems, test and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8), biolatency(8), iostat(1)
--- a/src/cc/bpf_module.cc
+++ b/src/cc/bpf_module.cc
@@ -148,6 +148,19 @@ static void parse_type(IRBuilder<> &B, vector<Value *> *args, string *fmt,
      *fmt += " ";
    }
    *fmt += "}";
+  } else if (ArrayType *at = dyn_cast<ArrayType>(type)) {
+    *fmt += "[ ";
+    for (size_t i = 0; i < at->getNumElements(); ++i) {
+      parse_type(B, args, fmt, at->getElementType(), B.CreateStructGEP(type, out, i), is_writer);
+      *fmt += " ";
+    }
+    *fmt += "]";
+  } else if (PointerType *pt = dyn_cast<PointerType>(type)) {
+    *fmt += "0xl";
+    if (is_writer)
+      *fmt += "x";
+    else
+      *fmt += "i";
  } else if (IntegerType *it = dyn_cast<IntegerType>(type)) {
    if (is_writer)
      *fmt += "0x";

--- a/src/python/bcc/__init__.py
+++ b/src/python/bcc/__init__.py
@@ -286,6 +286,30 @@ class BPF(object):
                if res < 0:
                    raise KeyError
+        # override the MutableMapping's implementation of these since they
+        # don't handle KeyError nicely
+        def itervalues(self):
+            for key in self:
+                # a map entry may be deleted in between discovering the key and
+                # fetching the value, suppress such errors
+                try:
+                    yield self[key]
+                except KeyError:
+                    pass
+        def iteritems(self):
+            for key in self:
+                try:
+                    yield (key, self[key])
+                except KeyError:
+                    pass
+        def items(self):
+            return [item for item in self.iteritems()]
+        def values(self):
+            return [value for value in self.itervalues()]
        def clear(self):
            # default clear uses popitem, which can race with the bpf prog
            for k in self.keys():

--- a/tests/cc/test_clang.py
+++ b/tests/cc/test_clang.py
@@ -3,6 +3,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 from bcc import BPF
+import ctypes
 from unittest import main, TestCase
 class TestClang(TestCase):
@@ -89,6 +90,22 @@ int foo(void *ctx) {
        self.assertEqual(l.s.a, 5)
        self.assertEqual(l.s.b, 6)
+    def test_sscanf_array(self):
+        text = """
+BPF_TABLE("hash", int, struct { u32 a[3]; u32 b; }, stats, 10);
+"""
+        b = BPF(text=text, debug=0)
+        t = b.get_table("stats")
+        s1 = t.key_sprintf(t.Key(2))
+        self.assertEqual(s1, b"0x2")
+        s2 = t.leaf_sprintf(t.Leaf((ctypes.c_uint * 3)(1,2,3), 4))
+        self.assertEqual(s2, b"{ [ 0x1 0x2 0x3 ] 0x4 }")
+        l = t.leaf_scanf(s2)
+        self.assertEqual(l.a[0], 1)
+        self.assertEqual(l.a[1], 2)
+        self.assertEqual(l.a[2], 3)
+        self.assertEqual(l.b, 4)
    def test_iosnoop(self):
        text = """
 #include <linux/blkdev.h>

--- a/tools/biotop.py
+++ b/tools/biotop.py
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# biotop  block device (disk) I/O by process.
+#         For Linux, uses BCC, eBPF.
+#
+# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+#
+# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
+# request, as well as a starting timestamp for calculating I/O latency.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 06-Feb-2016   Brendan Gregg   Created this.
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+from subprocess import call
+# arguments
+examples = """examples:
+    ./biotop            # block device I/O top, 1 second refresh
+    ./biotop -C         # don't clear the screen
+    ./biotop 5          # 5 second summaries
+    ./biotop 5 10       # 5 second summaries, 10 times only
+"""
+parser = argparse.ArgumentParser(
+    description="Block device (disk) I/O by process",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("-r", "--maxrows", default=20,
+    help="maximum rows to print, default 20")
+parser.add_argument("interval", nargs="?", default=1,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+args = parser.parse_args()
+interval = int(args.interval)
+countdown = int(args.count)
+maxrows = int(args.maxrows)
+clear = not int(args.noclear)
+# linux stats
+loadavg = "/proc/loadavg"
+diskstats = "/proc/diskstats"
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+// for saving process info by request
+struct who_t {
+    u32 pid;
+    char name[TASK_COMM_LEN];
+};
+// the key for the output summary
+struct info_t {
+    u32 pid;
+    int type;
+    int major;
+    int minor;
+    char name[TASK_COMM_LEN];
+};
+// the value of the output summary
+struct val_t {
+    u64 bytes;
+    u64 us;
+    u32 io;
+};
+BPF_HASH(start, struct request *);
+BPF_HASH(whobyreq, struct request *, struct who_t);
+BPF_HASH(counts, struct info_t, struct val_t);
+// cache PID and comm by-req
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+    struct who_t who = {};
+    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
+        who.pid = bpf_get_current_pid_tgid();
+        whobyreq.update(&req, &who);
+    }
+    return 0;
+}
+// time block I/O
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+    u64 ts;
+    ts = bpf_ktime_get_ns();
+    start.update(&req, &ts);
+    return 0;
+}
+// output
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+    u64 *tsp;
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&req);
+    if (tsp == 0) {
+        return 0;    // missed tracing issue
+    }
+    struct who_t *whop;
+    struct val_t *valp, zero = {};
+    u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+    // setup info_t key
+    struct info_t info = {};
+    info.major = req->rq_disk->major;
+    info.minor = req->rq_disk->first_minor;
+    info.type = req->cmd_flags & REQ_WRITE;
+    whop = whobyreq.lookup(&req);
+    if (whop == 0) {
+        // missed pid who, save stats as pid 0
+        valp = counts.lookup_or_init(&info, &zero);
+    } else {
+        info.pid = whop->pid;
+        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
+        valp = counts.lookup_or_init(&info, &zero);
+    }
+    // save stats
+    valp->us += delta_us;
+    valp->bytes += req->__data_len;
+    valp->io++;
+    start.delete(&req);
+    whobyreq.delete(&req);
+    return 0;
+}
+""", debug=0)
+b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_account_io_completion",
+    fn_name="trace_req_completion")
+print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
+# cache disk major,minor -> diskname
+disklookup = {}
+with open(diskstats) as stats:
+    for line in stats:
+        a = line.split()
+        disklookup[a[0] + "," + a[1]] = a[2]
+# output
+exiting = 0
+while 1:
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        exiting = 1
+    # header
+    if clear:
+        call("clear")
+    else:
+        print()
+    with open(loadavg) as stats:
+        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
+        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))
+    # by-PID output
+    counts = b.get_table("counts")
+    line = 0
+    for k, v in reversed(sorted(counts.items(),
+                                key=lambda counts: counts[1].bytes)):
+        # lookup disk
+        disk = str(k.major) + "," + str(k.minor)
+        if disk in disklookup:
+            diskname = disklookup[disk]
+        else:
+            diskname = "?"
+        # print line
+        avg_ms = (float(v.us) / 1000) / v.io
+        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid, k.name,
+            "W" if k.type else "R", k.major, k.minor, diskname, v.io,
+            v.bytes / 1024, avg_ms))
+        line += 1
+        if line >= maxrows:
+            break
+    counts.clear()
+    countdown -= 1
+    if exiting or countdown == 0:
+        print("Detaching...")
+        exit()
--- a/tools/biotop_example.txt
+++ b/tools/biotop_example.txt
+Demonstrations of biotop, the Linux eBPF/bcc version.
+Short for block device I/O top, biotop summarizes which processes are
+performing disk I/O. It's top for disks. Sample output:
+# ./biotop
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+08:04:11 loadavg: 1.48 0.87 0.45 1/287 14547
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+14501  cksum            R 202 1   xvda1      361   28832   3.39
+6961   dd               R 202 1   xvda1     1628   13024   0.59
+13855  dd               R 202 1   xvda1     1627   13016   0.59
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   3.00
+1880   supervise        W 202 1   xvda1        2       8   6.71
+1873   supervise        W 202 1   xvda1        2       8   2.51
+1871   supervise        W 202 1   xvda1        2       8   1.57
+1876   supervise        W 202 1   xvda1        2       8   1.22
+1892   supervise        W 202 1   xvda1        2       8   0.62
+1878   supervise        W 202 1   xvda1        2       8   0.78
+1886   supervise        W 202 1   xvda1        2       8   1.30
+1894   supervise        W 202 1   xvda1        2       8   3.46
+1869   supervise        W 202 1   xvda1        2       8   0.73
+1888   supervise        W 202 1   xvda1        2       8   1.48
+By default the screen refreshes every 1 second, and shows the top 20 disk
+consumers, sorted on total Kbytes. The first line printed is the header,
+which has the time and then the contents of /proc/loadavg.
+For the interval summarized by the output above, the "cksum" command performed
+361 disk reads to the "xvda1" device, for a total of 28832 Kbytes, with an
+average I/O time of 3.39 ms. Two "dd" processes were also reading from the
+same disk, which a higher I/O rate and lower latency. While the average I/O
+size is not printed, it can be determined by dividing the Kbytes column by
+the I/O column.
+The columns through to Kbytes show the workload applied. The final column,
+AVGms, shows resulting performance. Other bcc tools can be used to get more
+details when needed: biolatency and biosnoop.
+Many years ago I created the original "iotop", and later regretted not calling
+it diskiotop or blockiotop, as "io" alone is ambiguous. This time it is biotop.
+The -C option can be used to prevent the screen from clearing (my preference).
+Here's using it with a 5 second interval:
+# ./biotop -C 5
+Tracing... Output every 5 secs. Hit Ctrl-C to end
+08:09:44 loadavg: 0.42 0.44 0.39 2/282 22115
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1     5993   47976   0.33
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.67
+1866   svscan           R 202 1   xvda1       33     132   1.24
+1880   supervise        W 202 1   xvda1       10      40   0.56
+1873   supervise        W 202 1   xvda1       10      40   0.79
+1871   supervise        W 202 1   xvda1       10      40   0.78
+1876   supervise        W 202 1   xvda1       10      40   0.68
+1892   supervise        W 202 1   xvda1       10      40   0.71
+1878   supervise        W 202 1   xvda1       10      40   0.65
+1886   supervise        W 202 1   xvda1       10      40   0.78
+1894   supervise        W 202 1   xvda1       10      40   0.80
+1869   supervise        W 202 1   xvda1       10      40   0.91
+1888   supervise        W 202 1   xvda1       10      40   0.63
+22069  bash             R 202 1   xvda1        1      16  19.94
+9251   kworker/u16:2    W 202 16  xvdb         2       8   0.13
+08:09:49 loadavg: 0.47 0.44 0.39 1/282 22231
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1    13450  107600   0.35
+22199  cksum            R 202 1   xvda1      941   45548   4.63
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.93
+24467  kworker/0:2      W 202 16  xvdb         1      64   0.28
+1880   supervise        W 202 1   xvda1       10      40   0.81
+1873   supervise        W 202 1   xvda1       10      40   0.81
+1871   supervise        W 202 1   xvda1       10      40   1.03
+1876   supervise        W 202 1   xvda1       10      40   0.76
+1892   supervise        W 202 1   xvda1       10      40   0.74
+1878   supervise        W 202 1   xvda1       10      40   0.94
+1886   supervise        W 202 1   xvda1       10      40   0.76
+1894   supervise        W 202 1   xvda1       10      40   0.69
+1869   supervise        W 202 1   xvda1       10      40   0.72
+1888   supervise        W 202 1   xvda1       10      40   1.70
+22199  bash             R 202 1   xvda1        2      20   0.35
+482    xfsaild/md0      W 202 16  xvdb         5      13   0.27
+482    xfsaild/md0      W 202 32  xvdc         2       8   0.33
+31331  pickup           R 202 1   xvda1        1       4   0.31
+08:09:54 loadavg: 0.51 0.45 0.39 2/282 22346
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1    14689  117512   0.32
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.33
+1880   supervise        W 202 1   xvda1       10      40   0.65
+1873   supervise        W 202 1   xvda1       10      40   1.08
+1871   supervise        W 202 1   xvda1       10      40   0.66
+1876   supervise        W 202 1   xvda1       10      40   0.79
+1892   supervise        W 202 1   xvda1       10      40   0.67
+1878   supervise        W 202 1   xvda1       10      40   0.66
+1886   supervise        W 202 1   xvda1       10      40   1.02
+1894   supervise        W 202 1   xvda1       10      40   0.88
+1869   supervise        W 202 1   xvda1       10      40   0.89
+1888   supervise        W 202 1   xvda1       10      40   1.25
+08:09:59 loadavg: 0.55 0.46 0.40 2/282 22461
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1    14442  115536   0.33
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   3.46
+1880   supervise        W 202 1   xvda1       10      40   0.87
+1873   supervise        W 202 1   xvda1       10      40   0.87
+1871   supervise        W 202 1   xvda1       10      40   0.78
+1876   supervise        W 202 1   xvda1       10      40   0.86
+1892   supervise        W 202 1   xvda1       10      40   0.89
+1878   supervise        W 202 1   xvda1       10      40   0.87
+1886   supervise        W 202 1   xvda1       10      40   0.86
+1894   supervise        W 202 1   xvda1       10      40   1.06
+1869   supervise        W 202 1   xvda1       10      40   1.12
+1888   supervise        W 202 1   xvda1       10      40   0.98
+08:10:04 loadavg: 0.59 0.47 0.40 3/282 22576
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1    14179  113432   0.34
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.39
+1880   supervise        W 202 1   xvda1       10      40   0.81
+1873   supervise        W 202 1   xvda1       10      40   1.02
+1871   supervise        W 202 1   xvda1       10      40   1.15
+1876   supervise        W 202 1   xvda1       10      40   1.10
+1892   supervise        W 202 1   xvda1       10      40   0.77
+1878   supervise        W 202 1   xvda1       10      40   0.72
+1886   supervise        W 202 1   xvda1       10      40   0.81
+1894   supervise        W 202 1   xvda1       10      40   0.86
+1869   supervise        W 202 1   xvda1       10      40   0.83
+1888   supervise        W 202 1   xvda1       10      40   0.79
+24467  kworker/0:2      R 202 32  xvdc         3      12   0.26
+1056   cron             R 202 1   xvda1        2       8   0.30
+24467  kworker/0:2      R 202 16  xvdb         1       4   0.23
+08:10:09 loadavg: 0.54 0.46 0.40 2/281 22668
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1      250    2000   0.34
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.40
+1880   supervise        W 202 1   xvda1        8      32   0.93
+1873   supervise        W 202 1   xvda1        8      32   0.76
+1871   supervise        W 202 1   xvda1        8      32   0.60
+1876   supervise        W 202 1   xvda1        8      32   0.61
+1892   supervise        W 202 1   xvda1        8      32   0.68
+1878   supervise        W 202 1   xvda1        8      32   0.90
+1886   supervise        W 202 1   xvda1        8      32   0.57
+1894   supervise        W 202 1   xvda1        8      32   0.97
+1869   supervise        W 202 1   xvda1        8      32   0.69
+1888   supervise        W 202 1   xvda1        8      32   0.67
+This shows another "dd" command reading from xvda1. On this system, various
+"supervise" processes do 8 disk writes per second, every second (they are
+creating and updating "status" files).
+USAGE message:
+# ./biotop.py -h
+usage: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+Block device (disk) I/O by process
+positional arguments:
+  interval              output interval, in seconds
+  count                 number of outputs
+optional arguments:
+  -h, --help            show this help message and exit
+  -C, --noclear         don't clear the screen
+  -r MAXROWS, --maxrows MAXROWS
+                        maximum rows to print, default 20
+examples:
+    ./biotop            # block device I/O top, 1 second refresh
+    ./biotop -C         # don't clear the screen
+    ./biotop 5          # 5 second summaries
+    ./biotop 5 10       # 5 second summaries, 10 times only