statsnoop

ad341c94 · Brendan Gregg · fd10c90f · ad341c94 · ad341c94 · ad341c94
Commit ad341c94 authored Feb 09, 2016 by Brendan Gregg
Showing with 278 additions and 0 deletions

README.md README.md +1 -0

man/man8/statsnoop.8 man/man8/statsnoop.8 +86 -0

tools/statsnoop.py tools/statsnoop.py +118 -0

tools/statsnoop_example.txt tools/statsnoop_example.txt +73 -0

No files found.
--- a/README.md
+++ b/README.md
@@ -87,6 +87,7 @@ Tools:
 - tools/[softirqs](tools/softirqs.py):  Measure soft IRQ (soft interrupt) event time. [Examples](tools/softirqs_example.txt).
 - tools/[stackcount](tools/stackcount.py): Count kernel function calls and their stack traces. [Examples](tools/stackcount_example.txt).
 - tools/[stacksnoop](tools/stacksnoop.py): Trace a kernel function and print all kernel stack traces. [Examples](tools/stacksnoop_example.txt).
+- tools/[statsnoop](tools/statsnoop.py): Trace stat() syscalls. [Examples](tools/statsnoop_example.txt).
 - tools/[syncsnoop](tools/syncsnoop.py): Trace sync() syscall. [Examples](tools/syncsnoop_example.txt).
 - tools/[tcpaccept](tools/tcpaccept.py): Trace TCP passive connections (accept()). [Examples](tools/tcpaccept_example.txt).
 - tools/[tcpconnect](tools/tcpconnect.py): Trace TCP active connections (connect()). [Examples](tools/tcpconnect_example.txt).

--- a/man/man8/statsnoop.8
+++ b/man/man8/statsnoop.8
+.TH statsnoop 8  "2016-02-08" "USER COMMANDS"
+.SH NAME
+statsnoop \- Trace stat() syscalls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B statsnoop [\-h] [\-t] [\-x] [\-p PID]
+.SH DESCRIPTION
+statsnoop traces the different stat() syscalls, showing which processes are
+attempting to read information about which files. This can be useful for
+determining the location of config and log files, or for troubleshooting
+applications that are failing, especially on startup.
+
+This works by tracing various kernel sys_stat() functions using dynamic
+tracing, and will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-t
+Include a timestamp column.
+.TP
+\-x
+Only print failed stats.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.SH EXAMPLES
+.TP
+Trace all stat() syscalls:
+#
+.B statsnoop
+.TP
+Trace all stat() syscalls, and include timestamps:
+#
+.B statsnoop \-t
+.TP
+Trace only stat() syscalls that failed:
+#
+.B statsnoop \-x
+.TP
+Trace PID 181 only:
+#
+.B statsnoop \-p 181
+.SH FIELDS
+.TP
+TIME(s)
+Time of the call, in seconds.
+.TP
+PID
+Process ID
+.TP
+COMM
+Process name
+.TP
+FD
+File descriptor (if success), or -1 (if failed)
+.TP
+ERR
+Error number (see the system's errno.h)
+.TP
+PATH
+Open path
+.SH OVERHEAD
+This traces the kernel stat function and prints output for each event. As the
+rate of this is generally expected to be low (< 1000/s), the overhead is also
+expected to be negligible. If you have an application that is calling a high
+rate of stat()s, then test and understand overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+opensnoop(1)
--- a/tools/statsnoop.py
+++ b/tools/statsnoop.py
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# statsnoop Trace stat() syscalls.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: statsnoop [-h] [-t] [-x] [-p PID]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 08-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+
+# arguments
+examples = """examples:
+    ./statsnoop           # trace all stat() syscalls
+    ./statsnoop -t        # include timestamps
+    ./statsnoop -x        # only show failed stats
+    ./statsnoop -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace stat() syscalls",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-x", "--failed", action="store_true",
+    help="only show failed stats")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(args_filename, u32, const char *);
+
+int trace_entry(struct pt_regs *ctx, const char __user *filename)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+
+    FILTER
+    args_filename.update(&pid, &filename);
+
+    return 0;
+};
+
+int trace_return(struct pt_regs *ctx)
+{
+    const char **filenamep;
+    int ret = ctx->ax;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    filenamep = args_filename.lookup(&pid);
+    if (filenamep == 0) {
+        // missed entry
+        return 0;
+    }
+
+    bpf_trace_printk("%s %d\\n", *filenamep, ret);
+    args_filename.delete(&pid);
+
+    return 0;
+}
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="sys_stat", fn_name="trace_entry")
+b.attach_kprobe(event="sys_statfs", fn_name="trace_entry")
+b.attach_kprobe(event="sys_newstat", fn_name="trace_entry")
+b.attach_kretprobe(event="sys_stat", fn_name="trace_return")
+b.attach_kretprobe(event="sys_statfs", fn_name="trace_return")
+b.attach_kretprobe(event="sys_newstat", fn_name="trace_return")
+
+# header
+if args.timestamp:
+    print("%-14s" % ("TIME(s)"), end="")
+print("%-6s %-16s %4s %3s %s" % ("PID", "COMM", "FD", "ERR", "PATH"))
+
+start_ts = 0
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    (filename, ret_s) = msg.split(" ")
+
+    ret = int(ret_s)
+    if (args.failed and (ret >= 0)):
+        continue
+
+    # split return value into FD and errno columns
+    if ret >= 0:
+        fd_s = ret
+        err = 0
+    else:
+        fd_s = "-1"
+        err = - ret
+
+    # print columns
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = ts
+        print("%-14.9f" % (ts - start_ts), end="")
+    print("%-6d %-16s %4s %3s %s" % (pid, task, fd_s, err, filename))
--- a/tools/statsnoop_example.txt
+++ b/tools/statsnoop_example.txt
+Demonstrations of statsnoop, the Linux eBPF/bcc version.
+
+
+statsnoop traces the different stat() syscalls system-wide, and prints various
+details. Example output:
+
+# ./statsnoop 
+PID    COMM               FD ERR PATH
+31126  bash                0   0 .
+31126  bash               -1   2 /usr/local/sbin/iconfig
+31126  bash               -1   2 /usr/local/bin/iconfig
+31126  bash               -1   2 /usr/sbin/iconfig
+31126  bash               -1   2 /usr/bin/iconfig
+31126  bash               -1   2 /sbin/iconfig
+31126  bash               -1   2 /bin/iconfig
+31126  bash               -1   2 /usr/games/iconfig
+31126  bash               -1   2 /usr/local/games/iconfig
+31126  bash               -1   2 /apps/python/bin/iconfig
+31126  bash               -1   2 /mnt/src/llvm/build/bin/iconfig
+8902   command-not-fou    -1   2 /usr/bin/Modules/Setup
+8902   command-not-fou    -1   2 /usr/bin/lib/python3.4/os.py
+8902   command-not-fou    -1   2 /usr/bin/lib/python3.4/os.pyc
+8902   command-not-fou     0   0 /usr/lib/python3.4/os.py
+8902   command-not-fou    -1   2 /usr/bin/pybuilddir.txt
+8902   command-not-fou    -1   2 /usr/bin/lib/python3.4/lib-dynload
+8902   command-not-fou     0   0 /usr/lib/python3.4/lib-dynload
+8902   command-not-fou     0   0 /apps/python/lib/python2.7/site-packages
+8902   command-not-fou     0   0 /apps/python/lib/python2.7/site-packages
+8902   command-not-fou     0   0 /apps/python/lib/python2.7/site-packages
+8902   command-not-fou     0   0 /usr/lib/python3.4/
+8902   command-not-fou     0   0 /usr/lib/python3.4/
+[...]
+
+This output has caught me mistyping a command in another shell, "iconfig"
+instead of "ifconfig". The first several lines show the bash shell searching
+the $PATH, and failing to find it (ERR == 2 is file not found). Then, a
+"command-not-found" program executes (the name is truncated to 16 characters
+in the COMM field), which begins the process of searching for and suggesting
+a package. ie, this:
+
+# iconfig
+No command 'iconfig' found, did you mean:
+ Command 'vconfig' from package 'vlan' (main)
+ Command 'fconfig' from package 'redboot-tools' (universe)
+ Command 'mconfig' from package 'mono-devel' (main)
+ Command 'iwconfig' from package 'wireless-tools' (main)
+ Command 'zconfig' from package 'python-zconfig' (universe)
+ Command 'ifconfig' from package 'net-tools' (main)
+iconfig: command not found
+
+statsnoop can be used for general debugging, to see what file information has
+been requested, and whether those files exist. It can be used as a companion
+to opensnoop, which shows what files were actually opened.
+
+
+USAGE message:
+
+# ./statsnoop -h
+usage: statsnoop [-h] [-t] [-x] [-p PID]
+
+Trace stat() syscalls
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -t, --timestamp    include timestamp on output
+  -x, --failed       only show failed stats
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./statsnoop           # trace all stat() syscalls
+    ./statsnoop -t        # include timestamps
+    ./statsnoop -x        # only show failed stats
+    ./statsnoop -p 181    # only trace PID 181