Commit 74347318 authored by Mark Drayton's avatar Mark Drayton

fileslower/filetop: use de->d_name.name, add filtering

parent 4ebb7cf1
......@@ -2,7 +2,7 @@
.SH NAME
fileslower \- Trace slow synchronous file reads and writes.
.SH SYNOPSIS
.B fileslower [\-h] [\-p PID] [min_ms]
.B fileslower [\-h] [\-p PID] [-a] [min_ms]
.SH DESCRIPTION
This script uses kernel dynamic tracing of synchronous reads and writes
at the VFS interface, to identify slow file reads and writes for any file
......@@ -30,6 +30,9 @@ CONFIG_BPF and bcc.
\-p PID
Trace this PID only.
.TP
\-a
Include non-regular file types in output (sockets, FIFOs, etc).
.TP
min_ms
Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
.SH EXAMPLES
......
......@@ -4,11 +4,12 @@ filetop \- File reads and writes by filename and process. Top for files.
.SH SYNOPSIS
.B filetop [\-h] [\-C] [\-r MAXROWS] [\-p PID] [interval] [count]
.SH DESCRIPTION
This is top for files.
This is top for files.
This traces file reads and writes, and prints a per-file summary every
interval (by default, 1 second). The summary is sorted on the highest read
throughput (Kbytes).
throughput (Kbytes). By default only IO on regular files is shown. The -a
option will list all file types (sokets, FIFOs, etc).
This uses in-kernel eBPF maps to store per process summaries for efficiency.
......@@ -29,6 +30,9 @@ Since this uses BPF, only the root user can use this tool.
CONFIG_BPF and bcc.
.SH OPTIONS
.TP
\-a
Include non-regular file types (sockets, FIFOs, etc).
.TP
\-C
Don't clear the screen.
.TP
......
......@@ -4,7 +4,7 @@
# fileslower Trace slow synchronous file reads and writes.
# For Linux, uses BCC, eBPF.
#
# USAGE: fileslower [-h] [-p PID] [min_ms]
# USAGE: fileslower [-h] [-p PID] [-a] [min_ms]
#
# This script uses kernel dynamic tracing of synchronous reads and writes
# at the VFS interface, to identify slow file reads and writes for any file
......@@ -32,7 +32,6 @@ from __future__ import print_function
from bcc import BPF
import argparse
import ctypes as ct
import signal
import time
# arguments
......@@ -45,19 +44,17 @@ parser = argparse.ArgumentParser(
description="Trace slow synchronous file reads and writes",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-p", "--pid",
parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
help="trace this PID only")
parser.add_argument("-a", "--all-files", action="store_true",
help="include non-regular file types (sockets, FIFOs, etc)")
parser.add_argument("min_ms", nargs="?", default='10',
help="minimum I/O duration to trace, in ms (default 10)")
args = parser.parse_args()
min_ms = int(args.min_ms)
pid = args.pid
tgid = args.tgid
debug = 0
# signal handler
def signal_ignore(signal, frame):
print()
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
......@@ -72,6 +69,8 @@ enum trace_mode {
struct val_t {
u32 sz;
u64 ts;
u32 name_len;
// de->d_name.name may point to de->d_iname so limit len accordingly
char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN];
};
......@@ -81,6 +80,7 @@ struct data_t {
u32 pid;
u32 sz;
u64 delta_us;
u32 name_len;
char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN];
};
......@@ -92,22 +92,25 @@ BPF_PERF_OUTPUT(events);
static int trace_rw_entry(struct pt_regs *ctx, struct file *file,
char __user *buf, size_t count)
{
u32 pid;
pid = bpf_get_current_pid_tgid();
if (FILTER)
u32 tgid = bpf_get_current_pid_tgid() >> 32;
if (TGID_FILTER)
return 0;
u32 pid = bpf_get_current_pid_tgid();
// skip I/O lacking a filename
struct dentry *de = file->f_path.dentry;
if (de->d_iname[0] == 0)
int mode = file->f_inode->i_mode;
if (de->d_name.len == 0 || TYPE_FILTER)
return 0;
// store size and timestamp by pid
struct val_t val = {};
val.sz = count;
val.ts = bpf_ktime_get_ns();
__builtin_memcpy(&val.name, de->d_iname, sizeof(val.name));
val.name_len = de->d_name.len;
bpf_probe_read(&val.name, sizeof(val.name), (void *)de->d_name.name);
bpf_get_current_comm(&val.comm, sizeof(val.comm));
entryinfo.update(&pid, &val);
......@@ -153,6 +156,7 @@ static int trace_rw_return(struct pt_regs *ctx, int type)
data.pid = pid;
data.sz = valp->sz;
data.delta_us = delta_us;
data.name_len = valp->name_len;
bpf_probe_read(&data.name, sizeof(data.name), valp->name);
bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
events.perf_submit(ctx, &data, sizeof(data));
......@@ -172,15 +176,20 @@ int trace_write_return(struct pt_regs *ctx)
"""
bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000))
if args.pid:
bpf_text = bpf_text.replace('FILTER', 'pid != %s' % pid)
if args.tgid:
bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid)
else:
bpf_text = bpf_text.replace('FILTER', '0')
bpf_text = bpf_text.replace('TGID_FILTER', '0')
if args.all_files:
bpf_text = bpf_text.replace('TYPE_FILTER', '0')
else:
bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)')
if debug:
print(bpf_text)
# initialize BPF
b = BPF(text=bpf_text)
b = BPF(text=bpf_text,)
# I'd rather trace these via new_sync_read/new_sync_write (which used to be
# do_sync_read/do_sync_write), but those became static. So trace these from
......@@ -205,6 +214,7 @@ class Data(ct.Structure):
("pid", ct.c_uint),
("sz", ct.c_uint),
("delta_us", ct.c_ulonglong),
("name_len", ct.c_uint),
("name", ct.c_char * DNAME_INLINE_LEN),
("comm", ct.c_char * TASK_COMM_LEN),
]
......@@ -216,7 +226,7 @@ mode_s = {
# header
print("Tracing sync read/writes slower than %d ms" % min_ms)
print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "PID", "D",
print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D",
"BYTES", "LAT(ms)", "FILENAME"))
start_ts = time.time()
......@@ -225,10 +235,13 @@ def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents
ms = float(event.delta_us) / 1000
name = event.name
if event.name_len > DNAME_INLINE_LEN:
name = name[:-3] + "..."
print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % (
time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
event.sz, ms, event.name))
event.sz, ms, name))
b["events"].open_perf_buffer(print_event)
while 1:
......
......@@ -105,7 +105,7 @@ locks, run queue latency, etc. These can be explored using other commands.
USAGE message:
# ./fileslower -h
usage: fileslower [-h] [-p PID] [min_ms]
usage: fileslower.py [-h] [-p PID] [-a] [min_ms]
Trace slow synchronous file reads and writes
......@@ -115,6 +115,7 @@ positional arguments:
optional arguments:
-h, --help show this help message and exit
-p PID, --pid PID trace this PID only
-a, --all-files include non-regular file types
examples:
./fileslower # trace sync file I/O slower than 10 ms (default)
......
......@@ -32,11 +32,13 @@ parser = argparse.ArgumentParser(
description="File reads and writes by process",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-a", "--all-files", action="store_true",
help="include non-regular file types (sockets, FIFOs, etc)")
parser.add_argument("-C", "--noclear", action="store_true",
help="don't clear the screen")
parser.add_argument("-r", "--maxrows", default=20,
help="maximum rows to print, default 20")
parser.add_argument("-p", "--pid",
parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
help="trace this PID only")
parser.add_argument("interval", nargs="?", default=1,
help="output interval, in seconds")
......@@ -61,13 +63,13 @@ bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
#define MAX_FILE_LEN 32
// the key for the output summary
struct info_t {
u32 pid;
char name[TASK_COMM_LEN];
char file[MAX_FILE_LEN];
u32 name_len;
char comm[TASK_COMM_LEN];
// de->d_name.name may point to de->d_iname so limit len accordingly
char name[DNAME_INLINE_LEN];
char type;
};
......@@ -84,22 +86,23 @@ BPF_HASH(counts, struct info_t, struct val_t);
static int do_entry(struct pt_regs *ctx, struct file *file,
char __user *buf, size_t count, int is_read)
{
u32 pid;
pid = bpf_get_current_pid_tgid();
if (FILTER)
u32 tgid = bpf_get_current_pid_tgid() >> 32;
if (TGID_FILTER)
return 0;
u32 pid = bpf_get_current_pid_tgid();
// skip I/O lacking a filename
struct dentry *de = file->f_path.dentry;
if (de->d_iname[0] == 0)
int mode = file->f_inode->i_mode;
if (de->d_name.len == 0 || TYPE_FILTER)
return 0;
// store counts and sizes by pid & file
struct info_t info = {.pid = pid};
bpf_get_current_comm(&info.name, sizeof(info.name));
__builtin_memcpy(&info.file, de->d_iname, sizeof(info.file));
int mode = file->f_inode->i_mode;
bpf_get_current_comm(&info.comm, sizeof(info.comm));
info.name_len = de->d_name.len;
bpf_probe_read(&info.name, sizeof(info.name), (void *)de->d_name.name);
if (S_ISREG(mode)) {
info.type = 'R';
} else if (S_ISSOCK(mode)) {
......@@ -134,17 +137,28 @@ int trace_write_entry(struct pt_regs *ctx, struct file *file,
}
"""
if args.pid:
bpf_text = bpf_text.replace('FILTER', 'pid != %s' % args.pid)
if args.tgid:
bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
else:
bpf_text = bpf_text.replace('TGID_FILTER', '0')
if args.all_files:
bpf_text = bpf_text.replace('TYPE_FILTER', '0')
else:
bpf_text = bpf_text.replace('FILTER', '0')
bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)')
if debug:
print(bpf_text)
# initialize BPF
b = BPF(text=bpf_text)
b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry")
b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
try:
b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
except:
# older kernels don't have __vfs_write so try vfs_write instead
b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry")
DNAME_INLINE_LEN = 32 # linux/dcache.h
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
......@@ -163,19 +177,22 @@ while 1:
print()
with open(loadavg) as stats:
print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("PID", "COMM",
print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("TID", "COMM",
"READS", "WRITES", "R_Kb", "W_Kb", "T", "FILE"))
# by-PID output
# by-TID output
counts = b.get_table("counts")
line = 0
for k, v in reversed(sorted(counts.items(),
key=lambda counts: counts[1].rbytes)):
name = k.name
if k.name_len > DNAME_INLINE_LEN:
name = name[:-3] + "..."
# print line
print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.name,
print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.comm,
v.reads, v.writes, v.rbytes / 1024, v.wbytes / 1024, k.type,
k.file))
name))
line += 1
if line >= maxrows:
......
......@@ -24,11 +24,9 @@ PID COMM READS WRITES R_Kb W_Kb T FILE
26628 ld 18 0 72 0 R hibernate.o
26628 ld 16 0 64 0 R suspend.o
26628 ld 16 0 64 0 R snapshot.o
26630 cat 1 0 64 0 O null
26628 ld 16 0 64 0 R qos.o
26628 ld 13 0 52 0 R main.o
26628 ld 12 0 52 0 R swap.o
12421 sshd 3 0 48 0 O ptmx
[...]
This shows various files read and written during a Linux kernel build. The
......@@ -40,7 +38,8 @@ While not printed, the average read and write size can be calculated by
dividing R_Kb by READS, and the same for writes.
The "T" column indicates the type of the file: "R" for regular files, "S" for
sockets, and "O" for other (including pipes).
sockets, and "O" for other (including pipes). By default only regular files are
shown; use the -a option to show all file types.
This script works by tracing the vfs_read() and vfs_write() functions using
kernel dynamic tracing, which instruments explicit read and write calls. If
......@@ -65,7 +64,6 @@ Tracing... Output every 1 secs. Hit Ctrl-C to end
PID COMM READS WRITES R_Kb W_Kb T FILE
32672 cksum 5006 0 320384 0 R data1
12296 sshd 2 0 32 0 O ptmx
809 run 2 0 8 0 R nsswitch.conf
811 run 2 0 8 0 R nsswitch.conf
804 chown 2 0 8 0 R nsswitch.conf
......@@ -89,13 +87,13 @@ PID COMM READS WRITES R_Kb W_Kb T FILE
848 run 2 0 8 0 R nsswitch.conf
[...]
This output shows a cksum command reading data1. Note that
This output shows a cksum command reading data1.
An optional interval and optional count can also be added to the end of the
command line. For example, for 1 second interval, and 3 summaries in total:
# ./filetop -Cr 5 1 3
# ./filetop -Cr 5 -a 1 3
Tracing... Output every 1 secs. Hit Ctrl-C to end
08:08:20 loadavg: 0.30 0.42 0.31 3/282 5187
......@@ -126,15 +124,15 @@ PID COMM READS WRITES R_Kb W_Kb T FILE
5227 run 2 0 8 0 R nsswitch.conf
Detaching...
This example has caught heavy socket I/O from an sshd process, showing up as
non-regular file types (the "O" for other, and "S" for socket, in the type
column: "T").
This example shows the -a option to include all file types. It caught heavy
socket I/O from an sshd process, showing up as non-regular file types (the "O"
for other, and "S" for socket, in the type column: "T").
USAGE message:
# ./filetop -h
usage: filetop [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count]
usage: filetop.py [-h] [-a] [-C] [-r MAXROWS] [-p PID] [interval] [count]
File reads and writes by process
......@@ -144,6 +142,7 @@ positional arguments:
optional arguments:
-h, --help show this help message and exit
-a, --all-files include non-regular file types (sockets, FIFOs, etc)
-C, --noclear don't clear the screen
-r MAXROWS, --maxrows MAXROWS
maximum rows to print, default 20
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment