add runqlen tool

251823ab · Brendan Gregg · 39cece4d · 251823ab · 251823ab · 251823ab
Commit 251823ab authored Dec 14, 2016 by Brendan Gregg
5 changed files
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ Examples:
 - tools/[profile](tools/profile.py): Profile CPU usage by sampling stack traces at a timed interval. [Examples](tools/profile_example.txt).
 - tools/[reset-trace](tools/reset-trace.sh): Reset the state of tracing. Maintenance tool only. [Examples](tools/reset-trace_example.txt).
 - tools/[runqlat](tools/runqlat.py): Run queue (scheduler) latency as a histogram. [Examples](tools/runqlat_example.txt).
+- tools/[runqlen](tools/runqlen.py): Run queue length as a histogram. [Examples](tools/runqlen_example.txt).
 - tools/[softirqs](tools/softirqs.py):  Measure soft IRQ (soft interrupt) event time. [Examples](tools/softirqs_example.txt).
 - tools/[solisten](tools/solisten.py): Trace TCP socket listen. [Examples](tools/solisten_example.txt).
 - tools/[sslsniff](tools/sslsniff.py): Sniff OpenSSL written and readed data. [Examples](tools/sslsniff_example.txt).

--- a/man/man8/runqlat.8
+++ b/man/man8/runqlat.8
@@ -102,4 +102,4 @@ Unstable - in development.
 .SH AUTHOR
 Brendan Gregg
 .SH SEE ALSO
-pidstat(1)
+runqlen(8), pidstat(1)
--- a/man/man8/runqlen.8
+++ b/man/man8/runqlen.8
+.TH runqlen 8  "2016-12-12" "USER COMMANDS"
+.SH NAME
+runqlen \- Scheduler run queue length as a histogram.
+.SH SYNOPSIS
+.B runqlen [\-h] [\-T] [\-O] [\-C] [interval] [count]
+.SH DESCRIPTION
+This program summarizes scheduler queue length as a histogram, and can also
+show run queue occupancy. It works by sampling the run queue length on all
+CPUs at 99 Hertz.
+This tool can be used to identify imbalances, eg, when processes are bound
+to CPUs causing queueing, or interrupt mappings causing the same.
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-O
+Report run queue occupancy.
+.TP
+\-C
+Report for each CPU.
+.TP
+interval
+Output interval, in seconds.
+.TP
+count
+Number of outputs.
+.SH EXAMPLES
+.TP
+Summarize run queue length as a histogram:
+#
+.B runqlen
+.TP
+Print 1 second summaries, 10 times:
+#
+.B runqlen 1 10
+.TP
+Print output every second, with timestamps, and show each CPU separately:
+#
+.B runqlen \-CT 1
+.TP
+Print run queue occupancy every second:
+#
+.B runqlen \-O 1
+.TP
+Print run queue occupancy, with timetamps, for each CPU:
+#
+.B runqlen \-COT 1
+.SH FIELDS
+.TP
+runqlen
+Scheduler run queue length: the number of threads (tasks) waiting to run,
+(excluding including the currently running task).
+.TP
+count
+Number of samples at this queue length.
+.TP
+distribution
+An ASCII bar chart to visualize the distribution (count column)
+.SH OVERHEAD
+This uses sampling at 99 Hertz (on all CPUs), and in-kernel summaries, which
+should make overhead negligible. This does not trace scheduler events, like
+runqlen does, which comes at a much higher overhead cost.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+runqlat(8), pidstat(1)
--- a/tools/runqlen.py
+++ b/tools/runqlen.py
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# runqlen    Summarize scheduler run queue length as a histogram.
+#            For Linux, uses BCC, eBPF.
+#
+# This counts the length of the run queue, excluding the currently running
+# thread, and shows it as a histogram.
+#
+# Also answers run queue occupancy.
+#
+# USAGE: runqlen [-h] [-T] [-Q] [-m] [-D] [interval] [count]
+#
+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
+# a version of this tool that may work on Linux 4.6 - 4.8.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Dec-2016   Brendan Gregg   Created this.
+from __future__ import print_function
+from bcc import BPF, PerfType, PerfSWConfig
+from time import sleep, strftime
+import argparse
+# arguments
+examples = """examples:
+    ./runqlen            # summarize run queue length as a histogram
+    ./runqlen 1 10       # print 1 second summaries, 10 times
+    ./runqlen -T 1       # 1s summaries and timestamps
+    ./runqlen -O         # report run queue occupancy
+    ./runqlen -C         # show each CPU separately
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize scheduler run queue length as a histogram",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-O", "--runqocc", action="store_true",
+    help="report run queue occupancy")
+parser.add_argument("-C", "--cpus", action="store_true",
+    help="print output for each CPU separately")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+args = parser.parse_args()
+countdown = int(args.count)
+debug = 0
+frequency = 99
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+// Declare enough of cfs_rq to find nr_running, since we can't #import the
+// header. This will need maintenance. It is from kernel/sched/sched.h:
+struct cfs_rq_partial {
+    struct load_weight load;
+    unsigned int nr_running, h_nr_running;
+};
+typedef struct cpu_key {
+    int cpu;
+    unsigned int slot;
+} cpu_key_t;
+STORAGE
+int do_perf_event()
+{
+    unsigned int len = 0;
+    pid_t pid = 0;
+    struct task_struct *task = NULL;
+    struct cfs_rq_partial *my_q = NULL;
+    // Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
+    // unstable interface and may need maintenance. Perhaps a future version
+    // of BPF will support task_rq(p) or something similar as a more reliable
+    // interface.
+    task = (struct task_struct *)bpf_get_current_task();
+    bpf_probe_read(&my_q, sizeof(my_q), &task->se.cfs_rq);
+    bpf_probe_read(&len, sizeof(len), &my_q->nr_running);
+    // Decrement idle thread by dropping the run queue by one. We could do
+    // this other ways if needed, like matching on task->pid.
+    if (len > 0)
+        len--;
+    STORE
+    return 0;
+}
+"""
+if args.cpus:
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, cpu_key_t);')
+    bpf_text = bpf_text.replace('STORE', 'cpu_key_t key = {.slot = len}; ' +
+        'bpf_probe_read(&key.cpu, sizeof(key.cpu), &task->wake_cpu);' +
+        'dist.increment(key);')
+else:
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, unsigned int);')
+    bpf_text = bpf_text.replace('STORE', 'dist.increment(len);')
+# code substitutions
+if debug:
+    print(bpf_text)
+# load BPF program
+b = BPF(text=bpf_text)
+# initialize BPF & perf_events
+b = BPF(text=bpf_text)
+b.attach_perf_event(ev_type=PerfType.SOFTWARE,
+    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
+    sample_period=0, sample_freq=frequency)
+print("Sampling run queue length... Hit Ctrl-C to end.")
+# output
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+    if args.runqocc:
+        if args.cpus:
+            # run queue occupancy, per-CPU summary
+            idle = {}
+            queued = {}
+            cpumax = 0
+            for k, v in dist.items():
+                if k.cpu > cpumax:
+                    cpumax = k.cpu
+            for c in range(0, cpumax + 1):
+                idle[c] = 0
+                queued[c] = 0
+            for k, v in dist.items():
+                if k.slot == 0:
+                    idle[k.cpu] += v.value
+                else:
+                    queued[k.cpu] += v.value
+            for c in range(0, cpumax + 1):
+                samples = idle[c] + queued[c]
+                if samples:
+                    runqocc = float(queued[c]) / samples
+                else:
+                    runqocc = 0
+                print("runqocc, CPU %-3d %6.2f%%" % (c, 100 * runqocc))
+        else:
+            # run queue occupancy, system-wide summary
+            idle = 0
+            queued = 0
+            for k, v in dist.items():
+                if k.value == 0:
+                    idle += v.value
+                else:
+                    queued += v.value
+            samples = idle + queued
+            if samples:
+                runqocc = float(queued) / samples
+            else:
+                runqocc = 0
+            print("runqocc: %0.2f%%" % (100 * runqocc))
+    else:
+        # run queue length histograms
+        dist.print_linear_hist("runqlen", "cpu")
+    dist.clear()
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
--- a/tools/runqlen_example.txt
+++ b/tools/runqlen_example.txt