Python-level sampling profiler

Uses setitimer() to set a recurring signal, and prints a Python stacktrace at the next safepoint aka allowGLReadPreemption. This is not great since allowGLReadPreemption can happen a decent amount later than the signal. (I'll play around with trying to get the signal to be acted on sooner, but it might be better to wait for full signal-handling support.) Still, it seems to provide some decent high-level info. For example, half of the startup time of the django-template benchmark seems to be due to regular expressions.

Python-level sampling profiler
Uses setitimer() to set a recurring signal, and prints a Python stacktrace at the next safepoint aka allowGLReadPreemption. This is not great since allowGLReadPreemption can happen a decent amount later than the signal. (I'll play around with trying to get the signal to be acted on sooner, but it might be better to wait for full signal-handling support.) Still, it seems to provide some decent high-level info. For example, half of the startup time of the django-template benchmark seems to be due to regular expressions.
7596e61a · Kevin Modzelewski · d618cd63 · 7596e61a · 7596e61a · 7596e61a
Commit 7596e61a authored May 29, 2015 by Kevin Modzelewski
13 changed files
--- a/src/codegen/ast_interpreter.cpp
+++ b/src/codegen/ast_interpreter.cpp
@@ -300,8 +300,6 @@ public:
 };

 Value ASTInterpreter::execute(ASTInterpreter& interpreter, CFGBlock* start_block, AST_stmt* start_at) {
-    threading::allowGLReadPreemption();
-
    STAT_TIMER(t0, "us_timer_astinterpreter_execute");

    void* frame_addr = __builtin_frame_address(0);
@@ -315,6 +313,11 @@ Value ASTInterpreter::execute(ASTInterpreter& interpreter, CFGBlock* start_block
        start_at = start_block->body[0];
    }

+    // Important that this happens after RegisterHelper:
+    interpreter.current_inst = start_at;
+    threading::allowGLReadPreemption();
+    interpreter.current_inst = NULL;
+
    interpreter.current_block = start_block;
    bool started = false;
    for (auto s : start_block->body) {

--- a/src/codegen/codegen.h
+++ b/src/codegen/codegen.h
@@ -87,6 +87,8 @@ extern GlobalState g;
 // in runtime_hooks.cpp:
 void initGlobalFuncs(GlobalState& g);

+extern int sigprof_pending;
+
 DS_DECLARE_RWLOCK(codegen_rwlock);
 }


--- a/src/codegen/entry.cpp
+++ b/src/codegen/entry.cpp
@@ -55,6 +55,8 @@

 namespace pyston {

+int sigprof_pending = 0;
+
 GlobalState g;

 extern "C" {
@@ -356,6 +358,10 @@ static void handle_sigusr1(int signum) {
    _printStacktrace();
 }

+static void handle_sigprof(int signum) {
+    sigprof_pending++;
+}
+
 static void handle_sigint(int signum) {
    assert(signum == SIGINT);
    // TODO: this should set a flag saying a KeyboardInterrupt is pending.
@@ -461,6 +467,13 @@ void initCodegen() {
    signal(SIGUSR1, &handle_sigusr1);
    signal(SIGINT, &handle_sigint);

+#if ENABLE_SAMPLING_PROFILER
+    struct itimerval prof_timer;
+    prof_timer.it_value.tv_sec = prof_timer.it_interval.tv_sec = 0;
+    prof_timer.it_value.tv_usec = prof_timer.it_interval.tv_usec = 1000;
+    signal(SIGPROF, handle_sigprof);
+    setitimer(ITIMER_PROF, &prof_timer, NULL);
+#endif

    // There are some parts of llvm that are only configurable through command line args,
    // so construct a fake argc/argv pair and pass it to the llvm command line machinery:

--- a/src/codegen/irgen.cpp
+++ b/src/codegen/irgen.cpp
@@ -594,7 +594,7 @@ static void emitBBs(IRGenState* irstate, TypeAnalysis* types, const OSREntryDesc

            // Function-entry safepoint:
            // TODO might be more efficient to do post-call safepoints?
-            generator->doSafePoint();
+            generator->doSafePoint(block->body[0]);
        } else if (entry_descriptor && block == entry_descriptor->backedge->target) {
            assert(block->predecessors.size() > 1);
            assert(osr_entry_block);
@@ -763,7 +763,7 @@ static void emitBBs(IRGenState* irstate, TypeAnalysis* types, const OSREntryDesc
            if (predecessor->idx > block->idx) {
                // Loop safepoint:
                // TODO does it matter which side of the backedge these are on?
-                generator->doSafePoint();
+                generator->doSafePoint(block->body[0]);
                break;
            }
        }

--- a/src/codegen/irgen/irgenerator.cpp
+++ b/src/codegen/irgen/irgenerator.cpp
@@ -2618,7 +2618,15 @@ public:
        }
    }

-    void doSafePoint() override { emitter.getBuilder()->CreateCall(g.funcs.allowGLReadPreemption); }
+    void doSafePoint(AST_stmt* next_statement) override {
+// If the sampling profiler is turned on (and eventually, destructors), we need frame-introspection
+// support while in allowGLReadPreemption:
+#if ENABLE_SAMPLING_PROFILER
+        emitter.createCall(UnwindInfo(next_statement, NULL), g.funcs.allowGLReadPreemption);
+#else
+        emitter.getBuilder()->CreateCall(g.funcs.allowGLReadPreemption);
+#endif
+    }
 };

 IRGenerator* createIRGenerator(IRGenState* irstate, std::unordered_map<CFGBlock*, llvm::BasicBlock*>& entry_blocks,

--- a/src/codegen/irgen/irgenerator.h
+++ b/src/codegen/irgen/irgenerator.h
@@ -127,7 +127,7 @@ public:
    virtual void copySymbolsFrom(SymbolTable* st) = 0;
    virtual void run(const CFGBlock* block) = 0; // primary entry point
    virtual EndingState getEndingSymbolTable() = 0;
-    virtual void doSafePoint() = 0;
+    virtual void doSafePoint(AST_stmt* next_statement) = 0;
    virtual void addFrameStackmapArgs(PatchpointInfo* pp, AST_stmt* current_stmt,
                                      std::vector<llvm::Value*>& stackmap_args) = 0;
 };

--- a/src/codegen/parser.cpp
+++ b/src/codegen/parser.cpp
@@ -1000,7 +1000,7 @@ AST_Module* parse_file(const char* fn) {

    if (ENABLE_PYPA_PARSER) {
        AST_Module* rtn = pypa_parse(fn);
-        RELEASE_ASSERT(rtn, "unknown parse error");
+        RELEASE_ASSERT(rtn, "unknown parse error (possibly: '%s'?)", strerror(errno));
        return rtn;
    }


--- a/src/codegen/unwinding.cpp
+++ b/src/codegen/unwinding.cpp
@@ -345,7 +345,7 @@ public:
                    return reinterpret_cast<AST_stmt*>(readLocation(e.locations[0]));
                }
            }
-            abort();
+            RELEASE_ASSERT(0, "no frame info found at offset 0x%x / ip 0x%lx!", offset, ip);
        } else if (id.type == PythonFrameId::INTERPRETED) {
            return getCurrentStatementForInterpretedFrame((void*)id.bp);
        }

--- a/src/core/options.h
+++ b/src/core/options.h
@@ -47,6 +47,8 @@ extern bool ENABLE_ICS, ENABLE_ICGENERICS, ENABLE_ICGETITEMS, ENABLE_ICSETITEMS,

 // Due to a temporary LLVM limitation, represent bools as i64's instead of i1's.
 extern bool BOOLS_AS_I64;
+
+#define ENABLE_SAMPLING_PROFILER 0
 }
 }


--- a/src/core/threading.cpp
+++ b/src/core/threading.cpp
@@ -24,12 +24,14 @@

 #include "Python.h"

+#include "codegen/codegen.h" // sigprof_pending
 #include "core/common.h"
 #include "core/options.h"
 #include "core/stats.h"
 #include "core/thread_utils.h"
 #include "core/util.h"
 #include "gc/collector.h"
+#include "runtime/objmodel.h" // _printStacktrace

 namespace pyston {
 namespace threading {
@@ -528,6 +530,18 @@ int gil_check_count = 0;
 // We could enforce fairness by having a FIFO of events (implementd with mutexes?)
 // and make sure to always wake up the longest-waiting one.
 void allowGLReadPreemption() {
+#if ENABLE_SAMPLING_PROFILER
+    if (unlikely(sigprof_pending)) {
+        // Output multiple stacktraces if we received multiple signals
+        // between being able to handle it (such as being in LLVM or the GC),
+        // to try to fully account for that time.
+        while (sigprof_pending) {
+            _printStacktrace();
+            sigprof_pending--;
+        }
+    }
+#endif
+
    // Double-checked locking: first read with no ordering constraint:
    if (!threads_waiting_on_gil.load(std::memory_order_relaxed))
        return;

--- a/src/runtime/stacktrace.cpp
+++ b/src/runtime/stacktrace.cpp
@@ -101,7 +101,16 @@ void raiseSyntaxErrorHelper(const std::string& file, const std::string& func, AS
 }

 void _printStacktrace() {
+    static bool recursive = false;
+
+    if (recursive) {
+        fprintf(stderr, "_printStacktrace ran into an issue; refusing to try it again!\n");
+        return;
+    }
+
+    recursive = true;
    printTraceback(getTraceback());
+    recursive = false;
 }

 // where should this go...

--- a/src/runtime/traceback.cpp
+++ b/src/runtime/traceback.cpp
@@ -85,6 +85,7 @@ void printTraceback(Box* b) {
                fprintf(stderr, "    %.*s\n", (int)r, ptr);
                free(buf);
            }
+            fclose(f);
        }
    }
 }

--- a/tools/process_stackdump.py
+++ b/tools/process_stackdump.py
+import sys
+
+if __name__ == "__main__":
+    fn = sys.argv[1]
+
+    tracebacks = []
+    cur_traceback = []
+    with open(fn) as f:
+        for l in f:
+            if l.startswith("Traceback"):
+                if cur_traceback:
+                    tracebacks.append(''.join(cur_traceback).strip())
+                cur_traceback = []
+            else:
+                cur_traceback.append(l)
+    if cur_traceback:
+        tracebacks.append(''.join(cur_traceback).strip())
+        cur_traceback = []
+
+    counts = {}
+    for t in tracebacks:
+        # dedupe on:
+        # key = t # full traceback
+        # key = '\n'.join(t.split('\n')[-8:]) # last 4 stack frames
+        # key = '\n'.join(t.split('\n')[-2:]) # last stack frame
+        key = t.split('  File "')[-1].split()[0][:-2] # filename of last stack trace
+        counts[key] = counts.get(key, 0) + 1
+
+    n = len(tracebacks)
+
+    NUM_DISPLAY = 20
+
+    entries = sorted(counts.items(), key=lambda (k, v): v)
+
+    if len(counts) > NUM_DISPLAY:
+        num_hidden = 0
+        counts_hidden = 0
+
+        for k, v in entries[:-NUM_DISPLAY]:
+            num_hidden += 1
+            counts_hidden += v
+        print "Hiding %d entries that occurred %d (%.1f%%) times" % (num_hidden, counts_hidden, 100.0 * counts_hidden / n)
+
+    for k, v in sorted(counts.items(), key=lambda (k, v): v)[-NUM_DISPLAY:]:
+        print
+        print "Occurs %d (%.1f%%) times:" % (v, 100.0 * v / n)
+        print k
+
+    print "Total tracebacks:", n