Try to encourage the AGRP fast-path to get inlined

Put the common-case (where we don't do any work) in an inline-able function, and keep the slow stuff hidden.

Try to encourage the AGRP fast-path to get inlined
Put the common-case (where we don't do any work) in an inline-able function, and keep the slow stuff hidden.
7fd78399 · Kevin Modzelewski · 544d93c7 · 7fd78399 · 7fd78399 · 7fd78399
Commit 7fd78399 authored Jul 07, 2015 by Kevin Modzelewski
4 changed files
--- a/section_ordering.txt
+++ b/section_ordering.txt
@@ -11,6 +11,7 @@
 .text._ZN6pyston15objectNewNoArgsEPNS_10BoxedClassE
 .text._PyIndex_Check
 .text._ZN6pyston9threading21allowGLReadPreemptionEv
+.text._ZN6pyston9threading22_allowGLReadPreemptionEv
 .text._ZN6pyston9getOpNameEi
 .text._ZN6pyston8callFuncEPNS_17BoxedFunctionBaseEPNS_15CallRewriteArgsENS_11ArgPassSpecEPNS_3BoxES6_S6_PS6_PKSt6vectorIPKSsSaISA_EE
 .text._ZN6pyston2gc9GCVisitor5visitEPv

--- a/src/core/threading.cpp
+++ b/src/core/threading.cpp
@@ -14,7 +14,6 @@

 #include "core/threading.h"

-#include <atomic>
 #include <cstdio>
 #include <cstdlib>
 #include <err.h>
@@ -481,7 +480,7 @@ extern "C" void endAllowThreads() noexcept {

 static pthread_mutex_t gil = PTHREAD_MUTEX_INITIALIZER;

-static std::atomic<int> threads_waiting_on_gil(0);
+std::atomic<int> threads_waiting_on_gil(0);
 static pthread_cond_t gil_acquired = PTHREAD_COND_INITIALIZER;

 extern "C" void PyEval_ReInitThreads() noexcept {
@@ -524,9 +523,6 @@ void releaseGLWrite() {
    pthread_mutex_unlock(&gil);
 }

-#define GIL_CHECK_INTERVAL 1000
-// Note: this doesn't need to be an atomic, since it should
-// only be accessed by the thread that holds the gil:
 int gil_check_count = 0;

 // TODO: this function is fair in that it forces a thread to give up the GIL
@@ -535,37 +531,19 @@ int gil_check_count = 0;
 // switching back and forth, and a third that never gets run.
 // We could enforce fairness by having a FIFO of events (implementd with mutexes?)
 // and make sure to always wake up the longest-waiting one.
-void allowGLReadPreemption() {
-#if ENABLE_SAMPLING_PROFILER
-    if (unlikely(sigprof_pending)) {
-        // Output multiple stacktraces if we received multiple signals
-        // between being able to handle it (such as being in LLVM or the GC),
-        // to try to fully account for that time.
-        while (sigprof_pending) {
-            _printStacktrace();
-            sigprof_pending--;
-        }
-    }
-#endif
+void _allowGLReadPreemption() {
+    assert(gil_check_count >= GIL_CHECK_INTERVAL);
+    gil_check_count = 0;

-    // Double-checked locking: first read with no ordering constraint:
-    if (!threads_waiting_on_gil.load(std::memory_order_relaxed))
+    // Double check this, since if we are wrong about there being a thread waiting on the gil,
+    // we're going to get stuck in the following pthread_cond_wait:
+    if (!threads_waiting_on_gil.load(std::memory_order_seq_cst))
        return;

-    gil_check_count++;
-    if (gil_check_count >= GIL_CHECK_INTERVAL) {
-        gil_check_count = 0;
-
-        // Double check this, since if we are wrong about there being a thread waiting on the gil,
-        // we're going to get stuck in the following pthread_cond_wait:
-        if (!threads_waiting_on_gil.load(std::memory_order_seq_cst))
-            return;
-
-        threads_waiting_on_gil++;
-        pthread_cond_wait(&gil_acquired, &gil);
-        threads_waiting_on_gil--;
-        pthread_cond_signal(&gil_acquired);
-    }
+    threads_waiting_on_gil++;
+    pthread_cond_wait(&gil_acquired, &gil);
+    threads_waiting_on_gil--;
+    pthread_cond_signal(&gil_acquired);
 }
 #elif THREADING_USE_GRWL
 static pthread_rwlock_t grwl = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;

--- a/src/core/threading.h
+++ b/src/core/threading.h
@@ -15,6 +15,7 @@
 #ifndef PYSTON_CORE_THREADING_H
 #define PYSTON_CORE_THREADING_H

+#include <atomic>
 #include <cstdint>
 #include <cstring>
 #include <ucontext.h>
@@ -81,7 +82,36 @@ void acquireGLRead();
 void releaseGLRead();
 void acquireGLWrite();
 void releaseGLWrite();
-void allowGLReadPreemption();
+void _allowGLReadPreemption();
+
+#define GIL_CHECK_INTERVAL 1000
+// Note: this doesn't need to be an atomic, since it should
+// only be accessed by the thread that holds the gil:
+extern int gil_check_count;
+extern std::atomic<int> threads_waiting_on_gil;
+inline void allowGLReadPreemption() {
+#if ENABLE_SAMPLING_PROFILER
+    if (unlikely(sigprof_pending)) {
+        // Output multiple stacktraces if we received multiple signals
+        // between being able to handle it (such as being in LLVM or the GC),
+        // to try to fully account for that time.
+        while (sigprof_pending) {
+            _printStacktrace();
+            sigprof_pending--;
+        }
+    }
+#endif
+
+    // Double-checked locking: first read with no ordering constraint:
+    if (!threads_waiting_on_gil.load(std::memory_order_relaxed))
+        return;
+
+    gil_check_count++;
+    if (likely(gil_check_count < GIL_CHECK_INTERVAL))
+        return;
+
+    _allowGLReadPreemption();
+}
 // Note: promoteGL is free to drop the lock and then reacquire
 void promoteGL();
 void demoteGL();

--- a/src/runtime/inline/link_forcer.cpp
+++ b/src/runtime/inline/link_forcer.cpp
@@ -17,6 +17,7 @@

 #include "codegen/irgen/hooks.h"
 #include "core/ast.h"
+#include "core/threading.h"
 #include "core/types.h"
 #include "gc/heap.h"
 #include "runtime/complex.h"
@@ -138,6 +139,8 @@ void force() {
    FORCE(boxedLocalsGet);
    FORCE(boxedLocalsDel);

+    FORCE(threading::allowGLReadPreemption);
+
    // FORCE(listIter);
 }
 }