Commit 94061fa5 authored by Kevin Modzelewski

Add enough locking to make the GRWL somewhat safe

Add some basic locking to:
- code generation (one lock for all of it)
- garbage collection (spin lock for allocations, global serialization for collections)
- lists (mutex per list object)

The GRWL build can now run some simple tests (microbenchmarks/thread_contention.py and thread_uncontended.py).
Performance is not great yet.
parent aa2b47ec
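For context (a sketch of the idea, not the exact code in src/core/threading.cpp): the GRWL replaces the GIL's single mutex with a global pthread read-write lock. Ordinary Python threads hold the read side, so they can run in parallel; anything that needs exclusive access, like a GC collection, promotes to the write side and waits for the readers to drain. Using the acquireGLRead/promoteGL names this commit declares in core/threading.h:

```
#include <pthread.h>

// Sketch of the GRWL idea; the real implementation also tracks
// per-thread saved state and a writers_waiting counter.
static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;

void acquireGLRead() { pthread_rwlock_rdlock(&grwl); }   // normal execution
void releaseGLRead() { pthread_rwlock_unlock(&grwl); }

void promoteGL() {  // e.g. entering a GC collection
    pthread_rwlock_unlock(&grwl);
    pthread_rwlock_wrlock(&grwl);
}

void demoteGL() {   // collection done; back to shared mode
    pthread_rwlock_unlock(&grwl);
    pthread_rwlock_rdlock(&grwl);
}
```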
......@@ -208,3 +208,31 @@ sudo apt-get install rlwrap
```
and when you do `make run`, the Make system will invoke rlwrap. If you want to invoke the repl manually, you can do `rlwrap ./pyston`
### ninja-based LLVM build
Ninja is supposed to be faster than make; I've only tried it very briefly, and it does seem to be faster when modifying LLVM files. May or may not be worth using; thought I'd jot down my notes though:
```
cd ~/pyston_deps
git clone https://github.com/martine/ninja.git
cd ninja
git checkout v1.4.0
./bootstrap.py
```
```
cd ~/pyston_deps
wget http://www.cmake.org/files/v3.0/cmake-3.0.0.tar.gz
tar -xzf cmake-3.0.0.tar.gz
cd cmake-3.0.0
./configure
make -j4
```
```
cd ~/pyston_deps
mkdir llvm-trunk-cmake
cd llvm-trunk-cmake
CXX=g++ CC=gcc PATH=~/pyston_deps/gcc-4.8.2-install/bin:$PATH:~/pyston_deps/ninja CMAKE_MAKE_PROGRAM=~/pyston_deps/ninja/ninja ~/pyston_deps/cmake-3.0.0/bin/cmake ../llvm-trunk -G Ninja -DLLVM_TARGETS_TO_BUILD=host -DCMAKE_BUILD_TYPE=RELEASE -DLLVM_ENABLE_ASSERTIONS=ON
~/pyston_deps/ninja/ninja # runs in parallel
```
microbenchmarks/thread_contention.py:

from thread import start_new_thread
import time

work = []
done = []

def run(idx, num):
    print "thread %d starting" % idx, work
    for i in xrange(num):
        t = work.pop()
        work.append(t - 1)
        if i % 100000 == 0:
            print idx, i
    done.append(num)
    print "thread %d done" % idx

print "starting!"
nthreads = 1
N = 20000000 / nthreads
for i in xrange(nthreads):
    work.append(N)
for i in xrange(nthreads):
    t = start_new_thread(run, (i, N))

while len(done) < nthreads:
    time.sleep(0.1)

# print work
assert sum(work) == 0
print work
microbenchmarks/thread_uncontended.py:

from thread import start_new_thread
import time

done = []

def run(idx, work, num):
    print "thread %d starting" % idx, work
    for i in xrange(num):
        # t = work.pop()
        # work.append(t - 1)
        if i % 100000 == 0:
            print idx, i
    done.append(num)
    print "thread %d done" % idx

print "starting!"
nthreads = 1
N = 20000000 / nthreads
for i in xrange(nthreads):
    t = start_new_thread(run, (i, [N], N))

while len(done) < nthreads:
    time.sleep(0.1)
# print work
......@@ -710,12 +710,18 @@ endef
RUN_DEPS := ext

.PHONY: run run_release profile
run: $(RUN_DEPS)
run: pyston_dbg $(RUN_DEPS)
	if which rlwrap >/dev/null; then\
		rlwrap ./pyston_dbg $(ARGS) ;\
	else \
		./pyston_dbg $(ARGS) ;\
	fi

dbg: pyston_dbg $(RUN_DEPS)
	if which rlwrap >/dev/null; then\
		rlwrap zsh -c 'ulimit -v $(MAX_DBG_MEM_KB); $(GDB) $(GDB_CMDS) --args ./pyston_dbg $(ARGS)' ; \
	else \
		zsh -c 'ulimit -v $(MAX_DBG_MEM_KB); $(GDB) $(GDB_CMDS) --args ./pyston_dbg $(ARGS)' ; \
	fi

run_release: pyston $(RUN_DEPS)
	./pyston $(ARGS)

profile: pyston_profile $(RUN_DEPS)
......@@ -785,7 +791,7 @@ $(call make_search,pprof_%)
perf_%: %.py pyston
perf record -g -- ./pyston -q -p $(ARGS) $<
perf report -v -g flat,1000 | bash $(TOOLS_DIR)/cumulate.sh | less -S
perf report -v -n -g flat,1000 | bash $(TOOLS_DIR)/cumulate.sh | less -S
$(call make_search,perf_%)
perf_dbg_%: %.py pyston_dbg
perf record -g -- ./pyston_dbg -q -p $(ARGS) $<
......
......@@ -28,6 +28,8 @@
namespace pyston {
DS_DEFINE_RWLOCK(codegen_rwlock);
void FunctionAddressRegistry::registerFunction(const std::string& name, void* addr, int length,
llvm::Function* llvm_func) {
assert(addr);
......
......@@ -18,6 +18,7 @@
#include <unordered_map>
#include "codegen/runtime_hooks.h"
#include "core/threading.h"
#include "core/types.h"
namespace llvm {
......@@ -85,6 +86,8 @@ extern GlobalState g;
void initGlobalFuncs(GlobalState& g);
const LineInfo* getLineInfoFor(uint64_t inst_addr);
DS_DECLARE_RWLOCK(codegen_rwlock);
}
#endif
......@@ -140,6 +140,7 @@ static void compileIR(CompiledFunction* cf, EffortLevel::EffortLevel effort) {
// Compiles a new version of the function with the given signature and adds it to the list;
// should only be called after checking to see if the other versions would work.
// The codegen_lock needs to be held in W mode before calling this function:
CompiledFunction* compileFunction(CLFunction* f, FunctionSpecialization* spec, EffortLevel::EffortLevel effort,
const OSREntryDescriptor* entry) {
Timer _t("for compileFunction()");
......@@ -239,6 +240,8 @@ CompiledFunction* compileFunction(CLFunction* f, FunctionSpecialization* spec, E
}
void compileAndRunModule(AST_Module* m, BoxedModule* bm) {
LOCK_REGION(codegen_rwlock.asWrite());
Timer _t("for compileModule()");
ScopingAnalysis* scoping = runScopingAnalysis(m);
......@@ -268,6 +271,8 @@ void compileAndRunModule(AST_Module* m, BoxedModule* bm) {
/// The cf must be an active version in its parent's CLFunction; the given
/// version will be replaced by the new version, which will be returned.
static CompiledFunction* _doReopt(CompiledFunction* cf, EffortLevel::EffortLevel new_effort) {
LOCK_REGION(codegen_rwlock.asWrite());
assert(cf->clfunc->versions.size());
assert(cf);
......@@ -304,6 +309,8 @@ static CompiledFunction* _doReopt(CompiledFunction* cf, EffortLevel::EffortLevel
static StatCounter stat_osrexits("OSR exits");
void* compilePartialFunc(OSRExit* exit) {
LOCK_REGION(codegen_rwlock.asWrite());
assert(exit);
assert(exit->parent_cf);
assert(exit->parent_cf->effort < EffortLevel::MAXIMAL);
......
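compileFunction() itself does not take the lock (per the comment above, the caller must already hold it), so every entry point that can reach it (compileAndRunModule, _doReopt, compilePartialFunc, and later pickVersion) grabs the write side first. A minimal sketch of that calling convention; the wrapper function here is illustrative, not part of the commit:

```
static CompiledFunction* compileUnderLock(CLFunction* f, FunctionSpecialization* spec) {
    // Hold the codegen lock in write mode for the whole compilation;
    // the RAII guard releases it at scope exit.
    LOCK_REGION(codegen_rwlock.asWrite());
    return compileFunction(f, spec, initialEffort(), NULL);
}
```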
......@@ -216,7 +216,7 @@ threading::PerThread<root_stack_t> thread_local root_stack(&root_stack_set);
void gatherInterpreterRoots(GCVisitor* visitor) {
// In theory this lock should be superfluous since we should only call this
// inside a sequential section, but lock it anyway:
threading::LockedRegion _lock(&root_stack_set.lock);
LOCK_REGION(&root_stack_set.lock);
for (auto& p : root_stack_set.map) {
for (const SymMap* sym_map : *p.second) {
......
......@@ -15,6 +15,10 @@
#ifndef PYSTON_CODEGEN_RUNTIMEHOOKS_H
#define PYSTON_CODEGEN_RUNTIMEHOOKS_H
// This file doesn't actually need to include core/types.h, but including it works around this clang bug:
// http://lists.cs.uiuc.edu/pipermail/cfe-dev/2014-June/037519.html
#include "core/types.h"
namespace llvm {
class Value;
}
......
......@@ -29,6 +29,8 @@
#define _STRINGIFY(N) #N
#define STRINGIFY(N) _STRINGIFY(N)
#define _CAT(A, B) A##B
#define CAT(A, B) _CAT(A, B)
// GCC and clang handle always_inline very differently;
// we mostly only care about it for the stdlib, so just remove the attributes
......
......@@ -16,24 +16,109 @@
#define PYSTON_CORE_THREADUTILS_H
#include <pthread.h>
#include <unordered_map>
namespace pyston {
namespace threading {
class LockedRegion {
template <typename T> class _LockedRegion {
private:
    pthread_mutex_t* mutex;
    T* const mutex;

public:
    LockedRegion(pthread_mutex_t* mutex) : mutex(mutex) { pthread_mutex_lock(mutex); }
    ~LockedRegion() { pthread_mutex_unlock(mutex); }
    _LockedRegion(T* mutex) : mutex(mutex) { mutex->lock(); }
    ~_LockedRegion() { mutex->unlock(); }
};

template <typename T> _LockedRegion<T> _makeLockedRegion(T* mutex) {
    return _LockedRegion<T>(mutex);
}
template <typename T> _LockedRegion<T> _makeLockedRegion(T& mutex) {
    return _LockedRegion<T>(&mutex);
}
#define LOCK_REGION(lock) auto CAT(_lock_, __LINE__) = pyston::threading::_makeLockedRegion(lock)

class NopLock {
public:
    void lock() {}
    void unlock() {}

    NopLock* asRead() { return this; }
    NopLock* asWrite() { return this; }
};

class PthreadFastMutex {
private:
    pthread_mutex_t mutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP;

public:
    void lock() { pthread_mutex_lock(&mutex); }
    void unlock() { pthread_mutex_unlock(&mutex); }

    PthreadFastMutex* asRead() { return this; }
    PthreadFastMutex* asWrite() { return this; }
};

class PthreadMutex {
private:
    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

public:
    void lock() { pthread_mutex_lock(&mutex); }
    void unlock() { pthread_mutex_unlock(&mutex); }

    PthreadMutex* asRead() { return this; }
    PthreadMutex* asWrite() { return this; }
};

class PthreadRWLock {
private:
    pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;

public:
    class PthreadRWLockRead {
    private:
        pthread_rwlock_t rwlock;
        PthreadRWLockRead() = delete;

    public:
        void lock() { pthread_rwlock_rdlock(&rwlock); }
        void unlock() { pthread_rwlock_unlock(&rwlock); }
    };

    class PthreadRWLockWrite {
    private:
        pthread_rwlock_t rwlock;
        PthreadRWLockWrite() = delete;

    public:
        void lock() { pthread_rwlock_wrlock(&rwlock); }
        void unlock() { pthread_rwlock_unlock(&rwlock); }
    };

    PthreadRWLockRead* asRead() { return reinterpret_cast<PthreadRWLockRead*>(this); }
    PthreadRWLockWrite* asWrite() { return reinterpret_cast<PthreadRWLockWrite*>(this); }
};

class PthreadSpinLock {
private:
    pthread_spinlock_t spinlock;

public:
    PthreadSpinLock() { pthread_spin_init(&spinlock, false); }

    void lock() { pthread_spin_lock(&spinlock); }
    void unlock() { pthread_spin_unlock(&spinlock); }

    PthreadSpinLock* asRead() { return this; }
    PthreadSpinLock* asWrite() { return this; }
};

template <typename T> class PerThreadSet {
public:
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    PthreadFastMutex lock;
    std::unordered_map<pthread_t, T*> map;
};
......@@ -46,13 +131,13 @@ public:
    T value;

    PerThread(PerThreadSet<T>* set) : set(set), self(pthread_self()) {
        LockedRegion _lock(&set->lock);
        LOCK_REGION(&set->lock);
        set->map[self] = &value;
    }

    ~PerThread() {
        LockedRegion _lock(&set->lock);
        LOCK_REGION(&set->lock);
        assert(set->map.count(self) == 1);
        set->map.erase(self);
......
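All of these classes expose the same lock()/unlock() interface, which is what lets LOCK_REGION be generic: CAT(_lock_, __LINE__) manufactures a unique variable name so two regions can coexist in one scope, and _makeLockedRegion has pointer and reference overloads so both LOCK_REGION(&set->lock) and LOCK_REGION(self->lock.asWrite()) work. Roughly, the expansion looks like:

```
// LOCK_REGION(&threading_lock) appearing on, say, line 57 expands to:
auto _lock_57 = pyston::threading::_makeLockedRegion(&threading_lock);
// ...i.e. a _LockedRegion<PthreadFastMutex> whose constructor calls
// threading_lock.lock() and whose destructor calls threading_lock.unlock()
// when _lock_57 goes out of scope.
```

Note also the asRead()/asWrite() trick on PthreadRWLock: the nested PthreadRWLockRead/PthreadRWLockWrite classes wrap the same pthread_rwlock_t layout, so reinterpret_cast'ing the parent object selects which lock()/unlock() flavor the guard will call, at the cost of relying on layout compatibility.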
......@@ -46,7 +46,7 @@ int tgkill(int tgid, int tid, int sig) {
// and wait until they start up.
int num_starting_threads(0);
static pthread_mutex_t threading_lock = PTHREAD_MUTEX_INITIALIZER;
PthreadFastMutex threading_lock;
struct ThreadInfo {
// "bottom" in the sense of a stack, which in a down-growing stack is the highest address:
void* stack_bottom;
......@@ -86,14 +86,14 @@ std::vector<ThreadState> getAllThreadStates() {
// though I suppose that will have been taken care of
// by the caller of this function.
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
while (true) {
// TODO shouldn't busy-wait:
if (num_starting_threads) {
pthread_mutex_unlock(&threading_lock);
threading_lock.unlock();
sleep(0);
pthread_mutex_lock(&threading_lock);
threading_lock.lock();
} else {
break;
}
......@@ -129,9 +129,9 @@ std::vector<ThreadState> getAllThreadStates() {
// TODO shouldn't busy-wait:
while (signals_waiting) {
pthread_mutex_unlock(&threading_lock);
threading_lock.unlock();
sleep(0);
pthread_mutex_lock(&threading_lock);
threading_lock.lock();
}
assert(num_starting_threads == 0);
......@@ -140,7 +140,7 @@ std::vector<ThreadState> getAllThreadStates() {
}
static void _thread_context_dump(int signum, siginfo_t* info, void* _context) {
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
ucontext_t* context = static_cast<ucontext_t*>(_context);
......@@ -169,7 +169,7 @@ static void* _thread_start(void* _arg) {
delete arg;
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
pid_t tid = gettid();
pthread_t current_thread = pthread_self();
......@@ -207,7 +207,7 @@ static void* _thread_start(void* _arg) {
void* rtn = start_func(arg1, arg2, arg3);
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
current_threads.erase(gettid());
saved_thread_states.erase(gettid());
......@@ -220,7 +220,7 @@ static void* _thread_start(void* _arg) {
intptr_t start_thread(void* (*start_func)(Box*, Box*, Box*), Box* arg1, Box* arg2, Box* arg3) {
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
num_starting_threads++;
}
......@@ -282,7 +282,7 @@ static void* find_stack() {
intptr_t call_frame_base;
void registerMainThread() {
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
// Would be nice if we could set this to the pthread start_thread,
// since _thread_start doesn't always show up in the traceback.
......@@ -316,7 +316,7 @@ GLAllowThreadsReadRegion::GLAllowThreadsReadRegion() {
releaseGLRead();
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
ThreadStateInternal& state = saved_thread_states[gettid()];
assert(!state.valid);
......@@ -327,7 +327,7 @@ GLAllowThreadsReadRegion::GLAllowThreadsReadRegion() {
GLAllowThreadsReadRegion::~GLAllowThreadsReadRegion() {
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
saved_thread_states[gettid()].valid = false;
}
......@@ -337,6 +337,10 @@ GLAllowThreadsReadRegion::~GLAllowThreadsReadRegion() {
#if THREADING_USE_GIL
#if THREADING_USE_GRWL
#error "Can't turn on both the GIL and the GRWL!"
#endif
static pthread_mutex_t gil = PTHREAD_MUTEX_INITIALIZER;
static std::atomic<int> threads_waiting_on_gil(0);
......@@ -367,9 +371,7 @@ void allowGLReadPreemption() {
acquireGLRead();
}
}
#endif
#if THREADING_USE_GRWL
#elif THREADING_USE_GRWL
static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;
enum class GRWLHeldState {
......@@ -420,16 +422,23 @@ void demoteGL() {
acquireGLRead();
}
static __thread int gl_check_count = 0;
void allowGLReadPreemption() {
assert(grwl_state == GRWLHeldState::R);
if (!writers_waiting.load(std::memory_order_relaxed))
gl_check_count++;
if (gl_check_count < 1000)
return;
gl_check_count = 0;
if (__builtin_expect(!writers_waiting.load(std::memory_order_relaxed), 1))
return;
pthread_rwlock_unlock(&grwl);
// printf("waiters!\n");
sleep(0);
pthread_rwlock_rdlock(&grwl);
}
#endif
} // namespace threading
......
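The reworked allowGLReadPreemption() amortizes its own cost: instead of consulting writers_waiting on every call, each thread bumps a thread-local counter and only checks every 1000th call, and only cycles the read lock when a writer is actually waiting. The same pattern in isolation (a sketch assuming the grwl rwlock and writers_waiting atomic that this file uses):

```
#include <atomic>
#include <pthread.h>
#include <unistd.h>

static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;
static std::atomic<int> writers_waiting(0);
static __thread int gl_check_count = 0;

void allowGLReadPreemption() {
    // Cheap path, taken 999 times out of 1000: one thread-local increment.
    if (++gl_check_count < 1000)
        return;
    gl_check_count = 0;

    // Usually no writer is waiting; hint the branch predictor accordingly.
    if (__builtin_expect(!writers_waiting.load(std::memory_order_relaxed), 1))
        return;

    // A writer (e.g. the collector) is waiting: release the read lock,
    // yield, and re-acquire once the writer has had its turn.
    pthread_rwlock_unlock(&grwl);
    sleep(0);
    pthread_rwlock_rdlock(&grwl);
}
```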
......@@ -20,6 +20,9 @@
#include <ucontext.h>
#include <vector>
#include "core/common.h"
#include "core/thread_utils.h"
namespace pyston {
class Box;
......@@ -58,6 +61,22 @@ void* getStackBottom();
#define THREADING_USE_GRWL 0
#define THREADING_SAFE_DATASTRUCTURES THREADING_USE_GRWL
#if THREADING_SAFE_DATASTRUCTURES
#define DS_DEFINE_MUTEX(name) pyston::threading::PthreadFastMutex name
#define DS_DECLARE_RWLOCK(name) extern pyston::threading::PthreadRWLock name
#define DS_DEFINE_RWLOCK(name) pyston::threading::PthreadRWLock name
#define DS_DEFINE_SPINLOCK(name) pyston::threading::PthreadSpinLock name
#else
#define DS_DEFINE_MUTEX(name) pyston::threading::NopLock name
#define DS_DECLARE_RWLOCK(name) extern pyston::threading::NopLock name
#define DS_DEFINE_RWLOCK(name) pyston::threading::NopLock name
#define DS_DEFINE_SPINLOCK(name) pyston::threading::NopLock name
#endif
void acquireGLRead();
void releaseGLRead();
void acquireGLWrite();
......@@ -102,6 +121,23 @@ inline void demoteGL() {
}
#endif
#if !THREADING_USE_GIL && !THREADING_USE_GRWL
inline void acquireGLRead() {
}
inline void releaseGLRead() {
}
inline void acquireGLWrite() {
}
inline void releaseGLWrite() {
}
inline void promoteGL() {
}
inline void demoteGL() {
}
inline void allowGLReadPreemption() {
}
#endif
} // namespace threading
} // namespace pyston
......
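The DS_* macros let data-structure locking compile away entirely: with THREADING_SAFE_DATASTRUCTURES == 0 (the GIL build) every lock declared through them is a NopLock, whose empty inline lock()/unlock() make LOCK_REGION a no-op. For example (the function body here is illustrative):

```
DS_DEFINE_RWLOCK(codegen_rwlock);  // PthreadRWLock under the GRWL, NopLock under the GIL

void mutateCodegenState() {
    // A real write lock in a GRWL build; compiles to nothing in a GIL build.
    LOCK_REGION(codegen_rwlock.asWrite());
    // ... touch shared codegen data structures ...
}
```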
......@@ -26,20 +26,28 @@
namespace pyston {
int Timer::level = 0;
Timer::Timer(const char* desc, int min_usec) : min_usec(min_usec), ended(true) {
Timer::Timer(const char* desc) : min_usec(-1), ended(true) {
restart(desc);
}
Timer::Timer(const char* desc, long min_usec) : min_usec(min_usec), ended(true) {
restart(desc);
}
void Timer::restart(const char* newdesc, int min_usec) {
void Timer::restart(const char* newdesc) {
assert(ended);
desc = newdesc;
this->min_usec = min_usec;
gettimeofday(&start_time, NULL);
Timer::level++;
ended = false;
}
void Timer::restart(const char* newdesc, long new_min_usec) {
this->min_usec = new_min_usec;
restart(newdesc);
}
long Timer::end() {
if (!ended) {
timeval end;
......
......@@ -28,18 +28,21 @@ private:
static int level;
timeval start_time;
const char* desc;
int min_usec;
long min_usec;
bool ended;
public:
Timer(const char* desc, int min_usec = -1);
Timer(const char* desc);
Timer(const char* desc, long min_usec);
~Timer();
void restart(const char* newdesc, int min_usec = -1);
void restart(const char* newdesc, long new_min_usec);
void restart(const char* newdesc);
long end();
long split(const char* newdesc, int min_usec = -1) {
long split(const char* newdesc) {
long rtn = end();
restart(newdesc, min_usec);
restart(newdesc);
return rtn;
}
};
......
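The Timer change splits the old default-argument constructor and restart() into explicit overloads, and widens min_usec from int to long to match end()'s microsecond return type. Call sites look the same as before; the GC collection timer from this commit is representative:

```
Timer _t("collecting", /*min_usec=*/10000);  // only reported if it took >= 10ms
markPhase();
sweepPhase();
long us = _t.end();  // elapsed microseconds, fed into the gc_collections_us counter
```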
......@@ -22,6 +22,7 @@
#include "core/common.h"
#include "core/threading.h"
#include "core/types.h"
#include "core/util.h"
#include "gc/heap.h"
#include "gc/root_finder.h"
......@@ -35,9 +36,6 @@
namespace pyston {
namespace gc {
// unsigned numAllocs = 0;
unsigned bytesAllocatedSinceCollection = 0;
static TraceStack roots;
void registerStaticRootObj(void* obj) {
assert(global_heap.getAllocationFromInteriorPointer(obj));
......@@ -159,17 +157,26 @@ void runCollection() {
static StatCounter sc("gc_collections");
sc.log();
threading::GLPromoteRegion _lock;
if (VERBOSITY("gc") >= 2)
printf("Collection #%d\n", ++ncollections);
threading::GLPromoteRegion _lock;
Timer _t("collecting", /*min_usec=*/10000);
// if (ncollections == 754) {
// raise(SIGTRAP);
//}
markPhase();
sweepPhase();
if (VERBOSITY("gc") >= 2)
printf("Collection #%d done\n", ++ncollections);
long us = _t.end();
static StatCounter sc_us("gc_collections_us");
sc_us.log(us);
}
} // namespace gc
......
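runCollection() now takes its GLPromoteRegion before doing anything else, including the verbosity print: this is the "global serialization for collections" from the commit message. Promotion drains all reader threads, so mark/sweep runs with the world stopped. A sketch of what such an RAII promote region plausibly looks like (illustrative; the real class is declared in core/threading.h):

```
// Hold the global lock in write (exclusive) mode for the region's lifetime,
// then drop back to shared read mode.
class GLPromoteRegion {
public:
    GLPromoteRegion() { threading::promoteGL(); }
    ~GLPromoteRegion() { threading::demoteGL(); }
};
```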
......@@ -34,7 +34,7 @@ namespace gc {
// extern unsigned numAllocs;
//#define ALLOCS_PER_COLLECTION 1000
extern unsigned bytesAllocatedSinceCollection;
unsigned bytesAllocatedSinceCollection;
#define ALLOCBYTES_PER_COLLECTION 2000000
void _collectIfNeeded(size_t bytes) {
......@@ -97,6 +97,8 @@ struct LargeObj {
void* Heap::allocLarge(size_t size) {
_collectIfNeeded(size);
LOCK_REGION(lock);
size_t total_size = size + sizeof(LargeObj);
total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
LargeObj* rtn = (LargeObj*)large_arena.doMmap(total_size);
......@@ -148,6 +150,8 @@ static Block* alloc_block(uint64_t size, Block** prev) {
void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) {
_collectIfNeeded(rounded_size);
LOCK_REGION(lock);
Block* cur = *prev;
assert(!cur || prev == cur->prev);
int scanned = 0;
......
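Note the ordering in both allocLarge() and allocSmall(): _collectIfNeeded() runs before the heap lock is taken. That ordering is presumably deliberate: a collection promotes to the global write lock and must wait for every other thread, and a thread spinning on the heap's spinlock never reaches a preemption point, so collecting while holding the allocation lock could deadlock. The resulting shape of the allocation path:

```
void* Heap::allocLarge(size_t size) {
    _collectIfNeeded(size);  // may trigger a stop-the-world collection;
                             // must not hold the heap lock here

    LOCK_REGION(lock);       // spinlock guarding the heap bookkeeping
    // ... carve out and return the allocation ...
}
```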
......@@ -18,6 +18,7 @@
#include <cstdint>
#include "core/common.h"
#include "core/threading.h"
namespace pyston {
namespace gc {
......@@ -82,6 +83,9 @@ private:
void* allocSmall(size_t rounded_size, Block** head, Block** full_head);
void* allocLarge(size_t bytes);
// DS_DEFINE_MUTEX(lock);
DS_DEFINE_SPINLOCK(lock);
public:
void* realloc(void* ptr, size_t bytes);
......
......@@ -110,6 +110,8 @@ extern "C" void listAppendInternal(Box* s, Box* v) {
extern "C" void listAppendArrayInternal(Box* s, Box** v, int nelts) {
// Lock must be held!
assert(s->cls == list_cls);
BoxedList* self = static_cast<BoxedList*>(s);
......@@ -127,6 +129,8 @@ extern "C" Box* listAppend(Box* s, Box* v) {
assert(s->cls == list_cls);
BoxedList* self = static_cast<BoxedList*>(s);
LOCK_REGION(self->lock.asWrite());
listAppendInternal(self, v);
return None;
......
......@@ -32,6 +32,8 @@
namespace pyston {
extern "C" Box* listRepr(BoxedList* self) {
LOCK_REGION(self->lock.asRead());
// TODO highly inefficient with all the string copying
std::ostringstream os;
os << '[';
......@@ -51,6 +53,8 @@ extern "C" Box* listNonzero(BoxedList* self) {
}
extern "C" Box* listPop(BoxedList* self, Box* idx) {
LOCK_REGION(self->lock.asWrite());
if (idx == None) {
if (self->size == 0) {
raiseExcHelper(IndexError, "pop from empty list");
......@@ -115,6 +119,8 @@ Box* _listSlice(BoxedList* self, i64 start, i64 stop, i64 step) {
}
extern "C" Box* listGetitemInt(BoxedList* self, BoxedInt* slice) {
LOCK_REGION(self->lock.asRead());
assert(self->cls == list_cls);
assert(slice->cls == int_cls);
int64_t n = slice->n;
......@@ -129,6 +135,8 @@ extern "C" Box* listGetitemInt(BoxedList* self, BoxedInt* slice) {
}
extern "C" Box* listGetitemSlice(BoxedList* self, BoxedSlice* slice) {
LOCK_REGION(self->lock.asRead());
assert(self->cls == list_cls);
assert(slice->cls == slice_cls);
i64 start, stop, step;
......@@ -149,6 +157,9 @@ extern "C" Box* listGetitem(BoxedList* self, Box* slice) {
extern "C" Box* listSetitemInt(BoxedList* self, BoxedInt* slice, Box* v) {
// I think r lock is ok here, since we don't change the list structure:
LOCK_REGION(self->lock.asRead());
assert(self->cls == list_cls);
assert(slice->cls == int_cls);
int64_t n = slice->n;
......@@ -164,6 +175,8 @@ extern "C" Box* listSetitemInt(BoxedList* self, BoxedInt* slice, Box* v) {
}
extern "C" Box* listSetitemSlice(BoxedList* self, BoxedSlice* slice, Box* v) {
LOCK_REGION(self->lock.asWrite());
assert(self->cls == list_cls);
assert(slice->cls == slice_cls);
i64 start, stop, step;
......@@ -204,6 +217,8 @@ extern "C" Box* listSetitem(BoxedList* self, Box* slice, Box* v) {
}
extern "C" Box* listDelitemInt(BoxedList* self, BoxedInt* slice) {
LOCK_REGION(self->lock.asWrite());
int64_t n = slice->n;
if (n < 0)
n = self->size + n;
......@@ -217,6 +232,8 @@ extern "C" Box* listDelitemInt(BoxedList* self, BoxedInt* slice) {
}
extern "C" Box* listDelitemSlice(BoxedList* self, BoxedSlice* slice) {
LOCK_REGION(self->lock.asWrite());
i64 start, stop, step;
parseSlice(slice, self->size, &start, &stop, &step);
RELEASE_ASSERT(step == 1, "step sizes must be 1 for now");
......@@ -233,6 +250,8 @@ extern "C" Box* listDelitemSlice(BoxedList* self, BoxedSlice* slice) {
}
extern "C" Box* listDelitem(BoxedList* self, Box* slice) {
LOCK_REGION(self->lock.asWrite());
Box* rtn;
if (slice->cls == int_cls) {
......@@ -251,6 +270,8 @@ extern "C" Box* listInsert(BoxedList* self, Box* idx, Box* v) {
raiseExcHelper(TypeError, "an integer is required");
}
LOCK_REGION(self->lock.asWrite());
int64_t n = static_cast<BoxedInt*>(idx)->n;
if (n < 0)
n = self->size + n;
......@@ -277,6 +298,8 @@ Box* listMul(BoxedList* self, Box* rhs) {
raiseExcHelper(TypeError, "can't multiply sequence by non-int of type '%s'", getTypeName(rhs)->c_str());
}
LOCK_REGION(self->lock.asRead());
int n = static_cast<BoxedInt*>(rhs)->n;
int s = self->size;
......@@ -300,6 +323,8 @@ Box* listIAdd(BoxedList* self, Box* _rhs) {
raiseExcHelper(TypeError, "can only concatenate list (not \"%s\") to list", getTypeName(_rhs)->c_str());
}
LOCK_REGION(self->lock.asWrite());
BoxedList* rhs = static_cast<BoxedList*>(_rhs);
int s1 = self->size;
......@@ -316,6 +341,8 @@ Box* listAdd(BoxedList* self, Box* _rhs) {
raiseExcHelper(TypeError, "can only concatenate list (not \"%s\") to list", getTypeName(_rhs)->c_str());
}
LOCK_REGION(self->lock.asRead());
BoxedList* rhs = static_cast<BoxedList*>(_rhs);
BoxedList* rtn = new BoxedList();
......@@ -331,6 +358,8 @@ Box* listAdd(BoxedList* self, Box* _rhs) {
}
Box* listSort1(BoxedList* self) {
LOCK_REGION(self->lock.asWrite());
assert(self->cls == list_cls);
std::sort<Box**, PyLt>(self->elts->elts, self->elts->elts + self->size, PyLt());
......@@ -339,6 +368,8 @@ Box* listSort1(BoxedList* self) {
}
Box* listContains(BoxedList* self, Box* elt) {
LOCK_REGION(self->lock.asRead());
int size = self->size;
for (int i = 0; i < size; i++) {
Box* e = self->elts->elts[i];
......@@ -351,6 +382,8 @@ Box* listContains(BoxedList* self, Box* elt) {
}
Box* listCount(BoxedList* self, Box* elt) {
LOCK_REGION(self->lock.asRead());
int size = self->size;
int count = 0;
......@@ -365,6 +398,8 @@ Box* listCount(BoxedList* self, Box* elt) {
}
Box* listIndex(BoxedList* self, Box* elt) {
LOCK_REGION(self->lock.asRead());
int size = self->size;
for (int i = 0; i < size; i++) {
......@@ -380,6 +415,8 @@ Box* listIndex(BoxedList* self, Box* elt) {
}
Box* listRemove(BoxedList* self, Box* elt) {
LOCK_REGION(self->lock.asWrite());
assert(self->cls == list_cls);
for (int i = 0; i < self->size; i++) {
......@@ -398,6 +435,8 @@ Box* listRemove(BoxedList* self, Box* elt) {
}
Box* listReverse(BoxedList* self) {
LOCK_REGION(self->lock.asWrite());
assert(self->cls == list_cls);
for (int i = 0, j = self->size - 1; i < j; i++, j--) {
Box* e = self->elts->elts[i];
......@@ -475,6 +514,9 @@ Box* listEq(BoxedList* self, Box* rhs) {
if (rhs->cls != list_cls) {
return NotImplemented;
}
LOCK_REGION(self->lock.asRead());
return _listCmp(self, static_cast<BoxedList*>(rhs), AST_TYPE::Eq);
}
......
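The discipline across these list hooks: structural mutations (pop, setitem-slice, delitem, insert, iadd, sort, remove, reverse) take self->lock.asWrite(), pure readers (repr, getitem, contains, count, index, add, mul, eq) take asRead(), and listSetitemInt gets a read lock because it overwrites one element in place without changing size or the elts array. Since BoxedList's lock is DS_DEFINE_MUTEX (a plain PthreadFastMutex whose asRead() and asWrite() both return the same exclusive lock), the read/write distinction currently documents intent rather than enabling parallelism. The pattern in sketch form, with the bodies elided:

```
extern "C" Box* listGetitemInt(BoxedList* self, BoxedInt* slice) {
    LOCK_REGION(self->lock.asRead());   // shared today == exclusive; records intent
    // ... bounds-check slice->n and return self->elts->elts[n] ...
}

extern "C" Box* listPop(BoxedList* self, Box* idx) {
    LOCK_REGION(self->lock.asWrite());  // structural change: must be exclusive
    // ... remove the element and shrink self->size ...
}
```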
......@@ -1577,6 +1577,60 @@ static inline Box*& getArg(int idx, Box*& arg1, Box*& arg2, Box*& arg3, Box** ar
return args[idx - 3];
}
static CompiledFunction* pickVersion(CLFunction* f, int num_output_args, Box* oarg1, Box* oarg2, Box* oarg3,
Box** oargs) {
LOCK_REGION(codegen_rwlock.asWrite());
CompiledFunction* chosen_cf = NULL;
for (CompiledFunction* cf : f->versions) {
assert(cf->spec->arg_types.size() == num_output_args);
if (cf->spec->rtn_type->llvmType() != UNKNOWN->llvmType())
continue;
bool works = true;
for (int i = 0; i < num_output_args; i++) {
Box* arg = getArg(i, oarg1, oarg2, oarg3, oargs);
ConcreteCompilerType* t = cf->spec->arg_types[i];
if ((arg && !t->isFitBy(arg->cls)) || (!arg && t != UNKNOWN)) {
works = false;
break;
}
}
if (!works)
continue;
chosen_cf = cf;
break;
}
if (chosen_cf == NULL) {
if (f->source == NULL) {
// TODO I don't think this should be happening any more?
printf("Error: couldn't find suitable function version and no source to recompile!\n");
abort();
}
std::vector<ConcreteCompilerType*> arg_types;
for (int i = 0; i < num_output_args; i++) {
Box* arg = getArg(i, oarg1, oarg2, oarg3, oargs);
assert(arg); // only builtin functions can pass NULL args
arg_types.push_back(typeFromClass(arg->cls));
}
FunctionSpecialization* spec = new FunctionSpecialization(UNKNOWN, arg_types);
EffortLevel::EffortLevel new_effort = initialEffort();
// this also pushes the new CompiledVersion to the back of the version list:
chosen_cf = compileFunction(f, spec, new_effort, NULL);
}
return chosen_cf;
}
Box* callFunc(BoxedFunction* func, CallRewriteArgs* rewrite_args, ArgPassSpec argspec, Box* arg1, Box* arg2, Box* arg3,
Box** args, const std::vector<const std::string*>* keyword_names) {
/*
......@@ -1835,54 +1889,7 @@ Box* callFunc(BoxedFunction* func, CallRewriteArgs* rewrite_args, ArgPassSpec ar
// Pick a specific version to use:
CompiledFunction* chosen_cf = NULL;
for (CompiledFunction* cf : f->versions) {
assert(cf->spec->arg_types.size() == num_output_args);
if (cf->spec->rtn_type->llvmType() != UNKNOWN->llvmType())
continue;
bool works = true;
for (int i = 0; i < num_output_args; i++) {
Box* arg = getArg(i, oarg1, oarg2, oarg3, oargs);
ConcreteCompilerType* t = cf->spec->arg_types[i];
if ((arg && !t->isFitBy(arg->cls)) || (!arg && t != UNKNOWN)) {
works = false;
break;
}
}
if (!works)
continue;
chosen_cf = cf;
break;
}
if (chosen_cf == NULL) {
if (f->source == NULL) {
// TODO I don't think this should be happening any more?
printf("Error: couldn't find suitable function version and no source to recompile!\n");
abort();
}
std::vector<ConcreteCompilerType*> arg_types;
for (int i = 0; i < num_output_args; i++) {
Box* arg = getArg(i, oarg1, oarg2, oarg3, oargs);
assert(arg); // only builtin functions can pass NULL args
arg_types.push_back(typeFromClass(arg->cls));
}
FunctionSpecialization* spec = new FunctionSpecialization(UNKNOWN, arg_types);
EffortLevel::EffortLevel new_effort = initialEffort();
// this also pushes the new CompiledVersion to the back of the version list:
chosen_cf = compileFunction(f, spec, new_effort, NULL);
}
CompiledFunction* chosen_cf = pickVersion(f, num_output_args, oarg1, oarg2, oarg3, oargs);
assert(chosen_cf->is_interpreted == (chosen_cf->code == NULL));
if (chosen_cf->is_interpreted) {
......
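pickVersion() is a near-verbatim extraction of the ~50 lines deleted from callFunc() below it, with one addition: it takes the codegen write lock, making the scan of f->versions atomic with any concurrent compilation that appends to the same list. The selection predicate it loops over, restated on its own (an illustrative restructuring, not code from the commit):

```
// Can this compiled version accept the given arguments?
static bool versionFits(CompiledFunction* cf, int num_args, Box** args) {
    // Only versions with a boxed (UNKNOWN) return type are callable here:
    if (cf->spec->rtn_type->llvmType() != UNKNOWN->llvmType())
        return false;

    for (int i = 0; i < num_args; i++) {
        ConcreteCompilerType* t = cf->spec->arg_types[i];
        // A NULL arg only matches a fully-unknown slot; otherwise the
        // argument's class must fit the specialized type.
        if ((args[i] && !t->isFitBy(args[i]->cls)) || (!args[i] && t != UNKNOWN))
            return false;
    }
    return true;
}
```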
......@@ -15,6 +15,7 @@
#ifndef PYSTON_RUNTIME_TYPES_H
#define PYSTON_RUNTIME_TYPES_H
#include "core/threading.h"
#include "core/types.h"
namespace pyston {
......@@ -218,6 +219,8 @@ public:
int64_t size, capacity;
GCdArray* elts;
DS_DEFINE_MUTEX(lock);
BoxedList() __attribute__((visibility("default"))) : Box(&list_flavor, list_cls), size(0), capacity(0) {}
void ensure(int space);
......
from thread import start_new_thread
import time

work = []
done = []
......@@ -18,7 +19,7 @@ for i in xrange(nthreads):
    t = start_new_thread(run, (N,))

while len(done) < nthreads:
    pass
    time.sleep(0)
# print work
assert sum(work) == 0
from thread import start_new_thread
import time

a = 0
b = 0
done = []

def set_thread(num):
    global a, b
    print "starting set_thread", num
    for i in xrange(num):
        a += 1
        b += 1
        if i % 10000 == 0:
            print i
    done.append(None)

def check_thread(num):
    while b < num:
        _b = b
        _a = a
        assert _a >= _b, (_a, _b)
    done.append(None)

print "starting!"
N = 100000
start_new_thread(check_thread, (N,))
start_new_thread(set_thread, (N,))

while len(done) < 2:
    time.sleep(0)