Commit 8a3a1792 authored by Kevin Modzelewski's avatar Kevin Modzelewski

Starting to add per-thread caches to the allocator

Trying to add a generic PerThread class has involved
a journey into the wonderful world of template programming,
including C++11 variadic templates.
parent 2afd421b
...@@ -211,20 +211,15 @@ static std::unordered_map<void*, llvm::Instruction*> cur_instruction_map; ...@@ -211,20 +211,15 @@ static std::unordered_map<void*, llvm::Instruction*> cur_instruction_map;
typedef std::vector<const SymMap*> root_stack_t; typedef std::vector<const SymMap*> root_stack_t;
threading::PerThreadSet<root_stack_t> root_stack_set; threading::PerThreadSet<root_stack_t> root_stack_set;
threading::PerThread<root_stack_t> thread_local root_stack(&root_stack_set);
// Walk every thread's interpreter root stack and report each live symbol's
// Box to the GC visitor as a potential root.
// NOTE(review): forEachValue locks the PerThreadSet internally, so no
// external lock region is needed here.
void gatherInterpreterRoots(GCVisitor* visitor) {
    // Renamed the lambda parameters so they no longer shadow the enclosing
    // function's `visitor` parameter.
    root_stack_set.forEachValue(std::function<void(root_stack_t*, GCVisitor*)>(
                                    [](root_stack_t* stack, GCVisitor* v) {
                                        for (const SymMap* sym_map : *stack) {
                                            for (const auto& entry : *sym_map) {
                                                v->visitPotential(entry.second.o);
                                            }
                                        }
                                    }),
                                visitor);
}
class UnregisterHelper { class UnregisterHelper {
...@@ -235,7 +230,7 @@ public: ...@@ -235,7 +230,7 @@ public:
constexpr UnregisterHelper(void* frame_ptr) : frame_ptr(frame_ptr) {} constexpr UnregisterHelper(void* frame_ptr) : frame_ptr(frame_ptr) {}
~UnregisterHelper() { ~UnregisterHelper() {
root_stack.value.pop_back(); root_stack_set.get()->pop_back();
assert(cur_instruction_map.count(frame_ptr)); assert(cur_instruction_map.count(frame_ptr));
cur_instruction_map.erase(frame_ptr); cur_instruction_map.erase(frame_ptr);
...@@ -280,7 +275,7 @@ Box* interpretFunction(llvm::Function* f, int nargs, Box* arg1, Box* arg2, Box* ...@@ -280,7 +275,7 @@ Box* interpretFunction(llvm::Function* f, int nargs, Box* arg1, Box* arg2, Box*
SymMap symbols; SymMap symbols;
void* frame_ptr = __builtin_frame_address(0); void* frame_ptr = __builtin_frame_address(0);
root_stack.value.push_back(&symbols); root_stack_set.get()->push_back(&symbols);
UnregisterHelper helper(frame_ptr); UnregisterHelper helper(frame_ptr);
int arg_num = -1; int arg_num = -1;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#ifndef PYSTON_CORE_THREADUTILS_H #ifndef PYSTON_CORE_THREADUTILS_H
#define PYSTON_CORE_THREADUTILS_H #define PYSTON_CORE_THREADUTILS_H
#include <functional>
#include <pthread.h> #include <pthread.h>
#include <unordered_map> #include <unordered_map>
...@@ -120,31 +121,89 @@ public: ...@@ -120,31 +121,89 @@ public:
}; };
namespace impl {
// Compile-time integer-sequence helpers, used to unpack the stored ctor-arg
// tuple into a constructor call (pre-C++14 there is no std::index_sequence).
// From http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer
template <int...> struct seq {};

template <int N, int... S> struct gens : gens<N - 1, N - 1, S...> {};

template <int... S> struct gens<0, S...> { typedef seq<S...> type; };
}

// A set of lazily-created per-thread T values. Each thread gets its own T,
// constructed on first get() from the CtorArgs captured at set construction.
// Values are registered in `map` (guarded by `lock`) so forEachValue() can
// visit all threads' values; a pthread TLS destructor removes a thread's
// entry when that thread exits.
template <typename T, typename... CtorArgs> class PerThreadSet {
private:
    pthread_key_t pthread_key;
    PthreadFastMutex lock;

    struct Storage {
        PerThreadSet<T, CtorArgs...>* self; // back-pointer so the TLS dtor can find the owning set
        T val;
    };

    std::unordered_map<pthread_t, Storage*> map;
    std::tuple<CtorArgs...> ctor_args; // forwarded to T's ctor for each new thread

    // pthread TLS destructor; the pthreads runtime invokes this on thread
    // exit, on the exiting thread itself.
    static void dtor(void* val) {
        Storage* s = static_cast<Storage*>(val);
        assert(s);

        auto* self = s->self;

        LOCK_REGION(&self->lock);

        // I assume this destructor gets called on the same thread
        // that this data is bound to:
        pthread_t this_thread = pthread_self();
        assert(self->map.count(this_thread));
        self->map.erase(this_thread);
        delete s;
    }

    // Expand the stored ctor_args tuple into T's constructor.
    template <int... S> Storage* make(impl::seq<S...>) {
        // Plain aggregate initialization instead of the non-standard
        // C99-style designated initializers ({.self=..., .val=...}).
        return new Storage{ this, T(std::get<S>(ctor_args)...) };
    }

public:
    PerThreadSet(CtorArgs... ctor_args) : ctor_args(std::forward<CtorArgs>(ctor_args)...) {
        // Check the result instead of silently dropping it (was an unused
        // variable); key creation can fail with EAGAIN/ENOMEM.
        int code = pthread_key_create(&pthread_key, &dtor);
        assert(code == 0);
        (void)code; // silence unused-variable warning in NDEBUG builds
    }

    // Call f on every thread's value. Holds the set lock for the duration.
    void forEachValue(std::function<void(T*)> f) {
        LOCK_REGION(&lock);

        for (auto& p : map) {
            f(&p.second->val);
        }
    }

    // Overload that forwards extra arguments to f along with each value.
    template <typename... Arguments>
    void forEachValue(std::function<void(T*, Arguments...)> f, Arguments... args) {
        LOCK_REGION(&lock);

        for (auto& p : map) {
            f(&p.second->val, std::forward<Arguments>(args)...);
        }
    }

    // Return this thread's value, constructing it on first use.
    T* get() {
        // Is there even much benefit to using pthread_getspecific here, as opposed to looking
        // it up in the map? I suppose it avoids locking.
        Storage* s = static_cast<Storage*>(pthread_getspecific(pthread_key));
        if (!s) {
            s = make(typename impl::gens<sizeof...(CtorArgs)>::type());

            LOCK_REGION(&lock);
            int code = pthread_setspecific(pthread_key, s);
            assert(code == 0);
            (void)code;

            map[pthread_self()] = s;
        }
        return &s->val;
    }
};
......
...@@ -147,9 +147,21 @@ static Block* alloc_block(uint64_t size, Block** prev) { ...@@ -147,9 +147,21 @@ static Block* alloc_block(uint64_t size, Block** prev) {
return rtn; return rtn;
} }
// Runs (via PerThreadSet's TLS destructor) when the owning thread exits.
// NOTE(review): returning cached blocks to the heap's global lists is not
// implemented yet -- any non-empty bucket trips the assert below. Confirm
// this is filled in before the per-thread caches are actually populated.
Heap::ThreadBlockCache::~ThreadBlockCache() {
LOCK_REGION(heap->lock);
// The cache must be completely empty at thread exit.
for (int i = 0; i < NUM_BUCKETS; i++) {
if (cache_heads[i] == NULL)
continue;
assert(0);
}
}
void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) { void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) {
_collectIfNeeded(rounded_size); _collectIfNeeded(rounded_size);
ThreadBlockCache* cache = thread_caches.get();
LOCK_REGION(lock); LOCK_REGION(lock);
Block* cur = *prev; Block* cur = *prev;
......
...@@ -86,7 +86,23 @@ private: ...@@ -86,7 +86,23 @@ private:
// DS_DEFINE_MUTEX(lock); // DS_DEFINE_MUTEX(lock);
DS_DEFINE_SPINLOCK(lock); DS_DEFINE_SPINLOCK(lock);
// Per-thread cache of allocation blocks, one list head per size bucket.
// Instances are created lazily per thread by the PerThreadSet declared below
// (thread_caches), with the owning Heap passed as the constructor argument.
struct ThreadBlockCache {
Heap* heap; // back-pointer so the destructor can lock the owning heap
Block* cache_heads[NUM_BUCKETS]; // one cached-block list per bucket; NULL when empty
ThreadBlockCache(Heap* heap) : heap(heap) {
memset(cache_heads, 0, sizeof(cache_heads));
}
// Defined out of line; asserts the cache is empty at thread exit.
~ThreadBlockCache();
};
friend class ThreadBlockCache;
// TODO only use thread caches if we're in GRWL mode?
threading::PerThreadSet<ThreadBlockCache, Heap*> thread_caches;
public: public:
// Each thread's ThreadBlockCache is constructed with a pointer back to
// this heap, so its destructor can reach the heap's lock and free lists.
Heap() : thread_caches(this) {
}
void* realloc(void* ptr, size_t bytes); void* realloc(void* ptr, size_t bytes);
void* alloc(size_t bytes) { void* alloc(size_t bytes) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment