Commit 8a3a1792 authored by Kevin Modzelewski's avatar Kevin Modzelewski

Starting to add per-thread caches to the allocator

Trying to add a generic PerThread class has involved
a journey into the wonderful world of template programming,
including C++11 variadic templates.
parent 2afd421b
...@@ -211,20 +211,15 @@ static std::unordered_map<void*, llvm::Instruction*> cur_instruction_map; ...@@ -211,20 +211,15 @@ static std::unordered_map<void*, llvm::Instruction*> cur_instruction_map;
typedef std::vector<const SymMap*> root_stack_t; typedef std::vector<const SymMap*> root_stack_t;
threading::PerThreadSet<root_stack_t> root_stack_set; threading::PerThreadSet<root_stack_t> root_stack_set;
threading::PerThread<root_stack_t> thread_local root_stack(&root_stack_set);
// Walk every thread's interpreter root stack and report each live symbol's
// Box to the GC visitor as a potential root.
// NOTE(review): forEachValue locks the PerThreadSet internally, so no
// external lock region is needed here.
void gatherInterpreterRoots(GCVisitor* visitor) {
    // Renamed the lambda parameters so they no longer shadow the enclosing
    // function's `visitor` parameter.
    root_stack_set.forEachValue(std::function<void(root_stack_t*, GCVisitor*)>(
                                    [](root_stack_t* stack, GCVisitor* v) {
                                        for (const SymMap* sym_map : *stack) {
                                            for (const auto& entry : *sym_map) {
                                                v->visitPotential(entry.second.o);
                                            }
                                        }
                                    }),
                                visitor);
}
class UnregisterHelper { class UnregisterHelper {
...@@ -235,7 +230,7 @@ public: ...@@ -235,7 +230,7 @@ public:
constexpr UnregisterHelper(void* frame_ptr) : frame_ptr(frame_ptr) {} constexpr UnregisterHelper(void* frame_ptr) : frame_ptr(frame_ptr) {}
~UnregisterHelper() { ~UnregisterHelper() {
root_stack.value.pop_back(); root_stack_set.get()->pop_back();
assert(cur_instruction_map.count(frame_ptr)); assert(cur_instruction_map.count(frame_ptr));
cur_instruction_map.erase(frame_ptr); cur_instruction_map.erase(frame_ptr);
...@@ -280,7 +275,7 @@ Box* interpretFunction(llvm::Function* f, int nargs, Box* arg1, Box* arg2, Box* ...@@ -280,7 +275,7 @@ Box* interpretFunction(llvm::Function* f, int nargs, Box* arg1, Box* arg2, Box*
SymMap symbols; SymMap symbols;
void* frame_ptr = __builtin_frame_address(0); void* frame_ptr = __builtin_frame_address(0);
root_stack.value.push_back(&symbols); root_stack_set.get()->push_back(&symbols);
UnregisterHelper helper(frame_ptr); UnregisterHelper helper(frame_ptr);
int arg_num = -1; int arg_num = -1;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#ifndef PYSTON_CORE_THREADUTILS_H #ifndef PYSTON_CORE_THREADUTILS_H
#define PYSTON_CORE_THREADUTILS_H #define PYSTON_CORE_THREADUTILS_H
#include <functional>
#include <pthread.h> #include <pthread.h>
#include <unordered_map> #include <unordered_map>
...@@ -120,31 +121,89 @@ public: ...@@ -120,31 +121,89 @@ public:
}; };
namespace impl {
// Compile-time integer-sequence helpers, used to unpack the stored ctor-arg
// tuple into a constructor call (pre-C++14 there is no std::index_sequence).
// From http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer
template <int...> struct seq {};

template <int N, int... S> struct gens : gens<N - 1, N - 1, S...> {};

template <int... S> struct gens<0, S...> { typedef seq<S...> type; };
}

// A set of lazily-created per-thread T values. Each thread gets its own T,
// constructed on first get() from the CtorArgs captured at set construction.
// Values are registered in `map` (guarded by `lock`) so forEachValue() can
// visit all threads' values; a pthread TLS destructor removes a thread's
// entry when that thread exits.
template <typename T, typename... CtorArgs> class PerThreadSet {
private:
    pthread_key_t pthread_key;
    PthreadFastMutex lock;

    struct Storage {
        PerThreadSet<T, CtorArgs...>* self; // back-pointer so the TLS dtor can find the owning set
        T val;
    };

    std::unordered_map<pthread_t, Storage*> map;
    std::tuple<CtorArgs...> ctor_args; // forwarded to T's ctor for each new thread

    // pthread TLS destructor; the pthreads runtime invokes this on thread
    // exit, on the exiting thread itself.
    static void dtor(void* val) {
        Storage* s = static_cast<Storage*>(val);
        assert(s);

        auto* self = s->self;

        LOCK_REGION(&self->lock);

        // I assume this destructor gets called on the same thread
        // that this data is bound to:
        pthread_t this_thread = pthread_self();
        assert(self->map.count(this_thread));
        self->map.erase(this_thread);
        delete s;
    }

    // Expand the stored ctor_args tuple into T's constructor.
    template <int... S> Storage* make(impl::seq<S...>) {
        // Plain aggregate initialization instead of the non-standard
        // C99-style designated initializers ({.self=..., .val=...}).
        return new Storage{ this, T(std::get<S>(ctor_args)...) };
    }

public:
    PerThreadSet(CtorArgs... ctor_args) : ctor_args(std::forward<CtorArgs>(ctor_args)...) {
        // Check the result instead of silently dropping it (was an unused
        // variable); key creation can fail with EAGAIN/ENOMEM.
        int code = pthread_key_create(&pthread_key, &dtor);
        assert(code == 0);
        (void)code; // silence unused-variable warning in NDEBUG builds
    }

    // Call f on every thread's value. Holds the set lock for the duration.
    void forEachValue(std::function<void(T*)> f) {
        LOCK_REGION(&lock);

        for (auto& p : map) {
            f(&p.second->val);
        }
    }

    // Overload that forwards extra arguments to f along with each value.
    template <typename... Arguments>
    void forEachValue(std::function<void(T*, Arguments...)> f, Arguments... args) {
        LOCK_REGION(&lock);

        for (auto& p : map) {
            f(&p.second->val, std::forward<Arguments>(args)...);
        }
    }

    // Return this thread's value, constructing it on first use.
    T* get() {
        // Is there even much benefit to using pthread_getspecific here, as opposed to looking
        // it up in the map? I suppose it avoids locking.
        Storage* s = static_cast<Storage*>(pthread_getspecific(pthread_key));
        if (!s) {
            s = make(typename impl::gens<sizeof...(CtorArgs)>::type());

            LOCK_REGION(&lock);
            int code = pthread_setspecific(pthread_key, s);
            assert(code == 0);
            (void)code;

            map[pthread_self()] = s;
        }
        return &s->val;
    }
};
......
...@@ -147,9 +147,21 @@ static Block* alloc_block(uint64_t size, Block** prev) { ...@@ -147,9 +147,21 @@ static Block* alloc_block(uint64_t size, Block** prev) {
return rtn; return rtn;
} }
// Runs (via PerThreadSet's TLS destructor) when the owning thread exits.
// NOTE(review): returning cached blocks to the heap's global lists is not
// implemented yet -- any non-empty bucket trips the assert below. Confirm
// this is filled in before the per-thread caches are actually populated.
Heap::ThreadBlockCache::~ThreadBlockCache() {
LOCK_REGION(heap->lock);
// The cache must be completely empty at thread exit.
for (int i = 0; i < NUM_BUCKETS; i++) {
if (cache_heads[i] == NULL)
continue;
assert(0);
}
}
void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) { void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) {
_collectIfNeeded(rounded_size); _collectIfNeeded(rounded_size);
ThreadBlockCache* cache = thread_caches.get();
LOCK_REGION(lock); LOCK_REGION(lock);
Block* cur = *prev; Block* cur = *prev;
......
...@@ -86,7 +86,23 @@ private: ...@@ -86,7 +86,23 @@ private:
// DS_DEFINE_MUTEX(lock); // DS_DEFINE_MUTEX(lock);
DS_DEFINE_SPINLOCK(lock); DS_DEFINE_SPINLOCK(lock);
// Per-thread cache of allocation blocks, one list head per size bucket.
// Instances are created lazily per thread by the PerThreadSet declared below
// (thread_caches), with the owning Heap passed as the constructor argument.
struct ThreadBlockCache {
Heap* heap; // back-pointer so the destructor can lock the owning heap
Block* cache_heads[NUM_BUCKETS]; // one cached-block list per bucket; NULL when empty
ThreadBlockCache(Heap* heap) : heap(heap) {
memset(cache_heads, 0, sizeof(cache_heads));
}
// Defined out of line; asserts the cache is empty at thread exit.
~ThreadBlockCache();
};
friend class ThreadBlockCache;
// TODO only use thread caches if we're in GRWL mode?
threading::PerThreadSet<ThreadBlockCache, Heap*> thread_caches;
public: public:
// Each thread's ThreadBlockCache is constructed with a pointer back to
// this heap, so its destructor can reach the heap's lock and free lists.
Heap() : thread_caches(this) {
}
void* realloc(void* ptr, size_t bytes); void* realloc(void* ptr, size_t bytes);
void* alloc(size_t bytes) { void* alloc(size_t bytes) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment