Commit 8a3a1792 authored by Kevin Modzelewski

Starting to add per-thread caches to the allocator

Trying to add a generic PerThread class has involved
a journey into the wonderful world of template programming,
including C++11 variadic templates.
parent 2afd421b
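
The "C++11 variadic templates" the message refers to show up in the threadutils.h hunk below: PerThreadSet stores its constructor arguments in a std::tuple and later re-expands them into a call to T's constructor, using the hand-rolled index-sequence (seq/gens) trick that predates std::index_sequence. A minimal standalone sketch of that pattern, separate from the commit itself (LazyFactory and Widget are made-up names):

// Minimal sketch of the index-sequence / tuple-unpacking trick used below.
// LazyFactory and Widget are hypothetical names, not part of the commit.
#include <iostream>
#include <string>
#include <tuple>
#include <utility>

// Pre-C++14 index-sequence machinery, the same shape as impl::seq / impl::gens below.
template <int...> struct seq {};
template <int N, int... S> struct gens : gens<N - 1, N - 1, S...> {};
template <int... S> struct gens<0, S...> { typedef seq<S...> type; };

// Captures constructor arguments once, then builds fresh T's on demand;
// this is the same thing PerThreadSet does lazily, once per thread.
template <typename T, typename... CtorArgs> class LazyFactory {
private:
    std::tuple<CtorArgs...> ctor_args;

    template <int... S> T make(seq<S...>) {
        // Expand the stored tuple back into a constructor call:
        return T(std::get<S>(ctor_args)...);
    }

public:
    LazyFactory(CtorArgs... args) : ctor_args(std::forward<CtorArgs>(args)...) {}

    T make() { return make(typename gens<sizeof...(CtorArgs)>::type()); }
};

struct Widget {
    std::string name;
    int size;
    Widget(std::string name, int size) : name(name), size(size) {}
};

int main() {
    LazyFactory<Widget, std::string, int> factory("per-thread widget", 42);
    Widget w = factory.make();
    std::cout << w.name << " " << w.size << "\n";
    return 0;
}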
@@ -211,20 +211,15 @@ static std::unordered_map<void*, llvm::Instruction*> cur_instruction_map;
 typedef std::vector<const SymMap*> root_stack_t;
 threading::PerThreadSet<root_stack_t> root_stack_set;
-threading::PerThread<root_stack_t> thread_local root_stack(&root_stack_set);
 void gatherInterpreterRoots(GCVisitor* visitor) {
-    // In theory this lock should be superfluous since we should only call this
-    // inside a sequential section, but lock it anyway:
-    LOCK_REGION(&root_stack_set.lock);
-    for (auto& p : root_stack_set.map) {
-        for (const SymMap* sym_map : *p.second) {
+    root_stack_set.forEachValue(std::function<void(root_stack_t*, GCVisitor*)>([](root_stack_t* v, GCVisitor* visitor) {
+        for (const SymMap* sym_map : *v) {
             for (const auto& p2 : *sym_map) {
                 visitor->visitPotential(p2.second.o);
             }
         }
-    }
+    }), visitor);
 }
 class UnregisterHelper {
@@ -235,7 +230,7 @@ public:
     constexpr UnregisterHelper(void* frame_ptr) : frame_ptr(frame_ptr) {}
     ~UnregisterHelper() {
-        root_stack.value.pop_back();
+        root_stack_set.get()->pop_back();
         assert(cur_instruction_map.count(frame_ptr));
         cur_instruction_map.erase(frame_ptr);
@@ -280,7 +275,7 @@ Box* interpretFunction(llvm::Function* f, int nargs, Box* arg1, Box* arg2, Box*
     SymMap symbols;
     void* frame_ptr = __builtin_frame_address(0);
-    root_stack.value.push_back(&symbols);
+    root_stack_set.get()->push_back(&symbols);
     UnregisterHelper helper(frame_ptr);
     int arg_num = -1;
@@ -15,6 +15,7 @@
 #ifndef PYSTON_CORE_THREADUTILS_H
 #define PYSTON_CORE_THREADUTILS_H
+#include <functional>
 #include <pthread.h>
 #include <unordered_map>
@@ -120,31 +121,89 @@ public:
 };
-template <typename T> class PerThreadSet {
-public:
-    PthreadFastMutex lock;
-    std::unordered_map<pthread_t, T*> map;
+namespace impl {
+// From http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer
+template<int ...>
+struct seq { };
+template<int N, int ...S>
+struct gens : gens<N-1, N-1, S...> { };
+template<int ...S>
+struct gens<0, S...> {
+    typedef seq<S...> type;
+};
+}
-template <typename T> class PerThread {
+template <typename T, typename... CtorArgs> class PerThreadSet {
 private:
-    PerThreadSet<T>* set;
-    pthread_t self;
+    pthread_key_t pthread_key;
+    PthreadFastMutex lock;
+    struct Storage {
+        PerThreadSet<T, CtorArgs...> *self;
+        T val;
+    };
+    std::unordered_map<pthread_t, Storage*> map;
+    std::tuple<CtorArgs...> ctor_args;
+    static void dtor(void* val) {
+        Storage* s = static_cast<Storage*>(val);
+        assert(s);
+        auto* self = s->self;
+        LOCK_REGION(&self->lock);
+        // I assume this destructor gets called on the same thread
+        // that this data is bound to:
+        assert(self->map.count(pthread_self()));
+        self->map.erase(pthread_self());
+        delete s;
+    }
+    template <int ...S>
+    Storage* make(impl::seq<S...>) {
+        return new Storage {.self=this, .val=T(std::get<S>(ctor_args)...) };
+    }
 public:
-    T value;
+    PerThreadSet(CtorArgs... ctor_args) : ctor_args(std::forward<CtorArgs>(ctor_args)...) {
+        int code = pthread_key_create(&pthread_key, &dtor);
+    }
+    void forEachValue(std::function<void(T*)> f) {
+        LOCK_REGION(&lock);
-    PerThread(PerThreadSet<T>* set) : set(set), self(pthread_self()) {
-        LOCK_REGION(&set->lock);
+        for (auto& p : map) {
+            f(&p.second->val);
+        }
+    }
+    template <typename... Arguments>
+    void forEachValue(std::function<void(T*, Arguments...)> f, Arguments... args) {
+        LOCK_REGION(&lock);
-        set->map[self] = &value;
+        for (auto& p : map) {
+            f(&p.second->val, std::forward<Arguments>(args)...);
+        }
+    }
+    T* get() {
+        // Is there even much benefit to using pthread_getspecific here, as opposed to looking
+        // it up in the map? I suppose it avoids locking
+        Storage* s = static_cast<Storage*>(pthread_getspecific(pthread_key));
+        if (!s) {
+            s = make(typename impl::gens<sizeof...(CtorArgs)>::type());
-    ~PerThread() {
-        LOCK_REGION(&set->lock);
+            LOCK_REGION(&lock);
+            int code = pthread_setspecific(pthread_key, s);
+            assert(code == 0);
-        assert(set->map.count(self) == 1);
-        set->map.erase(self);
+            map[pthread_self()] = s;
+        }
+        return &s->val;
+    }
 };
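
For context, this is roughly how a PerThreadSet like the one above is meant to be used: each thread calls get(), which lazily constructs that thread's value from the stored constructor arguments and registers it with a pthread key so it is torn down on thread exit, while forEachValue lets another thread walk every live value under the set's lock (as gatherInterpreterRoots does above). The following is a hypothetical usage sketch, not code from the commit; Counter, worker, and the "core/threadutils.h" include path are assumptions:

// Hypothetical usage sketch of PerThreadSet (not part of this commit).
#include <cstdio>
#include <functional>
#include <pthread.h>

#include "core/threadutils.h" // assumed location of the header changed above

struct Counter {
    int hits;
    Counter(int start) : hits(start) {}
};

// One shared set; each thread lazily gets its own Counter(0).
static threading::PerThreadSet<Counter, int> counters(0);

static void* worker(void*) {
    for (int i = 0; i < 1000; i++)
        counters.get()->hits++; // per-thread value, so no locking needed here
    return NULL;
}

int main() {
    pthread_t threads[4];
    for (int i = 0; i < 4; i++)
        pthread_create(&threads[i], NULL, worker, NULL);

    // Another thread can visit every live per-thread value under the set's lock:
    counters.forEachValue(std::function<void(Counter*)>([](Counter* c) { printf("hits: %d\n", c->hits); }));

    for (int i = 0; i < 4; i++)
        pthread_join(threads[i], NULL);
    return 0;
}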
@@ -147,9 +147,21 @@ static Block* alloc_block(uint64_t size, Block** prev) {
     return rtn;
 }
+Heap::ThreadBlockCache::~ThreadBlockCache() {
+    LOCK_REGION(heap->lock);
+    for (int i = 0; i < NUM_BUCKETS; i++) {
+        if (cache_heads[i] == NULL)
+            continue;
+        assert(0);
+    }
+}
 void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) {
     _collectIfNeeded(rounded_size);
+    ThreadBlockCache* cache = thread_caches.get();
     LOCK_REGION(lock);
     Block* cur = *prev;
@@ -86,7 +86,23 @@ private:
     // DS_DEFINE_MUTEX(lock);
     DS_DEFINE_SPINLOCK(lock);
+    struct ThreadBlockCache {
+        Heap* heap;
+        Block* cache_heads[NUM_BUCKETS];
+        ThreadBlockCache(Heap* heap) : heap(heap) {
+            memset(cache_heads, 0, sizeof(cache_heads));
+        }
+        ~ThreadBlockCache();
+    };
+    friend class ThreadBlockCache;
+    // TODO only use thread caches if we're in GRWL mode?
+    threading::PerThreadSet<ThreadBlockCache, Heap*> thread_caches;
 public:
+    Heap() : thread_caches(this) {
+    }
     void* realloc(void* ptr, size_t bytes);
     void* alloc(size_t bytes) {
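
On the allocator side, this commit only wires up the plumbing: allocSmall fetches the calling thread's ThreadBlockCache but still takes the shared heap lock, and the cache destructor asserts that nothing was ever stored in it. The general technique being set up is a per-thread cache in front of a lock-protected shared pool, so the common allocation path avoids contention on the heap lock. A self-contained illustration of that pattern follows; it is hypothetical code, not Pyston's, and it uses C++11 thread_local where the commit's PerThreadSet/pthread_key machinery would instead handle cleanup on thread exit:

// Standalone sketch of a per-thread allocation cache in front of a shared,
// lock-protected free list (hypothetical; not Pyston code).
#include <cstdlib>
#include <pthread.h>
#include <vector>

static const size_t CHUNK_SIZE = 64;  // single size class, for simplicity
static const size_t CACHE_SIZE = 32;  // max chunks kept per thread

static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;
static std::vector<void*> global_free_list;          // shared; guarded by global_lock

static thread_local std::vector<void*> local_cache;  // per-thread; no locking needed

void* alloc_chunk() {
    // Fast path: serve the request from this thread's cache without locking.
    if (!local_cache.empty()) {
        void* p = local_cache.back();
        local_cache.pop_back();
        return p;
    }

    // Slow path: refill the local cache from the shared free list under the lock.
    pthread_mutex_lock(&global_lock);
    while (!global_free_list.empty() && local_cache.size() < CACHE_SIZE) {
        local_cache.push_back(global_free_list.back());
        global_free_list.pop_back();
    }
    pthread_mutex_unlock(&global_lock);

    if (!local_cache.empty()) {
        void* p = local_cache.back();
        local_cache.pop_back();
        return p;
    }
    return malloc(CHUNK_SIZE);  // nothing cached anywhere: fall back to the system allocator
}

void free_chunk(void* p) {
    // Frees fill the local cache first; overflow spills to the shared list.
    if (local_cache.size() < CACHE_SIZE) {
        local_cache.push_back(p);
        return;
    }
    pthread_mutex_lock(&global_lock);
    global_free_list.push_back(p);
    pthread_mutex_unlock(&global_lock);
}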