Commit 94061fa5 authored by Kevin Modzelewski

Add enough locking to make the GRWL somewhat safe

Add some basic locking to:
- code generation (one lock for all of it)
- garbage collection (spin lock for allocations, global serialization for collections)
- lists (mutex per list object)

The GRWL build can now run some simple tests (microbenchmarks/thread_contention.py and thread_uncontended.py).
Performance is not great yet.
parent aa2b47ec
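For context (a sketch of the idea, not the exact code in src/core/threading.cpp): the GRWL replaces the GIL's single mutex with a global pthread read-write lock. Ordinary Python threads hold the read side, so they can run in parallel; anything that needs exclusive access, like a GC collection, promotes to the write side and waits for the readers to drain. Using the acquireGLRead/promoteGL names this commit declares in core/threading.h:

```
#include <pthread.h>

// Sketch of the GRWL idea; the real implementation also tracks
// per-thread saved state and a writers_waiting counter.
static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;

void acquireGLRead() { pthread_rwlock_rdlock(&grwl); }   // normal execution
void releaseGLRead() { pthread_rwlock_unlock(&grwl); }

void promoteGL() {  // e.g. entering a GC collection
    pthread_rwlock_unlock(&grwl);
    pthread_rwlock_wrlock(&grwl);
}

void demoteGL() {   // collection done; back to shared mode
    pthread_rwlock_unlock(&grwl);
    pthread_rwlock_rdlock(&grwl);
}
```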
......@@ -208,3 +208,31 @@ sudo apt-get install rlwrap
```
and when you do `make run`, the Make system will invoke rlwrap. If you want to invoke the repl manually, you can do `rlwrap ./pyston`
### ninja-based LLVM build
Ninja is supposed to be faster than make; I've only tried it very briefly, and it does seem to be faster when modifying LLVM files. May or may not be worth using; thought I'd jot down my notes though:
```
cd ~/pyston_deps
git clone https://github.com/martine/ninja.git
cd ninja
git checkout v1.4.0
./bootstrap.py
```
```
cd ~/pyston_deps
wget http://www.cmake.org/files/v3.0/cmake-3.0.0.tar.gz
tar -xzf cmake-3.0.0.tar.gz
cd cmake-3.0.0
./configure
make -j4
```
```
cd ~/pyston_deps
mkdir llvm-trunk-cmake
cd llvm-trunk-cmake
CXX=g++ CC=gcc PATH=~/pyston_deps/gcc-4.8.2-install/bin:$PATH:~/pyston_deps/ninja CMAKE_MAKE_PROGRAM=~/pyston_deps/ninja/ninja ~/pyston_deps/cmake-3.0.0/bin/cmake ../llvm-trunk -G Ninja -DLLVM_TARGETS_TO_BUILD=host -DCMAKE_BUILD_TYPE=RELEASE -DLLVM_ENABLE_ASSERTIONS=ON
~/pyston_deps/ninja/ninja # runs in parallel
```
microbenchmarks/thread_contention.py:

from thread import start_new_thread
import time

work = []
done = []

def run(idx, num):
    print "thread %d starting" % idx, work
    for i in xrange(num):
        t = work.pop()
        work.append(t - 1)
        if i % 100000 == 0:
            print idx, i
    done.append(num)
    print "thread %d done" % idx

print "starting!"
nthreads = 1
N = 20000000 / nthreads
for i in xrange(nthreads):
    work.append(N)
for i in xrange(nthreads):
    t = start_new_thread(run, (i, N))

while len(done) < nthreads:
    time.sleep(0.1)

# print work
assert sum(work) == 0
print work
microbenchmarks/thread_uncontended.py:

from thread import start_new_thread
import time

done = []

def run(idx, work, num):
    print "thread %d starting" % idx, work
    for i in xrange(num):
        # t = work.pop()
        # work.append(t - 1)
        if i % 100000 == 0:
            print idx, i
    done.append(num)
    print "thread %d done" % idx

print "starting!"
nthreads = 1
N = 20000000 / nthreads
for i in xrange(nthreads):
    t = start_new_thread(run, (i, [N], N))

while len(done) < nthreads:
    time.sleep(0.1)
# print work
......@@ -710,12 +710,18 @@ endef
RUN_DEPS := ext

.PHONY: run run_release profile
run: $(RUN_DEPS)
run: pyston_dbg $(RUN_DEPS)
	if which rlwrap >/dev/null; then\
		rlwrap ./pyston_dbg $(ARGS) ;\
	else \
		./pyston_dbg $(ARGS) ;\
	fi

dbg: pyston_dbg $(RUN_DEPS)
	if which rlwrap >/dev/null; then\
		rlwrap zsh -c 'ulimit -v $(MAX_DBG_MEM_KB); $(GDB) $(GDB_CMDS) --args ./pyston_dbg $(ARGS)' ; \
	else \
		zsh -c 'ulimit -v $(MAX_DBG_MEM_KB); $(GDB) $(GDB_CMDS) --args ./pyston_dbg $(ARGS)' ; \
	fi

run_release: pyston $(RUN_DEPS)
	./pyston $(ARGS)

profile: pyston_profile $(RUN_DEPS)
......@@ -785,7 +791,7 @@ $(call make_search,pprof_%)
perf_%: %.py pyston
perf record -g -- ./pyston -q -p $(ARGS) $<
perf report -v -g flat,1000 | bash $(TOOLS_DIR)/cumulate.sh | less -S
perf report -v -n -g flat,1000 | bash $(TOOLS_DIR)/cumulate.sh | less -S
$(call make_search,perf_%)
perf_dbg_%: %.py pyston_dbg
perf record -g -- ./pyston_dbg -q -p $(ARGS) $<
......
......@@ -28,6 +28,8 @@
namespace pyston {
DS_DEFINE_RWLOCK(codegen_rwlock);
void FunctionAddressRegistry::registerFunction(const std::string& name, void* addr, int length,
llvm::Function* llvm_func) {
assert(addr);
......
......@@ -18,6 +18,7 @@
#include <unordered_map>
#include "codegen/runtime_hooks.h"
#include "core/threading.h"
#include "core/types.h"
namespace llvm {
......@@ -85,6 +86,8 @@ extern GlobalState g;
void initGlobalFuncs(GlobalState& g);
const LineInfo* getLineInfoFor(uint64_t inst_addr);
DS_DECLARE_RWLOCK(codegen_rwlock);
}
#endif
......@@ -140,6 +140,7 @@ static void compileIR(CompiledFunction* cf, EffortLevel::EffortLevel effort) {
// Compiles a new version of the function with the given signature and adds it to the list;
// should only be called after checking to see if the other versions would work.
// The codegen_lock needs to be held in W mode before calling this function:
CompiledFunction* compileFunction(CLFunction* f, FunctionSpecialization* spec, EffortLevel::EffortLevel effort,
const OSREntryDescriptor* entry) {
Timer _t("for compileFunction()");
......@@ -239,6 +240,8 @@ CompiledFunction* compileFunction(CLFunction* f, FunctionSpecialization* spec, E
}
void compileAndRunModule(AST_Module* m, BoxedModule* bm) {
LOCK_REGION(codegen_rwlock.asWrite());
Timer _t("for compileModule()");
ScopingAnalysis* scoping = runScopingAnalysis(m);
......@@ -268,6 +271,8 @@ void compileAndRunModule(AST_Module* m, BoxedModule* bm) {
/// The cf must be an active version in its parent's CLFunction; the given
/// version will be replaced by the new version, which will be returned.
static CompiledFunction* _doReopt(CompiledFunction* cf, EffortLevel::EffortLevel new_effort) {
LOCK_REGION(codegen_rwlock.asWrite());
assert(cf->clfunc->versions.size());
assert(cf);
......@@ -304,6 +309,8 @@ static CompiledFunction* _doReopt(CompiledFunction* cf, EffortLevel::EffortLevel
static StatCounter stat_osrexits("OSR exits");
void* compilePartialFunc(OSRExit* exit) {
LOCK_REGION(codegen_rwlock.asWrite());
assert(exit);
assert(exit->parent_cf);
assert(exit->parent_cf->effort < EffortLevel::MAXIMAL);
......
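compileFunction() itself does not take the lock (per the comment above, the caller must already hold it), so every entry point that can reach it (compileAndRunModule, _doReopt, compilePartialFunc, and later pickVersion) grabs the write side first. A minimal sketch of that calling convention; the wrapper function here is illustrative, not part of the commit:

```
static CompiledFunction* compileUnderLock(CLFunction* f, FunctionSpecialization* spec) {
    // Hold the codegen lock in write mode for the whole compilation;
    // the RAII guard releases it at scope exit.
    LOCK_REGION(codegen_rwlock.asWrite());
    return compileFunction(f, spec, initialEffort(), NULL);
}
```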
......@@ -216,7 +216,7 @@ threading::PerThread<root_stack_t> thread_local root_stack(&root_stack_set);
void gatherInterpreterRoots(GCVisitor* visitor) {
// In theory this lock should be superfluous since we should only call this
// inside a sequential section, but lock it anyway:
threading::LockedRegion _lock(&root_stack_set.lock);
LOCK_REGION(&root_stack_set.lock);
for (auto& p : root_stack_set.map) {
for (const SymMap* sym_map : *p.second) {
......
......@@ -15,6 +15,10 @@
#ifndef PYSTON_CODEGEN_RUNTIMEHOOKS_H
#define PYSTON_CODEGEN_RUNTIMEHOOKS_H
// This file doesn't actually need to include core/types.h, but including it works around this clang bug:
// http://lists.cs.uiuc.edu/pipermail/cfe-dev/2014-June/037519.html
#include "core/types.h"
namespace llvm {
class Value;
}
......
......@@ -29,6 +29,8 @@
#define _STRINGIFY(N) #N
#define STRINGIFY(N) _STRINGIFY(N)
#define _CAT(A, B) A##B
#define CAT(A, B) _CAT(A, B)
// GCC and clang handle always_inline very differently;
// we mostly only care about it for the stdlib, so just remove the attributes
......
......@@ -16,24 +16,109 @@
#define PYSTON_CORE_THREADUTILS_H
#include <pthread.h>
#include <unordered_map>
namespace pyston {
namespace threading {
class LockedRegion {
template <typename T> class _LockedRegion {
private:
    pthread_mutex_t* mutex;
    T* const mutex;

public:
    LockedRegion(pthread_mutex_t* mutex) : mutex(mutex) { pthread_mutex_lock(mutex); }
    ~LockedRegion() { pthread_mutex_unlock(mutex); }
    _LockedRegion(T* mutex) : mutex(mutex) { mutex->lock(); }
    ~_LockedRegion() { mutex->unlock(); }
};

template <typename T> _LockedRegion<T> _makeLockedRegion(T* mutex) {
    return _LockedRegion<T>(mutex);
}
template <typename T> _LockedRegion<T> _makeLockedRegion(T& mutex) {
    return _LockedRegion<T>(&mutex);
}
#define LOCK_REGION(lock) auto CAT(_lock_, __LINE__) = pyston::threading::_makeLockedRegion(lock)

class NopLock {
public:
    void lock() {}
    void unlock() {}

    NopLock* asRead() { return this; }
    NopLock* asWrite() { return this; }
};

class PthreadFastMutex {
private:
    pthread_mutex_t mutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP;

public:
    void lock() { pthread_mutex_lock(&mutex); }
    void unlock() { pthread_mutex_unlock(&mutex); }

    PthreadFastMutex* asRead() { return this; }
    PthreadFastMutex* asWrite() { return this; }
};

class PthreadMutex {
private:
    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

public:
    void lock() { pthread_mutex_lock(&mutex); }
    void unlock() { pthread_mutex_unlock(&mutex); }

    PthreadMutex* asRead() { return this; }
    PthreadMutex* asWrite() { return this; }
};

class PthreadRWLock {
private:
    pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;

public:
    class PthreadRWLockRead {
    private:
        pthread_rwlock_t rwlock;
        PthreadRWLockRead() = delete;

    public:
        void lock() { pthread_rwlock_rdlock(&rwlock); }
        void unlock() { pthread_rwlock_unlock(&rwlock); }
    };

    class PthreadRWLockWrite {
    private:
        pthread_rwlock_t rwlock;
        PthreadRWLockWrite() = delete;

    public:
        void lock() { pthread_rwlock_wrlock(&rwlock); }
        void unlock() { pthread_rwlock_unlock(&rwlock); }
    };

    PthreadRWLockRead* asRead() { return reinterpret_cast<PthreadRWLockRead*>(this); }
    PthreadRWLockWrite* asWrite() { return reinterpret_cast<PthreadRWLockWrite*>(this); }
};

class PthreadSpinLock {
private:
    pthread_spinlock_t spinlock;

public:
    PthreadSpinLock() { pthread_spin_init(&spinlock, false); }

    void lock() { pthread_spin_lock(&spinlock); }
    void unlock() { pthread_spin_unlock(&spinlock); }

    PthreadSpinLock* asRead() { return this; }
    PthreadSpinLock* asWrite() { return this; }
};

template <typename T> class PerThreadSet {
public:
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    PthreadFastMutex lock;
    std::unordered_map<pthread_t, T*> map;
};
......@@ -46,13 +131,13 @@ public:
    T value;

    PerThread(PerThreadSet<T>* set) : set(set), self(pthread_self()) {
        LockedRegion _lock(&set->lock);
        LOCK_REGION(&set->lock);
        set->map[self] = &value;
    }

    ~PerThread() {
        LockedRegion _lock(&set->lock);
        LOCK_REGION(&set->lock);
        assert(set->map.count(self) == 1);
        set->map.erase(self);
......
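All of these classes expose the same lock()/unlock() interface, which is what lets LOCK_REGION be generic: CAT(_lock_, __LINE__) manufactures a unique variable name so two regions can coexist in one scope, and _makeLockedRegion has pointer and reference overloads so both LOCK_REGION(&set->lock) and LOCK_REGION(self->lock.asWrite()) work. Roughly, the expansion looks like:

```
// LOCK_REGION(&threading_lock) appearing on, say, line 57 expands to:
auto _lock_57 = pyston::threading::_makeLockedRegion(&threading_lock);
// ...i.e. a _LockedRegion<PthreadFastMutex> whose constructor calls
// threading_lock.lock() and whose destructor calls threading_lock.unlock()
// when _lock_57 goes out of scope.
```

Note also the asRead()/asWrite() trick on PthreadRWLock: the nested PthreadRWLockRead/PthreadRWLockWrite classes wrap the same pthread_rwlock_t layout, so reinterpret_cast'ing the parent object selects which lock()/unlock() flavor the guard will call, at the cost of relying on layout compatibility.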
......@@ -46,7 +46,7 @@ int tgkill(int tgid, int tid, int sig) {
// and wait until they start up.
int num_starting_threads(0);
static pthread_mutex_t threading_lock = PTHREAD_MUTEX_INITIALIZER;
PthreadFastMutex threading_lock;
struct ThreadInfo {
// "bottom" in the sense of a stack, which in a down-growing stack is the highest address:
void* stack_bottom;
......@@ -86,14 +86,14 @@ std::vector<ThreadState> getAllThreadStates() {
// though I suppose that will have been taken care of
// by the caller of this function.
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
while (true) {
// TODO shouldn't busy-wait:
if (num_starting_threads) {
pthread_mutex_unlock(&threading_lock);
threading_lock.unlock();
sleep(0);
pthread_mutex_lock(&threading_lock);
threading_lock.lock();
} else {
break;
}
......@@ -129,9 +129,9 @@ std::vector<ThreadState> getAllThreadStates() {
// TODO shouldn't busy-wait:
while (signals_waiting) {
pthread_mutex_unlock(&threading_lock);
threading_lock.unlock();
sleep(0);
pthread_mutex_lock(&threading_lock);
threading_lock.lock();
}
assert(num_starting_threads == 0);
......@@ -140,7 +140,7 @@ std::vector<ThreadState> getAllThreadStates() {
}
static void _thread_context_dump(int signum, siginfo_t* info, void* _context) {
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
ucontext_t* context = static_cast<ucontext_t*>(_context);
......@@ -169,7 +169,7 @@ static void* _thread_start(void* _arg) {
delete arg;
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
pid_t tid = gettid();
pthread_t current_thread = pthread_self();
......@@ -207,7 +207,7 @@ static void* _thread_start(void* _arg) {
void* rtn = start_func(arg1, arg2, arg3);
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
current_threads.erase(gettid());
saved_thread_states.erase(gettid());
......@@ -220,7 +220,7 @@ static void* _thread_start(void* _arg) {
intptr_t start_thread(void* (*start_func)(Box*, Box*, Box*), Box* arg1, Box* arg2, Box* arg3) {
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
num_starting_threads++;
}
......@@ -282,7 +282,7 @@ static void* find_stack() {
intptr_t call_frame_base;
void registerMainThread() {
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
// Would be nice if we could set this to the pthread start_thread,
// since _thread_start doesn't always show up in the traceback.
......@@ -316,7 +316,7 @@ GLAllowThreadsReadRegion::GLAllowThreadsReadRegion() {
releaseGLRead();
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
ThreadStateInternal& state = saved_thread_states[gettid()];
assert(!state.valid);
......@@ -327,7 +327,7 @@ GLAllowThreadsReadRegion::GLAllowThreadsReadRegion() {
GLAllowThreadsReadRegion::~GLAllowThreadsReadRegion() {
{
LockedRegion _lock(&threading_lock);
LOCK_REGION(&threading_lock);
saved_thread_states[gettid()].valid = false;
}
......@@ -337,6 +337,10 @@ GLAllowThreadsReadRegion::~GLAllowThreadsReadRegion() {
#if THREADING_USE_GIL
#if THREADING_USE_GRWL
#error "Can't turn on both the GIL and the GRWL!"
#endif
static pthread_mutex_t gil = PTHREAD_MUTEX_INITIALIZER;
static std::atomic<int> threads_waiting_on_gil(0);
......@@ -367,9 +371,7 @@ void allowGLReadPreemption() {
acquireGLRead();
}
}
#endif
#if THREADING_USE_GRWL
#elif THREADING_USE_GRWL
static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;
enum class GRWLHeldState {
......@@ -420,16 +422,23 @@ void demoteGL() {
acquireGLRead();
}
static __thread int gl_check_count = 0;
void allowGLReadPreemption() {
assert(grwl_state == GRWLHeldState::R);
if (!writers_waiting.load(std::memory_order_relaxed))
gl_check_count++;
if (gl_check_count < 1000)
return;
gl_check_count = 0;
if (__builtin_expect(!writers_waiting.load(std::memory_order_relaxed), 1))
return;
pthread_rwlock_unlock(&grwl);
// printf("waiters!\n");
sleep(0);
pthread_rwlock_rdlock(&grwl);
}
#endif
} // namespace threading
......
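The reworked allowGLReadPreemption() amortizes its own cost: instead of consulting writers_waiting on every call, each thread bumps a thread-local counter and only checks every 1000th call, and only cycles the read lock when a writer is actually waiting. The same pattern in isolation (a sketch assuming the grwl rwlock and writers_waiting atomic that this file uses):

```
#include <atomic>
#include <pthread.h>
#include <unistd.h>

static pthread_rwlock_t grwl = PTHREAD_RWLOCK_INITIALIZER;
static std::atomic<int> writers_waiting(0);
static __thread int gl_check_count = 0;

void allowGLReadPreemption() {
    // Cheap path, taken 999 times out of 1000: one thread-local increment.
    if (++gl_check_count < 1000)
        return;
    gl_check_count = 0;

    // Usually no writer is waiting; hint the branch predictor accordingly.
    if (__builtin_expect(!writers_waiting.load(std::memory_order_relaxed), 1))
        return;

    // A writer (e.g. the collector) is waiting: release the read lock,
    // yield, and re-acquire once the writer has had its turn.
    pthread_rwlock_unlock(&grwl);
    sleep(0);
    pthread_rwlock_rdlock(&grwl);
}
```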
......@@ -20,6 +20,9 @@
#include <ucontext.h>
#include <vector>
#include "core/common.h"
#include "core/thread_utils.h"
namespace pyston {
class Box;
......@@ -58,6 +61,22 @@ void* getStackBottom();
#define THREADING_USE_GRWL 0
#define THREADING_SAFE_DATASTRUCTURES THREADING_USE_GRWL
#if THREADING_SAFE_DATASTRUCTURES
#define DS_DEFINE_MUTEX(name) pyston::threading::PthreadFastMutex name
#define DS_DECLARE_RWLOCK(name) extern pyston::threading::PthreadRWLock name
#define DS_DEFINE_RWLOCK(name) pyston::threading::PthreadRWLock name
#define DS_DEFINE_SPINLOCK(name) pyston::threading::PthreadSpinLock name
#else
#define DS_DEFINE_MUTEX(name) pyston::threading::NopLock name
#define DS_DECLARE_RWLOCK(name) extern pyston::threading::NopLock name
#define DS_DEFINE_RWLOCK(name) pyston::threading::NopLock name
#define DS_DEFINE_SPINLOCK(name) pyston::threading::NopLock name
#endif
void acquireGLRead();
void releaseGLRead();
void acquireGLWrite();
......@@ -102,6 +121,23 @@ inline void demoteGL() {
}
#endif
#if !THREADING_USE_GIL && !THREADING_USE_GRWL
inline void acquireGLRead() {
}
inline void releaseGLRead() {
}
inline void acquireGLWrite() {
}
inline void releaseGLWrite() {
}
inline void promoteGL() {
}
inline void demoteGL() {
}
inline void allowGLReadPreemption() {
}
#endif
} // namespace threading
} // namespace pyston
......
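The DS_* macros let data-structure locking compile away entirely: with THREADING_SAFE_DATASTRUCTURES == 0 (the GIL build) every lock declared through them is a NopLock, whose empty inline lock()/unlock() make LOCK_REGION a no-op. For example (the function body here is illustrative):

```
DS_DEFINE_RWLOCK(codegen_rwlock);  // PthreadRWLock under the GRWL, NopLock under the GIL

void mutateCodegenState() {
    // A real write lock in a GRWL build; compiles to nothing in a GIL build.
    LOCK_REGION(codegen_rwlock.asWrite());
    // ... touch shared codegen data structures ...
}
```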
......@@ -26,20 +26,28 @@
namespace pyston {
int Timer::level = 0;
Timer::Timer(const char* desc, int min_usec) : min_usec(min_usec), ended(true) {
Timer::Timer(const char* desc) : min_usec(-1), ended(true) {
restart(desc);
}
Timer::Timer(const char* desc, long min_usec) : min_usec(min_usec), ended(true) {
restart(desc);
}
void Timer::restart(const char* newdesc, int min_usec) {
void Timer::restart(const char* newdesc) {
assert(ended);
desc = newdesc;
this->min_usec = min_usec;
gettimeofday(&start_time, NULL);
Timer::level++;
ended = false;
}
void Timer::restart(const char* newdesc, long new_min_usec) {
this->min_usec = new_min_usec;
restart(newdesc);
}
long Timer::end() {
if (!ended) {
timeval end;
......
......@@ -28,18 +28,21 @@ private:
static int level;
timeval start_time;
const char* desc;
int min_usec;
long min_usec;
bool ended;
public:
Timer(const char* desc, int min_usec = -1);
Timer(const char* desc);
Timer(const char* desc, long min_usec);
~Timer();
void restart(const char* newdesc, int min_usec = -1);
void restart(const char* newdesc, long new_min_usec);
void restart(const char* newdesc);
long end();
long split(const char* newdesc, int min_usec = -1) {
long split(const char* newdesc) {
long rtn = end();
restart(newdesc, min_usec);
restart(newdesc);
return rtn;
}
};
......
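The Timer change splits the old default-argument constructor and restart() into explicit overloads, and widens min_usec from int to long to match end()'s microsecond return type. Call sites look the same as before; the GC collection timer from this commit is representative:

```
Timer _t("collecting", /*min_usec=*/10000);  // only reported if it took >= 10ms
markPhase();
sweepPhase();
long us = _t.end();  // elapsed microseconds, fed into the gc_collections_us counter
```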
......@@ -22,6 +22,7 @@
#include "core/common.h"
#include "core/threading.h"
#include "core/types.h"
#include "core/util.h"
#include "gc/heap.h"
#include "gc/root_finder.h"
......@@ -35,9 +36,6 @@
namespace pyston {
namespace gc {
// unsigned numAllocs = 0;
unsigned bytesAllocatedSinceCollection = 0;
static TraceStack roots;
void registerStaticRootObj(void* obj) {
assert(global_heap.getAllocationFromInteriorPointer(obj));
......@@ -159,17 +157,26 @@ void runCollection() {
static StatCounter sc("gc_collections");
sc.log();
threading::GLPromoteRegion _lock;
if (VERBOSITY("gc") >= 2)
printf("Collection #%d\n", ++ncollections);
threading::GLPromoteRegion _lock;
Timer _t("collecting", /*min_usec=*/10000);
// if (ncollections == 754) {
// raise(SIGTRAP);
//}
markPhase();
sweepPhase();
if (VERBOSITY("gc") >= 2)
printf("Collection #%d done\n", ++ncollections);
long us = _t.end();
static StatCounter sc_us("gc_collections_us");
sc_us.log(us);
}
} // namespace gc
......
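runCollection() now takes its GLPromoteRegion before doing anything else, including the verbosity print: this is the "global serialization for collections" from the commit message. Promotion drains all reader threads, so mark/sweep runs with the world stopped. A sketch of what such an RAII promote region plausibly looks like (illustrative; the real class is declared in core/threading.h):

```
// Hold the global lock in write (exclusive) mode for the region's lifetime,
// then drop back to shared read mode.
class GLPromoteRegion {
public:
    GLPromoteRegion() { threading::promoteGL(); }
    ~GLPromoteRegion() { threading::demoteGL(); }
};
```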
......@@ -34,7 +34,7 @@ namespace gc {
// extern unsigned numAllocs;
//#define ALLOCS_PER_COLLECTION 1000
extern unsigned bytesAllocatedSinceCollection;
unsigned bytesAllocatedSinceCollection;
#define ALLOCBYTES_PER_COLLECTION 2000000
void _collectIfNeeded(size_t bytes) {
......@@ -97,6 +97,8 @@ struct LargeObj {
void* Heap::allocLarge(size_t size) {
_collectIfNeeded(size);
LOCK_REGION(lock);
size_t total_size = size + sizeof(LargeObj);
total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
LargeObj* rtn = (LargeObj*)large_arena.doMmap(total_size);
......@@ -148,6 +150,8 @@ static Block* alloc_block(uint64_t size, Block** prev) {
void* Heap::allocSmall(size_t rounded_size, Block** prev, Block** full_head) {
_collectIfNeeded(rounded_size);
LOCK_REGION(lock);
Block* cur = *prev;
assert(!cur || prev == cur->prev);
int scanned = 0;
......
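Note the ordering in both allocLarge() and allocSmall(): _collectIfNeeded() runs before the heap lock is taken. That ordering is presumably deliberate: a collection promotes to the global write lock and must wait for every other thread, and a thread spinning on the heap's spinlock never reaches a preemption point, so collecting while holding the allocation lock could deadlock. The resulting shape of the allocation path:

```
void* Heap::allocLarge(size_t size) {
    _collectIfNeeded(size);  // may trigger a stop-the-world collection;
                             // must not hold the heap lock here

    LOCK_REGION(lock);       // spinlock guarding the heap bookkeeping
    // ... carve out and return the allocation ...
}
```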
......@@ -18,6 +18,7 @@
#include <cstdint>
#include "core/common.h"
#include "core/threading.h"
namespace pyston {
namespace gc {
......@@ -82,6 +83,9 @@ private:
void* allocSmall(size_t rounded_size, Block** head, Block** full_head);
void* allocLarge(size_t bytes);
// DS_DEFINE_MUTEX(lock);
DS_DEFINE_SPINLOCK(lock);
public:
void* realloc(void* ptr, size_t bytes);
......
......@@ -110,6 +110,8 @@ extern "C" void listAppendInternal(Box* s, Box* v) {
extern "C" void listAppendArrayInternal(Box* s, Box** v, int nelts) {
// Lock must be held!
assert(s->cls == list_cls);
BoxedList* self = static_cast<BoxedList*>(s);
......@@ -127,6 +129,8 @@ extern "C" Box* listAppend(Box* s, Box* v) {
assert(s->cls == list_cls);
BoxedList* self = static_cast<BoxedList*>(s);
LOCK_REGION(self->lock.asWrite());
listAppendInternal(self, v);
return None;
......
......@@ -32,6 +32,8 @@
namespace pyston {
extern "C" Box* listRepr(BoxedList* self) {
LOCK_REGION(self->lock.asRead());
// TODO highly inefficient with all the string copying
std::ostringstream os;
os << '[';
......@@ -51,6 +53,8 @@ extern "C" Box* listNonzero(BoxedList* self) {
}
extern "C" Box* listPop(BoxedList* self, Box* idx) {
LOCK_REGION(self->lock.asWrite());
if (idx == None) {
if (self->size == 0) {
raiseExcHelper(IndexError, "pop from empty list");
......@@ -115,6 +119,8 @@ Box* _listSlice(BoxedList* self, i64 start, i64 stop, i64 step) {
}
extern "C" Box* listGetitemInt(BoxedList* self, BoxedInt* slice) {
LOCK_REGION(self->lock.asRead());
assert(self->cls == list_cls);
assert(slice->cls == int_cls);
int64_t n = slice->n;
......@@ -129,6 +135,8 @@ extern "C" Box* listGetitemInt(BoxedList* self, BoxedInt* slice) {
}
extern "C" Box* listGetitemSlice(BoxedList* self, BoxedSlice* slice) {
LOCK_REGION(self->lock.asRead());
assert(self->cls == list_cls);
assert(slice->cls == slice_cls);
i64 start, stop, step;
......@@ -149,6 +157,9 @@ extern "C" Box* listGetitem(BoxedList* self, Box* slice) {
extern "C" Box* listSetitemInt(BoxedList* self, BoxedInt* slice, Box* v) {
// I think r lock is ok here, since we don't change the list structure:
LOCK_REGION(self->lock.asRead());
assert(self->cls == list_cls);
assert(slice->cls == int_cls);
int64_t n = slice->n;
......@@ -164,6 +175,8 @@ extern "C" Box* listSetitemInt(BoxedList* self, BoxedInt* slice, Box* v) {
}
extern "C" Box* listSetitemSlice(BoxedList* self, BoxedSlice* slice, Box* v) {
LOCK_REGION(self->lock.asWrite());
assert(self->cls == list_cls);
assert(slice->cls == slice_cls);
i64 start, stop, step;
......@@ -204,6 +217,8 @@ extern "C" Box* listSetitem(BoxedList* self, Box* slice, Box* v) {
}
extern "C" Box* listDelitemInt(BoxedList* self, BoxedInt* slice) {
LOCK_REGION(self->lock.asWrite());
int64_t n = slice->n;
if (n < 0)
n = self->size + n;
......@@ -217,6 +232,8 @@ extern "C" Box* listDelitemInt(BoxedList* self, BoxedInt* slice) {
}
extern "C" Box* listDelitemSlice(BoxedList* self, BoxedSlice* slice) {
LOCK_REGION(self->lock.asWrite());
i64 start, stop, step;
parseSlice(slice, self->size, &start, &stop, &step);
RELEASE_ASSERT(step == 1, "step sizes must be 1 for now");
......@@ -233,6 +250,8 @@ extern "C" Box* listDelitemSlice(BoxedList* self, BoxedSlice* slice) {
}
extern "C" Box* listDelitem(BoxedList* self, Box* slice) {
LOCK_REGION(self->lock.asWrite());
Box* rtn;
if (slice->cls == int_cls) {
......@@ -251,6 +270,8 @@ extern "C" Box* listInsert(BoxedList* self, Box* idx, Box* v) {
raiseExcHelper(TypeError, "an integer is required");
}
LOCK_REGION(self->lock.asWrite());
int64_t n = static_cast<BoxedInt*>(idx)->n;
if (n < 0)
n = self->size + n;
......@@ -277,6 +298,8 @@ Box* listMul(BoxedList* self, Box* rhs) {
raiseExcHelper(TypeError, "can't multiply sequence by non-int of type '%s'", getTypeName(rhs)->c_str());
}
LOCK_REGION(self->lock.asRead());
int n = static_cast<BoxedInt*>(rhs)->n;
int s = self->size;
......@@ -300,6 +323,8 @@ Box* listIAdd(BoxedList* self, Box* _rhs) {
raiseExcHelper(TypeError, "can only concatenate list (not \"%s\") to list", getTypeName(_rhs)->c_str());
}
LOCK_REGION(self->lock.asWrite());
BoxedList* rhs = static_cast<BoxedList*>(_rhs);
int s1 = self->size;
......@@ -316,6 +341,8 @@ Box* listAdd(BoxedList* self, Box* _rhs) {
raiseExcHelper(TypeError, "can only concatenate list (not \"%s\") to list", getTypeName(_rhs)->c_str());
}
LOCK_REGION(self->lock.asRead());
BoxedList* rhs = static_cast<BoxedList*>(_rhs);
BoxedList* rtn = new BoxedList();
......@@ -331,6 +358,8 @@ Box* listAdd(BoxedList* self, Box* _rhs) {
}
Box* listSort1(BoxedList* self) {
LOCK_REGION(self->lock.asWrite());
assert(self->cls == list_cls);
std::sort<Box**, PyLt>(self->elts->elts, self->elts->elts + self->size, PyLt());
......@@ -339,6 +368,8 @@ Box* listSort1(BoxedList* self) {
}
Box* listContains(BoxedList* self, Box* elt) {
LOCK_REGION(self->lock.asRead());
int size = self->size;
for (int i = 0; i < size; i++) {
Box* e = self->elts->elts[i];
......@@ -351,6 +382,8 @@ Box* listContains(BoxedList* self, Box* elt) {
}
Box* listCount(BoxedList* self, Box* elt) {
LOCK_REGION(self->lock.asRead());
int size = self->size;
int count = 0;
......@@ -365,6 +398,8 @@ Box* listCount(BoxedList* self, Box* elt) {
}
Box* listIndex(BoxedList* self, Box* elt) {
LOCK_REGION(self->lock.asRead());
int size = self->size;
for (int i = 0; i < size; i++) {
......@@ -380,6 +415,8 @@ Box* listIndex(BoxedList* self, Box* elt) {
}
Box* listRemove(BoxedList* self, Box* elt) {
LOCK_REGION(self->lock.asWrite());
assert(self->cls == list_cls);
for (int i = 0; i < self->size; i++) {
......@@ -398,6 +435,8 @@ Box* listRemove(BoxedList* self, Box* elt) {
}
Box* listReverse(BoxedList* self) {
LOCK_REGION(self->lock.asWrite());
assert(self->cls == list_cls);
for (int i = 0, j = self->size - 1; i < j; i++, j--) {
Box* e = self->elts->elts[i];
......@@ -475,6 +514,9 @@ Box* listEq(BoxedList* self, Box* rhs) {
if (rhs->cls != list_cls) {
return NotImplemented;
}
LOCK_REGION(self->lock.asRead());
return _listCmp(self, static_cast<BoxedList*>(rhs), AST_TYPE::Eq);
}
......
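The discipline across these list hooks: structural mutations (pop, setitem-slice, delitem, insert, iadd, sort, remove, reverse) take self->lock.asWrite(), pure readers (repr, getitem, contains, count, index, add, mul, eq) take asRead(), and listSetitemInt gets a read lock because it overwrites one element in place without changing size or the elts array. Since BoxedList's lock is DS_DEFINE_MUTEX (a plain PthreadFastMutex whose asRead() and asWrite() both return the same exclusive lock), the read/write distinction currently documents intent rather than enabling parallelism. The pattern in sketch form, with the bodies elided:

```
extern "C" Box* listGetitemInt(BoxedList* self, BoxedInt* slice) {
    LOCK_REGION(self->lock.asRead());   // shared today == exclusive; records intent
    // ... bounds-check slice->n and return self->elts->elts[n] ...
}

extern "C" Box* listPop(BoxedList* self, Box* idx) {
    LOCK_REGION(self->lock.asWrite());  // structural change: must be exclusive
    // ... remove the element and shrink self->size ...
}
```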
......@@ -1577,6 +1577,60 @@ static inline Box*& getArg(int idx, Box*& arg1, Box*& arg2, Box*& arg3, Box** ar
return args[idx - 3];
}
static CompiledFunction* pickVersion(CLFunction* f, int num_output_args, Box* oarg1, Box* oarg2, Box* oarg3,
Box** oargs) {
LOCK_REGION(codegen_rwlock.asWrite());
CompiledFunction* chosen_cf = NULL;
for (CompiledFunction* cf : f->versions) {
assert(cf->spec->arg_types.size() == num_output_args);
if (cf->spec->rtn_type->llvmType() != UNKNOWN->llvmType())
continue;
bool works = true;
for (int i = 0; i < num_output_args; i++) {
Box* arg = getArg(i, oarg1, oarg2, oarg3, oargs);
ConcreteCompilerType* t = cf->spec->arg_types[i];
if ((arg && !t->isFitBy(arg->cls)) || (!arg && t != UNKNOWN)) {
works = false;
break;
}
}
if (!works)
continue;
chosen_cf = cf;
break;
}
if (chosen_cf == NULL) {
if (f->source == NULL) {
// TODO I don't think this should be happening any more?
printf("Error: couldn't find suitable function version and no source to recompile!\n");
abort();
}
std::vector<ConcreteCompilerType*> arg_types;
for (int i = 0; i < num_output_args; i++) {
Box* arg = getArg(i, oarg1, oarg2, oarg3, oargs);
assert(arg); // only builtin functions can pass NULL args
arg_types.push_back(typeFromClass(arg->cls));
}
FunctionSpecialization* spec = new FunctionSpecialization(UNKNOWN, arg_types);
EffortLevel::EffortLevel new_effort = initialEffort();
// this also pushes the new CompiledVersion to the back of the version list:
chosen_cf = compileFunction(f, spec, new_effort, NULL);
}
return chosen_cf;
}
Box* callFunc(BoxedFunction* func, CallRewriteArgs* rewrite_args, ArgPassSpec argspec, Box* arg1, Box* arg2, Box* arg3,
Box** args, const std::vector<const std::string*>* keyword_names) {
/*
......@@ -1835,54 +1889,7 @@ Box* callFunc(BoxedFunction* func, CallRewriteArgs* rewrite_args, ArgPassSpec ar
// Pick a specific version to use:
CompiledFunction* chosen_cf = NULL;
for (CompiledFunction* cf : f->versions) {
assert(cf->spec->arg_types.size() == num_output_args);
if (cf->spec->rtn_type->llvmType() != UNKNOWN->llvmType())
continue;
bool works = true;
for (int i = 0; i < num_output_args; i++) {
Box* arg = getArg(i, oarg1, oarg2, oarg3, oargs);
ConcreteCompilerType* t = cf->spec->arg_types[i];
if ((arg && !t->isFitBy(arg->cls)) || (!arg && t != UNKNOWN)) {
works = false;
break;
}
}
if (!works)
continue;
chosen_cf = cf;
break;
}
if (chosen_cf == NULL) {
if (f->source == NULL) {
// TODO I don't think this should be happening any more?
printf("Error: couldn't find suitable function version and no source to recompile!\n");
abort();
}
std::vector<ConcreteCompilerType*> arg_types;
for (int i = 0; i < num_output_args; i++) {
Box* arg = getArg(i, oarg1, oarg2, oarg3, oargs);
assert(arg); // only builtin functions can pass NULL args
arg_types.push_back(typeFromClass(arg->cls));
}
FunctionSpecialization* spec = new FunctionSpecialization(UNKNOWN, arg_types);
EffortLevel::EffortLevel new_effort = initialEffort();
// this also pushes the new CompiledVersion to the back of the version list:
chosen_cf = compileFunction(f, spec, new_effort, NULL);
}
CompiledFunction* chosen_cf = pickVersion(f, num_output_args, oarg1, oarg2, oarg3, oargs);
assert(chosen_cf->is_interpreted == (chosen_cf->code == NULL));
if (chosen_cf->is_interpreted) {
......
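pickVersion() is a near-verbatim extraction of the ~50 lines deleted from callFunc() below it, with one addition: it takes the codegen write lock, making the scan of f->versions atomic with any concurrent compilation that appends to the same list. The selection predicate it loops over, restated on its own (an illustrative restructuring, not code from the commit):

```
// Can this compiled version accept the given arguments?
static bool versionFits(CompiledFunction* cf, int num_args, Box** args) {
    // Only versions with a boxed (UNKNOWN) return type are callable here:
    if (cf->spec->rtn_type->llvmType() != UNKNOWN->llvmType())
        return false;

    for (int i = 0; i < num_args; i++) {
        ConcreteCompilerType* t = cf->spec->arg_types[i];
        // A NULL arg only matches a fully-unknown slot; otherwise the
        // argument's class must fit the specialized type.
        if ((args[i] && !t->isFitBy(args[i]->cls)) || (!args[i] && t != UNKNOWN))
            return false;
    }
    return true;
}
```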
......@@ -15,6 +15,7 @@
#ifndef PYSTON_RUNTIME_TYPES_H
#define PYSTON_RUNTIME_TYPES_H
#include "core/threading.h"
#include "core/types.h"
namespace pyston {
......@@ -218,6 +219,8 @@ public:
int64_t size, capacity;
GCdArray* elts;
DS_DEFINE_MUTEX(lock);
BoxedList() __attribute__((visibility("default"))) : Box(&list_flavor, list_cls), size(0), capacity(0) {}
void ensure(int space);
......
from thread import start_new_thread
import time

work = []
done = []
......@@ -18,7 +19,7 @@ for i in xrange(nthreads):
    t = start_new_thread(run, (N,))

while len(done) < nthreads:
    pass
    time.sleep(0)
# print work
assert sum(work) == 0
from thread import start_new_thread
import time

a = 0
b = 0
done = []

def set_thread(num):
    global a, b
    print "starting set_thread", num
    for i in xrange(num):
        a += 1
        b += 1
        if i % 10000 == 0:
            print i
    done.append(None)

def check_thread(num):
    while b < num:
        _b = b
        _a = a
        assert _a >= _b, (_a, _b)
    done.append(None)

print "starting!"
N = 100000
start_new_thread(check_thread, (N,))
start_new_thread(set_thread, (N,))

while len(done) < 2:
    time.sleep(0)