Commit de36c6be authored by Kevin Modzelewski

Merge pull request #676 from kmod/perf

An assortment of misc small perf changes
parents 2e9a9e35 ac3dedc2
@@ -108,11 +108,7 @@ else
 LLVM_BUILD := $(LLVM_TRUNK_BUILD)
 endif
 
-ifeq ($(FORCE_TRUNK_BINARIES),1)
-LLVM_BIN := $(LLVM_TRUNK_BUILD)/Release/bin
-else
-LLVM_BIN := $(LLVM_BUILD)/Release/bin
-endif
+LLVM_BIN := ./build/Release/llvm/bin
 
 LLVM_LINK_LIBS := core mcjit native bitreader bitwriter ipo irreader debuginfodwarf instrumentation
 ifneq ($(ENABLE_INTEL_JIT_EVENTS),0)
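(Note: `LLVM_BIN` is now hard-coded to the in-tree `./build/Release/llvm/bin`; the `FORCE_TRUNK_BINARIES` conditional that used to choose between the trunk and release LLVM builds is dropped.)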
......
class C(object):
    pass

def f():
    g = getattr
    c = C()
    c.o = 1
    for i in xrange(10000000):
        g(c, "o")
f()
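(New microbenchmark: it caches `getattr` in a local to keep global lookups out of the timed loop, then performs ten million attribute fetches on a plain instance; presumably this is the hot runtime path the GIL-preemption changes below are meant to cheapen.)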
@@ -11,6 +11,7 @@
 .text._ZN6pyston15objectNewNoArgsEPNS_10BoxedClassE
 .text._PyIndex_Check
 .text._ZN6pyston9threading21allowGLReadPreemptionEv
+.text._ZN6pyston9threading22_allowGLReadPreemptionEv
 .text._ZN6pyston9getOpNameEi
 .text._ZN6pyston8callFuncEPNS_17BoxedFunctionBaseEPNS_15CallRewriteArgsENS_11ArgPassSpecEPNS_3BoxES6_S6_PS6_PKSt6vectorIPKSsSaISA_EE
 .text._ZN6pyston2gc9GCVisitor5visitEPv
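(The added entry demangles to `pyston::threading::_allowGLReadPreemption()`, the out-of-line slow path introduced below; it sits right after the existing `pyston::threading::allowGLReadPreemption()` in what appears to be a section-ordering list for the linker.)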
......
@@ -15,6 +15,8 @@
 #ifndef PYSTON_ANALYSIS_SCOPINGANALYSIS_H
 #define PYSTON_ANALYSIS_SCOPINGANALYSIS_H
 
+#include "llvm/ADT/DenseMap.h"
+
 #include "core/common.h"
 #include "core/stringpool.h"
@@ -146,14 +148,14 @@ public:
 class ScopingAnalysis {
 public:
     struct ScopeNameUsage;
-    typedef std::unordered_map<AST*, ScopeNameUsage*> NameUsageMap;
+    typedef llvm::DenseMap<AST*, ScopeNameUsage*> NameUsageMap;
 
 private:
-    std::unordered_map<AST*, ScopeInfo*> scopes;
+    llvm::DenseMap<AST*, ScopeInfo*> scopes;
     AST_Module* parent_module;
     InternedStringPool* interned_strings;
 
-    std::unordered_map<AST*, AST*> scope_replacements;
+    llvm::DenseMap<AST*, AST*> scope_replacements;
 
     ScopeInfo* analyzeSubtree(AST* node);
     void processNameUsages(NameUsageMap* usages);
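Swapping `std::unordered_map` for `llvm::DenseMap` is a common LLVM-ecosystem micro-optimization: `DenseMap` keeps its buckets inline in a single flat allocation (open addressing), so lookups avoid the per-node heap allocation and pointer chasing of `unordered_map`, at the cost of reserving two sentinel key values. A minimal sketch of the swap, with toy names rather than this commit's types:

#include "llvm/ADT/DenseMap.h"
#include <unordered_map>

struct AST {};  // toy stand-in for the real AST node type

std::unordered_map<AST*, int> chained;  // heap node per entry, pointers chased on lookup
llvm::DenseMap<AST*, int> flat;         // open addressing, buckets inline in one array

void record(AST* node) {
    // Both containers expose the same operator[] shape, so the swap is
    // nearly drop-in; DenseMap skips the per-insert allocation.
    ++chained[node];
    ++flat[node];
}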
......
@@ -186,8 +186,9 @@ void Rewriter::ConstLoader::moveImmediate(uint64_t val, assembler::Register dst_
 assembler::Register Rewriter::ConstLoader::findConst(uint64_t val, bool& found_value) {
     assert(rewriter->phase_emitting);
 
-    if (constToVar.count(val) > 0) {
-        RewriterVar* var = constToVar[val];
+    auto it = constToVar.find(val);
+    if (it != constToVar.end()) {
+        RewriterVar* var = it->second;
         for (Location l : var->locations) {
             if (l.type == Location::Register) {
                 found_value = true;
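The old code hashed the key twice, once in `count()` and again in `operator[]`; a single `find()` probes once and hands the value back through the iterator. The same pattern in isolation (illustrative names, not the Rewriter's real types):

#include <cstdint>
#include <unordered_map>

std::unordered_map<uint64_t, int> const_slots;  // stand-in for constToVar

// One hash + probe instead of count() followed by operator[]:
int* findSlot(uint64_t val) {
    auto it = const_slots.find(val);
    return it != const_slots.end() ? &it->second : nullptr;
}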
......
@@ -14,7 +14,6 @@
#include "core/threading.h"
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <err.h>
@@ -481,7 +480,7 @@ extern "C" void endAllowThreads() noexcept {
 static pthread_mutex_t gil = PTHREAD_MUTEX_INITIALIZER;
 
-static std::atomic<int> threads_waiting_on_gil(0);
+std::atomic<int> threads_waiting_on_gil(0);
 static pthread_cond_t gil_acquired = PTHREAD_COND_INITIALIZER;
 
 extern "C" void PyEval_ReInitThreads() noexcept {
@@ -524,9 +523,6 @@ void releaseGLWrite() {
     pthread_mutex_unlock(&gil);
 }
 
-#define GIL_CHECK_INTERVAL 1000
-// Note: this doesn't need to be an atomic, since it should
-// only be accessed by the thread that holds the gil:
 int gil_check_count = 0;
 
 // TODO: this function is fair in that it forces a thread to give up the GIL
@@ -535,37 +531,19 @@ int gil_check_count = 0;
 // switching back and forth, and a third that never gets run.
 // We could enforce fairness by having a FIFO of events (implemented with mutexes?)
 // and make sure to always wake up the longest-waiting one.
-void allowGLReadPreemption() {
-#if ENABLE_SAMPLING_PROFILER
-    if (unlikely(sigprof_pending)) {
-        // Output multiple stacktraces if we received multiple signals
-        // between being able to handle it (such as being in LLVM or the GC),
-        // to try to fully account for that time.
-        while (sigprof_pending) {
-            _printStacktrace();
-            sigprof_pending--;
-        }
-    }
-#endif
+void _allowGLReadPreemption() {
+    assert(gil_check_count >= GIL_CHECK_INTERVAL);
+    gil_check_count = 0;
 
-    // Double-checked locking: first read with no ordering constraint:
-    if (!threads_waiting_on_gil.load(std::memory_order_relaxed))
+    // Double check this, since if we are wrong about there being a thread waiting on the gil,
+    // we're going to get stuck in the following pthread_cond_wait:
+    if (!threads_waiting_on_gil.load(std::memory_order_seq_cst))
         return;
 
-    gil_check_count++;
-    if (gil_check_count >= GIL_CHECK_INTERVAL) {
-        gil_check_count = 0;
-
-        // Double check this, since if we are wrong about there being a thread waiting on the gil,
-        // we're going to get stuck in the following pthread_cond_wait:
-        if (!threads_waiting_on_gil.load(std::memory_order_seq_cst))
-            return;
-
-        threads_waiting_on_gil++;
-        pthread_cond_wait(&gil_acquired, &gil);
-        threads_waiting_on_gil--;
-        pthread_cond_signal(&gil_acquired);
-    }
+    threads_waiting_on_gil++;
+    pthread_cond_wait(&gil_acquired, &gil);
+    threads_waiting_on_gil--;
+    pthread_cond_signal(&gil_acquired);
 }
 
 #elif THREADING_USE_GRWL
 static pthread_rwlock_t grwl = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;
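(The old `allowGLReadPreemption` is split in two: the blocking hand-off stays here as the out-of-line `_allowGLReadPreemption`, while the profiler check, the relaxed load, and the counter bump become an inline fast path in threading.h, in the next file. The slow path still re-reads `threads_waiting_on_gil` with `seq_cst` before waiting, because blocking on a stale reading would strand the thread in `pthread_cond_wait`.)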
......
@@ -15,6 +15,7 @@
#ifndef PYSTON_CORE_THREADING_H
#define PYSTON_CORE_THREADING_H
#include <atomic>
#include <cstdint>
#include <cstring>
#include <ucontext.h>
@@ -81,7 +82,36 @@ void acquireGLRead();
 void releaseGLRead();
 void acquireGLWrite();
 void releaseGLWrite();
-void allowGLReadPreemption();
+void _allowGLReadPreemption();
+
+#define GIL_CHECK_INTERVAL 1000
+// Note: this doesn't need to be an atomic, since it should
+// only be accessed by the thread that holds the gil:
+extern int gil_check_count;
+extern std::atomic<int> threads_waiting_on_gil;
+
+inline void allowGLReadPreemption() {
+#if ENABLE_SAMPLING_PROFILER
+    if (unlikely(sigprof_pending)) {
+        // Output multiple stacktraces if we received multiple signals
+        // between being able to handle it (such as being in LLVM or the GC),
+        // to try to fully account for that time.
+        while (sigprof_pending) {
+            _printStacktrace();
+            sigprof_pending--;
+        }
+    }
+#endif
+
+    // Double-checked locking: first read with no ordering constraint:
+    if (!threads_waiting_on_gil.load(std::memory_order_relaxed))
+        return;
+
+    gil_check_count++;
+    if (likely(gil_check_count < GIL_CHECK_INTERVAL))
+        return;
+
+    _allowGLReadPreemption();
+}
 
 // Note: promoteGL is free to drop the lock and then reacquire
 void promoteGL();
 void demoteGL();
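The inline function is the classic fast-path/slow-path split with double-checked locking: a relaxed atomic load filters out the common no-waiters case, a plain `int` counter is safe because only the GIL holder runs this, and only every `GIL_CHECK_INTERVAL`-th call with waiters present pays for the `seq_cst` re-check and the condvar hand-off. A self-contained sketch of the same shape, with simplified stand-ins rather than Pyston's API:

#include <atomic>
#include <cstdio>

std::atomic<int> waiters{0};    // threads that want the lock
int check_count = 0;            // only the lock holder touches this, so no atomic
constexpr int kInterval = 1000;

void slowYield() {              // out-of-line slow path
    check_count = 0;
    // Re-check with full ordering: parking on a stale "waiter present"
    // reading would block on a condvar that nobody will signal.
    if (!waiters.load(std::memory_order_seq_cst))
        return;
    std::puts("would hand the lock off here");
}

inline void maybeYield() {      // inline fast path
    if (!waiters.load(std::memory_order_relaxed))  // cheap, unordered read
        return;
    if (++check_count >= kInterval)
        slowYield();
}

int main() {
    maybeYield();  // no waiters: returns after a single relaxed load
}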
......
@@ -393,9 +393,11 @@ Box* BoxedWrapperDescriptor::descr_get(Box* _self, Box* inst, Box* owner) noexcept {
     if (inst == None)
         return self;
 
-    if (!isSubclass(inst->cls, self->type))
+    if (!isSubclass(inst->cls, self->type)) {
         PyErr_Format(TypeError, "Descriptor '' for '%s' objects doesn't apply to '%s' object",
                      getFullNameOfClass(self->type).c_str(), getFullTypeName(inst).c_str());
+        return NULL;
+    }
 
     return new BoxedWrapperObject(self, inst);
 }
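(This fixes a real bug, not just style: without the braces only the `PyErr_Format` call was guarded, so the function set a `TypeError` and then fell through to return a new `BoxedWrapperObject` anyway. The descriptor protocol expects `NULL` with the exception set, which the added `return NULL;` provides.)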
......
@@ -249,6 +249,8 @@ extern "C" PyObject* PyDict_GetItem(PyObject* dict, PyObject* key) noexcept {
         return d->getOrNull(key);
     }
 
+    // XXX this would be easy to make much faster.
+
     // This path doesn't exist in CPython; we have it to support extension modules that do
    // something along the lines of PyDict_GetItem(PyModule_GetDict()):
     try {
@@ -304,6 +306,9 @@ extern "C" int PyDict_Next(PyObject* op, Py_ssize_t* ppos, PyObject** pkey, PyObject** pvalue) noexcept {
 }
 
 extern "C" PyObject* PyDict_GetItemString(PyObject* dict, const char* key) noexcept {
+    if (dict->cls == attrwrapper_cls)
+        return unwrapAttrWrapper(dict)->getattr(key);
+
     Box* key_s;
     try {
         key_s = boxString(key);
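(Both dict changes serve extension modules that receive a Pyston attribute-wrapper where CPython would hand out a real dict: `PyDict_GetItemString` now short-circuits `attrwrapper_cls` objects straight to a `getattr` on the wrapped object, skipping the `boxString` allocation and the generic lookup, while the new `XXX` comment just flags the remaining slow fallback in `PyDict_GetItem`.)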
......
@@ -17,6 +17,7 @@
 #include "codegen/irgen/hooks.h"
 #include "core/ast.h"
+#include "core/threading.h"
 #include "core/types.h"
 #include "gc/heap.h"
 #include "runtime/complex.h"
@@ -138,6 +139,8 @@ void force() {
     FORCE(boxedLocalsGet);
     FORCE(boxedLocalsDel);
 
+    FORCE(threading::allowGLReadPreemption);
+
     // FORCE(listIter);
 }
 }
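(`FORCE` presumably plays the same role here as for the surrounding entries: it takes the function's address so the symbol is emitted and stays visible to JITed code. That matters now that `allowGLReadPreemption` is an inline header function, which would otherwise have no guaranteed out-of-line definition.)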
......
@@ -594,6 +594,10 @@ Box* listIAdd(BoxedList* self, Box* _rhs) {
     int s1 = self->size;
     int s2 = rhs->size;
 
+    if (s2 == 0)
+        return self;
+
     self->ensure(s1 + s2);
 
     memcpy(self->elts->elts + s1, rhs->elts->elts, sizeof(rhs->elts->elts[0]) * s2);
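The new guard makes the empty-RHS case of `l += rhs` free: it skips both the `ensure` capacity check and the zero-length `memcpy`. The same guard-clause shape on a toy vector type, purely illustrative (and assuming `self` and `rhs` are distinct):

#include <cstring>
#include <vector>

void inplaceExtend(std::vector<int>& self, const std::vector<int>& rhs) {
    if (rhs.empty())  // early out: nothing to grow, nothing to copy
        return;
    auto s1 = self.size();
    self.resize(s1 + rhs.size());
    std::memcpy(self.data() + s1, rhs.data(), rhs.size() * sizeof(int));
}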
......