Commit 57a0f307 authored by Kevin Modzelewski

Merge pull request #582 from kmod/optimize_statcounters

Optimize stattimers
parents 6a6d049c c2e037b5
def f():
    S = set("abc")
    c = "b"
    for i in xrange(10000000):
        c in S
f()

# This is the string tokenizer.PseudoToken:
pattern = '[ \\f\\t]*((\\\\\\r?\\n|\\Z|#[^\\r\\n]*|([uUbB]?[rR]?\'\'\'|[uUbB]?[rR]?"""))|((\\d+[jJ]|((\\d+\\.\\d*|\\.\\d+)([eE][-+]?\\d+)?|\\d+[eE][-+]?\\d+)[jJ])|((\\d+\\.\\d*|\\.\\d+)([eE][-+]?\\d+)?|\\d+[eE][-+]?\\d+)|(0[xX][\\da-fA-F]+[lL]?|0[bB][01]+[lL]?|(0[oO][0-7]+)|(0[0-7]*)[lL]?|[1-9]\\d*[lL]?))|((\\*\\*=?|>>=?|<<=?|<>|!=|//=?|[+\\-*/%&|^=<>]=?|~)|[][(){}]|(\\r?\\n|[:;.,`@]))|([uUbB]?[rR]?\'[^\\n\'\\\\]*(?:\\\\.[^\\n\'\\\\]*)*(\'|\\\\\\r?\\n)|[uUbB]?[rR]?"[^\\n"\\\\]*(?:\\\\.[^\\n"\\\\]*)*("|\\\\\\r?\\n))|[a-zA-Z_]\\w*)'

class Tokenizer:
    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()
    def __next(self):
        if self.index >= len(self.string):
            self.next = None
            return
        char = self.string[self.index]
        if char[0] == "\\":
            try:
                c = self.string[self.index + 1]
            except IndexError:
                raise error, "bogus escape (end of line)"
            char = char + c
        self.index = self.index + len(char)
        self.next = char
    def match(self, char, skip=1):
        if char == self.next:
            if skip:
                self.__next()
            return 1
        return 0
    def get(self):
        this = self.next
        self.__next()
        return this
    def tell(self):
        return self.index, self.next
    def seek(self, index):
        self.index, self.next = index

for i in xrange(1000):
    t = Tokenizer(pattern)
    for j in xrange(500):
        t._Tokenizer__next()
        # n = t.get()
        # if not n:
        #     break
    # sre_parse.parse(pattern, 0)
@@ -123,7 +123,7 @@ uint8_t* PystonMemoryManager::allocateSection(MemoryGroup& MemGroup, uintptr_t S
}
std::string stat_name = "mem_section_" + std::string(SectionName);
Stats::log(Stats::getStatId(stat_name), MB.size());
Stats::log(Stats::getStatCounter(stat_name), MB.size());
// Save this address as the basis for our next request
MemGroup.Near = MB;
@@ -23,133 +23,75 @@ namespace pyston {
#if !DISABLE_STATS
#if STAT_TIMERS
extern "C" const char* getStatTimerNameById(int id) {
return Stats::getStatName(id).c_str();
}
extern "C" int getStatTimerId() {
return StatTimer::getStack()->getId();
}
extern "C" const char* getStatTimerName() {
return getStatTimerNameById(getStatTimerId());
}
__thread StatTimer* StatTimer::stack;
StatTimer::StatTimer(int statid, bool push) {
uint64_t at_time = getCPUTicks();
_start_time = 0;
_statid = statid;
if (!push) {
_prev = NULL;
return;
}
_prev = stack;
stack = this;
if (_prev) {
_prev->pause(at_time);
}
resume(at_time);
}
StatTimer::StatTimer(int statid, uint64_t at_time) {
_start_time = 0;
_statid = statid;
_prev = stack;
stack = this;
if (_prev) {
_prev->pause(at_time);
}
resume(at_time);
}
StatTimer::~StatTimer() {
assert(stack == this);
uint64_t at_time;
if (!isPaused()) {
at_time = getCPUTicks();
pause(at_time);
} else {
// fprintf (stderr, "WARNING: timer was paused.\n");
at_time = _last_pause_time;
}
stack = _prev;
if (stack) {
stack->resume(at_time);
}
}
void StatTimer::pause(uint64_t at_time) {
assert(!isPaused());
assert(at_time > _start_time);
uint64_t _duration = at_time - _start_time;
Stats::log(_statid, _duration);
StatTimer* StatTimer::swapStack(StatTimer* s) {
uint64_t at_time = getCPUTicks();
_start_time = 0;
_last_pause_time = at_time;
// fprintf (stderr, "paused %d at %lu\n", _statid, at_time);
assert(stack);
assert(s);
StatTimer* prev_stack = stack;
stack->pause(at_time);
stack = s;
stack->resume(at_time);
assert(prev_stack);
return prev_stack;
}
void StatTimer::resume(uint64_t at_time) {
assert(isPaused());
_start_time = at_time;
// fprintf (stderr, "resumed %d at %lu\n", _statid, at_time);
}
StatTimer* StatTimer::createStack(StatTimer& timer) {
uint64_t at_time = getCPUTicks();
StatTimer* StatTimer::swapStack(StatTimer* s, uint64_t at_time) {
assert(stack);
StatTimer* prev_stack = stack;
if (stack) {
stack->pause(at_time);
}
stack = s;
if (stack) {
stack->resume(at_time);
}
stack->pause(at_time);
stack = NULL;
timer.pushTopLevel(at_time);
return prev_stack;
}
#endif
std::vector<uint64_t>* Stats::counts;
std::unordered_map<int, std::string>* Stats::names;
std::unordered_map<uint64_t*, std::string>* Stats::names;
bool Stats::enabled;
timespec Stats::start_ts;
uint64_t Stats::start_tick;
StatCounter::StatCounter(const std::string& name) : id(Stats::getStatId(name)) {
StatCounter::StatCounter(const std::string& name) : counter(Stats::getStatCounter(name)) {
}
StatPerThreadCounter::StatPerThreadCounter(const std::string& name) {
char buf[80];
snprintf(buf, 80, "%s_t%ld", name.c_str(), pthread_self());
id = Stats::getStatId(buf);
counter = Stats::getStatCounter(buf);
}
int Stats::getStatId(const std::string& name) {
static std::vector<uint64_t*>* counts;
uint64_t* Stats::getStatCounter(const std::string& name) {
// hacky but easy way of getting around static constructor ordering issues for now:
static std::unordered_map<int, std::string> names;
static std::unordered_map<uint64_t*, std::string> names;
Stats::names = &names;
static std::vector<uint64_t> counts;
Stats::counts = &counts;
static std::unordered_map<std::string, int> made;
static std::unordered_map<std::string, uint64_t*> made;
// TODO: can do better than doing a malloc per counter:
static std::vector<uint64_t*> counts;
pyston::counts = &counts;
if (made.count(name))
return made[name];
int rtn = names.size();
uint64_t* rtn = new uint64_t(0);
names[rtn] = name;
made[name] = rtn;
counts.push_back(0);
counts.push_back(rtn);
return rtn;
}
std::string Stats::getStatName(int id) {
return (*names)[id];
void Stats::clear() {
assert(counts);
for (auto p : *counts) {
*p = 0;
}
}
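As an aside, the core of this change is replacing integer stat ids (indexes into a shared counts vector) with stable uint64_t pointers, so that logging a hit is a plain increment with no lookup. The following is a minimal standalone sketch of that pattern; the names are illustrative and this is not Pyston's actual code:

#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Sketch of the counter-pointer pattern; illustrative only.
namespace sketch {
static std::unordered_map<std::string, uint64_t*> made;
static std::vector<uint64_t*> all_counters;

// Return a stable pointer for `name`, creating the counter on first use.
uint64_t* getCounter(const std::string& name) {
    auto it = made.find(name);
    if (it != made.end())
        return it->second;
    uint64_t* c = new uint64_t(0); // one heap cell per counter, never freed
    made[name] = c;
    all_counters.push_back(c);
    return c;
}
}

int main() {
    // The lookup happens once; the loop body is a single pointer increment.
    static uint64_t* hits = sketch::getCounter("example_hits");
    for (int i = 0; i < 1000; i++)
        ++*hits;
    std::printf("example_hits: %llu\n", (unsigned long long)*hits);
    return 0;
}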
void Stats::startEstimatingCPUFreq() {
@@ -183,7 +125,7 @@ void Stats::dump(bool includeZeros) {
fprintf(stderr, "Counters:\n");
std::vector<std::pair<std::string, int>> pairs;
std::vector<std::pair<std::string, uint64_t*>> pairs;
for (const auto& p : *names) {
pairs.push_back(make_pair(p.second, p.first));
}
@@ -193,19 +135,19 @@ void Stats::dump(bool includeZeros) {
uint64_t ticks_in_main = 0;
uint64_t accumulated_stat_timer_ticks = 0;
for (int i = 0; i < pairs.size(); i++) {
if (includeZeros || (*counts)[pairs[i].second] > 0) {
uint64_t count = *pairs[i].second;
if (includeZeros || count > 0) {
if (startswith(pairs[i].first, "us_") || startswith(pairs[i].first, "_init_us_")) {
fprintf(stderr, "%s: %lu\n", pairs[i].first.c_str(),
(uint64_t)((*counts)[pairs[i].second] / cycles_per_us));
fprintf(stderr, "%s: %lu\n", pairs[i].first.c_str(), (uint64_t)(count / cycles_per_us));
} else
fprintf(stderr, "%s: %lu\n", pairs[i].first.c_str(), (*counts)[pairs[i].second]);
fprintf(stderr, "%s: %lu\n", pairs[i].first.c_str(), count);
if (startswith(pairs[i].first, "us_timer_"))
accumulated_stat_timer_ticks += (*counts)[pairs[i].second];
accumulated_stat_timer_ticks += count;
if (pairs[i].first == "ticks_in_main")
ticks_in_main = (*counts)[pairs[i].second];
ticks_in_main = count;
}
}
@@ -229,10 +171,10 @@ void Stats::dump(bool includeZeros) {
}
void Stats::endOfInit() {
int orig_names = names->size();
for (int orig_id = 0; orig_id < orig_names; orig_id++) {
int init_id = getStatId("_init_" + (*names)[orig_id]);
log(init_id, (*counts)[orig_id]);
std::unordered_map<uint64_t*, std::string> names_copy(names->begin(), names->end());
for (const auto& p : names_copy) {
uint64_t* init_id = getStatCounter("_init_" + p.second);
log(init_id, *p.first);
}
};
@@ -30,17 +30,14 @@ namespace pyston {
#define STAT_ALLOCATIONS 0 && !DISABLE_STATS
#define STAT_TIMERS 0 && !DISABLE_STATS
#define EXPENSIVE_STAT_TIMERS 0 && STAT_TIMERS
#if STAT_TIMERS
#define STAT_TIMER(id, name) \
static int _stid##id = Stats::getStatId(name); \
StatTimer _st##id(_stid##id)
#define STAT_TIMER2(id, name, at_time) \
static int _stid##id = Stats::getStatId(name); \
StatTimer _st##id(_stid##id, at_time)
static uint64_t* _stcounter##id = Stats::getStatCounter(name); \
ScopedStatTimer _st##id(_stcounter##id)
#else
#define STAT_TIMER(id, name) StatTimer _st##id(0);
#define STAT_TIMER2(id, name, at_time) StatTimer _st##id(0);
#define STAT_TIMER(id, name)
#endif
#define STAT_TIMER_NAME(id) _st##id
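For illustration, with STAT_TIMERS enabled a call site such as STAT_TIMER(t0, "us_timer_example") (the timer name here is made up) expands roughly to:

// Approximate expansion of STAT_TIMER(t0, "us_timer_example"); the
// getStatCounter() lookup runs once per call site (function-local static),
// and the ScopedStatTimer pushes/pops the timer around the enclosing scope.
static uint64_t* _stcountert0 = Stats::getStatCounter("us_timer_example");
ScopedStatTimer _stt0(_stcountert0);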
@@ -48,8 +45,7 @@ namespace pyston {
#if !DISABLE_STATS
struct Stats {
private:
static std::vector<uint64_t>* counts;
static std::unordered_map<int, std::string>* names;
static std::unordered_map<uint64_t*, std::string>* names;
static bool enabled;
static timespec start_ts;
@@ -59,35 +55,34 @@ public:
static void startEstimatingCPUFreq();
static double estimateCPUFreq();
static int getStatId(const std::string& name);
static std::string getStatName(int id);
static uint64_t* getStatCounter(const std::string& name);
static void setEnabled(bool enabled) { Stats::enabled = enabled; }
static void log(int id, uint64_t count = 1) { (*counts)[id] += count; }
static void log(uint64_t* counter, uint64_t count = 1) { *counter += count; }
static void clear() { std::fill(counts->begin(), counts->end(), 0); }
static void clear();
static void dump(bool includeZeros = true);
static void endOfInit();
};
struct StatCounter {
private:
int id;
uint64_t* counter;
public:
StatCounter(const std::string& name);
void log(uint64_t count = 1) { Stats::log(id, count); }
void log(uint64_t count = 1) { *counter += count; }
};
struct StatPerThreadCounter {
private:
int id = 0;
uint64_t* counter = 0;
public:
StatPerThreadCounter(const std::string& name);
void log(uint64_t count = 1) { Stats::log(id, count); }
void log(uint64_t count = 1) { *counter += count; }
};
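As a usage note, the typical call-site pattern (the same one that appears later in this diff with the ticks_in_main counter) is a function-local or file-static StatCounter that is bumped directly:

// Illustrative StatCounter usage; the counter name is made up.
static StatCounter num_example_hits("num_example_hits");
num_example_hits.log();     // += 1
num_example_hits.log(4096); // += an arbitrary amount, e.g. a byte count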
#else
@@ -119,32 +114,93 @@ private:
// the start time of the current active segment (0 == paused)
uint64_t _start_time;
uint64_t _last_pause_time;
StatTimer* _prev;
int _statid;
uint64_t* _statcounter;
public:
StatTimer(int statid, bool push = true);
StatTimer(int statid, uint64_t at_time);
~StatTimer();
StatTimer(uint64_t* counter) : _statcounter(counter) {}
void pushNonTopLevel() {
uint64_t at_time = getCPUTicks();
#ifndef NDEBUG
_start_time = 0;
#endif
assert(stack);
_prev = stack;
stack = this;
_prev->pause(at_time);
resume(at_time);
}
void popNonTopLevel() {
assert(stack == this);
uint64_t at_time;
assert(!isPaused());
at_time = getCPUTicks();
pause(at_time);
assert(_prev);
stack = _prev;
stack->resume(at_time);
}
void pushTopLevel(uint64_t at_time) {
#ifndef NDEBUG
_start_time = 0;
#endif
assert(!stack);
_prev = stack;
stack = this;
resume(at_time);
}
void popTopLevel(uint64_t at_time) {
assert(!_prev);
stack = _prev;
pause(at_time);
}
private:
void pause(uint64_t at_time) {
assert(!isPaused());
assert(at_time > _start_time);
void pause(uint64_t at_time);
void resume(uint64_t at_time);
uint64_t _duration = at_time - _start_time;
Stats::log(_statcounter, _duration);
_start_time = 0;
}
void resume(uint64_t at_time) {
assert(isPaused());
_start_time = at_time;
}
public:
#ifndef NDEBUG
bool isPaused() const { return _start_time == 0; }
int getId() const { return _statid; }
#endif
static StatTimer* getStack() { return stack; }
// Creates a new stattimer stack from an unstarted top-level timer
static StatTimer* createStack(StatTimer& timer);
static StatTimer* swapStack(StatTimer* s);
static StatTimer* swapStack(StatTimer* s, uint64_t at_time);
static void assertActive() { ASSERT(stack && !stack->isPaused(), ""); }
};
class ScopedStatTimer {
private:
StatTimer timer;
static void assertActive() { RELEASE_ASSERT(stack && !stack->isPaused(), ""); }
public:
ScopedStatTimer(uint64_t* counter) : timer(counter) { timer.pushNonTopLevel(); }
~ScopedStatTimer() { timer.popNonTopLevel(); }
};
#else
struct StatTimer {
StatTimer(int statid, bool push = true) {}
StatTimer(int statid) {}
StatTimer(int statid, uint64_t at_time) {}
~StatTimer() {}
bool isPaused() const { return false; }
@@ -152,8 +208,8 @@ struct StatTimer {
void pause(uint64_t at_time) {}
void resume(uint64_t at_time) {}
static StatTimer* getStack() { return NULL; }
static StatTimer* swapStack(StatTimer* s, uint64_t at_time) { return NULL; }
static StatTimer* createStack(StatTimer& timer);
static StatTimer* swapStack(StatTimer* s);
static void assertActive() {}
};
#endif
@@ -29,18 +29,6 @@
namespace pyston {
static inline uint64_t rdtsc() {
unsigned long lo, hi;
asm("rdtsc" : "=a"(lo), "=d"(hi));
return (lo | (hi << 32));
}
uint64_t getCPUTicks() {
// unsigned int _unused;
// return __rdtscp(&_unused);
return rdtsc();
}
#if !DISABLE_TIMERS
int Timer::level = 0;
@@ -24,7 +24,11 @@
namespace pyston {
uint64_t getCPUTicks();
inline uint64_t getCPUTicks() {
unsigned long lo, hi;
asm("rdtsc" : "=a"(lo), "=d"(hi));
return (lo | (hi << 32));
}
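The removed .cpp version carried a commented-out __rdtscp call; as a hedged aside, a serializing variant (assuming x86-64 and a compiler that provides <x86intrin.h>) would look like this sketch, which is not what Pyston uses here:

#include <x86intrin.h>
#include <cstdint>

// Serializing alternative using RDTSCP; sketch only.
inline uint64_t getCPUTicksSerializing() {
    unsigned int aux; // receives IA32_TSC_AUX (e.g. the core id); unused here
    return __rdtscp(&aux);
}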
#define DISABLE_TIMERS 0
@@ -31,6 +31,10 @@
namespace pyston {
namespace gc {
#if STAT_TIMERS
uint64_t* gc_alloc_stattimer_counter = Stats::getStatCounter("us_timer_gc_alloc");
#endif
extern "C" void* gc_compat_malloc(size_t sz) noexcept {
return gc_alloc(sz, GCKind::CONSERVATIVE);
}
@@ -40,8 +40,16 @@ static StatCounter gc_alloc_bytes_typed[] = {
};
#endif
#if STAT_TIMERS
extern uint64_t* gc_alloc_stattimer_counter;
#endif
extern "C" inline void* gc_alloc(size_t bytes, GCKind kind_id) {
STAT_TIMER(t0, "us_timer_gc_alloc");
#if EXPENSIVE_STAT_TIMERS
// This stat timer is quite expensive: not only is this function extremely hot,
// but the timer also tends to grow the function enough that it can no longer be
// inlined, and inlining is especially valuable for this function.
ScopedStatTimer gc_alloc_stattimer(gc_alloc_stattimer_counter);
#endif
size_t alloc_bytes = bytes + sizeof(GCAllocation);
#ifndef NVALGRIND
@@ -268,7 +268,10 @@ static int main(int argc, char** argv) {
Timer main_time;
int rtncode = 0;
{
STAT_TIMER2(t0, "us_timer_main_toplevel", main_time.getStartTime());
#if STAT_TIMERS
StatTimer timer(Stats::getStatCounter("us_timer_main_toplevel"));
timer.pushTopLevel(main_time.getStartTime());
#endif
int code;
const char* command = NULL;
@@ -502,11 +505,14 @@ static int main(int argc, char** argv) {
joinRuntime();
_t.split("finishing up");
#if STAT_TIMERS
uint64_t main_time_ended_at;
uint64_t main_time_duration = main_time.end(&main_time_ended_at);
static StatCounter mt("ticks_in_main");
mt.log(main_time_duration);
STAT_TIMER_NAME(t0).pause(main_time_ended_at);
timer.popTopLevel(main_time_ended_at);
#endif
}
Stats::dump(true);
@@ -63,11 +63,23 @@ static void* thread_start(Box* target, Box* varargs, Box* kwargs) {
assert(target);
assert(varargs);
#if STAT_TIMERS
// TODO: maybe we should just not log anything for threads...
static uint64_t* timer_counter = Stats::getStatCounter("us_timer_thread_start");
StatTimer timer(timer_counter);
timer.pushTopLevel(getCPUTicks());
#endif
try {
runtimeCall(target, ArgPassSpec(0, 0, true, kwargs != NULL), varargs, kwargs, NULL, NULL, NULL);
} catch (ExcInfo e) {
e.printExcAndTraceback();
}
#if STAT_TIMERS
timer.popTopLevel(getCPUTicks());
#endif
return NULL;
}
@@ -87,8 +87,6 @@ Context* getReturnContextForGeneratorFrame(void* frame_addr) {
void generatorEntry(BoxedGenerator* g) {
{
STAT_TIMER2(t0, "us_timer_generator_toplevel", g->timer_time);
assert(g->cls == generator_cls);
assert(g->function->cls == function_cls);
@@ -110,11 +108,6 @@ void generatorEntry(BoxedGenerator* g) {
// we returned from the body of the generator. next/send/throw will notify the caller
g->entryExited = true;
threading::popGenerator();
#if STAT_TIMERS
g->timer_time = getCPUTicks(); // store off the timer that our caller (in g->returnContext) will resume at
STAT_TIMER_NAME(t0).pause(g->timer_time);
#endif
}
swapContext(&g->context, g->returnContext, 0);
}
@@ -138,17 +131,20 @@ static void generatorSendInternal(BoxedGenerator* self, Box* v) {
self->running = true;
#if STAT_TIMERS
// store off the time that the generator will use to initialize its toplevel timer
self->timer_time = getCPUTicks();
StatTimer* current_timers = StatTimer::swapStack(self->statTimers, self->timer_time);
if (!self->prev_stack)
self->prev_stack = StatTimer::createStack(self->my_timer);
else
self->prev_stack = StatTimer::swapStack(self->prev_stack);
#endif
swapContext(&self->returnContext, self->context, (intptr_t)self);
#if STAT_TIMERS
// if the generator exited we use the time that generatorEntry stored in self->timer_time (the same time it paused
// its timer at).
self->statTimers = StatTimer::swapStack(current_timers, self->entryExited ? self->timer_time : getCPUTicks());
self->prev_stack = StatTimer::swapStack(self->prev_stack);
if (self->entryExited) {
assert(self->prev_stack == &self->my_timer);
assert(self->my_timer.isPaused());
}
#endif
self->running = false;
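To make the swap above easier to follow: each generator owns its own timer stack (prev_stack / my_timer), and entering or leaving the generator pauses whichever timer is currently running and resumes the other side's. Below is a toy model of that hand-off, with illustrative names; it is not Pyston's actual API:

#include <cassert>
#include <cstdint>

// Toy model of switching the active timer when control moves between a
// caller and a generator; illustrative only.
struct ToyTimer {
    uint64_t* counter = nullptr;
    uint64_t start = 0; // 0 == paused
    void resume(uint64_t now) { assert(start == 0); start = now; }
    void pause(uint64_t now) { assert(start != 0); *counter += now - start; start = 0; }
};

static ToyTimer* current_top = nullptr; // top of the thread's timer stack

// Pause whoever is running, install `incoming`, and hand back the previous
// top so the caller can swap it back in later (roughly what swapStack does).
ToyTimer* swapTop(ToyTimer* incoming, uint64_t now) {
    ToyTimer* outgoing = current_top;
    if (outgoing)
        outgoing->pause(now);
    current_top = incoming;
    if (incoming)
        incoming->resume(now);
    return outgoing;
}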
@@ -282,7 +278,9 @@ extern "C" BoxedGenerator* createGenerator(BoxedFunctionBase* function, Box* arg
return new BoxedGenerator(function, arg1, arg2, arg3, args);
}
#if STAT_TIMERS
static uint64_t* generator_timer_counter = Stats::getStatCounter("us_timer_generator_toplevel");
#endif
extern "C" BoxedGenerator::BoxedGenerator(BoxedFunctionBase* function, Box* arg1, Box* arg2, Box* arg3, Box** args)
: function(function),
arg1(arg1),
@@ -294,7 +292,13 @@ extern "C" BoxedGenerator::BoxedGenerator(BoxedFunctionBase* function, Box* arg1
returnValue(nullptr),
exception(nullptr, nullptr, nullptr),
context(nullptr),
returnContext(nullptr) {
returnContext(nullptr)
#if STAT_TIMERS
,
prev_stack(NULL),
my_timer(generator_timer_counter)
#endif
{
int numArgs = function->f->num_args;
if (numArgs > 3) {
@@ -123,8 +123,15 @@ static Box* (*callattrInternal2)(Box*, llvm::StringRef, LookupScope, CallRewrite
static Box* (*callattrInternal3)(Box*, llvm::StringRef, LookupScope, CallRewriteArgs*, ArgPassSpec, Box*, Box*, Box*)
= (Box * (*)(Box*, llvm::StringRef, LookupScope, CallRewriteArgs*, ArgPassSpec, Box*, Box*, Box*))callattrInternal;
#if STAT_TIMERS
static uint64_t* pyhasher_timer_counter = Stats::getStatCounter("us_timer_PyHasher");
static uint64_t* pyeq_timer_counter = Stats::getStatCounter("us_timer_PyEq");
static uint64_t* pylt_timer_counter = Stats::getStatCounter("us_timer_PyLt");
#endif
size_t PyHasher::operator()(Box* b) const {
STAT_TIMER(t0, "us_timer_PyHasher");
#if EXPENSIVE_STAT_TIMERS
ScopedStatTimer _st(pyhasher_timer_counter);
#endif
if (b->cls == str_cls) {
StringHash<char> H;
auto s = static_cast<BoxedString*>(b);
@@ -135,7 +142,9 @@ size_t PyHasher::operator()(Box* b) const {
}
bool PyEq::operator()(Box* lhs, Box* rhs) const {
STAT_TIMER(t0, "us_timer_PyEq");
#if EXPENSIVE_STAT_TIMERS
ScopedStatTimer _st(pyeq_timer_counter);
#endif
int r = PyObject_RichCompareBool(lhs, rhs, Py_EQ);
if (r == -1)
@@ -144,7 +153,9 @@ bool PyEq::operator()(Box* lhs, Box* rhs) const {
}
bool PyLt::operator()(Box* lhs, Box* rhs) const {
STAT_TIMER(t0, "us_timer_PyLt");
#if EXPENSIVE_STAT_TIMERS
ScopedStatTimer _st(pylt_timer_counter);
#endif
int r = PyObject_RichCompareBool(lhs, rhs, Py_LT);
if (r == -1)
@@ -206,7 +217,9 @@ extern "C" void my_assert(bool b) {
}
extern "C" bool isSubclass(BoxedClass* child, BoxedClass* parent) {
#if EXPENSIVE_STAT_TIMERS
STAT_TIMER(t0, "us_timer_isSubclass");
#endif
return PyType_IsSubtype(child, parent);
}
@@ -1755,8 +1768,8 @@ extern "C" Box* getattr(Box* obj, const char* attr) {
if (VERBOSITY() >= 2) {
#if !DISABLE_STATS
std::string per_name_stat_name = "getattr__" + std::string(attr);
int id = Stats::getStatId(per_name_stat_name);
Stats::log(id);
uint64_t* counter = Stats::getStatCounter(per_name_stat_name);
Stats::log(counter);
#endif
}
@@ -4823,8 +4836,8 @@ extern "C" Box* getGlobal(Box* globals, const std::string* name) {
if (VERBOSITY() >= 2) {
#if !DISABLE_STATS
std::string per_name_stat_name = "getglobal__" + *name;
int id = Stats::getStatId(per_name_stat_name);
Stats::log(id);
uint64_t* counter = Stats::getStatCounter(per_name_stat_name);
Stats::log(counter);
#endif
}
@@ -836,8 +836,8 @@ public:
void* stack_begin;
#if STAT_TIMERS
StatTimer* statTimers;
uint64_t timer_time;
StatTimer* prev_stack;
StatTimer my_timer;
#endif
BoxedGenerator(BoxedFunctionBase* function, Box* arg1, Box* arg2, Box* arg3, Box** args);