Commit 0691f5bc authored by Kevin Modzelewski

Enable vtune support and try to improve cache locality

Enable the VTune JIT support in llvm, and add it as a jit
listener.

I think VTune is mostly confirming my suspicion that the slowdown is
cache-related... but it's not being very helpful with determining why
(the slowdown is in some function that it can't analyze).

I updated the memory allocator to have strong thread-affinity
(i.e., a thread now generally gets back memory that it had previously freed),
but that doesn't seem to have any effect.

Going to punt on further investigation for now; pretty happy, though, that there's
an overall speedup with the grwl, even if there are still issues.
parent 0b0b2ceb
From 4e7e44e970b439e977c20693325f46ea8fda6c3f Mon Sep 17 00:00:00 2001
From: Kevin Modzelewski <kmod@dropbox.com>
Date: Tue, 17 Jun 2014 19:18:34 -0700
Subject: [PATCH] Update IntelJITEvents with some recent API changes
---
.../IntelJITEvents/IntelJITEventListener.cpp | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 9a65fa0..4e22a8b 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -86,7 +86,7 @@ static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress,
LineNumberInfo Result;
Result.Offset = Address - StartAddress;
- Result.LineNumber = Line.getLine();
+ Result.LineNumber = Line.Line;
return Result;
}
@@ -233,7 +233,7 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
FunctionMessage.line_number_size = 0;
FunctionMessage.line_number_table = 0;
} else {
- SourceFileName = Lines.front().second.getFileName();
+ SourceFileName = Lines.front().second.FileName;
FunctionMessage.source_file_name = const_cast<char *>(SourceFileName.c_str());
FunctionMessage.line_number_size = LineInfo.size();
FunctionMessage.line_number_table = &*LineInfo.begin();
--
1.7.4.1
...@@ -33,6 +33,8 @@ COLOR := 1 ...@@ -33,6 +33,8 @@ COLOR := 1
VERBOSE := 0 VERBOSE := 0
ENABLE_INTEL_JIT_EVENTS := 0
# Setting this to 1 will set the Makefile to use binaries from the trunk # Setting this to 1 will set the Makefile to use binaries from the trunk
# directory, even if USE_TEST_LLVM is set to 1. # directory, even if USE_TEST_LLVM is set to 1.
# This is useful if clang isn't installed into the test directory, ex due # This is useful if clang isn't installed into the test directory, ex due
...@@ -78,6 +80,10 @@ else ...@@ -78,6 +80,10 @@ else
endif endif
LLVM_LINK_LIBS := core mcjit native bitreader ipo irreader jit debuginfo instrumentation LLVM_LINK_LIBS := core mcjit native bitreader ipo irreader jit debuginfo instrumentation
ifneq ($(ENABLE_INTEL_JIT_EVENTS),0)
LLVM_LINK_LIBS += inteljitevents
endif
LLVM_CXXFLAGS := $(shell $(LLVM_BUILD)/Release+Asserts/bin/llvm-config --cxxflags) LLVM_CXXFLAGS := $(shell $(LLVM_BUILD)/Release+Asserts/bin/llvm-config --cxxflags)
LLVM_LDFLAGS := $(shell $(LLVM_BUILD)/Release+Asserts/bin/llvm-config --ldflags --libs $(LLVM_LINK_LIBS)) LLVM_LDFLAGS := $(shell $(LLVM_BUILD)/Release+Asserts/bin/llvm-config --ldflags --libs $(LLVM_LINK_LIBS))
LLVM_LIB_DEPS := $(wildcard $(LLVM_BUILD)/Release+Asserts/lib/*) LLVM_LIB_DEPS := $(wildcard $(LLVM_BUILD)/Release+Asserts/lib/*)
...@@ -121,6 +127,7 @@ COMMON_CXXFLAGS += -Wextra -Wno-sign-compare ...@@ -121,6 +127,7 @@ COMMON_CXXFLAGS += -Wextra -Wno-sign-compare
COMMON_CXXFLAGS += -Wno-unused-parameter # should use the "unused" attribute COMMON_CXXFLAGS += -Wno-unused-parameter # should use the "unused" attribute
COMMON_CXXFLAGS += -fexceptions -fno-rtti COMMON_CXXFLAGS += -fexceptions -fno-rtti
COMMON_CXXFLAGS += -Wno-invalid-offsetof # allow the use of "offsetof", and we'll just have to make sure to only use it legally. COMMON_CXXFLAGS += -Wno-invalid-offsetof # allow the use of "offsetof", and we'll just have to make sure to only use it legally.
COMMON_CXXFLAGS += -DENABLE_INTEL_JIT_EVENTS=$(ENABLE_INTEL_JIT_EVENTS)
ifeq ($(ENABLE_VALGRIND),0) ifeq ($(ENABLE_VALGRIND),0)
COMMON_CXXFLAGS += -DNVALGRIND COMMON_CXXFLAGS += -DNVALGRIND
...@@ -424,10 +431,14 @@ llvm_configure: ...@@ -424,10 +431,14 @@ llvm_configure:
rm -f $(LLVM_BUILD)/Makefile.config rm -f $(LLVM_BUILD)/Makefile.config
$(MAKE) $(LLVM_CONFIGURATION) $(MAKE) $(LLVM_CONFIGURATION)
LLVM_CONFIGURE_LINE := CXX=$(GPP) $(LLVM_SRC)/configure --enable-targets=host --with-gcc-toolchain=$(GCC_DIR)
ifneq ($(ENABLE_INTEL_JIT_EVENTS),0)
LLVM_CONFIGURE_LINE += --with-intel-jitevents
endif
$(LLVM_CONFIGURATION): $(LLVM_SRC)/configure | $(LLVM_SRC)/_patched $(LLVM_CONFIGURATION): $(LLVM_SRC)/configure | $(LLVM_SRC)/_patched
mkdir -p $(LLVM_BUILD) mkdir -p $(LLVM_BUILD)
cd $(LLVM_BUILD) ; \ cd $(LLVM_BUILD) ; \
CXX=$(GPP) $(LLVM_SRC)/configure --enable-targets=host --with-gcc-toolchain=$(GCC_DIR) $(LLVM_CONFIGURE_LINE)
# CXX=ccache\ g++ ./configure --enable-targets=host # CXX=ccache\ g++ ./configure --enable-targets=host
# CXX='env CCACHE_PREFIX=distcc ccache g++' ./configure --enable-targets=host # CXX='env CCACHE_PREFIX=distcc ccache g++' ./configure --enable-targets=host
......
...@@ -202,6 +202,12 @@ void initCodegen() { ...@@ -202,6 +202,12 @@ void initCodegen() {
g.jit_listeners.push_back(stackmap_listener); g.jit_listeners.push_back(stackmap_listener);
g.engine->RegisterJITEventListener(stackmap_listener); g.engine->RegisterJITEventListener(stackmap_listener);
#if ENABLE_INTEL_JIT_EVENTS
llvm::JITEventListener* intel_listener = llvm::JITEventListener::createIntelJITEventListener();
g.jit_listeners.push_back(intel_listener);
g.engine->RegisterJITEventListener(intel_listener);
#endif
llvm::JITEventListener* registry_listener = makeRegistryListener(); llvm::JITEventListener* registry_listener = makeRegistryListener();
g.jit_listeners.push_back(registry_listener); g.jit_listeners.push_back(registry_listener);
g.engine->RegisterJITEventListener(registry_listener); g.engine->RegisterJITEventListener(registry_listener);
......
...@@ -137,6 +137,8 @@ static Block* alloc_block(uint64_t size, Block** prev) { ...@@ -137,6 +137,8 @@ static Block* alloc_block(uint64_t size, Block** prev) {
// VALGRIND_CREATE_MEMPOOL(rtn, 0, true); // VALGRIND_CREATE_MEMPOOL(rtn, 0, true);
#endif #endif
//printf("Allocated new block %p\n", rtn);
// Don't think I need to do this: // Don't think I need to do this:
memset(rtn->isfree, 0, sizeof(Block::isfree)); memset(rtn->isfree, 0, sizeof(Block::isfree));
...@@ -183,19 +185,16 @@ Heap::ThreadBlockCache::~ThreadBlockCache() { ...@@ -183,19 +185,16 @@ Heap::ThreadBlockCache::~ThreadBlockCache() {
LOCK_REGION(heap->lock); LOCK_REGION(heap->lock);
for (int i = 0; i < NUM_BUCKETS; i++) { for (int i = 0; i < NUM_BUCKETS; i++) {
Block* b = cache_heads[i]; while (Block* b = cache_free_heads[i]) {
if (b == NULL)
continue;
removeFromLL(b); removeFromLL(b);
// This should have been the only block in the list.
// Well, we could cache multiple blocks if we want, and maybe we should,
// but for now this routine only supports caching a single one, and would
// need to get updated:
assert(cache_heads[i] == NULL);
insertIntoLL(&heap->heads[i], b); insertIntoLL(&heap->heads[i], b);
} }
while (Block* b = cache_full_heads[i]) {
removeFromLL(b);
insertIntoLL(&heap->full_heads[i], b);
}
}
} }
static void* allocFromBlock(Block* b) { static void* allocFromBlock(Block* b) {
...@@ -242,7 +241,7 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) { ...@@ -242,7 +241,7 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
ThreadBlockCache* cache = thread_caches.get(); ThreadBlockCache* cache = thread_caches.get();
Block** cache_head = &cache->cache_heads[bucket_idx]; Block** cache_head = &cache->cache_free_heads[bucket_idx];
//static __thread int gc_allocs = 0; //static __thread int gc_allocs = 0;
//if (++gc_allocs == 128) { //if (++gc_allocs == 128) {
...@@ -252,13 +251,13 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) { ...@@ -252,13 +251,13 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
//} //}
while (true) { while (true) {
Block* cache_block = *cache_head; while (Block* cache_block = *cache_head) {
if (cache_block) {
void* rtn = allocFromBlock(cache_block); void* rtn = allocFromBlock(cache_block);
if (rtn) if (rtn)
return rtn; return rtn;
removeFromLL(cache_block); removeFromLL(cache_block);
insertIntoLL(&cache->cache_full_heads[bucket_idx], cache_block);
} }
// Not very useful to count the cache misses if we don't count the total attempts: // Not very useful to count the cache misses if we don't count the total attempts:
...@@ -267,10 +266,6 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) { ...@@ -267,10 +266,6 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
LOCK_REGION(lock); LOCK_REGION(lock);
if (cache_block) {
insertIntoLL(full_head, cache_block);
}
assert(*cache_head == NULL); assert(*cache_head == NULL);
// should probably be called allocBlock: // should probably be called allocBlock:
...@@ -416,8 +411,6 @@ static Block** freeChain(Block** head) { ...@@ -416,8 +411,6 @@ static Block** freeChain(Block** head) {
if (isMarked(header)) { if (isMarked(header)) {
clearMark(header); clearMark(header);
} else { } else {
if (VERBOSITY() >= 2)
printf("Freeing %p\n", p);
// assert(p != (void*)0x127000d960); // the main module // assert(p != (void*)0x127000d960); // the main module
b->isfree[bitmap_idx] |= mask; b->isfree[bitmap_idx] |= mask;
} }
...@@ -429,21 +422,48 @@ static Block** freeChain(Block** head) { ...@@ -429,21 +422,48 @@ static Block** freeChain(Block** head) {
} }
void Heap::freeUnmarked() { void Heap::freeUnmarked() {
Timer _t("looking at the thread caches");
thread_caches.forEachValue([this](ThreadBlockCache* cache) {
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) { for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block** chain_end = freeChain(&heads[bidx]); Block *h = cache->cache_free_heads[bidx];
freeChain(&full_heads[bidx]); // Try to limit the amount of unused memory a thread can hold onto;
// currently pretty dumb, just limit the number of blocks in the free-list
// to 50. (blocks in the full list don't need to be limited, since we're sure
// that the thread had just actively used those.)
// Eventually may want to come up with some scrounging system.
// TODO does this thread locality even help at all?
for (int i = 0; i < 50; i++) {
if (h)
h = h->next;
else
break;
}
if (h) {
removeFromLL(h);
insertIntoLL(&heads[bidx], h);
}
while (Block* b = full_heads[bidx]) { Block** chain_end = freeChain(&cache->cache_free_heads[bidx]);
freeChain(&cache->cache_full_heads[bidx]);
while (Block* b = cache->cache_full_heads[bidx]) {
removeFromLL(b); removeFromLL(b);
insertIntoLL(chain_end, b); insertIntoLL(chain_end, b);
} }
} }
});
_t.end();
thread_caches.forEachValue([](ThreadBlockCache* cache) {
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) { for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
freeChain(&cache->cache_heads[bidx]); Block** chain_end = freeChain(&heads[bidx]);
freeChain(&full_heads[bidx]);
while (Block* b = full_heads[bidx]) {
removeFromLL(b);
insertIntoLL(chain_end, b);
}
} }
});
LargeObj* cur = large_head; LargeObj* cur = large_head;
while (cur) { while (cur) {
......
...@@ -88,9 +88,13 @@ private: ...@@ -88,9 +88,13 @@ private:
struct ThreadBlockCache { struct ThreadBlockCache {
Heap* heap; Heap* heap;
Block* cache_heads[NUM_BUCKETS]; Block* cache_free_heads[NUM_BUCKETS];
Block* cache_full_heads[NUM_BUCKETS];
ThreadBlockCache(Heap* heap) : heap(heap) { memset(cache_heads, 0, sizeof(cache_heads)); } ThreadBlockCache(Heap* heap) : heap(heap) {
memset(cache_free_heads, 0, sizeof(cache_free_heads));
memset(cache_full_heads, 0, sizeof(cache_full_heads));
}
~ThreadBlockCache(); ~ThreadBlockCache();
}; };
friend class ThreadBlockCache; friend class ThreadBlockCache;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment