Enable vtune support and try to improve cache locality

Enable the VTune JIT support in LLVM, and add it as a JIT listener. I think it's mostly confirming my suspicion that the slowdown is cache-related, but it's not being very helpful in determining why (the time is in some function that it can't analyze). I updated the memory allocator to have strong thread affinity (i.e., a thread now generally gets back memory that it had previously freed), but that doesn't seem to have any effect. Going to punt on further investigation for now; pretty happy, though, that there's an overall speedup with the GRWL (global read-write lock), even if there are still issues.

Enable vtune support and try to improve cache locality
Enable the VTune JIT support in LLVM, and add it as a JIT listener. I think it's mostly confirming my suspicion that the slowdown is cache-related, but it's not being very helpful in determining why (the time is in some function that it can't analyze). I updated the memory allocator to have strong thread affinity (i.e., a thread now generally gets back memory that it had previously freed), but that doesn't seem to have any effect. Going to punt on further investigation for now; pretty happy, though, that there's an overall speedup with the GRWL (global read-write lock), even if there are still issues.
0691f5bc · Kevin Modzelewski · 0b0b2ceb · 0691f5bc · 0691f5bc · 0691f5bc
Commit 0691f5bc authored Jun 17, 2014 by Kevin Modzelewski
5 changed files
--- a/llvm_patches/0004-Update-IntelJITEvents-with-some-recent-API-changes.patch
+++ b/llvm_patches/0004-Update-IntelJITEvents-with-some-recent-API-changes.patch
+From 4e7e44e970b439e977c20693325f46ea8fda6c3f Mon Sep 17 00:00:00 2001
+From: Kevin Modzelewski <kmod@dropbox.com>
+Date: Tue, 17 Jun 2014 19:18:34 -0700
+Subject: [PATCH] Update IntelJITEvents with some recent API changes
+---
+ .../IntelJITEvents/IntelJITEventListener.cpp       |    4 ++--
+ 1 files changed, 2 insertions(+), 2 deletions(-)
+diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+index 9a65fa0..4e22a8b 100644
+--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+@@ -86,7 +86,7 @@ static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress,
+   LineNumberInfo Result;
+   Result.Offset = Address - StartAddress;
+-  Result.LineNumber = Line.getLine();
+  Result.LineNumber = Line.Line;
+   return Result;
+ }
+@@ -233,7 +233,7 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
+           FunctionMessage.line_number_size = 0;
+           FunctionMessage.line_number_table = 0;
+         } else {
+-          SourceFileName = Lines.front().second.getFileName();
+          SourceFileName = Lines.front().second.FileName;
+           FunctionMessage.source_file_name = const_cast<char *>(SourceFileName.c_str());
+           FunctionMessage.line_number_size = LineInfo.size();
+           FunctionMessage.line_number_table = &*LineInfo.begin();
+-- 
+1.7.4.1
--- a/src/Makefile
+++ b/src/Makefile
@@ -33,6 +33,8 @@ COLOR := 1
 VERBOSE := 0
+ENABLE_INTEL_JIT_EVENTS := 0
 # Setting this to 1 will set the Makefile to use binaries from the trunk
 # directory, even if USE_TEST_LLVM is set to 1.
 # This is useful if clang isn't installed into the test directory, ex due
@@ -78,6 +80,10 @@ else
 endif
 LLVM_LINK_LIBS := core mcjit native bitreader ipo irreader jit debuginfo instrumentation
+ifneq ($(ENABLE_INTEL_JIT_EVENTS),0)
+LLVM_LINK_LIBS += inteljitevents
+endif
 LLVM_CXXFLAGS := $(shell $(LLVM_BUILD)/Release+Asserts/bin/llvm-config --cxxflags)
 LLVM_LDFLAGS := $(shell $(LLVM_BUILD)/Release+Asserts/bin/llvm-config --ldflags --libs $(LLVM_LINK_LIBS))
 LLVM_LIB_DEPS := $(wildcard $(LLVM_BUILD)/Release+Asserts/lib/*)
@@ -121,6 +127,7 @@ COMMON_CXXFLAGS += -Wextra -Wno-sign-compare
 COMMON_CXXFLAGS += -Wno-unused-parameter # should use the "unused" attribute
 COMMON_CXXFLAGS += -fexceptions -fno-rtti
 COMMON_CXXFLAGS += -Wno-invalid-offsetof # allow the use of "offsetof", and we'll just have to make sure to only use it legally.
+COMMON_CXXFLAGS += -DENABLE_INTEL_JIT_EVENTS=$(ENABLE_INTEL_JIT_EVENTS)
 ifeq ($(ENABLE_VALGRIND),0)
 	COMMON_CXXFLAGS += -DNVALGRIND
@@ -424,10 +431,14 @@ llvm_configure:
 	rm -f $(LLVM_BUILD)/Makefile.config
 	$(MAKE) $(LLVM_CONFIGURATION)
+LLVM_CONFIGURE_LINE := CXX=$(GPP) $(LLVM_SRC)/configure --enable-targets=host --with-gcc-toolchain=$(GCC_DIR)
+ifneq ($(ENABLE_INTEL_JIT_EVENTS),0)
+LLVM_CONFIGURE_LINE += --with-intel-jitevents
+endif
 $(LLVM_CONFIGURATION): $(LLVM_SRC)/configure | $(LLVM_SRC)/_patched
 	mkdir -p $(LLVM_BUILD)
 	cd $(LLVM_BUILD) ; \
-	CXX=$(GPP) $(LLVM_SRC)/configure --enable-targets=host --with-gcc-toolchain=$(GCC_DIR)
+	$(LLVM_CONFIGURE_LINE)
 	# CXX=ccache\ g++ ./configure --enable-targets=host
 	# CXX='env CCACHE_PREFIX=distcc ccache g++' ./configure --enable-targets=host

--- a/src/codegen/entry.cpp
+++ b/src/codegen/entry.cpp
@@ -202,6 +202,12 @@ void initCodegen() {
    g.jit_listeners.push_back(stackmap_listener);
    g.engine->RegisterJITEventListener(stackmap_listener);
+#if ENABLE_INTEL_JIT_EVENTS
+    llvm::JITEventListener* intel_listener = llvm::JITEventListener::createIntelJITEventListener();
+    g.jit_listeners.push_back(intel_listener);
+    g.engine->RegisterJITEventListener(intel_listener);
+#endif
    llvm::JITEventListener* registry_listener = makeRegistryListener();
    g.jit_listeners.push_back(registry_listener);
    g.engine->RegisterJITEventListener(registry_listener);

--- a/src/gc/heap.cpp
+++ b/src/gc/heap.cpp
@@ -137,6 +137,8 @@ static Block* alloc_block(uint64_t size, Block** prev) {
 // VALGRIND_CREATE_MEMPOOL(rtn, 0, true);
 #endif
+    //printf("Allocated new block %p\n", rtn);
    // Don't think I need to do this:
    memset(rtn->isfree, 0, sizeof(Block::isfree));
@@ -183,19 +185,16 @@ Heap::ThreadBlockCache::~ThreadBlockCache() {
    LOCK_REGION(heap->lock);
    for (int i = 0; i < NUM_BUCKETS; i++) {
-        Block* b = cache_heads[i];
+        while (Block* b = cache_free_heads[i]) {
-        if (b == NULL)
-            continue;
            removeFromLL(b);
-        // This should have been the only block in the list.
-        // Well, we could cache multiple blocks if we want, and maybe we should,
-        // but for now this routine only supports caching a single one, and would
-        // need to get updated:
-        assert(cache_heads[i] == NULL);
            insertIntoLL(&heap->heads[i], b);
        }
+        while (Block* b = cache_full_heads[i]) {
+            removeFromLL(b);
+            insertIntoLL(&heap->full_heads[i], b);
+        }
+    }
 }
 static void* allocFromBlock(Block* b) {
@@ -242,7 +241,7 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
    ThreadBlockCache* cache = thread_caches.get();
-    Block** cache_head = &cache->cache_heads[bucket_idx];
+    Block** cache_head = &cache->cache_free_heads[bucket_idx];
    //static __thread int gc_allocs = 0;
    //if (++gc_allocs == 128) {
@@ -252,13 +251,13 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
    //}
    while (true) {
-        Block* cache_block = *cache_head;
+        while (Block* cache_block = *cache_head) {
-        if (cache_block) {
            void* rtn = allocFromBlock(cache_block);
            if (rtn)
                return rtn;
            removeFromLL(cache_block);
+            insertIntoLL(&cache->cache_full_heads[bucket_idx], cache_block);
        }
        // Not very useful to count the cache misses if we don't count the total attempts:
@@ -267,10 +266,6 @@ void* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
        LOCK_REGION(lock);
-        if (cache_block) {
-            insertIntoLL(full_head, cache_block);
-        }
        assert(*cache_head == NULL);
        // should probably be called allocBlock:
@@ -416,8 +411,6 @@ static Block** freeChain(Block** head) {
            if (isMarked(header)) {
                clearMark(header);
            } else {
-                if (VERBOSITY() >= 2)
-                    printf("Freeing %p\n", p);
                // assert(p != (void*)0x127000d960); // the main module
                b->isfree[bitmap_idx] |= mask;
            }
@@ -429,21 +422,48 @@ static Block** freeChain(Block** head) {
 }
 void Heap::freeUnmarked() {
+    Timer _t("looking at the thread caches");
+    thread_caches.forEachValue([this](ThreadBlockCache* cache) {
        for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
-        Block** chain_end = freeChain(&heads[bidx]);
+            Block *h = cache->cache_free_heads[bidx];
-        freeChain(&full_heads[bidx]);
+            // Try to limit the amount of unused memory a thread can hold onto;
+            // currently pretty dumb, just limit the number of blocks in the free-list
+            // to 50.  (blocks in the full list don't need to be limited, since we're sure
+            // that the thread had just actively used those.)
+            // Eventually may want to come up with some scrounging system.
+            // TODO does this thread locality even help at all?
+            for (int i = 0; i < 50; i++) {
+                if (h)
+                    h = h->next;
+                else
+                    break;
+            }
+            if (h) {
+                removeFromLL(h);
+                insertIntoLL(&heads[bidx], h);
+            }
-        while (Block* b = full_heads[bidx]) {
+            Block** chain_end = freeChain(&cache->cache_free_heads[bidx]);
+            freeChain(&cache->cache_full_heads[bidx]);
+            while (Block* b = cache->cache_full_heads[bidx]) {
                removeFromLL(b);
                insertIntoLL(chain_end, b);
            }
        }
+    });
+    _t.end();
-    thread_caches.forEachValue([](ThreadBlockCache* cache) {
    for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
-            freeChain(&cache->cache_heads[bidx]);
+        Block** chain_end = freeChain(&heads[bidx]);
+        freeChain(&full_heads[bidx]);
+        while (Block* b = full_heads[bidx]) {
+            removeFromLL(b);
+            insertIntoLL(chain_end, b);
+        }
    }
-    });
    LargeObj* cur = large_head;
    while (cur) {

--- a/src/gc/heap.h
+++ b/src/gc/heap.h
@@ -88,9 +88,13 @@ private:
    struct ThreadBlockCache {
        Heap* heap;
-        Block* cache_heads[NUM_BUCKETS];
+        Block* cache_free_heads[NUM_BUCKETS];
+        Block* cache_full_heads[NUM_BUCKETS];
-        ThreadBlockCache(Heap* heap) : heap(heap) { memset(cache_heads, 0, sizeof(cache_heads)); }
+        ThreadBlockCache(Heap* heap) : heap(heap) {
+            memset(cache_free_heads, 0, sizeof(cache_free_heads));
+            memset(cache_full_heads, 0, sizeof(cache_full_heads));
+        }
        ~ThreadBlockCache();
    };
    friend class ThreadBlockCache;