Pyston · Commits · Commit 012d6d50

Merge pull request #298 from toshok/three-arenas

    Add third GC arena

Authored Feb 13, 2015 by Kevin Modzelewski
Parents: 7cf92757, d85c9893

Showing 5 changed files with 901 additions and 427 deletions.
    minibenchmarks/pidigits.py   +39   -0
    src/gc/gc_alloc.h            +8    -0
    src/gc/heap.cpp              +521  -322
    src/gc/heap.h                +330  -105
    test/unittests/gc.cpp        +3    -0
minibenchmarks/pidigits.py (new file, mode 100644)
import time

PIDIGITS_LEN = 1500

def pidigits(length):
    i = k = ns = 0
    k1 = 1
    n, a, d, t, u = 1, 0, 1, 0, 0
    while (True):
        k += 1
        t = n << 1
        n *= k
        a += t
        k1 += 2
        a *= k1
        d *= k1
        if a >= n:
            t, u = divmod(n * 3 + a, d)
            u += n
            if d > u:
                ns = ns * 10 + t
                i += 1
                if i % 10 == 0:
                    ns = 0
                if i >= length:
                    break
                a -= d * t
                a *= 10
                n *= 10

def main(n):
    l = []
    for i in range(n):
        t0 = time.time()
        pidigits(PIDIGITS_LEN)
        l.append(time.time() - t0)
    return l

main(100)
src/gc/gc_alloc.h
@@ -51,6 +51,11 @@ extern "C" inline void* gc_alloc(size_t bytes, GCKind kind_id) {
#endif
    GCAllocation* alloc = global_heap.alloc(alloc_bytes);
#ifndef NVALGRIND
    VALGRIND_DISABLE_ERROR_REPORTING;
#endif
    alloc->kind_id = kind_id;
    alloc->gc_flags = 0;

@@ -67,7 +72,10 @@ extern "C" inline void* gc_alloc(size_t bytes, GCKind kind_id) {
    }
    void* r = alloc->user_data;
#ifndef NVALGRIND
    VALGRIND_ENABLE_ERROR_REPORTING;
    if (ENABLE_REDZONES) {
        r = ((char*)r) + REDZONE_SIZE;
    }
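For readers unfamiliar with the redzone scheme referenced here: when redzones are enabled, the allocator pads each allocation and hands the caller a pointer past the leading pad, so out-of-bounds accesses land in memory the tooling has been told is off-limits. A minimal layout sketch under assumed names (REDZONE_SIZE's value and the allocWithRedzones helper are illustrative, not the diff's API):

    #include <cstdio>
    #include <cstdlib>

    static constexpr size_t REDZONE_SIZE = 16; // illustrative value

    // Allocates [redzone][user bytes][redzone] and returns a pointer to the user region.
    void* allocWithRedzones(size_t user_bytes) {
        char* base = (char*)malloc(user_bytes + 2 * REDZONE_SIZE);
        return base + REDZONE_SIZE; // the caller never sees the leading pad
    }

    int main() {
        void* p = allocWithRedzones(64);
        printf("user pointer: %p\n", p);
        free((char*)p - REDZONE_SIZE); // undo the offset before freeing
    }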
src/gc/heap.cpp
@@ -17,7 +17,6 @@
#include <cstdlib>
#include <cstring>
#include <stdint.h>
#include <sys/mman.h>

#include "core/common.h"
#include "core/util.h"

[The removed line in this hunk is the <sys/mman.h> include, which moves to heap.h along with doMmap().]
@@ -34,6 +33,67 @@
namespace pyston {
namespace gc {

void _doFree(GCAllocation* al);

// lots of linked lists around here, so let's just use template functions for operations on them.
template <class ListT> inline void nullNextPrev(ListT* node) {
    node->next = NULL;
    node->prev = NULL;
}

template <class ListT> inline void removeFromLL(ListT* node) {
    *node->prev = node->next;
    if (node->next)
        node->next->prev = node->prev;
}

template <class ListT> inline void removeFromLLAndNull(ListT* node) {
    *node->prev = node->next;
    if (node->next)
        node->next->prev = node->prev;
    nullNextPrev(node);
}

template <class ListT> inline void insertIntoLL(ListT** next_pointer, ListT* next) {
    assert(next_pointer);
    assert(next);
    assert(!next->next);
    assert(!next->prev);

    next->next = *next_pointer;
    if (next->next)
        next->next->prev = &next->next;
    *next_pointer = next;
    next->prev = next_pointer;
}

template <class ListT, typename Func> inline void forEach(ListT* list, Func func) {
    auto cur = list;
    while (cur) {
        func(cur);
        cur = cur->next;
    }
}

template <class ListT, typename Free> inline void sweepList(ListT* head, Free free_func) {
    auto cur = head;
    while (cur) {
        GCAllocation* al = cur->data;
        if (isMarked(al)) {
            clearMark(al);
            cur = cur->next;
        } else {
            _doFree(al);

            removeFromLL(cur);

            auto to_free = cur;
            cur = cur->next;
            free_func(to_free);
        }
    }
}

static unsigned bytesAllocatedSinceCollection;
static __thread unsigned thread_bytesAllocatedSinceCollection;
#define ALLOCBYTES_PER_COLLECTION 10000000
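A reading aid for the prev-as-pointer-to-pointer idiom these helpers rely on: each node stores not a pointer to the previous node but a pointer to whichever pointer currently points at it (the list head or the previous node's next field), so unlinking needs no head special case. A minimal standalone sketch under assumed types (the Node struct and main() are illustrative; only insertIntoLL/removeFromLL mirror the templates above):

    #include <cassert>
    #include <cstdio>

    // Hypothetical node type; any struct with next/prev members of this shape works.
    struct Node {
        Node* next;
        Node** prev; // points at whichever pointer currently points at us
        int value;
    };

    template <class ListT> inline void insertIntoLL(ListT** next_pointer, ListT* next) {
        assert(next_pointer && next && !next->next && !next->prev);
        next->next = *next_pointer;
        if (next->next)
            next->next->prev = &next->next;
        *next_pointer = next;
        next->prev = next_pointer;
    }

    template <class ListT> inline void removeFromLL(ListT* node) {
        *node->prev = node->next; // works whether node is the head or an interior node
        if (node->next)
            node->next->prev = node->prev;
    }

    int main() {
        Node* head = nullptr;
        Node a{nullptr, nullptr, 1}, b{nullptr, nullptr, 2};
        insertIntoLL(&head, &a);
        insertIntoLL(&head, &b); // list is now b -> a
        removeFromLL(&b);        // unlinking the current head needs no special case
        printf("head->value = %d\n", head->value); // prints 1
    }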
@@ -64,73 +124,268 @@ void registerGCManagedBytes(size_t bytes) {
[Side-by-side hunk.
Removed: the file-local "#define PAGE_SIZE 4096"; the old class Arena (constructor taking a start address, doMmap(), contains()); the static Arena small_arena((void*)0x1270000000L) and large_arena((void*)0x2270000000L) instances; the old LargeObj struct (next/prev, obj_size, mmap_size(), capacity(), fromAllocation()); and the old GCAllocation* Heap::allocLarge(size_t size).
Added: void _doFree(GCAllocation* al), which logs when VERBOSITY() >= 2, wraps header access in Valgrind disable/enable-error-reporting guards, and for GCKind::PYTHON objects asserts b->cls->tp_dealloc == NULL and runs b->cls->simple_destructor(b); void Heap::destructContents(GCAllocation* al), which calls _doFree(al); the HeapStatistics/TypeStats struct and the addStatistic() helper, moved up from later in the file; the new Heap::dumpHeapStatistics(), which takes a threading::GLPromoteRegion lock and gathers statistics from small_arena, large_arena and huge_arena before printing them; the dumpHeapStatistics() free function; and the "Small Arena" section: SmallArena::alloc() (registers the managed bytes, then dispatches to _alloc() with the matching size bucket), SmallArena::realloc(), SmallArena::free(), SmallArena::allocationFrom(), SmallArena::freeUnmarked(), SmallArena::getStatistics(), SmallArena::_freeChain(), and the start of SmallArena::_allocBlock().]
@@ -165,45 +420,23 @@ static Block* alloc_block(uint64_t size, Block** prev) {
[Side-by-side hunk. Removed: the file-scope static void insertIntoLL(Block** next_pointer, Block* next) and static void removeFromLL(Block* b) helpers, which are superseded by the template versions above. Changed: Heap::ThreadBlockCache::~ThreadBlockCache() becomes SmallArena::ThreadBlockCache::~ThreadBlockCache(); it still takes LOCK_REGION(heap->lock), but now calls removeFromLLAndNull(b) instead of removeFromLL(b) and returns cached blocks to small->heads[i] and small->full_heads[i] rather than heap->heads[i] and heap->full_heads[i]. static GCAllocation* allocFromBlock(Block* b) becomes GCAllocation* SmallArena::_allocFromBlock(Block* b).]
@@ -212,19 +445,17 @@ static GCAllocation* allocFromBlock(Block* b) {
[Side-by-side hunk. static Block* claimBlock(size_t rounded_size, Block** free_head) becomes SmallArena::Block* SmallArena::_claimBlock(size_t rounded_size, Block** free_head), calling removeFromLLAndNull(free_block) instead of removeFromLL(free_block) and _allocBlock(rounded_size, NULL) instead of alloc_block(rounded_size, NULL). GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) becomes GCAllocation* SmallArena::_alloc(size_t rounded_size, int bucket_idx) and no longer calls registerGCManagedBytes() itself; the public SmallArena::alloc() does that now.]
@@ -241,11 +472,11 @@ GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
@@ -253,12 +484,12 @@ GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
[Side-by-side hunks inside the renamed _alloc(): allocFromBlock(cache_block) becomes _allocFromBlock(cache_block), removeFromLL(cache_block) becomes removeFromLLAndNull(cache_block), LOCK_REGION(lock) becomes LOCK_REGION(heap->lock), and claimBlock(rounded_size, &heads[bucket_idx]) becomes _claimBlock(rounded_size, &heads[bucket_idx]). The "// should probably be called allocBlock:" comment and the surrounding thread-cache logic are unchanged.]
@@ -269,322 +500,290 @@ GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
[Side-by-side hunk covering the rest of the file.
Removed: _freeFrom(GCAllocation* alloc, Block* b); the old static _freeLargeObj(LargeObj* lobj) (unlink plus munmap); the old static _doFree(GCAllocation* al); Heap::free(); Heap::realloc(); Heap::getAllocationFromInteriorPointer(); the old static Block** freeChain(Block** head); Heap::freeUnmarked(); getChainStatistics(); the old HeapStatistics/addStatistic definitions; the old Heap::dumpHeapStatistics(); and the old dumpHeapStatistics() free function.
Added: SmallArena::_getChainStatistics() (marked "// TODO: copy-pasted from _freeChain"); the "Large Arena" section: the LARGE_BLOCK_NUM_CHUNKS, LARGE_BLOCK_FOR_OBJ(obj) and LARGE_CHUNK_INDEX(obj, section) macros, LargeArena::alloc(), LargeArena::realloc(), LargeArena::free(), LargeArena::allocationFrom(), LargeArena::freeUnmarked() (sweepList over the object list), LargeArena::getStatistics(), LargeArena::add_free_chunk(), LargeArena::get_from_size_list(), LargeArena::_alloc() (rounds the request up to a multiple of CHUNK_SIZE, searches the chunk-count-indexed free lists with list 0 as the overflow list, and on a retry: path mmaps a fresh 1 MB LargeBlock and carves its chunks into the free lists when nothing fits), and LargeArena::_freeLargeObj() (marks the object's chunks free in the block's free_chunk_map and returns them with add_free_chunk(); a comment notes that freeing an empty section is deferred to the sweep);
and the "Huge Arena" section: HugeArena::alloc() (page-rounds size + sizeof(HugeObj), mmaps it directly, and links the object into a list), HugeArena::realloc(), HugeArena::free(), HugeArena::allocationFrom(), HugeArena::freeUnmarked(), HugeArena::getStatistics(), and HugeArena::_freeHugeObj() (removeFromLL plus munmap). The hunk ends by closing the gc and pyston namespaces.]
src/gc/heap.h
@@ -17,6 +17,7 @@
#include <cstddef>
#include <cstdint>
#include <sys/mman.h>

#include "core/common.h"
#include "core/threading.h"

[The added line in this hunk is the <sys/mman.h> include, needed now that doMmap() lives in this header.]
@@ -24,6 +25,9 @@
namespace pyston {
namespace gc {

class Heap;
struct HeapStatistics;

typedef uint8_t kindid_t;
struct GCAllocation {
    unsigned int gc_flags : 8;
@@ -59,173 +63,394 @@ inline void clearMark(GCAllocation* header) {
#undef MARK_BIT

#define PAGE_SIZE 4096

template <uintptr_t arena_start, uintptr_t arena_size> class Arena {
private:
    void* cur;
    void* end;

protected:
    Arena() : cur((void*)arena_start), end((void*)(arena_start + arena_size)) {}

public:
    void* doMmap(size_t size) {
        assert(size % PAGE_SIZE == 0);
        assert(((uint8_t*)cur + size) < end && "arena full");

        void* mrtn = mmap(cur, size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        assert((uintptr_t)mrtn != -1 && "failed to allocate memory from OS");
        ASSERT(mrtn == cur, "%p %p\n", mrtn, cur);
        cur = (uint8_t*)cur + size;
        return mrtn;
    }

    bool contains(void* addr) { return (void*)arena_start <= addr && addr < cur; }
};

constexpr uintptr_t ARENA_SIZE = 0x1000000000L;
constexpr uintptr_t SMALL_ARENA_START = 0x1270000000L;
constexpr uintptr_t LARGE_ARENA_START = 0x2270000000L;
constexpr uintptr_t HUGE_ARENA_START = 0x3270000000L;

[Also in this part of the hunk: the old file-scope template <int N> class Bitmap begins on the removed side; it reappears as a private nested class of SmallArena below.]
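Not from the diff, but the intent behind these constants is worth spelling out: each arena reserves a fixed, disjoint 0x1000000000-byte stretch of virtual address space and only ever maps pages inside it with MAP_FIXED, so deciding which arena (if any) owns a pointer is a couple of integer comparisons with no metadata lookup. A toy sketch of that classification (simplified: the real contains() compares against the arena's current high-water mark, not the full reserved range):

    #include <cstdint>
    #include <cstdio>

    // Same constants as the diff; the classification below is the whole trick.
    constexpr uintptr_t ARENA_SIZE = 0x1000000000L;
    constexpr uintptr_t SMALL_ARENA_START = 0x1270000000L;
    constexpr uintptr_t LARGE_ARENA_START = 0x2270000000L;
    constexpr uintptr_t HUGE_ARENA_START = 0x3270000000L;

    enum class Which { Small, Large, Huge, NotOurs };

    Which classify(void* p) {
        uintptr_t a = (uintptr_t)p;
        if (a >= SMALL_ARENA_START && a < SMALL_ARENA_START + ARENA_SIZE) return Which::Small;
        if (a >= LARGE_ARENA_START && a < LARGE_ARENA_START + ARENA_SIZE) return Which::Large;
        if (a >= HUGE_ARENA_START  && a < HUGE_ARENA_START + ARENA_SIZE)  return Which::Huge;
        return Which::NotOurs;
    }

    int main() {
        printf("%d\n", (int)classify((void*)0x1270001000L)); // 0: small arena range
        printf("%d\n", (int)classify((void*)0x4000000000L)); // 3: not a GC pointer
    }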
//
// The SmallArena allocates objects <= 3584 bytes.
//
// it uses segregated-fit allocation, and each block contains a free
// bitmap for objects of a given size (constant for the block)
//
static const size_t sizes[] = {
    16,  32,  48,  64,  80,  96,  112,  128,  160,  192,  224,  256,  320, 384,
    448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 2560, 3072, 3584, // 4096,
};
static constexpr size_t NUM_BUCKETS = sizeof(sizes) / sizeof(sizes[0]);
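As a reading aid (not part of the diff): segregated fit here means every request is rounded up to one of the sizes[] buckets and served from blocks dedicated to that bucket. A minimal sketch of the bucket lookup, mirroring the loop in SmallArena::alloc() in heap.cpp; bucketFor() is an illustrative name:

    #include <cstddef>
    #include <cstdio>

    static const size_t sizes[] = { 16,  32,  48,  64,  80,  96,  112,  128,  160,  192,  224,  256,  320, 384,
                                    448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 2560, 3072, 3584 };
    static constexpr size_t NUM_BUCKETS = sizeof(sizes) / sizeof(sizes[0]);

    // Returns the bucket index whose size is the smallest one >= bytes, or -1 if
    // the request is too large for the small arena.
    int bucketFor(size_t bytes) {
        for (size_t i = 0; i < NUM_BUCKETS; i++) {
            if (sizes[i] >= bytes)
                return (int)i;
        }
        return -1;
    }

    int main() {
        printf("%d\n", bucketFor(24));   // 1  -> the 32-byte bucket
        printf("%d\n", bucketFor(3585)); // -1 -> goes to the LargeArena instead
    }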
class SmallArena : public Arena<SMALL_ARENA_START, ARENA_SIZE> {
public:
    SmallArena(Heap* heap) : Arena(), heap(heap), thread_caches(heap, this) {}

    GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes);
    GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
    void free(GCAllocation* al);

    GCAllocation* allocationFrom(void* ptr);
    void freeUnmarked();

    void getStatistics(HeapStatistics* stats);

private:
    template <int N> class Bitmap {
        static_assert(N % 64 == 0, "");

    private:
        uint64_t data[N / 64];

    public:
        struct Scanner {
        private:
            int next_to_check;
            friend class Bitmap<N>;

        public:
            void reset() { next_to_check = 0; }
        };

        void setAllZero() { memset(data, 0, sizeof(data)); }
        bool isSet(int idx) { return (data[idx / 64] >> (idx % 64)) & 1; }
        void set(int idx) { data[idx / 64] |= 1UL << (idx % 64); }
        void toggle(int idx) { data[idx / 64] ^= 1UL << (idx % 64); }
        void clear(int idx) { data[idx / 64] &= ~(1UL << (idx % 64)); }

        int scanForNext(Scanner& sc) {
            uint64_t mask = data[sc.next_to_check];

            if (unlikely(mask == 0L)) {
                while (true) {
                    sc.next_to_check++;
                    if (sc.next_to_check == N / 64) {
                        sc.next_to_check = 0;
                        return -1;
                    }
                    mask = data[sc.next_to_check];
                    if (likely(mask != 0L)) {
                        break;
                    }
                }
            }

            int i = sc.next_to_check;
            int first = __builtin_ctzll(mask);
            assert(first < 64);
            assert(data[i] & (1L << first));
            data[i] ^= (1L << first);
            int idx = first + i * 64;
            return idx;
        }
    };

    static constexpr size_t BLOCK_SIZE = 4 * 4096;
#define ATOM_SIZE 16
    static_assert(BLOCK_SIZE % ATOM_SIZE == 0, "");
#define ATOMS_PER_BLOCK (BLOCK_SIZE / ATOM_SIZE)
    static_assert(ATOMS_PER_BLOCK % 64 == 0, "");
#define BITFIELD_SIZE (ATOMS_PER_BLOCK / 8)
#define BITFIELD_ELTS (BITFIELD_SIZE / 8)
#define BLOCK_HEADER_SIZE (BITFIELD_SIZE + 4 * sizeof(void*))
#define BLOCK_HEADER_ATOMS ((BLOCK_HEADER_SIZE + ATOM_SIZE - 1) / ATOM_SIZE)

    struct Atoms {
        char _data[ATOM_SIZE];
    };

    struct Block {
        union {
            struct {
                Block* next, **prev;
                uint32_t size;
                uint16_t num_obj;
                uint8_t min_obj_index;
                uint8_t atoms_per_obj;
                Bitmap<ATOMS_PER_BLOCK> isfree;
                Bitmap<ATOMS_PER_BLOCK>::Scanner next_to_check;
                void* _header_end[0];
            };
            Atoms atoms[ATOMS_PER_BLOCK];
        };

        inline int minObjIndex() const { return min_obj_index; }
        inline int numObjects() const { return num_obj; }
        inline int atomsPerObj() const { return atoms_per_obj; }

        static Block* forPointer(void* ptr) { return (Block*)((uintptr_t)ptr & ~(BLOCK_SIZE - 1)); }
    };
    static_assert(sizeof(Block) == BLOCK_SIZE, "bad size");
    static_assert(offsetof(Block, _header_end) >= BLOCK_HEADER_SIZE, "bad header size");
    static_assert(offsetof(Block, _header_end) <= BLOCK_HEADER_SIZE, "bad header size");

    struct ThreadBlockCache {
        Heap* heap;
        SmallArena* small;
        Block* cache_free_heads[NUM_BUCKETS];
        Block* cache_full_heads[NUM_BUCKETS];

        ThreadBlockCache(Heap* heap, SmallArena* small) : heap(heap), small(small) {
            memset(cache_free_heads, 0, sizeof(cache_free_heads));
            memset(cache_full_heads, 0, sizeof(cache_full_heads));
        }
        ~ThreadBlockCache();
    };

    Block* heads[NUM_BUCKETS];
    Block* full_heads[NUM_BUCKETS];

    friend struct ThreadBlockCache;

    Heap* heap;
    // TODO only use thread caches if we're in GRWL mode?
    threading::PerThreadSet<ThreadBlockCache, Heap*, SmallArena*> thread_caches;

    Block* _allocBlock(uint64_t size, Block** prev);
    GCAllocation* _allocFromBlock(Block* b);
    Block* _claimBlock(size_t rounded_size, Block** free_head);
    Block** _freeChain(Block** head);
    void _getChainStatistics(HeapStatistics* stats, Block** head);

    GCAllocation* __attribute__((__malloc__)) _alloc(size_t bytes, int bucket_idx);
};

[Removed across this region on the old side: the file-scope Bitmap, Atoms and Block definitions, the "#define BLOCK_SIZE (4 * 4096)" macro, the old sizes[] table (which stopped at 2048, with 2560/3072/3584 commented out) and its "#define NUM_BUCKETS" macro, the "struct LargeObj;" forward declaration, and the old class Heap with its heads[]/full_heads[]/large_head members, allocSmall()/allocLarge() declarations, spinlock, and ThreadBlockCache(Heap*).]
//
// The LargeArena allocates objects where 3584 < size <1024*1024-CHUNK_SIZE-sizeof(LargeObject) bytes.
//
// it maintains a set of size-segregated free lists, and a special
// free list for larger objects.  If the free list specific to a given
// size has no entries, we search the large free list.
//
// Blocks of 1meg are mmap'ed individually, and carved up as needed.
//
class LargeArena : public Arena<LARGE_ARENA_START, ARENA_SIZE> {
private:
    struct LargeBlock {
        LargeBlock* next;
        size_t num_free_chunks;
        unsigned char* free_chunk_map;
    };

    struct LargeFreeChunk {
        LargeFreeChunk* next_size;
        size_t size;
    };

    struct LargeObj {
        LargeObj* next, **prev;
        size_t size;
        GCAllocation data[0];

        static LargeObj* fromAllocation(GCAllocation* alloc) {
            char* rtn = (char*)alloc - offsetof(LargeObj, data);
            return reinterpret_cast<LargeObj*>(rtn);
        }
    };

    /*
     * This shouldn't be much smaller or larger than the largest small size bucket.
     * Must be at least sizeof (LargeBlock).
     */
    static constexpr size_t CHUNK_SIZE = 4096;
    static constexpr int CHUNK_BITS = 12;
    static_assert(CHUNK_SIZE > sizeof(LargeBlock), "bad large block size");

    static constexpr int BLOCK_SIZE = 1024 * 1024;
    static constexpr int NUM_FREE_LISTS = 32;

    Heap* heap;
    LargeObj* head;
    LargeBlock* blocks;
    LargeFreeChunk* free_lists[NUM_FREE_LISTS]; /* 0 is for larger sizes */

    void add_free_chunk(LargeFreeChunk* free_chunks, size_t size);
    LargeFreeChunk* get_from_size_list(LargeFreeChunk** list, size_t size);
    LargeObj* _alloc(size_t size);
    void _freeLargeObj(LargeObj* obj);

public:
    LargeArena(Heap* heap) : heap(heap), head(NULL), blocks(NULL) {}

    /* Largest object that can be allocated in a large block. */
    static constexpr size_t ALLOC_SIZE_LIMIT = BLOCK_SIZE - CHUNK_SIZE - sizeof(LargeObj);

    GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes);
    GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
    void free(GCAllocation* alloc);

    GCAllocation* allocationFrom(void* ptr);
    void freeUnmarked();

    void getStatistics(HeapStatistics* stats);
};
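A reading aid for the free-list scheme described in the comment above (standalone sketch, not the diff's API): chunk-aligned sizes are bucketed by their chunk count, and anything with NUM_FREE_LISTS or more chunks shares list 0, which also serves as the fallback searched when the exact-size lists are empty:

    #include <cstddef>
    #include <cstdio>

    static constexpr int CHUNK_BITS = 12;                 // 4 KB chunks
    static constexpr size_t CHUNK_SIZE = 1 << CHUNK_BITS;
    static constexpr int NUM_FREE_LISTS = 32;

    // Mirrors the indexing in add_free_chunk()/_alloc(): sizes are rounded up to a
    // whole number of chunks, and chunk counts >= NUM_FREE_LISTS share list 0.
    size_t freeListIndex(size_t size) {
        size = (size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
        size_t num_chunks = size >> CHUNK_BITS;
        return num_chunks >= NUM_FREE_LISTS ? 0 : num_chunks;
    }

    int main() {
        printf("%zu\n", freeListIndex(5000));    // 2 chunks  -> list 2
        printf("%zu\n", freeListIndex(200000));  // 49 chunks -> list 0 (overflow list)
    }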
// The HugeArena allocates objects where size > 1024*1024 bytes.
//
// Objects are allocated with individual mmap() calls, and kept in a
// linked list.  They are not reused.
class HugeArena : public Arena<HUGE_ARENA_START, ARENA_SIZE> {
public:
    HugeArena(Heap* heap) : heap(heap) {}

    GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes);
    GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
    void free(GCAllocation* alloc);

    GCAllocation* allocationFrom(void* ptr);
    void freeUnmarked();

    void getStatistics(HeapStatistics* stats);

private:
    struct HugeObj {
        HugeObj* next, **prev;
        size_t obj_size;
        GCAllocation data[0];

        int mmap_size() {
            size_t total_size = obj_size + sizeof(HugeObj);
            total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
            return total_size;
        }

        int capacity() { return mmap_size() - sizeof(HugeObj); }

        static HugeObj* fromAllocation(GCAllocation* alloc) {
            char* rtn = (char*)alloc - offsetof(HugeObj, data);
            assert((uintptr_t)rtn % PAGE_SIZE == 0);
            return reinterpret_cast<HugeObj*>(rtn);
        }
    };

    void _freeHugeObj(HugeObj* lobj);

    HugeObj* head;

    Heap* heap;
};
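A side note on the rounding idiom used by mmap_size() above and by the allocation paths in heap.cpp: adding PAGE_SIZE - 1 and then masking with ~(PAGE_SIZE - 1) rounds a size up to the next multiple of the page size. A tiny self-contained check (roundUpToPage is an illustrative helper name):

    #include <cstddef>
    #include <cstdio>

    static constexpr size_t PAGE_SIZE = 4096;

    // (n + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1) clears the low bits after bumping
    // past the previous page boundary, i.e. it rounds n up to a page multiple.
    size_t roundUpToPage(size_t n) { return (n + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); }

    int main() {
        printf("%zu\n", roundUpToPage(1));     // 4096
        printf("%zu\n", roundUpToPage(4096));  // 4096 (already aligned)
        printf("%zu\n", roundUpToPage(4097));  // 8192
    }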
class Heap {
private:
    SmallArena small_arena;
    LargeArena large_arena;
    HugeArena huge_arena;

    friend class SmallArena;
    friend class LargeArena;
    friend class HugeArena;

    // DS_DEFINE_MUTEX(lock);
    DS_DEFINE_SPINLOCK(lock);

public:
    Heap() : small_arena(this), large_arena(this), huge_arena(this) {}

    GCAllocation* realloc(GCAllocation* alloc, size_t bytes) {
        // TODO(toshok): there is duplicate code in each of the
        // ::realloc methods to test whether the allocation can be
        // reused.  Would be nice to factor it all out here into this
        // method.
        if (large_arena.contains(alloc)) {
            return large_arena.realloc(alloc, bytes);
        } else if (huge_arena.contains(alloc)) {
            return huge_arena.realloc(alloc, bytes);
        }

        assert(small_arena.contains(alloc));
        return small_arena.realloc(alloc, bytes);
    }

    GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes) {
        if (bytes > LargeArena::ALLOC_SIZE_LIMIT)
            return huge_arena.alloc(bytes);
        else if (bytes > sizes[NUM_BUCKETS - 1])
            return large_arena.alloc(bytes);
        else
            return small_arena.alloc(bytes);
    }

    void destructContents(GCAllocation* alloc);

    void free(GCAllocation* alloc) {
        destructContents(alloc);

        if (large_arena.contains(alloc)) {
            large_arena.free(alloc);
            return;
        }

        if (huge_arena.contains(alloc)) {
            huge_arena.free(alloc);
            return;
        }

        assert(small_arena.contains(alloc));
        small_arena.free(alloc);
    }

    // not thread safe:
    GCAllocation* getAllocationFromInteriorPointer(void* ptr) {
        if (large_arena.contains(ptr)) {
            return large_arena.allocationFrom(ptr);
        } else if (huge_arena.contains(ptr)) {
            return huge_arena.allocationFrom(ptr);
        } else if (small_arena.contains(ptr)) {
            return small_arena.allocationFrom(ptr);
        }
        return NULL;
    }

    // not thread safe:
    void freeUnmarked() {
        small_arena.freeUnmarked();
        large_arena.freeUnmarked();
        huge_arena.freeUnmarked();
    }

    void dumpHeapStatistics();
};

[Removed on the old side of this region: the remaining declarations of the old Heap class, including Heap() : thread_caches(this) {}, the bucket-loop alloc() built on allocSmall()/allocLarge(), and the out-of-line free(), getAllocationFromInteriorPointer() and freeUnmarked() declarations.]
test/unittests/gc.cpp
@@ -68,6 +68,9 @@ TEST(alloc, alloc64) { testAlloc(64); }
TEST(alloc, alloc128) { testAlloc(128); }
TEST(alloc, alloc258) { testAlloc(258); }
TEST(alloc, alloc3584) { testAlloc(3584); }
TEST(alloc, alloc4096) { testAlloc(4096); }
TEST(alloc, alloc8192) { testAlloc(8192); }
TEST(alloc, alloc16384) { testAlloc(16384); }

TEST(alloc, largeallocs) {
    int s1 = 1 << 20;