Commit 919dd3df authored by Chris Toshok

Add a third size class (between small/large) to the gc.

Port over sgen's idea of LOSSections as a mid-sized arena, so that we now have:

SmallArena (original non-large allocator; free bitmaps, segregated-fit allocation)
  handles objects where size <= 3584 bytes
LargeArena (new code; size-specific free lists)
  handles objects where 3584 < size <= ~1 meg
HugeArena (original large allocator; 1 mmap per object)
  handles objects where size > ~1 meg
parent 4afb0656
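To summarize the new layout before the diff, here is a minimal, hedged sketch of the three-way size dispatch described above. It is not the committed code: the constant names SMALL_LIMIT and LARGE_LIMIT and the pickArena helper are illustrative stand-ins; the real dispatch (Heap::alloc, further down in the diff) compares against sizes[NUM_BUCKETS - 1] and LargeArena::ALLOC_SIZE_LIMIT.

#include <cstddef>
#include <cstdio>

// Illustrative thresholds: 3584 is the largest small-object bucket; the
// large/huge boundary is roughly 1 MB (the commit's actual limit is
// LargeArena::ALLOC_SIZE_LIMIT, slightly under one 1 MB block).
static constexpr size_t SMALL_LIMIT = 3584;
static constexpr size_t LARGE_LIMIT = 1024 * 1024;

enum class ArenaKind { Small, Large, Huge };

// Route an allocation request to the arena that would service it.
static ArenaKind pickArena(size_t bytes) {
    if (bytes > LARGE_LIMIT)
        return ArenaKind::Huge;   // one mmap per object
    if (bytes > SMALL_LIMIT)
        return ArenaKind::Large;  // size-specific free lists inside 1 MB blocks
    return ArenaKind::Small;      // segregated-fit blocks with per-block free bitmaps
}

int main() {
    // 64 bytes -> Small, 64 KB -> Large, 4 MB -> Huge
    printf("%d %d %d\n", (int)pickArena(64), (int)pickArena(64 * 1024), (int)pickArena(4u << 20));
    return 0;
}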
import time

PIDIGITS_LEN = 1500


def pidigits(length):
    i = k = ns = 0
    k1 = 1
    n,a,d,t,u = 1,0,1,0,0
    while(True):
        k += 1
        t = n<<1
        n *= k
        a += t
        k1 += 2
        a *= k1
        d *= k1
        if a >= n:
            t,u = divmod(n*3 + a,d)
            u += n
            if d > u:
                ns = ns*10 + t
                i += 1
                if i % 10 == 0:
                    ns = 0
                if i >= length:
                    break
                a -= d*t
                a *= 10
                n *= 10


def main(n):
    l = []
    for i in range(n):
        t0 = time.time()
        pidigits(PIDIGITS_LEN)
        l.append(time.time() - t0)
    return l

main(100)
@@ -17,7 +17,6 @@
#include <cstdlib>
#include <cstring>
#include <stdint.h>
#include <sys/mman.h>
#include "core/common.h"
#include "core/util.h"
@@ -34,6 +33,35 @@
namespace pyston {
namespace gc {
void _doFree(GCAllocation* al);
// These template functions are shared by the large and huge arenas
template <class ListT> inline void unlinkNode(ListT* node) {
*node->prev = node->next;
if (node->next)
node->next->prev = node->prev;
}
template <class ListT, typename Free>
inline void sweepHeap(ListT* head, std::function<void(GCAllocation*)> __free, Free free_func) {
auto cur = head;
while (cur) {
GCAllocation* al = cur->data;
if (isMarked(al)) {
clearMark(al);
cur = cur->next;
} else {
__free(al);
unlinkNode(cur);
auto to_free = cur;
cur = cur->next;
free_func(to_free);
}
}
}
static unsigned bytesAllocatedSinceCollection;
static __thread unsigned thread_bytesAllocatedSinceCollection;
#define ALLOCBYTES_PER_COLLECTION 10000000
@@ -64,73 +92,384 @@ void registerGCManagedBytes(size_t bytes) {
Heap global_heap;
#define PAGE_SIZE 4096
class Arena {
private:
void* start;
void* cur;
GCAllocation* SmallArena::realloc(GCAllocation* al, size_t bytes) {
Block* b = Block::forPointer(al);
size_t size = b->size;
if (size >= bytes && size < bytes * 2)
return al;
GCAllocation* rtn = heap->alloc(bytes);
#ifndef NVALGRIND
VALGRIND_DISABLE_ERROR_REPORTING;
memcpy(rtn, al, std::min(bytes, size));
VALGRIND_ENABLE_ERROR_REPORTING;
#else
memcpy(rtn, al, std::min(bytes, size));
#endif
_free(al, b);
return rtn;
}
GCAllocation* SmallArena::allocationFrom(void* ptr) {
Block* b = Block::forPointer(ptr);
size_t size = b->size;
int offset = (char*)ptr - (char*)b;
int obj_idx = offset / size;
if (obj_idx < b->minObjIndex() || obj_idx >= b->numObjects())
return NULL;
int atom_idx = obj_idx * b->atomsPerObj();
if (b->isfree.isSet(atom_idx))
return NULL;
return reinterpret_cast<GCAllocation*>(&b->atoms[atom_idx]);
}
SmallArena::Block** SmallArena::freeChain(Block** head) {
while (Block* b = *head) {
int num_objects = b->numObjects();
int first_obj = b->minObjIndex();
int atoms_per_obj = b->atomsPerObj();
public:
constexpr Arena(void* start) : start(start), cur(start) {}
for (int obj_idx = first_obj; obj_idx < num_objects; obj_idx++) {
int atom_idx = obj_idx * atoms_per_obj;
void* doMmap(size_t size) {
assert(size % PAGE_SIZE == 0);
// printf("mmap %ld\n", size);
if (b->isfree.isSet(atom_idx))
continue;
void* p = &b->atoms[atom_idx];
GCAllocation* al = reinterpret_cast<GCAllocation*>(p);
void* mrtn = mmap(cur, size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
assert((uintptr_t)mrtn != -1 && "failed to allocate memory from OS");
ASSERT(mrtn == cur, "%p %p\n", mrtn, cur);
cur = (uint8_t*)cur + size;
return mrtn;
if (isMarked(al)) {
clearMark(al);
} else {
_doFree(al);
// assert(p != (void*)0x127000d960); // the main module
b->isfree.set(atom_idx);
}
}
bool contains(void* addr) { return start <= addr && addr < cur; }
};
head = &b->next;
}
return head;
}
static Arena small_arena((void*)0x1270000000L);
static Arena large_arena((void*)0x2270000000L);
struct LargeObj {
LargeObj* next, **prev;
size_t obj_size;
GCAllocation data[0];
void SmallArena::freeUnmarked() {
thread_caches.forEachValue([this](ThreadBlockCache* cache) {
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block* h = cache->cache_free_heads[bidx];
// Try to limit the amount of unused memory a thread can hold onto;
// currently pretty dumb, just limit the number of blocks in the free-list
// to 50. (blocks in the full list don't need to be limited, since we're sure
// that the thread had just actively used those.)
// Eventually may want to come up with some scrounging system.
// TODO does this thread locality even help at all?
for (int i = 0; i < 50; i++) {
if (h)
h = h->next;
else
break;
}
if (h) {
removeFromLL(h);
insertIntoLL(&heads[bidx], h);
}
int mmap_size() {
size_t total_size = obj_size + sizeof(LargeObj);
total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
return total_size;
Block** chain_end = freeChain(&cache->cache_free_heads[bidx]);
freeChain(&cache->cache_full_heads[bidx]);
while (Block* b = cache->cache_full_heads[bidx]) {
removeFromLL(b);
insertIntoLL(chain_end, b);
}
}
});
int capacity() { return mmap_size() - sizeof(LargeObj); }
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block** chain_end = freeChain(&heads[bidx]);
freeChain(&full_heads[bidx]);
static LargeObj* fromAllocation(GCAllocation* alloc) {
char* rtn = (char*)alloc - offsetof(LargeObj, data);
assert((uintptr_t)rtn % PAGE_SIZE == 0);
return reinterpret_cast<LargeObj*>(rtn);
while (Block* b = full_heads[bidx]) {
removeFromLL(b);
insertIntoLL(chain_end, b);
}
};
}
}
#define LARGE_BLOCK_NUM_CHUNKS ((BLOCK_SIZE >> CHUNK_BITS) - 1)
#define LARGE_BLOCK_FOR_OBJ(obj) ((LargeBlock*)((int64_t)(obj) & ~(int64_t)(BLOCK_SIZE - 1)))
#define LARGE_CHUNK_INDEX(obj, section) (((char*)(obj) - (char*)(section)) >> CHUNK_BITS)
int64_t los_memory_usage = 0;
static int64_t large_object_count = 0;
static int large_block_count = 0;
void LargeArena::add_free_chunk(LargeFreeChunk* free_chunks, size_t size) {
size_t num_chunks = size >> CHUNK_BITS;
free_chunks->size = size;
if (num_chunks >= NUM_FREE_LISTS)
num_chunks = 0;
free_chunks->next_size = free_lists[num_chunks];
free_lists[num_chunks] = free_chunks;
}
LargeArena::LargeFreeChunk* LargeArena::get_from_size_list(LargeFreeChunk** list, size_t size) {
LargeFreeChunk* free_chunks = NULL;
LargeBlock* section;
size_t i, num_chunks, start_index;
assert((size & (CHUNK_SIZE - 1)) == 0);
while (*list) {
free_chunks = *list;
if (free_chunks->size >= size)
break;
list = &(*list)->next_size;
}
if (!*list)
return NULL;
*list = free_chunks->next_size;
if (free_chunks->size > size)
add_free_chunk((LargeFreeChunk*)((char*)free_chunks + size), free_chunks->size - size);
num_chunks = size >> CHUNK_BITS;
section = LARGE_BLOCK_FOR_OBJ(free_chunks);
start_index = LARGE_CHUNK_INDEX(free_chunks, section);
for (i = start_index; i < start_index + num_chunks; ++i) {
assert(section->free_chunk_map[i]);
section->free_chunk_map[i] = 0;
}
section->num_free_chunks -= size >> CHUNK_BITS;
assert(section->num_free_chunks >= 0);
return free_chunks;
}
LargeArena::LargeObj* LargeArena::_allocInternal(size_t size) {
LargeBlock* section;
LargeFreeChunk* free_chunks;
size_t num_chunks;
size += CHUNK_SIZE - 1;
size &= ~(CHUNK_SIZE - 1);
num_chunks = size >> CHUNK_BITS;
assert(size > 0 && size - sizeof(LargeObj) <= ALLOC_SIZE_LIMIT);
assert(num_chunks > 0);
retry:
if (num_chunks >= NUM_FREE_LISTS) {
free_chunks = get_from_size_list(&free_lists[0], size);
} else {
size_t i;
for (i = num_chunks; i < NUM_FREE_LISTS; ++i) {
free_chunks = get_from_size_list(&free_lists[i], size);
if (free_chunks)
break;
}
if (!free_chunks)
free_chunks = get_from_size_list(&free_lists[0], size);
}
if (free_chunks)
return (LargeObj*)free_chunks;
section = (LargeBlock*)doMmap(BLOCK_SIZE);
if (!section)
return NULL;
GCAllocation* Heap::allocLarge(size_t size) {
free_chunks = (LargeFreeChunk*)((char*)section + CHUNK_SIZE);
free_chunks->size = BLOCK_SIZE - CHUNK_SIZE;
free_chunks->next_size = free_lists[0];
free_lists[0] = free_chunks;
section->num_free_chunks = LARGE_BLOCK_NUM_CHUNKS;
section->free_chunk_map = (unsigned char*)section + sizeof(LargeBlock);
assert(sizeof(LargeBlock) + LARGE_BLOCK_NUM_CHUNKS + 1 <= CHUNK_SIZE);
section->free_chunk_map[0] = 0;
memset(section->free_chunk_map + 1, 1, LARGE_BLOCK_NUM_CHUNKS);
section->next = blocks;
blocks = section;
++large_block_count;
goto retry;
}
void LargeArena::_freeInternal(LargeObj* obj, size_t size) {
LargeBlock* section = LARGE_BLOCK_FOR_OBJ(obj);
size_t num_chunks, i, start_index;
size += CHUNK_SIZE - 1;
size &= ~(CHUNK_SIZE - 1);
num_chunks = size >> CHUNK_BITS;
assert(size > 0 && size - sizeof(LargeObj) <= ALLOC_SIZE_LIMIT);
assert(num_chunks > 0);
section->num_free_chunks += num_chunks;
assert(section->num_free_chunks <= LARGE_BLOCK_NUM_CHUNKS);
/*
* We could free the LOS section here if it's empty, but we
* can't unless we also remove its free chunks from the fast
* free lists. Instead, we do it in los_sweep().
*/
start_index = LARGE_CHUNK_INDEX(obj, section);
for (i = start_index; i < start_index + num_chunks; ++i) {
assert(!section->free_chunk_map[i]);
section->free_chunk_map[i] = 1;
}
add_free_chunk((LargeFreeChunk*)obj, size);
}
void LargeArena::_free(LargeObj* obj) {
unlinkNode(obj);
_freeInternal(obj, obj->size);
}
void LargeArena::freeUnmarked() {
sweepHeap(head, _doFree, [this](LargeObj* ptr) { _freeInternal(ptr, ptr->size); });
}
GCAllocation* LargeArena::alloc(size_t size) {
registerGCManagedBytes(size);
LOCK_REGION(heap->lock);
// printf ("allocLarge %zu\n", size);
LargeObj* obj = _allocInternal(size + sizeof(GCAllocation) + sizeof(LargeObj));
obj->size = size;
obj->next = head;
if (obj->next)
obj->next->prev = &obj->next;
obj->prev = &head;
head = obj;
large_object_count++;
return obj->data;
}
GCAllocation* LargeArena::realloc(GCAllocation* al, size_t bytes) {
LargeObj* obj = (LargeObj*)((char*)al - offsetof(LargeObj, data));
int size = obj->size;
if (size >= bytes && size < bytes * 2)
return al;
GCAllocation* rtn = heap->alloc(bytes);
memcpy(rtn, al, std::min(bytes, obj->size));
_free(obj);
return rtn;
}
void LargeArena::free(GCAllocation* al) {
LargeObj* obj = (LargeObj*)((char*)al - offsetof(LargeObj, data));
_free(obj);
}
GCAllocation* LargeArena::allocationFrom(void* ptr) {
LargeObj* obj = NULL;
for (obj = head; obj; obj = obj->next) {
char* end = (char*)&obj->data + obj->size;
if (ptr >= obj->data && ptr < end) {
return &obj->data[0];
}
}
return NULL;
}
void HugeArena::freeUnmarked() {
sweepHeap(head, _doFree, [this](HugeObj* ptr) { _freeHugeObj(ptr); });
}
GCAllocation* HugeArena::alloc(size_t size) {
registerGCManagedBytes(size);
LOCK_REGION(lock);
LOCK_REGION(heap->lock);
size_t total_size = size + sizeof(LargeObj);
size_t total_size = size + sizeof(HugeObj);
total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
LargeObj* rtn = (LargeObj*)large_arena.doMmap(total_size);
HugeObj* rtn = (HugeObj*)doMmap(total_size);
rtn->obj_size = size;
rtn->next = large_head;
rtn->next = head;
if (rtn->next)
rtn->next->prev = &rtn->next;
rtn->prev = &large_head;
large_head = rtn;
rtn->prev = &head;
head = rtn;
return rtn->data;
}
static Block* alloc_block(uint64_t size, Block** prev) {
Block* rtn = (Block*)small_arena.doMmap(sizeof(Block));
GCAllocation* HugeArena::realloc(GCAllocation* al, size_t bytes) {
HugeObj* lobj = HugeObj::fromAllocation(al);
int capacity = lobj->capacity();
if (capacity >= bytes && capacity < bytes * 2)
return al;
GCAllocation* rtn = heap->alloc(bytes);
memcpy(rtn, al, std::min(bytes, lobj->obj_size));
_freeHugeObj(lobj);
return rtn;
}
void HugeArena::_freeHugeObj(HugeObj* lobj) {
unlinkNode(lobj);
int r = munmap(lobj, lobj->mmap_size());
assert(r == 0);
}
void HugeArena::free(GCAllocation* al) {
HugeObj* lobj = HugeObj::fromAllocation(al);
_freeHugeObj(lobj);
}
GCAllocation* HugeArena::allocationFrom(void* ptr) {
HugeObj* cur = head;
while (cur) {
if (ptr >= cur && ptr < &cur->data[cur->obj_size])
return &cur->data[0];
cur = cur->next;
}
return NULL;
}
SmallArena::Block* SmallArena::alloc_block(uint64_t size, Block** prev) {
Block* rtn = (Block*)doMmap(sizeof(Block));
assert(rtn);
rtn->size = size;
rtn->num_obj = BLOCK_SIZE / size;
@@ -165,7 +504,7 @@ static Block* alloc_block(uint64_t size, Block** prev) {
return rtn;
}
static void insertIntoLL(Block** next_pointer, Block* next) {
void SmallArena::insertIntoLL(Block** next_pointer, Block* next) {
assert(next_pointer);
assert(next);
assert(!next->next);
@@ -178,32 +517,29 @@ static void insertIntoLL(Block** next_pointer, Block* next) {
next->prev = next_pointer;
}
static void removeFromLL(Block* b) {
if (b->next)
b->next->prev = b->prev;
*b->prev = b->next;
void SmallArena::removeFromLL(Block* b) {
unlinkNode(b);
b->next = NULL;
b->prev = NULL;
}
Heap::ThreadBlockCache::~ThreadBlockCache() {
SmallArena::ThreadBlockCache::~ThreadBlockCache() {
LOCK_REGION(heap->lock);
for (int i = 0; i < NUM_BUCKETS; i++) {
while (Block* b = cache_free_heads[i]) {
removeFromLL(b);
insertIntoLL(&heap->heads[i], b);
small->removeFromLL(b);
small->insertIntoLL(&small->heads[i], b);
}
while (Block* b = cache_full_heads[i]) {
removeFromLL(b);
insertIntoLL(&heap->full_heads[i], b);
small->removeFromLL(b);
small->insertIntoLL(&small->full_heads[i], b);
}
}
}
static GCAllocation* allocFromBlock(Block* b) {
GCAllocation* SmallArena::allocFromBlock(Block* b) {
int idx = b->isfree.scanForNext(b->next_to_check);
if (idx == -1)
return NULL;
@@ -212,7 +548,7 @@ static GCAllocation* allocFromBlock(Block* b) {
return reinterpret_cast<GCAllocation*>(rtn);
}
static Block* claimBlock(size_t rounded_size, Block** free_head) {
SmallArena::Block* SmallArena::claimBlock(size_t rounded_size, Block** free_head) {
Block* free_block = *free_head;
if (free_block) {
removeFromLL(free_block);
@@ -222,7 +558,7 @@ static Block* claimBlock(size_t rounded_size, Block** free_head) {
return alloc_block(rounded_size, NULL);
}
GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
GCAllocation* SmallArena::_alloc(size_t rounded_size, int bucket_idx) {
registerGCManagedBytes(rounded_size);
Block** free_head = &heads[bucket_idx];
@@ -253,7 +589,7 @@ GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
// static StatCounter sc_fallback("gc_allocs_cachemiss");
// sc_fallback.log();
LOCK_REGION(lock);
LOCK_REGION(heap->lock);
assert(*cache_head == NULL);
@@ -269,7 +605,7 @@ GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
}
}
void _freeFrom(GCAllocation* alloc, Block* b) {
void SmallArena::_free(GCAllocation* alloc, Block* b) {
assert(b == Block::forPointer(alloc));
size_t size = b->size;
@@ -285,16 +621,7 @@ void _freeFrom(GCAllocation* alloc, Block* b) {
#endif
}
static void _freeLargeObj(LargeObj* lobj) {
*lobj->prev = lobj->next;
if (lobj->next)
lobj->next->prev = lobj->prev;
int r = munmap(lobj, lobj->mmap_size());
assert(r == 0);
}
static void _doFree(GCAllocation* al) {
void _doFree(GCAllocation* al) {
if (VERBOSITY() >= 2)
printf("Freeing %p\n", al->user_data);
@@ -307,178 +634,8 @@ static void _doFree(GCAllocation* al) {
}
}
void Heap::free(GCAllocation* al) {
_doFree(al);
if (large_arena.contains(al)) {
LargeObj* lobj = LargeObj::fromAllocation(al);
_freeLargeObj(lobj);
return;
}
assert(small_arena.contains(al));
Block* b = Block::forPointer(al);
_freeFrom(al, b);
}
GCAllocation* Heap::realloc(GCAllocation* al, size_t bytes) {
if (large_arena.contains(al)) {
LargeObj* lobj = LargeObj::fromAllocation(al);
int capacity = lobj->capacity();
if (capacity >= bytes && capacity < bytes * 2)
return al;
GCAllocation* rtn = alloc(bytes);
memcpy(rtn, al, std::min(bytes, lobj->obj_size));
_freeLargeObj(lobj);
return rtn;
}
assert(small_arena.contains(al));
Block* b = Block::forPointer(al);
size_t size = b->size;
if (size >= bytes && size < bytes * 2)
return al;
GCAllocation* rtn = alloc(bytes);
#ifndef NVALGRIND
VALGRIND_DISABLE_ERROR_REPORTING;
memcpy(rtn, al, std::min(bytes, size));
VALGRIND_ENABLE_ERROR_REPORTING;
#else
memcpy(rtn, al, std::min(bytes, size));
#endif
_freeFrom(al, b);
return rtn;
}
GCAllocation* Heap::getAllocationFromInteriorPointer(void* ptr) {
if (large_arena.contains(ptr)) {
LargeObj* cur = large_head;
while (cur) {
if (ptr >= cur && ptr < &cur->data[cur->obj_size])
return &cur->data[0];
cur = cur->next;
}
return NULL;
}
if (!small_arena.contains(ptr))
return NULL;
Block* b = Block::forPointer(ptr);
size_t size = b->size;
int offset = (char*)ptr - (char*)b;
int obj_idx = offset / size;
if (obj_idx < b->minObjIndex() || obj_idx >= b->numObjects())
return NULL;
int atom_idx = obj_idx * b->atomsPerObj();
if (b->isfree.isSet(atom_idx))
return NULL;
return reinterpret_cast<GCAllocation*>(&b->atoms[atom_idx]);
}
static Block** freeChain(Block** head) {
while (Block* b = *head) {
int num_objects = b->numObjects();
int first_obj = b->minObjIndex();
int atoms_per_obj = b->atomsPerObj();
for (int obj_idx = first_obj; obj_idx < num_objects; obj_idx++) {
int atom_idx = obj_idx * atoms_per_obj;
if (b->isfree.isSet(atom_idx))
continue;
void* p = &b->atoms[atom_idx];
GCAllocation* al = reinterpret_cast<GCAllocation*>(p);
if (isMarked(al)) {
clearMark(al);
} else {
_doFree(al);
// assert(p != (void*)0x127000d960); // the main module
b->isfree.set(atom_idx);
}
}
head = &b->next;
}
return head;
}
void Heap::freeUnmarked() {
thread_caches.forEachValue([this](ThreadBlockCache* cache) {
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block* h = cache->cache_free_heads[bidx];
// Try to limit the amount of unused memory a thread can hold onto;
// currently pretty dumb, just limit the number of blocks in the free-list
// to 50. (blocks in the full list don't need to be limited, since we're sure
// that the thread had just actively used those.)
// Eventually may want to come up with some scrounging system.
// TODO does this thread locality even help at all?
for (int i = 0; i < 50; i++) {
if (h)
h = h->next;
else
break;
}
if (h) {
removeFromLL(h);
insertIntoLL(&heads[bidx], h);
}
Block** chain_end = freeChain(&cache->cache_free_heads[bidx]);
freeChain(&cache->cache_full_heads[bidx]);
while (Block* b = cache->cache_full_heads[bidx]) {
removeFromLL(b);
insertIntoLL(chain_end, b);
}
}
});
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block** chain_end = freeChain(&heads[bidx]);
freeChain(&full_heads[bidx]);
while (Block* b = full_heads[bidx]) {
removeFromLL(b);
insertIntoLL(chain_end, b);
}
}
LargeObj* cur = large_head;
while (cur) {
GCAllocation* al = cur->data;
if (isMarked(al)) {
clearMark(al);
} else {
void Heap::destroyContents(GCAllocation* al) {
_doFree(al);
*cur->prev = cur->next;
if (cur->next)
cur->next->prev = cur->prev;
LargeObj* to_free = cur;
cur = cur->next;
_freeLargeObj(to_free);
continue;
}
cur = cur->next;
}
}
void dumpHeapStatistics() {
@@ -527,7 +684,7 @@ void addStatistic(HeapStatistics* stats, GCAllocation* al, int nbytes) {
}
// TODO: copy-pasted from freeChain
void getChainStatistics(HeapStatistics* stats, Block** head) {
void SmallArena::getChainStatistics(HeapStatistics* stats, Block** head) {
while (Block* b = *head) {
int num_objects = b->numObjects();
int first_obj = b->minObjIndex();
@@ -550,32 +707,50 @@ void getChainStatistics(HeapStatistics* stats, Block** head) {
}
// TODO: copy-pasted from freeUnmarked()
void Heap::dumpHeapStatistics() {
threading::GLPromoteRegion _lock;
HeapStatistics stats;
thread_caches.forEachValue([this, &stats](ThreadBlockCache* cache) {
void SmallArena::getStatistics(HeapStatistics* stats) {
thread_caches.forEachValue([this, stats](ThreadBlockCache* cache) {
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block* h = cache->cache_free_heads[bidx];
getChainStatistics(&stats, &cache->cache_free_heads[bidx]);
getChainStatistics(&stats, &cache->cache_full_heads[bidx]);
getChainStatistics(stats, &cache->cache_free_heads[bidx]);
getChainStatistics(stats, &cache->cache_full_heads[bidx]);
}
});
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
getChainStatistics(&stats, &heads[bidx]);
getChainStatistics(&stats, &full_heads[bidx]);
getChainStatistics(stats, &heads[bidx]);
getChainStatistics(stats, &full_heads[bidx]);
}
}
LargeObj* cur = large_head;
void LargeArena::getStatistics(HeapStatistics* stats) {
LargeObj* cur = head;
while (cur) {
GCAllocation* al = cur->data;
addStatistic(&stats, al, cur->capacity());
addStatistic(stats, al, cur->size);
cur = cur->next;
}
}
void HugeArena::getStatistics(HeapStatistics* stats) {
HugeObj* cur = head;
while (cur) {
GCAllocation* al = cur->data;
addStatistic(stats, al, cur->capacity());
cur = cur->next;
}
}
void Heap::dumpHeapStatistics() {
threading::GLPromoteRegion _lock;
HeapStatistics stats;
small_arena.getStatistics(&stats);
large_arena.getStatistics(&stats);
huge_arena.getStatistics(&stats);
stats.conservative.print("conservative");
stats.untracked.print("untracked");
......
@@ -17,6 +17,7 @@
#include <cstddef>
#include <cstdint>
#include <sys/mman.h>
#include "core/common.h"
#include "core/threading.h"
@@ -24,6 +25,9 @@
namespace pyston {
namespace gc {
class Heap;
struct HeapStatistics;
typedef uint8_t kindid_t;
struct GCAllocation {
unsigned int gc_flags : 8;
@@ -59,14 +63,57 @@ inline void clearMark(GCAllocation* header) {
#undef MARK_BIT
#define PAGE_SIZE 4096
template <int N> class Bitmap {
static_assert(N % 64 == 0, "");
template <uintptr_t start> class Arena {
private:
void* cur;
protected:
Arena() : cur((void*)start) {}
public:
void* doMmap(size_t size) {
assert(size % PAGE_SIZE == 0);
void* mrtn = mmap(cur, size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
assert((uintptr_t)mrtn != -1 && "failed to allocate memory from OS");
ASSERT(mrtn == cur, "%p %p\n", mrtn, cur);
cur = (uint8_t*)cur + size;
return mrtn;
}
bool contains(void* addr) { return (void*)start <= addr && addr < cur; }
};
constexpr uintptr_t SMALL_ARENA_START = 0x1270000000L;
constexpr uintptr_t LARGE_ARENA_START = 0x2270000000L;
constexpr uintptr_t HUGE_ARENA_START = 0x3270000000L;
//
// The SmallArena allocates objects <= 3584 bytes.
//
// It uses segregated-fit allocation, and each block contains a free
// bitmap for objects of its assigned size.
//
static const size_t sizes[] = {
16, 32, 48, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384,
448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 2560, 3072, 3584, // 4096,
};
static constexpr size_t NUM_BUCKETS = sizeof(sizes) / sizeof(sizes[0]);
class SmallArena : public Arena<SMALL_ARENA_START> {
public:
private:
template <int N> class Bitmap {
static_assert(N % 64 == 0, "");
private:
uint64_t data[N / 64];
public:
public:
void setAllZero() { memset(data, 0, sizeof(data)); }
struct Scanner {
@@ -113,25 +160,26 @@ public:
int idx = first + i * 64;
return idx;
}
};
};
#define BLOCK_SIZE (4 * 4096)
static constexpr size_t BLOCK_SIZE = 4 * 4096;
#define ATOM_SIZE 16
static_assert(BLOCK_SIZE % ATOM_SIZE == 0, "");
static_assert(BLOCK_SIZE % ATOM_SIZE == 0, "");
#define ATOMS_PER_BLOCK (BLOCK_SIZE / ATOM_SIZE)
static_assert(ATOMS_PER_BLOCK % 64 == 0, "");
static_assert(ATOMS_PER_BLOCK % 64 == 0, "");
#define BITFIELD_SIZE (ATOMS_PER_BLOCK / 8)
#define BITFIELD_ELTS (BITFIELD_SIZE / 8)
#define BLOCK_HEADER_SIZE (BITFIELD_SIZE + 4 * sizeof(void*))
#define BLOCK_HEADER_ATOMS ((BLOCK_HEADER_SIZE + ATOM_SIZE - 1) / ATOM_SIZE)
struct Atoms {
struct Atoms {
char _data[ATOM_SIZE];
};
};
struct Block {
struct Block {
union {
struct {
Block* next, **prev;
@@ -153,79 +201,263 @@ struct Block {
inline int atomsPerObj() const { return atoms_per_obj; }
static Block* forPointer(void* ptr) { return (Block*)((uintptr_t)ptr & ~(BLOCK_SIZE - 1)); }
};
static_assert(sizeof(Block) == BLOCK_SIZE, "bad size");
static_assert(offsetof(Block, _header_end) >= BLOCK_HEADER_SIZE, "bad header size");
static_assert(offsetof(Block, _header_end) <= BLOCK_HEADER_SIZE, "bad header size");
constexpr const size_t sizes[] = {
16, 32, 48, 64, 80, 96, 112, 128, 160, 192, 224, 256,
320, 384, 448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048,
// 2560, 3072, 3584, // 4096,
};
#define NUM_BUCKETS (sizeof(sizes) / sizeof(sizes[0]))
struct LargeObj;
class Heap {
private:
Block* heads[NUM_BUCKETS];
Block* full_heads[NUM_BUCKETS];
LargeObj* large_head = NULL;
GCAllocation* __attribute__((__malloc__)) allocSmall(size_t rounded_size, int bucket_idx);
GCAllocation* __attribute__((__malloc__)) allocLarge(size_t bytes);
// DS_DEFINE_MUTEX(lock);
DS_DEFINE_SPINLOCK(lock);
};
static_assert(sizeof(Block) == BLOCK_SIZE, "bad size");
static_assert(offsetof(Block, _header_end) >= BLOCK_HEADER_SIZE, "bad header size");
static_assert(offsetof(Block, _header_end) <= BLOCK_HEADER_SIZE, "bad header size");
// forward (public) definition of ThreadBlockCache so we can reference it both in this class (privately) and in Heap
// (for a friend ref).
struct ThreadBlockCache {
Heap* heap;
SmallArena* small;
Block* cache_free_heads[NUM_BUCKETS];
Block* cache_full_heads[NUM_BUCKETS];
ThreadBlockCache(Heap* heap) : heap(heap) {
ThreadBlockCache(Heap* heap, SmallArena* small) : heap(heap), small(small) {
memset(cache_free_heads, 0, sizeof(cache_free_heads));
memset(cache_full_heads, 0, sizeof(cache_full_heads));
}
~ThreadBlockCache();
};
Block* heads[NUM_BUCKETS];
Block* full_heads[NUM_BUCKETS];
friend struct ThreadBlockCache;
Heap* heap;
// TODO only use thread caches if we're in GRWL mode?
threading::PerThreadSet<ThreadBlockCache, Heap*> thread_caches;
threading::PerThreadSet<ThreadBlockCache, Heap*, SmallArena*> thread_caches;
public:
Heap() : thread_caches(this) {}
GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
Block* alloc_block(uint64_t size, Block** prev);
GCAllocation* allocFromBlock(Block* b);
Block* claimBlock(size_t rounded_size, Block** free_head);
void insertIntoLL(Block** next_pointer, Block* next);
void removeFromLL(Block* b);
Block** freeChain(Block** head);
void getChainStatistics(HeapStatistics* stats, Block** head);
GCAllocation* __attribute__((__malloc__)) _alloc(size_t bytes, int bucket_idx);
void _free(GCAllocation* al, Block* b);
public:
SmallArena(Heap* heap) : Arena(), heap(heap), thread_caches(heap, this) {}
GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes) {
GCAllocation* rtn;
// assert(bytes >= 16);
if (bytes <= 16)
rtn = allocSmall(16, 0);
return _alloc(16, 0);
else if (bytes <= 32)
rtn = allocSmall(32, 1);
else if (bytes > sizes[NUM_BUCKETS - 1])
rtn = allocLarge(bytes);
return _alloc(32, 1);
else {
rtn = NULL;
for (int i = 2; i < NUM_BUCKETS; i++) {
if (sizes[i] >= bytes) {
rtn = allocSmall(sizes[i], i);
break;
return _alloc(sizes[i], i);
}
}
return NULL;
}
}
GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
return rtn;
void free(GCAllocation* al) {
Block* b = Block::forPointer(al);
_free(al, b);
}
void getStatistics(HeapStatistics* stats);
GCAllocation* allocationFrom(void* ptr);
void freeUnmarked();
};
//
// The LargeArena allocates objects where 3584 < size < 1024*1024 bytes.
//
// It maintains a set of size-segregated free lists, and a special
// free list for larger chunks. If the free list specific to a given
// size has no entries, we search the free list for larger chunks.
//
class LargeArena : public Arena<LARGE_ARENA_START> {
struct LargeFreeChunk {
LargeFreeChunk* next_size;
size_t size;
};
struct LargeBlock {
LargeBlock* next;
size_t num_free_chunks;
unsigned char* free_chunk_map;
};
struct LargeObj {
LargeObj* next, **prev;
size_t size;
GCAllocation data[0];
};
/*
* This shouldn't be much smaller or larger than the largest small size bucket.
* Must be at least sizeof (LargeBlock).
*/
static constexpr size_t CHUNK_SIZE = 4096;
static constexpr int CHUNK_BITS = 12;
static_assert(CHUNK_SIZE > sizeof(LargeBlock), "bad large block size");
static constexpr int BLOCK_SIZE = 1024 * 1024;
static constexpr int NUM_FREE_LISTS = 32;
void add_free_chunk(LargeFreeChunk* free_chunks, size_t size);
LargeFreeChunk* get_from_size_list(LargeFreeChunk** list, size_t size);
LargeObj* _allocInternal(size_t size);
void _freeInternal(LargeObj* obj, size_t size);
void _free(LargeObj* obj);
LargeObj* head;
LargeBlock* blocks;
LargeFreeChunk* free_lists[NUM_FREE_LISTS]; /* 0 is for larger sizes */
Heap* heap;
public:
LargeArena(Heap* heap) : head(NULL), blocks(NULL), heap(heap) {}
/* Largest object that can be allocated in a large block. */
static constexpr size_t ALLOC_SIZE_LIMIT = BLOCK_SIZE - CHUNK_SIZE - sizeof(LargeObj);
GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes);
GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
void free(GCAllocation* alloc);
void freeUnmarked();
GCAllocation* allocationFrom(void* ptr);
void getStatistics(HeapStatistics* stats);
};
// The HugeArena allocates objects where size > 1024*1024 bytes.
//
// Objects are allocated with individual mmap() calls and kept in a
// linked list; on free they are munmap()ed rather than reused.
class HugeArena : public Arena<HUGE_ARENA_START> {
struct HugeObj {
HugeObj* next, **prev;
size_t obj_size;
GCAllocation data[0];
int mmap_size() {
size_t total_size = obj_size + sizeof(HugeObj);
total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
return total_size;
}
int capacity() { return mmap_size() - sizeof(HugeObj); }
static HugeObj* fromAllocation(GCAllocation* alloc) {
char* rtn = (char*)alloc - offsetof(HugeObj, data);
assert((uintptr_t)rtn % PAGE_SIZE == 0);
return reinterpret_cast<HugeObj*>(rtn);
}
};
void _freeHugeObj(HugeObj* lobj);
HugeObj* head;
Heap* heap;
public:
HugeArena(Heap* heap) : heap(heap) {}
GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes);
GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
void free(GCAllocation* alloc);
void freeUnmarked();
GCAllocation* allocationFrom(void* ptr);
void getStatistics(HeapStatistics* stats);
};
class Heap {
private:
SmallArena small_arena;
LargeArena large_arena;
HugeArena huge_arena;
friend class SmallArena;
friend class LargeArena;
friend class HugeArena;
// DS_DEFINE_MUTEX(lock);
DS_DEFINE_SPINLOCK(lock);
public:
Heap() : small_arena(this), large_arena(this), huge_arena(this) {}
GCAllocation* realloc(GCAllocation* alloc, size_t bytes) {
if (large_arena.contains(alloc)) {
return large_arena.realloc(alloc, bytes);
} else if (huge_arena.contains(alloc)) {
return huge_arena.realloc(alloc, bytes);
}
assert(small_arena.contains(alloc));
return small_arena.realloc(alloc, bytes);
}
GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes) {
if (bytes > LargeArena::ALLOC_SIZE_LIMIT)
return huge_arena.alloc(bytes);
else if (bytes > sizes[NUM_BUCKETS - 1])
return large_arena.alloc(bytes);
else
return small_arena.alloc(bytes);
}
void destroyContents(GCAllocation* alloc);
void free(GCAllocation* alloc) {
destroyContents(alloc);
if (large_arena.contains(alloc)) {
large_arena.free(alloc);
return;
}
if (huge_arena.contains(alloc)) {
huge_arena.free(alloc);
return;
}
assert(small_arena.contains(alloc));
small_arena.free(alloc);
}
// not thread safe:
GCAllocation* getAllocationFromInteriorPointer(void* ptr);
GCAllocation* getAllocationFromInteriorPointer(void* ptr) {
if (large_arena.contains(ptr)) {
return large_arena.allocationFrom(ptr);
} else if (huge_arena.contains(ptr)) {
return huge_arena.allocationFrom(ptr);
} else if (small_arena.contains(ptr)) {
return small_arena.allocationFrom(ptr);
}
return NULL;
}
// not thread safe:
void freeUnmarked();
void freeUnmarked() {
small_arena.freeUnmarked();
large_arena.freeUnmarked();
huge_arena.freeUnmarked();
}
void dumpHeapStatistics();
};
......
@@ -68,6 +68,9 @@ TEST(alloc, alloc64) { testAlloc(64); }
TEST(alloc, alloc128) { testAlloc(128); }
TEST(alloc, alloc258) { testAlloc(258); }
TEST(alloc, alloc3584) { testAlloc(3584); }
TEST(alloc, alloc4096) { testAlloc(4096); }
TEST(alloc, alloc8192) { testAlloc(8192); }
TEST(alloc, alloc16384) { testAlloc(16384); }
TEST(alloc, largeallocs) {
int s1 = 1 << 20;
......