Commit 012d6d50 authored by Kevin Modzelewski

Merge pull request #298 from toshok/three-arenas

Add third GC arena
parents 7cf92757 d85c9893
import time
PIDIGITS_LEN = 1500
def pidigits(length):
    """Run the digit-producing loop until `length` digits have been generated.

    Presumably a spigot-style computation of the digits of pi (going by the
    name -- confirm). Digits are folded into `ns`, which is reset after every
    tenth digit; the function is a pure CPU workload and returns None.
    """
    i = k = ns = 0
    k1 = 1
    n, a, d, t, u = 1, 0, 1, 0, 0
    while True:
        k += 1
        t = n << 1
        n *= k
        a += t
        k1 += 2
        a *= k1
        d *= k1

        # No digit can be extracted on this iteration.
        if a < n:
            continue

        t, u = divmod(n * 3 + a, d)
        u += n
        if d <= u:
            continue

        # `t` is the next digit.
        ns = ns * 10 + t
        i += 1
        if i % 10 == 0:
            ns = 0
        if i >= length:
            break
        a -= d * t
        a *= 10
        n *= 10
def main(n):
    """Time `n` runs of pidigits(PIDIGITS_LEN).

    Returns a list with the wall-clock duration of each run, in seconds, in
    the order the runs were executed.
    """
    durations = []
    for _ in range(n):
        started = time.time()
        pidigits(PIDIGITS_LEN)
        durations.append(time.time() - started)
    return durations
main(100)
......@@ -51,6 +51,11 @@ extern "C" inline void* gc_alloc(size_t bytes, GCKind kind_id) {
#endif
GCAllocation* alloc = global_heap.alloc(alloc_bytes);
#ifndef NVALGRIND
VALGRIND_DISABLE_ERROR_REPORTING;
#endif
alloc->kind_id = kind_id;
alloc->gc_flags = 0;
......@@ -67,7 +72,10 @@ extern "C" inline void* gc_alloc(size_t bytes, GCKind kind_id) {
}
void* r = alloc->user_data;
#ifndef NVALGRIND
VALGRIND_ENABLE_ERROR_REPORTING;
if (ENABLE_REDZONES) {
r = ((char*)r) + REDZONE_SIZE;
}
......
......@@ -17,7 +17,6 @@
#include <cstdlib>
#include <cstring>
#include <stdint.h>
#include <sys/mman.h>
#include "core/common.h"
#include "core/util.h"
......@@ -34,6 +33,67 @@
namespace pyston {
namespace gc {
void _doFree(GCAllocation* al);
// lots of linked lists around here, so let's just use template functions for operations on them.
// Marks `node` as not being on any list by clearing both of its link fields.
template <class ListT> inline void nullNextPrev(ListT* node) {
    node->prev = NULL;
    node->next = NULL;
}
// Unlinks `node` from the list it is on.
//
// `prev` is a pointer to whichever pointer currently points at the node (the
// list head or the previous node's `next` field), so no special case is needed
// for the first element. The node's own `next`/`prev` fields are left
// untouched; callers may still read `node->next` afterwards.
template <class ListT> inline void removeFromLL(ListT* node) {
    auto successor = node->next;
    auto incoming = node->prev;

    *incoming = successor;
    if (successor)
        successor->prev = incoming;
}
// Unlinks `node` from its list and then clears its link fields, so that it can
// be handed straight to insertIntoLL (which asserts both fields are NULL).
template <class ListT> inline void removeFromLLAndNull(ListT* node) {
    removeFromLL(node);
    nullNextPrev(node);
}
// Inserts the unlinked node `next` at the position `next_pointer` refers to.
//
// `next_pointer` is the address of a list head or of some node's `next` field;
// whatever it pointed at before becomes the successor of the new node. The new
// node must not already be on a list (both link fields NULL).
template <class ListT> inline void insertIntoLL(ListT** next_pointer, ListT* next) {
    assert(next_pointer);
    assert(next);
    assert(!next->next);
    assert(!next->prev);

    ListT* displaced = *next_pointer;

    next->next = displaced;
    if (displaced)
        displaced->prev = &next->next;

    *next_pointer = next;
    next->prev = next_pointer;
}
// Calls `func` on every node reachable from `list` by following `next`, in
// list order. `next` is read only after `func` returns for that node.
template <class ListT, typename Func> inline void forEach(ListT* list, Func func) {
    for (auto node = list; node; node = node->next)
        func(node);
}
// Sweep pass over a linked list of GC objects whose nodes expose `data` (the
// GCAllocation header) and `next`.
//
// Marked allocations survive and have their mark cleared. Unmarked ones are
// passed to _doFree, unlinked from the list, and then the node itself is given
// to `free_func`. The successor is read before `free_func` runs, since the node
// must not be touched after that call.
template <class ListT, typename Free> inline void sweepList(ListT* head, Free free_func) {
    ListT* node = head;
    while (node != NULL) {
        GCAllocation* al = node->data;

        if (!isMarked(al)) {
            _doFree(al);
            removeFromLL(node);

            ListT* dead = node;
            node = node->next;
            free_func(dead);
            continue;
        }

        clearMark(al);
        node = node->next;
    }
}
static unsigned bytesAllocatedSinceCollection;
static __thread unsigned thread_bytesAllocatedSinceCollection;
#define ALLOCBYTES_PER_COLLECTION 10000000
......@@ -64,73 +124,268 @@ void registerGCManagedBytes(size_t bytes) {
Heap global_heap;
#define PAGE_SIZE 4096
class Arena {
private:
void* start;
void* cur;
void _doFree(GCAllocation* al) {
if (VERBOSITY() >= 2)
printf("Freeing %p\n", al->user_data);
public:
constexpr Arena(void* start) : start(start), cur(start) {}
#ifndef NVALGRIND
VALGRIND_DISABLE_ERROR_REPORTING;
#endif
GCKind alloc_kind = al->kind_id;
#ifndef NVALGRIND
VALGRIND_ENABLE_ERROR_REPORTING;
#endif
void* doMmap(size_t size) {
assert(size % PAGE_SIZE == 0);
// printf("mmap %ld\n", size);
if (alloc_kind == GCKind::PYTHON) {
#ifndef NVALGRIND
VALGRIND_DISABLE_ERROR_REPORTING;
#endif
Box* b = (Box*)al->user_data;
#ifndef NVALGRIND
VALGRIND_ENABLE_ERROR_REPORTING;
#endif
void* mrtn = mmap(cur, size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
assert((uintptr_t)mrtn != -1 && "failed to allocate memory from OS");
ASSERT(mrtn == cur, "%p %p\n", mrtn, cur);
cur = (uint8_t*)cur + size;
return mrtn;
ASSERT(b->cls->tp_dealloc == NULL, "%s", getTypeName(b));
if (b->cls->simple_destructor)
b->cls->simple_destructor(b);
}
}
void Heap::destructContents(GCAllocation* al) {
_doFree(al);
}
bool contains(void* addr) { return start <= addr && addr < cur; }
struct HeapStatistics {
struct TypeStats {
int64_t nallocs;
int64_t nbytes;
TypeStats() : nallocs(0), nbytes(0) {}
void print(const char* name) const {
if (nbytes > (1 << 20))
printf("%s: %ld allocations for %.1f MB\n", name, nallocs, nbytes * 1.0 / (1 << 20));
else if (nbytes > (1 << 10))
printf("%s: %ld allocations for %.1f KB\n", name, nallocs, nbytes * 1.0 / (1 << 10));
else
printf("%s: %ld allocations for %ld bytes\n", name, nallocs, nbytes);
}
};
std::unordered_map<BoxedClass*, TypeStats> by_cls;
TypeStats conservative, untracked;
TypeStats total;
};
static Arena small_arena((void*)0x1270000000L);
static Arena large_arena((void*)0x2270000000L);
void addStatistic(HeapStatistics* stats, GCAllocation* al, int nbytes) {
stats->total.nallocs++;
stats->total.nbytes += nbytes;
struct LargeObj {
LargeObj* next, **prev;
size_t obj_size;
GCAllocation data[0];
if (al->kind_id == GCKind::PYTHON) {
Box* b = (Box*)al->user_data;
auto& t = stats->by_cls[b->cls];
int mmap_size() {
size_t total_size = obj_size + sizeof(LargeObj);
total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
return total_size;
t.nallocs++;
t.nbytes += nbytes;
} else if (al->kind_id == GCKind::CONSERVATIVE) {
stats->conservative.nallocs++;
stats->conservative.nbytes += nbytes;
} else if (al->kind_id == GCKind::UNTRACKED) {
stats->untracked.nallocs++;
stats->untracked.nbytes += nbytes;
} else {
RELEASE_ASSERT(0, "%d", (int)al->kind_id);
}
}
int capacity() { return mmap_size() - sizeof(LargeObj); }
static LargeObj* fromAllocation(GCAllocation* alloc) {
char* rtn = (char*)alloc - offsetof(LargeObj, data);
assert((uintptr_t)rtn % PAGE_SIZE == 0);
return reinterpret_cast<LargeObj*>(rtn);
void Heap::dumpHeapStatistics() {
threading::GLPromoteRegion _lock;
HeapStatistics stats;
small_arena.getStatistics(&stats);
large_arena.getStatistics(&stats);
huge_arena.getStatistics(&stats);
stats.conservative.print("conservative");
stats.untracked.print("untracked");
for (const auto& p : stats.by_cls) {
p.second.print(getFullNameOfClass(p.first).c_str());
}
};
stats.total.print("Total");
printf("\n");
}
GCAllocation* Heap::allocLarge(size_t size) {
registerGCManagedBytes(size);
void dumpHeapStatistics() {
global_heap.dumpHeapStatistics();
}
LOCK_REGION(lock);
//////
/// Small Arena
size_t total_size = size + sizeof(LargeObj);
total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
LargeObj* rtn = (LargeObj*)large_arena.doMmap(total_size);
rtn->obj_size = size;
// Allocates `bytes` from the small arena by rounding the request up to the
// first entry of `sizes` that can hold it and delegating to _alloc with that
// bucket. The request is first reported via registerGCManagedBytes.
//
// Returns NULL when `bytes` is larger than every bucket.
GCAllocation* SmallArena::alloc(size_t bytes) {
    registerGCManagedBytes(bytes);

    // Hard-coded fast paths for the two smallest buckets.
    if (bytes <= 16)
        return _alloc(16, 0);
    if (bytes <= 32)
        return _alloc(32, 1);

    int bucket = 2;
    while (bucket < NUM_BUCKETS) {
        if (sizes[bucket] >= bytes)
            return _alloc(sizes[bucket], bucket);
        bucket++;
    }
    return NULL;
}
rtn->next = large_head;
if (rtn->next)
rtn->next->prev = &rtn->next;
rtn->prev = &large_head;
large_head = rtn;
GCAllocation* SmallArena::realloc(GCAllocation* al, size_t bytes) {
Block* b = Block::forPointer(al);
return rtn->data;
size_t size = b->size;
if (size >= bytes && size < bytes * 2)
return al;
GCAllocation* rtn = heap->alloc(bytes);
#ifndef NVALGRIND
VALGRIND_DISABLE_ERROR_REPORTING;
memcpy(rtn, al, std::min(bytes, size));
VALGRIND_ENABLE_ERROR_REPORTING;
#else
memcpy(rtn, al, std::min(bytes, size));
#endif
free(al);
return rtn;
}
// Releases a small-arena allocation by setting the free bit of its first atom
// in the owning block's bitmap. The block is located from the pointer itself
// via Block::forPointer.
void SmallArena::free(GCAllocation* alloc) {
    Block* block = Block::forPointer(alloc);

    int byte_offset = (char*)alloc - (char*)block;
    // The pointer must sit on an object boundary for this block's object size.
    assert(byte_offset % block->size == 0);

    int first_atom = byte_offset / ATOM_SIZE;
    assert(!block->isfree.isSet(first_atom));
    block->isfree.set(first_atom);

#ifndef NVALGRIND
// VALGRIND_MEMPOOL_FREE(b, ptr);
#endif
}
// Maps a pointer into a small-arena block (possibly an interior pointer) back
// to the allocation that contains it.
//
// Returns NULL if the pointer falls outside the block's valid object index
// range [minObjIndex(), numObjects()) or if the object slot is currently free.
GCAllocation* SmallArena::allocationFrom(void* ptr) {
    Block* block = Block::forPointer(ptr);
    size_t obj_size = block->size;
    int byte_offset = (char*)ptr - (char*)block;
    int obj_idx = byte_offset / obj_size;

    bool in_range = obj_idx >= block->minObjIndex() && obj_idx < block->numObjects();
    if (!in_range)
        return NULL;

    int atom_idx = obj_idx * block->atomsPerObj();
    if (block->isfree.isSet(atom_idx))
        return NULL;

    return reinterpret_cast<GCAllocation*>(&block->atoms[atom_idx]);
}
static Block* alloc_block(uint64_t size, Block** prev) {
Block* rtn = (Block*)small_arena.doMmap(sizeof(Block));
// Sweep phase for the small arena: runs _freeChain over every block list this
// arena knows about (each thread cache's free/full lists, then the arena-wide
// heads/full_heads), and afterwards re-links every block from a "full" list
// onto the end of the corresponding "free" list.
void SmallArena::freeUnmarked() {
thread_caches.forEachValue([this](ThreadBlockCache* cache) {
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block* h = cache->cache_free_heads[bidx];
// Try to limit the amount of unused memory a thread can hold onto;
// currently pretty dumb, just limit the number of blocks in the free-list
// to 50. (blocks in the full list don't need to be limited, since we're sure
// that the thread had just actively used those.)
// Eventually may want to come up with some scrounging system.
// TODO does this thread locality even help at all?
// Walk up to 50 links down the thread's free list; h ends up NULL if the
// list has 50 or fewer blocks.
for (int i = 0; i < 50; i++) {
if (h)
h = h->next;
else
break;
}
// NOTE(review): only the single block reached after 50 hops is unlinked and
// moved to the arena-wide list; any blocks after it stay on the thread's
// list -- confirm this is the intended "limit".
if (h) {
removeFromLLAndNull(h);
insertIntoLL(&heads[bidx], h);
}
// _freeChain returns the address of the last `next` slot in the chain, i.e.
// the insertion point for appending to the free list.
Block** chain_end = _freeChain(&cache->cache_free_heads[bidx]);
_freeChain(&cache->cache_full_heads[bidx]);
// Drain the full list into the tail of the free list (presumably because the
// sweep may have opened up slots in those blocks).
while (Block* b = cache->cache_full_heads[bidx]) {
removeFromLLAndNull(b);
insertIntoLL(chain_end, b);
}
}
});
// Same sweep-and-merge for the arena-wide lists.
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block** chain_end = _freeChain(&heads[bidx]);
_freeChain(&full_heads[bidx]);
while (Block* b = full_heads[bidx]) {
removeFromLLAndNull(b);
insertIntoLL(chain_end, b);
}
}
}
// TODO: copy-pasted from freeUnmarked()
// Accumulates allocation statistics for every live object in the small arena
// into `stats`.
//
// Visits the same four families of block lists that freeUnmarked sweeps: each
// thread cache's free and full lists, followed by the arena-wide free and full
// lists. Each list is handed to _getChainStatistics.
void SmallArena::getStatistics(HeapStatistics* stats) {
    thread_caches.forEachValue([this, stats](ThreadBlockCache* cache) {
        for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
            _getChainStatistics(stats, &cache->cache_free_heads[bidx]);
            _getChainStatistics(stats, &cache->cache_full_heads[bidx]);
        }
    });

    for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
        _getChainStatistics(stats, &heads[bidx]);
        _getChainStatistics(stats, &full_heads[bidx]);
    }
}
// Sweeps one chain of blocks starting at `*head`.
//
// For every object slot that is in use: a marked object survives and has its
// mark cleared; an unmarked one is passed to _doFree and its slot is flagged
// free in the block's bitmap. Blocks themselves are never unlinked here.
//
// Returns the address of the final `next` pointer in the chain (or `head`
// itself if the chain is empty), which callers use as an append position.
SmallArena::Block** SmallArena::_freeChain(Block** head) {
    for (Block* block = *head; block; block = *head) {
        int obj_limit = block->numObjects();
        int obj_start = block->minObjIndex();
        int atoms_each = block->atomsPerObj();

        for (int obj_idx = obj_start; obj_idx < obj_limit; obj_idx++) {
            int atom_idx = obj_idx * atoms_each;

            if (!block->isfree.isSet(atom_idx)) {
                void* slot = &block->atoms[atom_idx];
                GCAllocation* al = reinterpret_cast<GCAllocation*>(slot);

                if (!isMarked(al)) {
                    _doFree(al);

                    // assert(p != (void*)0x127000d960); // the main module
                    block->isfree.set(atom_idx);
                } else {
                    clearMark(al);
                }
            }
        }

        head = &block->next;
    }
    return head;
}
SmallArena::Block* SmallArena::_allocBlock(uint64_t size, Block** prev) {
Block* rtn = (Block*)doMmap(sizeof(Block));
assert(rtn);
rtn->size = size;
rtn->num_obj = BLOCK_SIZE / size;
......@@ -165,45 +420,23 @@ static Block* alloc_block(uint64_t size, Block** prev) {
return rtn;
}
static void insertIntoLL(Block** next_pointer, Block* next) {
assert(next_pointer);
assert(next);
assert(!next->next);
assert(!next->prev);
next->next = *next_pointer;
if (next->next)
next->next->prev = &next->next;
*next_pointer = next;
next->prev = next_pointer;
}
static void removeFromLL(Block* b) {
if (b->next)
b->next->prev = b->prev;
*b->prev = b->next;
b->next = NULL;
b->prev = NULL;
}
Heap::ThreadBlockCache::~ThreadBlockCache() {
SmallArena::ThreadBlockCache::~ThreadBlockCache() {
LOCK_REGION(heap->lock);
for (int i = 0; i < NUM_BUCKETS; i++) {
while (Block* b = cache_free_heads[i]) {
removeFromLL(b);
insertIntoLL(&heap->heads[i], b);
removeFromLLAndNull(b);
insertIntoLL(&small->heads[i], b);
}
while (Block* b = cache_full_heads[i]) {
removeFromLL(b);
insertIntoLL(&heap->full_heads[i], b);
removeFromLLAndNull(b);
insertIntoLL(&small->full_heads[i], b);
}
}
}
static GCAllocation* allocFromBlock(Block* b) {
GCAllocation* SmallArena::_allocFromBlock(Block* b) {
int idx = b->isfree.scanForNext(b->next_to_check);
if (idx == -1)
return NULL;
......@@ -212,19 +445,17 @@ static GCAllocation* allocFromBlock(Block* b) {
return reinterpret_cast<GCAllocation*>(rtn);
}
static Block* claimBlock(size_t rounded_size, Block** free_head) {
SmallArena::Block* SmallArena::_claimBlock(size_t rounded_size, Block** free_head) {
Block* free_block = *free_head;
if (free_block) {
removeFromLL(free_block);
removeFromLLAndNull(free_block);
return free_block;
}
return alloc_block(rounded_size, NULL);
return _allocBlock(rounded_size, NULL);
}
GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
registerGCManagedBytes(rounded_size);
GCAllocation* SmallArena::_alloc(size_t rounded_size, int bucket_idx) {
Block** free_head = &heads[bucket_idx];
Block** full_head = &full_heads[bucket_idx];
......@@ -241,11 +472,11 @@ GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
while (true) {
while (Block* cache_block = *cache_head) {
GCAllocation* rtn = allocFromBlock(cache_block);
GCAllocation* rtn = _allocFromBlock(cache_block);
if (rtn)
return rtn;
removeFromLL(cache_block);
removeFromLLAndNull(cache_block);
insertIntoLL(&cache->cache_full_heads[bucket_idx], cache_block);
}
......@@ -253,12 +484,12 @@ GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
// static StatCounter sc_fallback("gc_allocs_cachemiss");
// sc_fallback.log();
LOCK_REGION(lock);
LOCK_REGION(heap->lock);
assert(*cache_head == NULL);
// should probably be called allocBlock:
Block* myblock = claimBlock(rounded_size, &heads[bucket_idx]);
Block* myblock = _claimBlock(rounded_size, &heads[bucket_idx]);
assert(myblock);
assert(!myblock->next);
assert(!myblock->prev);
......@@ -269,322 +500,290 @@ GCAllocation* Heap::allocSmall(size_t rounded_size, int bucket_idx) {
}
}
void _freeFrom(GCAllocation* alloc, Block* b) {
assert(b == Block::forPointer(alloc));
size_t size = b->size;
int offset = (char*)alloc - (char*)b;
assert(offset % size == 0);
int atom_idx = offset / ATOM_SIZE;
assert(!b->isfree.isSet(atom_idx));
b->isfree.toggle(atom_idx);
#ifndef NVALGRIND
// VALGRIND_MEMPOOL_FREE(b, ptr);
#endif
}
// TODO: copy-pasted from _freeChain
void SmallArena::_getChainStatistics(HeapStatistics* stats, Block** head) {
while (Block* b = *head) {
int num_objects = b->numObjects();
int first_obj = b->minObjIndex();
int atoms_per_obj = b->atomsPerObj();
static void _freeLargeObj(LargeObj* lobj) {
*lobj->prev = lobj->next;
if (lobj->next)
lobj->next->prev = lobj->prev;
for (int obj_idx = first_obj; obj_idx < num_objects; obj_idx++) {
int atom_idx = obj_idx * atoms_per_obj;
int r = munmap(lobj, lobj->mmap_size());
assert(r == 0);
}
if (b->isfree.isSet(atom_idx))
continue;
static void _doFree(GCAllocation* al) {
if (VERBOSITY() >= 2)
printf("Freeing %p\n", al->user_data);
void* p = &b->atoms[atom_idx];
GCAllocation* al = reinterpret_cast<GCAllocation*>(p);
if (al->kind_id == GCKind::PYTHON) {
Box* b = (Box*)al->user_data;
addStatistic(stats, al, b->size);
}
ASSERT(b->cls->tp_dealloc == NULL, "%s", getTypeName(b));
if (b->cls->simple_destructor)
b->cls->simple_destructor(b);
head = &b->next;
}
}
void Heap::free(GCAllocation* al) {
_doFree(al);
//////
/// Large Arena
if (large_arena.contains(al)) {
LargeObj* lobj = LargeObj::fromAllocation(al);
_freeLargeObj(lobj);
return;
}
#define LARGE_BLOCK_NUM_CHUNKS ((BLOCK_SIZE >> CHUNK_BITS) - 1)
assert(small_arena.contains(al));
Block* b = Block::forPointer(al);
_freeFrom(al, b);
}
#define LARGE_BLOCK_FOR_OBJ(obj) ((LargeBlock*)((int64_t)(obj) & ~(int64_t)(BLOCK_SIZE - 1)))
#define LARGE_CHUNK_INDEX(obj, section) (((char*)(obj) - (char*)(section)) >> CHUNK_BITS)
GCAllocation* LargeArena::alloc(size_t size) {
registerGCManagedBytes(size);
GCAllocation* Heap::realloc(GCAllocation* al, size_t bytes) {
if (large_arena.contains(al)) {
LargeObj* lobj = LargeObj::fromAllocation(al);
LOCK_REGION(heap->lock);
int capacity = lobj->capacity();
if (capacity >= bytes && capacity < bytes * 2)
return al;
// printf ("allocLarge %zu\n", size);
GCAllocation* rtn = alloc(bytes);
memcpy(rtn, al, std::min(bytes, lobj->obj_size));
LargeObj* obj = _alloc(size + sizeof(GCAllocation) + sizeof(LargeObj));
_freeLargeObj(lobj);
return rtn;
}
obj->size = size;
assert(small_arena.contains(al));
Block* b = Block::forPointer(al);
nullNextPrev(obj);
insertIntoLL(&head, obj);
size_t size = b->size;
return obj->data;
}
GCAllocation* LargeArena::realloc(GCAllocation* al, size_t bytes) {
LargeObj* obj = LargeObj::fromAllocation(al);
int size = obj->size;
if (size >= bytes && size < bytes * 2)
return al;
GCAllocation* rtn = alloc(bytes);
GCAllocation* rtn = heap->alloc(bytes);
memcpy(rtn, al, std::min(bytes, obj->size));
#ifndef NVALGRIND
VALGRIND_DISABLE_ERROR_REPORTING;
memcpy(rtn, al, std::min(bytes, size));
VALGRIND_ENABLE_ERROR_REPORTING;
#else
memcpy(rtn, al, std::min(bytes, size));
#endif
_freeFrom(al, b);
_freeLargeObj(obj);
return rtn;
}
GCAllocation* Heap::getAllocationFromInteriorPointer(void* ptr) {
if (large_arena.contains(ptr)) {
LargeObj* cur = large_head;
while (cur) {
if (ptr >= cur && ptr < &cur->data[cur->obj_size])
return &cur->data[0];
cur = cur->next;
// Frees a large-arena allocation: recovers the LargeObj header that precedes
// the GCAllocation and hands it to _freeLargeObj.
void LargeArena::free(GCAllocation* al) {
    LargeObj* obj = LargeObj::fromAllocation(al);
    _freeLargeObj(obj);
}
GCAllocation* LargeArena::allocationFrom(void* ptr) {
LargeObj* obj = NULL;
for (obj = head; obj; obj = obj->next) {
char* end = (char*)&obj->data + obj->size;
if (ptr >= obj->data && ptr < end) {
return &obj->data[0];
}
return NULL;
}
if (!small_arena.contains(ptr))
return NULL;
}
Block* b = Block::forPointer(ptr);
size_t size = b->size;
int offset = (char*)ptr - (char*)b;
int obj_idx = offset / size;
// Sweep phase for the large arena: sweepList walks the object list starting at
// `head`, and every unmarked object's node is released via _freeLargeObj.
void LargeArena::freeUnmarked() {
    auto release = [this](LargeObj* dead) { _freeLargeObj(dead); };
    sweepList(head, release);
}
if (obj_idx < b->minObjIndex() || obj_idx >= b->numObjects())
return NULL;
// Adds every object on the large-object list to `stats`, using the object's
// recorded `size` as its byte count.
void LargeArena::getStatistics(HeapStatistics* stats) {
    for (LargeObj* obj = head; obj; obj = obj->next)
        addStatistic(stats, obj->data, obj->size);
}
int atom_idx = obj_idx * b->atomsPerObj();
if (b->isfree.isSet(atom_idx))
return NULL;
void LargeArena::add_free_chunk(LargeFreeChunk* free_chunks, size_t size) {
size_t num_chunks = size >> CHUNK_BITS;
return reinterpret_cast<GCAllocation*>(&b->atoms[atom_idx]);
free_chunks->size = size;
if (num_chunks >= NUM_FREE_LISTS)
num_chunks = 0;
free_chunks->next_size = free_lists[num_chunks];
free_lists[num_chunks] = free_chunks;
}
static Block** freeChain(Block** head) {
while (Block* b = *head) {
int num_objects = b->numObjects();
int first_obj = b->minObjIndex();
int atoms_per_obj = b->atomsPerObj();
LargeArena::LargeFreeChunk* LargeArena::get_from_size_list(LargeFreeChunk** list, size_t size) {
LargeFreeChunk* free_chunks = NULL;
LargeBlock* section;
size_t i, num_chunks, start_index;
for (int obj_idx = first_obj; obj_idx < num_objects; obj_idx++) {
int atom_idx = obj_idx * atoms_per_obj;
assert((size & (CHUNK_SIZE - 1)) == 0);
if (b->isfree.isSet(atom_idx))
continue;
while (*list) {
free_chunks = *list;
if (free_chunks->size >= size)
break;
list = &(*list)->next_size;
}
void* p = &b->atoms[atom_idx];
GCAllocation* al = reinterpret_cast<GCAllocation*>(p);
if (!*list)
return NULL;
if (isMarked(al)) {
clearMark(al);
} else {
_doFree(al);
*list = free_chunks->next_size;
// assert(p != (void*)0x127000d960); // the main module
b->isfree.set(atom_idx);
}
}
if (free_chunks->size > size)
add_free_chunk((LargeFreeChunk*)((char*)free_chunks + size), free_chunks->size - size);
head = &b->next;
num_chunks = size >> CHUNK_BITS;
section = LARGE_BLOCK_FOR_OBJ(free_chunks);
start_index = LARGE_CHUNK_INDEX(free_chunks, section);
for (i = start_index; i < start_index + num_chunks; ++i) {
assert(section->free_chunk_map[i]);
section->free_chunk_map[i] = 0;
}
return head;
section->num_free_chunks -= size >> CHUNK_BITS;
assert(section->num_free_chunks >= 0);
return free_chunks;
}
void Heap::freeUnmarked() {
thread_caches.forEachValue([this](ThreadBlockCache* cache) {
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block* h = cache->cache_free_heads[bidx];
// Try to limit the amount of unused memory a thread can hold onto;
// currently pretty dumb, just limit the number of blocks in the free-list
// to 50. (blocks in the full list don't need to be limited, since we're sure
// that the thread had just actively used those.)
// Eventually may want to come up with some scrounging system.
// TODO does this thread locality even help at all?
for (int i = 0; i < 50; i++) {
if (h)
h = h->next;
else
LargeArena::LargeObj* LargeArena::_alloc(size_t size) {
LargeBlock* section;
LargeFreeChunk* free_chunks;
size_t num_chunks;
size += CHUNK_SIZE - 1;
size &= ~(CHUNK_SIZE - 1);
num_chunks = size >> CHUNK_BITS;
assert(size > 0 && size - sizeof(LargeObj) <= ALLOC_SIZE_LIMIT);
assert(num_chunks > 0);
retry:
if (num_chunks >= NUM_FREE_LISTS) {
free_chunks = get_from_size_list(&free_lists[0], size);
} else {
size_t i;
for (i = num_chunks; i < NUM_FREE_LISTS; ++i) {
free_chunks = get_from_size_list(&free_lists[i], size);
if (free_chunks)
break;
}
if (h) {
removeFromLL(h);
insertIntoLL(&heads[bidx], h);
if (!free_chunks)
free_chunks = get_from_size_list(&free_lists[0], size);
}
Block** chain_end = freeChain(&cache->cache_free_heads[bidx]);
freeChain(&cache->cache_full_heads[bidx]);
if (free_chunks)
return (LargeObj*)free_chunks;
while (Block* b = cache->cache_full_heads[bidx]) {
removeFromLL(b);
insertIntoLL(chain_end, b);
}
}
});
section = (LargeBlock*)doMmap(BLOCK_SIZE);
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block** chain_end = freeChain(&heads[bidx]);
freeChain(&full_heads[bidx]);
if (!section)
return NULL;
while (Block* b = full_heads[bidx]) {
removeFromLL(b);
insertIntoLL(chain_end, b);
}
}
free_chunks = (LargeFreeChunk*)((char*)section + CHUNK_SIZE);
free_chunks->size = BLOCK_SIZE - CHUNK_SIZE;
free_chunks->next_size = free_lists[0];
free_lists[0] = free_chunks;
LargeObj* cur = large_head;
while (cur) {
GCAllocation* al = cur->data;
if (isMarked(al)) {
clearMark(al);
} else {
_doFree(al);
section->num_free_chunks = LARGE_BLOCK_NUM_CHUNKS;
*cur->prev = cur->next;
if (cur->next)
cur->next->prev = cur->prev;
section->free_chunk_map = (unsigned char*)section + sizeof(LargeBlock);
assert(sizeof(LargeBlock) + LARGE_BLOCK_NUM_CHUNKS + 1 <= CHUNK_SIZE);
section->free_chunk_map[0] = 0;
memset(section->free_chunk_map + 1, 1, LARGE_BLOCK_NUM_CHUNKS);
LargeObj* to_free = cur;
cur = cur->next;
_freeLargeObj(to_free);
continue;
}
section->next = blocks;
blocks = section;
cur = cur->next;
}
goto retry;
}
void dumpHeapStatistics() {
global_heap.dumpHeapStatistics();
}
void LargeArena::_freeLargeObj(LargeObj* obj) {
removeFromLL(obj);
struct HeapStatistics {
struct TypeStats {
int64_t nallocs;
int64_t nbytes;
TypeStats() : nallocs(0), nbytes(0) {}
size_t size = obj->size;
LargeBlock* section = LARGE_BLOCK_FOR_OBJ(obj);
size_t num_chunks, i, start_index;
void print(const char* name) const {
if (nbytes > (1 << 20))
printf("%s: %ld allocations for %.1f MB\n", name, nallocs, nbytes * 1.0 / (1 << 20));
else if (nbytes > (1 << 10))
printf("%s: %ld allocations for %.1f KB\n", name, nallocs, nbytes * 1.0 / (1 << 10));
else
printf("%s: %ld allocations for %ld bytes\n", name, nallocs, nbytes);
}
};
std::unordered_map<BoxedClass*, TypeStats> by_cls;
TypeStats conservative, untracked;
TypeStats total;
};
size += CHUNK_SIZE - 1;
size &= ~(CHUNK_SIZE - 1);
void addStatistic(HeapStatistics* stats, GCAllocation* al, int nbytes) {
stats->total.nallocs++;
stats->total.nbytes += nbytes;
num_chunks = size >> CHUNK_BITS;
if (al->kind_id == GCKind::PYTHON) {
Box* b = (Box*)al->user_data;
auto& t = stats->by_cls[b->cls];
assert(size > 0 && size - sizeof(LargeObj) <= ALLOC_SIZE_LIMIT);
assert(num_chunks > 0);
t.nallocs++;
t.nbytes += nbytes;
} else if (al->kind_id == GCKind::CONSERVATIVE) {
stats->conservative.nallocs++;
stats->conservative.nbytes += nbytes;
} else if (al->kind_id == GCKind::UNTRACKED) {
stats->untracked.nallocs++;
stats->untracked.nbytes += nbytes;
} else {
RELEASE_ASSERT(0, "%d", (int)al->kind_id);
section->num_free_chunks += num_chunks;
assert(section->num_free_chunks <= LARGE_BLOCK_NUM_CHUNKS);
/*
* We could free the LOS section here if it's empty, but we
* can't unless we also remove its free chunks from the fast
* free lists. Instead, we do it in los_sweep().
*/
start_index = LARGE_CHUNK_INDEX(obj, section);
for (i = start_index; i < start_index + num_chunks; ++i) {
assert(!section->free_chunk_map[i]);
section->free_chunk_map[i] = 1;
}
add_free_chunk((LargeFreeChunk*)obj, size);
}
// TODO: copy-pasted from freeChain
void getChainStatistics(HeapStatistics* stats, Block** head) {
while (Block* b = *head) {
int num_objects = b->numObjects();
int first_obj = b->minObjIndex();
int atoms_per_obj = b->atomsPerObj();
//////
/// Huge Arena
for (int obj_idx = first_obj; obj_idx < num_objects; obj_idx++) {
int atom_idx = obj_idx * atoms_per_obj;
if (b->isfree.isSet(atom_idx))
continue;
GCAllocation* HugeArena::alloc(size_t size) {
registerGCManagedBytes(size);
void* p = &b->atoms[atom_idx];
GCAllocation* al = reinterpret_cast<GCAllocation*>(p);
LOCK_REGION(heap->lock);
addStatistic(stats, al, b->size);
}
size_t total_size = size + sizeof(HugeObj);
total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
HugeObj* rtn = (HugeObj*)doMmap(total_size);
rtn->obj_size = size;
head = &b->next;
}
nullNextPrev(rtn);
insertIntoLL(&head, rtn);
return rtn->data;
}
// TODO: copy-pasted from freeUnmarked()
void Heap::dumpHeapStatistics() {
threading::GLPromoteRegion _lock;
GCAllocation* HugeArena::realloc(GCAllocation* al, size_t bytes) {
HugeObj* obj = HugeObj::fromAllocation(al);
HeapStatistics stats;
int capacity = obj->capacity();
if (capacity >= bytes && capacity < bytes * 2)
return al;
thread_caches.forEachValue([this, &stats](ThreadBlockCache* cache) {
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
Block* h = cache->cache_free_heads[bidx];
GCAllocation* rtn = heap->alloc(bytes);
memcpy(rtn, al, std::min(bytes, obj->obj_size));
getChainStatistics(&stats, &cache->cache_free_heads[bidx]);
getChainStatistics(&stats, &cache->cache_full_heads[bidx]);
}
});
_freeHugeObj(obj);
return rtn;
}
for (int bidx = 0; bidx < NUM_BUCKETS; bidx++) {
getChainStatistics(&stats, &heads[bidx]);
getChainStatistics(&stats, &full_heads[bidx]);
}
// Frees a huge-arena allocation: recovers the HugeObj header for `al` and
// hands it to _freeHugeObj.
void HugeArena::free(GCAllocation* al) {
    HugeObj* obj = HugeObj::fromAllocation(al);
    _freeHugeObj(obj);
}
LargeObj* cur = large_head;
GCAllocation* HugeArena::allocationFrom(void* ptr) {
HugeObj* cur = head;
while (cur) {
GCAllocation* al = cur->data;
addStatistic(&stats, al, cur->capacity());
if (ptr >= cur && ptr < &cur->data[cur->obj_size])
return &cur->data[0];
cur = cur->next;
}
return NULL;
}
stats.conservative.print("conservative");
stats.untracked.print("untracked");
for (const auto& p : stats.by_cls) {
p.second.print(getFullNameOfClass(p.first).c_str());
}
stats.total.print("Total");
printf("\n");
// Sweep phase for the huge arena: sweepList walks the object list starting at
// `head`, and every unmarked object's node is released via _freeHugeObj.
void HugeArena::freeUnmarked() {
    auto release = [this](HugeObj* dead) { _freeHugeObj(dead); };
    sweepList(head, release);
}
// Adds every object on the huge-object list to `stats`, counting each one at
// its capacity() rather than its requested size.
void HugeArena::getStatistics(HeapStatistics* stats) {
    for (HugeObj* obj = head; obj; obj = obj->next)
        addStatistic(stats, obj->data, obj->capacity());
}
// Unlinks a huge object from the arena's list and returns its whole mapping
// (mmap_size() bytes starting at the header) to the OS. The object must not be
// touched after this call.
void HugeArena::_freeHugeObj(HugeObj* lobj) {
    removeFromLL(lobj);

    int unmap_rc = munmap(lobj, lobj->mmap_size());
    assert(unmap_rc == 0);
}
} // namespace gc
} // namespace pyston
......@@ -17,6 +17,7 @@
#include <cstddef>
#include <cstdint>
#include <sys/mman.h>
#include "core/common.h"
#include "core/threading.h"
......@@ -24,6 +25,9 @@
namespace pyston {
namespace gc {
class Heap;
struct HeapStatistics;
typedef uint8_t kindid_t;
struct GCAllocation {
unsigned int gc_flags : 8;
......@@ -59,14 +63,72 @@ inline void clearMark(GCAllocation* header) {
#undef MARK_BIT
#define PAGE_SIZE 4096
template <int N> class Bitmap {
static_assert(N % 64 == 0, "");
// Base class for the GC arenas. Each arena owns the fixed virtual address
// range [arena_start, arena_start + arena_size) and grows by mapping pages at
// the current high-water mark, so arena membership of a pointer can be decided
// with a simple range check.
template <uintptr_t arena_start, uintptr_t arena_size> class Arena {
private:
    void* cur; // high-water mark: [arena_start, cur) has been mapped
    void* end; // one past the last address this arena may use

protected:
    Arena() : cur((void*)arena_start), end((void*)(arena_start + arena_size)) {}

public:
    // Maps `size` bytes (a multiple of PAGE_SIZE) of fresh anonymous
    // read/write memory at the current high-water mark and advances the mark.
    // Returns the start of the new mapping.
    void* doMmap(size_t size) {
        assert(size % PAGE_SIZE == 0);

        // `<=`: a mapping that ends exactly at `end` still fits in the arena.
        assert(((uint8_t*)cur + size) <= end && "arena full");

        void* mrtn = mmap(cur, size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        // mmap reports failure with MAP_FAILED, not NULL.
        assert(mrtn != MAP_FAILED && "failed to allocate memory from OS");
        ASSERT(mrtn == cur, "%p %p\n", mrtn, cur);
        cur = (uint8_t*)cur + size;
        return mrtn;
    }

    // True if `addr` lies in the part of the arena that has been mapped so far.
    bool contains(void* addr) const { return (void*)arena_start <= addr && addr < cur; }
};
constexpr uintptr_t ARENA_SIZE = 0x1000000000L;
constexpr uintptr_t SMALL_ARENA_START = 0x1270000000L;
constexpr uintptr_t LARGE_ARENA_START = 0x2270000000L;
constexpr uintptr_t HUGE_ARENA_START = 0x3270000000L;
//
// The SmallArena allocates objects <= 3584 bytes.
//
// it uses segregated-fit allocation, and each block contains a free
// bitmap for objects of a given size (constant for the block)
//
static const size_t sizes[] = {
16, 32, 48, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384,
448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 2560, 3072, 3584, // 4096,
};
static constexpr size_t NUM_BUCKETS = sizeof(sizes) / sizeof(sizes[0]);
class SmallArena : public Arena<SMALL_ARENA_START, ARENA_SIZE> {
public:
SmallArena(Heap* heap) : Arena(), heap(heap), thread_caches(heap, this) {}
GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes);
GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
void free(GCAllocation* al);
GCAllocation* allocationFrom(void* ptr);
void freeUnmarked();
void getStatistics(HeapStatistics* stats);
private:
template <int N> class Bitmap {
static_assert(N % 64 == 0, "");
private:
uint64_t data[N / 64];
public:
public:
void setAllZero() { memset(data, 0, sizeof(data)); }
struct Scanner {
......@@ -113,25 +175,26 @@ public:
int idx = first + i * 64;
return idx;
}
};
};
#define BLOCK_SIZE (4 * 4096)
static constexpr size_t BLOCK_SIZE = 4 * 4096;
#define ATOM_SIZE 16
static_assert(BLOCK_SIZE % ATOM_SIZE == 0, "");
static_assert(BLOCK_SIZE % ATOM_SIZE == 0, "");
#define ATOMS_PER_BLOCK (BLOCK_SIZE / ATOM_SIZE)
static_assert(ATOMS_PER_BLOCK % 64 == 0, "");
static_assert(ATOMS_PER_BLOCK % 64 == 0, "");
#define BITFIELD_SIZE (ATOMS_PER_BLOCK / 8)
#define BITFIELD_ELTS (BITFIELD_SIZE / 8)
#define BLOCK_HEADER_SIZE (BITFIELD_SIZE + 4 * sizeof(void*))
#define BLOCK_HEADER_ATOMS ((BLOCK_HEADER_SIZE + ATOM_SIZE - 1) / ATOM_SIZE)
struct Atoms {
struct Atoms {
char _data[ATOM_SIZE];
};
};
struct Block {
struct Block {
union {
struct {
Block* next, **prev;
......@@ -153,79 +216,241 @@ struct Block {
inline int atomsPerObj() const { return atoms_per_obj; }
static Block* forPointer(void* ptr) { return (Block*)((uintptr_t)ptr & ~(BLOCK_SIZE - 1)); }
};
static_assert(sizeof(Block) == BLOCK_SIZE, "bad size");
static_assert(offsetof(Block, _header_end) >= BLOCK_HEADER_SIZE, "bad header size");
static_assert(offsetof(Block, _header_end) <= BLOCK_HEADER_SIZE, "bad header size");
constexpr const size_t sizes[] = {
16, 32, 48, 64, 80, 96, 112, 128, 160, 192, 224, 256,
320, 384, 448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048,
// 2560, 3072, 3584, // 4096,
};
#define NUM_BUCKETS (sizeof(sizes) / sizeof(sizes[0]))
struct LargeObj;
class Heap {
private:
Block* heads[NUM_BUCKETS];
Block* full_heads[NUM_BUCKETS];
LargeObj* large_head = NULL;
GCAllocation* __attribute__((__malloc__)) allocSmall(size_t rounded_size, int bucket_idx);
GCAllocation* __attribute__((__malloc__)) allocLarge(size_t bytes);
};
static_assert(sizeof(Block) == BLOCK_SIZE, "bad size");
static_assert(offsetof(Block, _header_end) >= BLOCK_HEADER_SIZE, "bad header size");
static_assert(offsetof(Block, _header_end) <= BLOCK_HEADER_SIZE, "bad header size");
// DS_DEFINE_MUTEX(lock);
DS_DEFINE_SPINLOCK(lock);
struct ThreadBlockCache {
Heap* heap;
SmallArena* small;
Block* cache_free_heads[NUM_BUCKETS];
Block* cache_full_heads[NUM_BUCKETS];
ThreadBlockCache(Heap* heap) : heap(heap) {
ThreadBlockCache(Heap* heap, SmallArena* small) : heap(heap), small(small) {
memset(cache_free_heads, 0, sizeof(cache_free_heads));
memset(cache_full_heads, 0, sizeof(cache_full_heads));
}
~ThreadBlockCache();
};
// Per-bucket list heads, one slot per entry of sizes[].
Block* heads[NUM_BUCKETS];
Block* full_heads[NUM_BUCKETS];

// ThreadBlockCache needs access to the private members declared here.
friend struct ThreadBlockCache;

Heap* heap;
// TODO only use thread caches if we're in GRWL mode?
// Set of per-thread ThreadBlockCache objects. The extra template arguments
// (Heap*, SmallArena*) line up with ThreadBlockCache's two-argument constructor
// (presumably they are forwarded to it -- confirm in threading::PerThreadSet).
threading::PerThreadSet<ThreadBlockCache, Heap*, SmallArena*> thread_caches;

// Internal helpers; the bodies are defined elsewhere.
Block* _allocBlock(uint64_t size, Block** prev);
GCAllocation* _allocFromBlock(Block* b);
Block* _claimBlock(size_t rounded_size, Block** free_head);
Block** _freeChain(Block** head);
void _getChainStatistics(HeapStatistics* stats, Block** head);
GCAllocation* __attribute__((__malloc__)) _alloc(size_t bytes, int bucket_idx);
};
//
// The LargeArena allocates objects where sizes[NUM_BUCKETS - 1] < size <= 1024*1024 - CHUNK_SIZE - sizeof(LargeObj) bytes.
//
// it maintains a set of size-segregated free lists, and a special
// free list for larger objects. If the free list specific to a given
// size has no entries, we search the large free list.
//
// Blocks of 1meg are mmap'ed individually, and carved up as needed.
//
class LargeArena : public Arena<LARGE_ARENA_START, ARENA_SIZE> {
private:
    // Bookkeeping record for one block; records are chained through `next`.
    struct LargeBlock {
        LargeBlock* next;
        size_t num_free_chunks;
        unsigned char* free_chunk_map;
    };

    // Entry of a free list; entries are chained through `next_size`.
    struct LargeFreeChunk {
        LargeFreeChunk* next_size;
        size_t size;
    };

    // Header placed directly in front of every allocation handed out by this arena.
    struct LargeObj {
        LargeObj* next, **prev;
        size_t size;
        GCAllocation data[0];

        // Recovers the header from a pointer to its trailing `data` member.
        static LargeObj* fromAllocation(GCAllocation* alloc) {
            char* rtn = (char*)alloc - offsetof(LargeObj, data);
            return reinterpret_cast<LargeObj*>(rtn);
        }
    };

    /*
     * This shouldn't be much smaller or larger than the largest small size bucket.
     * Must be at least sizeof (LargeBlock).
     */
    static constexpr size_t CHUNK_SIZE = 4096;
    // log2(CHUNK_SIZE)
    static constexpr int CHUNK_BITS = 12;

    static_assert(CHUNK_SIZE > sizeof(LargeBlock), "bad large block size");

    // Size in bytes of one block (1 MiB).
    static constexpr int BLOCK_SIZE = 1024 * 1024;

    static constexpr int NUM_FREE_LISTS = 32;

    Heap* heap;
    LargeObj* head;
    LargeBlock* blocks;
    LargeFreeChunk* free_lists[NUM_FREE_LISTS]; /* 0 is for larger sizes */

    // Internal helpers; the bodies are defined elsewhere.
    void add_free_chunk(LargeFreeChunk* free_chunks, size_t size);
    LargeFreeChunk* get_from_size_list(LargeFreeChunk** list, size_t size);
    LargeObj* _alloc(size_t size);
    void _freeLargeObj(LargeObj* obj);

public:
    // Starts with no objects, no blocks, and every free list empty. The free
    // lists are value-initialised explicitly so that a LargeArena that is not in
    // zero-initialised static storage never starts with indeterminate list heads.
    LargeArena(Heap* heap) : heap(heap), head(NULL), blocks(NULL), free_lists() {}

    /* Largest object that can be allocated in a large block. */
    static constexpr size_t ALLOC_SIZE_LIMIT = BLOCK_SIZE - CHUNK_SIZE - sizeof(LargeObj);

    GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes);
    GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
    void free(GCAllocation* alloc);

    GCAllocation* allocationFrom(void* ptr);
    void freeUnmarked();

    void getStatistics(HeapStatistics* stats);
};
// The HugeArena allocates objects where size > LargeArena::ALLOC_SIZE_LIMIT (slightly under 1024*1024 bytes).
//
// Objects are allocated with individual mmap() calls, and kept in a
// linked list. They are not reused.
class HugeArena : public Arena<HUGE_ARENA_START, ARENA_SIZE> {
public:
    // Starts with an empty object list. `head` is set explicitly (as LargeArena
    // does for its own list head) so the list is well-defined even when the
    // arena does not live in zero-initialised static storage.
    HugeArena(Heap* heap) : head(NULL), heap(heap) {}

    GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes);
    GCAllocation* realloc(GCAllocation* alloc, size_t bytes);
    void free(GCAllocation* alloc);

    GCAllocation* allocationFrom(void* ptr);
    void freeUnmarked();

    void getStatistics(HeapStatistics* stats);

private:
    // Header placed directly in front of every allocation handed out by this arena.
    struct HugeObj {
        HugeObj* next, **prev;
        size_t obj_size;
        GCAllocation data[0];

        // Header plus object size, rounded up to a whole number of pages.
        // NOTE(review): the size_t total is narrowed to int on return, so a
        // mapping of 2 GiB or more would not be represented correctly. The
        // return type is kept as-is because the callers are outside this view.
        int mmap_size() {
            size_t total_size = obj_size + sizeof(HugeObj);
            total_size = (total_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
            return total_size;
        }

        // Bytes available after the header within the page-rounded size.
        int capacity() { return mmap_size() - sizeof(HugeObj); }

        // Recovers the header from a pointer to its trailing `data` member;
        // asserts that the header is page-aligned.
        static HugeObj* fromAllocation(GCAllocation* alloc) {
            char* rtn = (char*)alloc - offsetof(HugeObj, data);
            assert((uintptr_t)rtn % PAGE_SIZE == 0);
            return reinterpret_cast<HugeObj*>(rtn);
        }
    };

    void _freeHugeObj(HugeObj* lobj);

    HugeObj* head;

    Heap* heap;
};
class Heap {
private:
SmallArena small_arena;
LargeArena large_arena;
HugeArena huge_arena;
friend class SmallArena;
friend class LargeArena;
friend class HugeArena;
// DS_DEFINE_MUTEX(lock);
DS_DEFINE_SPINLOCK(lock);
public:
Heap() : small_arena(this), large_arena(this), huge_arena(this) {}
GCAllocation* realloc(GCAllocation* alloc, size_t bytes) {
// TODO(toshok): there is duplicate code in each of the
// ::realloc methods to test whether the allocation can be
// reused. Would be nice to factor it all out here into this
// method.
if (large_arena.contains(alloc)) {
return large_arena.realloc(alloc, bytes);
} else if (huge_arena.contains(alloc)) {
return huge_arena.realloc(alloc, bytes);
}
assert(small_arena.contains(alloc));
return small_arena.realloc(alloc, bytes);
}
GCAllocation* __attribute__((__malloc__)) alloc(size_t bytes) {
GCAllocation* rtn;
// assert(bytes >= 16);
if (bytes <= 16)
rtn = allocSmall(16, 0);
else if (bytes <= 32)
rtn = allocSmall(32, 1);
if (bytes > LargeArena::ALLOC_SIZE_LIMIT)
return huge_arena.alloc(bytes);
else if (bytes > sizes[NUM_BUCKETS - 1])
rtn = allocLarge(bytes);
else {
rtn = NULL;
for (int i = 2; i < NUM_BUCKETS; i++) {
if (sizes[i] >= bytes) {
rtn = allocSmall(sizes[i], i);
break;
}
return large_arena.alloc(bytes);
else
return small_arena.alloc(bytes);
}
void destructContents(GCAllocation* alloc);
void free(GCAllocation* alloc) {
destructContents(alloc);
if (large_arena.contains(alloc)) {
large_arena.free(alloc);
return;
}
return rtn;
if (huge_arena.contains(alloc)) {
huge_arena.free(alloc);
return;
}
void free(GCAllocation* alloc);
assert(small_arena.contains(alloc));
small_arena.free(alloc);
}
// not thread safe:
GCAllocation* getAllocationFromInteriorPointer(void* ptr);
GCAllocation* getAllocationFromInteriorPointer(void* ptr) {
if (large_arena.contains(ptr)) {
return large_arena.allocationFrom(ptr);
} else if (huge_arena.contains(ptr)) {
return huge_arena.allocationFrom(ptr);
} else if (small_arena.contains(ptr)) {
return small_arena.allocationFrom(ptr);
}
return NULL;
}
// not thread safe:
void freeUnmarked();
void freeUnmarked() {
small_arena.freeUnmarked();
large_arena.freeUnmarked();
huge_arena.freeUnmarked();
}
void dumpHeapStatistics();
};
......
......@@ -68,6 +68,9 @@ TEST(alloc, alloc64) { testAlloc(64); }
// One test case per allocation size; each one just runs testAlloc() with that size.
TEST(alloc, alloc128) {
    testAlloc(128);
}

// NOTE(review): 258 is not a power of two -- presumably a deliberate odd size
// rather than a typo for 256; confirm.
TEST(alloc, alloc258) {
    testAlloc(258);
}

TEST(alloc, alloc3584) {
    testAlloc(3584);
}

TEST(alloc, alloc4096) {
    testAlloc(4096);
}

TEST(alloc, alloc8192) {
    testAlloc(8192);
}

TEST(alloc, alloc16384) {
    testAlloc(16384);
}
TEST(alloc, largeallocs) {
int s1 = 1 << 20;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment