Commit fb26504d authored by Inada Naoki, committed by GitHub

bpo-37543: optimize pymalloc (#14674)

PyObject_Malloc() and PyObject_Free() partially inline pymalloc_alloc and
pymalloc_free. But when PGO is not used, the compiler does not know which
parts of pymalloc_alloc and pymalloc_free are hot.
parent 7036e1de
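Editor's note: the patch leans on GCC's __builtin_expect() through the UNLIKELY()/LIKELY() macros shown in the first hunk below, so the compiler can lay out the hot path without profile data. Here is a minimal standalone sketch of that idea. The macro definitions match the patch; the checked_alloc() helper and the build setup are invented for illustration only.

/* Branch-hint macros as defined in obmalloc.c; everything else in this
 * file is a made-up example (compile with e.g. gcc -O2 -c hint_demo.c). */
#include <stddef.h>
#include <stdlib.h>

#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
#  define UNLIKELY(value) __builtin_expect((value), 0)
#  define LIKELY(value) __builtin_expect((value), 1)
#else
#  define UNLIKELY(value) (value)
#  define LIKELY(value) (value)
#endif

/* Hypothetical allocator wrapper: the success path is hinted as LIKELY,
 * so the compiler keeps it on the fall-through path and moves the rare
 * error handling out of the way, which is the effect PGO would otherwise
 * provide. */
void *checked_alloc(size_t n)
{
    if (UNLIKELY(n == 0)) {
        return NULL;              /* rare: reject zero-byte requests */
    }
    void *p = malloc(n);
    if (LIKELY(p != NULL)) {
        return p;                 /* hot path: allocation succeeded */
    }
    abort();                      /* cold path: out of memory */
}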
@@ -710,19 +710,21 @@ PyObject_Free(void *ptr)
 }
 
-#ifdef WITH_PYMALLOC
-
-#ifdef WITH_VALGRIND
-#include <valgrind/valgrind.h>
-
 /* If we're using GCC, use __builtin_expect() to reduce overhead of
    the valgrind checks */
 #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
 #  define UNLIKELY(value) __builtin_expect((value), 0)
+#  define LIKELY(value) __builtin_expect((value), 1)
 #else
 #  define UNLIKELY(value) (value)
+#  define LIKELY(value) (value)
 #endif
 
+#ifdef WITH_PYMALLOC
+
+#ifdef WITH_VALGRIND
+#include <valgrind/valgrind.h>
+
 /* -1 indicates that we haven't checked that we're running on valgrind yet. */
 static int running_on_valgrind = -1;
 #endif
@@ -1424,96 +1426,48 @@ address_in_range(void *p, poolp pool)
 /*==========================================================================*/
 
-/* pymalloc allocator
-
-   The basic blocks are ordered by decreasing execution frequency,
-   which minimizes the number of jumps in the most common cases,
-   improves branching prediction and instruction scheduling (small
-   block allocations typically result in a couple of instructions).
-
-   Unless the optimizer reorders everything, being too smart...
-
-   Return 1 if pymalloc allocated memory and wrote the pointer into *ptr_p.
-
-   Return 0 if pymalloc failed to allocate the memory block: on bigger
-   requests, on error in the code below (as a last chance to serve the request)
-   or when the max memory limit has been reached. */
-static int
-pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
-{
-    block *bp;
-    poolp pool;
-    poolp next;
-    uint size;
-
-#ifdef WITH_VALGRIND
-    if (UNLIKELY(running_on_valgrind == -1)) {
-        running_on_valgrind = RUNNING_ON_VALGRIND;
-    }
-    if (UNLIKELY(running_on_valgrind)) {
-        return 0;
-    }
-#endif
-
-    if (nbytes == 0) {
-        return 0;
-    }
-    if (nbytes > SMALL_REQUEST_THRESHOLD) {
-        return 0;
-    }
-
-    /*
-     * Most frequent paths first
-     */
-    size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT;
-    pool = usedpools[size + size];
-    if (pool != pool->nextpool) {
-        /*
-         * There is a used pool for this size class.
-         * Pick up the head block of its free list.
-         */
-        ++pool->ref.count;
-        bp = pool->freeblock;
-        assert(bp != NULL);
-        if ((pool->freeblock = *(block **)bp) != NULL) {
-            goto success;
-        }
-
-        /*
-         * Reached the end of the free list, try to extend it.
-         */
-        if (pool->nextoffset <= pool->maxnextoffset) {
-            /* There is room for another block. */
-            pool->freeblock = (block*)pool +
-                              pool->nextoffset;
-            pool->nextoffset += INDEX2SIZE(size);
-            *(block **)(pool->freeblock) = NULL;
-            goto success;
-        }
-
-        /* Pool is full, unlink from used pools. */
-        next = pool->nextpool;
-        pool = pool->prevpool;
-        next->prevpool = pool;
-        pool->nextpool = next;
-        goto success;
-    }
+// Called when freelist is exhausted.  Extend the freelist if there is
+// space for a block.  Otherwise, remove this pool from usedpools.
+static void
+pymalloc_pool_extend(poolp pool, uint size)
+{
+    if (UNLIKELY(pool->nextoffset <= pool->maxnextoffset)) {
+        /* There is room for another block. */
+        pool->freeblock = (block*)pool + pool->nextoffset;
+        pool->nextoffset += INDEX2SIZE(size);
+        *(block **)(pool->freeblock) = NULL;
+        return;
+    }
+
+    /* Pool is full, unlink from used pools. */
+    poolp next;
+    next = pool->nextpool;
+    pool = pool->prevpool;
+    next->prevpool = pool;
+    pool->nextpool = next;
+}
 
+/* called when pymalloc_alloc can not allocate a block from usedpool.
+ * This function takes new pool and allocate a block from it.
+ */
+static void*
+allocate_from_new_pool(uint size)
+{
     /* There isn't a pool of the right size class immediately
      * available:  use a free pool.
      */
-    if (usable_arenas == NULL) {
+    if (UNLIKELY(usable_arenas == NULL)) {
         /* No arena has a free pool:  allocate a new arena. */
 #ifdef WITH_MEMORY_LIMITS
         if (narenas_currently_allocated >= MAX_ARENAS) {
-            goto failed;
+            return NULL;
         }
 #endif
         usable_arenas = new_arena();
         if (usable_arenas == NULL) {
-            goto failed;
+            return NULL;
        }
-        usable_arenas->nextarena =
-            usable_arenas->prevarena = NULL;
+        usable_arenas->nextarena = usable_arenas->prevarena = NULL;
         assert(nfp2lasta[usable_arenas->nfreepools] == NULL);
         nfp2lasta[usable_arenas->nfreepools] = usable_arenas;
     }
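For scale, each arena is carved into page-sized pools and each pool into fixed-size blocks; pymalloc_pool_extend() above hands out blocks until the pool's nextoffset passes maxnextoffset. A rough standalone calculation follows; ARENA_SIZE and POOL_SIZE mirror the usual pymalloc constants, while the pool-header overhead is an assumed value for a 64-bit build, and none of this code is part of the patch.

/* Back-of-the-envelope pool math (illustration only). */
#include <stdio.h>

#define ARENA_SIZE    (256 << 10)   /* 256 KiB per arena */
#define POOL_SIZE     4096          /* one page per pool */
#define POOL_OVERHEAD 48            /* assumed rounded-up pool_header size */

int main(void)
{
    printf("pools per arena: %d\n", ARENA_SIZE / POOL_SIZE);
    /* For a given block size, a pool can be extended to roughly this many
     * blocks before pymalloc_pool_extend() declares it full: */
    for (int size = 16; size <= 512; size *= 2) {
        printf("block size %3d -> about %d blocks per pool\n",
               size, (POOL_SIZE - POOL_OVERHEAD) / size);
    }
    return 0;
}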
@@ -1536,12 +1490,12 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
     }
 
     /* Try to get a cached free pool. */
-    pool = usable_arenas->freepools;
-    if (pool != NULL) {
+    poolp pool = usable_arenas->freepools;
+    if (LIKELY(pool != NULL)) {
         /* Unlink from cached pools. */
         usable_arenas->freepools = pool->nextpool;
-        --usable_arenas->nfreepools;
-        if (usable_arenas->nfreepools == 0) {
+        usable_arenas->nfreepools--;
+        if (UNLIKELY(usable_arenas->nfreepools == 0)) {
             /* Wholly allocated:  remove. */
             assert(usable_arenas->freepools == NULL);
             assert(usable_arenas->nextarena == NULL ||
@@ -1564,73 +1518,123 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
                    (block*)usable_arenas->address +
                    ARENA_SIZE - POOL_SIZE);
         }
-
-    init_pool:
-        /* Frontlink to used pools. */
-        next = usedpools[size + size]; /* == prev */
-        pool->nextpool = next;
-        pool->prevpool = next;
-        next->nextpool = pool;
-        next->prevpool = pool;
-        pool->ref.count = 1;
-        if (pool->szidx == size) {
-            /* Luckily, this pool last contained blocks
-             * of the same size class, so its header
-             * and free list are already initialized.
-             */
-            bp = pool->freeblock;
-            assert(bp != NULL);
-            pool->freeblock = *(block **)bp;
-            goto success;
-        }
-        /*
-         * Initialize the pool header, set up the free list to
-         * contain just the second block, and return the first
-         * block.
-         */
-        pool->szidx = size;
-        size = INDEX2SIZE(size);
-        bp = (block *)pool + POOL_OVERHEAD;
-        pool->nextoffset = POOL_OVERHEAD + (size << 1);
-        pool->maxnextoffset = POOL_SIZE - size;
-        pool->freeblock = bp + size;
-        *(block **)(pool->freeblock) = NULL;
-        goto success;
-    }
-
-    /* Carve off a new pool. */
-    assert(usable_arenas->nfreepools > 0);
-    assert(usable_arenas->freepools == NULL);
-    pool = (poolp)usable_arenas->pool_address;
-    assert((block*)pool <= (block*)usable_arenas->address +
-                             ARENA_SIZE - POOL_SIZE);
-    pool->arenaindex = (uint)(usable_arenas - arenas);
-    assert(&arenas[pool->arenaindex] == usable_arenas);
-    pool->szidx = DUMMY_SIZE_IDX;
-    usable_arenas->pool_address += POOL_SIZE;
-    --usable_arenas->nfreepools;
-
-    if (usable_arenas->nfreepools == 0) {
-        assert(usable_arenas->nextarena == NULL ||
-               usable_arenas->nextarena->prevarena ==
-               usable_arenas);
-        /* Unlink the arena:  it is completely allocated. */
-        usable_arenas = usable_arenas->nextarena;
-        if (usable_arenas != NULL) {
-            usable_arenas->prevarena = NULL;
-            assert(usable_arenas->address != 0);
-        }
-    }
-
-    goto init_pool;
+    }
+    else {
+        /* Carve off a new pool. */
+        assert(usable_arenas->nfreepools > 0);
+        assert(usable_arenas->freepools == NULL);
+        pool = (poolp)usable_arenas->pool_address;
+        assert((block*)pool <= (block*)usable_arenas->address +
+                                 ARENA_SIZE - POOL_SIZE);
+        pool->arenaindex = (uint)(usable_arenas - arenas);
+        assert(&arenas[pool->arenaindex] == usable_arenas);
+        pool->szidx = DUMMY_SIZE_IDX;
+        usable_arenas->pool_address += POOL_SIZE;
+        --usable_arenas->nfreepools;
+
+        if (usable_arenas->nfreepools == 0) {
+            assert(usable_arenas->nextarena == NULL ||
+                   usable_arenas->nextarena->prevarena ==
+                   usable_arenas);
+            /* Unlink the arena:  it is completely allocated. */
+            usable_arenas = usable_arenas->nextarena;
+            if (usable_arenas != NULL) {
+                usable_arenas->prevarena = NULL;
+                assert(usable_arenas->address != 0);
+            }
+        }
+    }
+
+    /* Frontlink to used pools. */
+    block *bp;
+    poolp next = usedpools[size + size]; /* == prev */
+    pool->nextpool = next;
+    pool->prevpool = next;
+    next->nextpool = pool;
+    next->prevpool = pool;
+    pool->ref.count = 1;
+    if (pool->szidx == size) {
+        /* Luckily, this pool last contained blocks
+         * of the same size class, so its header
+         * and free list are already initialized.
+         */
+        bp = pool->freeblock;
+        assert(bp != NULL);
+        pool->freeblock = *(block **)bp;
+        return bp;
+    }
+    /*
+     * Initialize the pool header, set up the free list to
+     * contain just the second block, and return the first
+     * block.
+     */
+    pool->szidx = size;
+    size = INDEX2SIZE(size);
+    bp = (block *)pool + POOL_OVERHEAD;
+    pool->nextoffset = POOL_OVERHEAD + (size << 1);
+    pool->maxnextoffset = POOL_SIZE - size;
+    pool->freeblock = bp + size;
+    *(block **)(pool->freeblock) = NULL;
+    return bp;
+}
+
+/* pymalloc allocator
+
+   Return 1 if pymalloc allocated memory and wrote the pointer into *ptr_p.
+
+   Return 0 if pymalloc failed to allocate the memory block: on bigger
+   requests, on error in the code below (as a last chance to serve the request)
+   or when the max memory limit has been reached.
+ */
+static inline int
+pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
+{
+#ifdef WITH_VALGRIND
+    if (UNLIKELY(running_on_valgrind == -1)) {
+        running_on_valgrind = RUNNING_ON_VALGRIND;
+    }
+    if (UNLIKELY(running_on_valgrind)) {
+        return 0;
+    }
+#endif
+
+    if (UNLIKELY(nbytes == 0)) {
+        return 0;
+    }
+    if (UNLIKELY(nbytes > SMALL_REQUEST_THRESHOLD)) {
+        return 0;
+    }
+
+    uint size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT;
+    poolp pool = usedpools[size + size];
+    block *bp;
+
+    if (LIKELY(pool != pool->nextpool)) {
+        /*
+         * There is a used pool for this size class.
+         * Pick up the head block of its free list.
+         */
+        ++pool->ref.count;
+        bp = pool->freeblock;
+        if (UNLIKELY((pool->freeblock = *(block **)bp) == NULL)) {
+            // Reached the end of the free list, try to extend it.
+            pymalloc_pool_extend(pool, size);
+        }
+    }
+    else {
+        /* There isn't a pool of the right size class immediately
+         * available:  use a free pool.
+         */
+        bp = allocate_from_new_pool(size);
+        if (UNLIKELY(bp == NULL)) {
+            return 0;
+        }
+    }
 
-success:
     assert(bp != NULL);
     *ptr_p = (void *)bp;
     return 1;
-
-failed:
-    return 0;
 }
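As an aside, the size-class arithmetic that the new pymalloc_alloc() keeps on its fast path is easy to check by hand: the request size maps to an index with (nbytes - 1) >> ALIGNMENT_SHIFT, and INDEX2SIZE() maps the index back to the class's block size. The standalone program below is illustrative and not part of the patch; it assumes ALIGNMENT_SHIFT == 4 (16-byte size classes), while some configurations use 3 (8-byte classes).

/* Worked example of the pymalloc size-class mapping (assumed constants). */
#include <stdio.h>

#define ALIGNMENT_SHIFT 4                                 /* assumed */
#define INDEX2SIZE(i) (((unsigned)(i) + 1) << ALIGNMENT_SHIFT)

int main(void)
{
    size_t requests[] = {1, 16, 17, 42, 512};
    for (size_t i = 0; i < sizeof(requests) / sizeof(requests[0]); i++) {
        size_t nbytes = requests[i];
        unsigned idx = (unsigned)(nbytes - 1) >> ALIGNMENT_SHIFT;
        /* usedpools[idx + idx] is the list head for this class; here we
         * only report which class the request lands in. */
        printf("request %zu -> class %u (block size %u)\n",
               nbytes, idx, INDEX2SIZE(idx));
    }
    return 0;
}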
@@ -1638,7 +1642,7 @@ static void *
 _PyObject_Malloc(void *ctx, size_t nbytes)
 {
     void* ptr;
-    if (pymalloc_alloc(ctx, &ptr, nbytes)) {
+    if (LIKELY(pymalloc_alloc(ctx, &ptr, nbytes))) {
         return ptr;
     }
@@ -1658,7 +1662,7 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize)
     assert(elsize == 0 || nelem <= (size_t)PY_SSIZE_T_MAX / elsize);
     size_t nbytes = nelem * elsize;
 
-    if (pymalloc_alloc(ctx, &ptr, nbytes)) {
+    if (LIKELY(pymalloc_alloc(ctx, &ptr, nbytes))) {
         memset(ptr, 0, nbytes);
         return ptr;
     }
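The LIKELY() hints above encode the expectation that most requests are at most SMALL_REQUEST_THRESHOLD bytes and are served from a pool, while larger requests make pymalloc_alloc() return 0 and fall through to the raw allocator. A small embedding example through the public entry points that sit on top of these helpers is sketched below; the program and the build command are illustrative only and not part of the patch (build roughly with: gcc demo.c $(python3-config --cflags --ldflags --embed)).

/* Illustrative use of the public allocator API backed by pymalloc. */
#include <Python.h>
#include <string.h>

int main(void)
{
    Py_Initialize();

    /* 100 bytes is below SMALL_REQUEST_THRESHOLD (512), so this request is
     * served from a pymalloc pool via the fast path hinted as LIKELY(). */
    char *small = PyObject_Malloc(100);
    if (small != NULL) {
        memset(small, 0, 100);
        PyObject_Free(small);
    }

    /* 100 KiB is above the threshold: pymalloc_alloc() returns 0 and the
     * request falls through to the raw malloc-based allocator. */
    char *large = PyObject_Malloc(100 * 1024);
    PyObject_Free(large);

    Py_Finalize();
    return 0;
}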
@@ -1671,88 +1675,37 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize)
 }
 
-/* Free a memory block allocated by pymalloc_alloc().
-   Return 1 if it was freed.
-   Return 0 if the block was not allocated by pymalloc_alloc(). */
-static int
-pymalloc_free(void *ctx, void *p)
+static void
+insert_to_usedpool(poolp pool)
 {
-    poolp pool;
-    block *lastfree;
-    poolp next, prev;
-    uint size;
-
-    assert(p != NULL);
-
-#ifdef WITH_VALGRIND
-    if (UNLIKELY(running_on_valgrind > 0)) {
-        return 0;
-    }
-#endif
-
-    pool = POOL_ADDR(p);
-    if (!address_in_range(p, pool)) {
-        return 0;
-    }
-    /* We allocated this address. */
-
-    /* Link p to the start of the pool's freeblock list.  Since
-     * the pool had at least the p block outstanding, the pool
-     * wasn't empty (so it's already in a usedpools[] list, or
-     * was full and is in no list -- it's not in the freeblocks
-     * list in any case).
-     */
-    assert(pool->ref.count > 0);            /* else it was empty */
-    *(block **)p = lastfree = pool->freeblock;
-    pool->freeblock = (block *)p;
-    if (!lastfree) {
-        /* Pool was full, so doesn't currently live in any list:
-         * link it to the front of the appropriate usedpools[] list.
-         * This mimics LRU pool usage for new allocations and
-         * targets optimal filling when several pools contain
-         * blocks of the same size class.
-         */
-        --pool->ref.count;
-        assert(pool->ref.count > 0);            /* else the pool is empty */
-        size = pool->szidx;
-        next = usedpools[size + size];
-        prev = next->prevpool;
+    assert(pool->ref.count > 0);            /* else the pool is empty */
 
-        /* insert pool before next:   prev <-> pool <-> next */
-        pool->nextpool = next;
-        pool->prevpool = prev;
-        next->prevpool = pool;
-        prev->nextpool = pool;
-        goto success;
-    }
+    uint size = pool->szidx;
+    poolp next = usedpools[size + size];
+    poolp prev = next->prevpool;
 
-    struct arena_object* ao;
-    uint nf;  /* ao->nfreepools */
+    /* insert pool before next:   prev <-> pool <-> next */
+    pool->nextpool = next;
+    pool->prevpool = prev;
+    next->prevpool = pool;
+    prev->nextpool = pool;
+}
 
-    /* freeblock wasn't NULL, so the pool wasn't full,
-     * and the pool is in a usedpools[] list.
-     */
-    if (--pool->ref.count != 0) {
-        /* pool isn't empty:  leave it in usedpools */
-        goto success;
-    }
-    /* Pool is now empty:  unlink from usedpools, and
-     * link to the front of freepools.  This ensures that
-     * previously freed pools will be allocated later
-     * (being not referenced, they are perhaps paged out).
-     */
-    next = pool->nextpool;
-    prev = pool->prevpool;
+static void
+insert_to_freepool(poolp pool)
+{
+    poolp next = pool->nextpool;
+    poolp prev = pool->prevpool;
     next->prevpool = prev;
     prev->nextpool = next;
 
     /* Link the pool to freepools.  This is a singly-linked
      * list, and pool->prevpool isn't used there.
      */
-    ao = &arenas[pool->arenaindex];
+    struct arena_object *ao = &arenas[pool->arenaindex];
     pool->nextpool = ao->freepools;
     ao->freepools = pool;
-    nf = ao->nfreepools;
+    uint nf = ao->nfreepools;
     /* If this is the rightmost arena with this number of free pools,
      * nfp2lasta[nf] needs to change.  Caution:  if nf is 0, there
      * are no arenas in usable_arenas with that value.
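The free-block handling that insert_to_usedpool()/insert_to_freepool() now complement is an intrusive singly-linked list: a freed block stores the previous list head in its own first bytes, so the list costs no extra memory. The standalone sketch below shows that push/pop pattern; all names and sizes here are invented for illustration and it uses memcpy() instead of the pointer cast used in obmalloc.c.

/* Standalone illustration of the intrusive free list pymalloc_free() pushes onto. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 32
#define NBLOCKS    4

static unsigned char pool[NBLOCKS * BLOCK_SIZE];   /* stand-in for a pool */
static void *freeblock = NULL;                     /* head of the free list */

/* Push a block, like "*(block **)p = lastfree; pool->freeblock = (block *)p;". */
static void free_block(void *p)
{
    memcpy(p, &freeblock, sizeof(void *));   /* store old head inside the block */
    freeblock = p;
}

/* Pop the head, like "bp = pool->freeblock; pool->freeblock = *(block **)bp;". */
static void *alloc_block(void)
{
    void *bp = freeblock;
    assert(bp != NULL);
    memcpy(&freeblock, bp, sizeof(void *));  /* the next head was stored in the block */
    return bp;
}

int main(void)
{
    for (int i = 0; i < NBLOCKS; i++) {
        free_block(pool + i * BLOCK_SIZE);   /* seed the list with every block */
    }
    void *a = alloc_block();
    void *b = alloc_block();
    printf("got blocks at offsets %td and %td\n",
           (unsigned char *)a - pool, (unsigned char *)b - pool);
    free_block(a);                           /* a becomes the new list head */
    printf("next allocation reuses offset %td\n",
           (unsigned char *)alloc_block() - pool);
    return 0;
}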
@@ -1826,7 +1779,7 @@ pymalloc_free(void *ctx, void *p)
         ao->address = 0;                    /* mark unassociated */
         --narenas_currently_allocated;
 
-        goto success;
+        return;
     }
 
     if (nf == 1) {
@@ -1845,7 +1798,7 @@ pymalloc_free(void *ctx, void *p)
             nfp2lasta[1] = ao;
         }
 
-        goto success;
+        return;
     }
 
     /* If this arena is now out of order, we need to keep
@@ -1862,7 +1815,7 @@ pymalloc_free(void *ctx, void *p)
     /* If this was the rightmost of the old size, it remains in place. */
     if (ao == lastnf) {
         /* Case 4.  Nothing to do. */
-        goto success;
+        return;
     }
     /* If ao were the only arena in the list, the last block would have
      * gotten us out.
@@ -1898,10 +1851,65 @@ pymalloc_free(void *ctx, void *p)
     assert(ao->nextarena == NULL || ao->nextarena->prevarena == ao);
     assert((usable_arenas == ao && ao->prevarena == NULL)
            || ao->prevarena->nextarena == ao);
+}
 
-    goto success;
+/* Free a memory block allocated by pymalloc_alloc().
+   Return 1 if it was freed.
+   Return 0 if the block was not allocated by pymalloc_alloc(). */
+static inline int
+pymalloc_free(void *ctx, void *p)
+{
+    assert(p != NULL);
 
-success:
+#ifdef WITH_VALGRIND
+    if (UNLIKELY(running_on_valgrind > 0)) {
+        return 0;
+    }
+#endif
+
+    poolp pool = POOL_ADDR(p);
+    if (UNLIKELY(!address_in_range(p, pool))) {
+        return 0;
+    }
+    /* We allocated this address. */
+
+    /* Link p to the start of the pool's freeblock list.  Since
+     * the pool had at least the p block outstanding, the pool
+     * wasn't empty (so it's already in a usedpools[] list, or
+     * was full and is in no list -- it's not in the freeblocks
+     * list in any case).
+     */
+    assert(pool->ref.count > 0);            /* else it was empty */
+    block *lastfree = pool->freeblock;
+    *(block **)p = lastfree;
+    pool->freeblock = (block *)p;
+    pool->ref.count--;
+
+    if (UNLIKELY(lastfree == NULL)) {
+        /* Pool was full, so doesn't currently live in any list:
+         * link it to the front of the appropriate usedpools[] list.
+         * This mimics LRU pool usage for new allocations and
+         * targets optimal filling when several pools contain
+         * blocks of the same size class.
+         */
+        insert_to_usedpool(pool);
+        return 1;
+    }
+
+    /* freeblock wasn't NULL, so the pool wasn't full,
+     * and the pool is in a usedpools[] list.
+     */
+    if (LIKELY(pool->ref.count != 0)) {
+        /* pool isn't empty:  leave it in usedpools */
+        return 1;
+    }
+
+    /* Pool is now empty:  unlink from usedpools, and
+     * link to the front of freepools.  This ensures that
+     * previously freed pools will be allocated later
+     * (being not referenced, they are perhaps paged out).
+     */
+    insert_to_freepool(pool);
     return 1;
 }
@@ -1914,7 +1922,7 @@ _PyObject_Free(void *ctx, void *p)
         return;
     }
 
-    if (!pymalloc_free(ctx, p)) {
+    if (UNLIKELY(!pymalloc_free(ctx, p))) {
         /* pymalloc didn't allocate this address */
         PyMem_RawFree(p);
         raw_allocated_blocks--;