Commit e70ddf3a authored by Tim Peters

Widespread, but mostly in _PyMalloc_Malloc: optimize away all expensive
runtime multiplications and divisions, via the scheme developed with
Vladimir Marangozov on Python-Dev.  The pool_header struct loses its
capacity member, but gains nextoffset and maxnextoffset members; this
still leaves it at 32 bytes on a 32-bit box (it has to be padded to a
multiple of 8 bytes).
parent d3dab2b1
...@@ -116,6 +116,9 @@ ...@@ -116,6 +116,9 @@
#define ALIGNMENT_SHIFT 3 #define ALIGNMENT_SHIFT 3
#define ALIGNMENT_MASK (ALIGNMENT - 1) #define ALIGNMENT_MASK (ALIGNMENT - 1)
/* Return the number of bytes in size class I, as a uint.
 * Size class indices start at 0, and each class is one alignment unit
 * (1 << ALIGNMENT_SHIFT bytes) larger than the class before it, so class 0
 * maps to exactly one alignment unit.  The size is produced with a shift,
 * not a multiplication.
 */
#define INDEX2SIZE(I) (((uint)(I) + 1) << ALIGNMENT_SHIFT)
/* /*
* Max size threshold below which malloc requests are considered to be * Max size threshold below which malloc requests are considered to be
* small enough in order to use preallocated memory pools. You can tune * small enough in order to use preallocated memory pools. You can tune
...@@ -225,7 +228,7 @@ ...@@ -225,7 +228,7 @@
/* When you say memory, my mind reasons in terms of (pointers to) blocks */ /* When you say memory, my mind reasons in terms of (pointers to) blocks */
typedef uchar block; typedef uchar block;
/* Pool for small blocks */ /* Pool for small blocks. */
struct pool_header { struct pool_header {
union { block *_padding; union { block *_padding;
uint count; } ref; /* number of allocated blocks */ uint count; } ref; /* number of allocated blocks */
...@@ -234,7 +237,8 @@ struct pool_header { ...@@ -234,7 +237,8 @@ struct pool_header {
struct pool_header *prevpool; /* previous pool "" */ struct pool_header *prevpool; /* previous pool "" */
uint arenaindex; /* index into arenas of base adr */ uint arenaindex; /* index into arenas of base adr */
uint szidx; /* block size class index */ uint szidx; /* block size class index */
uint capacity; /* pool capacity in # of blocks */ uint nextoffset; /* bytes to virgin block */
uint maxnextoffset; /* largest valid nextoffset */
}; };
typedef struct pool_header *poolp; typedef struct pool_header *poolp;
...@@ -246,8 +250,11 @@ typedef struct pool_header *poolp; ...@@ -246,8 +250,11 @@ typedef struct pool_header *poolp;
#define DUMMY_SIZE_IDX 0xffff /* size class of newly cached pools */ #define DUMMY_SIZE_IDX 0xffff /* size class of newly cached pools */
/* Round pointer P down to the closest pool-aligned address <= P, as a poolp */ /* Round pointer P down to the closest pool-aligned address <= P, as a poolp */
#define POOL_ADDR(P) \ #define POOL_ADDR(P) ((poolp)((uptr)(P) & ~(uptr)POOL_SIZE_MASK))
((poolp)((uptr)(P) & ~(uptr)POOL_SIZE_MASK))
/* Return total number of blocks in poolp P, as a uint.
 * This is the space left in a pool after its header (POOL_SIZE minus
 * POOL_OVERHEAD) divided by the byte size of P's size class, rounded down.
 * Note that each use performs an integer division.
 */
#define NUMBLOCKS(P) \
((uint)(POOL_SIZE - POOL_OVERHEAD) / INDEX2SIZE((P)->szidx))
/*==========================================================================*/ /*==========================================================================*/
...@@ -299,14 +306,7 @@ empty == all the pool's blocks are currently available for allocation ...@@ -299,14 +306,7 @@ empty == all the pool's blocks are currently available for allocation
Empty pools have no inherent size class: the next time a malloc finds Empty pools have no inherent size class: the next time a malloc finds
an empty list in usedpools[], it takes the first pool off of freepools. an empty list in usedpools[], it takes the first pool off of freepools.
If the size class needed happens to be the same as the size class the pool If the size class needed happens to be the same as the size class the pool
last had, some expensive initialization can be skipped (including an last had, some pool initialization can be skipped.
integer division -- XXX since the value
pool->capacity = (POOL_SIZE - POOL_OVERHEAD) / size;
is invariant across all pools of a given size class, it may make more
sense to compute those at compile-time into a const vector indexed by
size class, and lose the pool->capacity member and the runtime divisions).
Block Management Block Management
...@@ -315,18 +315,20 @@ Blocks within pools are again carved out as needed. pool->freeblock points to ...@@ -315,18 +315,20 @@ Blocks within pools are again carved out as needed. pool->freeblock points to
the start of a singly-linked list of free blocks within the pool. When a the start of a singly-linked list of free blocks within the pool. When a
block is freed, it's inserted at the front of its pool's freeblock list. Note block is freed, it's inserted at the front of its pool's freeblock list. Note
that the available blocks in a pool are *not* linked all together when a pool that the available blocks in a pool are *not* linked all together when a pool
is initialized. Instead only "the first" (lowest address) block is set up, is initialized. Instead only "the first two" (lowest addresses) blocks are
setting pool->freeblock to NULL. This is consistent with that pymalloc set up, returning the first such block, and setting pool->freeblock to a
strives at all levels (arena, pool, and block) never to touch a piece of one-block list holding the second such block. This is consistent with that
memory until it's actually needed. So long as a pool is in the used state, pymalloc strives at all levels (arena, pool, and block) never to touch a piece
we're certain there *is* a block available for allocating. If pool->freeblock of memory until it's actually needed.
is NULL then, that means we simply haven't yet gotten to one of the higher-
address blocks. The address of "the next" available block can be computed So long as a pool is in the used state, we're certain there *is* a block
then from pool->ref.count (the number of currently allocated blocks). This available for allocating. If pool->freeblock is NULL then, that means we
computation can be expensive, because it requires an integer multiply. simply haven't yet gotten to one of the higher-address blocks. The offset
However, so long as the pool's size class doesn't change, it's a one-time cost from the pool_header to the start of "the next" virgin block is stored in
for that block; the computation could be made cheaper via adding a highwater the pool_header nextoffset member, and the largest value of nextoffset that
pointer to the pool_header, but the tradeoff is murky. makes sense is stored in the maxnextoffset member when a pool is initialized.
All the blocks in a pool have been passed out at least once when and only when
nextoffset > maxnextoffset.
Major obscurity: While the usedpools vector is declared to have poolp Major obscurity: While the usedpools vector is declared to have poolp
...@@ -596,15 +598,13 @@ _PyMalloc_Malloc(size_t nbytes) ...@@ -596,15 +598,13 @@ _PyMalloc_Malloc(size_t nbytes)
/* /*
* Reached the end of the free list, try to extend it * Reached the end of the free list, try to extend it
*/ */
if (pool->ref.count < pool->capacity) { if (pool->nextoffset <= pool->maxnextoffset) {
/* /*
* There is room for another block * There is room for another block
*/ */
size++; pool->freeblock = (block *)pool +
size <<= ALIGNMENT_SHIFT; /* block size */ pool->nextoffset;
pool->freeblock = (block *)pool + \ pool->nextoffset += INDEX2SIZE(size);
POOL_OVERHEAD + \
pool->ref.count * size;
*(block **)(pool->freeblock) = NULL; *(block **)(pool->freeblock) = NULL;
UNLOCK(); UNLOCK();
return (void *)bp; return (void *)bp;
...@@ -650,16 +650,17 @@ _PyMalloc_Malloc(size_t nbytes) ...@@ -650,16 +650,17 @@ _PyMalloc_Malloc(size_t nbytes)
return (void *)bp; return (void *)bp;
} }
/* /*
* Initialize the pool header and free list * Initialize the pool header, set up the free list to
* then return the first block. * contain just the second block, and return the first
* block.
*/ */
pool->szidx = size; pool->szidx = size;
size++; size = INDEX2SIZE(size);
size <<= ALIGNMENT_SHIFT; /* block size */
bp = (block *)pool + POOL_OVERHEAD; bp = (block *)pool + POOL_OVERHEAD;
pool->nextoffset = POOL_OVERHEAD + (size << 1);
pool->maxnextoffset = POOL_SIZE - size;
pool->freeblock = bp + size; pool->freeblock = bp + size;
*(block **)(pool->freeblock) = NULL; *(block **)(pool->freeblock) = NULL;
pool->capacity = (POOL_SIZE - POOL_OVERHEAD) / size;
UNLOCK(); UNLOCK();
return (void *)bp; return (void *)bp;
} }
...@@ -736,7 +737,6 @@ _PyMalloc_Free(void *p) ...@@ -736,7 +737,6 @@ _PyMalloc_Free(void *p)
* freeblock wasn't NULL, so the pool wasn't full, * freeblock wasn't NULL, so the pool wasn't full,
* and the pool is in a usedpools[] list. * and the pool is in a usedpools[] list.
*/ */
assert(pool->ref.count < pool->capacity);
if (--pool->ref.count != 0) { if (--pool->ref.count != 0) {
/* pool isn't empty: leave it in usedpools */ /* pool isn't empty: leave it in usedpools */
UNLOCK(); UNLOCK();
...@@ -767,7 +767,6 @@ _PyMalloc_Free(void *p) ...@@ -767,7 +767,6 @@ _PyMalloc_Free(void *p)
* targets optimal filling when several pools contain * targets optimal filling when several pools contain
* blocks of the same size class. * blocks of the same size class.
*/ */
assert(pool->ref.count == pool->capacity); /* else not full */
--pool->ref.count; --pool->ref.count;
assert(pool->ref.count > 0); /* else the pool is empty */ assert(pool->ref.count > 0); /* else the pool is empty */
size = pool->szidx; size = pool->szidx;
...@@ -806,7 +805,7 @@ _PyMalloc_Realloc(void *p, size_t nbytes) ...@@ -806,7 +805,7 @@ _PyMalloc_Realloc(void *p, size_t nbytes)
if (ADDRESS_IN_RANGE(p, pool->arenaindex)) { if (ADDRESS_IN_RANGE(p, pool->arenaindex)) {
/* We're in charge of this block */ /* We're in charge of this block */
INCMINE; INCMINE;
size = (pool->szidx + 1) << ALIGNMENT_SHIFT; /* block size */ size = INDEX2SIZE(pool->szidx);
if (size >= nbytes) if (size >= nbytes)
/* Don't bother if a smaller size was requested. */ /* Don't bother if a smaller size was requested. */
return p; return p;
...@@ -1255,7 +1254,7 @@ _PyMalloc_DebugDumpStats(void) ...@@ -1255,7 +1254,7 @@ _PyMalloc_DebugDumpStats(void)
} }
++numpools[p->szidx]; ++numpools[p->szidx];
numblocks[p->szidx] += p->ref.count; numblocks[p->szidx] += p->ref.count;
numfreeblocks[p->szidx] += p->capacity - p->ref.count; numfreeblocks[p->szidx] += NUMBLOCKS(p) - p->ref.count;
} }
} }
...@@ -1271,7 +1270,7 @@ _PyMalloc_DebugDumpStats(void) ...@@ -1271,7 +1270,7 @@ _PyMalloc_DebugDumpStats(void)
ulong p = numpools[i]; ulong p = numpools[i];
ulong b = numblocks[i]; ulong b = numblocks[i];
ulong f = numfreeblocks[i]; ulong f = numfreeblocks[i];
uint size = (i+1) << ALIGNMENT_SHIFT; uint size = INDEX2SIZE(i);
if (p == 0) { if (p == 0) {
assert(b == 0 && f == 0); assert(b == 0 && f == 0);
continue; continue;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment