Commit b9e55f3d authored by Andrew Morton, committed by Linus Torvalds

[PATCH] slab: updates for per-arch alignments

From: Manfred Spraul <manfred@colorfullife.com>

Description:

Right now kmem_cache_create decides the alignment of the allocated objects
automatically.  These automatic decisions are sometimes wrong:

- for some objects it's better to keep them as small as possible, to
  reduce memory usage.  Ingo already added a parameter to
  kmem_cache_create for the sigqueue cache, but it was never implemented.

- for s390, normal kmalloc objects must be 8-byte aligned.  With debugging
  enabled, the default alignment was only 4 bytes, which means that s390
  could not enable slab debugging.

- arm26 needs 1 kB aligned objects.  Previously that alignment was
  impossible to request, which is why arm26 has its own allocator in
  arm26/machine/small_page.c.

- most objects should be cache line aligned to avoid false sharing.  But
  the cache line size was set at compile time, often to 128 bytes for
  generic kernels.  This wastes memory.  The new code uses the cache line
  size determined at runtime instead.

- some caches want an explicit alignment.  One example is the pte_chain
  cache: its objects must be able to find the start of the object with
  addr&mask (see the sketch after this list).  Right now pte_chain objects
  are scaled up to the cache line size, because that was the only alignment
  that could be generated reliably.
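
To make the pte_chain point concrete, here is a minimal, purely
illustrative sketch of the addr&mask trick (the struct and function names
are hypothetical, not the kernel's pte_chain code).  It only works if the
allocator guarantees that each object starts on a power-of-two boundary at
least as large as the object itself:

#include <stdint.h>

/* Hypothetical object; assume its cache was created with an explicit
 * alignment of sizeof(struct chain), which must be a power of two.
 */
struct chain {
        unsigned long slots[16];        /* 64 or 128 bytes total */
};

/* Given a pointer anywhere inside the object, recover its start by
 * masking off the low bits -- this is what "find the start of the
 * object with addr&mask" relies on.
 */
static inline struct chain *chain_start(void *addr)
{
        return (struct chain *)((uintptr_t)addr & ~(sizeof(struct chain) - 1));
}

Before this patch, the only way to get such a guarantee was to inflate the
object to a cache line multiple; with an explicit alignment parameter the
cache can simply request sizeof(struct chain).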

The implementation reuses the "offset" parameter of kmem_cache_create and
now uses it to pass in the requested alignment.  The offset was ignored by
the previous implementation, and the only user I found that set it is
sigqueue, which intended it as an alignment.
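
As a hedged illustration of the new calling convention (the cache name and
struct below are made up, not part of the patch), a cache that needs, say,
64-byte aligned objects now passes the alignment where the ignored offset
used to go:

/* Hypothetical caller: request 64-byte alignment explicitly.
 * Parameter order: name, object size, alignment, flags, ctor, dtor.
 */
struct my_obj {
        char payload[200];
};

static kmem_cache_t *my_cachep;

void my_cache_init(void)
{
        my_cachep = kmem_cache_create("my_cache", sizeof(struct my_obj),
                                      64,   /* requested alignment */
                                      0,    /* flags */
                                      NULL, NULL);
}

Passing 0 as the alignment keeps the default behaviour: the object is
aligned to the runtime-determined cache line size, scaled down for very
small objects.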

In the long run, it might be interesting for the main tree: with the
128-byte alignment only 7 inodes fit into one page; with 64-byte alignment,
9 inodes fit, which recovers roughly 20% of that memory on Athlon systems.
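
As a back-of-the-envelope check (illustrative only: the 440-byte object
size below is made up, and the real cache_estimate() also reserves room for
the slab descriptor and per-object bufctls, so actual counts can come out
one lower), the per-page counts follow from rounding the object size up to
the alignment:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long size = 440;       /* stand-in for an inode-sized object */

        /* 4096 / ALIGN(440, 128) = 4096 / 512 = 8 objects per page,
         * 4096 / ALIGN(440, 64)  = 4096 / 448 = 9 objects per page.
         */
        printf("128-byte align: %lu objects/page\n", PAGE_SIZE / ALIGN(size, 128UL));
        printf(" 64-byte align: %lu objects/page\n", PAGE_SIZE / ALIGN(size, 64UL));
        return 0;
}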



For generic kernels running on P6 CPUs (i.e. 32-byte cache lines), this means:

Number of objects per page:

 ext2_inode_cache: 8 instead of 7
 ext3_inode_cache: 8 instead of 7
 fat_inode_cache: 9 instead of 7
 rpc_tasks: 24 instead of 15
 tcp_tw_bucket: 40 instead of 30
 arp_cache: 40 instead of 30
 nfs_write_data: 9 instead of 7
parent 1aa6c0d1
@@ -530,18 +530,18 @@ void __init pgtable_cache_init(void)
 {
 	if (PTRS_PER_PMD > 1) {
 		pmd_cache = kmem_cache_create("pmd",
 					PTRS_PER_PMD*sizeof(pmd_t),
+					PTRS_PER_PMD*sizeof(pmd_t),
 					0,
-					SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 					pmd_ctor,
 					NULL);
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
 	}
 	pgd_cache = kmem_cache_create("pgd",
 				PTRS_PER_PGD*sizeof(pgd_t),
+				PTRS_PER_PGD*sizeof(pgd_t),
 				0,
-				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 				pgd_ctor,
 				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
 	if (!pgd_cache)
...
@@ -403,6 +403,8 @@ struct tss_struct {
 	unsigned long stack[64];
 } __attribute__((packed));
 
+#define ARCH_MIN_TASKALIGN	16
+
 struct thread_struct {
 /* cached TLS descriptors. */
 	struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
...
@@ -207,11 +207,14 @@ EXPORT_SYMBOL(autoremove_wake_function);
 void __init fork_init(unsigned long mempages)
 {
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN	0
+#endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
 		kmem_cache_create("task_struct",
-				  sizeof(struct task_struct),0,
-				  SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+				  sizeof(struct task_struct),ARCH_MIN_TASKALIGN,
+				  0, NULL, NULL);
 	if (!task_struct_cachep)
 		panic("fork_init(): cannot create task_struct SLAB cache");
 #endif
...
@@ -522,9 +522,9 @@ struct pte_chain *pte_chain_alloc(int gfp_flags)
 void __init pte_chain_init(void)
 {
 	pte_chain_cache = kmem_cache_create("pte_chain",
 					sizeof(struct pte_chain),
+					sizeof(struct pte_chain),
 					0,
-					SLAB_MUST_HWCACHE_ALIGN,
 					pte_chain_ctor,
 					NULL);
...
@@ -121,6 +121,14 @@
 /* Shouldn't this be in a header file somewhere? */
 #define BYTES_PER_WORD		sizeof(void *)
 
+#ifndef cache_line_size
+#define cache_line_size()	L1_CACHE_BYTES
+#endif
+
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN	0
+#endif
+
 /* Legal flag mask for kmem_cache_create(). */
 #if DEBUG
 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
@@ -268,6 +276,7 @@ struct kmem_cache_s {
 	unsigned int		colour_off;	/* colour offset */
 	unsigned int		colour_next;	/* cache colouring */
 	kmem_cache_t		*slabp_cache;
+	unsigned int		slab_size;
 	unsigned int		dflags;		/* dynamic flags */
 
 	/* constructor func */
@@ -490,8 +499,10 @@ static kmem_cache_t cache_cache = {
 	.objsize	= sizeof(kmem_cache_t),
 	.flags		= SLAB_NO_REAP,
 	.spinlock	= SPIN_LOCK_UNLOCKED,
-	.colour_off	= L1_CACHE_BYTES,
 	.name		= "kmem_cache",
+#if DEBUG
+	.reallen	= sizeof(kmem_cache_t),
+#endif
 };
 
 /* Guard access to the cache-chain. */
@@ -535,7 +546,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 }
 
 /* Cal the num objs, wastage, and bytes left over for a given slab size. */
-static void cache_estimate (unsigned long gfporder, size_t size,
+static void cache_estimate (unsigned long gfporder, size_t size, size_t align,
 		int flags, size_t *left_over, unsigned int *num)
 {
 	int i;
@@ -548,7 +559,7 @@ static void cache_estimate (unsigned long gfporder, size_t size,
 		extra = sizeof(kmem_bufctl_t);
 	}
 	i = 0;
-	while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
+	while (i*size + ALIGN(base+i*extra, align) <= wastage)
 		i++;
 	if (i > 0)
 		i--;
@@ -558,7 +569,7 @@ static void cache_estimate (unsigned long gfporder, size_t size,
 	*num = i;
 	wastage -= i*size;
-	wastage -= L1_CACHE_ALIGN(base+i*extra);
+	wastage -= ALIGN(base+i*extra, align);
 	*left_over = wastage;
 }
@@ -705,16 +716,20 @@ void __init kmem_cache_init(void)
 	init_MUTEX(&cache_chain_sem);
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
+	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
+	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
 
-	cache_estimate(0, cache_cache.objsize, 0,
-				&left_over, &cache_cache.num);
+	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
+				&left_over, &cache_cache.num);
 	if (!cache_cache.num)
 		BUG();
 
 	cache_cache.colour = left_over/cache_cache.colour_off;
 	cache_cache.colour_next = 0;
+	cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
+				sizeof(struct slab), cache_line_size());
 
 	/* 2+3) create the kmalloc caches */
 	sizes = malloc_sizes;
@@ -728,7 +743,7 @@ void __init kmem_cache_init(void)
 		 * allow tighter packing of the smaller caches. */
 		sizes->cs_cachep = kmem_cache_create(
 			names->name, sizes->cs_size,
-			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+			ARCH_KMALLOC_MINALIGN, 0, NULL, NULL);
 		if (!sizes->cs_cachep)
 			BUG();
@@ -740,7 +755,7 @@ void __init kmem_cache_init(void)
 		sizes->cs_dmacachep = kmem_cache_create(
 			names->name_dma, sizes->cs_size,
-			0, SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL);
+			ARCH_KMALLOC_MINALIGN, SLAB_CACHE_DMA, NULL, NULL);
 		if (!sizes->cs_dmacachep)
 			BUG();
@@ -1056,7 +1071,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
- * @offset: The offset to use within the page.
+ * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * @dtor: A destructor for the objects.
@@ -1081,16 +1096,15 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline.  This can be beneficial if you're counting cycles as closely
- * as davem.
+ * %SLAB_HWCACHE_ALIGN - This flag has no effect and will be removed soon.
+ *
 */
 kmem_cache_t *
-kmem_cache_create (const char *name, size_t size, size_t offset,
+kmem_cache_create (const char *name, size_t size, size_t align,
 	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
 	void (*dtor)(void*, kmem_cache_t *, unsigned long))
 {
-	size_t left_over, align, slab_size;
+	size_t left_over, slab_size;
 	kmem_cache_t *cachep = NULL;
 
 	/*
@@ -1101,7 +1115,7 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 		(size < BYTES_PER_WORD) ||
 		(size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
 		(dtor && !ctor) ||
-		(offset < 0 || offset > size)) {
+		(align < 0)) {
 			printk(KERN_ERR "%s: Early error in slab %s\n",
 					__FUNCTION__, name);
 			BUG();
@@ -1118,22 +1132,16 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 #if FORCED_DEBUG
 	/*
-	 * Enable redzoning and last user accounting, except
-	 * - for caches with forced alignment: redzoning would violate the
-	 *   alignment
-	 * - for caches with large objects, if the increased size would
-	 *   increase the object size above the next power of two: caches
-	 *   with object sizes just above a power of two have a significant
-	 *   amount of internal fragmentation
+	 * Enable redzoning and last user accounting, except for caches with
+	 * large objects, if the increased size would increase the object size
+	 * above the next power of two: caches with object sizes just above a
+	 * power of two have a significant amount of internal fragmentation.
 	 */
-	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))
-			&& !(flags & SLAB_MUST_HWCACHE_ALIGN)) {
+	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
 		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
-	}
 	flags |= SLAB_POISON;
 #endif
 #endif
 	/*
	 * Always checks flags, a caller might be expecting debug
	 * support which isn't available.
@@ -1141,15 +1149,23 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	if (flags & ~CREATE_MASK)
 		BUG();
 
+	if (align) {
+		/* minimum supported alignment: */
+		if (align < BYTES_PER_WORD)
+			align = BYTES_PER_WORD;
+
+		/* combinations of forced alignment and advanced debugging is
+		 * not yet implemented.
+		 */
+		flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+	}
+
 	/* Get cache's description obj. */
 	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
 	if (!cachep)
 		goto opps;
 	memset(cachep, 0, sizeof(kmem_cache_t));
-#if DEBUG
-	cachep->reallen = size;
-#endif
 
 	/* Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
@@ -1160,30 +1176,31 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	}
 
 #if DEBUG
+	cachep->reallen = size;
+
 	if (flags & SLAB_RED_ZONE) {
-		/*
-		 * There is no point trying to honour cache alignment
-		 * when redzoning.
-		 */
-		flags &= ~SLAB_HWCACHE_ALIGN;
+		/* redzoning only works with word aligned caches */
+		align = BYTES_PER_WORD;
+
 		/* add space for red zone words */
 		cachep->dbghead += BYTES_PER_WORD;
 		size += 2*BYTES_PER_WORD;
 	}
 	if (flags & SLAB_STORE_USER) {
-		flags &= ~SLAB_HWCACHE_ALIGN;
-		size += BYTES_PER_WORD;		/* add space */
+		/* user store requires word alignment and
+		 * one word storage behind the end of the real
+		 * object.
+		 */
+		align = BYTES_PER_WORD;
+		size += BYTES_PER_WORD;
 	}
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-	if (size > 128 && cachep->reallen > L1_CACHE_BYTES && size < PAGE_SIZE) {
+	if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
 		cachep->dbghead += PAGE_SIZE - size;
 		size = PAGE_SIZE;
 	}
 #endif
 #endif
 
-	align = BYTES_PER_WORD;
-	if (flags & SLAB_HWCACHE_ALIGN)
-		align = L1_CACHE_BYTES;
-
 	/* Determine if the slab management is 'on' or 'off' slab. */
 	if (size >= (PAGE_SIZE>>3))
@@ -1193,13 +1210,16 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 		 */
 		flags |= CFLGS_OFF_SLAB;
 
-	if (flags & SLAB_HWCACHE_ALIGN) {
-		/* Need to adjust size so that objs are cache aligned. */
-		/* Small obj size, can get at least two per cache line. */
+	if (!align) {
+		/* Default alignment: compile time specified l1 cache size.
+		 * Except if an object is really small, then squeeze multiple
+		 * into one cacheline.
+		 */
+		align = cache_line_size();
 		while (size <= align/2)
 			align /= 2;
-		size = (size+align-1)&(~(align-1));
 	}
+	size = ALIGN(size, align);
 
 	/* Cal size (in pages) of slabs, and the num of objs per slab.
	 * This could be made much more intelligent.  For now, try to avoid
@@ -1209,7 +1229,7 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	do {
 		unsigned int break_flag = 0;
 cal_wastage:
-		cache_estimate(cachep->gfporder, size, flags,
+		cache_estimate(cachep->gfporder, size, align, flags,
 				&left_over, &cachep->num);
 		if (break_flag)
 			break;
@@ -1243,7 +1263,8 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 		cachep = NULL;
 		goto opps;
 	}
-	slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab));
+	slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
+				+ sizeof(struct slab), align);
 
 	/*
	 * If the slab has been placed off-slab, and we have enough space then
@@ -1254,14 +1275,17 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 		left_over -= slab_size;
 	}
 
-	/* Offset must be a multiple of the alignment. */
-	offset += (align-1);
-	offset &= ~(align-1);
-	if (!offset)
-		offset = L1_CACHE_BYTES;
-
-	cachep->colour_off = offset;
-	cachep->colour = left_over/offset;
+	if (flags & CFLGS_OFF_SLAB) {
+		/* really off slab. No need for manual alignment */
+		slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
+	}
+
+	cachep->colour_off = cache_line_size();
+	/* Offset must be a multiple of the alignment. */
+	if (cachep->colour_off < align)
+		cachep->colour_off = align;
+	cachep->colour = left_over/cachep->colour_off;
+	cachep->slab_size = slab_size;
 	cachep->flags = flags;
 	cachep->gfpflags = 0;
 	if (flags & SLAB_CACHE_DMA)
@@ -1543,8 +1567,7 @@ static inline struct slab* alloc_slabmgmt (kmem_cache_t *cachep,
 			return NULL;
 	} else {
 		slabp = objp+colour_off;
-		colour_off += L1_CACHE_ALIGN(cachep->num *
-				sizeof(kmem_bufctl_t) + sizeof(struct slab));
+		colour_off += cachep->slab_size;
 	}
 	slabp->inuse = 0;
 	slabp->colouroff = colour_off;
...