Commit 77631565 authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] rmaplock: SLAB_DESTROY_BY_RCU

With page_map_lock gone, how to stabilize page->mapping's anon_vma while
acquiring anon_vma->lock in page_referenced_anon and try_to_unmap_anon?

The page cannot actually be freed (vmscan holds a reference), but however much
we check page_mapped (which guarantees that the anon_vma is in use - or would
guarantee that, if we added suitable barriers), there's no locking against the
page becoming unmapped the instant after, and the anon_vma then being freed.

It's okay to take anon_vma->lock after it's freed, so long as it remains a
struct anon_vma (its list would become empty, or perhaps be reused for an
unrelated anon_vma: but that's no problem, since we always check that the page
located is the right one); but there would be corruption if that memory got
reused for some other purpose.

This is not unique: it's liable to be a problem whenever the kernel tries to
approach a structure obliquely.  It's generally solved with an atomic
reference count; but one advantage of anon_vma over anonmm is that it does not
have such a count, and it would be a backward step to add one.

Therefore...  implement a SLAB_DESTROY_BY_RCU flag, to guarantee that such a
kmem_cache_alloc'ed structure cannot be freed for some other use while
rcu_read_lock is held (i.e. while preemption is disabled); and use that for
anon_vma.
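
In outline, the lookup discipline this flag supports is the one the new
page_lock_anon_vma helper (added below) follows; here is an illustrative
sketch, not part of the patch itself - the helper name is hypothetical,
the fields and checks are those of the real code:

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>

/*
 * Sketch of the SLAB_DESTROY_BY_RCU lookup pattern.  Under rcu_read_lock
 * the anon_vma's slab pages cannot be returned to the page allocator or
 * reused by another cache, so taking anon_vma->lock is safe even if the
 * anon_vma was freed meanwhile; the page_mapped check then says whether
 * we found a live anon_vma for this page or only a stale/recycled one.
 */
static struct anon_vma *lock_stable_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();			/* pins the slab memory */
	anon_mapping = (unsigned long) page->mapping;
	if (!(anon_mapping & PAGE_MAPPING_ANON))
		goto out;			/* not anonymous (any more) */
	if (!page_mapped(page))
		goto out;			/* anon_vma may be stale */

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);		/* safe: still a struct anon_vma */
out:
	rcu_read_unlock();
	return anon_vma;
}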

Fix concerns raised by Manfred: this flag is incompatible with poisoning and
with a destructor, and kmem_cache_destroy needs to call synchronize_kernel().
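
For reference, a cache that obeys these rules is set up roughly as in the
sketch below (illustrative only: my_obj, my_obj_ctor and my_cachep are
hypothetical names; anon_vma is the real user converted in this patch):

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_obj {				/* hypothetical RCU-looked-up object */
	spinlock_t lock;
};

static kmem_cache_t *my_cachep;

static void my_obj_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
{
	struct my_obj *obj = data;

	/* initialize once per object, as anon_vma_ctor does */
	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
						SLAB_CTOR_CONSTRUCTOR)
		spin_lock_init(&obj->lock);
}

void __init my_cache_init(void)
{
	/*
	 * SLAB_DESTROY_BY_RCU defers freeing of the cache's pages to RCU.
	 * No SLAB_POISON and no destructor: either would scribble on or
	 * tear down an object that an rcu_read_lock holder may still be
	 * examining.
	 */
	my_cachep = kmem_cache_create("my_obj", sizeof(struct my_obj), 0,
			SLAB_DESTROY_BY_RCU|SLAB_PANIC, my_obj_ctor, NULL);
}

void my_cache_exit(void)
{
	/*
	 * kmem_cache_destroy now calls synchronize_kernel() for such
	 * caches, so pending RCU frees have completed before the cache
	 * itself is torn down.
	 */
	kmem_cache_destroy(my_cachep);
}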

I hope SLAB_DESTROY_BY_RCU may be useful elsewhere; but though it's safe for
little anon_vma, I'd be reluctant to use it on any caches whose immediate
shrinkage under pressure is important to the system.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent edcc56dc
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -45,6 +45,7 @@ typedef struct kmem_cache_s kmem_cache_t;
 #define SLAB_RECLAIM_ACCOUNT	0x00020000UL	/* track pages allocated to indicate
 						   what is reclaimable later*/
 #define SLAB_PANIC		0x00040000UL	/* panic if kmem_cache_create() fails */
+#define SLAB_DESTROY_BY_RCU	0x00080000UL	/* defer freeing pages to RCU */
 
 /* flags passed to a constructor func */
 #define SLAB_CTOR_CONSTRUCTOR	0x001UL		/* if not set, then deconstructor */
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
+#include <linux/rcupdate.h>
 
 #include <asm/tlbflush.h>
@@ -159,8 +160,31 @@ static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 
 void __init anon_vma_init(void)
 {
-	anon_vma_cachep = kmem_cache_create("anon_vma",
-			sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
+	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
+}
+
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ */
+static struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+	struct anon_vma *anon_vma = NULL;
+	unsigned long anon_mapping;
+
+	rcu_read_lock();
+	anon_mapping = (unsigned long) page->mapping;
+	if (!(anon_mapping & PAGE_MAPPING_ANON))
+		goto out;
+	if (!page_mapped(page))
+		goto out;
+
+	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+out:
+	rcu_read_unlock();
+	return anon_vma;
 }
 
 /*
@@ -238,19 +262,15 @@ static int page_referenced_one(struct page *page,
 static int page_referenced_anon(struct page *page)
 {
 	unsigned int mapcount;
-	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
+	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
 	int referenced = 0;
 
-	/*
-	 * Recheck mapcount: it is not safe to take anon_vma->lock after
-	 * last page_remove_rmap, since struct anon_vma might be reused.
-	 */
-	mapcount = page_mapcount(page);
-	if (!mapcount)
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
 		return referenced;
 
-	spin_lock(&anon_vma->lock);
+	mapcount = page_mapcount(page);
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
@@ -634,18 +654,14 @@ static int try_to_unmap_cluster(unsigned long cursor,
 
 static int try_to_unmap_anon(struct page *page)
 {
-	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
+	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
-	/*
-	 * Recheck mapped: it is not safe to take anon_vma->lock after
-	 * last page_remove_rmap, since struct anon_vma might be reused.
-	 */
-	if (!page_mapped(page))
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
 		return ret;
 
-	spin_lock(&anon_vma->lock);
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		ret = try_to_unmap_one(page, vma);
 		if (ret == SWAP_FAIL || !page_mapped(page))
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -91,6 +91,7 @@
 #include <linux/cpu.h>
 #include <linux/sysctl.h>
 #include <linux/module.h>
+#include <linux/rcupdate.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -139,11 +140,13 @@
 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
-			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC)
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
-			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC)
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
 #endif
 
 /*
@@ -189,6 +192,28 @@ struct slab {
 	kmem_bufctl_t		free;
 };
 
+/*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU.  This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking.  We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ *
+ * We assume struct slab_rcu can overlay struct slab when destroying.
+ */
+struct slab_rcu {
+	struct rcu_head		head;
+	kmem_cache_t		*cachep;
+	void			*addr;
+};
+
 /*
  * struct array_cache
  *
@@ -873,6 +898,16 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
 	atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
 }
 
+static void kmem_rcu_free(struct rcu_head *head)
+{
+	struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
+	kmem_cache_t *cachep = slab_rcu->cachep;
+
+	kmem_freepages(cachep, slab_rcu->addr);
+	if (OFF_SLAB(cachep))
+		kmem_cache_free(cachep->slabp_cache, slab_rcu);
+}
+
 #if DEBUG
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1026,6 +1061,8 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
  */
 static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 {
+	void *addr = slabp->s_mem - slabp->colouroff;
+
 #if DEBUG
 	int i;
 	for (i = 0; i < cachep->num; i++) {
@@ -1061,10 +1098,19 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 		}
 	}
 #endif
-	kmem_freepages(cachep, slabp->s_mem-slabp->colouroff);
-	if (OFF_SLAB(cachep))
-		kmem_cache_free(cachep->slabp_cache, slabp);
+
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
+		struct slab_rcu *slab_rcu;
+
+		slab_rcu = (struct slab_rcu *) slabp;
+		slab_rcu->cachep = cachep;
+		slab_rcu->addr = addr;
+		call_rcu(&slab_rcu->head, kmem_rcu_free);
+	} else {
+		kmem_freepages(cachep, addr);
+		if (OFF_SLAB(cachep))
+			kmem_cache_free(cachep->slabp_cache, slabp);
+	}
 }
 
 /**
@@ -1139,9 +1185,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 */
 	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
 		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
-	flags |= SLAB_POISON;
+	if (!(flags & SLAB_DESTROY_BY_RCU))
+		flags |= SLAB_POISON;
 #endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(flags & SLAB_POISON);
 #endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(dtor);
+
 	/*
 	 * Always checks flags, a caller might be expecting debug
 	 * support which isn't available.
@@ -1553,6 +1605,9 @@ int kmem_cache_destroy (kmem_cache_t * cachep)
 		return 1;
 	}
 
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+		synchronize_kernel();
+
 	/* no cpu_online check required here since we clear the percpu
 	 * array on cpu offline and set this to NULL.
 	 */