Commit cc330b55 authored by David S. Miller's avatar David S. Miller

Merge branch 'rhashtable-next'

Herbert Xu says:

====================
rhashtable: Multiple rehashing

This series introduces multiple rehashing.

Recall that the original implementation in br_multicast used
two list pointers per hash node and therefore is limited to at
most one rehash at a time since you need one list pointer for
the old table and one for the new table.

Thanks to Josh Triplett's suggestion of using a single list pointer
we're no longer limited by that.  So it is perfectly OK to have
an arbitrary number of tables in existence at any one time.

The reader and removal simply has to walk from the oldest table
to the newest table in order not to miss anything.  Insertion
without lookup are just as easy as we simply go to the last table
that we can find and add the entry there.

However, insertion with uniqueness lookup is more complicated
because we need to ensure that two simultaneous insertions of the
same key do not both succeed.  To achieve this, all insertions
including those without lookups are required to obtain the bucket
lock from the oldest hash table that is still alive.  This is
determined by having the rehasher (there is only one rehashing
thread in the system) keep a pointer of where it is up to.  If
a bucket has already been rehashed then it is dead, i.e., there
cannot be any more insertions to it, otherwise it is considered
alive.  This guarantees that the same key cannot be inserted
in two different tables in parallel.

Patch 1 is actually a bug fix for the walker.

Patch 2-5 eliminates unnecessary out-of-line copies of jhash.

Patch 6 makes rhashtable_shrink shrink to fit.

Patch 7 introduces multiple rehashing.  This means that if we
decide to grow then we will grow regardless of whether the previous
one has finished.  However, this is still asynchronous meaning
that if insertions come fast enough we may still end up with a
table that is overutilised.

Patch 8 adds support for GFP_ATOMIC allocations of struct bucket_table.

Finally patch 9 enables immediate rehashing.  This is done either
when the table reaches 100% utilisation, or when the chain length
exceeds 16 (the latter can be disabled on request, e.g., for
nft_hash.

With these patches the system should no longer have any trouble
dealing with fast insertions on a small table.  In the worst
case you end up with a list of tables that's log N in length
while the rehasher catches up.

v3 restores rhashtable_shrink and fixes a number of bugs in the
multiple rehashing patches (7 and 9).
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents e167359b ccd57b1b
......@@ -19,6 +19,7 @@
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
......@@ -102,8 +103,9 @@ struct rhashtable;
* @max_size: Maximum size while expanding
* @min_size: Minimum size while shrinking
* @nulls_base: Base value to generate nulls marker
* @insecure_elasticity: Set to true to disable chain length checks
* @locks_mul: Number of bucket locks to allocate per cpu (default: 128)
* @hashfn: Function to hash key
* @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
* @obj_hashfn: Function to hash object
* @obj_cmpfn: Function to compare key with object
*/
......@@ -115,6 +117,7 @@ struct rhashtable_params {
unsigned int max_size;
unsigned int min_size;
u32 nulls_base;
bool insecure_elasticity;
size_t locks_mul;
rht_hashfn_t hashfn;
rht_obj_hashfn_t obj_hashfn;
......@@ -125,6 +128,8 @@ struct rhashtable_params {
* struct rhashtable - Hash table handle
* @tbl: Bucket table
* @nelems: Number of elements in table
* @key_len: Key length for hashfn
* @elasticity: Maximum chain length before rehash
* @p: Configuration parameters
* @run_work: Deferred worker to expand/shrink asynchronously
* @mutex: Mutex to protect current/future table swapping
......@@ -134,6 +139,8 @@ struct rhashtable {
struct bucket_table __rcu *tbl;
atomic_t nelems;
bool being_destroyed;
unsigned int key_len;
unsigned int elasticity;
struct rhashtable_params p;
struct work_struct run_work;
struct mutex mutex;
......@@ -199,9 +206,31 @@ static inline unsigned int rht_key_hashfn(
struct rhashtable *ht, const struct bucket_table *tbl,
const void *key, const struct rhashtable_params params)
{
return rht_bucket_index(tbl, params.hashfn(key, params.key_len ?:
ht->p.key_len,
tbl->hash_rnd));
unsigned hash;
/* params must be equal to ht->p if it isn't constant. */
if (!__builtin_constant_p(params.key_len))
hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd);
else if (params.key_len) {
unsigned key_len = params.key_len;
if (params.hashfn)
hash = params.hashfn(key, key_len, tbl->hash_rnd);
else if (key_len & (sizeof(u32) - 1))
hash = jhash(key, key_len, tbl->hash_rnd);
else
hash = jhash2(key, key_len / sizeof(u32),
tbl->hash_rnd);
} else {
unsigned key_len = ht->p.key_len;
if (params.hashfn)
hash = params.hashfn(key, key_len, tbl->hash_rnd);
else
hash = jhash(key, key_len, tbl->hash_rnd);
}
return rht_bucket_index(tbl, hash);
}
static inline unsigned int rht_head_hashfn(
......@@ -241,6 +270,17 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht,
tbl->size > ht->p.min_size;
}
/**
* rht_grow_above_100 - returns true if nelems > table-size
* @ht: hash table
* @tbl: current table
*/
static inline bool rht_grow_above_100(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
return atomic_read(&ht->nelems) > tbl->size;
}
/* The bucket lock is selected based on the hash and protects mutations
* on a group of hash buckets.
*
......@@ -282,9 +322,7 @@ int rhashtable_init(struct rhashtable *ht,
int rhashtable_insert_slow(struct rhashtable *ht, const void *key,
struct rhash_head *obj,
struct bucket_table *old_tbl);
int rhashtable_expand(struct rhashtable *ht);
int rhashtable_shrink(struct rhashtable *ht);
int rhashtable_insert_rehash(struct rhashtable *ht);
int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter);
void rhashtable_walk_exit(struct rhashtable_iter *iter);
......@@ -507,43 +545,64 @@ static inline int __rhashtable_insert_fast(
.ht = ht,
.key = key,
};
int err = -EEXIST;
struct bucket_table *tbl, *new_tbl;
struct rhash_head *head;
spinlock_t *lock;
unsigned elasticity;
unsigned hash;
int err;
restart:
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
/* All insertions must grab the oldest table containing
* the hashed bucket that is yet to be rehashed.
*/
for (;;) {
hash = rht_head_hashfn(ht, tbl, obj, params);
lock = rht_bucket_lock(tbl, hash);
spin_lock_bh(lock);
/* Because we have already taken the bucket lock in tbl,
* if we find that future_tbl is not yet visible then
* that guarantees all other insertions of the same entry
* will also grab the bucket lock in tbl because until
* the rehash completes ht->tbl won't be changed.
*/
if (tbl->rehash <= hash)
break;
spin_unlock_bh(lock);
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
}
new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(new_tbl)) {
err = rhashtable_insert_slow(ht, key, obj, new_tbl);
if (err == -EAGAIN)
goto slow_path;
goto out;
}
if (!key)
goto skip_lookup;
if (unlikely(rht_grow_above_100(ht, tbl))) {
slow_path:
spin_unlock_bh(lock);
rcu_read_unlock();
err = rhashtable_insert_rehash(ht);
if (err)
return err;
goto restart;
}
err = -EEXIST;
elasticity = ht->elasticity;
rht_for_each(head, tbl, hash) {
if (unlikely(!(params.obj_cmpfn ?
if (key &&
unlikely(!(params.obj_cmpfn ?
params.obj_cmpfn(&arg, rht_obj(ht, head)) :
rhashtable_compare(&arg, rht_obj(ht, head)))))
goto out;
if (!--elasticity)
goto slow_path;
}
skip_lookup:
err = 0;
head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
......
......@@ -58,7 +58,8 @@ EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#endif
static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
gfp_t gfp)
{
unsigned int i, size;
#if defined(CONFIG_PROVE_LOCKING)
......@@ -75,12 +76,13 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
if (sizeof(spinlock_t) != 0) {
#ifdef CONFIG_NUMA
if (size * sizeof(spinlock_t) > PAGE_SIZE)
if (size * sizeof(spinlock_t) > PAGE_SIZE &&
gfp == GFP_KERNEL)
tbl->locks = vmalloc(size * sizeof(spinlock_t));
else
#endif
tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
GFP_KERNEL);
gfp);
if (!tbl->locks)
return -ENOMEM;
for (i = 0; i < size; i++)
......@@ -105,23 +107,25 @@ static void bucket_table_free_rcu(struct rcu_head *head)
}
static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
size_t nbuckets)
size_t nbuckets,
gfp_t gfp)
{
struct bucket_table *tbl = NULL;
size_t size;
int i;
size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
if (tbl == NULL)
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
gfp != GFP_KERNEL)
tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
if (tbl == NULL && gfp == GFP_KERNEL)
tbl = vzalloc(size);
if (tbl == NULL)
return NULL;
tbl->size = nbuckets;
if (alloc_bucket_locks(ht, tbl) < 0) {
if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
bucket_table_free(tbl);
return NULL;
}
......@@ -136,11 +140,24 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
return tbl;
}
static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
struct bucket_table *tbl)
{
struct bucket_table *new_tbl;
do {
new_tbl = tbl;
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
} while (tbl);
return new_tbl;
}
static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
struct bucket_table *new_tbl =
rht_dereference(old_tbl->future_tbl, ht) ?: old_tbl;
struct bucket_table *new_tbl = rhashtable_last_table(ht,
rht_dereference_rcu(old_tbl->future_tbl, ht));
struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
int err = -ENOENT;
struct rhash_head *head, *next, *entry;
......@@ -196,12 +213,18 @@ static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned old_hash)
spin_unlock_bh(old_bucket_lock);
}
static void rhashtable_rehash(struct rhashtable *ht,
static int rhashtable_rehash_attach(struct rhashtable *ht,
struct bucket_table *old_tbl,
struct bucket_table *new_tbl)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
struct rhashtable_walker *walker;
unsigned old_hash;
/* Protect future_tbl using the first bucket lock. */
spin_lock_bh(old_tbl->locks);
/* Did somebody beat us to it? */
if (rcu_access_pointer(old_tbl->future_tbl)) {
spin_unlock_bh(old_tbl->locks);
return -EEXIST;
}
/* Make insertions go into the new, empty table right away. Deletions
* and lookups will be attempted in both tables until we synchronize.
......@@ -211,6 +234,22 @@ static void rhashtable_rehash(struct rhashtable *ht,
/* Ensure the new table is visible to readers. */
smp_wmb();
spin_unlock_bh(old_tbl->locks);
return 0;
}
static int rhashtable_rehash_table(struct rhashtable *ht)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
struct bucket_table *new_tbl;
struct rhashtable_walker *walker;
unsigned old_hash;
new_tbl = rht_dereference(old_tbl->future_tbl, ht);
if (!new_tbl)
return 0;
for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
rhashtable_rehash_chain(ht, old_hash);
......@@ -225,6 +264,8 @@ static void rhashtable_rehash(struct rhashtable *ht,
* remain.
*/
call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}
/**
......@@ -242,27 +283,32 @@ static void rhashtable_rehash(struct rhashtable *ht,
* It is valid to have concurrent insertions and deletions protected by per
* bucket locks or concurrent RCU protected lookups and traversals.
*/
int rhashtable_expand(struct rhashtable *ht)
static int rhashtable_expand(struct rhashtable *ht)
{
struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
int err;
ASSERT_RHT_MUTEX(ht);
new_tbl = bucket_table_alloc(ht, old_tbl->size * 2);
old_tbl = rhashtable_last_table(ht, old_tbl);
new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
if (new_tbl == NULL)
return -ENOMEM;
rhashtable_rehash(ht, new_tbl);
return 0;
err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
if (err)
bucket_table_free(new_tbl);
return err;
}
EXPORT_SYMBOL_GPL(rhashtable_expand);
/**
* rhashtable_shrink - Shrink hash table while allowing concurrent lookups
* @ht: the hash table to shrink
*
* This function may only be called in a context where it is safe to call
* synchronize_rcu(), e.g. not within a rcu_read_lock() section.
* This function shrinks the hash table to fit, i.e., the smallest
* size would not cause it to expand right away automatically.
*
* The caller must ensure that no concurrent resizing occurs by holding
* ht->mutex.
......@@ -273,25 +319,39 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
* It is valid to have concurrent insertions and deletions protected by per
* bucket locks or concurrent RCU protected lookups and traversals.
*/
int rhashtable_shrink(struct rhashtable *ht)
static int rhashtable_shrink(struct rhashtable *ht)
{
struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
unsigned size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2);
int err;
ASSERT_RHT_MUTEX(ht);
new_tbl = bucket_table_alloc(ht, old_tbl->size / 2);
if (size < ht->p.min_size)
size = ht->p.min_size;
if (old_tbl->size <= size)
return 0;
if (rht_dereference(old_tbl->future_tbl, ht))
return -EEXIST;
new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
if (new_tbl == NULL)
return -ENOMEM;
rhashtable_rehash(ht, new_tbl);
return 0;
err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
if (err)
bucket_table_free(new_tbl);
return err;
}
EXPORT_SYMBOL_GPL(rhashtable_shrink);
static void rht_deferred_worker(struct work_struct *work)
{
struct rhashtable *ht;
struct bucket_table *tbl;
int err = 0;
ht = container_of(work, struct rhashtable, run_work);
mutex_lock(&ht->mutex);
......@@ -299,29 +359,92 @@ static void rht_deferred_worker(struct work_struct *work)
goto unlock;
tbl = rht_dereference(ht->tbl, ht);
tbl = rhashtable_last_table(ht, tbl);
if (rht_grow_above_75(ht, tbl))
rhashtable_expand(ht);
else if (rht_shrink_below_30(ht, tbl))
rhashtable_shrink(ht);
err = rhashtable_rehash_table(ht);
unlock:
mutex_unlock(&ht->mutex);
if (err)
schedule_work(&ht->run_work);
}
static bool rhashtable_check_elasticity(struct rhashtable *ht,
struct bucket_table *tbl,
unsigned hash)
{
unsigned elasticity = ht->elasticity;
struct rhash_head *head;
rht_for_each(head, tbl, hash)
if (!--elasticity)
return true;
return false;
}
int rhashtable_insert_rehash(struct rhashtable *ht)
{
struct bucket_table *old_tbl;
struct bucket_table *new_tbl;
struct bucket_table *tbl;
unsigned int size;
int err;
old_tbl = rht_dereference_rcu(ht->tbl, ht);
tbl = rhashtable_last_table(ht, old_tbl);
size = tbl->size;
if (rht_grow_above_75(ht, tbl))
size *= 2;
/* More than two rehashes (not resizes) detected. */
else if (WARN_ON(old_tbl != tbl && old_tbl->size == size))
return -EBUSY;
new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
if (new_tbl == NULL)
return -ENOMEM;
err = rhashtable_rehash_attach(ht, tbl, new_tbl);
if (err) {
bucket_table_free(new_tbl);
if (err == -EEXIST)
err = 0;
} else
schedule_work(&ht->run_work);
return err;
}
EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
int rhashtable_insert_slow(struct rhashtable *ht, const void *key,
struct rhash_head *obj,
struct bucket_table *tbl)
{
struct rhash_head *head;
unsigned hash;
int err = -EEXIST;
int err;
tbl = rhashtable_last_table(ht, tbl);
hash = head_hashfn(ht, tbl, obj);
spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
err = -EEXIST;
if (key && rhashtable_lookup_fast(ht, key, ht->p))
goto exit;
err = -EAGAIN;
if (rhashtable_check_elasticity(ht, tbl, hash) ||
rht_grow_above_100(ht, tbl))
goto exit;
err = 0;
head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
......@@ -477,6 +600,9 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
iter->skip = 0;
}
/* Ensure we see any new tables. */
smp_rmb();
iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (iter->walker->tbl) {
iter->slot = 0;
......@@ -529,6 +655,11 @@ static size_t rounded_hashtable_size(const struct rhashtable_params *params)
(unsigned long)params->min_size);
}
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
{
return jhash2(key, length, seed);
}
/**
* rhashtable_init - initialize a new hash table
* @ht: hash table to be initialized
......@@ -580,7 +711,7 @@ int rhashtable_init(struct rhashtable *ht,
size = HASH_DEFAULT_SIZE;
if ((!(params->key_len && params->hashfn) && !params->obj_hashfn) ||
if ((!params->key_len && !params->obj_hashfn) ||
(params->obj_hashfn && !params->obj_cmpfn))
return -EINVAL;
......@@ -602,12 +733,25 @@ int rhashtable_init(struct rhashtable *ht,
ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
if (!params->insecure_elasticity)
ht->elasticity = 16;
if (params->locks_mul)
ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
else
ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
tbl = bucket_table_alloc(ht, size);
ht->key_len = ht->p.key_len;
if (!params->hashfn) {
ht->p.hashfn = jhash;
if (!(ht->key_len & (sizeof(u32) - 1))) {
ht->key_len /= sizeof(u32);
ht->p.hashfn = rhashtable_jhash2;
}
}
tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
......
......@@ -155,30 +155,6 @@ static int __init test_rhashtable(struct rhashtable *ht)
test_rht_lookup(ht);
rcu_read_unlock();
for (i = 0; i < TEST_NEXPANDS; i++) {
pr_info(" Table expansion iteration %u...\n", i);
mutex_lock(&ht->mutex);
rhashtable_expand(ht);
mutex_unlock(&ht->mutex);
rcu_read_lock();
pr_info(" Verifying lookups...\n");
test_rht_lookup(ht);
rcu_read_unlock();
}
for (i = 0; i < TEST_NEXPANDS; i++) {
pr_info(" Table shrinkage iteration %u...\n", i);
mutex_lock(&ht->mutex);
rhashtable_shrink(ht);
mutex_unlock(&ht->mutex);
rcu_read_lock();
pr_info(" Verifying lookups...\n");
test_rht_lookup(ht);
rcu_read_unlock();
}
rcu_read_lock();
test_bucket_stats(ht, true);
rcu_read_unlock();
......
......@@ -3133,13 +3133,12 @@ static inline u32 netlink_hash(const void *data, u32 seed)
struct netlink_compare_arg arg;
netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid);
return jhash(&arg, netlink_compare_arg_len, seed);
return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed);
}
static const struct rhashtable_params netlink_rhashtable_params = {
.head_offset = offsetof(struct netlink_sock, node),
.key_len = netlink_compare_arg_len,
.hashfn = jhash,
.obj_hashfn = netlink_hash,
.obj_cmpfn = netlink_compare,
.max_size = 65536,
......
......@@ -35,7 +35,6 @@
*/
#include <linux/rhashtable.h>
#include <linux/jhash.h>
#include "core.h"
#include "name_table.h"
#include "node.h"
......@@ -2294,7 +2293,6 @@ static const struct rhashtable_params tsk_rht_params = {
.head_offset = offsetof(struct tipc_sock, node),
.key_offset = offsetof(struct tipc_sock, portid),
.key_len = sizeof(u32), /* portid */
.hashfn = jhash,
.max_size = 1048576,
.min_size = 256,
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment