Commit bae2e81a authored by David S. Miller

Merge branch 'concurrent_hash_tables'

Thomas Graf says:

====================
Lockless netlink_lookup() with new concurrent hash table

Netlink sockets are maintained in a hash table to allow efficient lookup
via the port ID for unicast messages. However, lookups currently require
a read lock to be taken. This series adds a new generic, resizable,
scalable, concurrent hash table based on the paper referenced in the first
patch. It then makes use of the new data type to implement lockless
netlink_lookup().

Patch 3/3 to convert nft_hash is included for reference but should be
merged via the netfilter tree. Inclusion in this series is to provide
context for the suggested API.

Against net-next since the initial user of the new hash table is in net/

Changes:
v4-v5:
 - use GFP_KERNEL to alloc Netlink buckets as suggested by Nikolay
   Aleksandrov
 - free nft hash element on removal as spotted by Nikolay Aleksandrov
   and Patrick McHardy
v3-v4:
 - fixed wrong shift assignment placement as spotted by Nikolay Aleksandrov
 - reverted default size of nft_hash to 4 as requested by Patrick McHardy,
   default size for other hash tables remains at 64 if no hint is given
 - fixed copyright as requested by Patrick McHardy
v2-v3:
 - fixed typo in nft_hash_destroy() when passing rhashtable handle
v1-v2:
 - fixed traversal off-by-one as spotted by Tobias Klauser
 - removed unlikely() from BUG_ON() as spotted by Josh Triplett
 - new 3rd patch to convert nft_hash to rhashtable
 - make rhashtable_insert() return void
 - nl_sk_hash_lock must be a mutex
 - fixed wrong name of rht_shrink_below_30()
 - exported symbols rht_grow_above_75() and rht_shrink_below_30()
 - allow table freeing with RCU callback
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents d39a9ffc cfe4a9dd
/*
* Resizable, Scalable, Concurrent Hash Table
*
* Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
* Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
*
* Based on the following paper by Josh Triplett, Paul E. McKenney
* and Jonathan Walpole:
* https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf
*
* Code partially derived from nft_hash
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H
#include <linux/rculist.h>
/**
* struct rhash_head - Chain link embedded in each object stored in the table
* @next: Next link in the same hash chain
*/
struct rhash_head {
struct rhash_head *next;
};
/* Initialise a chain link by setting its next pointer to NULL. */
#define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL)
/**
* struct bucket_table - Array of hash chain heads
* @size: Number of entries in @buckets
* @buckets: Chain head for each bucket (RCU-annotated pointers)
*/
struct bucket_table {
size_t size;
struct rhash_head __rcu *buckets[];
};
/* Key hash callback: hashes @data of length @len using @seed. */
typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
/* Object hash callback: hashes the object at @data using @seed. */
typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 seed);
struct rhashtable;
/**
* struct rhashtable_params - Hash table construction parameters
* @nelem_hint: Hint on number of elements, should be 75% of desired size
* @key_len: Length of key
* @key_offset: Offset of key in struct to be hashed
* @head_offset: Offset of rhash_head in struct to be hashed
* @hash_rnd: Seed to use while hashing
* @max_shift: Maximum number of shifts while expanding
* @hashfn: Function to hash key
* @obj_hashfn: Function to hash object
* @grow_decision: If defined, may return true if table should expand
* @shrink_decision: If defined, may return true if table should shrink
* @mutex_is_held: Must return true (non-zero, the callback returns int) if
*                 protecting mutex is held
*
* rht_grow_above_75() and rht_shrink_below_30(), declared later in this
* header, have signatures compatible with @grow_decision and
* @shrink_decision respectively.
*/
struct rhashtable_params {
size_t nelem_hint;
size_t key_len;
size_t key_offset;
size_t head_offset;
u32 hash_rnd;
size_t max_shift;
rht_hashfn_t hashfn;
rht_obj_hashfn_t obj_hashfn;
bool (*grow_decision)(const struct rhashtable *ht,
size_t new_size);
bool (*shrink_decision)(const struct rhashtable *ht,
size_t new_size);
int (*mutex_is_held)(void);
};
/**
* struct rhashtable - Hash table handle
* @tbl: Bucket table (RCU-annotated pointer; read it through
*       rht_dereference() or rht_dereference_rcu())
* @nelems: Number of elements in table
* @shift: Current size (1 << shift)
* @p: Configuration parameters
*/
struct rhashtable {
struct bucket_table __rcu *tbl;
size_t nelems;
size_t shift;
struct rhashtable_params p;
};
/*
 * lockdep_rht_mutex_is_held - condition used by the rht_dereference*() macros
 *
 * Only CONFIG_PROVE_LOCKING builds get an out-of-line implementation; every
 * other build uses the stub below, which unconditionally reports 1.
 */
#ifndef CONFIG_PROVE_LOCKING
static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
{
	return 1;
}
#else
int lockdep_rht_mutex_is_held(const struct rhashtable *ht);
#endif /* CONFIG_PROVE_LOCKING */
/* Set up @ht from @params; returns an int status code. */
int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params);

/* Hash a raw key of @len, or an object pointer, for table @ht. */
u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len);
u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr);

/*
 * Insert/remove the chain link @node (or @obj). @flags is a gfp_t mask,
 * presumably used for allocations made while resizing -- confirm against
 * the implementation.
 */
void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node,
		       gfp_t flags);
bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node,
		       gfp_t flags);
void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
			     struct rhash_head **pprev, gfp_t flags);

/*
 * Resize policy helpers; their signatures match the grow_decision and
 * shrink_decision callbacks in struct rhashtable_params.
 */
bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size);
bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size);

/* Explicit resize requests; return an int status code. */
int rhashtable_expand(struct rhashtable *ht, gfp_t flags);
int rhashtable_shrink(struct rhashtable *ht, gfp_t flags);

/* Lookups: by key, or by precomputed @hash with a caller-supplied @compare. */
void *rhashtable_lookup(const struct rhashtable *ht, const void *key);
void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash,
				bool (*compare)(void *, void *), void *arg);

/* Tear down @ht. */
void rhashtable_destroy(const struct rhashtable *ht);
/*
* rht_dereference - fetch RCU-annotated pointer @p of table @ht; expands to
* rcu_dereference_protected() with lockdep_rht_mutex_is_held(@ht) as the
* condition.
*/
#define rht_dereference(p, ht) \
rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
/*
* rht_dereference_rcu - same as rht_dereference() but expands to
* rcu_dereference_check() with the same condition.
*/
#define rht_dereference_rcu(p, ht) \
rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht))
/* Internal, use rht_obj() instead */
#define rht_entry(ptr, type, member) container_of(ptr, type, member)

/*
 * rht_entry_safe - NULL-tolerant rht_entry(): yields NULL when @ptr is NULL,
 * otherwise the containing @type. @ptr is evaluated exactly once.
 */
#define rht_entry_safe(ptr, type, member) \
({ \
	typeof(ptr) __ptr = (ptr); \
	__ptr ? rht_entry(__ptr, type, member) : NULL; \
})

/*
 * rht_entry_safe_rcu - like rht_entry_safe(), but the non-NULL pointer is
 * passed through rcu_dereference_raw() before the container is computed.
 * @ptr is parenthesised everywhere so that compound expressions are cast
 * and dereferenced as a whole.
 */
#define rht_entry_safe_rcu(ptr, type, member) \
({ \
	typeof(*(ptr)) __rcu *__ptr = (typeof(*(ptr)) __rcu __force *)(ptr); \
	__ptr ? container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member) : NULL; \
})

/*
 * rht_next_entry_safe - entry following @pos on its chain, or NULL when
 * @pos is NULL or is the last entry. The next link is read with
 * rht_dereference(). @pos is expanded more than once, so pass a plain
 * variable without side effects.
 */
#define rht_next_entry_safe(pos, ht, member) \
({ \
	(pos) ? rht_entry_safe(rht_dereference((pos)->member.next, ht), \
			       typeof(*(pos)), member) : NULL; \
})
/**
* rht_for_each - iterate over hash chain
* @pos: &struct rhash_head to use as a loop cursor.
* @head: head of the hash chain (struct rhash_head *)
* @ht: pointer to your struct rhashtable
*
* Every link, including @head, is read with rht_dereference(); the loop
* ends when a NULL link is reached.
*/
#define rht_for_each(pos, head, ht) \
for (pos = rht_dereference(head, ht); \
pos; \
pos = rht_dereference((pos)->next, ht))
/**
* rht_for_each_entry - iterate over hash chain of given type
* @pos: type * to use as a loop cursor.
* @head: head of the hash chain (struct rhash_head *)
* @ht: pointer to your struct rhashtable
* @member: name of the rhash_head within the hashable struct.
*
* Links are read with rht_dereference(). @pos is expanded several times,
* so it must be a plain variable without side effects.
*/
#define rht_for_each_entry(pos, head, ht, member) \
for (pos = rht_entry_safe(rht_dereference(head, ht), \
typeof(*(pos)), member); \
pos; \
pos = rht_next_entry_safe(pos, ht, member))
/**
* rht_for_each_entry_safe - safely iterate over hash chain of given type
* @pos: type * to use as a loop cursor.
* @n: type * to use for temporary next object storage
* @head: head of the hash chain (struct rhash_head *)
* @ht: pointer to your struct rhashtable
* @member: name of the rhash_head within the hashable struct.
*
* This hash chain list-traversal primitive allows for the looped code to
* remove the loop cursor from the list.
*
* The successor is stored in @n before the loop body runs, so the body never
* needs to read @pos's own link after modifying it.
*/
#define rht_for_each_entry_safe(pos, n, head, ht, member) \
for (pos = rht_entry_safe(rht_dereference(head, ht), \
typeof(*(pos)), member), \
n = rht_next_entry_safe(pos, ht, member); \
pos; \
pos = n, \
n = rht_next_entry_safe(pos, ht, member))
/**
* rht_for_each_rcu - iterate over rcu hash chain
* @pos: &struct rhash_head to use as a loop cursor.
* @head: head of the hash chain (struct rhash_head *)
* @ht: pointer to your struct rhashtable
*
* This hash chain list-traversal primitive may safely run concurrently with
* the rhashtable mutation primitives such as rhashtable_insert() as long as
* the traversal is guarded by rcu_read_lock().
*/
#define rht_for_each_rcu(pos, head, ht) \
for (pos = rht_dereference_rcu(head, ht); \
pos; \
pos = rht_dereference_rcu((pos)->next, ht))
/**
* rht_for_each_entry_rcu - iterate over rcu hash chain of given type
* @pos: type * to use as a loop cursor.
* @head: head of the hash chain (struct rhash_head *)
* @member: name of the rhash_head within the hashable struct.
*
* This hash chain list-traversal primitive may safely run concurrently with
* the rhashtable mutation primitives such as rhashtable_insert() as long as
* the traversal is guarded by rcu_read_lock().
*
* Links are read through rht_entry_safe_rcu(), which uses
* rcu_dereference_raw(); the macro takes no table argument.
*/
#define rht_for_each_entry_rcu(pos, head, member) \
for (pos = rht_entry_safe_rcu(head, typeof(*(pos)), member); \
pos; \
pos = rht_entry_safe_rcu((pos)->member.next, \
typeof(*(pos)), member))
#endif /* _LINUX_RHASHTABLE_H */
...@@ -1550,6 +1550,14 @@ config TEST_STRING_HELPERS ...@@ -1550,6 +1550,14 @@ config TEST_STRING_HELPERS
config TEST_KSTRTOX config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime" tristate "Test kstrto*() family of functions at runtime"
config TEST_RHASHTABLE
bool "Perform selftest on resizable hash table"
default n
help
Enable this option to test the rhashtable functions at boot.
If unsure, say N.
endmenu # runtime tests endmenu # runtime tests
config PROVIDE_OHCI1394_DMA_INIT config PROVIDE_OHCI1394_DMA_INIT
......
...@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ ...@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o hash.o percpu-refcount.o percpu_ida.o hash.o rhashtable.o
obj-y += string_helpers.o obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += kstrtox.o obj-y += kstrtox.o
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#ifndef _AF_NETLINK_H #ifndef _AF_NETLINK_H
#define _AF_NETLINK_H #define _AF_NETLINK_H
#include <linux/rhashtable.h>
#include <net/sock.h> #include <net/sock.h>
#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
...@@ -47,6 +48,8 @@ struct netlink_sock { ...@@ -47,6 +48,8 @@ struct netlink_sock {
struct netlink_ring tx_ring; struct netlink_ring tx_ring;
atomic_t mapped; atomic_t mapped;
#endif /* CONFIG_NETLINK_MMAP */ #endif /* CONFIG_NETLINK_MMAP */
struct rhash_head node;
}; };
static inline struct netlink_sock *nlk_sk(struct sock *sk) static inline struct netlink_sock *nlk_sk(struct sock *sk)
...@@ -54,21 +57,8 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) ...@@ -54,21 +57,8 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
return container_of(sk, struct netlink_sock, sk); return container_of(sk, struct netlink_sock, sk);
} }
struct nl_portid_hash {
struct hlist_head *table;
unsigned long rehash_time;
unsigned int mask;
unsigned int shift;
unsigned int entries;
unsigned int max_shift;
u32 rnd;
};
struct netlink_table { struct netlink_table {
struct nl_portid_hash hash; struct rhashtable hash;
struct hlist_head mc_list; struct hlist_head mc_list;
struct listeners __rcu *listeners; struct listeners __rcu *listeners;
unsigned int flags; unsigned int flags;
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <linux/netlink.h> #include <linux/netlink.h>
#include <linux/sock_diag.h> #include <linux/sock_diag.h>
#include <linux/netlink_diag.h> #include <linux/netlink_diag.h>
#include <linux/rhashtable.h>
#include "af_netlink.h" #include "af_netlink.h"
...@@ -101,16 +102,20 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, ...@@ -101,16 +102,20 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
int protocol, int s_num) int protocol, int s_num)
{ {
struct netlink_table *tbl = &nl_table[protocol]; struct netlink_table *tbl = &nl_table[protocol];
struct nl_portid_hash *hash = &tbl->hash; struct rhashtable *ht = &tbl->hash;
const struct bucket_table *htbl = rht_dereference(ht->tbl, ht);
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
struct netlink_diag_req *req; struct netlink_diag_req *req;
struct netlink_sock *nlsk;
struct sock *sk; struct sock *sk;
int ret = 0, num = 0, i; int ret = 0, num = 0, i;
req = nlmsg_data(cb->nlh); req = nlmsg_data(cb->nlh);
for (i = 0; i <= hash->mask; i++) { for (i = 0; i < htbl->size; i++) {
sk_for_each(sk, &hash->table[i]) { rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) {
sk = (struct sock *)nlsk;
if (!net_eq(sock_net(sk), net)) if (!net_eq(sock_net(sk), net))
continue; continue;
if (num < s_num) { if (num < s_num) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment