Commit 70ae7222 authored by David S. Miller

Merge branch 'inet-frags-bring-rhashtables-to-IP-defrag'

Eric Dumazet says:

====================
inet: frags: bring rhashtables to IP defrag

IP defrag processing is one of the remaining problematic layers in Linux.

It uses static hash tables of 1024 buckets, and up to 128 items per bucket.

A work queue is supposed to garbage collect items when host is under memory
pressure, and doing a hash rebuild, changing seed used in hash computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if host is under fire.

Then there is the problem of sharing this hash table for all netns.

It is time to switch to rhashtables, and allocate one of them per netns
to speedup netns dismantle, since this is a critical metric these days.

Lookup is now using RCU, and 64bit hosts can now provision whatever amount
of memory needed to handle the expected workloads.

v2: Addressed feedback from Herbert and Kirill
  (Use rhashtable_free_and_destroy(), and split the big patch into small units)

v3: Removed the extra add_frag_mem_limit(...) from inet_frag_create()
    Removed the refcount_inc_not_zero() call from inet_frags_free_cb(),
    as we can exploit del_timer() return value.

v4: kbuild robot feedback about one missing static (squashed)
    Additional patches :
      inet: frags: do not clone skb in ip_expire()
      ipv6: frags: rewrite ip6_expire_frag_queue()
      rhashtable: reorganize struct rhashtable layout
      inet: frags: reorganize struct netns_frags
      inet: frags: get rid of ipfrag_skb_cb/FRAG_CB
      ipv6: frags: get rid of ip6frag_skb_cb/FRAG6_CB
      inet: frags: get rid of nf_ct_frag6_skb_cb/NFCT_FRAG6_CB
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 5749d6af f2d1c724
...@@ -133,14 +133,11 @@ min_adv_mss - INTEGER ...@@ -133,14 +133,11 @@ min_adv_mss - INTEGER
IP Fragmentation: IP Fragmentation:
ipfrag_high_thresh - INTEGER ipfrag_high_thresh - LONG INTEGER
Maximum memory used to reassemble IP fragments. When Maximum memory used to reassemble IP fragments.
ipfrag_high_thresh bytes of memory is allocated for this purpose,
the fragment handler will toss packets until ipfrag_low_thresh ipfrag_low_thresh - LONG INTEGER
is reached. This also serves as a maximum limit to namespaces (Obsolete since linux-4.17)
different from the initial one.
ipfrag_low_thresh - INTEGER
Maximum memory used to reassemble IP fragments before the kernel Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources. begins to remove incomplete fragment queues to free up resources.
The kernel still accepts new fragments for defragmentation. The kernel still accepts new fragments for defragmentation.
......
...@@ -152,25 +152,25 @@ struct rhashtable_params { ...@@ -152,25 +152,25 @@ struct rhashtable_params {
/** /**
* struct rhashtable - Hash table handle * struct rhashtable - Hash table handle
* @tbl: Bucket table * @tbl: Bucket table
* @nelems: Number of elements in table
* @key_len: Key length for hashfn * @key_len: Key length for hashfn
* @p: Configuration parameters
* @max_elems: Maximum number of elements in table * @max_elems: Maximum number of elements in table
* @p: Configuration parameters
* @rhlist: True if this is an rhltable * @rhlist: True if this is an rhltable
* @run_work: Deferred worker to expand/shrink asynchronously * @run_work: Deferred worker to expand/shrink asynchronously
* @mutex: Mutex to protect current/future table swapping * @mutex: Mutex to protect current/future table swapping
* @lock: Spin lock to protect walker list * @lock: Spin lock to protect walker list
* @nelems: Number of elements in table
*/ */
struct rhashtable { struct rhashtable {
struct bucket_table __rcu *tbl; struct bucket_table __rcu *tbl;
atomic_t nelems;
unsigned int key_len; unsigned int key_len;
struct rhashtable_params p;
unsigned int max_elems; unsigned int max_elems;
struct rhashtable_params p;
bool rhlist; bool rhlist;
struct work_struct run_work; struct work_struct run_work;
struct mutex mutex; struct mutex mutex;
spinlock_t lock; spinlock_t lock;
atomic_t nelems;
}; };
/** /**
......
...@@ -672,6 +672,7 @@ struct sk_buff { ...@@ -672,6 +672,7 @@ struct sk_buff {
* UDP receive path is one user. * UDP receive path is one user.
*/ */
unsigned long dev_scratch; unsigned long dev_scratch;
int ip_defrag_offset;
}; };
}; };
struct rb_node rbnode; /* used in netem & tcp stack */ struct rb_node rbnode; /* used in netem & tcp stack */
......
...@@ -2,14 +2,20 @@ ...@@ -2,14 +2,20 @@
#ifndef __NET_FRAG_H__ #ifndef __NET_FRAG_H__
#define __NET_FRAG_H__ #define __NET_FRAG_H__
#include <linux/rhashtable.h>
struct netns_frags { struct netns_frags {
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_t mem ____cacheline_aligned_in_smp;
/* sysctls */ /* sysctls */
long high_thresh;
long low_thresh;
int timeout; int timeout;
int high_thresh;
int low_thresh;
int max_dist; int max_dist;
struct inet_frags *f;
struct rhashtable rhashtable ____cacheline_aligned_in_smp;
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_long_t mem ____cacheline_aligned_in_smp;
}; };
/** /**
...@@ -25,12 +31,30 @@ enum { ...@@ -25,12 +31,30 @@ enum {
INET_FRAG_COMPLETE = BIT(2), INET_FRAG_COMPLETE = BIT(2),
}; };
struct frag_v4_compare_key {
__be32 saddr;
__be32 daddr;
u32 user;
u32 vif;
__be16 id;
u16 protocol;
};
struct frag_v6_compare_key {
struct in6_addr saddr;
struct in6_addr daddr;
u32 user;
__be32 id;
u32 iif;
};
/** /**
* struct inet_frag_queue - fragment queue * struct inet_frag_queue - fragment queue
* *
* @lock: spinlock protecting the queue * @node: rhash node
* @key: keys identifying this frag.
* @timer: queue expiration timer * @timer: queue expiration timer
* @list: hash bucket list * @lock: spinlock protecting this frag
* @refcnt: reference count of the queue * @refcnt: reference count of the queue
* @fragments: received fragments head * @fragments: received fragments head
* @fragments_tail: received fragments tail * @fragments_tail: received fragments tail
...@@ -40,12 +64,16 @@ enum { ...@@ -40,12 +64,16 @@ enum {
* @flags: fragment queue flags * @flags: fragment queue flags
* @max_size: maximum received fragment size * @max_size: maximum received fragment size
* @net: namespace that this frag belongs to * @net: namespace that this frag belongs to
* @list_evictor: list of queues to forcefully evict (e.g. due to low memory) * @rcu: rcu head for freeing deferall
*/ */
struct inet_frag_queue { struct inet_frag_queue {
spinlock_t lock; struct rhash_head node;
union {
struct frag_v4_compare_key v4;
struct frag_v6_compare_key v6;
} key;
struct timer_list timer; struct timer_list timer;
struct hlist_node list; spinlock_t lock;
refcount_t refcnt; refcount_t refcnt;
struct sk_buff *fragments; struct sk_buff *fragments;
struct sk_buff *fragments_tail; struct sk_buff *fragments_tail;
...@@ -55,100 +83,56 @@ struct inet_frag_queue { ...@@ -55,100 +83,56 @@ struct inet_frag_queue {
__u8 flags; __u8 flags;
u16 max_size; u16 max_size;
struct netns_frags *net; struct netns_frags *net;
struct hlist_node list_evictor; struct rcu_head rcu;
};
#define INETFRAGS_HASHSZ 1024
/* averaged:
* max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
* rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
* struct frag_queue))
*/
#define INETFRAGS_MAXDEPTH 128
struct inet_frag_bucket {
struct hlist_head chain;
spinlock_t chain_lock;
}; };
struct inet_frags { struct inet_frags {
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
struct work_struct frags_work;
unsigned int next_bucket;
unsigned long last_rebuild_jiffies;
bool rebuild;
/* The first call to hashfn is responsible to initialize
* rnd. This is best done with net_get_random_once.
*
* rnd_seqlock is used to let hash insertion detect
* when it needs to re-lookup the hash chain to use.
*/
u32 rnd;
seqlock_t rnd_seqlock;
unsigned int qsize; unsigned int qsize;
unsigned int (*hashfn)(const struct inet_frag_queue *);
bool (*match)(const struct inet_frag_queue *q,
const void *arg);
void (*constructor)(struct inet_frag_queue *q, void (*constructor)(struct inet_frag_queue *q,
const void *arg); const void *arg);
void (*destructor)(struct inet_frag_queue *); void (*destructor)(struct inet_frag_queue *);
void (*frag_expire)(struct timer_list *t); void (*frag_expire)(struct timer_list *t);
struct kmem_cache *frags_cachep; struct kmem_cache *frags_cachep;
const char *frags_cache_name; const char *frags_cache_name;
struct rhashtable_params rhash_params;
}; };
int inet_frags_init(struct inet_frags *); int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *); void inet_frags_fini(struct inet_frags *);
static inline void inet_frags_init_net(struct netns_frags *nf) static inline int inet_frags_init_net(struct netns_frags *nf)
{ {
atomic_set(&nf->mem, 0); atomic_long_set(&nf->mem, 0);
return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
} }
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); void inet_frags_exit_net(struct netns_frags *nf);
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); void inet_frag_kill(struct inet_frag_queue *q);
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f); void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
struct inet_frags *f, void *key, unsigned int hash);
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, static inline void inet_frag_put(struct inet_frag_queue *q)
const char *prefix);
static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
{ {
if (refcount_dec_and_test(&q->refcnt)) if (refcount_dec_and_test(&q->refcnt))
inet_frag_destroy(q, f); inet_frag_destroy(q);
}
static inline bool inet_frag_evicting(struct inet_frag_queue *q)
{
return !hlist_unhashed(&q->list_evictor);
} }
/* Memory Tracking Functions. */ /* Memory Tracking Functions. */
static inline int frag_mem_limit(struct netns_frags *nf) static inline long frag_mem_limit(const struct netns_frags *nf)
{
return atomic_read(&nf->mem);
}
static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
{ {
atomic_sub(i, &nf->mem); return atomic_long_read(&nf->mem);
} }
static inline void add_frag_mem_limit(struct netns_frags *nf, int i) static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
{ {
atomic_add(i, &nf->mem); atomic_long_sub(val, &nf->mem);
} }
static inline int sum_frag_mem_limit(struct netns_frags *nf) static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
{ {
return atomic_read(&nf->mem); atomic_long_add(val, &nf->mem);
} }
/* RFC 3168 support : /* RFC 3168 support :
......
...@@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s ...@@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s
return skb; return skb;
} }
#endif #endif
int ip_frag_mem(struct net *net);
/* /*
* Functions provided by ip_forward.c * Functions provided by ip_forward.c
......
...@@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev) ...@@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
idev->cnf.accept_ra; idev->cnf.accept_ra;
} }
#if IS_ENABLED(CONFIG_IPV6)
static inline int ip6_frag_mem(struct net *net)
{
return sum_frag_mem_limit(&net->ipv6.frags);
}
#endif
#define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */
#define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */
#define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */
...@@ -579,17 +572,8 @@ enum ip6_defrag_users { ...@@ -579,17 +572,8 @@ enum ip6_defrag_users {
__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
}; };
struct ip6_create_arg {
__be32 id;
u32 user;
const struct in6_addr *src;
const struct in6_addr *dst;
int iif;
u8 ecn;
};
void ip6_frag_init(struct inet_frag_queue *q, const void *a); void ip6_frag_init(struct inet_frag_queue *q, const void *a);
bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); extern const struct rhashtable_params ip6_rhash_params;
/* /*
* Equivalent of ipv4 struct ip * Equivalent of ipv4 struct ip
...@@ -597,19 +581,12 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); ...@@ -597,19 +581,12 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
struct frag_queue { struct frag_queue {
struct inet_frag_queue q; struct inet_frag_queue q;
__be32 id; /* fragment id */
u32 user;
struct in6_addr saddr;
struct in6_addr daddr;
int iif; int iif;
unsigned int csum;
__u16 nhoffset; __u16 nhoffset;
u8 ecn; u8 ecn;
}; };
void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
struct inet_frags *frags);
static inline bool ipv6_addr_any(const struct in6_addr *a) static inline bool ipv6_addr_any(const struct in6_addr *a)
{ {
......
...@@ -333,6 +333,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht) ...@@ -333,6 +333,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
err = rhashtable_rehash_chain(ht, old_hash); err = rhashtable_rehash_chain(ht, old_hash);
if (err) if (err)
return err; return err;
cond_resched();
} }
/* Publish the new table pointer. */ /* Publish the new table pointer. */
...@@ -1112,6 +1113,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, ...@@ -1112,6 +1113,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
for (i = 0; i < tbl->size; i++) { for (i = 0; i < tbl->size; i++) {
struct rhash_head *pos, *next; struct rhash_head *pos, *next;
cond_resched();
for (pos = rht_dereference(*rht_bucket(tbl, i), ht), for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
next = !rht_is_a_nulls(pos) ? next = !rht_is_a_nulls(pos) ?
rht_dereference(pos->next, ht) : NULL; rht_dereference(pos->next, ht) : NULL;
......
...@@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result; ...@@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
#define LOWPAN_DISPATCH_FRAG1 0xc0 #define LOWPAN_DISPATCH_FRAG1 0xc0
#define LOWPAN_DISPATCH_FRAGN 0xe0 #define LOWPAN_DISPATCH_FRAGN 0xe0
struct lowpan_create_arg { struct frag_lowpan_compare_key {
u16 tag; u16 tag;
u16 d_size; u16 d_size;
const struct ieee802154_addr *src; const struct ieee802154_addr src;
const struct ieee802154_addr *dst; const struct ieee802154_addr dst;
}; };
/* Equivalent of ipv4 struct ip /* Equivalent of ipv4 struct ipq
*/ */
struct lowpan_frag_queue { struct lowpan_frag_queue {
struct inet_frag_queue q; struct inet_frag_queue q;
u16 tag;
u16 d_size;
struct ieee802154_addr saddr;
struct ieee802154_addr daddr;
}; };
static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
{
switch (a->mode) {
case IEEE802154_ADDR_LONG:
return (((__force u64)a->extended_addr) >> 32) ^
(((__force u64)a->extended_addr) & 0xffffffff);
case IEEE802154_ADDR_SHORT:
return (__force u32)(a->short_addr + (a->pan_id << 16));
default:
return 0;
}
}
int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
void lowpan_net_frag_exit(void); void lowpan_net_frag_exit(void);
int lowpan_net_frag_init(void); int lowpan_net_frag_init(void);
......
...@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; ...@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
struct sk_buff *prev, struct net_device *ldev); struct sk_buff *prev, struct net_device *ldev);
static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
const struct ieee802154_addr *saddr,
const struct ieee802154_addr *daddr)
{
net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
return jhash_3words(ieee802154_addr_hash(saddr),
ieee802154_addr_hash(daddr),
(__force u32)(tag + (d_size << 16)),
lowpan_frags.rnd);
}
static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
{
const struct lowpan_frag_queue *fq;
fq = container_of(q, struct lowpan_frag_queue, q);
return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
}
static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
{
const struct lowpan_frag_queue *fq;
const struct lowpan_create_arg *arg = a;
fq = container_of(q, struct lowpan_frag_queue, q);
return fq->tag == arg->tag && fq->d_size == arg->d_size &&
ieee802154_addr_equal(&fq->saddr, arg->src) &&
ieee802154_addr_equal(&fq->daddr, arg->dst);
}
static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
{ {
const struct lowpan_create_arg *arg = a; const struct frag_lowpan_compare_key *key = a;
struct lowpan_frag_queue *fq; struct lowpan_frag_queue *fq;
fq = container_of(q, struct lowpan_frag_queue, q); fq = container_of(q, struct lowpan_frag_queue, q);
fq->tag = arg->tag; BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
fq->d_size = arg->d_size; memcpy(&q->key, key, sizeof(*key));
fq->saddr = *arg->src;
fq->daddr = *arg->dst;
} }
static void lowpan_frag_expire(struct timer_list *t) static void lowpan_frag_expire(struct timer_list *t)
...@@ -94,10 +62,10 @@ static void lowpan_frag_expire(struct timer_list *t) ...@@ -94,10 +62,10 @@ static void lowpan_frag_expire(struct timer_list *t)
if (fq->q.flags & INET_FRAG_COMPLETE) if (fq->q.flags & INET_FRAG_COMPLETE)
goto out; goto out;
inet_frag_kill(&fq->q, &lowpan_frags); inet_frag_kill(&fq->q);
out: out:
spin_unlock(&fq->q.lock); spin_unlock(&fq->q.lock);
inet_frag_put(&fq->q, &lowpan_frags); inet_frag_put(&fq->q);
} }
static inline struct lowpan_frag_queue * static inline struct lowpan_frag_queue *
...@@ -105,25 +73,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, ...@@ -105,25 +73,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
const struct ieee802154_addr *src, const struct ieee802154_addr *src,
const struct ieee802154_addr *dst) const struct ieee802154_addr *dst)
{ {
struct inet_frag_queue *q;
struct lowpan_create_arg arg;
unsigned int hash;
struct netns_ieee802154_lowpan *ieee802154_lowpan = struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net); net_ieee802154_lowpan(net);
struct frag_lowpan_compare_key key = {
.tag = cb->d_tag,
.d_size = cb->d_size,
.src = *src,
.dst = *dst,
};
struct inet_frag_queue *q;
arg.tag = cb->d_tag; q = inet_frag_find(&ieee802154_lowpan->frags, &key);
arg.d_size = cb->d_size; if (!q)
arg.src = src;
arg.dst = dst;
hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
q = inet_frag_find(&ieee802154_lowpan->frags,
&lowpan_frags, &arg, hash);
if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
}
return container_of(q, struct lowpan_frag_queue, q); return container_of(q, struct lowpan_frag_queue, q);
} }
...@@ -230,7 +193,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, ...@@ -230,7 +193,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
struct sk_buff *fp, *head = fq->q.fragments; struct sk_buff *fp, *head = fq->q.fragments;
int sum_truesize; int sum_truesize;
inet_frag_kill(&fq->q, &lowpan_frags); inet_frag_kill(&fq->q);
/* Make the one we just received the head. */ /* Make the one we just received the head. */
if (prev) { if (prev) {
...@@ -438,7 +401,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) ...@@ -438,7 +401,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
ret = lowpan_frag_queue(fq, skb, frag_type); ret = lowpan_frag_queue(fq, skb, frag_type);
spin_unlock(&fq->q.lock); spin_unlock(&fq->q.lock);
inet_frag_put(&fq->q, &lowpan_frags); inet_frag_put(&fq->q);
return ret; return ret;
} }
...@@ -448,23 +411,23 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) ...@@ -448,23 +411,23 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
} }
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
static int zero; static long zero;
static struct ctl_table lowpan_frags_ns_ctl_table[] = { static struct ctl_table lowpan_frags_ns_ctl_table[] = {
{ {
.procname = "6lowpanfrag_high_thresh", .procname = "6lowpanfrag_high_thresh",
.data = &init_net.ieee802154_lowpan.frags.high_thresh, .data = &init_net.ieee802154_lowpan.frags.high_thresh,
.maxlen = sizeof(int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &init_net.ieee802154_lowpan.frags.low_thresh .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
}, },
{ {
.procname = "6lowpanfrag_low_thresh", .procname = "6lowpanfrag_low_thresh",
.data = &init_net.ieee802154_lowpan.frags.low_thresh, .data = &init_net.ieee802154_lowpan.frags.low_thresh,
.maxlen = sizeof(int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &zero, .extra1 = &zero,
.extra2 = &init_net.ieee802154_lowpan.frags.high_thresh .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh
}, },
...@@ -581,14 +544,20 @@ static int __net_init lowpan_frags_init_net(struct net *net) ...@@ -581,14 +544,20 @@ static int __net_init lowpan_frags_init_net(struct net *net)
{ {
struct netns_ieee802154_lowpan *ieee802154_lowpan = struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net); net_ieee802154_lowpan(net);
int res;
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
ieee802154_lowpan->frags.f = &lowpan_frags;
inet_frags_init_net(&ieee802154_lowpan->frags); res = inet_frags_init_net(&ieee802154_lowpan->frags);
if (res < 0)
return lowpan_frags_ns_sysctl_register(net); return res;
res = lowpan_frags_ns_sysctl_register(net);
if (res < 0)
inet_frags_exit_net(&ieee802154_lowpan->frags);
return res;
} }
static void __net_exit lowpan_frags_exit_net(struct net *net) static void __net_exit lowpan_frags_exit_net(struct net *net)
...@@ -597,7 +566,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net) ...@@ -597,7 +566,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
net_ieee802154_lowpan(net); net_ieee802154_lowpan(net);
lowpan_frags_ns_sysctl_unregister(net); lowpan_frags_ns_sysctl_unregister(net);
inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); inet_frags_exit_net(&ieee802154_lowpan->frags);
} }
static struct pernet_operations lowpan_frags_ops = { static struct pernet_operations lowpan_frags_ops = {
...@@ -605,32 +574,63 @@ static struct pernet_operations lowpan_frags_ops = { ...@@ -605,32 +574,63 @@ static struct pernet_operations lowpan_frags_ops = {
.exit = lowpan_frags_exit_net, .exit = lowpan_frags_exit_net,
}; };
int __init lowpan_net_frag_init(void) static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
{ {
int ret; return jhash2(data,
sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
}
ret = lowpan_frags_sysctl_register(); static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
if (ret) {
return ret; const struct inet_frag_queue *fq = data;
ret = register_pernet_subsys(&lowpan_frags_ops); return jhash2((const u32 *)&fq->key,
if (ret) sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
goto err_pernet; }
static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
const struct frag_lowpan_compare_key *key = arg->key;
const struct inet_frag_queue *fq = ptr;
return !!memcmp(&fq->key, key, sizeof(*key));
}
static const struct rhashtable_params lowpan_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
.hashfn = lowpan_key_hashfn,
.obj_hashfn = lowpan_obj_hashfn,
.obj_cmpfn = lowpan_obj_cmpfn,
.automatic_shrinking = true,
};
int __init lowpan_net_frag_init(void)
{
int ret;
lowpan_frags.hashfn = lowpan_hashfn;
lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.constructor = lowpan_frag_init;
lowpan_frags.destructor = NULL; lowpan_frags.destructor = NULL;
lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.qsize = sizeof(struct frag_queue);
lowpan_frags.match = lowpan_frag_match;
lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frag_expire = lowpan_frag_expire;
lowpan_frags.frags_cache_name = lowpan_frags_cache_name; lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
lowpan_frags.rhash_params = lowpan_rhash_params;
ret = inet_frags_init(&lowpan_frags); ret = inet_frags_init(&lowpan_frags);
if (ret) if (ret)
goto err_pernet; goto out;
ret = lowpan_frags_sysctl_register();
if (ret)
goto err_sysctl;
ret = register_pernet_subsys(&lowpan_frags_ops);
if (ret)
goto err_pernet;
out:
return ret; return ret;
err_pernet: err_pernet:
lowpan_frags_sysctl_unregister(); lowpan_frags_sysctl_unregister();
err_sysctl:
inet_frags_fini(&lowpan_frags);
return ret; return ret;
} }
......
This diff is collapsed.
This diff is collapsed.
...@@ -54,7 +54,6 @@ ...@@ -54,7 +54,6 @@
static int sockstat_seq_show(struct seq_file *seq, void *v) static int sockstat_seq_show(struct seq_file *seq, void *v)
{ {
struct net *net = seq->private; struct net *net = seq->private;
unsigned int frag_mem;
int orphans, sockets; int orphans, sockets;
orphans = percpu_counter_sum_positive(&tcp_orphan_count); orphans = percpu_counter_sum_positive(&tcp_orphan_count);
...@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) ...@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplite_prot)); sock_prot_inuse_get(net, &udplite_prot));
seq_printf(seq, "RAW: inuse %d\n", seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot)); sock_prot_inuse_get(net, &raw_prot));
frag_mem = ip_frag_mem(net); seq_printf(seq, "FRAG: inuse %u memory %lu\n",
seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); atomic_read(&net->ipv4.frags.rhashtable.nelems),
frag_mem_limit(&net->ipv4.frags));
return 0; return 0;
} }
......
...@@ -52,18 +52,10 @@ ...@@ -52,18 +52,10 @@
static const char nf_frags_cache_name[] = "nf-frags"; static const char nf_frags_cache_name[] = "nf-frags";
struct nf_ct_frag6_skb_cb
{
struct inet6_skb_parm h;
int offset;
};
#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb))
static struct inet_frags nf_frags; static struct inet_frags nf_frags;
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
static int zero; static long zero;
static struct ctl_table nf_ct_frag6_sysctl_table[] = { static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{ {
...@@ -76,18 +68,18 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { ...@@ -76,18 +68,18 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{ {
.procname = "nf_conntrack_frag6_low_thresh", .procname = "nf_conntrack_frag6_low_thresh",
.data = &init_net.nf_frag.frags.low_thresh, .data = &init_net.nf_frag.frags.low_thresh,
.maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &zero, .extra1 = &zero,
.extra2 = &init_net.nf_frag.frags.high_thresh .extra2 = &init_net.nf_frag.frags.high_thresh
}, },
{ {
.procname = "nf_conntrack_frag6_high_thresh", .procname = "nf_conntrack_frag6_high_thresh",
.data = &init_net.nf_frag.frags.high_thresh, .data = &init_net.nf_frag.frags.high_thresh,
.maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &init_net.nf_frag.frags.low_thresh .extra1 = &init_net.nf_frag.frags.low_thresh
}, },
{ } { }
...@@ -152,23 +144,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) ...@@ -152,23 +144,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
} }
static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
const struct in6_addr *daddr)
{
net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
(__force u32)id, nf_frags.rnd);
}
static unsigned int nf_hashfn(const struct inet_frag_queue *q)
{
const struct frag_queue *nq;
nq = container_of(q, struct frag_queue, q);
return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
}
static void nf_ct_frag6_expire(struct timer_list *t) static void nf_ct_frag6_expire(struct timer_list *t)
{ {
struct inet_frag_queue *frag = from_timer(frag, t, timer); struct inet_frag_queue *frag = from_timer(frag, t, timer);
...@@ -178,34 +153,26 @@ static void nf_ct_frag6_expire(struct timer_list *t) ...@@ -178,34 +153,26 @@ static void nf_ct_frag6_expire(struct timer_list *t)
fq = container_of(frag, struct frag_queue, q); fq = container_of(frag, struct frag_queue, q);
net = container_of(fq->q.net, struct net, nf_frag.frags); net = container_of(fq->q.net, struct net, nf_frag.frags);
ip6_expire_frag_queue(net, fq, &nf_frags); ip6_expire_frag_queue(net, fq);
} }
/* Creation primitives. */ /* Creation primitives. */
static inline struct frag_queue *fq_find(struct net *net, __be32 id, static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
u32 user, struct in6_addr *src, const struct ipv6hdr *hdr, int iif)
struct in6_addr *dst, int iif, u8 ecn)
{ {
struct frag_v6_compare_key key = {
.id = id,
.saddr = hdr->saddr,
.daddr = hdr->daddr,
.user = user,
.iif = iif,
};
struct inet_frag_queue *q; struct inet_frag_queue *q;
struct ip6_create_arg arg;
unsigned int hash; q = inet_frag_find(&net->nf_frag.frags, &key);
if (!q)
arg.id = id;
arg.user = user;
arg.src = src;
arg.dst = dst;
arg.iif = iif;
arg.ecn = ecn;
local_bh_disable();
hash = nf_hash_frag(id, src, dst);
q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
local_bh_enable();
if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
}
return container_of(q, struct frag_queue, q); return container_of(q, struct frag_queue, q);
} }
...@@ -264,7 +231,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, ...@@ -264,7 +231,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
* this case. -DaveM * this case. -DaveM
*/ */
pr_debug("end of fragment not rounded to 8 bytes.\n"); pr_debug("end of fragment not rounded to 8 bytes.\n");
inet_frag_kill(&fq->q, &nf_frags); inet_frag_kill(&fq->q);
return -EPROTO; return -EPROTO;
} }
if (end > fq->q.len) { if (end > fq->q.len) {
...@@ -295,13 +262,13 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, ...@@ -295,13 +262,13 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
* this fragment, right? * this fragment, right?
*/ */
prev = fq->q.fragments_tail; prev = fq->q.fragments_tail;
if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { if (!prev || prev->ip_defrag_offset < offset) {
next = NULL; next = NULL;
goto found; goto found;
} }
prev = NULL; prev = NULL;
for (next = fq->q.fragments; next != NULL; next = next->next) { for (next = fq->q.fragments; next != NULL; next = next->next) {
if (NFCT_FRAG6_CB(next)->offset >= offset) if (next->ip_defrag_offset >= offset)
break; /* bingo! */ break; /* bingo! */
prev = next; prev = next;
} }
...@@ -317,14 +284,19 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, ...@@ -317,14 +284,19 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
/* Check for overlap with preceding fragment. */ /* Check for overlap with preceding fragment. */
if (prev && if (prev &&
(NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) (prev->ip_defrag_offset + prev->len) > offset)
goto discard_fq; goto discard_fq;
/* Look for overlap with succeeding segment. */ /* Look for overlap with succeeding segment. */
if (next && NFCT_FRAG6_CB(next)->offset < end) if (next && next->ip_defrag_offset < end)
goto discard_fq; goto discard_fq;
NFCT_FRAG6_CB(skb)->offset = offset; /* Note : skb->ip_defrag_offset and skb->dev share the same location */
if (skb->dev)
fq->iif = skb->dev->ifindex;
/* Makes sure compiler wont do silly aliasing games */
barrier();
skb->ip_defrag_offset = offset;
/* Insert this fragment in the chain of fragments. */ /* Insert this fragment in the chain of fragments. */
skb->next = next; skb->next = next;
...@@ -335,10 +307,6 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, ...@@ -335,10 +307,6 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
else else
fq->q.fragments = skb; fq->q.fragments = skb;
if (skb->dev) {
fq->iif = skb->dev->ifindex;
skb->dev = NULL;
}
fq->q.stamp = skb->tstamp; fq->q.stamp = skb->tstamp;
fq->q.meat += skb->len; fq->q.meat += skb->len;
fq->ecn |= ecn; fq->ecn |= ecn;
...@@ -357,7 +325,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, ...@@ -357,7 +325,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
return 0; return 0;
discard_fq: discard_fq:
inet_frag_kill(&fq->q, &nf_frags); inet_frag_kill(&fq->q);
err: err:
return -EINVAL; return -EINVAL;
} }
...@@ -379,10 +347,10 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic ...@@ -379,10 +347,10 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
int payload_len; int payload_len;
u8 ecn; u8 ecn;
inet_frag_kill(&fq->q, &nf_frags); inet_frag_kill(&fq->q);
WARN_ON(head == NULL); WARN_ON(head == NULL);
WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); WARN_ON(head->ip_defrag_offset != 0);
ecn = ip_frag_ecn_table[fq->ecn]; ecn = ip_frag_ecn_table[fq->ecn];
if (unlikely(ecn == 0xff)) if (unlikely(ecn == 0xff))
...@@ -593,8 +561,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) ...@@ -593,8 +561,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
fhdr = (struct frag_hdr *)skb_transport_header(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb);
skb_orphan(skb); skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); skb->dev ? skb->dev->ifindex : 0);
if (fq == NULL) { if (fq == NULL) {
pr_debug("Can't find and can't create new queue\n"); pr_debug("Can't find and can't create new queue\n");
return -ENOMEM; return -ENOMEM;
...@@ -622,25 +590,33 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) ...@@ -622,25 +590,33 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
out_unlock: out_unlock:
spin_unlock_bh(&fq->q.lock); spin_unlock_bh(&fq->q.lock);
inet_frag_put(&fq->q, &nf_frags); inet_frag_put(&fq->q);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
static int nf_ct_net_init(struct net *net) static int nf_ct_net_init(struct net *net)
{ {
int res;
net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
inet_frags_init_net(&net->nf_frag.frags); net->nf_frag.frags.f = &nf_frags;
return nf_ct_frag6_sysctl_register(net); res = inet_frags_init_net(&net->nf_frag.frags);
if (res < 0)
return res;
res = nf_ct_frag6_sysctl_register(net);
if (res < 0)
inet_frags_exit_net(&net->nf_frag.frags);
return res;
} }
static void nf_ct_net_exit(struct net *net) static void nf_ct_net_exit(struct net *net)
{ {
nf_ct_frags6_sysctl_unregister(net); nf_ct_frags6_sysctl_unregister(net);
inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); inet_frags_exit_net(&net->nf_frag.frags);
} }
static struct pernet_operations nf_ct_net_ops = { static struct pernet_operations nf_ct_net_ops = {
...@@ -652,13 +628,12 @@ int nf_ct_frag6_init(void) ...@@ -652,13 +628,12 @@ int nf_ct_frag6_init(void)
{ {
int ret = 0; int ret = 0;
nf_frags.hashfn = nf_hashfn;
nf_frags.constructor = ip6_frag_init; nf_frags.constructor = ip6_frag_init;
nf_frags.destructor = NULL; nf_frags.destructor = NULL;
nf_frags.qsize = sizeof(struct frag_queue); nf_frags.qsize = sizeof(struct frag_queue);
nf_frags.match = ip6_frag_match;
nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frag_expire = nf_ct_frag6_expire;
nf_frags.frags_cache_name = nf_frags_cache_name; nf_frags.frags_cache_name = nf_frags_cache_name;
nf_frags.rhash_params = ip6_rhash_params;
ret = inet_frags_init(&nf_frags); ret = inet_frags_init(&nf_frags);
if (ret) if (ret)
goto out; goto out;
......
...@@ -38,7 +38,6 @@ ...@@ -38,7 +38,6 @@
static int sockstat6_seq_show(struct seq_file *seq, void *v) static int sockstat6_seq_show(struct seq_file *seq, void *v)
{ {
struct net *net = seq->private; struct net *net = seq->private;
unsigned int frag_mem = ip6_frag_mem(net);
seq_printf(seq, "TCP6: inuse %d\n", seq_printf(seq, "TCP6: inuse %d\n",
sock_prot_inuse_get(net, &tcpv6_prot)); sock_prot_inuse_get(net, &tcpv6_prot));
...@@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) ...@@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplitev6_prot)); sock_prot_inuse_get(net, &udplitev6_prot));
seq_printf(seq, "RAW6: inuse %d\n", seq_printf(seq, "RAW6: inuse %d\n",
sock_prot_inuse_get(net, &rawv6_prot)); sock_prot_inuse_get(net, &rawv6_prot));
seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
atomic_read(&net->ipv6.frags.rhashtable.nelems),
frag_mem_limit(&net->ipv6.frags));
return 0; return 0;
} }
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment