Commit c30f1fc0 authored by David S. Miller

Merge branch 'ip-Use-rb-trees-for-IP-frag-queue'

Peter Oskolkov says:

====================
ip: Use rb trees for IP frag queue.

This patchset
 * changes IPv4 defrag behavior to match that of IPv6: overlapping
   fragments now cause the whole IP datagram to be discarded (suggested
   by David Miller): there are no legitimate use cases for overlapping
   fragments;
 * changes IPv4 defrag queue from a list to a rb tree (suggested
   by Eric Dumazet): this change removes a potential attack vector.

Upcoming patches will contain similar changes for IPv6 frag queue,
as well as a comprehensive IP defrag self-test (temporarily delayed).
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents cfb4099f fa0f5273
......@@ -676,13 +676,16 @@ struct sk_buff {
* UDP receive path is one user.
*/
unsigned long dev_scratch;
int ip_defrag_offset;
};
};
struct rb_node rbnode; /* used in netem & tcp stack */
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list;
};
union {
struct sock *sk;
int ip_defrag_offset;
};
union {
ktime_t tstamp;
......@@ -2585,7 +2588,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
kfree_skb(skb);
}
void skb_rbtree_purge(struct rb_root *root);
unsigned int skb_rbtree_purge(struct rb_root *root);
void *netdev_alloc_frag(unsigned int fragsz);
......
......@@ -75,7 +75,8 @@ struct inet_frag_queue {
struct timer_list timer;
spinlock_t lock;
refcount_t refcnt;
struct sk_buff *fragments;
struct sk_buff *fragments; /* Used in IPv6. */
struct rb_root rb_fragments; /* Used in IPv4. */
struct sk_buff *fragments_tail;
ktime_t stamp;
int len;
......
......@@ -56,6 +56,7 @@ enum
IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */
IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */
IPSTATS_MIB_CEPKTS, /* InCEPkts */
IPSTATS_MIB_REASM_OVERLAPS, /* ReasmOverlaps */
__IPSTATS_MIB_MAX
};
......
......@@ -2858,23 +2858,27 @@ EXPORT_SYMBOL(skb_queue_purge);
/**
* skb_rbtree_purge - empty a skb rbtree
* @root: root of the rbtree to empty
* Return value: the sum of truesizes of all purged skbs.
*
* Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
* the tree and one reference dropped. This function does not take
* any lock. Synchronization should be handled by the caller (e.g., TCP
* out-of-order queue is protected by the socket lock).
*/
void skb_rbtree_purge(struct rb_root *root)
unsigned int skb_rbtree_purge(struct rb_root *root)
{
struct rb_node *p = rb_first(root);
unsigned int sum = 0;
while (p) {
struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
/* Step to the successor first; the current node is erased below. */
p = rb_next(p);
rb_erase(&skb->rbnode, root);
/* Account truesize before the skb is handed to kfree_skb(). */
sum += skb->truesize;
kfree_skb(skb);
}
return sum;
}
/**
......
......@@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
fp = q->fragments;
nf = q->net;
f = nf->f;
while (fp) {
if (fp) {
do {
struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize;
kfree_skb(fp);
fp = xp;
} while (fp);
} else {
sum_truesize = skb_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
......
This diff is collapsed.
......@@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
SNMP_MIB_SENTINEL
};
......
......@@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
head->csum);
fq->q.fragments = NULL;
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return true;
......
......@@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock();
fq->q.fragments = NULL;
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return 1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment