Commit 68426377, authored by Hideaki Yoshifuji, committed by David S. Miller

[IPV4/IPV6]: Fix fragment creation.

We have two problems in ipv4 and ipv6 with respect to
fragmentation on output, both of which result
in fragmentation when it is really not needed.

When fragmenting, both ipv4 and ipv6 need to make
the post IP-header portion of the packet have a
length that is a multiple of 8.  This means that if
the PMTU is not a multiple of 8 after the IP header
size is subtracted, we will fragment full sized
frames just for the leftover (modulo 8) bytes.

Furthermore, IPV6 subtracts out space for the
fragmentation header it must add, making unnecessary
fragmentation even more likely.

We still need to handle ip*_append_page() and that
will happen in a followup fix.
Signed-off-by: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 6a66a40a
......@@ -734,10 +734,10 @@ int ip_append_data(struct sock *sk,
int hh_len;
int exthdrlen;
int mtu;
int copy;
int copy = 0;
int err;
int offset = 0;
unsigned int maxfraglen, fragheaderlen;
unsigned int maxfraglen, fragheaderlen, fraggap = 0;
int csummode = CHECKSUM_NONE;
if (flags&MSG_PROBE)
......@@ -780,13 +780,27 @@ int ip_append_data(struct sock *sk,
hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;
if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
return -EMSGSIZE;
}
/*
* Let's try using as much space as possible to avoid generating
* additional unnecessary small fragment of length
* (mtu-fragheaderlen)%8 if mtu-fragheaderlen is not 0 modulo 8.
* -- yoshfuji
*/
if (fragheaderlen + inet->cork.length + length <= mtu)
maxfraglen = mtu;
else
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (fragheaderlen + inet->cork.length <= mtu &&
fragheaderlen + inet->cork.length + length > mtu)
fraggap = 1;
/*
* transhdrlen > 0 means that this is the first fragment and we wish
* it won't be fragmented in the future.
......@@ -804,16 +818,12 @@ int ip_append_data(struct sock *sk,
* We use calculated fragment length to generate chained skb,
* each of segments is IP fragment ready for sending to network after
* adding appropriate IP header.
*
* Mistake is:
*
* If mtu-fragheaderlen is not 0 modulo 8, we generate additional
* small fragment of length (mtu-fragheaderlen)%8, even though
* it is not necessary. Not a big bug, but needs a fix.
*/
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
fraggap = 0;
goto alloc_new_skb;
}
while (length > 0) {
if ((copy = maxfraglen - skb->len) <= 0) {
......@@ -821,12 +831,18 @@ int ip_append_data(struct sock *sk,
unsigned int datalen;
unsigned int fraglen;
unsigned int alloclen;
BUG_TRAP(copy == 0);
struct sk_buff *skb_prev;
BUG_TRAP(fraggap || copy == 0);
alloc_new_skb:
skb_prev = skb;
if (fraggap)
fraggap = -copy;
datalen = maxfraglen - fragheaderlen;
if (datalen > length)
datalen = length;
if (datalen > length + fraggap)
datalen = length + fraggap;
fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) &&
......@@ -875,7 +891,14 @@ int ip_append_data(struct sock *sk,
data += fragheaderlen;
skb->h.raw = data + exthdrlen;
copy = datalen - transhdrlen;
if (fraggap) {
skb_copy_bits(skb_prev, maxfraglen,
data + transhdrlen, fraggap);
data += fraggap;
skb_trim(skb_prev, maxfraglen);
}
copy = datalen - transhdrlen - fraggap;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
......@@ -883,9 +906,10 @@ int ip_append_data(struct sock *sk,
}
offset += copy;
length -= datalen;
length -= datalen - fraggap;
transhdrlen = 0;
exthdrlen = 0;
fraggap = 0;
csummode = CHECKSUM_NONE;
/*
......
......@@ -813,7 +813,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
struct inet_opt *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
unsigned int maxfraglen, fragheaderlen;
unsigned int maxfraglen, fragheaderlen, fraggap = 0;
int exthdrlen;
int hh_len;
int mtu;
......@@ -866,7 +866,6 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
......@@ -875,10 +874,37 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
}
}
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
* Otherwise, we need to reserve fragment header and
* fragment alignment (= 8-15 octets, in total).
*
* Note that we may need to "move" the data from the tail
* of the buffer to the new fragment when we split
* the message for the first time.
*
* FIXME: It may be fragmented into multiple chunks
* at once if non-fragmentable extension headers
* are too large.
* --yoshfuji
*/
if (fragheaderlen + inet->cork.length + length <= mtu)
maxfraglen = mtu;
else
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen
- sizeof(struct frag_hdr);
if (fragheaderlen + inet->cork.length <= mtu &&
fragheaderlen + inet->cork.length + length > mtu)
fraggap = 1;
inet->cork.length += length;
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
fraggap = 0;
goto alloc_new_skb;
}
while (length > 0) {
if ((copy = maxfraglen - skb->len) <= 0) {
......@@ -886,18 +912,42 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
unsigned int datalen;
unsigned int fraglen;
unsigned int alloclen;
BUG_TRAP(copy == 0);
struct sk_buff *skb_prev;
BUG_TRAP(fraggap || copy == 0);
alloc_new_skb:
skb_prev = skb;
/* There's no room in the current skb */
if (fraggap)
fraggap = -copy;
datalen = maxfraglen - fragheaderlen;
if (datalen > length)
datalen = length;
if (datalen > length + fraggap)
datalen = length + fraggap;
fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) &&
!(rt->u.dst.dev->features&NETIF_F_SG))
alloclen = maxfraglen;
else
alloclen = fraglen;
alloclen = datalen + fragheaderlen;
/*
* The last fragment gets additional space at tail.
* Note: we overallocate on fragments with MSG_MORE
* because we have no idea if we're the last one.
*/
if (datalen == length + fraggap)
alloclen += rt->u.dst.trailer_len;
/*
* We just reserve space for fragment header.
* Note: this may be overallocation if the message
* (without MSG_MORE) fits into the MTU.
*/
alloclen += sizeof(struct frag_hdr);
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len,
......@@ -919,7 +969,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
*/
skb->ip_summed = csummode;
skb->csum = 0;
/* reserve 8 byte for fragmentation */
/* reserve for fragmentation */
skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
/*
......@@ -929,17 +979,29 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
skb->nh.raw = data + exthdrlen;
data += fragheaderlen;
skb->h.raw = data + exthdrlen;
copy = datalen - transhdrlen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
if (fraggap) {
skb_copy_bits(skb_prev, maxfraglen,
data + transhdrlen, fraggap);
data += fraggap;
skb_trim(skb_prev, maxfraglen);
}
copy = datalen - transhdrlen - fraggap;
if (copy < 0) {
err = -EINVAL;
kfree_skb(skb);
goto error;
} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
}
offset += copy;
length -= datalen;
length -= datalen - fraggap;
transhdrlen = 0;
exthdrlen = 0;
fraggap = 0;
csummode = CHECKSUM_NONE;
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment