Commit 1470ddf7, authored by Herbert Xu, committed by David S. Miller

inet: Remove explicit write references to sk/inet in ip_append_data

In order to allow simultaneous calls to ip_append_data on the same
socket, it must not modify any shared state in sk or inet (other
than state that is designed to allow this, such as atomic counters).

This patch abstracts out write references to sk and inet_sk in
ip_append_data and its friends so that we may use the underlying
code in parallel.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 5a2ef920
...@@ -86,6 +86,19 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) ...@@ -86,6 +86,19 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
return (struct inet_request_sock *)sk; return (struct inet_request_sock *)sk;
} }
/*
 * Corking state used while one IP datagram is assembled from several
 * append calls.  struct inet_sock embeds one instance as its "cork"
 * member, and the append/push/flush helpers take a pointer to it.
 */
struct inet_cork {
/* IPCORK_* bits; IPCORK_OPT is set once options were copied into opt */
unsigned int flags;
/* MTU picked at cork setup: device MTU when probing, else the path MTU */
unsigned int fragsize;
/* kmalloc'ed private copy of the caller's IP options, or NULL */
struct ip_options *opt;
/* route whose reference was taken over from the caller at cork setup */
struct dst_entry *dst;
int length; /* Total length of all frames */
/* copied from ipc->addr with the options; handed to ip_options_build() */
__be32 addr;
/* NOTE(review): not referenced by the code visible here - confirm use */
struct flowi fl;
/* page currently being filled with appended data, or NULL */
struct page *page;
/* write offset inside page */
u32 off;
/* seeded from ipc->tx_flags; cleared so only the first fragment is stamped */
u8 tx_flags;
};
struct ip_mc_socklist; struct ip_mc_socklist;
struct ipv6_pinfo; struct ipv6_pinfo;
struct rtable; struct rtable;
...@@ -143,15 +156,7 @@ struct inet_sock { ...@@ -143,15 +156,7 @@ struct inet_sock {
int mc_index; int mc_index;
__be32 mc_addr; __be32 mc_addr;
struct ip_mc_socklist __rcu *mc_list; struct ip_mc_socklist __rcu *mc_list;
struct { struct inet_cork cork;
unsigned int flags;
unsigned int fragsize;
struct ip_options *opt;
struct dst_entry *dst;
int length; /* Total length of all frames */
__be32 addr;
struct flowi fl;
} cork;
}; };
#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
......
...@@ -733,6 +733,7 @@ csum_page(struct page *page, int offset, int copy) ...@@ -733,6 +733,7 @@ csum_page(struct page *page, int offset, int copy)
} }
static inline int ip_ufo_append_data(struct sock *sk, static inline int ip_ufo_append_data(struct sock *sk,
struct sk_buff_head *queue,
int getfrag(void *from, char *to, int offset, int len, int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb), int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen, void *from, int length, int hh_len, int fragheaderlen,
...@@ -745,7 +746,7 @@ static inline int ip_ufo_append_data(struct sock *sk, ...@@ -745,7 +746,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
* device, so create one single skb packet containing complete * device, so create one single skb packet containing complete
* udp datagram * udp datagram
*/ */
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { if ((skb = skb_peek_tail(queue)) == NULL) {
skb = sock_alloc_send_skb(sk, skb = sock_alloc_send_skb(sk,
hh_len + fragheaderlen + transhdrlen + 20, hh_len + fragheaderlen + transhdrlen + 20,
(flags & MSG_DONTWAIT), &err); (flags & MSG_DONTWAIT), &err);
...@@ -771,35 +772,24 @@ static inline int ip_ufo_append_data(struct sock *sk, ...@@ -771,35 +772,24 @@ static inline int ip_ufo_append_data(struct sock *sk,
/* specify the length of each IP datagram fragment */ /* specify the length of each IP datagram fragment */
skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP; skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
__skb_queue_tail(&sk->sk_write_queue, skb); __skb_queue_tail(queue, skb);
} }
return skb_append_datato_frags(sk, skb, getfrag, from, return skb_append_datato_frags(sk, skb, getfrag, from,
(length - transhdrlen)); (length - transhdrlen));
} }
/* static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
* ip_append_data() and ip_append_page() can make one large IP datagram struct inet_cork *cork,
* from many pieces of data. Each pieces will be holded on the socket int getfrag(void *from, char *to, int offset,
* until ip_push_pending_frames() is called. Each piece can be a page int len, int odd, struct sk_buff *skb),
* or non-page data. void *from, int length, int transhdrlen,
* unsigned int flags)
* Not only UDP, other transport protocols - e.g. raw sockets - can use
* this interface potentially.
*
* LATER: length must be adjusted by pad at tail, when it is required.
*/
int ip_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags)
{ {
struct inet_sock *inet = inet_sk(sk); struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
struct ip_options *opt = NULL; struct ip_options *opt = inet->cork.opt;
int hh_len; int hh_len;
int exthdrlen; int exthdrlen;
int mtu; int mtu;
...@@ -808,58 +798,19 @@ int ip_append_data(struct sock *sk, ...@@ -808,58 +798,19 @@ int ip_append_data(struct sock *sk,
int offset = 0; int offset = 0;
unsigned int maxfraglen, fragheaderlen; unsigned int maxfraglen, fragheaderlen;
int csummode = CHECKSUM_NONE; int csummode = CHECKSUM_NONE;
struct rtable *rt; struct rtable *rt = (struct rtable *)cork->dst;
if (flags&MSG_PROBE)
return 0;
if (skb_queue_empty(&sk->sk_write_queue)) { exthdrlen = transhdrlen ? rt->dst.header_len : 0;
/* length += exthdrlen;
* setup for corking. transhdrlen += exthdrlen;
*/ mtu = inet->cork.fragsize;
opt = ipc->opt;
if (opt) {
if (inet->cork.opt == NULL) {
inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
if (unlikely(inet->cork.opt == NULL))
return -ENOBUFS;
}
memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
inet->cork.flags |= IPCORK_OPT;
inet->cork.addr = ipc->addr;
}
rt = *rtp;
if (unlikely(!rt))
return -EFAULT;
/*
* We steal reference to this route, caller should not release it
*/
*rtp = NULL;
inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
rt->dst.dev->mtu :
dst_mtu(rt->dst.path);
inet->cork.dst = &rt->dst;
inet->cork.length = 0;
sk->sk_sndmsg_page = NULL;
sk->sk_sndmsg_off = 0;
exthdrlen = rt->dst.header_len;
length += exthdrlen;
transhdrlen += exthdrlen;
} else {
rt = (struct rtable *)inet->cork.dst;
if (inet->cork.flags & IPCORK_OPT)
opt = inet->cork.opt;
transhdrlen = 0;
exthdrlen = 0;
mtu = inet->cork.fragsize;
}
hh_len = LL_RESERVED_SPACE(rt->dst.dev); hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (inet->cork.length + length > 0xFFFF - fragheaderlen) { if (cork->length + length > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
mtu-exthdrlen); mtu-exthdrlen);
return -EMSGSIZE; return -EMSGSIZE;
...@@ -875,15 +826,15 @@ int ip_append_data(struct sock *sk, ...@@ -875,15 +826,15 @@ int ip_append_data(struct sock *sk,
!exthdrlen) !exthdrlen)
csummode = CHECKSUM_PARTIAL; csummode = CHECKSUM_PARTIAL;
skb = skb_peek_tail(&sk->sk_write_queue); skb = skb_peek_tail(queue);
inet->cork.length += length; cork->length += length;
if (((length > mtu) || (skb && skb_is_gso(skb))) && if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) && (sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO)) { (rt->dst.dev->features & NETIF_F_UFO)) {
err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, err = ip_ufo_append_data(sk, queue, getfrag, from, length,
fragheaderlen, transhdrlen, mtu, hh_len, fragheaderlen, transhdrlen,
flags); mtu, flags);
if (err) if (err)
goto error; goto error;
return 0; return 0;
...@@ -960,7 +911,7 @@ int ip_append_data(struct sock *sk, ...@@ -960,7 +911,7 @@ int ip_append_data(struct sock *sk,
else else
/* only the initial fragment is /* only the initial fragment is
time stamped */ time stamped */
ipc->tx_flags = 0; cork->tx_flags = 0;
} }
if (skb == NULL) if (skb == NULL)
goto error; goto error;
...@@ -971,7 +922,7 @@ int ip_append_data(struct sock *sk, ...@@ -971,7 +922,7 @@ int ip_append_data(struct sock *sk,
skb->ip_summed = csummode; skb->ip_summed = csummode;
skb->csum = 0; skb->csum = 0;
skb_reserve(skb, hh_len); skb_reserve(skb, hh_len);
skb_shinfo(skb)->tx_flags = ipc->tx_flags; skb_shinfo(skb)->tx_flags = cork->tx_flags;
/* /*
* Find where to start putting bytes. * Find where to start putting bytes.
...@@ -1008,7 +959,7 @@ int ip_append_data(struct sock *sk, ...@@ -1008,7 +959,7 @@ int ip_append_data(struct sock *sk,
/* /*
* Put the packet on the pending queue. * Put the packet on the pending queue.
*/ */
__skb_queue_tail(&sk->sk_write_queue, skb); __skb_queue_tail(queue, skb);
continue; continue;
} }
...@@ -1028,8 +979,8 @@ int ip_append_data(struct sock *sk, ...@@ -1028,8 +979,8 @@ int ip_append_data(struct sock *sk,
} else { } else {
int i = skb_shinfo(skb)->nr_frags; int i = skb_shinfo(skb)->nr_frags;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
struct page *page = sk->sk_sndmsg_page; struct page *page = cork->page;
int off = sk->sk_sndmsg_off; int off = cork->off;
unsigned int left; unsigned int left;
if (page && (left = PAGE_SIZE - off) > 0) { if (page && (left = PAGE_SIZE - off) > 0) {
...@@ -1041,7 +992,7 @@ int ip_append_data(struct sock *sk, ...@@ -1041,7 +992,7 @@ int ip_append_data(struct sock *sk,
goto error; goto error;
} }
get_page(page); get_page(page);
skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); skb_fill_page_desc(skb, i, page, off, 0);
frag = &skb_shinfo(skb)->frags[i]; frag = &skb_shinfo(skb)->frags[i];
} }
} else if (i < MAX_SKB_FRAGS) { } else if (i < MAX_SKB_FRAGS) {
...@@ -1052,8 +1003,8 @@ int ip_append_data(struct sock *sk, ...@@ -1052,8 +1003,8 @@ int ip_append_data(struct sock *sk,
err = -ENOMEM; err = -ENOMEM;
goto error; goto error;
} }
sk->sk_sndmsg_page = page; cork->page = page;
sk->sk_sndmsg_off = 0; cork->off = 0;
skb_fill_page_desc(skb, i, page, 0, 0); skb_fill_page_desc(skb, i, page, 0, 0);
frag = &skb_shinfo(skb)->frags[i]; frag = &skb_shinfo(skb)->frags[i];
...@@ -1065,7 +1016,7 @@ int ip_append_data(struct sock *sk, ...@@ -1065,7 +1016,7 @@ int ip_append_data(struct sock *sk,
err = -EFAULT; err = -EFAULT;
goto error; goto error;
} }
sk->sk_sndmsg_off += copy; cork->off += copy;
frag->size += copy; frag->size += copy;
skb->len += copy; skb->len += copy;
skb->data_len += copy; skb->data_len += copy;
...@@ -1079,11 +1030,87 @@ int ip_append_data(struct sock *sk, ...@@ -1079,11 +1030,87 @@ int ip_append_data(struct sock *sk,
return 0; return 0;
error: error:
inet->cork.length -= length; cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
return err; return err;
} }
/*
 * ip_setup_cork - initialise corking state for a new datagram.
 *
 * sk:   socket; supplies the allocation mode and the pmtudisc setting
 * cork: corking state to fill in
 * ipc:  per-call cookie supplying IP options, option address and tx_flags
 * rtp:  in/out route pointer; on success the reference is moved into
 *       cork->dst and *rtp is set to NULL
 *
 * Returns 0 on success, -EFAULT when no route was supplied, or -ENOBUFS
 * when the option buffer cannot be allocated.  On either failure the cork
 * and *rtp are left untouched.
 */
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt;
	struct rtable *rt;

	/*
	 * Validate the route before touching the cork, so that a failed
	 * setup does not leave freshly copied options and IPCORK_OPT behind.
	 */
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			/*
			 * The 40 extra bytes hold the option data copied
			 * below (presumably the largest possible IPv4 option
			 * area - confirm).
			 */
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}

	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;

	/* When probing the path MTU use the device MTU, else the route's. */
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;

	/* No partially filled data page yet. */
	cork->page = NULL;
	cork->off = 0;

	return 0;
}
/*
* ip_append_data() and ip_append_page() can make one large IP datagram
* from many pieces of data. Each pieces will be holded on the socket
* until ip_push_pending_frames() is called. Each piece can be a page
* or non-page data.
*
* Not only UDP, other transport protocols - e.g. raw sockets - can use
* this interface potentially.
*
* LATER: length must be adjusted by pad at tail, when it is required.
*/
int ip_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
int err;
if (flags&MSG_PROBE)
return 0;
if (skb_queue_empty(&sk->sk_write_queue)) {
err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
if (err)
return err;
} else {
transhdrlen = 0;
}
return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
from, length, transhdrlen, flags);
}
ssize_t ip_append_page(struct sock *sk, struct page *page, ssize_t ip_append_page(struct sock *sk, struct page *page,
int offset, size_t size, int flags) int offset, size_t size, int flags)
{ {
...@@ -1227,40 +1254,42 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, ...@@ -1227,40 +1254,42 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
return err; return err;
} }
static void ip_cork_release(struct inet_sock *inet) static void ip_cork_release(struct inet_cork *cork)
{ {
inet->cork.flags &= ~IPCORK_OPT; cork->flags &= ~IPCORK_OPT;
kfree(inet->cork.opt); kfree(cork->opt);
inet->cork.opt = NULL; cork->opt = NULL;
dst_release(inet->cork.dst); dst_release(cork->dst);
inet->cork.dst = NULL; cork->dst = NULL;
} }
/* /*
* Combined all pending IP fragments on the socket as one IP datagram * Combined all pending IP fragments on the socket as one IP datagram
* and push them out. * and push them out.
*/ */
int ip_push_pending_frames(struct sock *sk) static int __ip_push_pending_frames(struct sock *sk,
struct sk_buff_head *queue,
struct inet_cork *cork)
{ {
struct sk_buff *skb, *tmp_skb; struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb; struct sk_buff **tail_skb;
struct inet_sock *inet = inet_sk(sk); struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
struct ip_options *opt = NULL; struct ip_options *opt = NULL;
struct rtable *rt = (struct rtable *)inet->cork.dst; struct rtable *rt = (struct rtable *)cork->dst;
struct iphdr *iph; struct iphdr *iph;
__be16 df = 0; __be16 df = 0;
__u8 ttl; __u8 ttl;
int err = 0; int err = 0;
if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) if ((skb = __skb_dequeue(queue)) == NULL)
goto out; goto out;
tail_skb = &(skb_shinfo(skb)->frag_list); tail_skb = &(skb_shinfo(skb)->frag_list);
/* move skb->data to ip header from ext header */ /* move skb->data to ip header from ext header */
if (skb->data < skb_network_header(skb)) if (skb->data < skb_network_header(skb))
__skb_pull(skb, skb_network_offset(skb)); __skb_pull(skb, skb_network_offset(skb));
while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
__skb_pull(tmp_skb, skb_network_header_len(skb)); __skb_pull(tmp_skb, skb_network_header_len(skb));
*tail_skb = tmp_skb; *tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next); tail_skb = &(tmp_skb->next);
...@@ -1286,8 +1315,8 @@ int ip_push_pending_frames(struct sock *sk) ...@@ -1286,8 +1315,8 @@ int ip_push_pending_frames(struct sock *sk)
ip_dont_fragment(sk, &rt->dst))) ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF); df = htons(IP_DF);
if (inet->cork.flags & IPCORK_OPT) if (cork->flags & IPCORK_OPT)
opt = inet->cork.opt; opt = cork->opt;
if (rt->rt_type == RTN_MULTICAST) if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl; ttl = inet->mc_ttl;
...@@ -1299,7 +1328,7 @@ int ip_push_pending_frames(struct sock *sk) ...@@ -1299,7 +1328,7 @@ int ip_push_pending_frames(struct sock *sk)
iph->ihl = 5; iph->ihl = 5;
if (opt) { if (opt) {
iph->ihl += opt->optlen>>2; iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt, inet->cork.addr, rt, 0); ip_options_build(skb, opt, cork->addr, rt, 0);
} }
iph->tos = inet->tos; iph->tos = inet->tos;
iph->frag_off = df; iph->frag_off = df;
...@@ -1315,7 +1344,7 @@ int ip_push_pending_frames(struct sock *sk) ...@@ -1315,7 +1344,7 @@ int ip_push_pending_frames(struct sock *sk)
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
* on dst refcount * on dst refcount
*/ */
inet->cork.dst = NULL; cork->dst = NULL;
skb_dst_set(skb, &rt->dst); skb_dst_set(skb, &rt->dst);
if (iph->protocol == IPPROTO_ICMP) if (iph->protocol == IPPROTO_ICMP)
...@@ -1332,7 +1361,7 @@ int ip_push_pending_frames(struct sock *sk) ...@@ -1332,7 +1361,7 @@ int ip_push_pending_frames(struct sock *sk)
} }
out: out:
ip_cork_release(inet); ip_cork_release(cork);
return err; return err;
error: error:
...@@ -1340,17 +1369,30 @@ int ip_push_pending_frames(struct sock *sk) ...@@ -1340,17 +1369,30 @@ int ip_push_pending_frames(struct sock *sk)
goto out; goto out;
} }
/*
 *	Push out whatever is pending on the socket itself: its own write
 *	queue, using the cork embedded in its inet_sock.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff_head *queue = &sk->sk_write_queue;
	struct inet_cork *cork = &inet_sk(sk)->cork;

	return __ip_push_pending_frames(sk, queue, cork);
}
/* /*
* Throw away all pending data on the socket. * Throw away all pending data on the socket.
*/ */
void ip_flush_pending_frames(struct sock *sk) static void __ip_flush_pending_frames(struct sock *sk,
struct sk_buff_head *queue,
struct inet_cork *cork)
{ {
struct sk_buff *skb; struct sk_buff *skb;
while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) while ((skb = __skb_dequeue_tail(queue)) != NULL)
kfree_skb(skb); kfree_skb(skb);
ip_cork_release(inet_sk(sk)); ip_cork_release(cork);
}
void ip_flush_pending_frames(struct sock *sk)
{
__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment