Commit 52900d22 authored by Willem de Bruijn, committed by David S. Miller

udp: elide zerocopy operation in hot path

With MSG_ZEROCOPY, each skb holds a reference to a struct ubuf_info.
Release of its last reference triggers a completion notification.

The TCP stack in tcp_sendmsg_locked holds an extra ref independent of
the skbs, because it can build, send and free skbs within its loop,
possibly reaching refcount zero and freeing the ubuf_info too soon.

The UDP stack currently also takes this extra ref, but does not need
it as all skbs are sent after return from __ip(6)_append_data.

Avoid the extra refcount_inc and refcount_dec_and_test, and generally
the sock_zerocopy_put in the common path, by passing the initial
reference to the first skb.

This approach is taken instead of initializing the refcount to 0, as
that would generate error "refcount_t: increment on 0" on the
next skb_zcopy_set.

Changes
  v3 -> v4
    - Move skb_zcopy_set below the only kfree_skb that might cause
      a premature uarg destroy before sock_zerocopy_put_abort
      - Move the entire skb_shinfo assignment block, to keep that
        cacheline access in one place
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent b5947e5d
...@@ -481,7 +481,7 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg) ...@@ -481,7 +481,7 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
} }
void sock_zerocopy_put(struct ubuf_info *uarg); void sock_zerocopy_put(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg); void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
...@@ -1326,9 +1326,13 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) ...@@ -1326,9 +1326,13 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
return is_zcopy ? skb_uarg(skb) : NULL; return is_zcopy ? skb_uarg(skb) : NULL;
} }
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
bool *have_ref)
{ {
if (skb && uarg && !skb_zcopy(skb)) { if (skb && uarg && !skb_zcopy(skb)) {
if (unlikely(have_ref && *have_ref))
*have_ref = false;
else
sock_zerocopy_get(uarg); sock_zerocopy_get(uarg);
skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->destructor_arg = uarg;
skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
...@@ -1374,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb) ...@@ -1374,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb)
struct ubuf_info *uarg = skb_zcopy(skb); struct ubuf_info *uarg = skb_zcopy(skb);
if (uarg) { if (uarg) {
sock_zerocopy_put_abort(uarg); sock_zerocopy_put_abort(uarg, false);
skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
} }
} }
......
...@@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) ...@@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg)
} }
EXPORT_SYMBOL_GPL(sock_zerocopy_put); EXPORT_SYMBOL_GPL(sock_zerocopy_put);
void sock_zerocopy_put_abort(struct ubuf_info *uarg) void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{ {
if (uarg) { if (uarg) {
struct sock *sk = skb_from_uarg(uarg)->sk; struct sock *sk = skb_from_uarg(uarg)->sk;
...@@ -1097,6 +1097,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) ...@@ -1097,6 +1097,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
atomic_dec(&sk->sk_zckey); atomic_dec(&sk->sk_zckey);
uarg->len--; uarg->len--;
if (have_uref)
sock_zerocopy_put(uarg); sock_zerocopy_put(uarg);
} }
} }
...@@ -1137,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, ...@@ -1137,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
return err; return err;
} }
skb_zcopy_set(skb, uarg); skb_zcopy_set(skb, uarg, NULL);
return skb->len - orig_len; return skb->len - orig_len;
} }
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
...@@ -1157,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, ...@@ -1157,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
if (skb_copy_ubufs(nskb, GFP_ATOMIC)) if (skb_copy_ubufs(nskb, GFP_ATOMIC))
return -EIO; return -EIO;
} }
skb_zcopy_set(nskb, skb_uarg(orig)); skb_zcopy_set(nskb, skb_uarg(orig), NULL);
} }
return 0; return 0;
} }
......
...@@ -881,8 +881,8 @@ static int __ip_append_data(struct sock *sk, ...@@ -881,8 +881,8 @@ static int __ip_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE; int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst; struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0; unsigned int wmem_alloc_delta = 0;
bool paged, extra_uref;
u32 tskey = 0; u32 tskey = 0;
bool paged;
skb = skb_peek_tail(queue); skb = skb_peek_tail(queue);
...@@ -921,12 +921,13 @@ static int __ip_append_data(struct sock *sk, ...@@ -921,12 +921,13 @@ static int __ip_append_data(struct sock *sk,
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg) if (!uarg)
return -ENOBUFS; return -ENOBUFS;
extra_uref = true;
if (rt->dst.dev->features & NETIF_F_SG && if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) { csummode == CHECKSUM_PARTIAL) {
paged = true; paged = true;
} else { } else {
uarg->zerocopy = 0; uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg); skb_zcopy_set(skb, uarg, &extra_uref);
} }
} }
...@@ -1015,13 +1016,6 @@ static int __ip_append_data(struct sock *sk, ...@@ -1015,13 +1016,6 @@ static int __ip_append_data(struct sock *sk,
skb->csum = 0; skb->csum = 0;
skb_reserve(skb, hh_len); skb_reserve(skb, hh_len);
/* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
skb_zcopy_set(skb, uarg);
/* /*
* Find where to start putting bytes. * Find where to start putting bytes.
*/ */
...@@ -1054,6 +1048,13 @@ static int __ip_append_data(struct sock *sk, ...@@ -1054,6 +1048,13 @@ static int __ip_append_data(struct sock *sk,
exthdrlen = 0; exthdrlen = 0;
csummode = CHECKSUM_NONE; csummode = CHECKSUM_NONE;
/* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
if ((flags & MSG_CONFIRM) && !skb_prev) if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1); skb_set_dst_pending_confirm(skb, 1);
...@@ -1124,13 +1125,12 @@ static int __ip_append_data(struct sock *sk, ...@@ -1124,13 +1125,12 @@ static int __ip_append_data(struct sock *sk,
if (wmem_alloc_delta) if (wmem_alloc_delta)
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
sock_zerocopy_put(uarg);
return 0; return 0;
error_efault: error_efault:
err = -EFAULT; err = -EFAULT;
error: error:
sock_zerocopy_put_abort(uarg); sock_zerocopy_put_abort(uarg, extra_uref);
cork->length -= length; cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
......
...@@ -1423,7 +1423,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) ...@@ -1423,7 +1423,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
if (copied + copied_syn) if (copied + copied_syn)
goto out; goto out;
out_err: out_err:
sock_zerocopy_put_abort(uarg); sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err); err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */ /* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
......
...@@ -1258,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1258,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE; int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize; unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0; unsigned int wmem_alloc_delta = 0;
bool paged; bool paged, extra_uref;
skb = skb_peek_tail(queue); skb = skb_peek_tail(queue);
if (!skb) { if (!skb) {
...@@ -1327,12 +1327,13 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1327,12 +1327,13 @@ static int __ip6_append_data(struct sock *sk,
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg) if (!uarg)
return -ENOBUFS; return -ENOBUFS;
extra_uref = true;
if (rt->dst.dev->features & NETIF_F_SG && if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) { csummode == CHECKSUM_PARTIAL) {
paged = true; paged = true;
} else { } else {
uarg->zerocopy = 0; uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg); skb_zcopy_set(skb, uarg, &extra_uref);
} }
} }
...@@ -1454,13 +1455,6 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1454,13 +1455,6 @@ static int __ip6_append_data(struct sock *sk,
skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
dst_exthdrlen); dst_exthdrlen);
/* Only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
skb_zcopy_set(skb, uarg);
/* /*
* Find where to start putting bytes * Find where to start putting bytes
*/ */
...@@ -1492,6 +1486,13 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1492,6 +1486,13 @@ static int __ip6_append_data(struct sock *sk,
exthdrlen = 0; exthdrlen = 0;
dst_exthdrlen = 0; dst_exthdrlen = 0;
/* Only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
if ((flags & MSG_CONFIRM) && !skb_prev) if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1); skb_set_dst_pending_confirm(skb, 1);
...@@ -1562,13 +1563,12 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1562,13 +1563,12 @@ static int __ip6_append_data(struct sock *sk,
if (wmem_alloc_delta) if (wmem_alloc_delta)
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
sock_zerocopy_put(uarg);
return 0; return 0;
error_efault: error_efault:
err = -EFAULT; err = -EFAULT;
error: error:
sock_zerocopy_put_abort(uarg); sock_zerocopy_put_abort(uarg, extra_uref);
cork->length -= length; cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment