Commit 850cbadd authored by Paolo Abeni's avatar Paolo Abeni Committed by David S. Miller

udp: use it's own memory accounting schema

Completely avoid default sock memory accounting and replace it
with udp-specific accounting.

Since the new memory accounting model encapsulates completely
the required locking, remove the socket lock on both enqueue and
dequeue, and avoid using the backlog on enqueue.

Be sure to clean-up rx queue memory on socket destruction, using
udp its own sk_destruct.

Tested using pktgen with random src port, 64 bytes packet,
wire-speed on a 10G link as sender and udp_sink as the receiver,
using an l4 tuple rxhash to stress the contention, and one or more
udp_sink instances with reuseport.

nr readers      Kpps (vanilla)  Kpps (patched)
1               170             440
3               1250            2150
6               3000            3650
9               4200            4450
12              5700            6250

v4 -> v5:
  - avoid unneeded test in first_packet_length

v3 -> v4:
  - remove useless sk_rcvqueues_full() call

v2 -> v3:
  - do not set the now unsed backlog_rcv callback

v1 -> v2:
  - add memory pressure support
  - fixed dropwatch accounting for ipv6
Acked-by: default avatarHannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
Acked-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent f970bd9e
...@@ -1307,13 +1307,7 @@ static int first_packet_length(struct sock *sk) ...@@ -1307,13 +1307,7 @@ static int first_packet_length(struct sock *sk)
res = skb ? skb->len : -1; res = skb ? skb->len : -1;
spin_unlock_bh(&rcvq->lock); spin_unlock_bh(&rcvq->lock);
if (!skb_queue_empty(&list_kill)) {
bool slow = lock_sock_fast(sk);
__skb_queue_purge(&list_kill); __skb_queue_purge(&list_kill);
sk_mem_reclaim_partial(sk);
unlock_sock_fast(sk, slow);
}
return res; return res;
} }
...@@ -1362,7 +1356,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, ...@@ -1362,7 +1356,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int err; int err;
int is_udplite = IS_UDPLITE(sk); int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false; bool checksum_valid = false;
bool slow;
if (flags & MSG_ERRQUEUE) if (flags & MSG_ERRQUEUE)
return ip_recv_error(sk, msg, len, addr_len); return ip_recv_error(sk, msg, len, addr_len);
...@@ -1403,13 +1396,12 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, ...@@ -1403,13 +1396,12 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
} }
if (unlikely(err)) { if (unlikely(err)) {
trace_kfree_skb(skb, udp_recvmsg);
if (!peeked) { if (!peeked) {
atomic_inc(&sk->sk_drops); atomic_inc(&sk->sk_drops);
UDP_INC_STATS(sock_net(sk), UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite); UDP_MIB_INERRORS, is_udplite);
} }
skb_free_datagram_locked(sk, skb); kfree_skb(skb);
return err; return err;
} }
...@@ -1434,16 +1426,15 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, ...@@ -1434,16 +1426,15 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
if (flags & MSG_TRUNC) if (flags & MSG_TRUNC)
err = ulen; err = ulen;
__skb_free_datagram_locked(sk, skb, peeking ? -err : err); skb_consume_udp(sk, skb, peeking ? -err : err);
return err; return err;
csum_copy_err: csum_copy_err:
slow = lock_sock_fast(sk); if (!__sk_queue_drop_skb(sk, skb, flags)) {
if (!skb_kill_datagram(sk, skb, flags)) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
} }
unlock_sock_fast(sk, slow); kfree_skb(skb);
/* starting over for a new packet, but check if we need to yield */ /* starting over for a new packet, but check if we need to yield */
cond_resched(); cond_resched();
...@@ -1562,7 +1553,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1562,7 +1553,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_incoming_cpu_update(sk); sk_incoming_cpu_update(sk);
} }
rc = __sock_queue_rcv_skb(sk, skb); rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) { if (rc < 0) {
int is_udplite = IS_UDPLITE(sk); int is_udplite = IS_UDPLITE(sk);
...@@ -1577,7 +1568,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1577,7 +1568,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
} }
return 0; return 0;
} }
static struct static_key udp_encap_needed __read_mostly; static struct static_key udp_encap_needed __read_mostly;
...@@ -1599,7 +1589,6 @@ EXPORT_SYMBOL(udp_encap_enable); ...@@ -1599,7 +1589,6 @@ EXPORT_SYMBOL(udp_encap_enable);
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{ {
struct udp_sock *up = udp_sk(sk); struct udp_sock *up = udp_sk(sk);
int rc;
int is_udplite = IS_UDPLITE(sk); int is_udplite = IS_UDPLITE(sk);
/* /*
...@@ -1686,25 +1675,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1686,25 +1675,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop; goto drop;
udp_csum_pull_header(skb); udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
__UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
goto drop;
}
rc = 0;
ipv4_pktinfo_prepare(sk, skb); ipv4_pktinfo_prepare(sk, skb);
bh_lock_sock(sk); return __udp_queue_rcv_skb(sk, skb);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
bh_unlock_sock(sk);
goto drop;
}
bh_unlock_sock(sk);
return rc;
csum_error: csum_error:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
...@@ -2314,13 +2287,13 @@ struct proto udp_prot = { ...@@ -2314,13 +2287,13 @@ struct proto udp_prot = {
.connect = ip4_datagram_connect, .connect = ip4_datagram_connect,
.disconnect = udp_disconnect, .disconnect = udp_disconnect,
.ioctl = udp_ioctl, .ioctl = udp_ioctl,
.init = udp_init_sock,
.destroy = udp_destroy_sock, .destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt, .setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt, .getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg, .sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg, .recvmsg = udp_recvmsg,
.sendpage = udp_sendpage, .sendpage = udp_sendpage,
.backlog_rcv = __udp_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb, .release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash, .hash = udp_lib_hash,
.unhash = udp_lib_unhash, .unhash = udp_lib_unhash,
......
...@@ -334,7 +334,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -334,7 +334,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int is_udplite = IS_UDPLITE(sk); int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false; bool checksum_valid = false;
int is_udp4; int is_udp4;
bool slow;
if (flags & MSG_ERRQUEUE) if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len); return ipv6_recv_error(sk, msg, len, addr_len);
...@@ -378,7 +377,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -378,7 +377,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto csum_copy_err; goto csum_copy_err;
} }
if (unlikely(err)) { if (unlikely(err)) {
trace_kfree_skb(skb, udpv6_recvmsg);
if (!peeked) { if (!peeked) {
atomic_inc(&sk->sk_drops); atomic_inc(&sk->sk_drops);
if (is_udp4) if (is_udp4)
...@@ -388,7 +386,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -388,7 +386,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
is_udplite); is_udplite);
} }
skb_free_datagram_locked(sk, skb); kfree_skb(skb);
return err; return err;
} }
if (!peeked) { if (!peeked) {
...@@ -437,12 +435,11 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -437,12 +435,11 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (flags & MSG_TRUNC) if (flags & MSG_TRUNC)
err = ulen; err = ulen;
__skb_free_datagram_locked(sk, skb, peeking ? -err : err); skb_consume_udp(sk, skb, peeking ? -err : err);
return err; return err;
csum_copy_err: csum_copy_err:
slow = lock_sock_fast(sk); if (!__sk_queue_drop_skb(sk, skb, flags)) {
if (!skb_kill_datagram(sk, skb, flags)) {
if (is_udp4) { if (is_udp4) {
UDP_INC_STATS(sock_net(sk), UDP_INC_STATS(sock_net(sk),
UDP_MIB_CSUMERRORS, is_udplite); UDP_MIB_CSUMERRORS, is_udplite);
...@@ -455,7 +452,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -455,7 +452,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
UDP_MIB_INERRORS, is_udplite); UDP_MIB_INERRORS, is_udplite);
} }
} }
unlock_sock_fast(sk, slow); kfree_skb(skb);
/* starting over for a new packet, but check if we need to yield */ /* starting over for a new packet, but check if we need to yield */
cond_resched(); cond_resched();
...@@ -523,7 +520,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ...@@ -523,7 +520,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_incoming_cpu_update(sk); sk_incoming_cpu_update(sk);
} }
rc = __sock_queue_rcv_skb(sk, skb); rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) { if (rc < 0) {
int is_udplite = IS_UDPLITE(sk); int is_udplite = IS_UDPLITE(sk);
...@@ -535,6 +532,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ...@@ -535,6 +532,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
kfree_skb(skb); kfree_skb(skb);
return -1; return -1;
} }
return 0; return 0;
} }
...@@ -556,7 +554,6 @@ EXPORT_SYMBOL(udpv6_encap_enable); ...@@ -556,7 +554,6 @@ EXPORT_SYMBOL(udpv6_encap_enable);
int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{ {
struct udp_sock *up = udp_sk(sk); struct udp_sock *up = udp_sk(sk);
int rc;
int is_udplite = IS_UDPLITE(sk); int is_udplite = IS_UDPLITE(sk);
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
...@@ -622,25 +619,10 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ...@@ -622,25 +619,10 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop; goto drop;
udp_csum_pull_header(skb); udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
__UDP6_INC_STATS(sock_net(sk),
UDP_MIB_RCVBUFERRORS, is_udplite);
goto drop;
}
skb_dst_drop(skb); skb_dst_drop(skb);
bh_lock_sock(sk); return __udpv6_queue_rcv_skb(sk, skb);
rc = 0;
if (!sock_owned_by_user(sk))
rc = __udpv6_queue_rcv_skb(sk, skb);
else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
bh_unlock_sock(sk);
goto drop;
}
bh_unlock_sock(sk);
return rc;
csum_error: csum_error:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
...@@ -1433,12 +1415,12 @@ struct proto udpv6_prot = { ...@@ -1433,12 +1415,12 @@ struct proto udpv6_prot = {
.connect = ip6_datagram_connect, .connect = ip6_datagram_connect,
.disconnect = udp_disconnect, .disconnect = udp_disconnect,
.ioctl = udp_ioctl, .ioctl = udp_ioctl,
.init = udp_init_sock,
.destroy = udpv6_destroy_sock, .destroy = udpv6_destroy_sock,
.setsockopt = udpv6_setsockopt, .setsockopt = udpv6_setsockopt,
.getsockopt = udpv6_getsockopt, .getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg, .sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg, .recvmsg = udpv6_recvmsg,
.backlog_rcv = __udpv6_queue_rcv_skb,
.release_cb = ip6_datagram_release_cb, .release_cb = ip6_datagram_release_cb,
.hash = udp_lib_hash, .hash = udp_lib_hash,
.unhash = udp_lib_unhash, .unhash = udp_lib_unhash,
......
...@@ -39,6 +39,7 @@ ...@@ -39,6 +39,7 @@
#include <net/checksum.h> #include <net/checksum.h>
#include <net/ip.h> #include <net/ip.h>
#include <net/ipv6.h> #include <net/ipv6.h>
#include <net/udp.h>
#include <net/tcp.h> #include <net/tcp.h>
#include <net/tcp_states.h> #include <net/tcp_states.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
...@@ -129,6 +130,18 @@ static void svc_release_skb(struct svc_rqst *rqstp) ...@@ -129,6 +130,18 @@ static void svc_release_skb(struct svc_rqst *rqstp)
} }
} }
static void svc_release_udp_skb(struct svc_rqst *rqstp)
{
struct sk_buff *skb = rqstp->rq_xprt_ctxt;
if (skb) {
rqstp->rq_xprt_ctxt = NULL;
dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
consume_skb(skb);
}
}
union svc_pktinfo_u { union svc_pktinfo_u {
struct in_pktinfo pkti; struct in_pktinfo pkti;
struct in6_pktinfo pkti6; struct in6_pktinfo pkti6;
...@@ -575,7 +588,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) ...@@ -575,7 +588,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
goto out_free; goto out_free;
} }
local_bh_enable(); local_bh_enable();
skb_free_datagram_locked(svsk->sk_sk, skb); consume_skb(skb);
} else { } else {
/* we can use it in-place */ /* we can use it in-place */
rqstp->rq_arg.head[0].iov_base = skb->data; rqstp->rq_arg.head[0].iov_base = skb->data;
...@@ -602,8 +615,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) ...@@ -602,8 +615,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
return len; return len;
out_free: out_free:
trace_kfree_skb(skb, svc_udp_recvfrom); kfree_skb(skb);
skb_free_datagram_locked(svsk->sk_sk, skb);
return 0; return 0;
} }
...@@ -660,7 +672,7 @@ static struct svc_xprt_ops svc_udp_ops = { ...@@ -660,7 +672,7 @@ static struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create, .xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom, .xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto, .xpo_sendto = svc_udp_sendto,
.xpo_release_rqst = svc_release_skb, .xpo_release_rqst = svc_release_udp_skb,
.xpo_detach = svc_sock_detach, .xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free, .xpo_free = svc_sock_free,
.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
......
...@@ -1083,7 +1083,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport) ...@@ -1083,7 +1083,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
skb = skb_recv_datagram(sk, 0, 1, &err); skb = skb_recv_datagram(sk, 0, 1, &err);
if (skb != NULL) { if (skb != NULL) {
xs_udp_data_read_skb(&transport->xprt, sk, skb); xs_udp_data_read_skb(&transport->xprt, sk, skb);
skb_free_datagram_locked(sk, skb); consume_skb(skb);
continue; continue;
} }
if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment