Commit cb586c63 authored by David S. Miller's avatar David S. Miller

Merge branch 'udp-gso'

Willem de Bruijn says:

====================
udp gso

Segmentation offload reduces cycles/byte for large packets by
amortizing the cost of protocol stack traversal.

This patchset implements GSO for UDP. A process can concatenate and
submit multiple datagrams to the same destination in one send call
by setting socket option SOL_UDP/UDP_SEGMENT with the segment size,
or passing an analogous cmsg at send time.

The stack will send the entire large (up to network layer max size)
datagram through the protocol layer. At the GSO layer, it is broken
up in individual segments. All receive the same network layer header
and UDP src and dst port. All but the last segment have the same UDP
header, but the last may differ in length and checksum.

Initial results show a significant reduction in UDP cycles/byte.
See the main patch for more details and benchmark results.

        udp
          876 MB/s 14873 msg/s 624666 calls/s
            11,205,777,429      cycles

        udp gso
         2139 MB/s 36282 msg/s 36282 calls/s
            11,204,374,561      cycles

The patch set is broken down as follows:
- patch 1 is a prerequisite: code rearrangement, noop otherwise
- patch 2 implements the gso logic
- patch 3 adds protocol stack support for UDP_SEGMENT
- patch 4,5,7 are refinements
- patch 6 adds the cmsg interface
- patch 8..11 are tests

This idea was presented previously at netconf 2017-2
http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf

Changes v1 -> v2
  - Convert __udp_gso_segment to modify headers after skb_segment
  - Split main patch into two, one for gso logic, one for UDP_SEGMENT

Changes RFC -> v1
  - MSG_MORE:
      fixed, by allowing checksum offload with corking if gso
  - SKB_GSO_UDP_L4:
      made independent from SKB_GSO_UDP
      and removed skb_is_ufo() wrapper
  - NETIF_F_GSO_UDP_L4:
      add to netdev_features_string
      and to netdev-features.txt
      add BUILD_BUG_ON to match SKB_GSO_UDP_L4 value
  - UDP_MAX_SEGMENTS:
      introduce limit on number of segments per gso skb
      to avoid extreme cases like IP_MAX_MTU/IPV4_MIN_MTU
  - CHECKSUM_PARTIAL:
      test against missing feature after ndo_features_check
      if not supported return error, analogous to udp_send_check
  - MSG_ZEROCOPY: removed, deferred for now
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents a9537c93 3a687bef
......@@ -113,6 +113,13 @@ whatever headers there might be.
NETIF_F_TSO_ECN means that hardware can properly split packets with CWR bit
set, be it TCPv4 (when NETIF_F_TSO is enabled) or TCPv6 (NETIF_F_TSO6).
* Transmit UDP segmentation offload
NETIF_F_GSO_UDP_GSO_L4 accepts a single UDP header with a payload that exceeds
gso_size. On segmentation, it segments the payload on gso_size boundaries and
replicates the network and UDP headers (fixing up the last one if less than
gso_size).
* Transmit DMA from high memory
On platforms where this is relevant, NETIF_F_HIGHDMA signals that
......
......@@ -55,8 +55,9 @@ enum {
NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */
NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */
NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */
NETIF_F_GSO_UDP_L4_BIT, /* ... UDP payload GSO (not UFO) */
/**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */
NETIF_F_GSO_UDP_BIT,
NETIF_F_GSO_UDP_L4_BIT,
NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */
NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */
......@@ -147,6 +148,7 @@ enum {
#define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM)
#define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT)
#define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD)
#define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4)
#define for_each_netdev_feature(mask_addr, bit) \
for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
......@@ -216,6 +218,7 @@ enum {
NETIF_F_GSO_GRE_CSUM | \
NETIF_F_GSO_IPXIP4 | \
NETIF_F_GSO_IPXIP6 | \
NETIF_F_GSO_UDP_L4 | \
NETIF_F_GSO_UDP_TUNNEL | \
NETIF_F_GSO_UDP_TUNNEL_CSUM)
......
......@@ -4186,6 +4186,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
return (features & feature) == feature;
}
......
......@@ -573,6 +573,8 @@ enum {
SKB_GSO_ESP = 1 << 15,
SKB_GSO_UDP = 1 << 16,
SKB_GSO_UDP_L4 = 1 << 17,
};
#if BITS_PER_LONG > 32
......
......@@ -55,6 +55,7 @@ struct udp_sock {
* when the socket is uncorked.
*/
__u16 len; /* total length of pending frames */
__u16 gso_size;
/*
* Fields specific to UDP-Lite.
*/
......@@ -87,6 +88,8 @@ struct udp_sock {
int forward_deficit;
};
#define UDP_MAX_SEGMENTS (1 << 6UL)
static inline struct udp_sock *udp_sk(const struct sock *sk)
{
return (struct udp_sock *)sk;
......
......@@ -147,6 +147,7 @@ struct inet_cork {
__u8 ttl;
__s16 tos;
char priority;
__u16 gso_size;
};
struct inet_cork_full {
......
......@@ -76,6 +76,7 @@ struct ipcm_cookie {
__u8 ttl;
__s16 tos;
char priority;
__u16 gso_size;
};
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
......@@ -171,7 +172,7 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags);
struct inet_cork *cork, unsigned int flags);
static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
{
......
......@@ -298,6 +298,7 @@ struct ipcm6_cookie {
__s16 tclass;
__s8 dontfrag;
struct ipv6_txoptions *opt;
__u16 gso_size;
};
static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
......@@ -950,6 +951,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
struct inet_cork_full *cork,
const struct sockcm_cookie *sockc);
static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
......
......@@ -174,6 +174,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
struct udphdr *uh, udp_lookup_t lookup);
int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
netdev_features_t features,
unsigned int mss, __sum16 check);
static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
{
struct udphdr *uh;
......@@ -269,6 +273,7 @@ int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
......
......@@ -32,6 +32,7 @@ struct udphdr {
#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
#define UDP_SEGMENT 103 /* Set GSO segmentation size */
/* UDP encapsulation types */
#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
......
......@@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
[NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
[NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
......
......@@ -4940,6 +4940,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
thlen = tcp_hdrlen(skb);
} else if (unlikely(skb_is_gso_sctp(skb))) {
thlen = sizeof(struct sctphdr);
} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
thlen = sizeof(struct udphdr);
}
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
......
......@@ -878,11 +878,14 @@ static int __ip_append_data(struct sock *sk,
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
bool paged;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
mtu = cork->fragsize;
mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
paged = !!cork->gso_size;
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;
......@@ -906,7 +909,7 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
!(flags & MSG_MORE) &&
(!(flags & MSG_MORE) || cork->gso_size) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
......@@ -933,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
unsigned int pagedlen = 0;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
......@@ -953,8 +957,12 @@ static int __ip_append_data(struct sock *sk,
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
else if (!paged)
alloclen = fraglen;
else {
alloclen = min_t(int, fraglen, MAX_HEADER);
pagedlen = fraglen - alloclen;
}
alloclen += exthdrlen;
......@@ -998,7 +1006,7 @@ static int __ip_append_data(struct sock *sk,
/*
* Find where to start putting bytes.
*/
data = skb_put(skb, fraglen + exthdrlen);
data = skb_put(skb, fraglen + exthdrlen - pagedlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
......@@ -1014,7 +1022,7 @@ static int __ip_append_data(struct sock *sk,
pskb_trim_unique(skb_prev, maxfraglen);
}
copy = datalen - transhdrlen - fraggap;
copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
......@@ -1022,7 +1030,7 @@ static int __ip_append_data(struct sock *sk,
}
offset += copy;
length -= datalen - fraggap;
length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
......@@ -1135,6 +1143,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
*rtp = NULL;
cork->fragsize = ip_sk_use_pmtu(sk) ?
dst_mtu(&rt->dst) : rt->dst.dev->mtu;
cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
cork->dst = &rt->dst;
cork->length = 0;
cork->ttl = ipc->ttl;
......@@ -1214,7 +1224,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
return -EOPNOTSUPP;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
mtu = cork->fragsize;
mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
......@@ -1470,9 +1480,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags)
struct inet_cork *cork, unsigned int flags)
{
struct inet_cork cork;
struct sk_buff_head queue;
int err;
......@@ -1481,22 +1490,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,
__skb_queue_head_init(&queue);
cork.flags = 0;
cork.addr = 0;
cork.opt = NULL;
err = ip_setup_cork(sk, &cork, ipc, rtp);
cork->flags = 0;
cork->addr = 0;
cork->opt = NULL;
err = ip_setup_cork(sk, cork, ipc, rtp);
if (err)
return ERR_PTR(err);
err = __ip_append_data(sk, fl4, &queue, &cork,
err = __ip_append_data(sk, fl4, &queue, cork,
&current->task_frag, getfrag,
from, length, transhdrlen, flags);
if (err) {
__ip_flush_pending_frames(sk, &queue, &cork);
__ip_flush_pending_frames(sk, &queue, cork);
return ERR_PTR(err);
}
return __ip_make_skb(sk, fl4, &queue, &cork);
return __ip_make_skb(sk, fl4, &queue, cork);
}
/*
......
......@@ -757,7 +757,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
}
EXPORT_SYMBOL(udp_set_csum);
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
......@@ -777,6 +778,21 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
uh->len = htons(len);
uh->check = 0;
if (cork->gso_size) {
const int hlen = skb_network_header_len(skb) +
sizeof(struct udphdr);
if (hlen + cork->gso_size > cork->fragsize)
return -EINVAL;
if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
return -EINVAL;
if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
return -EIO;
skb_shinfo(skb)->gso_size = cork->gso_size;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
}
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
......@@ -828,7 +844,7 @@ int udp_push_pending_frames(struct sock *sk)
if (!skb)
goto out;
err = udp_send_skb(skb, fl4);
err = udp_send_skb(skb, fl4, &inet->cork.base);
out:
up->len = 0;
......@@ -837,6 +853,43 @@ int udp_push_pending_frames(struct sock *sk)
}
EXPORT_SYMBOL(udp_push_pending_frames);
static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
{
switch (cmsg->cmsg_type) {
case UDP_SEGMENT:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
return -EINVAL;
*gso_size = *(__u16 *)CMSG_DATA(cmsg);
return 0;
default:
return -EINVAL;
}
}
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
{
struct cmsghdr *cmsg;
bool need_ip = false;
int err;
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
if (cmsg->cmsg_level != SOL_UDP) {
need_ip = true;
continue;
}
err = __udp_cmsg_send(cmsg, gso_size);
if (err)
return err;
}
return need_ip;
}
EXPORT_SYMBOL_GPL(udp_cmsg_send);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
......@@ -922,10 +975,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
if (unlikely(err)) {
err = udp_cmsg_send(sk, msg, &ipc.gso_size);
if (err > 0)
err = ip_cmsg_send(sk, msg, &ipc,
sk->sk_family == AF_INET6);
if (unlikely(err < 0)) {
kfree(ipc.opt);
return err;
}
......@@ -1030,12 +1087,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* Lockless fast path for the non-corking case. */
if (!corkreq) {
struct inet_cork cork;
skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc, &rt,
msg->msg_flags);
&cork, msg->msg_flags);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
err = udp_send_skb(skb, fl4);
err = udp_send_skb(skb, fl4, &cork);
goto out;
}
......@@ -2365,6 +2424,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->no_check6_rx = valbool;
break;
case UDP_SEGMENT:
if (val < 0 || val > USHRT_MAX)
return -EINVAL;
up->gso_size = val;
break;
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
......@@ -2455,6 +2520,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->no_check6_rx;
break;
case UDP_SEGMENT:
val = up->gso_size;
break;
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
......
......@@ -187,6 +187,68 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
netdev_features_t features,
unsigned int mss, __sum16 check)
{
struct sock *sk = gso_skb->sk;
unsigned int sum_truesize = 0;
struct sk_buff *segs, *seg;
unsigned int hdrlen;
struct udphdr *uh;
if (gso_skb->len <= sizeof(*uh) + mss)
return ERR_PTR(-EINVAL);
hdrlen = gso_skb->data - skb_mac_header(gso_skb);
skb_pull(gso_skb, sizeof(*uh));
/* clear destructor to avoid skb_segment assigning it to tail */
WARN_ON_ONCE(gso_skb->destructor != sock_wfree);
gso_skb->destructor = NULL;
segs = skb_segment(gso_skb, features);
if (unlikely(IS_ERR_OR_NULL(segs))) {
gso_skb->destructor = sock_wfree;
return segs;
}
for (seg = segs; seg; seg = seg->next) {
uh = udp_hdr(seg);
uh->len = htons(seg->len - hdrlen);
uh->check = check;
/* last packet can be partial gso_size */
if (!seg->next)
csum_replace2(&uh->check, htons(mss),
htons(seg->len - hdrlen - sizeof(*uh)));
seg->destructor = sock_wfree;
seg->sk = sk;
sum_truesize += seg->truesize;
}
refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
return segs;
}
EXPORT_SYMBOL_GPL(__udp_gso_segment);
static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
netdev_features_t features)
{
const struct iphdr *iph = ip_hdr(gso_skb);
unsigned int mss = skb_shinfo(gso_skb)->gso_size;
if (!can_checksum_protocol(features, htons(ETH_P_IP)))
return ERR_PTR(-EIO);
return __udp_gso_segment(gso_skb, features, mss,
udp_v4_check(sizeof(struct udphdr) + mss,
iph->saddr, iph->daddr, 0));
}
EXPORT_SYMBOL_GPL(__udp4_gso_segment);
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
......@@ -203,12 +265,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
}
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
goto out;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
return __udp4_gso_segment(skb, features);
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
......
......@@ -88,9 +88,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
if (skb->encapsulation &&
skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
udpfrag = proto == IPPROTO_UDP && encap;
udpfrag = proto == IPPROTO_UDP && encap &&
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
else
udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
ops = rcu_dereference(inet6_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
......
......@@ -1240,6 +1240,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (mtu < IPV6_MIN_MTU)
return -EINVAL;
cork->base.fragsize = mtu;
cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
......@@ -1274,6 +1276,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
bool paged;
skb = skb_peek_tail(queue);
if (!skb) {
......@@ -1281,7 +1284,8 @@ static int __ip6_append_data(struct sock *sk,
dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
}
mtu = cork->fragsize;
paged = !!cork->gso_size;
mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
......@@ -1329,7 +1333,7 @@ static int __ip6_append_data(struct sock *sk,
if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
headersize == sizeof(struct ipv6hdr) &&
length <= mtu - headersize &&
!(flags & MSG_MORE) &&
(!(flags & MSG_MORE) || cork->gso_size) &&
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
......@@ -1372,6 +1376,7 @@ static int __ip6_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
unsigned int pagedlen = 0;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
......@@ -1394,11 +1399,17 @@ static int __ip6_append_data(struct sock *sk,
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
alloclen = datalen + fragheaderlen;
else if (!paged)
alloclen = fraglen;
else {
alloclen = min_t(int, fraglen, MAX_HEADER);
pagedlen = fraglen - alloclen;
}
alloclen += dst_exthdrlen;
......@@ -1420,7 +1431,7 @@ static int __ip6_append_data(struct sock *sk,
*/
alloclen += sizeof(struct frag_hdr);
copy = datalen - transhdrlen - fraggap;
copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy < 0) {
err = -EINVAL;
goto error;
......@@ -1459,7 +1470,7 @@ static int __ip6_append_data(struct sock *sk,
/*
* Find where to start putting bytes
*/
data = skb_put(skb, fraglen);
data = skb_put(skb, fraglen - pagedlen);
skb_set_network_header(skb, exthdrlen);
data += fragheaderlen;
skb->transport_header = (skb->network_header +
......@@ -1482,7 +1493,7 @@ static int __ip6_append_data(struct sock *sk,
}
offset += copy;
length -= datalen - fraggap;
length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
dst_exthdrlen = 0;
......@@ -1755,9 +1766,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
struct inet_cork_full *cork,
const struct sockcm_cookie *sockc)
{
struct inet_cork_full cork;
struct inet6_cork v6_cork;
struct sk_buff_head queue;
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
......@@ -1768,27 +1779,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
__skb_queue_head_init(&queue);
cork.base.flags = 0;
cork.base.addr = 0;
cork.base.opt = NULL;
cork.base.dst = NULL;
cork->base.flags = 0;
cork->base.addr = 0;
cork->base.opt = NULL;
cork->base.dst = NULL;
v6_cork.opt = NULL;
err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
if (err) {
ip6_cork_release(&cork, &v6_cork);
ip6_cork_release(cork, &v6_cork);
return ERR_PTR(err);
}
if (ipc6->dontfrag < 0)
ipc6->dontfrag = inet6_sk(sk)->dontfrag;
err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags, ipc6, sockc);
if (err) {
__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
return ERR_PTR(err);
}
return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
......@@ -1023,7 +1023,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
* Sending
*/
static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct udphdr *uh;
......@@ -1042,6 +1043,21 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
uh->len = htons(len);
uh->check = 0;
if (cork->gso_size) {
const int hlen = skb_network_header_len(skb) +
sizeof(struct udphdr);
if (hlen + cork->gso_size > cork->fragsize)
return -EINVAL;
if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
return -EINVAL;
if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
return -EIO;
skb_shinfo(skb)->gso_size = cork->gso_size;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
}
if (is_udplite)
csum = udplite_csum(skb);
else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
......@@ -1093,7 +1109,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
if (!skb)
goto out;
err = udp_v6_send_skb(skb, &fl6);
err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
out:
up->len = 0;
......@@ -1127,6 +1143,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc6.hlimit = -1;
ipc6.tclass = -1;
ipc6.dontfrag = -1;
ipc6.gso_size = up->gso_size;
sockc.tsflags = sk->sk_tsflags;
/* destination address check */
......@@ -1259,7 +1276,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt->tot_len = sizeof(*opt);
ipc6.opt = opt;
err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
if (err > 0)
err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
&ipc6, &sockc);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
......@@ -1324,15 +1344,16 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* Lockless fast path for the non-corking case */
if (!corkreq) {
struct inet_cork_full cork;
struct sk_buff *skb;
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
&fl6, (struct rt6_info *)dst,
msg->msg_flags, &sockc);
msg->msg_flags, &cork, &sockc);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
err = udp_v6_send_skb(skb, &fl6);
err = udp_v6_send_skb(skb, &fl6, &cork.base);
goto out;
}
......
......@@ -17,6 +17,20 @@
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
netdev_features_t features)
{
const struct ipv6hdr *ip6h = ipv6_hdr(gso_skb);
unsigned int mss = skb_shinfo(gso_skb)->gso_size;
if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
return ERR_PTR(-EIO);
return __udp_gso_segment(gso_skb, features, mss,
udp_v6_check(sizeof(struct udphdr) + mss,
&ip6h->saddr, &ip6h->daddr, 0));
}
static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
......@@ -42,12 +56,15 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
const struct ipv6hdr *ipv6h;
struct udphdr *uh;
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
goto out;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
return __udp6_gso_segment(skb, features);
/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
* do checksum of UDP packets sent as multiple IP fragments.
*/
......
......@@ -8,3 +8,6 @@ reuseport_bpf_numa
reuseport_dualstack
reuseaddr_conflict
tcp_mmap
udpgso
udpgso_bench_rx
udpgso_bench_tx
......@@ -5,12 +5,14 @@ CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
CFLAGS += -I../../../../usr/include/
TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh in_netns.sh pmtu.sh
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh in_netns.sh pmtu.sh udpgso.sh
TEST_PROGS += udpgso_bench.sh
TEST_GEN_FILES = socket
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
TEST_GEN_FILES += tcp_mmap
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
TEST_GEN_PROGS += udpgso udpgso_bench_tx udpgso_bench_rx
include ../lib.mk
......
This diff is collapsed.
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
#
# Run a series of udpgso regression tests
echo "ipv4 cmsg"
./in_netns.sh ./udpgso -4 -C
echo "ipv4 setsockopt"
./in_netns.sh ./udpgso -4 -C -s
echo "ipv6 cmsg"
./in_netns.sh ./udpgso -6 -C
echo "ipv6 setsockopt"
./in_netns.sh ./udpgso -6 -C -s
echo "ipv4 connected"
./in_netns.sh ./udpgso -4 -c
# blocked on 2nd loopback address
# echo "ipv6 connected"
# ./in_netns.sh ./udpgso -6 -c
echo "ipv4 msg_more"
./in_netns.sh ./udpgso -4 -C -m
echo "ipv6 msg_more"
./in_netns.sh ./udpgso -6 -C -m
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
#
# Run a series of udpgso benchmarks
wake_children() {
local -r jobs="$(jobs -p)"
if [[ "${jobs}" != "" ]]; then
kill -1 ${jobs} 2>/dev/null
fi
}
trap wake_children EXIT
run_one() {
local -r args=$@
./udpgso_bench_rx &
./udpgso_bench_rx -t &
./udpgso_bench_tx ${args}
}
run_in_netns() {
local -r args=$@
./in_netns.sh $0 __subprocess ${args}
}
run_udp() {
local -r args=$@
echo "udp"
run_in_netns ${args}
echo "udp gso"
run_in_netns ${args} -S
echo "udp gso zerocopy"
run_in_netns ${args} -S -z
}
run_tcp() {
local -r args=$@
echo "tcp"
run_in_netns ${args} -t
echo "tcp zerocopy"
run_in_netns ${args} -t -z
}
run_all() {
local -r core_args="-l 4"
local -r ipv4_args="${core_args} -4 -D 127.0.0.1"
local -r ipv6_args="${core_args} -6 -D ::1"
echo "ipv4"
run_tcp "${ipv4_args}"
run_udp "${ipv4_args}"
echo "ipv6"
run_tcp "${ipv4_args}"
run_udp "${ipv6_args}"
}
if [[ $# -eq 0 ]]; then
run_all
elif [[ $1 == "__subprocess" ]]; then
shift
run_one $@
else
run_in_netns $@
fi
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <error.h>
#include <errno.h>
#include <limits.h>
#include <linux/errqueue.h>
#include <linux/if_packet.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <poll.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
static int cfg_port = 8000;
static bool cfg_tcp;
static bool cfg_verify;
static bool interrupted;
static unsigned long packets, bytes;
static void sigint_handler(int signum)
{
if (signum == SIGINT)
interrupted = true;
}
static unsigned long gettimeofday_ms(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
}
static void do_poll(int fd)
{
struct pollfd pfd;
int ret;
pfd.events = POLLIN;
pfd.revents = 0;
pfd.fd = fd;
do {
ret = poll(&pfd, 1, 10);
if (ret == -1)
error(1, errno, "poll");
if (ret == 0)
continue;
if (pfd.revents != POLLIN)
error(1, errno, "poll: 0x%x expected 0x%x\n",
pfd.revents, POLLIN);
} while (!ret && !interrupted);
}
static int do_socket(bool do_tcp)
{
struct sockaddr_in6 addr = {0};
int fd, val;
fd = socket(PF_INET6, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
if (fd == -1)
error(1, errno, "socket");
val = 1 << 21;
if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)))
error(1, errno, "setsockopt rcvbuf");
val = 1;
if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)))
error(1, errno, "setsockopt reuseport");
addr.sin6_family = PF_INET6;
addr.sin6_port = htons(cfg_port);
addr.sin6_addr = in6addr_any;
if (bind(fd, (void *) &addr, sizeof(addr)))
error(1, errno, "bind");
if (do_tcp) {
int accept_fd = fd;
if (listen(accept_fd, 1))
error(1, errno, "listen");
do_poll(accept_fd);
fd = accept(accept_fd, NULL, NULL);
if (fd == -1)
error(1, errno, "accept");
if (close(accept_fd))
error(1, errno, "close accept fd");
}
return fd;
}
/* Flush all outstanding bytes for the tcp receive queue */
static void do_flush_tcp(int fd)
{
int ret;
while (true) {
/* MSG_TRUNC flushes up to len bytes */
ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
if (ret == -1 && errno == EAGAIN)
return;
if (ret == -1)
error(1, errno, "flush");
if (ret == 0) {
/* client detached */
exit(0);
}
packets++;
bytes += ret;
}
}
static char sanitized_char(char val)
{
return (val >= 'a' && val <= 'z') ? val : '.';
}
static void do_verify_udp(const char *data, int len)
{
char cur = data[0];
int i;
/* verify contents */
if (cur < 'a' || cur > 'z')
error(1, 0, "data initial byte out of range");
for (i = 1; i < len; i++) {
if (cur == 'z')
cur = 'a';
else
cur++;
if (data[i] != cur)
error(1, 0, "data[%d]: len %d, %c(%hhu) != %c(%hhu)\n",
i, len,
sanitized_char(data[i]), data[i],
sanitized_char(cur), cur);
}
}
/* Flush all outstanding datagrams. Verify first few bytes of each. */
static void do_flush_udp(int fd)
{
static char rbuf[ETH_DATA_LEN];
int ret, len, budget = 256;
len = cfg_verify ? sizeof(rbuf) : 0;
while (budget--) {
/* MSG_TRUNC will make return value full datagram length */
ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
if (ret == -1 && errno == EAGAIN)
return;
if (ret == -1)
error(1, errno, "recv");
if (len) {
if (ret == 0)
error(1, errno, "recv: 0 byte datagram\n");
do_verify_udp(rbuf, ret);
}
packets++;
bytes += ret;
}
}
static void usage(const char *filepath)
{
error(1, 0, "Usage: %s [-tv] [-p port]", filepath);
}
static void parse_opts(int argc, char **argv)
{
int c;
while ((c = getopt(argc, argv, "ptv")) != -1) {
switch (c) {
case 'p':
cfg_port = htons(strtoul(optarg, NULL, 0));
break;
case 't':
cfg_tcp = true;
break;
case 'v':
cfg_verify = true;
break;
}
}
if (optind != argc)
usage(argv[0]);
if (cfg_tcp && cfg_verify)
error(1, 0, "TODO: implement verify mode for tcp");
}
static void do_recv(void)
{
unsigned long tnow, treport;
int fd;
fd = do_socket(cfg_tcp);
treport = gettimeofday_ms() + 1000;
do {
do_poll(fd);
if (cfg_tcp)
do_flush_tcp(fd);
else
do_flush_udp(fd);
tnow = gettimeofday_ms();
if (tnow > treport) {
if (packets)
fprintf(stderr,
"%s rx: %6lu MB/s %8lu calls/s\n",
cfg_tcp ? "tcp" : "udp",
bytes >> 20, packets);
bytes = packets = 0;
treport = tnow + 1000;
}
} while (!interrupted);
if (close(fd))
error(1, errno, "close");
}
int main(int argc, char **argv)
{
parse_opts(argc, argv);
signal(SIGINT, sigint_handler);
do_recv();
return 0;
}
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <errno.h>
#include <error.h>
#include <netinet/if_ether.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#ifndef ETH_MAX_MTU
#define ETH_MAX_MTU 0xFFFFU
#endif
#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103
#endif
#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY 60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0x4000000
#endif
#define NUM_PKT 100
static bool cfg_cache_trash;
static int cfg_cpu = -1;
static int cfg_connected = true;
static int cfg_family = PF_UNSPEC;
static uint16_t cfg_mss;
static int cfg_payload_len = (1472 * 42);
static int cfg_port = 8000;
static int cfg_runtime_ms = -1;
static bool cfg_segment;
static bool cfg_sendmmsg;
static bool cfg_tcp;
static bool cfg_zerocopy;
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;
static bool interrupted;
static char buf[NUM_PKT][ETH_MAX_MTU];
static void sigint_handler(int signum)
{
if (signum == SIGINT)
interrupted = true;
}
static unsigned long gettimeofday_ms(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
}
static int set_cpu(int cpu)
{
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(cpu, &mask);
if (sched_setaffinity(0, sizeof(mask), &mask))
error(1, 0, "setaffinity %d", cpu);
return 0;
}
static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
{
struct sockaddr_in6 *addr6 = (void *) sockaddr;
struct sockaddr_in *addr4 = (void *) sockaddr;
switch (domain) {
case PF_INET:
addr4->sin_family = AF_INET;
addr4->sin_port = htons(cfg_port);
if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
error(1, 0, "ipv4 parse error: %s", str_addr);
break;
case PF_INET6:
addr6->sin6_family = AF_INET6;
addr6->sin6_port = htons(cfg_port);
if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
error(1, 0, "ipv6 parse error: %s", str_addr);
break;
default:
error(1, 0, "illegal domain");
}
}
static void flush_zerocopy(int fd)
{
struct msghdr msg = {0}; /* flush */
int ret;
while (1) {
ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
if (ret == -1 && errno == EAGAIN)
break;
if (ret == -1)
error(1, errno, "errqueue");
if (msg.msg_flags != (MSG_ERRQUEUE | MSG_CTRUNC))
error(1, 0, "errqueue: flags 0x%x\n", msg.msg_flags);
msg.msg_flags = 0;
}
}
static int send_tcp(int fd, char *data)
{
int ret, done = 0, count = 0;
while (done < cfg_payload_len) {
ret = send(fd, data + done, cfg_payload_len - done,
cfg_zerocopy ? MSG_ZEROCOPY : 0);
if (ret == -1)
error(1, errno, "write");
done += ret;
count++;
}
return count;
}
static int send_udp(int fd, char *data)
{
int ret, total_len, len, count = 0;
total_len = cfg_payload_len;
while (total_len) {
len = total_len < cfg_mss ? total_len : cfg_mss;
ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
cfg_connected ? NULL : (void *)&cfg_dst_addr,
cfg_connected ? 0 : cfg_alen);
if (ret == -1)
error(1, errno, "write");
if (ret != len)
error(1, errno, "write: %uB != %uB\n", ret, len);
total_len -= len;
count++;
}
return count;
}
static int send_udp_sendmmsg(int fd, char *data)
{
const int max_nr_msg = ETH_MAX_MTU / ETH_DATA_LEN;
struct mmsghdr mmsgs[max_nr_msg];
struct iovec iov[max_nr_msg];
unsigned int off = 0, left;
int i = 0, ret;
memset(mmsgs, 0, sizeof(mmsgs));
left = cfg_payload_len;
while (left) {
if (i == max_nr_msg)
error(1, 0, "sendmmsg: exceeds max_nr_msg");
iov[i].iov_base = data + off;
iov[i].iov_len = cfg_mss < left ? cfg_mss : left;
mmsgs[i].msg_hdr.msg_iov = iov + i;
mmsgs[i].msg_hdr.msg_iovlen = 1;
off += iov[i].iov_len;
left -= iov[i].iov_len;
i++;
}
ret = sendmmsg(fd, mmsgs, i, cfg_zerocopy ? MSG_ZEROCOPY : 0);
if (ret == -1)
error(1, errno, "sendmmsg");
return ret;
}
static void send_udp_segment_cmsg(struct cmsghdr *cm)
{
uint16_t *valp;
cm->cmsg_level = SOL_UDP;
cm->cmsg_type = UDP_SEGMENT;
cm->cmsg_len = CMSG_LEN(sizeof(cfg_mss));
valp = (void *)CMSG_DATA(cm);
*valp = cfg_mss;
}
static int send_udp_segment(int fd, char *data)
{
char control[CMSG_SPACE(sizeof(cfg_mss))] = {0};
struct msghdr msg = {0};
struct iovec iov = {0};
int ret;
iov.iov_base = data;
iov.iov_len = cfg_payload_len;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = control;
msg.msg_controllen = sizeof(control);
send_udp_segment_cmsg(CMSG_FIRSTHDR(&msg));
msg.msg_name = (void *)&cfg_dst_addr;
msg.msg_namelen = cfg_alen;
ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0);
if (ret == -1)
error(1, errno, "sendmsg");
if (ret != iov.iov_len)
error(1, 0, "sendmsg: %u != %lu\n", ret, iov.iov_len);
return 1;
}
static void usage(const char *filepath)
{
error(1, 0, "Usage: %s [-46cmStuz] [-C cpu] [-D dst ip] [-l secs] [-p port] [-s sendsize]",
filepath);
}
static void parse_opts(int argc, char **argv)
{
int max_len, hdrlen;
int c;
while ((c = getopt(argc, argv, "46cC:D:l:mp:s:Stuz")) != -1) {
switch (c) {
case '4':
if (cfg_family != PF_UNSPEC)
error(1, 0, "Pass one of -4 or -6");
cfg_family = PF_INET;
cfg_alen = sizeof(struct sockaddr_in);
break;
case '6':
if (cfg_family != PF_UNSPEC)
error(1, 0, "Pass one of -4 or -6");
cfg_family = PF_INET6;
cfg_alen = sizeof(struct sockaddr_in6);
break;
case 'c':
cfg_cache_trash = true;
break;
case 'C':
cfg_cpu = strtol(optarg, NULL, 0);
break;
case 'D':
setup_sockaddr(cfg_family, optarg, &cfg_dst_addr);
break;
case 'l':
cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000;
break;
case 'm':
cfg_sendmmsg = true;
break;
case 'p':
cfg_port = strtoul(optarg, NULL, 0);
break;
case 's':
cfg_payload_len = strtoul(optarg, NULL, 0);
break;
case 'S':
cfg_segment = true;
break;
case 't':
cfg_tcp = true;
break;
case 'u':
cfg_connected = false;
break;
case 'z':
cfg_zerocopy = true;
break;
}
}
if (optind != argc)
usage(argv[0]);
if (cfg_family == PF_UNSPEC)
error(1, 0, "must pass one of -4 or -6");
if (cfg_tcp && !cfg_connected)
error(1, 0, "connectionless tcp makes no sense");
if (cfg_segment && cfg_sendmmsg)
error(1, 0, "cannot combine segment offload and sendmmsg");
if (cfg_family == PF_INET)
hdrlen = sizeof(struct iphdr) + sizeof(struct udphdr);
else
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct udphdr);
cfg_mss = ETH_DATA_LEN - hdrlen;
max_len = ETH_MAX_MTU - hdrlen;
if (cfg_payload_len > max_len)
error(1, 0, "payload length %u exceeds max %u",
cfg_payload_len, max_len);
}
static void set_pmtu_discover(int fd, bool is_ipv4)
{
int level, name, val;
if (is_ipv4) {
level = SOL_IP;
name = IP_MTU_DISCOVER;
val = IP_PMTUDISC_DO;
} else {
level = SOL_IPV6;
name = IPV6_MTU_DISCOVER;
val = IPV6_PMTUDISC_DO;
}
if (setsockopt(fd, level, name, &val, sizeof(val)))
error(1, errno, "setsockopt path mtu");
}
int main(int argc, char **argv)
{
unsigned long num_msgs, num_sends;
unsigned long tnow, treport, tstop;
int fd, i, val;
parse_opts(argc, argv);
if (cfg_cpu > 0)
set_cpu(cfg_cpu);
for (i = 0; i < sizeof(buf[0]); i++)
buf[0][i] = 'a' + (i % 26);
for (i = 1; i < NUM_PKT; i++)
memcpy(buf[i], buf[0], sizeof(buf[0]));
signal(SIGINT, sigint_handler);
fd = socket(cfg_family, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
if (fd == -1)
error(1, errno, "socket");
if (cfg_zerocopy) {
val = 1;
if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val)))
error(1, errno, "setsockopt zerocopy");
}
if (cfg_connected &&
connect(fd, (void *)&cfg_dst_addr, cfg_alen))
error(1, errno, "connect");
if (cfg_segment)
set_pmtu_discover(fd, cfg_family == PF_INET);
num_msgs = num_sends = 0;
tnow = gettimeofday_ms();
tstop = tnow + cfg_runtime_ms;
treport = tnow + 1000;
i = 0;
do {
if (cfg_tcp)
num_sends += send_tcp(fd, buf[i]);
else if (cfg_segment)
num_sends += send_udp_segment(fd, buf[i]);
else if (cfg_sendmmsg)
num_sends += send_udp_sendmmsg(fd, buf[i]);
else
num_sends += send_udp(fd, buf[i]);
num_msgs++;
if (cfg_zerocopy && ((num_msgs & 0xF) == 0))
flush_zerocopy(fd);
tnow = gettimeofday_ms();
if (tnow > treport) {
fprintf(stderr,
"%s tx: %6lu MB/s %8lu calls/s %6lu msg/s\n",
cfg_tcp ? "tcp" : "udp",
(num_msgs * cfg_payload_len) >> 20,
num_sends, num_msgs);
num_msgs = num_sends = 0;
treport = tnow + 1000;
}
/* cold cache when writing buffer */
if (cfg_cache_trash)
i = ++i < NUM_PKT ? i : 0;
} while (!interrupted && (cfg_runtime_ms == -1 || tnow < tstop));
if (close(fd))
error(1, errno, "close");
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment