Commit 983f507c authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'net-support-ipv4-big-tcp'

Xin Long says:

====================
net: support ipv4 big tcp

This is similar to the BIG TCP patchset added by Eric for IPv6:

  https://lwn.net/Articles/895398/

Different from IPv6, IPv4 tot_len is 16-bit long only, and IPv4 header
doesn't have exthdrs(options) for the BIG TCP packets' length. To make
it simple, as David and Paolo suggested, we set IPv4 tot_len to 0 to
indicate this might be a BIG TCP packet and use skb->len as the real
IPv4 total length.

This will work safely, as all BIG TCP packets are GSO/GRO packets and
processed on the same host as they were created; There is no padding
in GSO/GRO packets, and skb->len - network_offset is exactly the IPv4
packet total length; Also, before implementing the feature, all those
places that may get iph tot_len from BIG TCP packets are taken care
with some new APIs:

Patch 1 adds some APIs for iph tot_len setting and getting, which are
used in all these places where IPv4 BIG TCP packets may reach in Patch
2-7, Patch 8 adds a GSO_TCP tp_status for af_packet users, and Patch 9
add new netlink attributes to make IPv4 BIG TCP independent from IPv6
BIG TCP on configuration, and Patch 10 implements this feature.

Note that changes similar to those in Patches 2-6 are also needed for IPv6
BIG TCP packets; they will be addressed in another patchset.

A similar performance test was done for IPv4 BIG TCP with a 25Gbit NIC
and 1.5K MTU:

No BIG TCP:
for i in {1..10}; do netperf -t TCP_RR -H 192.168.100.1 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done
168          322          337          3776.49
143          236          277          4654.67
128          258          288          4772.83
171          229          278          4645.77
175          228          243          4678.93
149          239          279          4599.86
164          234          268          4606.94
155          276          289          4235.82
180          255          268          4418.95
168          241          249          4417.82

Enable BIG TCP:
ip link set dev ens1f0np0 gro_ipv4_max_size 128000 gso_ipv4_max_size 128000
for i in {1..10}; do netperf -t TCP_RR -H 192.168.100.1 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done
161          241          252          4821.73
174          205          217          5098.28
167          208          220          5001.43
164          228          249          4883.98
150          233          249          4914.90
180          233          244          4819.66
154          208          219          5004.92
157          209          247          4999.78
160          218          246          4842.31
174          206          217          5080.99

Thanks for the feedback from Eric and David Ahern.
====================

Link: https://lore.kernel.org/r/cover.1674921359.git.lucien.xin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents d8673afb b1a78b9b
...@@ -157,7 +157,7 @@ void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type) ...@@ -157,7 +157,7 @@ void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type)
return NULL; return NULL;
ip4h = ip_hdr(skb); ip4h = ip_hdr(skb);
pktlen = ntohs(ip4h->tot_len); pktlen = skb_ip_totlen(skb);
if (ip4h->ihl < 5 || ip4h->version != 4) if (ip4h->ihl < 5 || ip4h->version != 4)
return NULL; return NULL;
if (skb->len < pktlen || pktlen < (ip4h->ihl * 4)) if (skb->len < pktlen || pktlen < (ip4h->ihl * 4))
......
...@@ -35,4 +35,25 @@ static inline unsigned int ip_transport_len(const struct sk_buff *skb) ...@@ -35,4 +35,25 @@ static inline unsigned int ip_transport_len(const struct sk_buff *skb)
{ {
return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb); return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb);
} }
/* Return the true IPv4 datagram length of @skb.
 *
 * IPv4 BIG TCP packets carry tot_len == 0 because their real length does
 * not fit in the 16-bit field; such packets are always GSO TCP and are
 * processed on the host that built them, so the skb length minus the
 * network header offset is exactly the datagram length.
 */
static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph)
{
	u32 len = ntohs(iph->tot_len);

	if (!len && skb_is_gso(skb) && skb_is_gso_tcp(skb))
		return skb->len - skb_network_offset(skb);

	return len;
}
/* Like iph_totlen(), but reads the IPv4 header from @skb itself.
 * Only valid once the network header offset of @skb has been set.
 */
static inline unsigned int skb_ip_totlen(const struct sk_buff *skb)
{
	return iph_totlen(skb, ip_hdr(skb));
}
/* IPv4 datagram length is stored into 16bit field (tot_len) */
#define IP_MAX_MTU 0xFFFFU

/* Store @len into the IPv4 header's tot_len field.
 *
 * A BIG TCP GSO packet longer than 64KiB cannot be represented in the
 * 16-bit field, so 0 is written instead; readers recover the real length
 * via iph_totlen()/skb_ip_totlen().
 */
static inline void iph_set_totlen(struct iphdr *iph, unsigned int len)
{
	if (len <= IP_MAX_MTU)
		iph->tot_len = htons(len);
	else
		iph->tot_len = 0;
}
#endif /* _LINUX_IP_H */ #endif /* _LINUX_IP_H */
...@@ -1964,6 +1964,8 @@ enum netdev_ml_priv_type { ...@@ -1964,6 +1964,8 @@ enum netdev_ml_priv_type {
* @gso_max_segs: Maximum number of segments that can be passed to the * @gso_max_segs: Maximum number of segments that can be passed to the
* NIC for GSO * NIC for GSO
* @tso_max_segs: Device (as in HW) limit on the max TSO segment count * @tso_max_segs: Device (as in HW) limit on the max TSO segment count
* @gso_ipv4_max_size: Maximum size of generic segmentation offload,
* for IPv4.
* *
* @dcbnl_ops: Data Center Bridging netlink ops * @dcbnl_ops: Data Center Bridging netlink ops
* @num_tc: Number of traffic classes in the net device * @num_tc: Number of traffic classes in the net device
...@@ -2004,6 +2006,8 @@ enum netdev_ml_priv_type { ...@@ -2004,6 +2006,8 @@ enum netdev_ml_priv_type {
* keep a list of interfaces to be deleted. * keep a list of interfaces to be deleted.
* @gro_max_size: Maximum size of aggregated packet in generic * @gro_max_size: Maximum size of aggregated packet in generic
* receive offload (GRO) * receive offload (GRO)
* @gro_ipv4_max_size: Maximum size of aggregated packet in generic
* receive offload (GRO), for IPv4.
* *
* @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @dev_addr_shadow: Copy of @dev_addr to catch direct writes.
* @linkwatch_dev_tracker: refcount tracker used by linkwatch. * @linkwatch_dev_tracker: refcount tracker used by linkwatch.
...@@ -2207,6 +2211,7 @@ struct net_device { ...@@ -2207,6 +2211,7 @@ struct net_device {
*/ */
#define GRO_MAX_SIZE (8 * 65535u) #define GRO_MAX_SIZE (8 * 65535u)
unsigned int gro_max_size; unsigned int gro_max_size;
unsigned int gro_ipv4_max_size;
rx_handler_func_t __rcu *rx_handler; rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data; void __rcu *rx_handler_data;
...@@ -2330,6 +2335,7 @@ struct net_device { ...@@ -2330,6 +2335,7 @@ struct net_device {
u16 gso_max_segs; u16 gso_max_segs;
#define TSO_MAX_SEGS U16_MAX #define TSO_MAX_SEGS U16_MAX
u16 tso_max_segs; u16 tso_max_segs;
unsigned int gso_ipv4_max_size;
#ifdef CONFIG_DCB #ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops; const struct dcbnl_rtnl_ops *dcbnl_ops;
......
...@@ -29,7 +29,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) ...@@ -29,7 +29,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
if (iph->ihl < 5 || iph->version != 4) if (iph->ihl < 5 || iph->version != 4)
return -1; return -1;
len = ntohs(iph->tot_len); len = iph_totlen(pkt->skb, iph);
thoff = iph->ihl * 4; thoff = iph->ihl * 4;
if (pkt->skb->len < len) if (pkt->skb->len < len)
return -1; return -1;
...@@ -64,7 +64,7 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) ...@@ -64,7 +64,7 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt)
if (iph->ihl < 5 || iph->version != 4) if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error; goto inhdr_error;
len = ntohs(iph->tot_len); len = iph_totlen(pkt->skb, iph);
thoff = iph->ihl * 4; thoff = iph->ihl * 4;
if (pkt->skb->len < len) { if (pkt->skb->len < len) {
__IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS);
......
...@@ -35,9 +35,6 @@ ...@@ -35,9 +35,6 @@
#include <linux/cache.h> #include <linux/cache.h>
#include <linux/security.h> #include <linux/security.h>
/* IPv4 datagram length is stored into 16bit field (tot_len) */
#define IP_MAX_MTU 0xFFFFU
#define RTO_ONLINK 0x01 #define RTO_ONLINK 0x01
#define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
......
...@@ -374,6 +374,9 @@ enum { ...@@ -374,6 +374,9 @@ enum {
IFLA_DEVLINK_PORT, IFLA_DEVLINK_PORT,
IFLA_GSO_IPV4_MAX_SIZE,
IFLA_GRO_IPV4_MAX_SIZE,
__IFLA_MAX __IFLA_MAX
}; };
......
...@@ -115,6 +115,7 @@ struct tpacket_auxdata { ...@@ -115,6 +115,7 @@ struct tpacket_auxdata {
#define TP_STATUS_BLK_TMO (1 << 5) #define TP_STATUS_BLK_TMO (1 << 5)
#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ #define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */
#define TP_STATUS_CSUM_VALID (1 << 7) #define TP_STATUS_CSUM_VALID (1 << 7)
#define TP_STATUS_GSO_TCP (1 << 8)
/* Tx ring - header status */ /* Tx ring - header status */
#define TP_STATUS_AVAILABLE 0 #define TP_STATUS_AVAILABLE 0
......
...@@ -214,7 +214,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) ...@@ -214,7 +214,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto csum_error; goto csum_error;
len = ntohs(iph->tot_len); len = skb_ip_totlen(skb);
if (skb->len < len) { if (skb->len < len) {
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop; goto drop;
......
...@@ -212,7 +212,7 @@ static int nf_ct_br_ip_check(const struct sk_buff *skb) ...@@ -212,7 +212,7 @@ static int nf_ct_br_ip_check(const struct sk_buff *skb)
iph->version != 4) iph->version != 4)
return -1; return -1;
len = ntohs(iph->tot_len); len = skb_ip_totlen(skb);
if (skb->len < nhoff + len || if (skb->len < nhoff + len ||
len < (iph->ihl * 4)) len < (iph->ihl * 4))
return -1; return -1;
...@@ -256,7 +256,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, ...@@ -256,7 +256,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct iphdr))) if (!pskb_may_pull(skb, sizeof(struct iphdr)))
return NF_ACCEPT; return NF_ACCEPT;
len = ntohs(ip_hdr(skb)->tot_len); len = skb_ip_totlen(skb);
if (pskb_trim_rcsum(skb, len)) if (pskb_trim_rcsum(skb, len))
return NF_ACCEPT; return NF_ACCEPT;
......
...@@ -3001,6 +3001,8 @@ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) ...@@ -3001,6 +3001,8 @@ void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
dev->tso_max_size = min(GSO_MAX_SIZE, size); dev->tso_max_size = min(GSO_MAX_SIZE, size);
if (size < READ_ONCE(dev->gso_max_size)) if (size < READ_ONCE(dev->gso_max_size))
netif_set_gso_max_size(dev, size); netif_set_gso_max_size(dev, size);
if (size < READ_ONCE(dev->gso_ipv4_max_size))
netif_set_gso_ipv4_max_size(dev, size);
} }
EXPORT_SYMBOL(netif_set_tso_max_size); EXPORT_SYMBOL(netif_set_tso_max_size);
...@@ -10614,6 +10616,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, ...@@ -10614,6 +10616,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS; dev->gso_max_segs = GSO_MAX_SEGS;
dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
dev->tso_max_segs = TSO_MAX_SEGS; dev->tso_max_segs = TSO_MAX_SEGS;
dev->upper_level = 1; dev->upper_level = 1;
......
...@@ -100,6 +100,8 @@ static inline void netif_set_gso_max_size(struct net_device *dev, ...@@ -100,6 +100,8 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
{ {
/* dev->gso_max_size is read locklessly from sk_setup_caps() */ /* dev->gso_max_size is read locklessly from sk_setup_caps() */
WRITE_ONCE(dev->gso_max_size, size); WRITE_ONCE(dev->gso_max_size, size);
if (size <= GSO_LEGACY_MAX_SIZE)
WRITE_ONCE(dev->gso_ipv4_max_size, size);
} }
static inline void netif_set_gso_max_segs(struct net_device *dev, static inline void netif_set_gso_max_segs(struct net_device *dev,
...@@ -114,6 +116,22 @@ static inline void netif_set_gro_max_size(struct net_device *dev, ...@@ -114,6 +116,22 @@ static inline void netif_set_gro_max_size(struct net_device *dev,
{ {
/* This pairs with the READ_ONCE() in skb_gro_receive() */ /* This pairs with the READ_ONCE() in skb_gro_receive() */
WRITE_ONCE(dev->gro_max_size, size); WRITE_ONCE(dev->gro_max_size, size);
if (size <= GRO_LEGACY_MAX_SIZE)
WRITE_ONCE(dev->gro_ipv4_max_size, size);
}
/* Set the maximum GSO packet size the device accepts for IPv4 traffic. */
static inline void netif_set_gso_ipv4_max_size(struct net_device *dev,
unsigned int size)
{
/* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */
WRITE_ONCE(dev->gso_ipv4_max_size, size);
}
/* Set the maximum aggregate size GRO may build for IPv4 traffic on @dev. */
static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
unsigned int size)
{
/* This pairs with the READ_ONCE() in skb_gro_receive() */
WRITE_ONCE(dev->gro_ipv4_max_size, size);
/* NOTE(review): trailing duplicate brace below is two-column diff-render
 * residue from the scraped page, not part of the function. */
} }
#endif #endif
...@@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) ...@@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
struct sk_buff *lp; struct sk_buff *lp;
int segs; int segs;
/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
gro_max_size = READ_ONCE(p->dev->gro_max_size); gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
READ_ONCE(p->dev->gro_max_size) :
READ_ONCE(p->dev->gro_ipv4_max_size);
if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
return -E2BIG; return -E2BIG;
if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
if (p->protocol != htons(ETH_P_IPV6) || if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || (p->protocol == htons(ETH_P_IPV6) &&
ipv6_hdr(p)->nexthdr != IPPROTO_TCP || skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
p->encapsulation) p->encapsulation)
return -E2BIG; return -E2BIG;
} }
......
...@@ -1074,6 +1074,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, ...@@ -1074,6 +1074,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
+ nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+ nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */
+ nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */
+ nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */
+ nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */
+ nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */
+ nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_OPERSTATE */
...@@ -1807,6 +1809,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ...@@ -1807,6 +1809,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) ||
nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) ||
nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) ||
nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) ||
nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) ||
#ifdef CONFIG_RPS #ifdef CONFIG_RPS
...@@ -1968,6 +1972,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { ...@@ -1968,6 +1972,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT },
[IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT },
[IFLA_ALLMULTI] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT },
[IFLA_GSO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
[IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
}; };
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
...@@ -2883,6 +2889,29 @@ static int do_setlink(const struct sk_buff *skb, ...@@ -2883,6 +2889,29 @@ static int do_setlink(const struct sk_buff *skb,
} }
} }
if (tb[IFLA_GSO_IPV4_MAX_SIZE]) {
u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]);
if (max_size > dev->tso_max_size) {
err = -EINVAL;
goto errout;
}
if (dev->gso_ipv4_max_size ^ max_size) {
netif_set_gso_ipv4_max_size(dev, max_size);
status |= DO_SETLINK_MODIFIED;
}
}
if (tb[IFLA_GRO_IPV4_MAX_SIZE]) {
u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]);
if (dev->gro_ipv4_max_size ^ gro_max_size) {
netif_set_gro_ipv4_max_size(dev, gro_max_size);
status |= DO_SETLINK_MODIFIED;
}
}
if (tb[IFLA_OPERSTATE]) if (tb[IFLA_OPERSTATE])
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
...@@ -3325,6 +3354,10 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, ...@@ -3325,6 +3354,10 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
if (tb[IFLA_GRO_MAX_SIZE]) if (tb[IFLA_GRO_MAX_SIZE])
netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE]));
if (tb[IFLA_GSO_IPV4_MAX_SIZE])
netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]));
if (tb[IFLA_GRO_IPV4_MAX_SIZE])
netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]));
return dev; return dev;
} }
......
...@@ -2373,17 +2373,22 @@ void sk_free_unlock_clone(struct sock *sk) ...@@ -2373,17 +2373,22 @@ void sk_free_unlock_clone(struct sock *sk)
} }
EXPORT_SYMBOL_GPL(sk_free_unlock_clone); EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
static void sk_trim_gso_size(struct sock *sk) static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
{ {
if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) bool is_ipv6 = false;
return; u32 max_size;
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6 && is_ipv6 = (sk->sk_family == AF_INET6 &&
sk_is_tcp(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
return;
#endif #endif
sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
READ_ONCE(dst->dev->gso_ipv4_max_size);
if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
max_size = GSO_LEGACY_MAX_SIZE;
return max_size - (MAX_TCP_HEADER + 1);
} }
void sk_setup_caps(struct sock *sk, struct dst_entry *dst) void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
...@@ -2403,10 +2408,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) ...@@ -2403,10 +2408,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps &= ~NETIF_F_GSO_MASK; sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else { } else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
sk_trim_gso_size(sk);
sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
} }
......
...@@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) ...@@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
if (unlikely(ip_fast_csum((u8 *)iph, 5))) if (unlikely(ip_fast_csum((u8 *)iph, 5)))
goto out; goto out;
NAPI_GRO_CB(skb)->proto = proto;
id = ntohl(*(__be32 *)&iph->id); id = ntohl(*(__be32 *)&iph->id);
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16; id >>= 16;
...@@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) ...@@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
int inet_gro_complete(struct sk_buff *skb, int nhoff) int inet_gro_complete(struct sk_buff *skb, int nhoff)
{ {
__be16 newlen = htons(skb->len - nhoff);
struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
const struct net_offload *ops; const struct net_offload *ops;
__be16 totlen = iph->tot_len;
int proto = iph->protocol; int proto = iph->protocol;
int err = -ENOSYS; int err = -ENOSYS;
...@@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) ...@@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
skb_set_inner_network_header(skb, nhoff); skb_set_inner_network_header(skb, nhoff);
} }
csum_replace2(&iph->check, iph->tot_len, newlen); iph_set_totlen(iph, skb->len - nhoff);
iph->tot_len = newlen; csum_replace2(&iph->check, totlen, iph->tot_len);
ops = rcu_dereference(inet_offloads[proto]); ops = rcu_dereference(inet_offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete)) if (WARN_ON(!ops || !ops->callbacks.gro_complete))
......
...@@ -2222,7 +2222,7 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, ...@@ -2222,7 +2222,7 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb,
memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
if (len_delta != 0) { if (len_delta != 0) {
iph->ihl = 5 + (opt_len >> 2); iph->ihl = 5 + (opt_len >> 2);
iph->tot_len = htons(skb->len); iph_set_totlen(iph, skb->len);
} }
ip_send_check(iph); ip_send_check(iph);
......
...@@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) ...@@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto csum_error; goto csum_error;
len = ntohs(iph->tot_len); len = iph_totlen(skb, iph);
if (skb->len < len) { if (skb->len < len) {
drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
......
...@@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) ...@@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{ {
struct iphdr *iph = ip_hdr(skb); struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len); iph_set_totlen(iph, skb->len);
ip_send_check(iph); ip_send_check(iph);
/* if egress device is enslaved to an L3 master device pass the /* if egress device is enslaved to an L3 master device pass the
......
...@@ -994,7 +994,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, ...@@ -994,7 +994,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
old_dsfield = ipv4_get_dsfield(old_iph); old_dsfield = ipv4_get_dsfield(old_iph);
*ttl = old_iph->ttl; *ttl = old_iph->ttl;
if (payload_len) if (payload_len)
*payload_len = ntohs(old_iph->tot_len); *payload_len = skb_ip_totlen(skb);
} }
/* Implement full-functionality option for ECN encapsulation */ /* Implement full-functionality option for ECN encapsulation */
......
...@@ -322,7 +322,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, ...@@ -322,7 +322,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK,
ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
/* Max length: 6 "CE DF MF " */ /* Max length: 6 "CE DF MF " */
......
...@@ -21,7 +21,7 @@ static bool ...@@ -21,7 +21,7 @@ static bool
length_mt(const struct sk_buff *skb, struct xt_action_param *par) length_mt(const struct sk_buff *skb, struct xt_action_param *par)
{ {
const struct xt_length_info *info = par->matchinfo; const struct xt_length_info *info = par->matchinfo;
u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); u32 pktlen = skb_ip_totlen(skb);
return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
} }
......
...@@ -1103,7 +1103,7 @@ static int ovs_skb_network_trim(struct sk_buff *skb) ...@@ -1103,7 +1103,7 @@ static int ovs_skb_network_trim(struct sk_buff *skb)
switch (skb->protocol) { switch (skb->protocol) {
case htons(ETH_P_IP): case htons(ETH_P_IP):
len = ntohs(ip_hdr(skb)->tot_len); len = skb_ip_totlen(skb);
break; break;
case htons(ETH_P_IPV6): case htons(ETH_P_IPV6):
len = sizeof(struct ipv6hdr) len = sizeof(struct ipv6hdr)
......
...@@ -2296,6 +2296,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, ...@@ -2296,6 +2296,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
else if (skb->pkt_type != PACKET_OUTGOING && else if (skb->pkt_type != PACKET_OUTGOING &&
skb_csum_unnecessary(skb)) skb_csum_unnecessary(skb))
status |= TP_STATUS_CSUM_VALID; status |= TP_STATUS_CSUM_VALID;
if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
status |= TP_STATUS_GSO_TCP;
if (snaplen > res) if (snaplen > res)
snaplen = res; snaplen = res;
...@@ -3522,6 +3524,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ...@@ -3522,6 +3524,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
else if (skb->pkt_type != PACKET_OUTGOING && else if (skb->pkt_type != PACKET_OUTGOING &&
skb_csum_unnecessary(skb)) skb_csum_unnecessary(skb))
aux.tp_status |= TP_STATUS_CSUM_VALID; aux.tp_status |= TP_STATUS_CSUM_VALID;
if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
aux.tp_status |= TP_STATUS_GSO_TCP;
aux.tp_len = origlen; aux.tp_len = origlen;
aux.tp_snaplen = skb->len; aux.tp_snaplen = skb->len;
......
...@@ -707,7 +707,7 @@ static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) ...@@ -707,7 +707,7 @@ static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
switch (family) { switch (family) {
case NFPROTO_IPV4: case NFPROTO_IPV4:
len = ntohs(ip_hdr(skb)->tot_len); len = skb_ip_totlen(skb);
break; break;
case NFPROTO_IPV6: case NFPROTO_IPV6:
len = sizeof(struct ipv6hdr) len = sizeof(struct ipv6hdr)
......
...@@ -1209,7 +1209,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, ...@@ -1209,7 +1209,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
iph_check->daddr != iph->daddr) iph_check->daddr != iph->daddr)
continue; continue;
seglen = ntohs(iph_check->tot_len) - seglen = iph_totlen(skb, iph_check) -
(4 * iph_check->ihl); (4 * iph_check->ihl);
} else if (iph_check->version == 6) { } else if (iph_check->version == 6) {
ipv6h = (struct ipv6hdr *)iph; ipv6h = (struct ipv6hdr *)iph;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment