Commit dc83d4d8 authored by David S. Miller

Merge branch 'tcp_skb_cb'

Eric Dumazet says:

====================
tcp: better TCP_SKB_CB layout

TCP assumed that IPCB and IP6CB are the first members of skb->cb[].

This is fine, except that IPCB/IP6CB are used in TCP for a very short time
in input path.

What really matters for TCP stack is to get skb->next,
TCP_SKB_CB(skb)->seq, and TCP_SKB_CB(skb)->end_seq in the same cache line.

skbs that are immediately consumed do not care, because the whole skb->cb[]
is hot in the CPU cache, while skbs that sit in socket write queues or
receive queues do not need TCP_SKB_CB(skb)->header at all.

This patch set implements the prereq for IPv4, IPv6, and TCP to make this
possible. This makes TCP more efficient.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents ff04a771 971f10ec
...@@ -180,8 +180,10 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) ...@@ -180,8 +180,10 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
} }
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
__be32 saddr, const struct ip_reply_arg *arg, const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
unsigned int len); unsigned int len);
#define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field)
...@@ -511,7 +513,14 @@ int ip_forward(struct sk_buff *skb); ...@@ -511,7 +513,14 @@ int ip_forward(struct sk_buff *skb);
void ip_options_build(struct sk_buff *skb, struct ip_options *opt, void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
__be32 daddr, struct rtable *rt, int is_frag); __be32 daddr, struct rtable *rt, int is_frag);
int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb);
int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
const struct ip_options *sopt);
static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
{
return __ip_options_echo(dopt, skb, &IPCB(skb)->opt);
}
void ip_options_fragment(struct sk_buff *skb); void ip_options_fragment(struct sk_buff *skb);
int ip_options_compile(struct net *net, struct ip_options *opt, int ip_options_compile(struct net *net, struct ip_options *opt,
struct sk_buff *skb); struct sk_buff *skb);
......
...@@ -288,7 +288,8 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, ...@@ -288,7 +288,8 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
struct ipv6_txoptions *opt); struct ipv6_txoptions *opt);
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb); bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
const struct inet6_skb_parm *opt);
static inline bool ipv6_accept_ra(struct inet6_dev *idev) static inline bool ipv6_accept_ra(struct inet6_dev *idev)
{ {
......
...@@ -696,12 +696,6 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) ...@@ -696,12 +696,6 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
* If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
*/ */
struct tcp_skb_cb { struct tcp_skb_cb {
union {
struct inet_skb_parm h4;
#if IS_ENABLED(CONFIG_IPV6)
struct inet6_skb_parm h6;
#endif
} header; /* For incoming frames */
__u32 seq; /* Starting sequence number */ __u32 seq; /* Starting sequence number */
__u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */
__u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */
...@@ -720,6 +714,12 @@ struct tcp_skb_cb { ...@@ -720,6 +714,12 @@ struct tcp_skb_cb {
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
/* 1 byte hole */ /* 1 byte hole */
__u32 ack_seq; /* Sequence number ACK'd */ __u32 ack_seq; /* Sequence number ACK'd */
union {
struct inet_skb_parm h4;
#if IS_ENABLED(CONFIG_IPV6)
struct inet6_skb_parm h6;
#endif
} header; /* For incoming frames */
}; };
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
......
...@@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ...@@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
if (ipv6_opt_accepted(sk, skb) || if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) ||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
atomic_inc(&skb->users); atomic_inc(&skb->users);
......
...@@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, ...@@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
* NOTE: dopt cannot point to skb. * NOTE: dopt cannot point to skb.
*/ */
int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
const struct ip_options *sopt)
{ {
const struct ip_options *sopt;
unsigned char *sptr, *dptr; unsigned char *sptr, *dptr;
int soffset, doffset; int soffset, doffset;
int optlen; int optlen;
memset(dopt, 0, sizeof(struct ip_options)); memset(dopt, 0, sizeof(struct ip_options));
sopt = &(IPCB(skb)->opt);
if (sopt->optlen == 0) if (sopt->optlen == 0)
return 0; return 0;
......
...@@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { ...@@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
.uc_ttl = -1, .uc_ttl = -1,
}; };
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
__be32 saddr, const struct ip_reply_arg *arg, const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
unsigned int len) unsigned int len)
{ {
struct ip_options_data replyopts; struct ip_options_data replyopts;
...@@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, ...@@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
struct sock *sk; struct sock *sk;
struct inet_sock *inet; struct inet_sock *inet;
if (ip_options_echo(&replyopts.opt.opt, skb)) if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
return; return;
ipc.addr = daddr; ipc.addr = daddr;
......
...@@ -681,8 +681,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) ...@@ -681,8 +681,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
net = dev_net(skb_dst(skb)->dev); net = dev_net(skb_dst(skb)->dev);
arg.tos = ip_hdr(skb)->tos; arg.tos = ip_hdr(skb)->tos;
ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
...@@ -764,8 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ...@@ -764,8 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
if (oif) if (oif)
arg.bound_dev_if = oif; arg.bound_dev_if = oif;
arg.tos = tos; arg.tos = tos;
ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
} }
...@@ -884,20 +886,18 @@ EXPORT_SYMBOL(tcp_syn_flood_action); ...@@ -884,20 +886,18 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
*/ */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{ {
const struct ip_options *opt = &(IPCB(skb)->opt); const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
struct ip_options_rcu *dopt = NULL; struct ip_options_rcu *dopt = NULL;
if (opt && opt->optlen) { if (opt && opt->optlen) {
int opt_size = sizeof(*dopt) + opt->optlen; int opt_size = sizeof(*dopt) + opt->optlen;
dopt = kmalloc(opt_size, GFP_ATOMIC); dopt = kmalloc(opt_size, GFP_ATOMIC);
if (dopt) { if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
if (ip_options_echo(&dopt->opt, skb)) {
kfree(dopt); kfree(dopt);
dopt = NULL; dopt = NULL;
} }
} }
}
return dopt; return dopt;
} }
...@@ -1429,7 +1429,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) ...@@ -1429,7 +1429,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
#ifdef CONFIG_SYN_COOKIES #ifdef CONFIG_SYN_COOKIES
if (!th->syn) if (!th->syn)
sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt);
#endif #endif
return sk; return sk;
} }
...@@ -1634,6 +1634,13 @@ int tcp_v4_rcv(struct sk_buff *skb) ...@@ -1634,6 +1634,13 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = tcp_hdr(skb); th = tcp_hdr(skb);
iph = ip_hdr(skb); iph = ip_hdr(skb);
/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
* barrier() makes sure the compiler won't play fool^Waliasing games.
*/
memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
sizeof(struct inet_skb_parm));
barrier();
TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4); skb->len - th->doff * 4);
......
...@@ -974,6 +974,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, ...@@ -974,6 +974,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
/* Our usage of tstamp should remain private */ /* Our usage of tstamp should remain private */
skb->tstamp.tv64 = 0; skb->tstamp.tv64 = 0;
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
sizeof(struct inet6_skb_parm)));
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
if (likely(err <= 0)) if (likely(err <= 0))
......
...@@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk) ...@@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk)
} }
EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
const struct inet6_skb_parm *opt)
{ {
const struct ipv6_pinfo *np = inet6_sk(sk); const struct ipv6_pinfo *np = inet6_sk(sk);
const struct inet6_skb_parm *opt = IP6CB(skb);
if (np->rxopt.all) { if (np->rxopt.all) {
if ((opt->hop && (np->rxopt.bits.hopopts || if ((opt->hop && (np->rxopt.bits.hopopts ||
......
...@@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ...@@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
ireq->ir_num = ntohs(th->dest); ireq->ir_num = ntohs(th->dest);
ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
if (ipv6_opt_accepted(sk, skb) || if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
atomic_inc(&skb->users); atomic_inc(&skb->users);
......
...@@ -742,7 +742,8 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ...@@ -742,7 +742,8 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk,
ireq->ir_iif = inet6_iif(skb); ireq->ir_iif = inet6_iif(skb);
if (!TCP_SKB_CB(skb)->tcp_tw_isn && if (!TCP_SKB_CB(skb)->tcp_tw_isn &&
(ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
np->rxopt.bits.rxinfo ||
np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
np->rxopt.bits.rxohlim || np->repflow)) { np->rxopt.bits.rxohlim || np->repflow)) {
atomic_inc(&skb->users); atomic_inc(&skb->users);
...@@ -1367,7 +1368,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -1367,7 +1368,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
if (np->repflow) if (np->repflow)
np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
if (ipv6_opt_accepted(sk, opt_skb)) { if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
skb_set_owner_r(opt_skb, sk); skb_set_owner_r(opt_skb, sk);
opt_skb = xchg(&np->pktoptions, opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb);
} else { } else {
...@@ -1411,6 +1412,13 @@ static int tcp_v6_rcv(struct sk_buff *skb) ...@@ -1411,6 +1412,13 @@ static int tcp_v6_rcv(struct sk_buff *skb)
th = tcp_hdr(skb); th = tcp_hdr(skb);
hdr = ipv6_hdr(skb); hdr = ipv6_hdr(skb);
/* This is tricky: we move IP6CB to its correct location inside TCP_SKB_CB().
* barrier() makes sure the compiler won't play fool^Waliasing games.
*/
memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
sizeof(struct inet6_skb_parm));
barrier();
TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff*4); skb->len - th->doff*4);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment