Commit 9f2dbdd9 authored by David S. Miller

Merge branch 'listener_refactor_part_11'

Eric Dumazet says:

====================
inet: tcp listener refactoring, part 11

Before inserting request sockets into general (ehash) table,
we need to prepare netfilter to cope with them, as they are
not full sockets.
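
A "full" socket here is anything that is neither a timewait nor a
request socket: the minimal flavors carry only a small subset of
struct sock, so fields such as sk_socket or sk_callback_lock must not
be dereferenced on them. The netfilter conversions below test this
with the sk_fullsock() helper, essentially a state-mask check; a
minimal sketch, assuming the TCPF_* state masks from
include/net/tcp_states.h:

    static inline bool sk_fullsock(const struct sock *sk)
    {
            /* false only for TCP_TIME_WAIT and TCP_NEW_SYN_RECV sockets */
            return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
    }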

I'll later change xt_socket to get full support, including for
request sockets (NEW_SYN_RECV).

Save 8 bytes in inet_request_sock on 64bit arches. We'll soon add
a pointer to the listener socket.
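
The 8 bytes are alignment padding: hoisting ir_mark from the struct
tail into the 4-byte hole after the u16 flags bitfield lets the
struct end right after the pointer union. A rough pahole-style sketch
of the struct tail on a 64-bit arch (offsets relative to the flags
word, illustrative only):

    /* before:                            after:
     *   u16   flags;     +0,  2 bytes      u16   flags;    +0, 2 bytes
     *   (6-byte hole)                      u32   ir_mark;  +4, 4 bytes
     *   union { ... };   +8,  8 bytes      union { ... };  +8, 8 bytes
     *   u32   ir_mark;   +16, 4 bytes      (ends at +16)
     *   (4 bytes tail padding, ends at +24)
     */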

I included two TCP changes in this patch series.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents c2497395 7970ddc8
@@ -94,11 +94,11 @@ struct inet_request_sock {
                 acked      : 1,
                 no_srccheck: 1;
         kmemcheck_bitfield_end(flags);
+        u32                     ir_mark;
         union {
                 struct ip_options_rcu   *opt;
                 struct sk_buff          *pktopts;
         };
-        u32                     ir_mark;
 };
 
 static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
@@ -106,13 +106,12 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
         return (struct inet_request_sock *)sk;
 }
 
-static inline u32 inet_request_mark(struct sock *sk, struct sk_buff *skb)
+static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
 {
-        if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) {
+        if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)
                 return skb->mark;
-        } else {
-                return sk->sk_mark;
-        }
+
+        return sk->sk_mark;
 }
 
 struct inet_cork {
...
@@ -1137,31 +1137,6 @@ static inline int tcp_full_space(const struct sock *sk)
         return tcp_win_from_space(sk->sk_rcvbuf);
 }
 
-static inline void tcp_openreq_init(struct request_sock *req,
-                                    struct tcp_options_received *rx_opt,
-                                    struct sk_buff *skb, struct sock *sk)
-{
-        struct inet_request_sock *ireq = inet_rsk(req);
-
-        req->rcv_wnd = 0;               /* So that tcp_send_synack() knows! */
-        req->cookie_ts = 0;
-        tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
-        tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
-        tcp_rsk(req)->snt_synack = tcp_time_stamp;
-        tcp_rsk(req)->last_oow_ack_time = 0;
-        req->mss = rx_opt->mss_clamp;
-        req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
-        ireq->tstamp_ok = rx_opt->tstamp_ok;
-        ireq->sack_ok = rx_opt->sack_ok;
-        ireq->snd_wscale = rx_opt->snd_wscale;
-        ireq->wscale_ok = rx_opt->wscale_ok;
-        ireq->acked = 0;
-        ireq->ecn_ok = 0;
-        ireq->ir_rmt_port = tcp_hdr(skb)->source;
-        ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
-        ireq->ir_mark = inet_request_mark(sk, skb);
-}
-
 extern void tcp_openreq_init_rwin(struct request_sock *req,
                                   struct sock *sk, struct dst_entry *dst);
 
@@ -1241,36 +1216,8 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
         return true;
 }
 
-/* Return true if we're currently rate-limiting out-of-window ACKs and
- * thus shouldn't send a dupack right now. We rate-limit dupacks in
- * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
- * attacks that send repeated SYNs or ACKs for the same connection. To
- * do this, we do not send a duplicate SYNACK or ACK if the remote
- * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
- */
-static inline bool tcp_oow_rate_limited(struct net *net,
-                                        const struct sk_buff *skb,
-                                        int mib_idx, u32 *last_oow_ack_time)
-{
-        /* Data packets without SYNs are not likely part of an ACK loop. */
-        if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
-            !tcp_hdr(skb)->syn)
-                goto not_rate_limited;
-
-        if (*last_oow_ack_time) {
-                s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
-
-                if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
-                        NET_INC_STATS_BH(net, mib_idx);
-                        return true;    /* rate-limited: don't send yet! */
-                }
-        }
-
-        *last_oow_ack_time = tcp_time_stamp;
-
-not_rate_limited:
-        return false;   /* not rate-limited: go ahead, send dupack now! */
-}
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+                          int mib_idx, u32 *last_oow_ack_time);
 
 static inline void tcp_mib_init(struct net *net)
 {
...
@@ -3321,6 +3321,36 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
         return flag;
 }
 
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+                          int mib_idx, u32 *last_oow_ack_time)
+{
+        /* Data packets without SYNs are not likely part of an ACK loop. */
+        if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+            !tcp_hdr(skb)->syn)
+                goto not_rate_limited;
+
+        if (*last_oow_ack_time) {
+                s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+                if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+                        NET_INC_STATS_BH(net, mib_idx);
+                        return true;    /* rate-limited: don't send yet! */
+                }
+        }
+
+        *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+        return false;   /* not rate-limited: go ahead, send dupack now! */
+}
+
 /* RFC 5961 7 [ACK Throttling] */
 static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 {
@@ -5912,6 +5942,31 @@ static void tcp_ecn_create_request(struct request_sock *req,
         inet_rsk(req)->ecn_ok = 1;
 }
 
+static void tcp_openreq_init(struct request_sock *req,
+                             const struct tcp_options_received *rx_opt,
+                             struct sk_buff *skb, const struct sock *sk)
+{
+        struct inet_request_sock *ireq = inet_rsk(req);
+
+        req->rcv_wnd = 0;               /* So that tcp_send_synack() knows! */
+        req->cookie_ts = 0;
+        tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+        tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+        tcp_rsk(req)->snt_synack = tcp_time_stamp;
+        tcp_rsk(req)->last_oow_ack_time = 0;
+        req->mss = rx_opt->mss_clamp;
+        req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+        ireq->tstamp_ok = rx_opt->tstamp_ok;
+        ireq->sack_ok = rx_opt->sack_ok;
+        ireq->snd_wscale = rx_opt->snd_wscale;
+        ireq->wscale_ok = rx_opt->wscale_ok;
+        ireq->acked = 0;
+        ireq->ecn_ok = 0;
+        ireq->ir_rmt_port = tcp_hdr(skb)->source;
+        ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+        ireq->ir_mark = inet_request_mark(sk, skb);
+}
+
 int tcp_conn_request(struct request_sock_ops *rsk_ops,
                      const struct tcp_request_sock_ops *af_ops,
                      struct sock *sk, struct sk_buff *skb)
...
@@ -209,7 +209,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
         struct sock *sk = skb->sk;
         struct rtable *ort = skb_rtable(skb);
 
-        if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+        if (!skb->dev && sk && sk_fullsock(sk))
                 ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
 }
...
@@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
 
 void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk)
 {
-        if (!sk || sk->sk_state == TCP_TIME_WAIT)
+        if (!sk || !sk_fullsock(sk))
                 return;
 
         read_lock_bh(&sk->sk_callback_lock);
...
@@ -539,7 +539,7 @@ __build_packet_message(struct nfnl_log_net *log,
 
         /* UID */
         sk = skb->sk;
-        if (sk && sk->sk_state != TCP_TIME_WAIT) {
+        if (sk && sk_fullsock(sk)) {
                 read_lock_bh(&sk->sk_callback_lock);
                 if (sk->sk_socket && sk->sk_socket->file) {
                         struct file *file = sk->sk_socket->file;
...
@@ -257,7 +257,7 @@ static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
 {
         const struct cred *cred;
 
-        if (sk->sk_state == TCP_TIME_WAIT)
+        if (!sk_fullsock(sk))
                 return 0;
 
         read_lock_bh(&sk->sk_callback_lock);
...
@@ -83,7 +83,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
                 *(u16 *)dest->data = out->type;
                 break;
         case NFT_META_SKUID:
-                if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+                if (skb->sk == NULL || !sk_fullsock(skb->sk))
                         goto err;
 
                 read_lock_bh(&skb->sk->sk_callback_lock);
@@ -99,7 +99,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
                 read_unlock_bh(&skb->sk->sk_callback_lock);
                 break;
         case NFT_META_SKGID:
-                if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+                if (skb->sk == NULL || !sk_fullsock(skb->sk))
                         goto err;
 
                 read_lock_bh(&skb->sk->sk_callback_lock);
...
@@ -42,15 +42,21 @@ enum nf_tproxy_lookup_t {
 
 static bool tproxy_sk_is_transparent(struct sock *sk)
 {
-        if (sk->sk_state != TCP_TIME_WAIT) {
-                if (inet_sk(sk)->transparent)
-                        return true;
-                sock_put(sk);
-        } else {
+        switch (sk->sk_state) {
+        case TCP_TIME_WAIT:
                 if (inet_twsk(sk)->tw_transparent)
                         return true;
-                inet_twsk_put(inet_twsk(sk));
+                break;
+        case TCP_NEW_SYN_RECV:
+                if (inet_rsk(inet_reqsk(sk))->no_srccheck)
+                        return true;
+                break;
+        default:
+                if (inet_sk(sk)->transparent)
+                        return true;
         }
+
+        sock_gen_put(sk);
         return false;
 }
...
@@ -129,6 +129,20 @@ xt_socket_get_sock_v4(struct net *net, const u8 protocol,
         return NULL;
 }
 
+static bool xt_socket_sk_is_transparent(struct sock *sk)
+{
+        switch (sk->sk_state) {
+        case TCP_TIME_WAIT:
+                return inet_twsk(sk)->tw_transparent;
+
+        case TCP_NEW_SYN_RECV:
+                return inet_rsk(inet_reqsk(sk))->no_srccheck;
+
+        default:
+                return inet_sk(sk)->transparent;
+        }
+}
+
 static bool
 socket_match(const struct sk_buff *skb, struct xt_action_param *par,
              const struct xt_socket_mtinfo1 *info)
@@ -195,16 +209,14 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
                  * unless XT_SOCKET_NOWILDCARD is set
                  */
                 wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
-                            sk->sk_state != TCP_TIME_WAIT &&
+                            sk_fullsock(sk) &&
                             inet_sk(sk)->inet_rcv_saddr == 0);
 
                 /* Ignore non-transparent sockets,
-                   if XT_SOCKET_TRANSPARENT is used */
+                 * if XT_SOCKET_TRANSPARENT is used
+                 */
                 if (info->flags & XT_SOCKET_TRANSPARENT)
-                        transparent = ((sk->sk_state != TCP_TIME_WAIT &&
-                                        inet_sk(sk)->transparent) ||
-                                       (sk->sk_state == TCP_TIME_WAIT &&
-                                        inet_twsk(sk)->tw_transparent));
+                        transparent = xt_socket_sk_is_transparent(sk);
 
                 if (sk != skb->sk)
                         sock_gen_put(sk);
@@ -363,16 +375,14 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
                  * unless XT_SOCKET_NOWILDCARD is set
                  */
                 wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
-                            sk->sk_state != TCP_TIME_WAIT &&
+                            sk_fullsock(sk) &&
                             ipv6_addr_any(&sk->sk_v6_rcv_saddr));
 
                 /* Ignore non-transparent sockets,
-                   if XT_SOCKET_TRANSPARENT is used */
+                 * if XT_SOCKET_TRANSPARENT is used
+                 */
                 if (info->flags & XT_SOCKET_TRANSPARENT)
-                        transparent = ((sk->sk_state != TCP_TIME_WAIT &&
-                                        inet_sk(sk)->transparent) ||
-                                       (sk->sk_state == TCP_TIME_WAIT &&
-                                        inet_twsk(sk)->tw_transparent));
+                        transparent = xt_socket_sk_is_transparent(sk);
 
                 if (sk != skb->sk)
                         sock_gen_put(sk);
...