Commit 86c1aee1 authored by David S. Miller

Merge branch 'listener_refactor_part_15'

Eric Dumazet says:

====================
tcp listener refactoring part 15

I am trying to make the final patch pushing request socks into ehash
as small as possible. In this patch series, I made various adjustments
for the SYNACK generation, allowing me to reach 1 Mpps SYNACK in my
stress test (still hitting LISTENER spinlock of course, and the syn_wait
spinlock).

I also converted the ICMP handlers a bit ahead of time:

They no longer need to get the LISTENER socket, and can use
only a lookup in the ehash table. No big deal if we ignore ICMP
for request socks before the final steps.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents c9231f82 52036a43
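
Before the diff itself, here is a condensed sketch of the ICMP error path the cover letter describes: the handler looks the socket up only in the established hash, and once request sockets sit in ehash in TCP_NEW_SYN_RECV state (the final step of the series), errors for them are handed to tcp_req_err() without ever taking the listener socket or its lock; until then, ICMP for request socks is simply not matched. The helper name icmp_err_sketch() and the trimmed statistics handling are illustrative only; the real code is in the tcp_v4_err()/tcp_req_err() hunks below.

```c
/* Hypothetical, condensed sketch (not the kernel function itself):
 * dispatch an ICMP error using only an established-hash lookup,
 * mirroring the tcp_v4_err()/tcp_req_err() changes in this merge.
 */
static void icmp_err_sketch(struct net *net, const struct iphdr *iph,
			    const struct tcphdr *th, struct sk_buff *icmp_skb)
{
	struct sock *sk;

	/* No listener lookup any more: only the established hash is consulted.
	 * Request sockets will be found here (state TCP_NEW_SYN_RECV) once the
	 * final patch of the series moves them into ehash.
	 */
	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk)
		return;		/* the real code bumps ICMP_MIB_INERRORS here */

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* tcp_req_err() validates the sequence number, drops the
		 * request from its listener's queue (or counts an
		 * out-of-window ICMP) and releases the lookup reference,
		 * all without locking the listener socket.
		 */
		tcp_req_err(sk, ntohl(th->seq));
		return;
	}

	/* established/timewait handling continues as in tcp_v4_err() */
}
```
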
@@ -43,6 +43,7 @@ enum dccp_state {
     DCCP_CLOSING = TCP_CLOSING,
     DCCP_TIME_WAIT = TCP_TIME_WAIT,
     DCCP_CLOSED = TCP_CLOSE,
+    DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV,
     DCCP_PARTOPEN = TCP_MAX_STATES,
     DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */
     DCCP_MAX_STATES
@@ -57,6 +58,7 @@ enum {
     DCCPF_CLOSING = TCPF_CLOSING,
     DCCPF_TIME_WAIT = TCPF_TIME_WAIT,
     DCCPF_CLOSED = TCPF_CLOSE,
+    DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV,
     DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN),
 };
@@ -317,6 +319,6 @@ static inline const char *dccp_role(const struct sock *sk)
     return NULL;
 }
-extern void dccp_syn_ack_timeout(struct sock *sk, struct request_sock *req);
+extern void dccp_syn_ack_timeout(const struct request_sock *req);
 #endif /* _LINUX_DCCP_H */
@@ -39,8 +39,7 @@ struct request_sock_ops {
     void (*send_reset)(struct sock *sk,
                        struct sk_buff *skb);
     void (*destructor)(struct request_sock *req);
-    void (*syn_ack_timeout)(struct sock *sk,
-                            struct request_sock *req);
+    void (*syn_ack_timeout)(const struct request_sock *req);
 };
 int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req);
@@ -174,11 +173,6 @@ struct fastopen_queue {
  * %syn_wait_lock is necessary only to avoid proc interface having to grab the main
  * lock sock while browsing the listening hash (otherwise it's deadlock prone).
  *
- * This lock is acquired in read mode only from listening_get_next() seq_file
- * op and it's acquired in write mode _only_ from code that is actively
- * changing rskq_accept_head. All readers that are holding the master sock lock
- * don't need to grab this lock in read mode too as rskq_accept_head. writes
- * are always protected from the main sock lock.
  */
 struct request_sock_queue {
     struct request_sock *rskq_accept_head;
@@ -193,7 +187,7 @@ struct request_sock_queue {
      */
     /* temporary alignment, our goal is to get rid of this lock */
-    rwlock_t syn_wait_lock ____cacheline_aligned_in_smp;
+    spinlock_t syn_wait_lock ____cacheline_aligned_in_smp;
 };
 int reqsk_queue_alloc(struct request_sock_queue *queue,
@@ -224,14 +218,14 @@ static inline void reqsk_queue_unlink(struct request_sock_queue *queue,
     struct listen_sock *lopt = queue->listen_opt;
     struct request_sock **prev;
-    write_lock(&queue->syn_wait_lock);
+    spin_lock(&queue->syn_wait_lock);
     prev = &lopt->syn_table[req->rsk_hash];
     while (*prev != req)
         prev = &(*prev)->dl_next;
     *prev = req->dl_next;
-    write_unlock(&queue->syn_wait_lock);
+    spin_unlock(&queue->syn_wait_lock);
     if (del_timer(&req->rsk_timer))
         reqsk_put(req);
 }
...
@@ -433,7 +433,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
                           char __user *optval, unsigned int optlen);
 void tcp_set_keepalive(struct sock *sk, int val);
-void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req);
+void tcp_syn_ack_timeout(const struct request_sock *req);
 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                 int flags, int *addr_len);
 void tcp_parse_options(const struct sk_buff *skb,
@@ -447,6 +447,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
 void tcp_v4_mtu_reduced(struct sock *sk);
+void tcp_req_err(struct sock *sk, u32 seq);
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(struct sock *sk,
                                       struct request_sock *req,
...
@@ -58,14 +58,14 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
         return -ENOMEM;
     get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
-    rwlock_init(&queue->syn_wait_lock);
+    spin_lock_init(&queue->syn_wait_lock);
     queue->rskq_accept_head = NULL;
     lopt->nr_table_entries = nr_table_entries;
     lopt->max_qlen_log = ilog2(nr_table_entries);
-    write_lock_bh(&queue->syn_wait_lock);
+    spin_lock_bh(&queue->syn_wait_lock);
     queue->listen_opt = lopt;
-    write_unlock_bh(&queue->syn_wait_lock);
+    spin_unlock_bh(&queue->syn_wait_lock);
     return 0;
 }
@@ -81,10 +81,10 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk(
 {
     struct listen_sock *lopt;
-    write_lock_bh(&queue->syn_wait_lock);
+    spin_lock_bh(&queue->syn_wait_lock);
     lopt = queue->listen_opt;
     queue->listen_opt = NULL;
-    write_unlock_bh(&queue->syn_wait_lock);
+    spin_unlock_bh(&queue->syn_wait_lock);
     return lopt;
 }
@@ -100,7 +100,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
     for (i = 0; i < lopt->nr_table_entries; i++) {
         struct request_sock *req;
-        write_lock_bh(&queue->syn_wait_lock);
+        spin_lock_bh(&queue->syn_wait_lock);
         while ((req = lopt->syn_table[i]) != NULL) {
             lopt->syn_table[i] = req->dl_next;
             atomic_inc(&lopt->qlen_dec);
@@ -108,7 +108,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
             reqsk_put(req);
             reqsk_put(req);
         }
-        write_unlock_bh(&queue->syn_wait_lock);
+        spin_unlock_bh(&queue->syn_wait_lock);
     }
 }
...
@@ -317,6 +317,7 @@ int inet_dccp_listen(struct socket *sock, int backlog);
 unsigned int dccp_poll(struct file *file, struct socket *sock,
                        poll_table *wait);
 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+void dccp_req_err(struct sock *sk, u64 seq);
 struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb);
 int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
...
@@ -195,6 +195,32 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
     dst->ops->redirect(dst, sk, skb);
 }
+void dccp_req_err(struct sock *sk, u64 seq)
+{
+    struct request_sock *req = inet_reqsk(sk);
+    struct net *net = sock_net(sk);
+    /*
+     * ICMPs are not backlogged, hence we cannot get an established
+     * socket here.
+     */
+    WARN_ON(req->sk);
+    if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
+        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+    } else {
+        /*
+         * Still in RESPOND, just remove it silently.
+         * There is no good way to pass the error to the newly
+         * created socket, and POSIX does not want network
+         * errors returned from accept().
+         */
+        inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+    }
+    reqsk_put(req);
+}
+EXPORT_SYMBOL(dccp_req_err);
 /*
  * This routine is called by the ICMP module when it gets some sort of error
  * condition. If err < 0 then the socket should be closed and the error
@@ -227,10 +253,11 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
         return;
     }
-    sk = inet_lookup(net, &dccp_hashinfo,
-                     iph->daddr, dh->dccph_dport,
-                     iph->saddr, dh->dccph_sport, inet_iif(skb));
-    if (sk == NULL) {
+    sk = __inet_lookup_established(net, &dccp_hashinfo,
+                                   iph->daddr, dh->dccph_dport,
+                                   iph->saddr, ntohs(dh->dccph_sport),
+                                   inet_iif(skb));
+    if (!sk) {
         ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
         return;
     }
@@ -239,6 +266,9 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
         inet_twsk_put(inet_twsk(sk));
         return;
     }
+    seq = dccp_hdr_seq(dh);
+    if (sk->sk_state == DCCP_NEW_SYN_RECV)
+        return dccp_req_err(sk, seq);
     bh_lock_sock(sk);
     /* If too many ICMPs get dropped on busy
@@ -251,7 +281,6 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
         goto out;
     dp = dccp_sk(sk);
-    seq = dccp_hdr_seq(dh);
     if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
         !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
@@ -288,37 +317,6 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
     }
     switch (sk->sk_state) {
-        struct request_sock *req;
-    case DCCP_LISTEN:
-        if (sock_owned_by_user(sk))
-            goto out;
-        req = inet_csk_search_req(sk, dh->dccph_dport,
-                                  iph->daddr, iph->saddr);
-        if (!req)
-            goto out;
-        /*
-         * ICMPs are not backlogged, hence we cannot get an established
-         * socket here.
-         */
-        WARN_ON(req->sk);
-        if (!between48(seq, dccp_rsk(req)->dreq_iss,
-                       dccp_rsk(req)->dreq_gss)) {
-            NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
-            reqsk_put(req);
-            goto out;
-        }
-        /*
-         * Still in RESPOND, just remove it silently.
-         * There is no good way to pass the error to the newly
-         * created socket, and POSIX does not want network
-         * errors returned from accept().
-         */
-        inet_csk_reqsk_queue_drop(sk, req);
-        reqsk_put(req);
-        goto out;
     case DCCP_REQUESTING:
     case DCCP_RESPOND:
         if (!sock_owned_by_user(sk)) {
@@ -576,7 +574,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req)
     kfree(inet_rsk(req)->opt);
 }
-void dccp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
+void dccp_syn_ack_timeout(const struct request_sock *req)
 {
 }
 EXPORT_SYMBOL(dccp_syn_ack_timeout);
...
@@ -85,11 +85,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
         return;
     }
-    sk = inet6_lookup(net, &dccp_hashinfo,
-                      &hdr->daddr, dh->dccph_dport,
-                      &hdr->saddr, dh->dccph_sport, inet6_iif(skb));
-    if (sk == NULL) {
+    sk = __inet6_lookup_established(net, &dccp_hashinfo,
+                                    &hdr->daddr, dh->dccph_dport,
+                                    &hdr->saddr, ntohs(dh->dccph_sport),
+                                    inet6_iif(skb));
+    if (!sk) {
         ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
                            ICMP6_MIB_INERRORS);
         return;
@@ -99,6 +100,9 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
         inet_twsk_put(inet_twsk(sk));
         return;
     }
+    seq = dccp_hdr_seq(dh);
+    if (sk->sk_state == DCCP_NEW_SYN_RECV)
+        return dccp_req_err(sk, seq);
     bh_lock_sock(sk);
     if (sock_owned_by_user(sk))
@@ -108,7 +112,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
         goto out;
     dp = dccp_sk(sk);
-    seq = dccp_hdr_seq(dh);
     if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
         !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
@@ -149,34 +152,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
     /* Might be for an request_sock */
     switch (sk->sk_state) {
-        struct request_sock *req;
-    case DCCP_LISTEN:
-        if (sock_owned_by_user(sk))
-            goto out;
-        req = inet6_csk_search_req(sk, dh->dccph_dport,
-                                   &hdr->daddr, &hdr->saddr,
-                                   inet6_iif(skb));
-        if (!req)
-            goto out;
-        /*
-         * ICMPs are not backlogged, hence we cannot get an established
-         * socket here.
-         */
-        WARN_ON(req->sk != NULL);
-        if (!between48(seq, dccp_rsk(req)->dreq_iss,
-                       dccp_rsk(req)->dreq_gss)) {
-            NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
-            reqsk_put(req);
-            goto out;
-        }
-        inet_csk_reqsk_queue_drop(sk, req);
-        reqsk_put(req);
-        goto out;
     case DCCP_REQUESTING:
     case DCCP_RESPOND: /* Cannot happen.
                           It can, it SYNs are crossed. --ANK */
...
@@ -403,18 +403,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
                                      struct flowi4 *fl4,
                                      const struct request_sock *req)
 {
-    struct rtable *rt;
     const struct inet_request_sock *ireq = inet_rsk(req);
-    struct ip_options_rcu *opt = inet_rsk(req)->opt;
-    struct net *net = sock_net(sk);
-    int flags = inet_sk_flowi_flags(sk);
+    struct net *net = read_pnet(&ireq->ireq_net);
+    struct ip_options_rcu *opt = ireq->opt;
+    struct rtable *rt;
-    flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+    flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
                        RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
-                       sk->sk_protocol,
-                       flags,
+                       sk->sk_protocol, inet_sk_flowi_flags(sk),
                        (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
-                       ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+                       ireq->ir_loc_addr, ireq->ir_rmt_port,
+                       htons(ireq->ir_num));
     security_req_classify_flow(req, flowi4_to_flowi(fl4));
     rt = ip_route_output_flow(net, fl4, sk);
     if (IS_ERR(rt))
@@ -436,9 +435,9 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
                                             const struct request_sock *req)
 {
     const struct inet_request_sock *ireq = inet_rsk(req);
+    struct net *net = read_pnet(&ireq->ireq_net);
     struct inet_sock *newinet = inet_sk(newsk);
     struct ip_options_rcu *opt;
-    struct net *net = sock_net(sk);
     struct flowi4 *fl4;
     struct rtable *rt;
@@ -446,11 +445,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
     rcu_read_lock();
     opt = rcu_dereference(newinet->inet_opt);
-    flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
+    flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
                        RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                        sk->sk_protocol, inet_sk_flowi_flags(sk),
                        (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
-                       ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+                       ireq->ir_loc_addr, ireq->ir_rmt_port,
+                       htons(ireq->ir_num));
     security_req_classify_flow(req, flowi4_to_flowi(fl4));
     rt = ip_route_output_flow(net, fl4, sk);
     if (IS_ERR(rt))
@@ -495,7 +495,7 @@ struct request_sock *inet_csk_search_req(struct sock *sk,
     u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
                               lopt->nr_table_entries);
-    write_lock(&icsk->icsk_accept_queue.syn_wait_lock);
+    spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
     for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
         const struct inet_request_sock *ireq = inet_rsk(req);
@@ -508,7 +508,7 @@ struct request_sock *inet_csk_search_req(struct sock *sk,
             break;
         }
     }
-    write_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
+    spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
     return req;
 }
@@ -571,8 +571,9 @@ static void reqsk_timer_handler(unsigned long data)
     struct inet_connection_sock *icsk = inet_csk(sk_listener);
     struct request_sock_queue *queue = &icsk->icsk_accept_queue;
     struct listen_sock *lopt = queue->listen_opt;
-    int expire = 0, resend = 0;
+    int qlen, expire = 0, resend = 0;
     int max_retries, thresh;
+    u8 defer_accept;
     if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
         reqsk_put(req);
@@ -598,21 +599,23 @@ static void reqsk_timer_handler(unsigned long data)
      * embrions; and abort old ones without pity, if old
      * ones are about to clog our table.
      */
-    if (listen_sock_qlen(lopt) >> (lopt->max_qlen_log - 1)) {
+    qlen = listen_sock_qlen(lopt);
+    if (qlen >> (lopt->max_qlen_log - 1)) {
         int young = listen_sock_young(lopt) << 1;
         while (thresh > 2) {
-            if (listen_sock_qlen(lopt) < young)
+            if (qlen < young)
                 break;
             thresh--;
             young <<= 1;
         }
     }
-    if (queue->rskq_defer_accept)
-        max_retries = queue->rskq_defer_accept;
-    syn_ack_recalc(req, thresh, max_retries, queue->rskq_defer_accept,
+    defer_accept = READ_ONCE(queue->rskq_defer_accept);
+    if (defer_accept)
+        max_retries = defer_accept;
+    syn_ack_recalc(req, thresh, max_retries, defer_accept,
                    &expire, &resend);
-    req->rsk_ops->syn_ack_timeout(sk_listener, req);
+    req->rsk_ops->syn_ack_timeout(req);
     if (!expire &&
         (!resend ||
          !inet_rtx_syn_ack(sk_listener, req) ||
@@ -647,10 +650,10 @@ void reqsk_queue_hash_req(struct request_sock_queue *queue,
     setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
     req->rsk_hash = hash;
-    write_lock(&queue->syn_wait_lock);
+    spin_lock(&queue->syn_wait_lock);
     req->dl_next = lopt->syn_table[hash];
     lopt->syn_table[hash] = req;
-    write_unlock(&queue->syn_wait_lock);
+    spin_unlock(&queue->syn_wait_lock);
     mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
 }
...
@@ -728,7 +728,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
     entry.family = sk->sk_family;
-    read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+    spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
     lopt = icsk->icsk_accept_queue.listen_opt;
     if (!lopt || !listen_sock_qlen(lopt))
@@ -776,7 +776,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
     }
 out:
-    read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+    spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
     return err;
 }
...
@@ -310,6 +310,34 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
     dst->ops->redirect(dst, sk, skb);
 }
+/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
+void tcp_req_err(struct sock *sk, u32 seq)
+{
+    struct request_sock *req = inet_reqsk(sk);
+    struct net *net = sock_net(sk);
+    /* ICMPs are not backlogged, hence we cannot get
+     * an established socket here.
+     */
+    WARN_ON(req->sk);
+    if (seq != tcp_rsk(req)->snt_isn) {
+        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+    } else {
+        /*
+         * Still in SYN_RECV, just remove it silently.
+         * There is no good way to pass the error to the newly
+         * created socket, and POSIX does not want network
+         * errors returned from accept().
+         */
+        inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+        NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+    }
+    reqsk_put(req);
+}
+EXPORT_SYMBOL(tcp_req_err);
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition. If err < 0 then the socket should
@@ -343,8 +371,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
     int err;
     struct net *net = dev_net(icmp_skb->dev);
-    sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
-                     iph->saddr, th->source, inet_iif(icmp_skb));
+    sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
+                                   th->dest, iph->saddr, ntohs(th->source),
+                                   inet_iif(icmp_skb));
     if (!sk) {
         ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
         return;
@@ -353,6 +382,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
         inet_twsk_put(inet_twsk(sk));
         return;
     }
+    seq = ntohl(th->seq);
+    if (sk->sk_state == TCP_NEW_SYN_RECV)
+        return tcp_req_err(sk, seq);
     bh_lock_sock(sk);
     /* If too many ICMPs get dropped on busy
@@ -374,7 +406,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
     icsk = inet_csk(sk);
     tp = tcp_sk(sk);
-    seq = ntohl(th->seq);
     /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
     fastopen = tp->fastopen_rsk;
     snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
@@ -458,38 +489,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
     }
     switch (sk->sk_state) {
-        struct request_sock *req;
-    case TCP_LISTEN:
-        if (sock_owned_by_user(sk))
-            goto out;
-        req = inet_csk_search_req(sk, th->dest,
-                                  iph->daddr, iph->saddr);
-        if (!req)
-            goto out;
-        /* ICMPs are not backlogged, hence we cannot get
-           an established socket here.
-         */
-        WARN_ON(req->sk);
-        if (seq != tcp_rsk(req)->snt_isn) {
-            NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
-            reqsk_put(req);
-            goto out;
-        }
-        /*
-         * Still in SYN_RECV, just remove it silently.
-         * There is no good way to pass the error to the newly
-         * created socket, and POSIX does not want network
-         * errors returned from accept().
-         */
-        inet_csk_reqsk_queue_drop(sk, req);
-        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-        reqsk_put(req);
-        goto out;
     case TCP_SYN_SENT:
     case TCP_SYN_RECV:
         /* Only in fast or simultaneous open. If a fast open socket is
@@ -1909,13 +1908,13 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
         }
         sk = sk_nulls_next(st->syn_wait_sk);
         st->state = TCP_SEQ_STATE_LISTENING;
-        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+        spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
     } else {
         icsk = inet_csk(sk);
-        read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+        spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
         if (reqsk_queue_len(&icsk->icsk_accept_queue))
             goto start_req;
-        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+        spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
         sk = sk_nulls_next(sk);
     }
 get_sk:
@@ -1927,7 +1926,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
             goto out;
     }
     icsk = inet_csk(sk);
-    read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+    spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
     if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
 start_req:
         st->uid = sock_i_uid(sk);
@@ -1936,7 +1935,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
         st->sbucket = 0;
         goto get_req;
     }
-    read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+    spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
     }
     spin_unlock_bh(&ilb->lock);
     st->offset = 0;
@@ -2155,7 +2154,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
     case TCP_SEQ_STATE_OPENREQ:
         if (v) {
             struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
-            read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+            spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
         }
     case TCP_SEQ_STATE_LISTENING:
         if (v != SEQ_START_TOKEN)
...
@@ -327,7 +327,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
     struct request_sock *req;
     req = tcp_sk(sk)->fastopen_rsk;
-    req->rsk_ops->syn_ack_timeout(sk, req);
+    req->rsk_ops->syn_ack_timeout(req);
     if (req->num_timeout >= max_retries) {
         tcp_write_err(sk);
@@ -539,9 +539,11 @@ static void tcp_write_timer(unsigned long data)
     sock_put(sk);
 }
-void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
+void tcp_syn_ack_timeout(const struct request_sock *req)
 {
-    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
+    struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
+    NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS);
 }
 EXPORT_SYMBOL(tcp_syn_ack_timeout);
...
@@ -124,7 +124,7 @@ struct request_sock *inet6_csk_search_req(struct sock *sk,
     u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd,
                                lopt->nr_table_entries);
-    write_lock(&icsk->icsk_accept_queue.syn_wait_lock);
+    spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
     for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
         const struct inet_request_sock *ireq = inet_rsk(req);
@@ -138,7 +138,7 @@ struct request_sock *inet6_csk_search_req(struct sock *sk,
             break;
         }
     }
-    write_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
+    spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
     return req;
 }
...
@@ -324,18 +324,20 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 {
     const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
     const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
+    struct net *net = dev_net(skb->dev);
+    struct request_sock *fastopen;
     struct ipv6_pinfo *np;
-    struct sock *sk;
-    int err;
     struct tcp_sock *tp;
-    struct request_sock *fastopen;
     __u32 seq, snd_una;
-    struct net *net = dev_net(skb->dev);
+    struct sock *sk;
+    int err;
-    sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr,
-                      th->dest, &hdr->saddr, th->source, skb->dev->ifindex);
+    sk = __inet6_lookup_established(net, &tcp_hashinfo,
+                                    &hdr->daddr, th->dest,
+                                    &hdr->saddr, ntohs(th->source),
+                                    skb->dev->ifindex);
-    if (sk == NULL) {
+    if (!sk) {
         ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
                            ICMP6_MIB_INERRORS);
         return;
@@ -345,6 +347,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
         inet_twsk_put(inet_twsk(sk));
         return;
     }
+    seq = ntohl(th->seq);
+    if (sk->sk_state == TCP_NEW_SYN_RECV)
+        return tcp_req_err(sk, seq);
     bh_lock_sock(sk);
     if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -359,7 +364,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
     }
     tp = tcp_sk(sk);
-    seq = ntohl(th->seq);
     /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
     fastopen = tp->fastopen_rsk;
     snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
@@ -403,33 +407,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
     /* Might be for an request_sock */
     switch (sk->sk_state) {
-        struct request_sock *req;
-    case TCP_LISTEN:
-        if (sock_owned_by_user(sk))
-            goto out;
-        /* Note : We use inet6_iif() here, not tcp_v6_iif() */
-        req = inet6_csk_search_req(sk, th->dest, &hdr->daddr,
-                                   &hdr->saddr, inet6_iif(skb));
-        if (!req)
-            goto out;
-        /* ICMPs are not backlogged, hence we cannot get
-         * an established socket here.
-         */
-        WARN_ON(req->sk != NULL);
-        if (seq != tcp_rsk(req)->snt_isn) {
-            NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
-            reqsk_put(req);
-            goto out;
-        }
-        inet_csk_reqsk_queue_drop(sk, req);
-        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-        reqsk_put(req);
-        goto out;
     case TCP_SYN_SENT:
     case TCP_SYN_RECV:
         /* Only in fast or simultaneous open. If a fast open socket is
...