Commit 8336886f authored by Jerry Chu, committed by David S. Miller

tcp: TCP Fast Open Server - support TFO listeners

This patch builds on top of the previous patch to add support for
TFO listeners. This includes:

1. allocating, properly initializing, and managing the per listener
fastopen_queue structure when TFO is enabled

2. changes to the inet_csk_accept code to support TFO. E.g., the
request_sock can no longer be freed upon accept(); it must be kept
around until the 3WHS finishes

3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg()
if it's a TFO socket

4. properly closing a TFO listener, and a TFO socket before 3WHS
finishes

5. supporting the TCP_FASTOPEN socket option (a usage sketch follows
the commit metadata below)

6. modifying tcp_check_req() so it can check a TFO socket as well
as a request_sock

7. supporting TCP's TFO cookie option

8. adding a new SYN-ACK retransmit handler that runs the timer directly
off the TFO child socket rather than the listener socket. Note that the
TFO server side will not retransmit anything other than the SYN-ACK
until the 3WHS is completed.

The patch also contains an important function,
"reqsk_fastopen_remove()", to manage the somewhat complex relationship
between a listener, its request_sock, and the corresponding child
socket. See the comment above the function for details.
Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 10467163
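
For readers who want to try the feature, here is a minimal, illustrative sketch (not part of the patch) of a server that enables TFO through the new TCP_FASTOPEN socket option. It assumes a kernel with this series applied and the server-enable bit set in the net.ipv4.tcp_fastopen sysctl; the port number, queue length, and error handling are arbitrary, and older libc headers may not define TCP_FASTOPEN yet.

```c
/* Illustrative TFO server sketch (assumptions: TFO-capable kernel,
 * server bit enabled in net.ipv4.tcp_fastopen, TCP_FASTOPEN defined
 * by the installed headers). Port and queue length are arbitrary.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	int qlen = 5;	/* max pending TFO requests; arbitrary value */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		return 1;
	}

	/* Must be set before listen(); the kernel allocates the
	 * per-listener fastopen_queue at this point.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
		perror("setsockopt(TCP_FASTOPEN)");

	if (listen(fd, 128) < 0) {
		perror("listen");
		return 1;
	}

	/* accept() is used as usual; with TFO the returned child may
	 * still be in TCP_SYN_RECV while the 3WHS completes.
	 */
	for (;;) {
		int c = accept(fd, NULL, NULL);
		if (c >= 0)
			close(c);
	}
	return 0;
}
```

The setsockopt() call above ends up in the new TCP_FASTOPEN case of do_tcp_setsockopt() (see the tcp.c hunk below), which calls fastopen_init_queue() to allocate the per-listener fastopen_queue.
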
@@ -226,19 +226,6 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue
 	return req;
 }
 
-static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue,
-						 struct sock *parent)
-{
-	struct request_sock *req = reqsk_queue_remove(queue);
-	struct sock *child = req->sk;
-
-	WARN_ON(child == NULL);
-
-	sk_acceptq_removed(parent);
-	__reqsk_free(req);
-	return child;
-}
-
 static inline int reqsk_queue_removed(struct request_sock_queue *queue,
 				      struct request_sock *req)
 {
...
@@ -424,7 +424,8 @@ extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *
 						     const struct tcphdr *th);
 extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
 				   struct request_sock *req,
-				   struct request_sock **prev);
+				   struct request_sock **prev,
+				   bool fastopen);
 extern int tcp_child_process(struct sock *parent, struct sock *child,
 			     struct sk_buff *skb);
 extern bool tcp_use_frto(struct sock *sk);
@@ -478,7 +479,8 @@ extern int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
 extern int tcp_connect(struct sock *sk);
 extern struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 					struct request_sock *req,
-					struct request_values *rvp);
+					struct request_values *rvp,
+					struct tcp_fastopen_cookie *foc);
 extern int tcp_disconnect(struct sock *sk, int flags);
 void tcp_connect_init(struct sock *sk);
...
@@ -15,6 +15,7 @@
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/tcp.h>
 #include <linux/vmalloc.h>
 #include <net/request_sock.h>
@@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 	kfree(lopt);
 }
+
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * When a TFO req is created, it needs to sock_hold its listener to prevent
+ * the latter data structure from going away.
+ *
+ * This function also sets "treq->listener" to NULL and unreference listener
+ * socket. treq->listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+			   bool reset)
+{
+	struct sock *lsk = tcp_rsk(req)->listener;
+	struct fastopen_queue *fastopenq =
+	    inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+	BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk));
+
+	tcp_sk(sk)->fastopen_rsk = NULL;
+	spin_lock_bh(&fastopenq->lock);
+	fastopenq->qlen--;
+	tcp_rsk(req)->listener = NULL;
+	if (req->sk)	/* the child socket hasn't been accepted yet */
+		goto out;
+
+	if (!reset || lsk->sk_state != TCP_LISTEN) {
+		/* If the listener has been closed don't bother with the
+		 * special RST handling below.
+		 */
+		spin_unlock_bh(&fastopenq->lock);
+		sock_put(lsk);
+		reqsk_free(req);
+		return;
+	}
+	/* Wait for 60secs before removing a req that has triggered RST.
+	 * This is a simple defense against TFO spoofing attack - by
+	 * counting the req against fastopen.max_qlen, and disabling
+	 * TFO when the qlen exceeds max_qlen.
+	 *
+	 * For more details see CoNext'11 "TCP Fast Open" paper.
+	 */
+	req->expires = jiffies + 60*HZ;
+	if (fastopenq->rskq_rst_head == NULL)
+		fastopenq->rskq_rst_head = req;
+	else
+		fastopenq->rskq_rst_tail->dl_next = req;
+
+	req->dl_next = NULL;
+	fastopenq->rskq_rst_tail = req;
+	fastopenq->qlen++;
+out:
+	spin_unlock_bh(&fastopenq->lock);
+	sock_put(lsk);
+	return;
+}
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
 		pr_err("Attempt to release alive inet socket %p\n", sk);
 		return;
 	}
+	if (sk->sk_type == SOCK_STREAM) {
+		struct fastopen_queue *fastopenq =
+			inet_csk(sk)->icsk_accept_queue.fastopenq;
+		kfree(fastopenq);
+	}
 
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
+		/* Check special setups for testing purpose to enable TFO w/o
+		 * requiring TCP_FASTOPEN sockopt.
+		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+		 * Also fastopenq may already been allocated because this
+		 * socket was in TCP_LISTEN state previously but was
+		 * shutdown() (rather than close()).
+		 */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+				err = fastopen_init_queue(sk, backlog);
+			else if ((sysctl_tcp_fastopen &
+				  TFO_SERVER_WO_SOCKOPT2) != 0)
+				err = fastopen_init_queue(sk,
+				    ((uint)sysctl_tcp_fastopen) >> 16);
+			else
+				err = 0;
+			if (err)
+				goto out;
+		}
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
-		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
 	sock_graft(sk2, newsock);
...
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct sock *newsk;
+	struct request_sock *req;
 	int error;
 
 	lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		goto out_err;
 
 	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+	if (reqsk_queue_empty(queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+	req = reqsk_queue_remove(queue);
+	newsk = req->sk;
+
+	sk_acceptq_removed(sk);
+	if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
+		spin_lock_bh(&queue->fastopenq->lock);
+		if (tcp_rsk(req)->listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq->lock);
+	}
 out:
 	release_sock(sk);
+	if (req)
+		__reqsk_free(req);
 	return newsk;
 out_err:
 	newsk = NULL;
+	req = NULL;
 	*err = error;
 	goto out;
 }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 
 	inet_csk_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(queue);
 
 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	reqsk_queue_destroy(queue);
 
 	while ((req = acc_req) != NULL) {
 		struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
+		if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
+			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+			BUG_ON(sk != tcp_rsk(req)->listener);
+
+			/* Paranoid, to prevent race condition if
+			 * an inbound pkt destined for child is
+			 * blocked by sock lock in tcp_v4_rcv().
+			 * Also to satisfy an assertion in
+			 * tcp_v4_destroy_sock().
+			 */
+			tcp_sk(child)->fastopen_rsk = NULL;
+			sock_put(sk);
+		}
 		inet_csk_destroy_sock(child);
 
 		bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		__reqsk_free(req);
 	}
+	if (queue->fastopenq != NULL) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		spin_lock_bh(&queue->fastopenq->lock);
+		acc_req = queue->fastopenq->rskq_rst_head;
+		queue->fastopenq->rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq->lock);
+		while ((req = acc_req) != NULL) {
+			acc_req = req->dl_next;
+			__reqsk_free(req);
+		}
+	}
 	WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
...
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->tstamp_ok	  = tcp_opt.saw_tstamp;
 	req->ts_recent	  = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack  = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+	treq->listener	  = NULL;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
...
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		if (sk->sk_shutdown & RCV_SHUTDOWN)
 			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 
-		/* Connected? */
-		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		/* Connected or passive Fast Open socket? */
+		if (sk->sk_state != TCP_SYN_SENT &&
+		    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
 			int target = sock_rcvlowat(sk, 0, INT_MAX);
 
 			if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto out_err;
+	}
 
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto do_error;
+	}
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
 		 * they look as CLOSING or LAST_ACK for Linux)
 		 * Probably, I missed some more holelets.
 		 * 						--ANK
+		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+		 * in a single packet! (May consider it later but will
+		 * probably need API support or TCP_CORK SYN-ACK until
+		 * data is written and socket is closed.)
 		 */
 		tcp_send_fin(sk);
 	}
@@ -2215,8 +2230,16 @@ void tcp_close(struct sock *sk, long timeout)
 		}
 	}
 
-	if (sk->sk_state == TCP_CLOSE)
+	if (sk->sk_state == TCP_CLOSE) {
+		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+		/* We could get here with a non-NULL req if the socket is
+		 * aborted (e.g., closed with unread data) before 3WHS
+		 * finishes.
+		 */
+		if (req != NULL)
+			reqsk_fastopen_remove(sk, req, false);
 		inet_csk_destroy_sock(sk);
+	}
 	/* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			icsk->icsk_user_timeout = msecs_to_jiffies(val);
 		break;
+	case TCP_FASTOPEN:
+		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+		    TCPF_LISTEN)))
+			err = fastopen_init_queue(sk, val);
+		else
+			err = -EINVAL;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
 
 void tcp_done(struct sock *sk)
 {
+	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
+	if (req != NULL)
+		reqsk_fastopen_remove(sk, req, false);
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
...
@@ -839,7 +839,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 		return -1;
 
-	skb = tcp_make_synack(sk, dst, req, rvp);
+	skb = tcp_make_synack(sk, dst, req, rvp, NULL);
 
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -1554,7 +1554,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
 						       iph->saddr, iph->daddr);
 	if (req)
-		return tcp_check_req(sk, skb, req, prev);
+		return tcp_check_req(sk, skb, req, prev, false);
 
 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
 				      th->source, iph->daddr, th->dest, inet_iif(skb));
...
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
+		newtp->fastopen_rsk = NULL;
 
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 EXPORT_SYMBOL(tcp_create_openreq_child);
 
 /*
- * Process an incoming packet for SYN_RECV sockets represented
- * as a request_sock.
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
 */
 
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req,
-			   struct request_sock **prev)
+			   struct request_sock **prev,
+			   bool fastopen)
 {
 	struct tcp_options_received tmp_opt;
 	const u8 *hash_location;
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
 	bool paws_reject = false;
 
+	BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
 		tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		 *
 		 * Enforce "SYN-ACK" according to figure 8, figure 6
 		 * of RFC793, fixed by RFC1122.
+		 *
+		 * Note that even if there is new data in the SYN packet
+		 * they will be thrown away too.
 		 */
 		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
 		return NULL;
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	 *                  sent (the segment carries an unacceptable ACK) ...
 	 *                  a reset is sent."
 	 *
-	 * Invalid ACK: reset will be sent by listening socket
+	 * Invalid ACK: reset will be sent by listening socket.
+	 * Note that the ACK validity check for a Fast Open socket is done
+	 * elsewhere and is checked directly against the child socket rather
+	 * than req because user data may have been sent out.
 	 */
-	if ((flg & TCP_FLAG_ACK) &&
+	if ((flg & TCP_FLAG_ACK) && !fastopen &&
 	    (TCP_SKB_CB(skb)->ack_seq !=
 	     tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
 		return sk;
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	/* RFC793: "first check sequence number". */
 
 	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
+					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
 		/* Out of window: send ACK and drop. */
 		if (!(flg & TCP_FLAG_RST))
 			req->rsk_ops->send_ack(sk, skb, req);
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	/* In sequence, PAWS is OK. */
 
-	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
+	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
 		req->ts_recent = tmp_opt.rcv_tsval;
 
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	/* ACK sequence verified above, just make sure ACK is
 	 * set.  If ACK not set, just silently drop the packet.
+	 *
+	 * XXX (TFO) - if we ever allow "data after SYN", the
+	 * following check needs to be removed.
 	 */
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 
+	/* For Fast Open no more processing is needed (sk is the
+	 * child socket).
+	 */
+	if (fastopen)
+		return sk;
+
 	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
 	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -706,11 +729,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	}
 
 embryonic_reset:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
-	if (!(flg & TCP_FLAG_RST))
+	if (!(flg & TCP_FLAG_RST)) {
+		/* Received a bad SYN pkt - for TFO We try not to reset
+		 * the local connection unless it's really necessary to
+		 * avoid becoming vulnerable to outside attack aiming at
+		 * resetting legit local connections.
+		 */
 		req->rsk_ops->send_reset(sk, skb);
-
-	inet_csk_reqsk_queue_drop(sk, req, prev);
+	} else if (fastopen) { /* received a valid RST pkt */
+		reqsk_fastopen_remove(sk, req, true);
+		tcp_reset(sk);
+	}
+	if (!fastopen) {
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+	}
 	return NULL;
 }
 EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req);
  * Queue segment on the new socket if the new socket is active,
  * otherwise we just shortcircuit this and continue with
  * the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * locked is obtained, other packets cause the same connection to
+ * be created.
 */
 
 int tcp_child_process(struct sock *parent, struct sock *child,
...
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
 				   unsigned int mss, struct sk_buff *skb,
 				   struct tcp_out_options *opts,
 				   struct tcp_md5sig_key **md5,
-				   struct tcp_extend_values *xvp)
+				   struct tcp_extend_values *xvp,
+				   struct tcp_fastopen_cookie *foc)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
-
+	if (foc != NULL) {
+		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = foc;
+			remaining -= need;
+		}
+	}
 	/* Similar rationale to tcp_syn_options() applies here, too.
 	 * If the <SYN> options fit, the same options should fit now!
 	 */
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
  */
 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
-				struct request_values *rvp)
+				struct request_values *rvp,
+				struct tcp_fastopen_cookie *foc)
 {
 	struct tcp_out_options opts;
 	struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 #endif
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	tcp_header_size = tcp_synack_options(sk, req, mss,
-					     skb, &opts, &md5, xvp)
+					     skb, &opts, &md5, xvp, foc)
 			+ sizeof(*th);
 
 	skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	}
 
 	th->seq = htonl(TCP_SKB_CB(skb)->seq);
-	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
+	/* XXX data is queued and acked as is. No buffer/window check */
+	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rcv_wnd, 65535U));
...
@@ -304,6 +304,35 @@ static void tcp_probe_timer(struct sock *sk)
 	}
 }
 
+/*
+ *	Timer for Fast Open socket to retransmit SYNACK. Note that the
+ *	sk here is the child socket, not the parent (listener) socket.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int max_retries = icsk->icsk_syn_retries ? :
+	    sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+	struct request_sock *req;
+
+	req = tcp_sk(sk)->fastopen_rsk;
+	req->rsk_ops->syn_ack_timeout(sk, req);
+
+	if (req->retrans >= max_retries) {
+		tcp_write_err(sk);
+		return;
+	}
+	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+	 * returned from rtx_syn_ack() to make it more persistent like
+	 * regular retransmit because if the child socket has been accepted
+	 * it's not good to give up too easily.
+	 */
+	req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+	req->retrans++;
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+			  TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
+}
+
 /*
  *	The TCP retransmit timer.
  */
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
 		tcp_resume_early_retransmit(sk);
 		return;
 	}
-
+	if (tp->fastopen_rsk) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		       sk->sk_state != TCP_FIN_WAIT1);
+		tcp_fastopen_synack_timer(sk);
+		/* Before we receive ACK to our SYN-ACK don't retransmit
+		 * anything else (e.g., data or FIN segments).
+		 */
+		return;
+	}
 	if (!tp->packets_out)
 		goto out;
...
@@ -190,6 +190,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq6 = inet6_rsk(req);
 	treq = tcp_rsk(req);
+	treq->listener = NULL;
 
 	if (security_inet_conn_request(sk, skb, req))
 		goto out_free;
...
@@ -475,7 +475,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
 	if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL)
 		goto done;
 
-	skb = tcp_make_synack(sk, dst, req, rvp);
+	skb = tcp_make_synack(sk, dst, req, rvp, NULL);
 
 	if (skb) {
 		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
@@ -987,7 +987,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 					    &ipv6_hdr(skb)->saddr,
 					    &ipv6_hdr(skb)->daddr, inet6_iif(skb));
 	if (req)
-		return tcp_check_req(sk, skb, req, prev);
+		return tcp_check_req(sk, skb, req, prev, false);
 
 	nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
 					 &ipv6_hdr(skb)->saddr, th->source,
@@ -1179,6 +1179,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	    want_cookie)
 		goto drop_and_free;
 
+	tcp_rsk(req)->listener = NULL;
 	inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	return 0;
...