Commit b75eba76, authored by Soheil Hassas Yeganeh, committed by David S. Miller

tcp: send in-queue bytes in cmsg upon read

Applications with many concurrent connections, high variance
in receive queue length, and tight memory bounds cannot
allocate worst-case buffer sizes to drain sockets. Knowing
the receive queue length, applications can optimize
how they allocate buffers to read from the socket.

The number of bytes pending on the socket is directly
available through ioctl(FIONREAD/SIOCINQ) and can be
approximated using getsockopt(MEMINFO) (rmem_alloc includes
skb overheads in addition to application data). But, both of
these options add an extra syscall per recvmsg. Moreover,
ioctl(FIONREAD/SIOCINQ) takes the socket lock.

Add the TCP_INQ socket option to TCP. When this socket
option is set, recvmsg() relays the number of bytes available
on the socket for reading to the application via the
TCP_CM_INQ control message.

Calculate the number of bytes after releasing the socket lock
to include the processed backlog, if any. To avoid an extra
branch in the hot path of recvmsg() for this new control
message, move all cmsg processing inside an existing branch for
processing receive timestamps. Since the socket lock is not held
when calculating the size of receive queue, TCP_INQ is a hint.
For example, it can overestimate the queue size by one byte,
if FIN is received.

With this method, applications can start reading from the socket
using a small buffer, and then use larger buffers based on the
remaining data when needed.

V3 change-log:
	As suggested by David Miller, added loads with barrier
	to check whether we have multiple threads calling recvmsg
	in parallel. When that happens we lock the socket to
	calculate inq.
V4 change-log:
	Removed inline from a static function.
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent ab85539e
...@@ -228,7 +228,7 @@ struct tcp_sock { ...@@ -228,7 +228,7 @@ struct tcp_sock {
unused:2; unused:2;
u8 nonagle : 4,/* Disable Nagle algorithm? */ u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */ thin_lto : 1,/* Use linear timeouts for thin streams */
unused1 : 1, recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
repair : 1, repair : 1,
frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8 repair_queue; u8 repair_queue;
......
...@@ -123,6 +123,9 @@ enum { ...@@ -123,6 +123,9 @@ enum {
#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */
#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ #define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */
#define TCP_ZEROCOPY_RECEIVE 35 #define TCP_ZEROCOPY_RECEIVE 35
#define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */
#define TCP_CM_INQ TCP_INQ
struct tcp_repair_opt { struct tcp_repair_opt {
__u32 opt_code; __u32 opt_code;
......
...@@ -1889,6 +1889,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, ...@@ -1889,6 +1889,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
} }
} }
/*
 * Estimate how many bytes are queued for reading on @sk, computed as the
 * distance between rcv_nxt and copied_seq.
 *
 * The first attempt samples both sequence numbers without holding the
 * socket lock.  If that estimate is negative, or copied_seq has changed
 * since it was first sampled (e.g. another thread is in recvmsg() on the
 * same socket), the value is recomputed with the socket lock held.
 *
 * The result is a hint only: the common path runs unlocked.
 */
static int tcp_inq_hint(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 seq_consumed;
	u32 seq_expected;
	int pending;

	/* Keep this load order: copied_seq first, then rcv_nxt. */
	seq_consumed = READ_ONCE(tp->copied_seq);
	seq_expected = READ_ONCE(tp->rcv_nxt);
	pending = seq_expected - seq_consumed;

	/* Re-read copied_seq to detect a reader racing with the sampling. */
	if (unlikely(pending < 0 ||
		     seq_consumed != READ_ONCE(tp->copied_seq))) {
		/* Slow path: take the lock to get a consistent pair. */
		lock_sock(sk);
		pending = tp->rcv_nxt - tp->copied_seq;
		release_sock(sk);
	}

	return pending;
}
/* /*
* This routine copies from a sock struct into the user buffer. * This routine copies from a sock struct into the user buffer.
* *
...@@ -1905,13 +1921,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, ...@@ -1905,13 +1921,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
u32 peek_seq; u32 peek_seq;
u32 *seq; u32 *seq;
unsigned long used; unsigned long used;
int err; int err, inq;
int target; /* Read at least this many bytes */ int target; /* Read at least this many bytes */
long timeo; long timeo;
struct sk_buff *skb, *last; struct sk_buff *skb, *last;
u32 urg_hole = 0; u32 urg_hole = 0;
struct scm_timestamping tss; struct scm_timestamping tss;
bool has_tss = false; bool has_tss = false;
bool has_cmsg;
if (unlikely(flags & MSG_ERRQUEUE)) if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len); return inet_recv_error(sk, msg, len, addr_len);
...@@ -1926,6 +1943,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, ...@@ -1926,6 +1943,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (sk->sk_state == TCP_LISTEN) if (sk->sk_state == TCP_LISTEN)
goto out; goto out;
has_cmsg = tp->recvmsg_inq;
timeo = sock_rcvtimeo(sk, nonblock); timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */ /* Urgent data needs to be handled specially. */
...@@ -2112,6 +2130,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, ...@@ -2112,6 +2130,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (TCP_SKB_CB(skb)->has_rxtstamp) { if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, &tss); tcp_update_recv_tstamps(skb, &tss);
has_tss = true; has_tss = true;
has_cmsg = true;
} }
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok; goto found_fin_ok;
...@@ -2131,13 +2150,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, ...@@ -2131,13 +2150,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
* on connected socket. I was just happy when found this 8) --ANK * on connected socket. I was just happy when found this 8) --ANK
*/ */
if (has_tss)
tcp_recv_timestamp(msg, sk, &tss);
/* Clean up data we have read: This will do ACK frames. */ /* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied); tcp_cleanup_rbuf(sk, copied);
release_sock(sk); release_sock(sk);
if (has_cmsg) {
if (has_tss)
tcp_recv_timestamp(msg, sk, &tss);
if (tp->recvmsg_inq) {
inq = tcp_inq_hint(sk);
put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
}
}
return copied; return copied;
out: out:
...@@ -3006,6 +3032,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, ...@@ -3006,6 +3032,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
tp->notsent_lowat = val; tp->notsent_lowat = val;
sk->sk_write_space(sk); sk->sk_write_space(sk);
break; break;
case TCP_INQ:
if (val > 1 || val < 0)
err = -EINVAL;
else
tp->recvmsg_inq = val;
break;
default: default:
err = -ENOPROTOOPT; err = -ENOPROTOOPT;
break; break;
...@@ -3431,6 +3463,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, ...@@ -3431,6 +3463,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_NOTSENT_LOWAT: case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat; val = tp->notsent_lowat;
break; break;
case TCP_INQ:
val = tp->recvmsg_inq;
break;
case TCP_SAVE_SYN: case TCP_SAVE_SYN:
val = tp->save_syn; val = tp->save_syn;
break; break;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment