Commit afb83012 authored by Soheil Hassas Yeganeh, committed by David S. Miller

tcp: schedule EPOLLOUT after a partial sendmsg

With EPOLLET, applications must keep calling sendmsg until they get
EAGAIN. Otherwise, there is no guarantee that EPOLLOUT will be
generated, because a partial send caused by a memory allocation
failure does not arm the wakeup.
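
For reference, the edge-triggered contract looks like this in
userspace (a minimal sketch; the helper name, buffer handling, and
MSG_DONTWAIT usage are illustrative assumptions, not part of this
commit):

#include <errno.h>
#include <sys/socket.h>

/* Keep sending until EAGAIN; only then is it safe to block in
 * epoll_wait() for the next EPOLLOUT (EPOLLET contract).
 * Returns bytes consumed, or -1 on a real error. */
static ssize_t drain_to_socket(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = send(fd, buf + off, len - off, MSG_DONTWAIT);

		if (n > 0)
			off += (size_t)n;
		else if (errno == EAGAIN)
			break;	/* EPOLLOUT will fire when space frees up */
		else
			return -1;
	}
	return (ssize_t)off;
}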

As a result, on high-speed NICs userspace observes a series of small
sendmsgs after a partial sendmsg until it finally gets EAGAIN, because
TCP can transmit 1-2 TSO segments between two consecutive sendmsg
syscalls:

// One large partial send due to memory allocation failure.
sendmsg(20MB)   = 2MB
// Many small sends until EAGAIN.
sendmsg(18MB)   = 64KB
sendmsg(17.9MB) = 128KB
sendmsg(17.8MB) = 64KB
...
sendmsg(...)    = EAGAIN
// At this point, userspace can assume an EPOLLOUT.

To fix this, set SOCK_NOSPACE in all partial-send scenarios, which
guarantees that an EPOLLOUT is generated after a partial sendmsg.

After this commit, userspace can assume that it will receive an
EPOLLOUT after the first partial sendmsg. This EPOLLOUT also benefits
from the sk_stream_write_space() logic, which delays the wakeup until
significant space is available in the write queue.
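
With that guarantee, an edge-triggered writer can stop at the first
short write and park the remainder until the EPOLLOUT arrives, instead
of looping in small sends. A minimal sketch (the pending-buffer
bookkeeping and helper name are illustrative assumptions):

#include <errno.h>
#include <sys/epoll.h>
#include <sys/socket.h>

struct pending {
	const char *buf;
	size_t off, len;
};

/* Try one send; on a short write or EAGAIN, rely on the
 * now-guaranteed EPOLLOUT instead of retrying immediately. */
static int write_once_then_park(int epfd, int fd, struct pending *p)
{
	ssize_t n = send(fd, p->buf + p->off, p->len - p->off, MSG_DONTWAIT);

	if (n < 0 && errno != EAGAIN)
		return -1;			/* real error */
	if (n > 0)
		p->off += (size_t)n;
	if (p->off == p->len)
		return 0;			/* fully sent */

	/* Partial send or EAGAIN: SOCK_NOSPACE is set, so EPOLLOUT
	 * will be delivered once significant space is available. */
	struct epoll_event ev = {
		.events = EPOLLOUT | EPOLLET,
		.data.fd = fd,
	};
	return epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}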
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 8ba3c9d1
net/ipv4/tcp.c
@@ -1004,12 +1004,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
-				goto wait_for_sndbuf;
+				goto wait_for_space;
 
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 					tcp_rtx_and_write_queues_empty(sk));
 			if (!skb)
-				goto wait_for_memory;
+				goto wait_for_space;
 
 #ifdef CONFIG_TLS_DEVICE
 			skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
@@ -1028,7 +1028,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			goto new_segment;
 		}
 		if (!sk_wmem_schedule(sk, copy))
-			goto wait_for_memory;
+			goto wait_for_space;
 
 		if (can_coalesce) {
 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
@@ -1069,9 +1069,8 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			tcp_push_one(sk, mss_now);
 		continue;
 
-wait_for_sndbuf:
+wait_for_space:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
 		tcp_push(sk, flags & ~MSG_MORE, mss_now,
 			 TCP_NAGLE_PUSH, size_goal);
@@ -1282,7 +1281,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 new_segment:
 			if (!sk_stream_memory_free(sk))
-				goto wait_for_sndbuf;
+				goto wait_for_space;
 
 			if (unlikely(process_backlog >= 16)) {
 				process_backlog = 0;
@@ -1293,7 +1292,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
-				goto wait_for_memory;
+				goto wait_for_space;
 
 			process_backlog++;
 			skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1326,7 +1325,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			struct page_frag *pfrag = sk_page_frag(sk);
 
 			if (!sk_page_frag_refill(sk, pfrag))
-				goto wait_for_memory;
+				goto wait_for_space;
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
@@ -1340,7 +1339,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
 			if (!sk_wmem_schedule(sk, copy))
-				goto wait_for_memory;
+				goto wait_for_space;
 
 			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
 						       pfrag->page,
@@ -1393,9 +1392,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			tcp_push_one(sk, mss_now);
 		continue;
 
-wait_for_sndbuf:
+wait_for_space:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
 		if (copied)
 			tcp_push(sk, flags & ~MSG_MORE, mss_now,
 				 TCP_NAGLE_PUSH, size_goal);