Commit 72308ecb authored by Jakub Kicinski

Merge branch 'mptcp-improve-multiple-xmit-streams-support'

Paolo Abeni says:

====================
mptcp: improve multiple xmit streams support

This series improves MPTCP handling of multiple concurrent
xmit streams.

The to-be-transmitted data is enqueued to a subflow only when
the send window is open, keeping the subflows' xmit queues shorter
and allowing for faster switch-over.
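
For illustration, a minimal self-contained sketch of that policy (a
user-space model with made-up names, not the kernel code): the msk hands
a subflow at most the amount of data that still fits between snd_nxt and
the announced window edge, and keeps the rest queued at the MPTCP level.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct msk_model {
        uint64_t snd_nxt;       /* next MPTCP-level sequence to transmit */
        uint64_t wnd_end;       /* snd_una + peer-announced receive window */
        uint64_t pending;       /* bytes still sitting in the msk xmit queue */
};

/* How many queued bytes may be handed to a subflow right now. */
static uint64_t sendable(const struct msk_model *msk)
{
        uint64_t window;

        if (msk->wnd_end <= msk->snd_nxt)
                return 0;               /* send window is closed */
        window = msk->wnd_end - msk->snd_nxt;
        return msk->pending < window ? msk->pending : window;
}

int main(void)
{
        struct msk_model msk = { .snd_nxt = 1000, .wnd_end = 1600, .pending = 4000 };
        uint64_t chunk = sendable(&msk);

        printf("push %" PRIu64 " of %" PRIu64 " pending bytes\n", chunk, msk.pending);
        msk.snd_nxt += chunk;   /* pretend the chunk was enqueued to a subflow */
        msk.pending -= chunk;
        return 0;
}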

The above requires more accurate msk socket state tracking
and some additional infrastructure to allow pushing the data
pending in the msk xmit queue as soon as the MPTCP-level send
window opens (patches 6-10).
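
A similarly hypothetical sketch of the wake-up side: when a DATA_ACK
moves the window edge forward and data is still pending, the worker is
scheduled to push it (in the kernel this corresponds to the
mptcp_send_head()/mptcp_schedule_work() pair added by this series).

#include <stdint.h>
#include <stdio.h>

static uint64_t pending_bytes = 4000;   /* stand-in for the msk xmit queue */

static void schedule_worker(void)
{
        /* stand-in for mptcp_schedule_work(): defer the xmit to the worker */
        printf("worker scheduled, %llu bytes to push\n",
               (unsigned long long)pending_bytes);
}

/* Called when a DATA_ACK carries a new right edge for the send window. */
static void on_data_ack(uint64_t *wnd_end, uint64_t new_wnd_end)
{
        if (new_wnd_end <= *wnd_end)
                return;                 /* window did not open any further */

        *wnd_end = new_wnd_end;
        if (pending_bytes > 0)          /* stand-in for mptcp_send_head() */
                schedule_worker();
}

int main(void)
{
        uint64_t wnd_end = 1600;

        on_data_ack(&wnd_end, 2400);    /* more window, pending data: kick worker */
        on_data_ack(&wnd_end, 2000);    /* stale ack: nothing to do */
        return 0;
}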

As a side effect, the MPTCP socket can enqueue data to subflows
after close() time, so that it can completely spool the data sitting
in the msk xmit queue. Dealing with this requires some infrastructure
and core TCP changes (patches 1-5).
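
On the TCP side this is mostly the usual split between a lock-taking
wrapper and a caller-locked variant, so MPTCP can run the close path
while already holding the subflow socket lock (see the
tcp_close()/__tcp_close() hunk below). A generic user-space illustration
of that pattern, with a pthread mutex standing in for the socket lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sk_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller must already hold sk_lock (mirrors the __tcp_close() role). */
static void __close_impl(void)
{
        printf("tearing down, lock already held by the caller\n");
}

/* Takes and releases the lock itself (mirrors the tcp_close() role). */
static void close_locked(void)
{
        pthread_mutex_lock(&sk_lock);
        __close_impl();
        pthread_mutex_unlock(&sk_lock);
}

int main(void)
{
        /* MPTCP-style caller: already under the lock, uses the __ variant */
        pthread_mutex_lock(&sk_lock);
        __close_impl();
        pthread_mutex_unlock(&sk_lock);

        /* ordinary callers keep using the locking wrapper */
        close_locked();
        return 0;
}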

Finally, patches 11-12 introduce more accurate tracking of the other
end's receive window.
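
A compact stand-alone example of the sequence arithmetic this relies on:
before64()/after64() give wraparound-safe 64-bit comparisons, and the
window right edge (snd_una plus the window announced on the subflow) is
only ever moved forward. The kernel performs the update with
atomic64_cmpxchg(); the sketch below is single-threaded.

#include <stdint.h>
#include <stdio.h>

static inline int before64(uint64_t seq1, uint64_t seq2)
{
        return (int64_t)(seq1 - seq2) < 0;
}
#define after64(seq2, seq1)     before64(seq1, seq2)

/* Advance the MPTCP-level window edge, never letting it move backwards. */
static void update_wnd_end(uint64_t *wnd_end, uint64_t snd_una, uint32_t snd_wnd)
{
        uint64_t new_wnd_end = snd_una + snd_wnd;

        if (after64(new_wnd_end, *wnd_end))
                *wnd_end = new_wnd_end;
}

int main(void)
{
        uint64_t wnd_end = 1600;

        update_wnd_end(&wnd_end, 1200, 1000);   /* grows to 2200 */
        update_wnd_end(&wnd_end, 1300, 500);    /* 1800 would move it back: ignored */
        printf("wnd_end=%llu\n", (unsigned long long)wnd_end);
        return 0;
}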

Overall this refactors the MPTCP xmit path without introducing
new features; the new code is covered by the existing self-tests.

v2 -> v3:
 - rebased,
 - fixed checkpatch issue in patch 1/13
 - fixed some state tracking issues in patch 8/13

v1 -> v2:
 - this is just a repost, to cope with patchwork issues, no changes
   at all
====================

Link: https://lore.kernel.org/r/cover.1605458224.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c0a645a7 7ed90803
@@ -322,6 +322,7 @@ void tcp_shutdown(struct sock *sk, int how);
 int tcp_v4_early_demux(struct sk_buff *skb);
 int tcp_v4_rcv(struct sk_buff *skb);
 
+void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb);
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
@@ -329,6 +330,8 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
                 int flags);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
                        size_t size, int flags);
+struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+                              struct page *page, int offset, size_t *size);
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                         size_t size, int flags);
 int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
@@ -392,6 +395,7 @@ void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
 void tcp_metrics_init(void);
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
+void __tcp_close(struct sock *sk, long timeout);
 void tcp_close(struct sock *sk, long timeout);
 void tcp_init_sock(struct sock *sk);
 void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
...
@@ -954,7 +954,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
  * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
  * users.
  */
-static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
 {
        if (skb && !skb->len) {
                tcp_unlink_write_queue(skb, sk);
@@ -964,6 +964,68 @@ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
        }
 }
 
+struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+                              struct page *page, int offset, size_t *size)
+{
+       struct sk_buff *skb = tcp_write_queue_tail(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       bool can_coalesce;
+       int copy, i;
+
+       if (!skb || (copy = size_goal - skb->len) <= 0 ||
+           !tcp_skb_can_collapse_to(skb)) {
+new_segment:
+               if (!sk_stream_memory_free(sk))
+                       return NULL;
+
+               skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+                                         tcp_rtx_and_write_queues_empty(sk));
+               if (!skb)
+                       return NULL;
+
+#ifdef CONFIG_TLS_DEVICE
+               skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+               skb_entail(sk, skb);
+               copy = size_goal;
+       }
+
+       if (copy > *size)
+               copy = *size;
+
+       i = skb_shinfo(skb)->nr_frags;
+       can_coalesce = skb_can_coalesce(skb, i, page, offset);
+       if (!can_coalesce && i >= sysctl_max_skb_frags) {
+               tcp_mark_push(tp, skb);
+               goto new_segment;
+       }
+       if (!sk_wmem_schedule(sk, copy))
+               return NULL;
+
+       if (can_coalesce) {
+               skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+       } else {
+               get_page(page);
+               skb_fill_page_desc(skb, i, page, offset, copy);
+       }
+
+       if (!(flags & MSG_NO_SHARED_FRAGS))
+               skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+       skb->len += copy;
+       skb->data_len += copy;
+       skb->truesize += copy;
+       sk_wmem_queued_add(sk, copy);
+       sk_mem_charge(sk, copy);
+       skb->ip_summed = CHECKSUM_PARTIAL;
+       WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
+       TCP_SKB_CB(skb)->end_seq += copy;
+       tcp_skb_pcount_set(skb, 0);
+
+       *size = copy;
+       return skb;
+}
+
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                         size_t size, int flags)
 {
@@ -999,60 +1061,13 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                goto out_err;
 
        while (size > 0) {
-               struct sk_buff *skb = tcp_write_queue_tail(sk);
-               int copy, i;
-               bool can_coalesce;
+               struct sk_buff *skb;
+               size_t copy = size;
 
-               if (!skb || (copy = size_goal - skb->len) <= 0 ||
-                   !tcp_skb_can_collapse_to(skb)) {
-new_segment:
-                       if (!sk_stream_memory_free(sk))
-                               goto wait_for_space;
-
-                       skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-                                       tcp_rtx_and_write_queues_empty(sk));
-                       if (!skb)
-                               goto wait_for_space;
-
-#ifdef CONFIG_TLS_DEVICE
-                       skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
-#endif
-                       skb_entail(sk, skb);
-                       copy = size_goal;
-               }
-
-               if (copy > size)
-                       copy = size;
-
-               i = skb_shinfo(skb)->nr_frags;
-               can_coalesce = skb_can_coalesce(skb, i, page, offset);
-               if (!can_coalesce && i >= sysctl_max_skb_frags) {
-                       tcp_mark_push(tp, skb);
-                       goto new_segment;
-               }
-               if (!sk_wmem_schedule(sk, copy))
+               skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
+               if (!skb)
                        goto wait_for_space;
 
-               if (can_coalesce) {
-                       skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
-               } else {
-                       get_page(page);
-                       skb_fill_page_desc(skb, i, page, offset, copy);
-               }
-
-               if (!(flags & MSG_NO_SHARED_FRAGS))
-                       skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-
-               skb->len += copy;
-               skb->data_len += copy;
-               skb->truesize += copy;
-               sk_wmem_queued_add(sk, copy);
-               sk_mem_charge(sk, copy);
-               skb->ip_summed = CHECKSUM_PARTIAL;
-               WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
-               TCP_SKB_CB(skb)->end_seq += copy;
-               tcp_skb_pcount_set(skb, 0);
-
                if (!copied)
                        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
@@ -2405,13 +2420,12 @@ bool tcp_check_oom(struct sock *sk, int shift)
        return too_many_orphans || out_of_socket_memory;
 }
 
-void tcp_close(struct sock *sk, long timeout)
+void __tcp_close(struct sock *sk, long timeout)
 {
        struct sk_buff *skb;
        int data_was_unread = 0;
        int state;
 
-       lock_sock(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
 
        if (sk->sk_state == TCP_LISTEN) {
@@ -2575,6 +2589,12 @@ void tcp_close(struct sock *sk, long timeout)
 out:
        bh_unlock_sock(sk);
        local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+       lock_sock(sk);
+       __tcp_close(sk, timeout);
        release_sock(sk);
        sock_put(sk);
 }
...
@@ -492,7 +492,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
        bool ret = false;
 
        mpext = skb ? mptcp_get_ext(skb) : NULL;
-       snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable);
+       snd_data_fin_enable = mptcp_data_fin_enabled(msk);
 
        if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
                unsigned int map_size;
@@ -809,11 +809,14 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
        return cur_ack;
 }
 
-static void update_una(struct mptcp_sock *msk,
-                      struct mptcp_options_received *mp_opt)
+static void ack_update_msk(struct mptcp_sock *msk,
+                          const struct sock *ssk,
+                          struct mptcp_options_received *mp_opt)
 {
        u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
-       u64 write_seq = READ_ONCE(msk->write_seq);
+       u64 new_wnd_end, wnd_end, old_wnd_end = atomic64_read(&msk->wnd_end);
+       u64 snd_nxt = READ_ONCE(msk->snd_nxt);
+       struct sock *sk = (struct sock *)msk;
 
        /* avoid ack expansion on update conflict, to reduce the risk of
         * wrongly expanding to a future ack sequence number, which is way
@@ -822,15 +825,28 @@ static void update_una(struct mptcp_sock *msk,
        new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
 
        /* ACK for data not even sent yet? Ignore. */
-       if (after64(new_snd_una, write_seq))
+       if (after64(new_snd_una, snd_nxt))
                new_snd_una = old_snd_una;
 
+       new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
+
+       while (after64(new_wnd_end, old_wnd_end)) {
+               wnd_end = old_wnd_end;
+               old_wnd_end = atomic64_cmpxchg(&msk->wnd_end, wnd_end,
+                                              new_wnd_end);
+               if (old_wnd_end == wnd_end) {
+                       if (mptcp_send_head(sk))
+                               mptcp_schedule_work(sk);
+                       break;
+               }
+       }
+
        while (after64(new_snd_una, old_snd_una)) {
                snd_una = old_snd_una;
                old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
                                               new_snd_una);
                if (old_snd_una == snd_una) {
-                       mptcp_data_acked((struct sock *)msk);
+                       mptcp_data_acked(sk);
                        break;
                }
        }
@@ -930,7 +946,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
         * monodirectional flows will stuck
         */
        if (mp_opt.use_ack)
-               update_una(msk, &mp_opt);
+               ack_update_msk(msk, sk, &mp_opt);
 
        /* Zero-data-length packets are dropped by the caller and not
         * propagated to the MPTCP layer, so the skb extension does not
...
@@ -89,8 +89,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
                return false;
 
        msk->pm.status |= BIT(new_status);
-       if (schedule_work(&msk->work))
-               sock_hold((struct sock *)msk);
+       mptcp_schedule_work((struct sock *)msk);
        return true;
 }
 
...
@@ -416,14 +416,13 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
        list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
                int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
-               long timeout = 0;
 
                if (msk->pm.rm_id != subflow->remote_id)
                        continue;
 
                spin_unlock_bh(&msk->pm.lock);
                mptcp_subflow_shutdown(sk, ssk, how);
-               __mptcp_close_ssk(sk, ssk, subflow, timeout);
+               __mptcp_close_ssk(sk, ssk, subflow);
                spin_lock_bh(&msk->pm.lock);
 
                msk->pm.add_addr_accepted--;
@@ -452,14 +451,13 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
        list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
                int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
-               long timeout = 0;
 
                if (rm_id != subflow->local_id)
                        continue;
 
                spin_unlock_bh(&msk->pm.lock);
                mptcp_subflow_shutdown(sk, ssk, how);
-               __mptcp_close_ssk(sk, ssk, subflow, timeout);
+               __mptcp_close_ssk(sk, ssk, subflow);
                spin_lock_bh(&msk->pm.lock);
 
                msk->pm.local_addr_used--;
...
@@ -86,11 +86,19 @@
 /* MPTCP socket flags */
 #define MPTCP_DATA_READY       0
-#define MPTCP_SEND_SPACE       1
+#define MPTCP_NOSPACE          1
 #define MPTCP_WORK_RTX         2
 #define MPTCP_WORK_EOF         3
 #define MPTCP_FALLBACK_DONE    4
 #define MPTCP_WORK_CLOSE_SUBFLOW 5
+#define MPTCP_WORKER_RUNNING   6
+
+static inline bool before64(__u64 seq1, __u64 seq2)
+{
+       return (__s64)(seq1 - seq2) < 0;
+}
+
+#define after64(seq2, seq1)    before64(seq1, seq2)
 
 struct mptcp_options_received {
        u64     sndr_key;
@@ -187,9 +195,10 @@ struct mptcp_pm_data {
 struct mptcp_data_frag {
        struct list_head list;
        u64 data_seq;
-       int data_len;
-       int offset;
-       int overhead;
+       u16 data_len;
+       u16 offset;
+       u16 overhead;
+       u16 already_sent;
        struct page *page;
 };
@@ -200,11 +209,13 @@ struct mptcp_sock {
        u64             local_key;
        u64             remote_key;
        u64             write_seq;
+       u64             snd_nxt;
        u64             ack_seq;
        u64             rcv_data_fin_seq;
        struct sock     *last_snd;
        int             snd_burst;
        atomic64_t      snd_una;
+       atomic64_t      wnd_end;
        unsigned long   timer_ival;
        u32             token;
        unsigned long   flags;
@@ -219,6 +230,7 @@ struct mptcp_sock {
        struct rb_root  out_of_order_queue;
        struct list_head conn_list;
        struct list_head rtx_queue;
+       struct mptcp_data_frag *first_pending;
        struct list_head join_list;
        struct skb_ext  *cached_ext;    /* for the next sendmsg */
        struct socket   *subflow; /* outgoing connect/listener/!mp_capable */
@@ -240,11 +252,41 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
        return (struct mptcp_sock *)sk;
 }
 
+static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
+{
+       const struct mptcp_sock *msk = mptcp_sk(sk);
+
+       return READ_ONCE(msk->first_pending);
+}
+
+static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)
+{
+       struct mptcp_sock *msk = mptcp_sk(sk);
+       struct mptcp_data_frag *cur;
+
+       cur = msk->first_pending;
+       return list_is_last(&cur->list, &msk->rtx_queue) ? NULL :
+                                                    list_next_entry(cur, list);
+}
+
+static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk)
+{
+       struct mptcp_sock *msk = mptcp_sk(sk);
+
+       if (!msk->first_pending)
+               return NULL;
+
+       if (WARN_ON_ONCE(list_empty(&msk->rtx_queue)))
+               return NULL;
+
+       return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
 static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
 {
        struct mptcp_sock *msk = mptcp_sk(sk);
 
-       if (list_empty(&msk->rtx_queue))
+       if (!before64(msk->snd_nxt, atomic64_read(&msk->snd_una)))
                return NULL;
 
        return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
@@ -312,7 +354,8 @@ struct mptcp_subflow_context {
                mpc_map : 1,
                backup : 1,
                rx_eof : 1,
-               can_ack : 1;        /* only after processing the remote a key */
+               can_ack : 1,        /* only after processing the remote a key */
+               disposable : 1;     /* ctx can be free at ulp release time */
        enum mptcp_data_avail data_avail;
        u32     remote_nonce;
        u64     thmac;
@@ -369,8 +412,7 @@ bool mptcp_subflow_data_available(struct sock *sk);
 void __init mptcp_subflow_init(void);
 void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
 void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
-                      struct mptcp_subflow_context *subflow,
-                      long timeout);
+                      struct mptcp_subflow_context *subflow);
 void mptcp_subflow_reset(struct sock *ssk);
 
 /* called with sk socket lock held */
@@ -408,9 +450,16 @@ static inline bool mptcp_is_fully_established(struct sock *sk)
 void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
 void mptcp_data_ready(struct sock *sk, struct sock *ssk);
 bool mptcp_finish_join(struct sock *sk);
+bool mptcp_schedule_work(struct sock *sk);
 void mptcp_data_acked(struct sock *sk);
 void mptcp_subflow_eof(struct sock *sk);
 bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
+
+static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
+{
+       return READ_ONCE(msk->snd_data_fin_enable) &&
+              READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
+}
+
 void mptcp_destroy_common(struct mptcp_sock *msk);
 
 void __init mptcp_token_init(void);
@@ -495,13 +544,6 @@ static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
        return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
 }
 
-static inline bool before64(__u64 seq1, __u64 seq2)
-{
-       return (__s64)(seq1 - seq2) < 0;
-}
-
-#define after64(seq2, seq1)    before64(seq1, seq2)
-
 void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
 
 static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
...
@@ -997,17 +997,16 @@ static void subflow_data_ready(struct sock *sk)
 static void subflow_write_space(struct sock *sk)
 {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+       struct socket *sock = READ_ONCE(sk->sk_socket);
        struct sock *parent = subflow->conn;
 
        if (!sk_stream_is_writeable(sk))
                return;
 
-       if (sk_stream_is_writeable(parent)) {
-               set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
-               smp_mb__after_atomic();
-               /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
-               sk_stream_write_space(parent);
-       }
+       if (sock && sk_stream_is_writeable(parent))
+               clear_bit(SOCK_NOSPACE, &sock->flags);
+
+       sk_stream_write_space(parent);
 }
 
 static struct inet_connection_sock_af_ops *
@@ -1125,6 +1124,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
        if (err && err != -EINPROGRESS)
                goto failed;
 
+       sock_hold(ssk);
        spin_lock_bh(&msk->join_list_lock);
        list_add_tail(&subflow->node, &msk->join_list);
        spin_unlock_bh(&msk->join_list_lock);
@@ -1132,6 +1132,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
        return err;
 
 failed:
+       subflow->disposable = 1;
        sock_release(sf);
        return err;
 }
@@ -1254,7 +1255,6 @@ static void subflow_state_change(struct sock *sk)
                mptcp_data_ready(parent, sk);
 
        if (__mptcp_check_fallback(mptcp_sk(parent)) &&
-           !(parent->sk_shutdown & RCV_SHUTDOWN) &&
            !subflow->rx_eof && subflow_is_done(sk)) {
                subflow->rx_eof = 1;
                mptcp_subflow_eof(parent);
@@ -1297,17 +1297,26 @@ static int subflow_ulp_init(struct sock *sk)
        return err;
 }
 
-static void subflow_ulp_release(struct sock *sk)
+static void subflow_ulp_release(struct sock *ssk)
 {
-       struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+       struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
+       bool release = true;
+       struct sock *sk;
 
        if (!ctx)
                return;
 
-       if (ctx->conn)
-               sock_put(ctx->conn);
+       sk = ctx->conn;
+       if (sk) {
+               /* if the msk has been orphaned, keep the ctx
+                * alive, will be freed by mptcp_done()
+                */
+               release = ctx->disposable;
+               sock_put(sk);
+       }
 
-       kfree_rcu(ctx, rcu);
+       if (release)
+               kfree_rcu(ctx, rcu);
 }
 
 static void subflow_ulp_clone(const struct request_sock *req,
...