Commit 18b683bf authored by Paolo Abeni's avatar Paolo Abeni Committed by David S. Miller

mptcp: queue data for mptcp level retransmission

Keep the send page fragment on an MPTCP level retransmission queue.
The queue entries are allocated inside the page frag allocator,
acquiring an additional reference to the page for each list entry.

Also switch to a custom page frag refill function, to ensure that
the current page fragment can always host an MPTCP rtx queue entry.

The MPTCP rtx queue is flushed at disconnect() and close() time

Note that now we need to call __mptcp_init_sock() regardless of mptcp
enable status, as the destructor will try to walk the rtx_queue.

v2 -> v3:
 - remove 'inline' in foo.c files (David S. Miller)
Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
Signed-off-by: default avatarMat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent cc9d2566
...@@ -285,15 +285,75 @@ static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, ...@@ -285,15 +285,75 @@ static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
} }
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
const struct page_frag *pfrag,
const struct mptcp_data_frag *df)
{
return df && pfrag->page == df->page &&
df->data_seq + df->data_len == msk->write_seq;
}
static void dfrag_clear(struct mptcp_data_frag *dfrag)
{
list_del(&dfrag->list);
put_page(dfrag->page);
}
static void mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
u64 snd_una = atomic64_read(&msk->snd_una);
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
break;
dfrag_clear(dfrag);
}
}
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
* data
*/
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
pfrag, sk->sk_allocation)))
return true;
sk->sk_prot->enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
return false;
}
static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
int orig_offset)
{
int offset = ALIGN(orig_offset, sizeof(long));
struct mptcp_data_frag *dfrag;
dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
dfrag->data_len = 0;
dfrag->data_seq = msk->write_seq;
dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
dfrag->offset = offset + sizeof(struct mptcp_data_frag);
dfrag->page = pfrag->page;
return dfrag;
}
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct msghdr *msg, long *timeo, int *pmss_now, struct msghdr *msg, long *timeo, int *pmss_now,
int *ps_goal) int *ps_goal)
{ {
int mss_now, avail_size, size_goal, ret; int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
bool dfrag_collapsed, can_collapse = false;
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_ext *mpext = NULL; struct mptcp_ext *mpext = NULL;
struct mptcp_data_frag *dfrag;
struct sk_buff *skb, *tail; struct sk_buff *skb, *tail;
bool can_collapse = false;
struct page_frag *pfrag; struct page_frag *pfrag;
size_t psize; size_t psize;
...@@ -301,11 +361,17 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -301,11 +361,17 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* from one substream to another, but do per subflow memory accounting * from one substream to another, but do per subflow memory accounting
*/ */
pfrag = sk_page_frag(sk); pfrag = sk_page_frag(sk);
while (!sk_page_frag_refill(ssk, pfrag) || while (!mptcp_page_frag_refill(ssk, pfrag) ||
!mptcp_ext_cache_refill(msk)) { !mptcp_ext_cache_refill(msk)) {
ret = sk_stream_wait_memory(ssk, timeo); ret = sk_stream_wait_memory(ssk, timeo);
if (ret) if (ret)
return ret; return ret;
/* if sk_stream_wait_memory() sleeps snd_una can change
* significantly, refresh the rtx queue
*/
mptcp_clean_una(sk);
if (unlikely(__mptcp_needs_tcp_fallback(msk))) if (unlikely(__mptcp_needs_tcp_fallback(msk)))
return 0; return 0;
} }
...@@ -332,11 +398,23 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -332,11 +398,23 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
else else
avail_size = size_goal - skb->len; avail_size = size_goal - skb->len;
} }
psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
/* reuse tail pfrag, if possible, or carve a new one from the page
* allocator
*/
dfrag = mptcp_rtx_tail(sk);
offset = pfrag->offset;
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
if (!dfrag_collapsed) {
dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
offset = dfrag->offset;
frag_truesize = dfrag->overhead;
}
psize = min_t(size_t, pfrag->size - offset, avail_size);
/* Copy to page */ /* Copy to page */
pr_debug("left=%zu", msg_data_left(msg)); pr_debug("left=%zu", msg_data_left(msg));
psize = copy_page_from_iter(pfrag->page, pfrag->offset, psize = copy_page_from_iter(pfrag->page, offset,
min_t(size_t, msg_data_left(msg), psize), min_t(size_t, msg_data_left(msg), psize),
&msg->msg_iter); &msg->msg_iter);
pr_debug("left=%zu", msg_data_left(msg)); pr_debug("left=%zu", msg_data_left(msg));
...@@ -346,13 +424,24 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -346,13 +424,24 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
/* tell the TCP stack to delay the push so that we can safely /* tell the TCP stack to delay the push so that we can safely
* access the skb after the sendpages call * access the skb after the sendpages call
*/ */
ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, ret = do_tcp_sendpages(ssk, pfrag->page, offset, psize,
msg->msg_flags | MSG_SENDPAGE_NOTLAST); msg->msg_flags | MSG_SENDPAGE_NOTLAST);
if (ret <= 0) if (ret <= 0)
return ret; return ret;
frag_truesize += ret;
if (unlikely(ret < psize)) if (unlikely(ret < psize))
iov_iter_revert(&msg->msg_iter, psize - ret); iov_iter_revert(&msg->msg_iter, psize - ret);
/* send successful, keep track of sent data for mptcp-level
* retransmission
*/
dfrag->data_len += ret;
if (!dfrag_collapsed) {
get_page(dfrag->page);
list_add_tail(&dfrag->list, &msk->rtx_queue);
}
/* if the tail skb extension is still the cached one, collapsing /* if the tail skb extension is still the cached one, collapsing
* really happened. Note: we can't check for 'same skb' as the sk_buff * really happened. Note: we can't check for 'same skb' as the sk_buff
* hdr on tail can be transmitted, freed and re-allocated by the * hdr on tail can be transmitted, freed and re-allocated by the
...@@ -381,7 +470,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ...@@ -381,7 +470,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext->dsn64); mpext->dsn64);
out: out:
pfrag->offset += ret; pfrag->offset += frag_truesize;
msk->write_seq += ret; msk->write_seq += ret;
mptcp_subflow_ctx(ssk)->rel_write_seq += ret; mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
...@@ -472,6 +561,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -472,6 +561,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return ret >= 0 ? ret + copied : (copied ? copied : ret); return ret >= 0 ? ret + copied : (copied ? copied : ret);
} }
mptcp_clean_una(sk);
__mptcp_flush_join_list(msk); __mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk); ssk = mptcp_subflow_get_send(msk);
while (!sk_stream_memory_free(sk) || !ssk) { while (!sk_stream_memory_free(sk) || !ssk) {
...@@ -479,6 +570,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ...@@ -479,6 +570,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (ret) if (ret)
goto out; goto out;
mptcp_clean_una(sk);
ssk = mptcp_subflow_get_send(msk); ssk = mptcp_subflow_get_send(msk);
if (list_empty(&msk->conn_list)) { if (list_empty(&msk->conn_list)) {
ret = -ENOTCONN; ret = -ENOTCONN;
...@@ -744,6 +837,7 @@ static int __mptcp_init_sock(struct sock *sk) ...@@ -744,6 +837,7 @@ static int __mptcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags); __set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker); INIT_WORK(&msk->work, mptcp_worker);
...@@ -757,10 +851,24 @@ static int __mptcp_init_sock(struct sock *sk) ...@@ -757,10 +851,24 @@ static int __mptcp_init_sock(struct sock *sk)
static int mptcp_init_sock(struct sock *sk) static int mptcp_init_sock(struct sock *sk)
{ {
int ret = __mptcp_init_sock(sk);
if (ret)
return ret;
if (!mptcp_is_enabled(sock_net(sk))) if (!mptcp_is_enabled(sock_net(sk)))
return -ENOPROTOOPT; return -ENOPROTOOPT;
return __mptcp_init_sock(sk); return 0;
}
static void __mptcp_clear_xmit(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
dfrag_clear(dfrag);
} }
static void mptcp_cancel_work(struct sock *sk) static void mptcp_cancel_work(struct sock *sk)
...@@ -822,6 +930,8 @@ static void mptcp_close(struct sock *sk, long timeout) ...@@ -822,6 +930,8 @@ static void mptcp_close(struct sock *sk, long timeout)
data_fin_tx_seq = msk->write_seq; data_fin_tx_seq = msk->write_seq;
__mptcp_clear_xmit(sk);
release_sock(sk); release_sock(sk);
list_for_each_entry_safe(subflow, tmp, &conn_list, node) { list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
...@@ -863,6 +973,14 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) ...@@ -863,6 +973,14 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
} }
static int mptcp_disconnect(struct sock *sk, int flags)
{
lock_sock(sk);
__mptcp_clear_xmit(sk);
release_sock(sk);
return tcp_disconnect(sk, flags);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6) #if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{ {
...@@ -1173,6 +1291,7 @@ static struct proto mptcp_prot = { ...@@ -1173,6 +1291,7 @@ static struct proto mptcp_prot = {
.name = "MPTCP", .name = "MPTCP",
.owner = THIS_MODULE, .owner = THIS_MODULE,
.init = mptcp_init_sock, .init = mptcp_init_sock,
.disconnect = mptcp_disconnect,
.close = mptcp_close, .close = mptcp_close,
.accept = mptcp_accept, .accept = mptcp_accept,
.setsockopt = mptcp_setsockopt, .setsockopt = mptcp_setsockopt,
......
...@@ -139,6 +139,15 @@ struct mptcp_pm_data { ...@@ -139,6 +139,15 @@ struct mptcp_pm_data {
struct work_struct work; struct work_struct work;
}; };
struct mptcp_data_frag {
struct list_head list;
u64 data_seq;
int data_len;
int offset;
int overhead;
struct page *page;
};
/* MPTCP connection sock */ /* MPTCP connection sock */
struct mptcp_sock { struct mptcp_sock {
/* inet_connection_sock must be the first member */ /* inet_connection_sock must be the first member */
...@@ -154,6 +163,7 @@ struct mptcp_sock { ...@@ -154,6 +163,7 @@ struct mptcp_sock {
spinlock_t join_list_lock; spinlock_t join_list_lock;
struct work_struct work; struct work_struct work;
struct list_head conn_list; struct list_head conn_list;
struct list_head rtx_queue;
struct list_head join_list; struct list_head join_list;
struct skb_ext *cached_ext; /* for the next sendmsg */ struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */
...@@ -169,6 +179,16 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) ...@@ -169,6 +179,16 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
return (struct mptcp_sock *)sk; return (struct mptcp_sock *)sk;
} }
static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
if (list_empty(&msk->rtx_queue))
return NULL;
return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
}
struct mptcp_subflow_request_sock { struct mptcp_subflow_request_sock {
struct tcp_request_sock sk; struct tcp_request_sock sk;
u16 mp_capable : 1, u16 mp_capable : 1,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment