Commit 21214d55 authored by Jakub Kicinski

Merge branch 'mptcp-rework-fwd-memory-allocation-and-one-cleanup'

Mat Martineau says:

====================
mptcp: Rework fwd memory allocation and one cleanup

These patches from the MPTCP tree rework forward memory allocation for
MPTCP (with some supporting changes in the net core), and also clean up
an unused function parameter.

Patch 1 updates TCP code but does not change any behavior, and creates
some macros for reclaim thresholds that will be reused in the MPTCP
code.

Patch 2 adds sk_forward_alloc_get() to the networking core to support
MPTCP's forward allocation with the diag interface.

Patch 3 reworks forward memory for MPTCP.

Patch 4 removes an unused arg and has no functional changes.
====================

Link: https://lore.kernel.org/r/20211026232916.179450-1-mathew.j.martineau@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 9dfc685e b8e0def3
......@@ -1210,6 +1210,8 @@ struct proto {
unsigned int inuse_idx;
#endif
int (*forward_alloc_get)(const struct sock *sk);
bool (*stream_memory_free)(const struct sock *sk, int wake);
bool (*stream_memory_read)(const struct sock *sk);
/* Memory pressure */
......@@ -1217,6 +1219,7 @@ struct proto {
void (*leave_memory_pressure)(struct sock *sk);
atomic_long_t *memory_allocated; /* Current allocated memory. */
struct percpu_counter *sockets_allocated; /* Current number of sockets. */
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
......@@ -1294,6 +1297,14 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
/* Report the forward-allocated memory of @sk.
 *
 * When the socket's protocol provides a forward_alloc_get() hook, its
 * result is returned; otherwise the plain sk->sk_forward_alloc value is
 * used.
 */
static inline int sk_forward_alloc_get(const struct sock *sk)
{
	if (sk->sk_prot->forward_alloc_get)
		return sk->sk_prot->forward_alloc_get(sk);

	return sk->sk_forward_alloc;
}
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
......@@ -1573,6 +1584,11 @@ static inline void sk_mem_charge(struct sock *sk, int size)
sk->sk_forward_alloc -= size;
}
/* the following macros control memory reclaiming in sk_mem_uncharge()
*/
#define SK_RECLAIM_THRESHOLD (1 << 21)
#define SK_RECLAIM_CHUNK (1 << 20)
static inline void sk_mem_uncharge(struct sock *sk, int size)
{
int reclaimable;
......@@ -1589,8 +1605,8 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
* If we reach 2 MBytes, reclaim 1 MBytes right now, there is
* no need to hold that much forward allocation anyway.
*/
if (unlikely(reclaimable >= 1 << 21))
__sk_mem_reclaim(sk, 1 << 20);
if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
}
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
......
......@@ -150,7 +150,7 @@ void inet_sock_destruct(struct sock *sk)
WARN_ON(atomic_read(&sk->sk_rmem_alloc));
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
WARN_ON(sk->sk_wmem_queued);
WARN_ON(sk->sk_forward_alloc);
WARN_ON(sk_forward_alloc_get(sk));
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
......
......@@ -271,7 +271,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct inet_diag_meminfo minfo = {
.idiag_rmem = sk_rmem_alloc_get(sk),
.idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
.idiag_fmem = sk->sk_forward_alloc,
.idiag_fmem = sk_forward_alloc_get(sk),
.idiag_tmem = sk_wmem_alloc_get(sk),
};
......
......@@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
__kfree_skb(skb);
}
static void mptcp_rmem_charge(struct sock *sk, int size)
{
mptcp_sk(sk)->rmem_fwd_alloc -= size;
}
static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
struct sk_buff *from)
{
......@@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
kfree_skb_partial(from, fragstolen);
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
mptcp_rmem_charge(sk, delta);
return true;
}
......@@ -155,6 +160,44 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
return mptcp_try_coalesce((struct sock *)msk, to, from);
}
static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
{
amount >>= SK_MEM_QUANTUM_SHIFT;
mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
__sk_mem_reduce_allocated(sk, amount);
}
/* Give @size bytes back to the MPTCP receive forward allocation.
 *
 * Once the forward allocation exceeding sk_unused_reserved_mem() reaches
 * SK_RECLAIM_THRESHOLD, SK_RECLAIM_CHUNK bytes are reclaimed right away.
 */
static void mptcp_rmem_uncharge(struct sock *sk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int excess;

	msk->rmem_fwd_alloc += size;

	/* same reclaim scheme as sk_mem_uncharge(), see there for the rationale */
	excess = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
	if (unlikely(excess >= SK_RECLAIM_THRESHOLD))
		__mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK);
}
/* skb destructor installed by mptcp_set_owner_r(): drops the skb truesize
 * from the owning socket's sk_rmem_alloc and returns it to the MPTCP
 * receive forward allocation.
 */
static void mptcp_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int truesize = skb->truesize;

	atomic_sub(truesize, &sk->sk_rmem_alloc);
	mptcp_rmem_uncharge(sk, truesize);
}
/* Make @sk the receive-side owner of @skb.
 *
 * The skb is first orphaned, then bound to @sk with mptcp_rfree() as
 * destructor; its truesize is added to sk_rmem_alloc and charged against
 * the MPTCP receive forward allocation.
 */
static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	unsigned int truesize;

	skb_orphan(skb);

	skb->destructor = mptcp_rfree;
	skb->sk = sk;

	truesize = skb->truesize;
	atomic_add(truesize, &sk->sk_rmem_alloc);
	mptcp_rmem_charge(sk, truesize);
}
/* "inspired" by tcp_data_queue_ofo(), main differences:
* - use mptcp seqs
* - don't cope with sacks
......@@ -267,7 +310,29 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
mptcp_set_owner_r(skb, sk);
}
/* Make sure the MPTCP socket @sk can account @size more receive bytes.
 *
 * If msk->rmem_fwd_alloc already covers @size nothing is done. Otherwise
 * the pages needed for @size are added to msk->rmem_fwd_alloc and charged
 * through __sk_mem_raise_allocated(); when that fails, the same amount is
 * taken from the forward allocation of the subflow @ssk instead.
 *
 * Returns true when the memory is available, false when neither source can
 * provide it (msk->rmem_fwd_alloc is then left unchanged).
 */
static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int amt, amount;

	/* The existing forward allocation is enough, exact fit included:
	 * asking for more memory here could fail under pressure and cause a
	 * needless drop.
	 */
	if (size <= msk->rmem_fwd_alloc)
		return true;

	amt = sk_mem_pages(size);
	amount = amt << SK_MEM_QUANTUM_SHIFT;
	msk->rmem_fwd_alloc += amount;
	if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
		/* fall back to the memory already forward-allocated by the subflow */
		if (ssk->sk_forward_alloc < amount) {
			msk->rmem_fwd_alloc -= amount;
			return false;
		}

		ssk->sk_forward_alloc -= amount;
	}

	return true;
}
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
......@@ -285,15 +350,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
skb_orphan(skb);
/* try to fetch required memory from subflow */
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
if (ssk->sk_forward_alloc < amount)
goto drop;
ssk->sk_forward_alloc -= amount;
sk->sk_forward_alloc += amount;
}
if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
goto drop;
has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
......@@ -313,7 +371,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
if (tail && mptcp_try_coalesce(sk, tail, skb))
return true;
skb_set_owner_r(skb, sk);
mptcp_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
......@@ -908,122 +966,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
df->data_seq + df->data_len == msk->write_seq;
}
static int mptcp_wmem_with_overhead(int size)
{
return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
}
static void __mptcp_wmem_reserve(struct sock *sk, int size)
{
int amount = mptcp_wmem_with_overhead(size);
struct mptcp_sock *msk = mptcp_sk(sk);
WARN_ON_ONCE(msk->wmem_reserved);
if (WARN_ON_ONCE(amount < 0))
amount = 0;
if (amount <= sk->sk_forward_alloc)
goto reserve;
/* under memory pressure try to reserve at most a single page
* otherwise try to reserve the full estimate and fallback
* to a single page before entering the error path
*/
if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
!sk_wmem_schedule(sk, amount)) {
if (amount <= PAGE_SIZE)
goto nomem;
amount = PAGE_SIZE;
if (!sk_wmem_schedule(sk, amount))
goto nomem;
}
reserve:
msk->wmem_reserved = amount;
sk->sk_forward_alloc -= amount;
return;
nomem:
/* we will wait for memory on next allocation */
msk->wmem_reserved = -1;
}
static void __mptcp_update_wmem(struct sock *sk)
static void __mptcp_mem_reclaim_partial(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
lockdep_assert_held_once(&sk->sk_lock.slock);
if (!msk->wmem_reserved)
return;
if (msk->wmem_reserved < 0)
msk->wmem_reserved = 0;
if (msk->wmem_reserved > 0) {
sk->sk_forward_alloc += msk->wmem_reserved;
msk->wmem_reserved = 0;
}
}
static bool mptcp_wmem_alloc(struct sock *sk, int size)
{
struct mptcp_sock *msk = mptcp_sk(sk);
/* check for pre-existing error condition */
if (msk->wmem_reserved < 0)
return false;
if (msk->wmem_reserved >= size)
goto account;
mptcp_data_lock(sk);
if (!sk_wmem_schedule(sk, size)) {
mptcp_data_unlock(sk);
return false;
}
sk->sk_forward_alloc -= size;
msk->wmem_reserved += size;
mptcp_data_unlock(sk);
account:
msk->wmem_reserved -= size;
return true;
}
static void mptcp_wmem_uncharge(struct sock *sk, int size)
{
struct mptcp_sock *msk = mptcp_sk(sk);
if (msk->wmem_reserved < 0)
msk->wmem_reserved = 0;
msk->wmem_reserved += size;
}
static void __mptcp_mem_reclaim_partial(struct sock *sk)
{
lockdep_assert_held_once(&sk->sk_lock.slock);
__mptcp_update_wmem(sk);
__mptcp_rmem_reclaim(sk, reclaimable - 1);
sk_mem_reclaim_partial(sk);
}
static void mptcp_mem_reclaim_partial(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
/* if we are experiencing a transint allocation error,
* the forward allocation memory has been already
* released
*/
if (msk->wmem_reserved < 0)
return;
mptcp_data_lock(sk);
sk->sk_forward_alloc += msk->wmem_reserved;
sk_mem_reclaim_partial(sk);
msk->wmem_reserved = sk->sk_forward_alloc;
sk->sk_forward_alloc = 0;
__mptcp_mem_reclaim_partial(sk);
mptcp_data_unlock(sk);
}
......@@ -1513,8 +1469,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
return NULL;
}
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
struct mptcp_sendmsg_info *info)
static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
{
tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
release_sock(ssk);
......@@ -1577,7 +1532,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
* the last round, release prev_ssk
*/
if (ssk != prev_ssk && prev_ssk)
mptcp_push_release(sk, prev_ssk, &info);
mptcp_push_release(prev_ssk, &info);
if (!ssk)
goto out;
......@@ -1590,7 +1545,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0) {
mptcp_push_release(sk, ssk, &info);
mptcp_push_release(ssk, &info);
goto out;
}
......@@ -1605,7 +1560,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
/* at this point we held the socket lock for the last subflow we used */
if (ssk)
mptcp_push_release(sk, ssk, &info);
mptcp_push_release(ssk, &info);
out:
/* ensure the rtx timer is running */
......@@ -1664,7 +1619,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
/* __mptcp_alloc_tx_skb could have released some wmem and we are
* not going to flush it via release_sock()
*/
__mptcp_update_wmem(sk);
if (copied) {
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
......@@ -1701,7 +1655,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* silently ignore everything else */
msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
lock_sock(sk);
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
......@@ -1749,17 +1703,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
psize = min_t(size_t, psize, msg_data_left(msg));
total_ts = psize + frag_truesize;
if (!mptcp_wmem_alloc(sk, total_ts))
if (!sk_wmem_schedule(sk, total_ts))
goto wait_for_memory;
if (copy_page_from_iter(dfrag->page, offset, psize,
&msg->msg_iter) != psize) {
mptcp_wmem_uncharge(sk, psize + frag_truesize);
ret = -EFAULT;
goto out;
}
/* data successfully copied into the write queue */
sk->sk_forward_alloc -= total_ts;
copied += psize;
dfrag->data_len += psize;
frag_truesize += psize;
......@@ -1956,7 +1910,7 @@ static void __mptcp_update_rmem(struct sock *sk)
return;
atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
sk_mem_uncharge(sk, msk->rmem_released);
mptcp_rmem_uncharge(sk, msk->rmem_released);
WRITE_ONCE(msk->rmem_released, 0);
}
......@@ -2024,7 +1978,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
lock_sock(sk);
if (unlikely(sk->sk_state == TCP_LISTEN)) {
copied = -ENOTCONN;
goto out_err;
......@@ -2504,7 +2458,7 @@ static int __mptcp_init_sock(struct sock *sk)
__skb_queue_head_init(&msk->receive_queue);
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
msk->wmem_reserved = 0;
msk->rmem_fwd_alloc = 0;
WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN;
......@@ -2715,7 +2669,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
sk->sk_prot->destroy(sk);
WARN_ON_ONCE(msk->wmem_reserved);
WARN_ON_ONCE(msk->rmem_fwd_alloc);
WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
......@@ -2948,8 +2902,14 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_receive_queue);
skb_rbtree_purge(&msk->out_of_order_queue);
/* move all the rx fwd alloc into sk_forward_alloc, so that the
 * sk_mem_reclaim_final in inet_sock_destruct() will dispose of it
 */
sk->sk_forward_alloc += msk->rmem_fwd_alloc;
msk->rmem_fwd_alloc = 0;
mptcp_token_destroy(msk);
mptcp_pm_free_anno_list(msk);
}
......@@ -3031,10 +2991,6 @@ static void mptcp_release_cb(struct sock *sk)
if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
__mptcp_error_report(sk);
/* push_pending may touch wmem_reserved, ensure we do the cleanup
* later
*/
__mptcp_update_wmem(sk);
__mptcp_update_rmem(sk);
}
......@@ -3184,6 +3140,11 @@ static void mptcp_shutdown(struct sock *sk, int how)
__mptcp_wr_shutdown(sk);
}
static int mptcp_forward_alloc_get(const struct sock *sk)
{
return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
}
static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
......@@ -3201,6 +3162,7 @@ static struct proto mptcp_prot = {
.hash = mptcp_hash,
.unhash = mptcp_unhash,
.get_port = mptcp_get_port,
.forward_alloc_get = mptcp_forward_alloc_get,
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
......
......@@ -227,7 +227,7 @@ struct mptcp_sock {
u64 ack_seq;
u64 rcv_wnd_sent;
u64 rcv_data_fin_seq;
int wmem_reserved;
int rmem_fwd_alloc;
struct sock *last_snd;
int snd_burst;
int old_wspace;
......@@ -272,19 +272,6 @@ struct mptcp_sock {
char ca_name[TCP_CA_NAME_MAX];
};
#define mptcp_lock_sock(___sk, cb) do { \
struct sock *__sk = (___sk); /* silence macro reuse warning */ \
might_sleep(); \
spin_lock_bh(&__sk->sk_lock.slock); \
if (__sk->sk_lock.owned) \
__lock_sock(__sk); \
cb; \
__sk->sk_lock.owned = 1; \
spin_unlock(&__sk->sk_lock.slock); \
mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \
local_bh_enable(); \
} while (0)
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
......
......@@ -457,7 +457,7 @@ META_COLLECTOR(int_sk_fwd_alloc)
*err = -1;
return;
}
dst->value = sk->sk_forward_alloc;
dst->value = sk_forward_alloc_get(sk);
}
META_COLLECTOR(int_sk_sndbuf)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment