Commit 2bb2f5fb authored by Wei Wang's avatar Wei Wang Committed by David S. Miller

net: add new socket option SO_RESERVE_MEM

This socket option provides a mechanism for users to reserve a certain
amount of memory for the socket to use. When this option is set, kernel
charges the user specified amount of memory to memcg, as well as
sk_forward_alloc. This amount of memory is not reclaimable and is
available in sk_forward_alloc for this socket.
With this socket option set, the networking stack spends fewer cycles
doing forward alloc and reclaim, which should lead to better system
performance, at the cost of an amount of pre-allocated and
unreclaimable memory, even under memory pressure.

Note:
This socket option is only available when memory cgroup is enabled, and we
require this reserved memory to be charged to the user's memcg. We hope
this prevents misbehaving users from abusing this feature to reserve a
large amount of memory on certain sockets and causing unfairness for others.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 4075a6a0
...@@ -269,6 +269,7 @@ struct bpf_local_storage; ...@@ -269,6 +269,7 @@ struct bpf_local_storage;
* @sk_omem_alloc: "o" is "option" or "other" * @sk_omem_alloc: "o" is "option" or "other"
* @sk_wmem_queued: persistent queue size * @sk_wmem_queued: persistent queue size
* @sk_forward_alloc: space allocated forward * @sk_forward_alloc: space allocated forward
* @sk_reserved_mem: space reserved and non-reclaimable for the socket
* @sk_napi_id: id of the last napi context to receive data for sk * @sk_napi_id: id of the last napi context to receive data for sk
* @sk_ll_usec: usecs to busypoll when there is no data * @sk_ll_usec: usecs to busypoll when there is no data
* @sk_allocation: allocation mode * @sk_allocation: allocation mode
...@@ -409,6 +410,7 @@ struct sock { ...@@ -409,6 +410,7 @@ struct sock {
#define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_rmem_alloc sk_backlog.rmem_alloc
int sk_forward_alloc; int sk_forward_alloc;
u32 sk_reserved_mem;
#ifdef CONFIG_NET_RX_BUSY_POLL #ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sk_ll_usec; unsigned int sk_ll_usec;
/* ===== mostly read cache line ===== */ /* ===== mostly read cache line ===== */
...@@ -1511,20 +1513,49 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) ...@@ -1511,20 +1513,49 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
skb_pfmemalloc(skb); skb_pfmemalloc(skb);
} }
/* Return how much of the socket's reserved memory is not currently covered
 * by queued write memory (sk_wmem_queued) plus allocated receive memory
 * (sk_rmem_alloc). Never negative: 0 is returned when nothing is reserved
 * or when usage already meets or exceeds the reservation.
 */
static inline int sk_unused_reserved_mem(const struct sock *sk)
{
	int spare;

	/* Fast path: the common case is a socket with no reservation. */
	if (likely(!sk->sk_reserved_mem))
		return 0;

	spare = sk->sk_reserved_mem - sk->sk_wmem_queued -
		atomic_read(&sk->sk_rmem_alloc);
	if (spare > 0)
		return spare;

	return 0;
}
static inline void sk_mem_reclaim(struct sock *sk) static inline void sk_mem_reclaim(struct sock *sk)
{ {
int reclaimable;
if (!sk_has_account(sk)) if (!sk_has_account(sk))
return; return;
if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
__sk_mem_reclaim(sk, sk->sk_forward_alloc); reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
if (reclaimable >= SK_MEM_QUANTUM)
__sk_mem_reclaim(sk, reclaimable);
}
static inline void sk_mem_reclaim_final(struct sock *sk)
{
sk->sk_reserved_mem = 0;
sk_mem_reclaim(sk);
} }
static inline void sk_mem_reclaim_partial(struct sock *sk) static inline void sk_mem_reclaim_partial(struct sock *sk)
{ {
int reclaimable;
if (!sk_has_account(sk)) if (!sk_has_account(sk))
return; return;
if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
__sk_mem_reclaim(sk, sk->sk_forward_alloc - 1); reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
if (reclaimable > SK_MEM_QUANTUM)
__sk_mem_reclaim(sk, reclaimable - 1);
} }
static inline void sk_mem_charge(struct sock *sk, int size) static inline void sk_mem_charge(struct sock *sk, int size)
...@@ -1536,9 +1567,12 @@ static inline void sk_mem_charge(struct sock *sk, int size) ...@@ -1536,9 +1567,12 @@ static inline void sk_mem_charge(struct sock *sk, int size)
static inline void sk_mem_uncharge(struct sock *sk, int size) static inline void sk_mem_uncharge(struct sock *sk, int size)
{ {
int reclaimable;
if (!sk_has_account(sk)) if (!sk_has_account(sk))
return; return;
sk->sk_forward_alloc += size; sk->sk_forward_alloc += size;
reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
/* Avoid a possible overflow. /* Avoid a possible overflow.
* TCP send queues can make this happen, if sk_mem_reclaim() * TCP send queues can make this happen, if sk_mem_reclaim()
...@@ -1547,7 +1581,7 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) ...@@ -1547,7 +1581,7 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
* If we reach 2 MBytes, reclaim 1 MBytes right now, there is * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
* no need to hold that much forward allocation anyway. * no need to hold that much forward allocation anyway.
*/ */
if (unlikely(sk->sk_forward_alloc >= 1 << 21)) if (unlikely(reclaimable >= 1 << 21))
__sk_mem_reclaim(sk, 1 << 20); __sk_mem_reclaim(sk, 1 << 20);
} }
......
...@@ -126,6 +126,8 @@ ...@@ -126,6 +126,8 @@
#define SO_BUF_LOCK 72 #define SO_BUF_LOCK 72
#define SO_RESERVE_MEM 73
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
......
...@@ -947,6 +947,53 @@ void sock_set_mark(struct sock *sk, u32 val) ...@@ -947,6 +947,53 @@ void sock_set_mark(struct sock *sk, u32 val)
} }
EXPORT_SYMBOL(sock_set_mark); EXPORT_SYMBOL(sock_set_mark);
/* Shrink the socket's reservation by @bytes, rounded down to a multiple of
 * SK_MEM_QUANTUM, then let sk_mem_reclaim() hand back whatever forward
 * allocation is no longer protected by the reservation.
 */
static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
	/* Only whole quanta can be released; drop the sub-quantum remainder. */
	const int released = bytes & ~(SK_MEM_QUANTUM - 1);

	/* Releasing more than is reserved indicates a caller bug. */
	WARN_ON(released > sk->sk_reserved_mem);

	sk->sk_reserved_mem -= released;
	sk_mem_reclaim(sk);
}
/* Reserve @bytes of memory for @sk.
 *
 * @bytes is converted to pages with sk_mem_pages(); that many pages are
 * charged to the socket's memcg and added to the protocol's allocation
 * counter, and the same amount (in bytes) is credited to both
 * sk_forward_alloc and sk_reserved_mem.
 *
 * Returns 0 on success or when @bytes is 0, -EOPNOTSUPP when socket memcg
 * accounting is unavailable for this socket or the protocol keeps no memory
 * accounting, and -ENOMEM when the memcg charge fails or the pre-charge
 * would push the protocol past its pressure limit.
 */
static int sock_reserve_memory(struct sock *sk, int bytes)
{
	long allocated;
	bool charged;
	int pages;

	/* Like the other sk_mem_*() helpers, refuse sockets whose protocol
	 * does no memory accounting: sk_memory_allocated_add() and
	 * sk_prot_mem_limits() below depend on the protocol's accounting
	 * state being present.
	 */
	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg ||
	    !sk_has_account(sk))
		return -EOPNOTSUPP;

	if (!bytes)
		return 0;

	pages = sk_mem_pages(bytes);

	/* pre-charge to memcg */
	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!charged)
		return -ENOMEM;

	/* pre-charge to forward_alloc */
	allocated = sk_memory_allocated_add(sk, pages);
	/* If the system goes into memory pressure with this
	 * precharge, give up and return error.
	 */
	if (allocated > sk_prot_mem_limits(sk, 1)) {
		/* Undo both charges so the failed request leaves no trace. */
		sk_memory_allocated_sub(sk, pages);
		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
		return -ENOMEM;
	}

	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;

	return 0;
}
/* /*
* This is meant for all protocols to use and covers goings on * This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic. * at the socket level. Everything here is generic.
...@@ -1367,6 +1414,23 @@ int sock_setsockopt(struct socket *sock, int level, int optname, ...@@ -1367,6 +1414,23 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
~SOCK_BUF_LOCK_MASK); ~SOCK_BUF_LOCK_MASK);
break; break;
case SO_RESERVE_MEM:
{
int delta;
if (val < 0) {
ret = -EINVAL;
break;
}
delta = val - sk->sk_reserved_mem;
if (delta < 0)
sock_release_reserved_memory(sk, -delta);
else
ret = sock_reserve_memory(sk, delta);
break;
}
default: default:
ret = -ENOPROTOOPT; ret = -ENOPROTOOPT;
break; break;
...@@ -1733,6 +1797,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, ...@@ -1733,6 +1797,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
break; break;
case SO_RESERVE_MEM:
v.val = sk->sk_reserved_mem;
break;
default: default:
/* We implement the SO_SNDLOWAT etc to not be settable /* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7). * (1003.1g 7).
...@@ -2045,6 +2113,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) ...@@ -2045,6 +2113,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_dst_pending_confirm = 0; newsk->sk_dst_pending_confirm = 0;
newsk->sk_wmem_queued = 0; newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0; newsk->sk_forward_alloc = 0;
newsk->sk_reserved_mem = 0;
atomic_set(&newsk->sk_drops, 0); atomic_set(&newsk->sk_drops, 0);
newsk->sk_send_head = NULL; newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
......
...@@ -202,7 +202,7 @@ void sk_stream_kill_queues(struct sock *sk) ...@@ -202,7 +202,7 @@ void sk_stream_kill_queues(struct sock *sk)
WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
/* Account for returned memory. */ /* Account for returned memory. */
sk_mem_reclaim(sk); sk_mem_reclaim_final(sk);
WARN_ON(sk->sk_wmem_queued); WARN_ON(sk->sk_wmem_queued);
WARN_ON(sk->sk_forward_alloc); WARN_ON(sk->sk_forward_alloc);
......
...@@ -135,7 +135,7 @@ void inet_sock_destruct(struct sock *sk) ...@@ -135,7 +135,7 @@ void inet_sock_destruct(struct sock *sk)
__skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_error_queue); __skb_queue_purge(&sk->sk_error_queue);
sk_mem_reclaim(sk); sk_mem_reclaim_final(sk);
if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
pr_err("Attempt to release TCP socket in state %d %p\n", pr_err("Attempt to release TCP socket in state %d %p\n",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment