Commit 0a6d10db authored by David S. Miller's avatar David S. Miller

Merge branch 'pernet_sysctls'

Eric W. Biederman says:

====================
ipv4: tcp_memcontrol and userns sysctls

While looking into allowing the ipv4 sysctls to be used in a network
namespace I stumbled upon the mess that is tcp_memcontrol.

I remove the dead code, broken code, and excessive abstraction in the
tcp_memcontrols then I clean up up and allow in the user namespace the
per net ipv4 sysctls.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 3167fe98 fd2d5356
......@@ -71,7 +71,6 @@ struct netns_ipv4 {
int sysctl_tcp_ecn;
kgid_t sysctl_ping_group_range[2];
long sysctl_tcp_mem[3];
atomic_t dev_addr_genid;
......
......@@ -1036,10 +1036,10 @@ enum cg_proto_flags {
struct cg_proto {
void (*enter_memory_pressure)(struct sock *sk);
struct res_counter *memory_allocated; /* Current allocated memory. */
struct percpu_counter *sockets_allocated; /* Current number of sockets. */
int *memory_pressure;
long *sysctl_mem;
struct res_counter memory_allocated; /* Current allocated memory. */
struct percpu_counter sockets_allocated; /* Current number of sockets. */
int memory_pressure;
long sysctl_mem[3];
unsigned long flags;
/*
* memcg field is used to find which memcg we belong directly
......@@ -1135,9 +1135,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
return false;
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
return !!*sk->sk_cgrp->memory_pressure;
return !!sk->sk_cgrp->memory_pressure;
return !!*sk->sk_prot->memory_pressure;
return !!sk->sk_prot->memory_pressure;
}
static inline void sk_leave_memory_pressure(struct sock *sk)
......@@ -1155,8 +1155,8 @@ static inline void sk_leave_memory_pressure(struct sock *sk)
struct proto *prot = sk->sk_prot;
for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
if (*cg_proto->memory_pressure)
*cg_proto->memory_pressure = 0;
if (cg_proto->memory_pressure)
cg_proto->memory_pressure = 0;
}
}
......@@ -1192,7 +1192,7 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
struct res_counter *fail;
int ret;
ret = res_counter_charge_nofail(prot->memory_allocated,
ret = res_counter_charge_nofail(&prot->memory_allocated,
amt << PAGE_SHIFT, &fail);
if (ret < 0)
*parent_status = OVER_LIMIT;
......@@ -1201,13 +1201,13 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
unsigned long amt)
{
res_counter_uncharge(prot->memory_allocated, amt << PAGE_SHIFT);
res_counter_uncharge(&prot->memory_allocated, amt << PAGE_SHIFT);
}
static inline u64 memcg_memory_allocated_read(struct cg_proto *prot)
{
u64 ret;
ret = res_counter_read_u64(prot->memory_allocated, RES_USAGE);
ret = res_counter_read_u64(&prot->memory_allocated, RES_USAGE);
return ret >> PAGE_SHIFT;
}
......@@ -1255,7 +1255,7 @@ static inline void sk_sockets_allocated_dec(struct sock *sk)
struct cg_proto *cg_proto = sk->sk_cgrp;
for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
percpu_counter_dec(cg_proto->sockets_allocated);
percpu_counter_dec(&cg_proto->sockets_allocated);
}
percpu_counter_dec(prot->sockets_allocated);
......@@ -1269,7 +1269,7 @@ static inline void sk_sockets_allocated_inc(struct sock *sk)
struct cg_proto *cg_proto = sk->sk_cgrp;
for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
percpu_counter_inc(cg_proto->sockets_allocated);
percpu_counter_inc(&cg_proto->sockets_allocated);
}
percpu_counter_inc(prot->sockets_allocated);
......@@ -1281,7 +1281,7 @@ sk_sockets_allocated_read_positive(struct sock *sk)
struct proto *prot = sk->sk_prot;
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
return percpu_counter_read_positive(sk->sk_cgrp->sockets_allocated);
return percpu_counter_read_positive(&sk->sk_cgrp->sockets_allocated);
return percpu_counter_read_positive(prot->sockets_allocated);
}
......
......@@ -259,6 +259,7 @@ extern int sysctl_tcp_max_orphans;
extern int sysctl_tcp_fack;
extern int sysctl_tcp_reordering;
extern int sysctl_tcp_dsack;
extern long sysctl_tcp_mem[3];
extern int sysctl_tcp_wmem[3];
extern int sysctl_tcp_rmem[3];
extern int sysctl_tcp_app_win;
......@@ -348,8 +349,6 @@ extern struct proto tcp_prot;
#define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
#define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
void tcp_init_mem(struct net *net);
void tcp_tasklet_init(void);
void tcp_v4_err(struct sk_buff *skb, u32);
......
#ifndef _TCP_MEMCG_H
#define _TCP_MEMCG_H
struct tcp_memcontrol {
struct cg_proto cg_proto;
/* per-cgroup tcp memory pressure knobs */
struct res_counter tcp_memory_allocated;
struct percpu_counter tcp_sockets_allocated;
/* those two are read-mostly, leave them at the end */
long tcp_prot_mem[3];
int tcp_memory_pressure;
};
struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
void tcp_destroy_cgroup(struct mem_cgroup *memcg);
unsigned long long tcp_max_memory(const struct mem_cgroup *memcg);
void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
#endif /* _TCP_MEMCG_H */
......@@ -311,7 +311,7 @@ struct mem_cgroup {
atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
/* analogous to slab_common's slab_caches list. per-memcg */
......@@ -550,13 +550,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
return &memcg->tcp_mem.cg_proto;
return &memcg->tcp_mem;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
if (!memcg_proto_activated(&memcg->tcp_mem))
return;
static_key_slow_dec(&memcg_socket_limit_enabled);
}
......
......@@ -1697,8 +1697,6 @@ static int __init inet_init(void)
ip_static_sysctl_init();
#endif
tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
/*
* Add all the base protocols.
*/
......
......@@ -200,49 +200,6 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}
static int ipv4_tcp_mem(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned long vec[3];
struct net *net = current->nsproxy->net_ns;
#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg;
#endif
struct ctl_table tmp = {
.data = &vec,
.maxlen = sizeof(vec),
.mode = ctl->mode,
};
if (!write) {
ctl->data = &net->ipv4.sysctl_tcp_mem;
return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
}
ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
if (ret)
return ret;
#ifdef CONFIG_MEMCG_KMEM
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
tcp_prot_mem(memcg, vec[0], 0);
tcp_prot_mem(memcg, vec[1], 1);
tcp_prot_mem(memcg, vec[2], 2);
rcu_read_unlock();
#endif
net->ipv4.sysctl_tcp_mem[0] = vec[0];
net->ipv4.sysctl_tcp_mem[1] = vec[1];
net->ipv4.sysctl_tcp_mem[2] = vec[2];
return 0;
}
static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
......@@ -556,6 +513,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_mem",
.maxlen = sizeof(sysctl_tcp_mem),
.data = &sysctl_tcp_mem,
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{
.procname = "tcp_wmem",
.data = &sysctl_tcp_wmem,
......@@ -865,12 +829,6 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = ipv4_local_port_range,
},
{
.procname = "tcp_mem",
.maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem),
.mode = 0644,
.proc_handler = ipv4_tcp_mem,
},
{ }
};
......@@ -880,32 +838,15 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
table = ipv4_net_table;
if (!net_eq(net, &init_net)) {
int i;
table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
if (table == NULL)
goto err_alloc;
table[0].data =
&net->ipv4.sysctl_icmp_echo_ignore_all;
table[1].data =
&net->ipv4.sysctl_icmp_echo_ignore_broadcasts;
table[2].data =
&net->ipv4.sysctl_icmp_ignore_bogus_error_responses;
table[3].data =
&net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr;
table[4].data =
&net->ipv4.sysctl_icmp_ratelimit;
table[5].data =
&net->ipv4.sysctl_icmp_ratemask;
table[6].data =
&net->ipv4.sysctl_ping_group_range;
table[7].data =
&net->ipv4.sysctl_tcp_ecn;
table[8].data =
&net->ipv4.sysctl_local_ports.range;
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
table[0].procname = NULL;
/* Update the variables to point into the current struct net */
for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
table[i].data += (void *)net - (void *)&init_net;
}
/*
......@@ -922,8 +863,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
net->ipv4.sysctl_local_ports.range[0] = 32768;
net->ipv4.sysctl_local_ports.range[1] = 61000;
tcp_init_mem(net);
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (net->ipv4.ipv4_hdr == NULL)
goto err_reg;
......
......@@ -288,9 +288,11 @@ int sysctl_tcp_min_tso_segs __read_mostly = 2;
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);
......@@ -3097,13 +3099,13 @@ static int __init set_thash_entries(char *str)
}
__setup("thash_entries=", set_thash_entries);
void tcp_init_mem(struct net *net)
static void tcp_init_mem(void)
{
unsigned long limit = nr_free_buffer_pages() / 8;
limit = max(limit, 128UL);
net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
net->ipv4.sysctl_tcp_mem[1] = limit;
net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
sysctl_tcp_mem[0] = limit / 4 * 3;
sysctl_tcp_mem[1] = limit;
sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
}
void __init tcp_init(void)
......@@ -3165,7 +3167,7 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans = cnt / 2;
sysctl_max_syn_backlog = max(128, cnt / 256);
tcp_init_mem(&init_net);
tcp_init_mem();
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
max_wshare = min(4UL*1024*1024, limit);
......
......@@ -2749,6 +2749,7 @@ struct proto tcp_prot = {
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
......
......@@ -6,15 +6,10 @@
#include <linux/memcontrol.h>
#include <linux/module.h>
static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
{
return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
}
static void memcg_tcp_enter_memory_pressure(struct sock *sk)
{
if (sk->sk_cgrp->memory_pressure)
*sk->sk_cgrp->memory_pressure = 1;
sk->sk_cgrp->memory_pressure = 1;
}
EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
......@@ -27,34 +22,24 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
*/
struct res_counter *res_parent = NULL;
struct cg_proto *cg_proto, *parent_cg;
struct tcp_memcontrol *tcp;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct net *net = current->nsproxy->net_ns;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return 0;
tcp = tcp_from_cgproto(cg_proto);
tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
tcp->tcp_memory_pressure = 0;
cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
cg_proto->memory_pressure = 0;
cg_proto->memcg = memcg;
parent_cg = tcp_prot.proto_cgroup(parent);
if (parent_cg)
res_parent = parent_cg->memory_allocated;
res_counter_init(&tcp->tcp_memory_allocated, res_parent);
percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
res_parent = &parent_cg->memory_allocated;
cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
cg_proto->sysctl_mem = tcp->tcp_prot_mem;
cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
cg_proto->memcg = memcg;
res_counter_init(&cg_proto->memory_allocated, res_parent);
percpu_counter_init(&cg_proto->sockets_allocated, 0);
return 0;
}
......@@ -63,21 +48,17 @@ EXPORT_SYMBOL(tcp_init_cgroup);
void tcp_destroy_cgroup(struct mem_cgroup *memcg)
{
struct cg_proto *cg_proto;
struct tcp_memcontrol *tcp;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return;
tcp = tcp_from_cgproto(cg_proto);
percpu_counter_destroy(&tcp->tcp_sockets_allocated);
percpu_counter_destroy(&cg_proto->sockets_allocated);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
{
struct net *net = current->nsproxy->net_ns;
struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
u64 old_lim;
int i;
......@@ -90,16 +71,14 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
if (val > RES_COUNTER_MAX)
val = RES_COUNTER_MAX;
tcp = tcp_from_cgproto(cg_proto);
old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val);
old_lim = res_counter_read_u64(&cg_proto->memory_allocated, RES_LIMIT);
ret = res_counter_set_limit(&cg_proto->memory_allocated, val);
if (ret)
return ret;
for (i = 0; i < 3; i++)
tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
net->ipv4.sysctl_tcp_mem[i]);
cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT,
sysctl_tcp_mem[i]);
if (val == RES_COUNTER_MAX)
clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
......@@ -156,28 +135,24 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
{
struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return default_val;
tcp = tcp_from_cgproto(cg_proto);
return res_counter_read_u64(&tcp->tcp_memory_allocated, type);
return res_counter_read_u64(&cg_proto->memory_allocated, type);
}
static u64 tcp_read_usage(struct mem_cgroup *memcg)
{
struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
tcp = tcp_from_cgproto(cg_proto);
return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);
}
static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
......@@ -205,54 +180,25 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
{
struct mem_cgroup *memcg;
struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
memcg = mem_cgroup_from_css(css);
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return 0;
tcp = tcp_from_cgproto(cg_proto);
switch (event) {
case RES_MAX_USAGE:
res_counter_reset_max(&tcp->tcp_memory_allocated);
res_counter_reset_max(&cg_proto->memory_allocated);
break;
case RES_FAILCNT:
res_counter_reset_failcnt(&tcp->tcp_memory_allocated);
res_counter_reset_failcnt(&cg_proto->memory_allocated);
break;
}
return 0;
}
unsigned long long tcp_max_memory(const struct mem_cgroup *memcg)
{
struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg);
if (!cg_proto)
return 0;
tcp = tcp_from_cgproto(cg_proto);
return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
}
void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
{
struct tcp_memcontrol *tcp;
struct cg_proto *cg_proto;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
return;
tcp = tcp_from_cgproto(cg_proto);
tcp->tcp_prot_mem[idx] = val;
}
static struct cftype tcp_files[] = {
{
.name = "kmem.tcp.limit_in_bytes",
......
......@@ -865,8 +865,6 @@ static int __init inet6_init(void)
if (err)
goto out_sock_register_fail;
tcpv6_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
/*
* ipngwg API draft makes clear that the correct semantics
* for TCP and UDP is to consider one TCP and UDP instance
......
......@@ -1929,6 +1929,7 @@ struct proto tcpv6_prot = {
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment