Commit 0eea1030, authored by Balbir Singh, committed by Linus Torvalds

Memory controller improve user interface

Change the interface to use bytes instead of pages.  Page sizes can vary
across platforms and configurations.  A new strategy routine has been added
to the resource counters infrastructure to format the data as desired.

Suggested by David Rientjes, Andrew Morton and Herbert Poetzl

Tested on a UML setup with the config for memory control enabled.

[kamezawa.hiroyu@jp.fujitsu.com: possible race fix in res_counter]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 66e1707b
...@@ -165,11 +165,30 @@ c. Enable CONFIG_CGROUP_MEM_CONT ...@@ -165,11 +165,30 @@ c. Enable CONFIG_CGROUP_MEM_CONT
Since now we're in the 0 cgroup, Since now we're in the 0 cgroup,
We can alter the memory limit: We can alter the memory limit:
# echo -n 6000 > /cgroups/0/memory.limit # echo -n 4M > /cgroups/0/memory.limit_in_bytes
NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
mega or gigabytes.
# cat /cgroups/0/memory.limit_in_bytes
4194304 Bytes
NOTE: The interface has now changed to display the usage in bytes
instead of pages
We can check the usage: We can check the usage:
# cat /cgroups/0/memory.usage # cat /cgroups/0/memory.usage_in_bytes
25 1216512 Bytes
A successful write to this file does not guarantee a successful set of
this limit to the value written into the file. This can be due to a
number of factors, such as rounding up to page boundaries or the total
availability of memory on the system. The user is required to re-read
this file after a write to guarantee the value committed by the kernel.
# echo -n 1 > memory.limit_in_bytes
# cat memory.limit_in_bytes
4096 Bytes
The memory.failcnt field gives the number of times that the cgroup limit was The memory.failcnt field gives the number of times that the cgroup limit was
exceeded. exceeded.
...@@ -206,8 +225,8 @@ cgroup might have some charge associated with it, even though all ...@@ -206,8 +225,8 @@ cgroup might have some charge associated with it, even though all
tasks have migrated away from it. If some pages are still left, after following tasks have migrated away from it. If some pages are still left, after following
the steps listed in sections 4.1 and 4.2, check the Swap Cache usage in the steps listed in sections 4.1 and 4.2, check the Swap Cache usage in
/proc/meminfo to see if the Swap Cache usage is showing up in the /proc/meminfo to see if the Swap Cache usage is showing up in the
cgroups memory.usage counter. A simple test of swapoff -a and swapon -a cgroups memory.usage_in_bytes counter. A simple test of swapoff -a and
should free any pending Swap Cache usage. swapon -a should free any pending Swap Cache usage.
4.4 Choosing what to account -- Page Cache (unmapped) vs RSS (mapped)? 4.4 Choosing what to account -- Page Cache (unmapped) vs RSS (mapped)?
......
...@@ -23,15 +23,15 @@ struct res_counter { ...@@ -23,15 +23,15 @@ struct res_counter {
/* /*
* the current resource consumption level * the current resource consumption level
*/ */
unsigned long usage; unsigned long long usage;
/* /*
* the limit that usage cannot exceed * the limit that usage cannot exceed
*/ */
unsigned long limit; unsigned long long limit;
/* /*
* the number of unsuccessful attempts to consume the resource * the number of unsuccessful attempts to consume the resource
*/ */
unsigned long failcnt; unsigned long long failcnt;
/* /*
* the lock to protect all of the above. * the lock to protect all of the above.
* the routines below consider this to be IRQ-safe * the routines below consider this to be IRQ-safe
...@@ -52,9 +52,11 @@ struct res_counter { ...@@ -52,9 +52,11 @@ struct res_counter {
*/ */
ssize_t res_counter_read(struct res_counter *counter, int member, ssize_t res_counter_read(struct res_counter *counter, int member,
const char __user *buf, size_t nbytes, loff_t *pos); const char __user *buf, size_t nbytes, loff_t *pos,
int (*read_strategy)(unsigned long long val, char *s));
ssize_t res_counter_write(struct res_counter *counter, int member, ssize_t res_counter_write(struct res_counter *counter, int member,
const char __user *buf, size_t nbytes, loff_t *pos); const char __user *buf, size_t nbytes, loff_t *pos,
int (*write_strategy)(char *buf, unsigned long long *val));
/* /*
* the field descriptors. one for each member of res_counter * the field descriptors. one for each member of res_counter
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
void res_counter_init(struct res_counter *counter) void res_counter_init(struct res_counter *counter)
{ {
spin_lock_init(&counter->lock); spin_lock_init(&counter->lock);
counter->limit = (unsigned long)LONG_MAX; counter->limit = (unsigned long long)LLONG_MAX;
} }
int res_counter_charge_locked(struct res_counter *counter, unsigned long val) int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
...@@ -59,8 +59,8 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) ...@@ -59,8 +59,8 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
} }
static inline unsigned long *res_counter_member(struct res_counter *counter, static inline unsigned long long *
int member) res_counter_member(struct res_counter *counter, int member)
{ {
switch (member) { switch (member) {
case RES_USAGE: case RES_USAGE:
...@@ -76,24 +76,30 @@ static inline unsigned long *res_counter_member(struct res_counter *counter, ...@@ -76,24 +76,30 @@ static inline unsigned long *res_counter_member(struct res_counter *counter,
} }
ssize_t res_counter_read(struct res_counter *counter, int member, ssize_t res_counter_read(struct res_counter *counter, int member,
const char __user *userbuf, size_t nbytes, loff_t *pos) const char __user *userbuf, size_t nbytes, loff_t *pos,
int (*read_strategy)(unsigned long long val, char *st_buf))
{ {
unsigned long *val; unsigned long long *val;
char buf[64], *s; char buf[64], *s;
s = buf; s = buf;
val = res_counter_member(counter, member); val = res_counter_member(counter, member);
s += sprintf(s, "%lu\n", *val); if (read_strategy)
s += read_strategy(*val, s);
else
s += sprintf(s, "%llu\n", *val);
return simple_read_from_buffer((void __user *)userbuf, nbytes, return simple_read_from_buffer((void __user *)userbuf, nbytes,
pos, buf, s - buf); pos, buf, s - buf);
} }
ssize_t res_counter_write(struct res_counter *counter, int member, ssize_t res_counter_write(struct res_counter *counter, int member,
const char __user *userbuf, size_t nbytes, loff_t *pos) const char __user *userbuf, size_t nbytes, loff_t *pos,
int (*write_strategy)(char *st_buf, unsigned long long *val))
{ {
int ret; int ret;
char *buf, *end; char *buf, *end;
unsigned long tmp, *val; unsigned long flags;
unsigned long long tmp, *val;
buf = kmalloc(nbytes + 1, GFP_KERNEL); buf = kmalloc(nbytes + 1, GFP_KERNEL);
ret = -ENOMEM; ret = -ENOMEM;
...@@ -106,12 +112,20 @@ ssize_t res_counter_write(struct res_counter *counter, int member, ...@@ -106,12 +112,20 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
goto out_free; goto out_free;
ret = -EINVAL; ret = -EINVAL;
tmp = simple_strtoul(buf, &end, 10);
if (*end != '\0')
goto out_free;
if (write_strategy) {
if (write_strategy(buf, &tmp)) {
goto out_free;
}
} else {
tmp = simple_strtoull(buf, &end, 10);
if (*end != '\0')
goto out_free;
}
spin_lock_irqsave(&counter->lock, flags);
val = res_counter_member(counter, member); val = res_counter_member(counter, member);
*val = tmp; *val = tmp;
spin_unlock_irqrestore(&counter->lock, flags);
ret = nbytes; ret = nbytes;
out_free: out_free:
kfree(buf); kfree(buf);
......
...@@ -302,7 +302,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) ...@@ -302,7 +302,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
* If we created the page_cgroup, we should free it on exceeding * If we created the page_cgroup, we should free it on exceeding
* the cgroup limit. * the cgroup limit.
*/ */
while (res_counter_charge(&mem->res, 1)) { while (res_counter_charge(&mem->res, PAGE_SIZE)) {
if (try_to_free_mem_cgroup_pages(mem)) if (try_to_free_mem_cgroup_pages(mem))
continue; continue;
...@@ -341,7 +341,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm) ...@@ -341,7 +341,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
kfree(pc); kfree(pc);
pc = race_pc; pc = race_pc;
atomic_inc(&pc->ref_cnt); atomic_inc(&pc->ref_cnt);
res_counter_uncharge(&mem->res, 1); res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css); css_put(&mem->css);
goto done; goto done;
} }
...@@ -384,7 +384,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc) ...@@ -384,7 +384,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
css_put(&mem->css); css_put(&mem->css);
page_assign_page_cgroup(page, NULL); page_assign_page_cgroup(page, NULL);
unlock_page_cgroup(page); unlock_page_cgroup(page);
res_counter_uncharge(&mem->res, 1); res_counter_uncharge(&mem->res, PAGE_SIZE);
spin_lock_irqsave(&mem->lru_lock, flags); spin_lock_irqsave(&mem->lru_lock, flags);
list_del_init(&pc->lru); list_del_init(&pc->lru);
...@@ -393,12 +393,26 @@ void mem_cgroup_uncharge(struct page_cgroup *pc) ...@@ -393,12 +393,26 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
} }
} }
static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
struct file *file, char __user *userbuf, size_t nbytes, {
loff_t *ppos) *tmp = memparse(buf, &buf);
if (*buf != '\0')
return -EINVAL;
/*
* Round up the value to the closest page size
*/
*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
return 0;
}
static ssize_t mem_cgroup_read(struct cgroup *cont,
struct cftype *cft, struct file *file,
char __user *userbuf, size_t nbytes, loff_t *ppos)
{ {
return res_counter_read(&mem_cgroup_from_cont(cont)->res, return res_counter_read(&mem_cgroup_from_cont(cont)->res,
cft->private, userbuf, nbytes, ppos); cft->private, userbuf, nbytes, ppos,
NULL);
} }
static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
...@@ -406,17 +420,18 @@ static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, ...@@ -406,17 +420,18 @@ static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
size_t nbytes, loff_t *ppos) size_t nbytes, loff_t *ppos)
{ {
return res_counter_write(&mem_cgroup_from_cont(cont)->res, return res_counter_write(&mem_cgroup_from_cont(cont)->res,
cft->private, userbuf, nbytes, ppos); cft->private, userbuf, nbytes, ppos,
mem_cgroup_write_strategy);
} }
static struct cftype mem_cgroup_files[] = { static struct cftype mem_cgroup_files[] = {
{ {
.name = "usage", .name = "usage_in_bytes",
.private = RES_USAGE, .private = RES_USAGE,
.read = mem_cgroup_read, .read = mem_cgroup_read,
}, },
{ {
.name = "limit", .name = "limit_in_bytes",
.private = RES_LIMIT, .private = RES_LIMIT,
.write = mem_cgroup_write, .write = mem_cgroup_write,
.read = mem_cgroup_read, .read = mem_cgroup_read,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment