Commit 37e84351, authored by Vladimir Davydov, committed by Linus Torvalds

mm: memcontrol: charge swap to cgroup2

This patchset introduces swap accounting to cgroup2.

This patch (of 7):

In the legacy hierarchy we charge memsw, which is dubious, because:

 - memsw.limit must be >= memory.limit, so it is impossible to limit
   swap usage less than memory usage. Taking into account the fact that
   the primary limiting mechanism in the unified hierarchy is
   memory.high while memory.limit is either left unset or set to a very
   large value, moving memsw.limit knob to the unified hierarchy would
   effectively make it impossible to limit swap usage according to the
   user preference.

 - memsw.usage != memory.usage + swap.usage, because a page occupying
   both swap entry and a swap cache page is charged only once to memsw
   counter. As a result, it is possible to effectively eat up to
   memory.limit of memory pages *and* memsw.limit of swap entries, which
   looks unexpected.

That said, we should provide a different swap limiting mechanism for
cgroup2.

This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup.  It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.

The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.

Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e.  when all references to a swap entry, including the one
taken by swap cache, are gone.  This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry.  At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).

Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 0b8f73e1
...@@ -163,6 +163,7 @@ struct mem_cgroup { ...@@ -163,6 +163,7 @@ struct mem_cgroup {
/* Accounted resources */ /* Accounted resources */
struct page_counter memory; struct page_counter memory;
struct page_counter swap;
/* Legacy consumer-oriented counters */ /* Legacy consumer-oriented counters */
struct page_counter memsw; struct page_counter memsw;
......
...@@ -368,11 +368,17 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) ...@@ -368,11 +368,17 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
#endif #endif
#ifdef CONFIG_MEMCG_SWAP #ifdef CONFIG_MEMCG_SWAP
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
extern void mem_cgroup_uncharge_swap(swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
#else #else
static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{ {
} }
/*
 * Stub used when CONFIG_MEMCG_SWAP is not set: there is no swap accounting,
 * so "charging" a swap entry always succeeds and returns 0.
 */
static inline int mem_cgroup_try_charge_swap(struct page *page,
swp_entry_t entry)
{
return 0;
}
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
{ {
} }
......
...@@ -1220,7 +1220,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) ...@@ -1220,7 +1220,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont(":"); pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue; continue;
pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i))); K(mem_cgroup_read_stat(iter, i)));
...@@ -1259,9 +1259,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) ...@@ -1259,9 +1259,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
limit = memcg->memory.limit; limit = memcg->memory.limit;
if (mem_cgroup_swappiness(memcg)) { if (mem_cgroup_swappiness(memcg)) {
unsigned long memsw_limit; unsigned long memsw_limit;
unsigned long swap_limit;
memsw_limit = memcg->memsw.limit; memsw_limit = memcg->memsw.limit;
limit = min(limit + total_swap_pages, memsw_limit); swap_limit = memcg->swap.limit;
swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
limit = min(limit + swap_limit, memsw_limit);
} }
return limit; return limit;
} }
...@@ -4201,11 +4204,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ...@@ -4201,11 +4204,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent && parent->use_hierarchy) { if (parent && parent->use_hierarchy) {
memcg->use_hierarchy = true; memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else { } else {
page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL); page_counter_init(&memcg->tcpmem, NULL);
...@@ -5224,7 +5229,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, ...@@ -5224,7 +5229,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
if (page->mem_cgroup) if (page->mem_cgroup)
goto out; goto out;
if (do_memsw_account()) { if (do_swap_account) {
swp_entry_t ent = { .val = page_private(page), }; swp_entry_t ent = { .val = page_private(page), };
unsigned short id = lookup_swap_cgroup_id(ent); unsigned short id = lookup_swap_cgroup_id(ent);
...@@ -5677,26 +5682,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) ...@@ -5677,26 +5682,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
memcg_check_events(memcg, page); memcg_check_events(memcg, page);
} }
/**
 * mem_cgroup_try_charge_swap - try charging a swap entry
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @entry to the memcg that @page belongs to.
 *
 * Nothing is charged (and 0 is returned) when the memory controller is not
 * on the default hierarchy, when swap accounting is disabled, or when @page
 * has no memcg attached.
 *
 * For a non-root memcg one page is charged against memcg->swap; the root
 * memcg skips the counter charge but is still recorded.  On success the
 * memcg id is recorded for @entry, the swap statistics are updated and a
 * css reference is taken (mem_cgroup_uncharge_swap() does the matching
 * css_put()).
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
struct mem_cgroup *memcg;
struct page_counter *counter;
unsigned short oldid;
/* The swap counter is only used on the default hierarchy with swap accounting on */
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
return 0;
memcg = page->mem_cgroup;
/* Readahead page, never charged */
if (!memcg)
return 0;
/* The root memcg is never charged against the swap counter */
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, 1, &counter))
return -ENOMEM;
/* Remember the owner of @entry; the entry must not already have one */
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
VM_BUG_ON_PAGE(oldid, page);
mem_cgroup_swap_statistics(memcg, true);
/* Reference held on behalf of the swap entry */
css_get(&memcg->css);
return 0;
}
/** /**
* mem_cgroup_uncharge_swap - uncharge a swap entry * mem_cgroup_uncharge_swap - uncharge a swap entry
* @entry: swap entry to uncharge * @entry: swap entry to uncharge
* *
* Drop the memsw charge associated with @entry. * Drop the swap charge associated with @entry.
*/ */
void mem_cgroup_uncharge_swap(swp_entry_t entry) void mem_cgroup_uncharge_swap(swp_entry_t entry)
{ {
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
unsigned short id; unsigned short id;
if (!do_memsw_account()) if (!do_swap_account)
return; return;
id = swap_cgroup_record(entry, 0); id = swap_cgroup_record(entry, 0);
rcu_read_lock(); rcu_read_lock();
memcg = mem_cgroup_from_id(id); memcg = mem_cgroup_from_id(id);
if (memcg) { if (memcg) {
if (!mem_cgroup_is_root(memcg)) if (!mem_cgroup_is_root(memcg)) {
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->swap, 1);
else
page_counter_uncharge(&memcg->memsw, 1); page_counter_uncharge(&memcg->memsw, 1);
}
mem_cgroup_swap_statistics(memcg, false); mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css); css_put(&memcg->css);
} }
...@@ -5720,6 +5765,63 @@ static int __init enable_swap_account(char *s) ...@@ -5720,6 +5765,63 @@ static int __init enable_swap_account(char *s)
} }
__setup("swapaccount=", enable_swap_account); __setup("swapaccount=", enable_swap_account);
static u64 swap_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
static int swap_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
unsigned long max = READ_ONCE(memcg->swap.limit);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
else
seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
return 0;
}
/*
 * Write handler for the "swap.max" file: parse the user-supplied limit
 * (a size, or the keyword "max") and apply it to the cgroup's swap counter
 * under memcg_limit_mutex.
 *
 * Returns @nbytes on success, or the error reported by
 * page_counter_memparse() / page_counter_limit().
 */
static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	ret = page_counter_memparse(strstrip(buf), "max", &nr_pages);
	if (ret)
		return ret;

	mutex_lock(&memcg_limit_mutex);
	ret = page_counter_limit(&memcg->swap, nr_pages);
	mutex_unlock(&memcg_limit_mutex);

	if (!ret)
		return nbytes;
	return ret;
}
/*
 * Swap control files for the default (unified) hierarchy; registered with
 * cgroup_add_dfl_cftypes() from mem_cgroup_swap_init().  Both files are
 * flagged CFTYPE_NOT_ON_ROOT.
 */
static struct cftype swap_files[] = {
{
/* Current swap usage in bytes */
.name = "swap.current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = swap_current_read,
},
{
/* Swap limit in bytes, or "max" for no limit */
.name = "swap.max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = swap_max_show,
.write = swap_max_write,
},
{ } /* terminate */
};
static struct cftype memsw_cgroup_files[] = { static struct cftype memsw_cgroup_files[] = {
{ {
.name = "memsw.usage_in_bytes", .name = "memsw.usage_in_bytes",
...@@ -5751,6 +5853,8 @@ static int __init mem_cgroup_swap_init(void) ...@@ -5751,6 +5853,8 @@ static int __init mem_cgroup_swap_init(void)
{ {
if (!mem_cgroup_disabled() && really_do_swap_account) { if (!mem_cgroup_disabled() && really_do_swap_account) {
do_swap_account = 1; do_swap_account = 1;
WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
memsw_cgroup_files)); memsw_cgroup_files));
} }
......
...@@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) ...@@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (!swap.val) if (!swap.val)
goto redirty; goto redirty;
if (mem_cgroup_try_charge_swap(page, swap))
goto free_swap;
/* /*
* Add inode to shmem_unuse()'s list of swapped-out inodes, * Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the page is * if it's not already there. Do it now before the page is
...@@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) ...@@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
} }
mutex_unlock(&shmem_swaplist_mutex); mutex_unlock(&shmem_swaplist_mutex);
free_swap:
swapcache_free(swap); swapcache_free(swap);
redirty: redirty:
set_page_dirty(page); set_page_dirty(page);
......
...@@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list) ...@@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list)
if (!entry.val) if (!entry.val)
return 0; return 0;
if (mem_cgroup_try_charge_swap(page, entry)) {
swapcache_free(entry);
return 0;
}
if (unlikely(PageTransHuge(page))) if (unlikely(PageTransHuge(page)))
if (unlikely(split_huge_page_to_list(page, list))) { if (unlikely(split_huge_page_to_list(page, list))) {
swapcache_free(entry); swapcache_free(entry);
......
...@@ -785,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, ...@@ -785,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
count--; count--;
} }
if (!count)
mem_cgroup_uncharge_swap(entry);
usage = count | has_cache; usage = count | has_cache;
p->swap_map[offset] = usage; p->swap_map[offset] = usage;
/* free if no reference */ /* free if no reference */
if (!usage) { if (!usage) {
mem_cgroup_uncharge_swap(entry);
dec_cluster_info_page(p, p->cluster_info, offset); dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit) if (offset < p->lowest_bit)
p->lowest_bit = offset; p->lowest_bit = offset;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment