Commit 185efc0f, authored by Johannes Weiner, committed by Linus Torvalds

memcg: Revert "memcg: add memory.vmscan_stat"

Revert the post-3.0 commit 82f9d486 ("memcg: add
memory.vmscan_stat").

The implementation of per-memcg reclaim statistics violates how memcg
hierarchies usually behave: hierarchically.

The reclaim statistics are accounted to child memcgs and the parent
hitting the limit, but not to hierarchy levels in between.  Usually,
hierarchical statistics are perfectly recursive, with each level
representing the sum of itself and all its children.

Since this exports statistics to userspace, this may lead to confusion
and problems with changing things after the release, so revert it now,
we can try again later.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent a4d3e9e7
......@@ -380,7 +380,7 @@ will be charged as a new owner of it.
5.2 stat file
5.2.1 memory.stat file includes following statistics
memory.stat file includes following statistics
# per-memory cgroup local status
cache - # of bytes of page cache memory.
......@@ -438,89 +438,6 @@ Note:
file_mapped is accounted only when the memory cgroup is owner of page
cache.)
5.2.2 memory.vmscan_stat
memory.vmscan_stat includes statistics about memory scanning, freeing and
reclaiming. The statistics show memory scanning information since
memory cgroup creation and can be reset to 0 by writing 0, e.g.:
#echo 0 > ../memory.vmscan_stat
This file contains following statistics.
[param]_[file_or_anon]_pages_by_[reason]_[under_hierarchy]
[param]_elapsed_ns_by_[reason]_[under_hierarchy]
For example,
scanned_file_pages_by_limit indicates the number of scanned
file pages at vmscan.
Now, 3 parameters are supported
scanned - the number of pages scanned by vmscan
rotated - the number of pages activated at vmscan
freed - the number of pages freed by vmscan
If "rotated" is high against scanned/freed, the memcg seems busy.
Now, 2 reasons are supported
limit - the memory cgroup's limit
system - global memory pressure + softlimit
(global memory pressure not under softlimit is not handled now)
When under_hierarchy is added in the tail, the number indicates the
total memcg scan of its children and itself.
elapsed_ns is the elapsed time in nanoseconds. This may include sleep time
and does not indicate CPU usage, so please take it as showing latency
only.
Here is an example.
# cat /cgroup/memory/A/memory.vmscan_stat
scanned_pages_by_limit 9471864
scanned_anon_pages_by_limit 6640629
scanned_file_pages_by_limit 2831235
rotated_pages_by_limit 4243974
rotated_anon_pages_by_limit 3971968
rotated_file_pages_by_limit 272006
freed_pages_by_limit 2318492
freed_anon_pages_by_limit 962052
freed_file_pages_by_limit 1356440
elapsed_ns_by_limit 351386416101
scanned_pages_by_system 0
scanned_anon_pages_by_system 0
scanned_file_pages_by_system 0
rotated_pages_by_system 0
rotated_anon_pages_by_system 0
rotated_file_pages_by_system 0
freed_pages_by_system 0
freed_anon_pages_by_system 0
freed_file_pages_by_system 0
elapsed_ns_by_system 0
scanned_pages_by_limit_under_hierarchy 9471864
scanned_anon_pages_by_limit_under_hierarchy 6640629
scanned_file_pages_by_limit_under_hierarchy 2831235
rotated_pages_by_limit_under_hierarchy 4243974
rotated_anon_pages_by_limit_under_hierarchy 3971968
rotated_file_pages_by_limit_under_hierarchy 272006
freed_pages_by_limit_under_hierarchy 2318492
freed_anon_pages_by_limit_under_hierarchy 962052
freed_file_pages_by_limit_under_hierarchy 1356440
elapsed_ns_by_limit_under_hierarchy 351386416101
scanned_pages_by_system_under_hierarchy 0
scanned_anon_pages_by_system_under_hierarchy 0
scanned_file_pages_by_system_under_hierarchy 0
rotated_pages_by_system_under_hierarchy 0
rotated_anon_pages_by_system_under_hierarchy 0
rotated_file_pages_by_system_under_hierarchy 0
freed_pages_by_system_under_hierarchy 0
freed_anon_pages_by_system_under_hierarchy 0
freed_file_pages_by_system_under_hierarchy 0
elapsed_ns_by_system_under_hierarchy 0
5.3 swappiness
Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
......
......@@ -39,16 +39,6 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct mem_cgroup *mem_cont,
int active, int file);
/*
 * Per-invocation record of what one memcg reclaim pass did.  The caller
 * fills in mem/root/context, vmscan adds to the counters while it runs,
 * and the record is then folded into the per-memcg scan statistics.
 *
 * For the two-element arrays, index 0 counts anon pages and index 1
 * counts file pages.
 */
struct memcg_scanrecord {
struct mem_cgroup *mem; /* scanned memory cgroup */
struct mem_cgroup *root; /* scan target hierarchy root */
int context; /* scanning context (see memcontrol.c) */
unsigned long nr_scanned[2]; /* the number of scanned pages */
unsigned long nr_rotated[2]; /* the number of rotated pages */
unsigned long nr_freed[2]; /* the number of freed pages */
unsigned long elapsed; /* nsec of time elapsed while scanning */
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
* All "charge" functions with gfp_mask should use GFP_KERNEL or
......@@ -127,15 +117,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page);
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct memcg_scanrecord *rec);
extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
struct memcg_scanrecord *rec,
unsigned long *nr_scanned);
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
extern int do_swap_account;
#endif
......
......@@ -252,6 +252,12 @@ static inline void lru_cache_add_file(struct page *page)
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, int mode, int file);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap);
extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
......
......@@ -204,50 +204,6 @@ struct mem_cgroup_eventfd_list {
static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
/*
 * Reason a memcg is being scanned.  Only the values below NR_SCAN_CONTEXT
 * index the statistics arrays; anything at or above it is dropped by
 * mem_cgroup_record_scanstat().
 */
enum {
	SCAN_BY_LIMIT = 0,	/* neither shrink nor soft-limit reclaim */
	SCAN_BY_SYSTEM = 1,	/* soft-limit reclaim */
	NR_SCAN_CONTEXT = 2,	/* number of recorded contexts */
	SCAN_BY_SHRINK = 3,	/* not recorded now */
};
/*
 * Slot indices inside one row of scan statistics.  Each page counter comes
 * as a total followed by its anon and file parts; ELAPSED holds time in
 * nanoseconds.  The order must match scanstat_string[].
 */
enum {
	SCAN = 0,
	SCAN_ANON = 1,
	SCAN_FILE = 2,
	ROTATE = 3,
	ROTATE_ANON = 4,
	ROTATE_FILE = 5,
	FREED = 6,
	FREED_ANON = 7,
	FREED_FILE = 8,
	ELAPSED = 9,
	NR_SCANSTATS = 10,	/* number of slots, not a slot itself */
};
/*
 * Per-memcg LRU scan statistics, one row of NR_SCANSTATS counters for
 * each recorded scan context.
 */
struct scanstat {
/* taken around every update and around the reset of both arrays */
spinlock_t lock;
/* accumulated when this memcg was the one being scanned */
unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
/* accumulated when this memcg was the root of the scanned hierarchy */
unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
};
/*
 * Base names of the statistics, in the same order as the SCAN..ELAPSED
 * indices.  The reader appends the "_by_<reason>" and optional
 * "_under_hierarchy" suffixes to build the exported key.
 *
 * File-local and fully read-only: the table is only looked up from this
 * file, so it does not need external linkage or writable pointers.
 */
static const char * const scanstat_string[NR_SCANSTATS] = {
	"scanned_pages",
	"scanned_anon_pages",
	"scanned_file_pages",
	"rotated_pages",
	"rotated_anon_pages",
	"rotated_file_pages",
	"freed_pages",
	"freed_anon_pages",
	"freed_file_pages",
	"elapsed_ns",
};
/*
 * Suffixes appended to a scanstat_string[] entry to form the key names
 * reported through memory.vmscan_stat.
 */
#define SCANSTAT_WORD_LIMIT "_by_limit"
#define SCANSTAT_WORD_SYSTEM "_by_system"
#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
......@@ -313,8 +269,7 @@ struct mem_cgroup {
/* For oom notifier event fd */
struct list_head oom_notify;
/* For recording LRU-scan statistics */
struct scanstat scanstat;
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
......@@ -1678,44 +1633,6 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
}
#endif
/*
 * Add the counters of one scan record to a row of statistics.
 *
 * @stats: row of NR_SCANSTATS counters, indexed by SCAN..ELAPSED
 * @rec:   record to fold in; element 0 of each pair is added to the
 *         *_ANON slot and element 1 to the *_FILE slot
 *
 * The caller is responsible for any locking of @stats.
 */
static void __mem_cgroup_record_scanstat(unsigned long *stats,
					 struct memcg_scanrecord *rec)
{
	const unsigned long *scanned = rec->nr_scanned;
	const unsigned long *rotated = rec->nr_rotated;
	const unsigned long *freed = rec->nr_freed;

	stats[SCAN_ANON] += scanned[0];
	stats[SCAN_FILE] += scanned[1];
	stats[SCAN] += scanned[0] + scanned[1];

	stats[ROTATE_ANON] += rotated[0];
	stats[ROTATE_FILE] += rotated[1];
	stats[ROTATE] += rotated[0] + rotated[1];

	stats[FREED_ANON] += freed[0];
	stats[FREED_FILE] += freed[1];
	stats[FREED] += freed[0] + freed[1];

	stats[ELAPSED] += rec->elapsed;
}
static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
{
struct mem_cgroup *mem;
int context = rec->context;
if (context >= NR_SCAN_CONTEXT)
return;
mem = rec->mem;
spin_lock(&mem->scanstat.lock);
__mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
spin_unlock(&mem->scanstat.lock);
mem = rec->root;
spin_lock(&mem->scanstat.lock);
__mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
spin_unlock(&mem->scanstat.lock);
}
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
......@@ -1740,9 +1657,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
struct memcg_scanrecord rec;
unsigned long excess;
unsigned long scanned;
unsigned long nr_scanned;
excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
......@@ -1750,15 +1666,6 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
if (!check_soft && !shrink && root_mem->memsw_is_minimum)
noswap = true;
if (shrink)
rec.context = SCAN_BY_SHRINK;
else if (check_soft)
rec.context = SCAN_BY_SYSTEM;
else
rec.context = SCAN_BY_LIMIT;
rec.root = root_mem;
while (1) {
victim = mem_cgroup_select_victim(root_mem);
if (victim == root_mem) {
......@@ -1799,23 +1706,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
css_put(&victim->css);
continue;
}
rec.mem = victim;
rec.nr_scanned[0] = 0;
rec.nr_scanned[1] = 0;
rec.nr_rotated[0] = 0;
rec.nr_rotated[1] = 0;
rec.nr_freed[0] = 0;
rec.nr_freed[1] = 0;
rec.elapsed = 0;
/* we use swappiness of local cgroup */
if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
noswap, zone, &rec, &scanned);
*total_scanned += scanned;
noswap, zone, &nr_scanned);
*total_scanned += nr_scanned;
} else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, &rec);
mem_cgroup_record_scanstat(&rec);
noswap);
css_put(&victim->css);
/*
* At shrinking usage, we can't check we should stop here or
......@@ -3854,18 +3752,14 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
/* try to free all pages in this cgroup */
shrink = 1;
while (nr_retries && mem->res.usage > 0) {
struct memcg_scanrecord rec;
int progress;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
}
rec.context = SCAN_BY_SHRINK;
rec.mem = mem;
rec.root = mem;
progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
false, &rec);
false);
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
......@@ -4709,54 +4603,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
}
#endif /* CONFIG_NUMA */
/*
 * read_map handler for memory.vmscan_stat.
 *
 * Reports every counter four times through cb->fill(), in this order:
 *   1. own stats, "_by_limit"
 *   2. own stats, "_by_system"
 *   3. root stats, "_by_limit_under_hierarchy"
 *   4. root stats, "_by_system_under_hierarchy"
 *
 * The counters are read without taking the scanstat lock.  Always
 * returns 0.
 */
static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
				       struct cftype *cft,
				       struct cgroup_map_cb *cb)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	char key[64];
	int pass;
	int idx;

	/* bit 0 of pass selects limit/system, bit 1 selects own/root rows */
	for (pass = 0; pass < 4; pass++) {
		const int by_system = pass & 1;
		const int under_root = pass & 2;
		const int ctx = by_system ? SCAN_BY_SYSTEM : SCAN_BY_LIMIT;

		for (idx = 0; idx < NR_SCANSTATS; idx++) {
			unsigned long val;

			strcpy(key, scanstat_string[idx]);
			if (by_system)
				strcat(key, SCANSTAT_WORD_SYSTEM);
			else
				strcat(key, SCANSTAT_WORD_LIMIT);

			if (under_root) {
				strcat(key, SCANSTAT_WORD_HIERARCHY);
				val = memcg->scanstat.rootstats[ctx][idx];
			} else {
				val = memcg->scanstat.stats[ctx][idx];
			}
			cb->fill(cb, key, val);
		}
	}
	return 0;
}
static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
unsigned int event)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
spin_lock(&mem->scanstat.lock);
memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
spin_unlock(&mem->scanstat.lock);
return 0;
}
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
......@@ -4827,11 +4673,6 @@ static struct cftype mem_cgroup_files[] = {
.mode = S_IRUGO,
},
#endif
{
.name = "vmscan_stat",
.read_map = mem_cgroup_vmscan_stat_read,
.trigger = mem_cgroup_reset_vmscan_stat,
},
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
......@@ -5095,7 +4936,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
spin_lock_init(&mem->scanstat.lock);
return &mem->css;
free_out:
__mem_cgroup_free(mem);
......
......@@ -105,7 +105,6 @@ struct scan_control {
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
struct memcg_scanrecord *memcg_record;
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
......@@ -1349,8 +1348,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
if (!scanning_global_lru(sc))
sc->memcg_record->nr_rotated[file] += numpages;
}
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
......@@ -1394,10 +1391,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
reclaim_stat->recent_scanned[0] += *nr_anon;
reclaim_stat->recent_scanned[1] += *nr_file;
if (!scanning_global_lru(sc)) {
sc->memcg_record->nr_scanned[0] += *nr_anon;
sc->memcg_record->nr_scanned[1] += *nr_file;
}
}
/*
......@@ -1511,9 +1504,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
if (!scanning_global_lru(sc))
sc->memcg_record->nr_freed[file] += nr_reclaimed;
local_irq_disable();
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
......@@ -1613,8 +1603,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
reclaim_stat->recent_scanned[file] += nr_taken;
if (!scanning_global_lru(sc))
sc->memcg_record->nr_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
if (file)
......@@ -1666,8 +1654,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* get_scan_ratio.
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
if (!scanning_global_lru(sc))
sc->memcg_record->nr_rotated[file] += nr_rotated;
move_active_pages_to_lru(zone, &l_active,
LRU_ACTIVE + file * LRU_FILE);
......@@ -2267,8 +2253,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
struct memcg_scanrecord *rec,
unsigned long *scanned)
unsigned long *nr_scanned)
{
struct scan_control sc = {
.nr_scanned = 0,
......@@ -2278,9 +2263,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
.may_swap = !noswap,
.order = 0,
.mem_cgroup = mem,
.memcg_record = rec,
};
ktime_t start, end;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
......@@ -2289,7 +2272,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
sc.may_writepage,
sc.gfp_mask);
start = ktime_get();
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
......@@ -2298,25 +2280,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
end = ktime_get();
if (rec)
rec->elapsed += ktime_to_ns(ktime_sub(end, start));
*scanned = sc.nr_scanned;
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
*nr_scanned = sc.nr_scanned;
return sc.nr_reclaimed;
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask,
bool noswap,
struct memcg_scanrecord *rec)
bool noswap)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
ktime_t start, end;
int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
......@@ -2325,7 +2301,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.order = 0,
.mem_cgroup = mem_cont,
.memcg_record = rec,
.nodemask = NULL, /* we don't care the placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
......@@ -2334,7 +2309,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.gfp_mask = sc.gfp_mask,
};
start = ktime_get();
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* take care of from where we get pages. So the node where we start the
......@@ -2349,9 +2323,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
end = ktime_get();
if (rec)
rec->elapsed += ktime_to_ns(ktime_sub(end, start));
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment