Commit 2d146aa3 authored by Johannes Weiner, committed by Linus Torvalds

mm: memcontrol: switch to rstat

Replace the memory controller's custom hierarchical stats code with the
generic rstat infrastructure provided by the cgroup core.

The current implementation does batched upward propagation from the
write side (i.e.  as stats change).  The per-cpu batches introduce an
error, which is multiplied by the number of subgroups in a tree.  In
systems with many CPUs and sizable cgroup trees, the error can be large
enough to confuse users (e.g.  32 batch pages * 32 CPUs * 32 subgroups
results in an error of up to 128M per stat item).  This can entirely
swallow allocation bursts inside a workload that the user is expecting
to see reflected in the statistics.
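
To make the arithmetic of that worst case concrete, here is a small
standalone sketch (assuming 4K pages; the 32-page batch corresponds to the
per-cpu batching threshold, MEMCG_CHARGE_BATCH, in the pre-rstat code):

    #include <stdio.h>

    int main(void)
    {
            /* Illustrative worst case for the old batched write-side scheme */
            const long batch_pages  = 32;    /* per-cpu batch before flushing upward */
            const long nr_cpus      = 32;
            const long nr_subgroups = 32;
            const long page_size    = 4096;  /* assuming 4K pages */

            long max_error = batch_pages * nr_cpus * nr_subgroups * page_size;

            /* 32 * 32 * 32 pages = 32768 pages = 128 MiB per stat item */
            printf("worst-case error: %ld MiB per stat item\n", max_error >> 20);
            return 0;
    }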

In the past, we've done read-side aggregation, where a memory.stat read
would have to walk the entire subtree and add up per-cpu counts.  This
became problematic with lazily-freed cgroups: we could have large
subtrees where most cgroups were entirely idle.  Hence the switch to
change-driven upward propagation.  Unfortunately, it needed to trade
accuracy for speed due to the write side being so hot.

Rstat combines the best of both worlds: from the write side, it cheaply
maintains a queue of cgroups that have pending changes, so that the read
side can do selective tree aggregation.  This way the reported stats
will always be as precise and recent as can be, while the aggregation can
skip over potentially large numbers of idle cgroups.

The way rstat works is that it implements a tree for tracking cgroups
with pending local changes, as well as a flush function that walks the
tree upwards.  The controller then drives this by 1) telling rstat when
a local cgroup stat changes (e.g.  mod_memcg_state) and 2) when a flush
is required to get up-to-date hierarchy stats for a given subtree (e.g.
when memory.stat is read).  The controller also provides a flush
callback that is called during the rstat flush walk for each cgroup and
aggregates its local per-cpu counters and propagates them upwards.
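
As a rough sketch of that division of labor (condensed from the shape of
the memcg code after this change; irq handling and the lruvec/event
variants are omitted), the write side bumps its per-cpu counter and marks
the cgroup in the rstat tree, while the read side flushes the subtree
before reporting:

    /* Write side: account locally, then tell rstat this cgroup is dirty
     * (roughly what __mod_memcg_state() does after this patch). */
    void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
    {
            if (mem_cgroup_disabled())
                    return;

            __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
            cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
    }

    /* Read side: bring the whole subtree up to date before formatting
     * memory.stat and similar reports. */
    cgroup_rstat_flush(memcg->css.cgroup);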

This adds a second vmstats to struct mem_cgroup (MEMCG_NR_STAT +
NR_VM_EVENT_ITEMS) to track pending subtree deltas during upward
aggregation.  It removes 3 words from the per-cpu data.  It eliminates
memcg_exact_page_state(), since memcg_page_state() is now exact.
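
The pending arrays are consumed by the flush callback that the controller
provides to rstat. A condensed sketch of that callback, for the state
counters only (the vmevents arrays follow the same pattern):

    /* Called by the rstat flush walk for each (cgroup, cpu) pair. */
    static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
    {
            struct mem_cgroup *memcg = mem_cgroup_from_css(css);
            struct mem_cgroup *parent = parent_mem_cgroup(memcg);
            struct memcg_vmstats_percpu *statc;
            long delta, v;
            int i;

            statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);

            for (i = 0; i < MEMCG_NR_STAT; i++) {
                    /*
                     * Collect pending counts propagated up from children.
                     * This is a global counter, so the first per-cpu pass
                     * of the flush picks it up and clears it.
                     */
                    delta = memcg->vmstats.state_pending[i];
                    if (delta)
                            memcg->vmstats.state_pending[i] = 0;

                    /* Add local CPU changes since the last flush */
                    v = READ_ONCE(statc->state[i]);
                    if (v != statc->state_prev[i]) {
                            delta += v - statc->state_prev[i];
                            statc->state_prev[i] = v;
                    }

                    if (!delta)
                            continue;

                    /* Aggregate at this level and queue for the parent */
                    memcg->vmstats.state[i] += delta;
                    if (parent)
                            parent->vmstats.state_pending[i] += delta;
            }
    }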

[akpm@linux-foundation.org: merge fix]
[hannes@cmpxchg.org: fix a sleep in atomic section problem]
  Link: https://lkml.kernel.org/r/20210315234100.64307-1-hannes@cmpxchg.org

Link: https://lkml.kernel.org/r/20210209163304.77088-7-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent dc26532a
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -76,10 +76,27 @@ enum mem_cgroup_events_target {
 };
 
 struct memcg_vmstats_percpu {
-	long stat[MEMCG_NR_STAT];
-	unsigned long events[NR_VM_EVENT_ITEMS];
-	unsigned long nr_page_events;
-	unsigned long targets[MEM_CGROUP_NTARGETS];
+	/* Local (CPU and cgroup) page state & events */
+	long state[MEMCG_NR_STAT];
+	unsigned long events[NR_VM_EVENT_ITEMS];
+
+	/* Delta calculation for lockless upward propagation */
+	long state_prev[MEMCG_NR_STAT];
+	unsigned long events_prev[NR_VM_EVENT_ITEMS];
+
+	/* Cgroup1: threshold notifications & softlimit tree updates */
+	unsigned long nr_page_events;
+	unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct memcg_vmstats {
+	/* Aggregated (CPU and subtree) page state & events */
+	long state[MEMCG_NR_STAT];
+	unsigned long events[NR_VM_EVENT_ITEMS];
+
+	/* Pending child counts during tree propagation */
+	long state_pending[MEMCG_NR_STAT];
+	unsigned long events_pending[NR_VM_EVENT_ITEMS];
 };
 
 struct mem_cgroup_reclaim_iter {
@@ -287,8 +304,8 @@ struct mem_cgroup {
 	MEMCG_PADDING(_pad1_);
 
-	atomic_long_t vmstats[MEMCG_NR_STAT];
-	atomic_long_t vmevents[NR_VM_EVENT_ITEMS];
+	/* memory.stat */
+	struct memcg_vmstats vmstats;
 
 	/* memory.events */
 	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
 
@@ -315,10 +332,6 @@ struct mem_cgroup {
 	atomic_t moving_account;
 	struct task_struct *move_lock_task;
 
-	/* Legacy local VM stats and events */
-	struct memcg_vmstats_percpu __percpu *vmstats_local;
-
-	/* Subtree VM stats and events (batched updates) */
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -939,10 +952,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
 	local_irq_restore(flags);
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
-						gfp_t gfp_mask,
-						unsigned long *total_scanned);
-
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 			  unsigned long count);
 
@@ -1023,6 +1032,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 
 void split_page_memcg(struct page *head, unsigned int nr);
 
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+						gfp_t gfp_mask,
+						unsigned long *total_scanned);
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT 0
@@ -1131,6 +1144,10 @@ static inline bool lruvec_holds_page_lru_lock(struct page *page,
 	return lruvec == &pgdat->__lruvec;
 }
 
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+
 static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 {
 	return NULL;
@@ -1334,18 +1351,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
 	mod_node_page_state(page_pgdat(page), idx, val);
 }
 
-static inline
-unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
-					    gfp_t gfp_mask,
-					    unsigned long *total_scanned)
-{
-	return 0;
-}
-
-static inline void split_page_memcg(struct page *head, unsigned int nr)
-{
-}
-
 static inline void count_memcg_events(struct mem_cgroup *memcg,
 				      enum vm_event_item idx,
 				      unsigned long count)
@@ -1368,8 +1373,16 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
 
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline void split_page_memcg(struct page *head, unsigned int nr)
+{
+}
+
+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
 {
+	return 0;
 }
 
 #endif /* CONFIG_MEMCG */

[The remainder of the diff is collapsed.]