Commit 4b569387 authored by Nhat Pham, committed by Andrew Morton

memcontrol: add helpers for hugetlb memcg accounting

Patch series "hugetlb memcg accounting", v4.

Currently, hugetlb memory usage is not accounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our use cases: suppose there are two 32G
containers.  The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic pages, depending on the workload within
it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab, etc., fair.

What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max.  A similar procedure is applied to other memory limits
(e.g. memory.low).  However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in correcting the memory limits (e.g.
when hugetlb usage changes between consecutive runs of the userspace
agent), the system could be in an over- or underprotected state.

This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed.  In
addition, a new selftest is added to demonstrate and verify this new
behavior.


This patch (of 4):

This patch exposes charge committing and cancelling as parts of the memory
controller interface.  These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail).  One such example is the new hugetlb
accounting behavior in the following patch.

The patch also adds a helper function to obtain a reference to the
current task's memcg.

Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 59838b25
...@@ -652,6 +652,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, ...@@ -652,6 +652,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
page_counter_read(&memcg->memory); page_counter_read(&memcg->memory);
} }
void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
/** /**
...@@ -703,6 +705,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) ...@@ -703,6 +705,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
__mem_cgroup_uncharge_list(page_list); __mem_cgroup_uncharge_list(page_list);
} }
void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
void mem_cgroup_migrate(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new);
/** /**
...@@ -759,6 +763,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); ...@@ -759,6 +763,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
struct mem_cgroup *get_mem_cgroup_from_current(void);
struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock(struct folio *folio);
struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
...@@ -1239,6 +1245,11 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, ...@@ -1239,6 +1245,11 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
return false; return false;
} }
/*
 * No-op variant of mem_cgroup_commit_charge(): the body is empty, so neither
 * @folio nor @memcg is touched.
 * NOTE(review): presumably the stub used when memcg support is compiled
 * out - confirm against the enclosing #ifdef, which is not visible here.
 */
static inline void
mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
}
static inline int mem_cgroup_charge(struct folio *folio, static inline int mem_cgroup_charge(struct folio *folio,
struct mm_struct *mm, gfp_t gfp) struct mm_struct *mm, gfp_t gfp)
{ {
...@@ -1263,6 +1274,11 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) ...@@ -1263,6 +1274,11 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
{ {
} }
/*
 * No-op variant of mem_cgroup_cancel_charge(): the body is empty, so
 * @memcg and @nr_pages are ignored.
 * NOTE(review): presumably the stub used when memcg support is compiled
 * out - confirm against the enclosing #ifdef, which is not visible here.
 */
static inline void
mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
}
static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
{ {
} }
...@@ -1300,6 +1316,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) ...@@ -1300,6 +1316,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
return NULL; return NULL;
} }
/*
 * Stub variant of get_mem_cgroup_from_current(): unconditionally returns
 * NULL and takes no reference.
 * NOTE(review): presumably the stub used when memcg support is compiled
 * out - confirm against the enclosing #ifdef, which is not visible here.
 */
static inline struct mem_cgroup *
get_mem_cgroup_from_current(void)
{
	return NULL;
}
static inline static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{ {
......
...@@ -1099,6 +1099,27 @@ static __always_inline bool memcg_kmem_bypass(void) ...@@ -1099,6 +1099,27 @@ static __always_inline bool memcg_kmem_bypass(void)
return false; return false;
} }
/**
 * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
 *
 * Looks up the memcg of %current inside an RCU read-side critical section
 * and tries to take a css reference on it. If css_tryget() fails, the RCU
 * read lock is dropped and the lookup is repeated until a reference is
 * obtained.
 *
 * NOTE(review): the caller presumably has to drop the reference again
 * (e.g. via css_put()) - confirm against the callers.
 *
 * Return: the memcg with a css reference held, or NULL when
 * mem_cgroup_disabled() reports that the controller is disabled.
 */
struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	for (;;) {
		rcu_read_lock();
		memcg = mem_cgroup_from_task(current);
		if (css_tryget(&memcg->css))
			break;
		/* Could not pin this memcg: leave the RCU section and retry. */
		rcu_read_unlock();
	}
	rcu_read_unlock();

	return memcg;
}
/** /**
* mem_cgroup_iter - iterate over memory cgroup hierarchy * mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root * @root: hierarchy root
...@@ -2873,7 +2894,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, ...@@ -2873,7 +2894,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return try_charge_memcg(memcg, gfp_mask, nr_pages); return try_charge_memcg(memcg, gfp_mask, nr_pages);
} }
static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) /**
* mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
* @memcg: memcg previously charged.
* @nr_pages: number of pages previously charged.
*/
void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{ {
if (mem_cgroup_is_root(memcg)) if (mem_cgroup_is_root(memcg))
return; return;
...@@ -2898,6 +2924,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) ...@@ -2898,6 +2924,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg; folio->memcg_data = (unsigned long)memcg;
} }
/**
 * mem_cgroup_commit_charge - commit a previously successful try_charge().
 * @folio: folio to commit the charge to.
 * @memcg: memcg previously charged.
 *
 * Takes a css reference on @memcg, binds @folio to @memcg through
 * commit_charge(), and then, with local interrupts disabled, updates the
 * charge statistics for the folio's pages and runs memcg_check_events()
 * for the folio's node.
 */
void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	long nr_pages = folio_nr_pages(folio);

	/* Take the css reference before the folio is bound to the memcg. */
	css_get(&memcg->css);
	commit_charge(folio, memcg);

	/* Statistics and event checks run with local IRQs off. */
	local_irq_disable();
	mem_cgroup_charge_statistics(memcg, nr_pages);
	memcg_check_events(memcg, folio_nid(folio));
	local_irq_enable();
}
#ifdef CONFIG_MEMCG_KMEM #ifdef CONFIG_MEMCG_KMEM
/* /*
* The allocated objcg pointers array is not accounted directly. * The allocated objcg pointers array is not accounted directly.
...@@ -6116,7 +6158,7 @@ static void __mem_cgroup_clear_mc(void) ...@@ -6116,7 +6158,7 @@ static void __mem_cgroup_clear_mc(void)
/* we must uncharge all the leftover precharges from mc.to */ /* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) { if (mc.precharge) {
cancel_charge(mc.to, mc.precharge); mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0; mc.precharge = 0;
} }
/* /*
...@@ -6124,7 +6166,7 @@ static void __mem_cgroup_clear_mc(void) ...@@ -6124,7 +6166,7 @@ static void __mem_cgroup_clear_mc(void)
* we must uncharge here. * we must uncharge here.
*/ */
if (mc.moved_charge) { if (mc.moved_charge) {
cancel_charge(mc.from, mc.moved_charge); mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0; mc.moved_charge = 0;
} }
/* we must fixup refcnts and charges */ /* we must fixup refcnts and charges */
...@@ -7031,20 +7073,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, ...@@ -7031,20 +7073,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
gfp_t gfp) gfp_t gfp)
{ {
long nr_pages = folio_nr_pages(folio);
int ret; int ret;
ret = try_charge(memcg, gfp, nr_pages); ret = try_charge(memcg, gfp, folio_nr_pages(folio));
if (ret) if (ret)
goto out; goto out;
css_get(&memcg->css); mem_cgroup_commit_charge(folio, memcg);
commit_charge(folio, memcg);
local_irq_disable();
mem_cgroup_charge_statistics(memcg, nr_pages);
memcg_check_events(memcg, folio_nid(folio));
local_irq_enable();
out: out:
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment