Commit f75ca962 authored by KAMEZAWA Hiroyuki, committed by Linus Torvalds

memcg: avoid css_get()

Currently, the memory cgroup increments the css (cgroup subsys state)
reference count for every charged page, and the reference is held until the
page is uncharged.  This has three bad effects:

 1. css_get()/css_put() use atomic_inc()/atomic_dec(), so calling them
    heavily on a large SMP machine does not scale well.
 2. Because the css refcnt can never reach a "ready-to-release" state,
    cgroup's notify_on_release handler cannot work with memcg.
 3. The css refcnt is an atomic_t, i.e. at most 32 bits. That may be too
    small to count every charged page.

This has been a problem since memcg was first merged.

This patch is an attempt to remove the per-page css refcount.  Even without
that refcount, pre_destroy() provides enough synchronization by
  - checking that res->usage == 0, and
  - checking that no pages remain on the LRU.

Even after this patch, at first glance it looks as if css_get() is still
called in try_charge().

But the logic is:

  - If the memcg of mm->owner is the cached one, consume_stock() succeeds
    and we return immediately.
  - If consume_stock() returns false, css_get() is called and we enter the
    slow path, which may block.  At the end of the slow path, css_put() is
    called and, if necessary, we restart from the beginning.

So in the fast path we never call css_get() and avoid touching the shared
counter.  This makes the most common case fast.
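As a rough illustration of that fast-path/slow-path split, here is a minimal
userspace sketch.  This is not the kernel code: the names consume_stock(),
refill_stock(), charge_one_page() and struct group are simplified stand-ins
for the memcg per-cpu stock and the css refcount, invented for this example.
The point it shows is that the common case touches only a per-thread cache,
and the shared atomic counters are touched only on the batched refill path,
with a reference held across the part that may block.

	/* Userspace sketch only; simplified analogue of the memcg stock idea. */
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define STOCK_BATCH 32

	struct group {
		atomic_long usage;	/* shared charge counter, like res->usage */
		atomic_int refcnt;	/* shared reference count, like the css refcnt */
	};

	/* per-thread cache of pre-charged pages, like the per-cpu stock */
	static _Thread_local long stock;

	/* fast path: succeeds without touching any shared counter */
	static bool consume_stock(void)
	{
		if (stock > 0) {
			stock--;
			return true;
		}
		return false;
	}

	/* slow path: hold a reference while doing the (possibly blocking) refill */
	static void refill_stock(struct group *g)
	{
		atomic_fetch_add(&g->refcnt, 1);		/* css_get() analogue */
		atomic_fetch_add(&g->usage, STOCK_BATCH);	/* one batched charge */
		stock += STOCK_BATCH;
		atomic_fetch_sub(&g->refcnt, 1);		/* css_put() analogue */
	}

	static void charge_one_page(struct group *g)
	{
		if (consume_stock())	/* common case: no atomic ops at all */
			return;
		refill_stock(g);	/* rare case: one batched update */
		consume_stock();
	}

	int main(void)
	{
		struct group g = { 0 };

		for (int i = 0; i < 1000; i++)
			charge_one_page(&g);
		printf("usage=%ld refcnt=%d\n",
		       (long)atomic_load(&g.usage), atomic_load(&g.refcnt));
		return 0;
	}

The real code differs in detail (the stock is per-cpu, the refill path can
reclaim and block, and css_tryget() is needed because the cgroup may be going
away), but the shape of the fast path is the same: no shared-counter access
when the cached charge is enough.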

Here is a result from a multi-threaded page-fault benchmark:

[Before]
    25.32%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
     9.30%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
     8.02%  multi-fault-all  [kernel.kallsyms]      [k] try_get_mem_cgroup_from_mm <=====(*)
     7.83%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
     5.38%  multi-fault-all  [kernel.kallsyms]      [k] __css_put
     5.29%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
     4.92%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
     4.24%  multi-fault-all  [kernel.kallsyms]      [k] up_read
     3.53%  multi-fault-all  [kernel.kallsyms]      [k] css_put
     2.11%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
     1.76%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
     1.64%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge

[After]
    28.41%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
    10.08%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
     9.58%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
     9.38%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
     5.86%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
     5.65%  multi-fault-all  [kernel.kallsyms]      [k] up_read
     2.82%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
     2.64%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
     2.48%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge

The 8.02% spent in try_get_mem_cgroup_from_mm() disappears because this patch
removes the css_tryget() call inside it.  (But yes, this is an extreme case.)

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 158e0a2d
@@ -1714,28 +1714,66 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	if (*memcg) {
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
 		mem = *memcg;
+		VM_BUG_ON(css_is_removed(&mem->css));
+		if (mem_cgroup_is_root(mem))
+			goto done;
+		if (consume_stock(mem))
+			goto done;
 		css_get(&mem->css);
 	} else {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		if (unlikely(!mem))
-			return 0;
-		*memcg = mem;
-	}
+		struct task_struct *p;
 
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
+		/*
+		 * because we don't have task_lock(), "p" can exit while
+		 * we're here. In that case, "mem" can point to root
+		 * cgroup but never be NULL. (and task_struct itself is freed
+		 * by RCU, cgroup itself is RCU safe.) Then, we have small
+		 * risk here to get wrong cgroup. But such kind of mis-account
+		 * by race always happens because we don't have cgroup_mutex().
+		 * It's overkill and we allow that small race, here.
+		 */
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
+			/*
+			 * It seems dagerous to access memcg without css_get().
+			 * But considering how consume_stok works, it's not
+			 * necessary. If consume_stock success, some charges
+			 * from this memcg are cached on this cpu. So, we
+			 * don't need to call css_get()/css_tryget() before
+			 * calling consume_stock().
+			 */
+			rcu_read_unlock();
+			goto done;
+		}
+		/* after here, we may be blocked. we need to get refcnt */
+		if (!css_tryget(&mem->css)) {
+			rcu_read_unlock();
+			goto again;
+		}
+		rcu_read_unlock();
+	}
 
 	do {
 		bool oom_check;
 
-		if (consume_stock(mem))
-			goto done; /* don't need to fill stock */
 		/* If killed, bypass charge */
-		if (fatal_signal_pending(current))
+		if (fatal_signal_pending(current)) {
+			css_put(&mem->css);
 			goto bypass;
+		}
 
 		oom_check = false;
 		if (oom && !nr_oom_retries) {
@@ -1750,30 +1788,36 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
 			csize = PAGE_SIZE;
-			break;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			css_put(&mem->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom)
+			if (!oom) {
+				css_put(&mem->css);
 				goto nomem;
+			}
 			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
 		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+			css_put(&mem->css);
 			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
+	*memcg = mem;
 	return 0;
 nomem:
-	css_put(&mem->css);
+	*memcg = NULL;
 	return -ENOMEM;
 bypass:
-	if (mem)
-		css_put(&mem->css);
 	*memcg = NULL;
 	return 0;
 }
@@ -1790,11 +1834,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
 		if (do_swap_account)
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
-	/* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -2155,7 +2195,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2325,10 +2364,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -2340,11 +2375,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 */
 	unlock_page_cgroup(pc);
+	/*
+	 * even after unlock, we have mem->res.usage here and this memcg
+	 * will never be freed.
+	 */
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
+	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+		mem_cgroup_swap_statistics(mem, true);
+		mem_cgroup_get(mem);
+	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 
 	return mem;
@@ -2431,13 +2472,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-	/* record memcg information */
-	if (do_swap_account && swapout && memcg) {
+	/*
+	 * record memcg information, if swapout && memcg != NULL,
+	 * mem_cgroup_get() was called in uncharge().
+	 */
+	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
-		mem_cgroup_get(memcg);
-	}
-	if (swapout && memcg)
-		css_put(&memcg->css);
 }
 
 #endif
@@ -2515,7 +2555,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 			 */
 			if (!mem_cgroup_is_root(to))
 				res_counter_uncharge(&to->res, PAGE_SIZE);
-			css_put(&to->css);
 		}
 		return 0;
 	}
@@ -4214,9 +4253,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 			goto one_by_one;
 		}
 		mc.precharge += count;
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_get(&mem->css, (int)count);
 		return ret;
 	}
 one_by_one:
@@ -4452,7 +4488,6 @@ static void mem_cgroup_clear_mc(void)
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
-		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
@@ -4466,8 +4501,6 @@ static void mem_cgroup_clear_mc(void)
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
-			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-			__css_put(&mc.to->css, mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */