Commit 7a147e0c authored by Johannes Weiner, committed by Greg Kroah-Hartman

mm: memcg: rework and document OOM waiting and wakeup

commit fb2a6fc5 upstream.

The memcg OOM handler open-codes a sleeping lock for OOM serialization
(trylock, wait, repeat) because the required locking is so specific to
memcg hierarchies.  However, it would be nice if this construct would be
clearly recognizable and not be as obfuscated as it is right now.  Clean
up as follows:

1. Remove the return value of mem_cgroup_oom_unlock()

2. Rename mem_cgroup_oom_lock() to mem_cgroup_oom_trylock().

3. Pull the prepare_to_wait() out of the memcg_oom_lock scope.  This
   makes it more obvious that the task has to be on the waitqueue
   before attempting to OOM-trylock the hierarchy, to not miss any
   wakeups before going to sleep.  It just didn't matter until now
   because it was all lumped together into the global memcg_oom_lock
   spinlock section.

4. Pull the mem_cgroup_oom_notify() out of the memcg_oom_lock scope.
   It is protected by the hierarchical OOM-lock.

5. The memcg_oom_lock spinlock is only required to propagate the OOM
   lock in any given hierarchy atomically.  Restrict its scope to
   mem_cgroup_oom_(trylock|unlock).

6. Do not wake up the waitqueue unconditionally at the end of the
   function.  Only the lockholder has to wake up the next in line
   after releasing the lock.

   Note that the lockholder kicks off the OOM-killer, which in turn
   leads to wakeups from the uncharges of the exiting task.  But a
   contender is not guaranteed to see them if it enters the OOM path
   after the OOM kills but before the lockholder releases the lock.
   Thus there has to be an explicit wakeup after releasing the lock.

7. Put the OOM task on the waitqueue before marking the hierarchy as
   under OOM as that is the point where we start to receive wakeups.
   No point in listening before being on the waitqueue.

8. Likewise, unmark the hierarchy before finishing the sleep, for
   symmetry.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: azurIt <azurit@pobox.sk>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 11f34787
...@@ -2075,15 +2075,18 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, ...@@ -2075,15 +2075,18 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
return total; return total;
} }
static DEFINE_SPINLOCK(memcg_oom_lock);
/* /*
* Check OOM-Killer is already running under our hierarchy. * Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false. * If someone is running, return false.
* Has to be called with memcg_oom_lock
*/ */
static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{ {
struct mem_cgroup *iter, *failed = NULL; struct mem_cgroup *iter, *failed = NULL;
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg) { for_each_mem_cgroup_tree(iter, memcg) {
if (iter->oom_lock) { if (iter->oom_lock) {
/* /*
...@@ -2097,33 +2100,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) ...@@ -2097,33 +2100,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
iter->oom_lock = true; iter->oom_lock = true;
} }
if (!failed) if (failed) {
return true; /*
* OK, we failed to lock the whole subtree so we have
/* * to clean up what we set up to the failing subtree
* OK, we failed to lock the whole subtree so we have to clean up */
* what we set up to the failing subtree for_each_mem_cgroup_tree(iter, memcg) {
*/ if (iter == failed) {
for_each_mem_cgroup_tree(iter, memcg) { mem_cgroup_iter_break(memcg, iter);
if (iter == failed) { break;
mem_cgroup_iter_break(memcg, iter); }
break; iter->oom_lock = false;
} }
iter->oom_lock = false;
} }
return false;
spin_unlock(&memcg_oom_lock);
return !failed;
} }
/* static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
* Has to be called with memcg_oom_lock
*/
static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{ {
struct mem_cgroup *iter; struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg) for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false; iter->oom_lock = false;
return 0; spin_unlock(&memcg_oom_lock);
} }
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
...@@ -2147,7 +2150,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) ...@@ -2147,7 +2150,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
atomic_add_unless(&iter->under_oom, -1, 0); atomic_add_unless(&iter->under_oom, -1, 0);
} }
static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info { struct oom_wait_info {
...@@ -2194,45 +2196,52 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, ...@@ -2194,45 +2196,52 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
int order) int order)
{ {
struct oom_wait_info owait; struct oom_wait_info owait;
bool locked, need_to_kill; bool locked;
owait.memcg = memcg; owait.memcg = memcg;
owait.wait.flags = 0; owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function; owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current; owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list); INIT_LIST_HEAD(&owait.wait.task_list);
need_to_kill = true;
mem_cgroup_mark_under_oom(memcg);
/* At first, try to OOM lock hierarchy under memcg.*/
spin_lock(&memcg_oom_lock);
locked = mem_cgroup_oom_lock(memcg);
/* /*
* As with any blocking lock, a contender needs to start
* listening for wakeups before attempting the trylock,
* otherwise it can miss the wakeup from the unlock and sleep
* indefinitely. This is just open-coded because our locking
* is so particular to memcg hierarchies.
*
* Even if signal_pending(), we can't quit charge() loop without * Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here. * under OOM is always welcomed, use TASK_KILLABLE here.
*/ */
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
if (!locked || memcg->oom_kill_disable) mem_cgroup_mark_under_oom(memcg);
need_to_kill = false;
locked = mem_cgroup_oom_trylock(memcg);
if (locked) if (locked)
mem_cgroup_oom_notify(memcg); mem_cgroup_oom_notify(memcg);
spin_unlock(&memcg_oom_lock);
if (need_to_kill) { if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait); finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(memcg, mask, order); mem_cgroup_out_of_memory(memcg, mask, order);
} else { } else {
schedule(); schedule();
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait); finish_wait(&memcg_oom_waitq, &owait.wait);
} }
spin_lock(&memcg_oom_lock);
if (locked)
mem_cgroup_oom_unlock(memcg);
memcg_wakeup_oom(memcg);
spin_unlock(&memcg_oom_lock);
mem_cgroup_unmark_under_oom(memcg); if (locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
* sees the wakeups triggered by the OOM kill
* uncharges. Wake any sleepers explicitely.
*/
memcg_oom_recover(memcg);
}
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false; return false;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment