mm: memcg: do not trap chargers with full callstack on OOM

author Johannes Weiner <hannes@cmpxchg.org>

Thu, 12 Sep 2013 22:13:44 +0000 (15:13 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 12 Sep 2013 22:38:02 +0000 (15:38 -0700)
author Johannes Weiner <hannes@cmpxchg.org>
Thu, 12 Sep 2013 22:13:44 +0000 (15:13 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Sep 2013 22:38:02 +0000 (15:38 -0700)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 34ac649..89d576c 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -157,6 +157,10 @@ extern void mem_cgroup_replace_page_cache(struct page *oldpage,
   *
   * Toggle whether a failed memcg charge should invoke the OOM killer
   * or just return -ENOMEM.  Returns the previous toggle state.
+ *
+ * NOTE: Any path that enables the OOM killer before charging must
+ *       call mem_cgroup_oom_synchronize() afterward to finalize the
+ *       OOM handling and clean up.
   */
  static inline bool mem_cgroup_toggle_oom(bool new)
  {
@@ -182,6 +186,13 @@ static inline void mem_cgroup_disable_oom(void)
         WARN_ON(old == false);
  }
  
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+       return p->memcg_oom.in_memcg_oom;
+}
+
+bool mem_cgroup_oom_synchronize(void);
+
  #ifdef CONFIG_MEMCG_SWAP
  extern int do_swap_account;
  #endif
@@ -427,6 +438,16 @@ static inline void mem_cgroup_disable_oom(void)
  {
  }
  
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+       return false;
+}
+
+static inline bool mem_cgroup_oom_synchronize(void)
+{
+       return false;
+}
+
  static inline void mem_cgroup_inc_page_stat(struct page *page,
                                             enum mem_cgroup_page_stat_item idx)
  {
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 9ce1fa5..6682da3 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1395,6 +1395,10 @@ struct task_struct {
         unsigned int memcg_kmem_skip_account;
         struct memcg_oom_info {
                 unsigned int may_oom:1;
+               unsigned int in_memcg_oom:1;
+               unsigned int oom_locked:1;
+               int wakeups;
+               struct mem_cgroup *wait_on_memcg;
         } memcg_oom;
  #endif
  #ifdef CONFIG_UPROBES
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 04250cb..4b5cfb5 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,7 @@ struct mem_cgroup {
  
         bool            oom_lock;
         atomic_t        under_oom;
+       atomic_t        oom_wakeups;
  
         int     swappiness;
         /* OOM-Killer disable */
@@ -2020,6 +2021,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
  
  static void memcg_wakeup_oom(struct mem_cgroup *memcg)
  {
+       atomic_inc(&memcg->oom_wakeups);
         /* for filtering, pass "memcg" as argument. */
         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
  }
@@ -2031,19 +2033,17 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
  }
  
  /*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
   */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-                                 int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  {
-       struct oom_wait_info owait;
         bool locked;
+       int wakeups;
  
-       owait.memcg = memcg;
-       owait.wait.flags = 0;
-       owait.wait.func = memcg_oom_wake_function;
-       owait.wait.private = current;
-       INIT_LIST_HEAD(&owait.wait.task_list);
+       if (!current->memcg_oom.may_oom)
+               return;
+
+       current->memcg_oom.in_memcg_oom = 1;
  
         /*
          * As with any blocking lock, a contender needs to start
@@ -2051,12 +2051,8 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
          * otherwise it can miss the wakeup from the unlock and sleep
          * indefinitely.  This is just open-coded because our locking
          * is so particular to memcg hierarchies.
-        *
-        * Even if signal_pending(), we can't quit charge() loop without
-        * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-        * under OOM is always welcomed, use TASK_KILLABLE here.
          */
-       prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+       wakeups = atomic_read(&memcg->oom_wakeups);
         mem_cgroup_mark_under_oom(memcg);
  
         locked = mem_cgroup_oom_trylock(memcg);
@@ -2066,15 +2062,95 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
  
         if (locked && !memcg->oom_kill_disable) {
                 mem_cgroup_unmark_under_oom(memcg);
-               finish_wait(&memcg_oom_waitq, &owait.wait);
                 mem_cgroup_out_of_memory(memcg, mask, order);
+               mem_cgroup_oom_unlock(memcg);
+               /*
+                * There is no guarantee that an OOM-lock contender
+                * sees the wakeups triggered by the OOM kill
+                * uncharges.  Wake any sleepers explicitly.
+                */
+               memcg_oom_recover(memcg);
         } else {
-               schedule();
-               mem_cgroup_unmark_under_oom(memcg);
-               finish_wait(&memcg_oom_waitq, &owait.wait);
+               /*
+                * A system call can just return -ENOMEM, but if this
+                * is a page fault and somebody else is handling the
+                * OOM already, we need to sleep on the OOM waitqueue
+                * for this memcg until the situation is resolved.
+                * Which can take some time because it might be
+                * handled by a userspace task.
+                *
+                * However, this is the charge context, which means
+                * that we may sit on a large call stack and hold
+                * various filesystem locks, the mmap_sem etc. and we
+                * don't want the OOM handler to deadlock on them
+                * while we sit here and wait.  Store the current OOM
+                * context in the task_struct, then return -ENOMEM.
+                * At the end of the page fault handler, with the
+                * stack unwound, pagefault_out_of_memory() will check
+                * back with us by calling
+                * mem_cgroup_oom_synchronize(), possibly putting the
+                * task to sleep.
+                */
+               current->memcg_oom.oom_locked = locked;
+               current->memcg_oom.wakeups = wakeups;
+               css_get(&memcg->css);
+               current->memcg_oom.wait_on_memcg = memcg;
         }
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation.  Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+       struct oom_wait_info owait;
+       struct mem_cgroup *memcg;
+
+       /* OOM is global, do not handle */
+       if (!current->memcg_oom.in_memcg_oom)
+               return false;
+
+       /*
+        * We invoked the OOM killer but there is a chance that a kill
+        * did not free up any charges.  Everybody else might already
+        * be sleeping, so restart the fault and keep the rampage
+        * going until some charges are released.
+        */
+       memcg = current->memcg_oom.wait_on_memcg;
+       if (!memcg)
+               goto out;
+
+       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+               goto out_memcg;
+
+       owait.memcg = memcg;
+       owait.wait.flags = 0;
+       owait.wait.func = memcg_oom_wake_function;
+       owait.wait.private = current;
+       INIT_LIST_HEAD(&owait.wait.task_list);
  
-       if (locked) {
+       prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+       /* Only sleep if we didn't miss any wakeups since OOM */
+       if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+               schedule();
+       finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+       mem_cgroup_unmark_under_oom(memcg);
+       if (current->memcg_oom.oom_locked) {
                 mem_cgroup_oom_unlock(memcg);
                 /*
                  * There is no guarantee that an OOM-lock contender
@@ -2083,11 +2159,10 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
                  */
                 memcg_oom_recover(memcg);
         }
-
-       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-               return false;
-       /* Give chance to dying process */
-       schedule_timeout_uninterruptible(1);
+       css_put(&memcg->css);
+       current->memcg_oom.wait_on_memcg = NULL;
+out:
+       current->memcg_oom.in_memcg_oom = 0;
         return true;
  }
  
@@ -2400,12 +2475,11 @@ enum {
         CHARGE_RETRY,           /* need to retry but retry is not bad */
         CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
         CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and no enough res. */
-       CHARGE_OOM_DIE,         /* the current is killed because of OOM */
  };
  
  static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                 unsigned int nr_pages, unsigned int min_pages,
-                               bool oom_check)
+                               bool invoke_oom)
  {
         unsigned long csize = nr_pages * PAGE_SIZE;
         struct mem_cgroup *mem_over_limit;
@@ -2462,14 +2536,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         if (mem_cgroup_wait_acct_move(mem_over_limit))
                 return CHARGE_RETRY;
  
-       /* If we don't need to call oom-killer at el, return immediately */
-       if (!oom_check || !current->memcg_oom.may_oom)
-               return CHARGE_NOMEM;
-       /* check OOM */
-       if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-               return CHARGE_OOM_DIE;
+       if (invoke_oom)
+               mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
  
-       return CHARGE_RETRY;
+       return CHARGE_NOMEM;
  }
  
  /*
@@ -2572,7 +2642,7 @@ again:
         }
  
         do {
-               bool oom_check;
+               bool invoke_oom = oom && !nr_oom_retries;
  
                 /* If killed, bypass charge */
                 if (fatal_signal_pending(current)) {
@@ -2580,14 +2650,8 @@ again:
                         goto bypass;
                 }
  
-               oom_check = false;
-               if (oom && !nr_oom_retries) {
-                       oom_check = true;
-                       nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-               }
-
-               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-                   oom_check);
+               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+                                          nr_pages, invoke_oom);
                 switch (ret) {
                 case CHARGE_OK:
                         break;
@@ -2600,16 +2664,12 @@ again:
                         css_put(&memcg->css);
                         goto nomem;
                 case CHARGE_NOMEM: /* OOM routine works */
-                       if (!oom) {
+                       if (!oom || invoke_oom) {
                                 css_put(&memcg->css);
                                 goto nomem;
                         }
-                       /* If oom, we never return -ENOMEM */
                         nr_oom_retries--;
                         break;
-               case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-                       css_put(&memcg->css);
-                       goto bypass;
                 }
         } while (ret != CHARGE_OK);
  
diff --git a/mm/memory.c b/mm/memory.c

index a8f9dea..5ec6f19 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (flags & FAULT_FLAG_USER)
                 mem_cgroup_disable_oom();
  
+       if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+               mem_cgroup_oom_synchronize();
+
         return ret;
  }
  
diff --git a/mm/oom_kill.c b/mm/oom_kill.c

index 98e75f2..314e9d2 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -678,9 +678,12 @@ out:
   */
  void pagefault_out_of_memory(void)
  {
-       struct zonelist *zonelist = node_zonelist(first_online_node,
-                                                 GFP_KERNEL);
+       struct zonelist *zonelist;
  
+       if (mem_cgroup_oom_synchronize())
+               return;
+
+       zonelist = node_zonelist(first_online_node, GFP_KERNEL);
         if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
                 out_of_memory(NULL, 0, 0, NULL, false);
                 clear_zonelist_oom(zonelist, GFP_KERNEL);
author	Johannes Weiner <hannes@cmpxchg.org>
	Thu, 12 Sep 2013 22:13:44 +0000 (15:13 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 12 Sep 2013 22:38:02 +0000 (15:38 -0700)
include/linux/memcontrol.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/oom_kill.c		patch \| blob \| history