mm owner: fix race between swapoff and exit

author Balbir Singh <balbir@linux.vnet.ibm.com>
Sun, 28 Sep 2008 22:09:31 +0000 (23:09 +0100)

committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 29 Sep 2008 15:41:47 +0000 (08:41 -0700)
diff --git a/fs/exec.c b/fs/exec.c

index 32993be..cecee50 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)
         tsk->active_mm = mm;
         activate_mm(active_mm, mm);
         task_unlock(tsk);
-       mm_update_next_owner(old_mm);
         arch_pick_mmap_layout(mm);
         if (old_mm) {
                 up_read(&old_mm->mmap_sem);
                 BUG_ON(active_mm != old_mm);
+               mm_update_next_owner(old_mm);
                 mmput(old_mm);
                 return 0;
         }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index 13932ab..a0123d7 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
   */
  void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
  {
-       struct cgroup *oldcgrp, *newcgrp;
+       struct cgroup *oldcgrp, *newcgrp = NULL;
  
         if (need_mm_owner_callback) {
                 int i;
                 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                         struct cgroup_subsys *ss = subsys[i];
                         oldcgrp = task_cgroup(old, ss->subsys_id);
-                       newcgrp = task_cgroup(new, ss->subsys_id);
+                       if (new)
+                               newcgrp = task_cgroup(new, ss->subsys_id);
                         if (oldcgrp == newcgrp)
                                 continue;
                         if (ss->mm_owner_changed)
diff --git a/kernel/exit.c b/kernel/exit.c

index 1639564..85a83c8 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
          * If there are other users of the mm and the owner (us) is exiting
          * we need to find a new owner to take on the responsibility.
          */
-       if (!mm)
-               return 0;
         if (atomic_read(&mm->mm_users) <= 1)
                 return 0;
         if (mm->owner != p)
@@ -627,6 +625,16 @@ retry:
         } while_each_thread(g, c);
  
         read_unlock(&tasklist_lock);
+       /*
+        * We found no owner yet mm_users > 1: this implies that we are
+        * most likely racing with swapoff (try_to_unuse()) or /proc or
+        * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
+        * so that subsystems can understand the callback and take action.
+        */
+       down_write(&mm->mmap_sem);
+       cgroup_mm_owner_callbacks(mm->owner, NULL);
+       mm->owner = NULL;
+       up_write(&mm->mmap_sem);
         return;
  
  assign_new_owner:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index c0500e4..36896f3 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
  
  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
  {
+       /*
+        * mm_update_next_owner() may clear mm->owner to NULL
+        * if it races with swapoff, page migration, etc.
+        * So this can be called with p == NULL.
+        */
+       if (unlikely(!p))
+               return NULL;
+
         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                 struct mem_cgroup, css);
  }
@@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
         if (likely(!memcg)) {
                 rcu_read_lock();
                 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+               if (unlikely(!mem)) {
+                       rcu_read_unlock();
+                       kmem_cache_free(page_cgroup_cache, pc);
+                       return 0;
+               }
                 /*
                  * For every charge from the cgroup, increment reference count
                  */
@@ -801,6 +814,10 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
  
         rcu_read_lock();
         mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+       if (unlikely(!mem)) {
+               rcu_read_unlock();
+               return 0;
+       }
         css_get(&mem->css);
         rcu_read_unlock();
  
author	Balbir Singh <balbir@linux.vnet.ibm.com>
	Sun, 28 Sep 2008 22:09:31 +0000 (23:09 +0100)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 29 Sep 2008 15:41:47 +0000 (08:41 -0700)
fs/exec.c		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history