cgroups: add an owner to the mm_struct

author Balbir Singh <balbir@linux.vnet.ibm.com>

Tue, 29 Apr 2008 08:00:16 +0000 (01:00 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 29 Apr 2008 15:06:10 +0000 (08:06 -0700)
author Balbir Singh <balbir@linux.vnet.ibm.com>
Tue, 29 Apr 2008 08:00:16 +0000 (01:00 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 29 Apr 2008 15:06:10 +0000 (08:06 -0700)
diff --git a/fs/exec.c b/fs/exec.c

index 7768453..711bc45 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -735,6 +735,7 @@ static int exec_mmap(struct mm_struct *mm)
         tsk->active_mm = mm;
         activate_mm(active_mm, mm);
         task_unlock(tsk);
+       mm_update_next_owner(mm);
         arch_pick_mmap_layout(mm);
         if (old_mm) {
                 up_read(&old_mm->mmap_sem);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 0952480..e155aa7 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -305,6 +305,12 @@ struct cgroup_subsys {
                         struct cgroup *cgrp);
         void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
         void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
+       /*
+        * This routine is called with the task_lock of the new owner held
+        */
+       void (*mm_owner_changed)(struct cgroup_subsys *ss,
+                                       struct cgroup *old,
+                                       struct cgroup *new);
         int subsys_id;
         int active;
         int disabled;
@@ -390,4 +396,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
  
  #endif /* !CONFIG_CGROUPS */
  
+#ifdef CONFIG_MM_OWNER
+extern void
+cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new);
+#else /* !CONFIG_MM_OWNER */
+static inline void
+cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+}
+#endif /* CONFIG_MM_OWNER */
  #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 8b1c429..e660877 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,9 +27,6 @@ struct mm_struct;
  
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  
-extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
-extern void mm_free_cgroup(struct mm_struct *mm);
-
  #define page_reset_bad_cgroup(page)    ((page)->page_cgroup = 0)
  
  extern struct page_cgroup *page_get_page_cgroup(struct page *page);
@@ -48,8 +45,10 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
  int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
  
+extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+
  #define mm_match_cgroup(mm, cgroup)    \
-       ((cgroup) == rcu_dereference((mm)->mem_cgroup))
+       ((cgroup) == mem_cgroup_from_task((mm)->owner))
  
  extern int mem_cgroup_prepare_migration(struct page *page);
  extern void mem_cgroup_end_migration(struct page *page);
@@ -73,15 +72,6 @@ extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
                                 struct zone *zone, int priority);
  
  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline void mm_init_cgroup(struct mm_struct *mm,
-                                       struct task_struct *p)
-{
-}
-
-static inline void mm_free_cgroup(struct mm_struct *mm)
-{
-}
-
  static inline void page_reset_bad_cgroup(struct page *page)
  {
  }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index e2bae8d..bc97bd5 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -225,8 +225,9 @@ struct mm_struct {
         /* aio bits */
         rwlock_t                ioctx_list_lock;        /* aio lock */
         struct kioctx           *ioctx_list;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-       struct mem_cgroup *mem_cgroup;
+#ifdef CONFIG_MM_OWNER
+       struct task_struct *owner;      /* The thread group leader that */
+                                       /* owns the mm_struct.          */
  #endif
  };
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 024d72b..1d02bab 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2148,6 +2148,19 @@ static inline void migration_init(void)
  #define TASK_SIZE_OF(tsk)      TASK_SIZE
  #endif
  
+#ifdef CONFIG_MM_OWNER
+extern void mm_update_next_owner(struct mm_struct *mm);
+extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
+#else
+static inline void mm_update_next_owner(struct mm_struct *mm)
+{
+}
+
+static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+}
+#endif /* CONFIG_MM_OWNER */
+
  #endif /* __KERNEL__ */
  
  #endif
diff --git a/init/Kconfig b/init/Kconfig

index a345792..98fa96e 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -378,9 +378,13 @@ config RESOURCE_COUNTERS
            infrastructure that works with cgroups
         depends on CGROUPS
  
+config MM_OWNER
+       bool
+
  config CGROUP_MEM_RES_CTLR
         bool "Memory Resource Controller for Control Groups"
         depends on CGROUPS && RESOURCE_COUNTERS
+       select MM_OWNER
         help
           Provides a memory resource controller that manages both page cache and
           RSS memory.
@@ -393,6 +397,9 @@ config CGROUP_MEM_RES_CTLR
           Only enable when you're ok with these trade offs and really
           sure you need the memory resource controller.
  
+         This config option also selects MM_OWNER config option, which
+         could in turn add some fork/exit overhead.
+
  config SYSFS_DEPRECATED
         bool
  
diff --git a/init/main.c b/init/main.c

index 1116d2f..c62c98f 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -559,6 +559,7 @@ asmlinkage void __init start_kernel(void)
         printk(KERN_NOTICE);
         printk(linux_banner);
         setup_arch(&command_line);
+       mm_init_owner(&init_mm, &init_task);
         setup_command_line(command_line);
         unwind_setup();
         setup_per_cpu_areas();
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index abc4337..b9d467d 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -119,6 +119,7 @@ static int root_count;
   * be called.
   */
  static int need_forkexit_callback;
+static int need_mm_owner_callback __read_mostly;
  
  /* convenient tests for these bits */
  inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -2498,6 +2499,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
  
         need_forkexit_callback |= ss->fork || ss->exit;
+       need_mm_owner_callback |= !!ss->mm_owner_changed;
  
         /* At system boot, before all subsystems have been
          * registered, no tasks have been forked, so we don't
@@ -2748,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
         }
  }
  
+#ifdef CONFIG_MM_OWNER
+/**
+ * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
+ * @old: the previous owner
+ * @new: the new owner
+ *
+ * Called on every change to mm->owner. mm_init_owner() does not invoke it,
+ * since it assigns mm->owner for the first time rather than changing it.
+ */
+void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+       struct cgroup *oldcgrp, *newcgrp;
+
+       if (need_mm_owner_callback) {
+               int i;
+               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                       struct cgroup_subsys *ss = subsys[i];
+                       oldcgrp = task_cgroup(old, ss->subsys_id);
+                       newcgrp = task_cgroup(new, ss->subsys_id);
+                       if (oldcgrp == newcgrp)
+                               continue;
+                       if (ss->mm_owner_changed)
+                               ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+               }
+       }
+}
+#endif /* CONFIG_MM_OWNER */
+
  /**
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
diff --git a/kernel/exit.c b/kernel/exit.c

index 2a9d98c..ae0f2c4 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk)
  
  EXPORT_SYMBOL_GPL(exit_fs);
  
+#ifdef CONFIG_MM_OWNER
+/*
+ * Task p is exiting and it owned mm; let's find a new owner for it
+ */
+static inline int
+mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
+{
+       /*
+        * If there are other users of the mm and the owner (us) is exiting
+        * we need to find a new owner to take on the responsibility.
+        */
+       if (!mm)
+               return 0;
+       if (atomic_read(&mm->mm_users) <= 1)
+               return 0;
+       if (mm->owner != p)
+               return 0;
+       return 1;
+}
+
+void mm_update_next_owner(struct mm_struct *mm)
+{
+       struct task_struct *c, *g, *p = current;
+
+retry:
+       if (!mm_need_new_owner(mm, p))
+               return;
+
+       read_lock(&tasklist_lock);
+       /*
+        * Search in the children
+        */
+       list_for_each_entry(c, &p->children, sibling) {
+               if (c->mm == mm)
+                       goto assign_new_owner;
+       }
+
+       /*
+        * Search in the siblings
+        */
+       list_for_each_entry(c, &p->parent->children, sibling) {
+               if (c->mm == mm)
+                       goto assign_new_owner;
+       }
+
+       /*
+        * Search through everything else. We should not get
+        * here often
+        */
+       do_each_thread(g, c) {
+               if (c->mm == mm)
+                       goto assign_new_owner;
+       } while_each_thread(g, c);
+
+       read_unlock(&tasklist_lock);
+       return;
+
+assign_new_owner:
+       BUG_ON(c == p);
+       get_task_struct(c);
+       /*
+        * The task_lock protects c->mm from changing.
+        * We always want mm->owner->mm == mm
+        */
+       task_lock(c);
+       /*
+        * Delay read_unlock() till we have the task_lock()
+        * to ensure that c does not slip away underneath us
+        */
+       read_unlock(&tasklist_lock);
+       if (c->mm != mm) {
+               task_unlock(c);
+               put_task_struct(c);
+               goto retry;
+       }
+       cgroup_mm_owner_callbacks(mm->owner, c);
+       mm->owner = c;
+       task_unlock(c);
+       put_task_struct(c);
+}
+#endif /* CONFIG_MM_OWNER */
+
  /*
   * Turn us into a lazy TLB process if we
   * aren't already..
@@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk)
         /* We don't want this task to be frozen prematurely */
         clear_freeze_flag(tsk);
         task_unlock(tsk);
+       mm_update_next_owner(mm);
         mmput(mm);
  }
  
diff --git a/kernel/fork.c b/kernel/fork.c

index 6067e42..156db96 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
         mm->ioctx_list = NULL;
         mm->free_area_cache = TASK_UNMAPPED_BASE;
         mm->cached_hole_size = ~0UL;
-       mm_init_cgroup(mm, p);
+       mm_init_owner(mm, p);
  
         if (likely(!mm_alloc_pgd(mm))) {
                 mm->def_flags = 0;
                 return mm;
         }
  
-       mm_free_cgroup(mm);
         free_mm(mm);
         return NULL;
  }
@@ -438,7 +437,6 @@ void mmput(struct mm_struct *mm)
                         spin_unlock(&mmlist_lock);
                 }
                 put_swap_token(mm);
-               mm_free_cgroup(mm);
                 mmdrop(mm);
         }
  }
@@ -982,6 +980,13 @@ static void rt_mutex_init_task(struct task_struct *p)
  #endif
  }
  
+#ifdef CONFIG_MM_OWNER
+void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+       mm->owner = p;
+}
+#endif /* CONFIG_MM_OWNER */
+
  /*
   * This creates a new process as a copy of the old one,
   * but does not actually start it yet.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index d12795c..49d8081 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -236,26 +236,12 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
                                 css);
  }
  
-static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
  {
         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                 struct mem_cgroup, css);
  }
  
-void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
-{
-       struct mem_cgroup *mem;
-
-       mem = mem_cgroup_from_task(p);
-       css_get(&mem->css);
-       mm->mem_cgroup = mem;
-}
-
-void mm_free_cgroup(struct mm_struct *mm)
-{
-       css_put(&mm->mem_cgroup->css);
-}
-
  static inline int page_cgroup_locked(struct page *page)
  {
         return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
@@ -476,6 +462,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
         int zid = zone_idx(z);
         struct mem_cgroup_per_zone *mz;
  
+       BUG_ON(!mem_cont);
         mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
         if (active)
                 src = &mz->active_list;
@@ -574,7 +561,7 @@ retry:
                 mm = &init_mm;
  
         rcu_read_lock();
-       mem = rcu_dereference(mm->mem_cgroup);
+       mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
         /*
          * For every charge from the cgroup, increment reference count
          */
@@ -985,10 +972,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
         struct mem_cgroup *mem;
         int node;
  
-       if (unlikely((cont->parent) == NULL)) {
+       if (unlikely((cont->parent) == NULL))
                 mem = &init_mem_cgroup;
-               init_mm.mem_cgroup = mem;
-       } else
+       else
                 mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
  
         if (mem == NULL)
@@ -1067,10 +1053,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
         if (!thread_group_leader(p))
                 goto out;
  
-       css_get(&mem->css);
-       rcu_assign_pointer(mm->mem_cgroup, mem);
-       css_put(&old_mem->css);
-
  out:
         mmput(mm);
  }
author	Balbir Singh <balbir@linux.vnet.ibm.com>
	Tue, 29 Apr 2008 08:00:16 +0000 (01:00 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 29 Apr 2008 15:06:10 +0000 (08:06 -0700)
fs/exec.c		patch | blob | history
include/linux/cgroup.h		patch | blob | history
include/linux/memcontrol.h		patch | blob | history
include/linux/mm_types.h		patch | blob | history
include/linux/sched.h		patch | blob | history
init/Kconfig		patch | blob | history
init/main.c		patch | blob | history
kernel/cgroup.c		patch | blob | history
kernel/exit.c		patch | blob | history
kernel/fork.c		patch | blob | history
mm/memcontrol.c		patch | blob | history