* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
+ * is called after synchronize_rcu(). But for safe use, css_tryget()
+ * should be used to avoid races.
*/
struct cgroup_subsys_state __rcu *css;
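For reference, the access pattern this comment prescribes looks roughly
like the sketch below; css_from_id_sketch() is an illustrative name, not
an existing helper:

	/* sketch: pin the css behind a css_id under RCU */
	static struct cgroup_subsys_state *css_from_id_sketch(struct css_id *id)
	{
		struct cgroup_subsys_state *css;

		rcu_read_lock();
		css = rcu_dereference(id->css);
		if (css && !css_tryget(css))
			css = NULL;	/* lost the race with removal */
		rcu_read_unlock();

		return css;	/* caller must css_put() when done */
	}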
/*
return inode;
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- int ret = 0;
-
- for_each_subsys(cgrp->root, ss) {
- if (!ss->pre_destroy)
- continue;
-
- ret = ss->pre_destroy(cgrp);
- if (ret) {
- /* ->pre_destroy() failure is being deprecated */
- WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
- break;
- }
- }
-
- return ret;
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
remove_dir(dentry);
}
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->files);
INIT_LIST_HEAD(&cgrp->css_sets);
+ INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
free_cg_links(&tmp_cg_links);
- BUG_ON(!list_empty(&root_cgrp->sibling));
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
BUG_ON(root->number_of_cgroups != 1);
BUG_ON(!list_empty(&cgrp->children));
- BUG_ON(!list_empty(&cgrp->sibling));
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
return 0;
}
- start = buf + buflen;
- *--start = '\0';
+ start = buf + buflen - 1;
+ *start = '\0';
for (;;) {
int len = dentry->d_name.len;
* trading it for newcg is protected by cgroup_mutex, we're safe to drop
* it here; it will be freed under RCU.
*/
- put_css_set(oldcg);
-
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ put_css_set(oldcg);
}
/**
}
synchronize_rcu();
-
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
out:
if (retval) {
for_each_subsys(root, ss) {
* step 5: success! and cleanup
*/
synchronize_rcu();
- cgroup_wakeup_rmdir_waiter(cgrp);
retval = 0;
out_put_css_set_refs:
if (retval) {
/* start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
+ inc_nlink(dentry->d_parent->d_inode);
/* start with the directory inode held, so that we can
* populate it without racing with another mkdir */
return 0;
}
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- * ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- umode_t mode)
-{
- struct dentry *parent;
- int error = 0;
-
- parent = cgrp->parent->dentry;
- error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
- if (!error) {
- dentry->d_fsdata = cgrp;
- inc_nlink(parent->d_inode);
- rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
- }
- dput(dentry);
-
- return error;
-}
-
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
write_unlock(&css_set_lock);
}
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre(). Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+ struct cgroup *cgroup)
+{
+ struct cgroup *next;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* if first iteration, pretend we just visited @cgroup */
+ if (!pos) {
+ if (list_empty(&cgroup->children))
+ return NULL;
+ pos = cgroup;
+ }
+
+ /* visit the first child if it exists */
+ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ do {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup,
+ sibling);
+ if (&next->sibling != &pos->parent->children)
+ return next;
+
+ pos = pos->parent;
+ } while (pos != cgroup);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
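The cgroup_for_each_descendant_pre() wrapper mentioned above is expected
to be the usual NULL-seeded iterator macro, along these lines (assumed to
live in include/linux/cgroup.h in this series):

	#define cgroup_for_each_descendant_pre(pos, cgroup)		\
		for ((pos) = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \
		     (pos) = cgroup_next_descendant_pre((pos), (cgroup)))

Callers must hold rcu_read_lock() across the whole walk, matching the
WARN_ON_ONCE() check in the iterator itself.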
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last;
+
+ do {
+ last = pos;
+ pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+ sibling);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post(). Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+ struct cgroup *cgroup)
+{
+ struct cgroup *next;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* if first iteration, visit the leftmost descendant */
+ if (!pos) {
+ next = cgroup_leftmost_descendant(cgroup);
+ return next != cgroup ? next : NULL;
+ }
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+ if (&next->sibling != &pos->parent->children)
+ return cgroup_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ next = pos->parent;
+ return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
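Likewise for post-order. A sketch of the companion macro and a typical
bottom-up walk; example_teardown_walk() and do_teardown() are placeholders:

	#define cgroup_for_each_descendant_post(pos, cgroup)		\
		for ((pos) = cgroup_next_descendant_post(NULL, (cgroup)); (pos); \
		     (pos) = cgroup_next_descendant_post((pos), (cgroup)))

	static void example_teardown_walk(struct cgroup *root)
	{
		struct cgroup *pos;

		rcu_read_lock();
		cgroup_for_each_descendant_post(pos, root)
			do_teardown(pos);	/* children before parents */
		rcu_read_unlock();
	}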
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
__acquires(css_set_lock)
{
cgrp->subsys[ss->subsys_id] = css;
/*
- * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
- * which is put on the last css_put(). dput() requires process
- * context, which css_put() may be called without. @css->dput_work
- * will be used to invoke dput() asynchronously from css_put().
+ * css holds an extra ref to @cgrp->dentry which is put on the last
+ * css_put(). dput() requires process context, which css_put() may
+ * be called without. @css->dput_work will be used to invoke
+ * dput() asynchronously from css_put().
*/
INIT_WORK(&css->dput_work, css_dput_fn);
- if (ss->__DEPRECATED_clear_css_refs)
- set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
}
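For reference, a sketch of the work function named above; besides the
dput() itself it is expected to pin the superblock so the final dput()
cannot outlive the filesystem:

	static void css_dput_fn(struct work_struct *work)
	{
		struct cgroup_subsys_state *css =
			container_of(work, struct cgroup_subsys_state, dput_work);
		struct dentry *dentry = css->cgroup->dentry;
		struct super_block *sb = dentry->d_sb;

		atomic_inc(&sb->s_active);	/* keep sb alive across dput() */
		dput(dentry);
		deactivate_super(sb);
	}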
/*
if (!cgrp)
return -ENOMEM;
+ /*
+ * Only live parents can have children. Note that the liveness
+ * check isn't strictly necessary because cgroup_mkdir() and
+ * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+ * anyway so that locking is contained inside cgroup proper and we
+ * don't get nasty surprises if we ever grow another caller.
+ */
+ if (!cgroup_lock_live_group(parent)) {
+ err = -ENODEV;
+ goto err_free;
+ }
+
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* fs */
atomic_inc(&sb->s_active);
- mutex_lock(&cgroup_mutex);
-
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
}
}
- list_add(&cgrp->sibling, &cgrp->parent->children);
+ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
- err = cgroup_create_dir(cgrp, dentry, mode);
+ err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
if (err < 0)
goto err_remove;
- /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
- for_each_subsys(root, ss)
- if (!ss->__DEPRECATED_clear_css_refs)
- dget(dentry);
+ dentry->d_fsdata = cgrp;
+ rcu_assign_pointer(cgrp->dentry, dentry);
+
+ for_each_subsys(root, ss) {
+ /* each css holds a ref to the cgroup's dentry */
+ dget(dentry);
+
+ /* creation succeeded, notify subsystems */
+ if (ss->post_create)
+ ss->post_create(cgrp);
+ }
/* The cgroup directory was pre-locked for us */
BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
err_remove:
- list_del(&cgrp->sibling);
+ list_del_rcu(&cgrp->sibling);
root->number_of_cgroups--;
err_destroy:
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
-
+err_free:
kfree(cgrp);
return err;
}
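A controller opts into the new notification simply by filling in the
callback in its cgroup_subsys. Everything below is illustrative;
"example" is not a real controller:

	static void example_post_create(struct cgroup *cgrp)
	{
		/* @cgrp is fully constructed and visible; safe to latch onto */
	}

	struct cgroup_subsys example_subsys = {
		.name		= "example",
		.post_create	= example_post_create,
		.subsys_id	= example_subsys_id,	/* per cgroup_subsys.h */
	};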
return 0;
}
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
-
- local_irq_save(flags);
-
- /*
- * Block new css_tryget() by deactivating refcnt. If all refcnts
- * for subsystems w/ clear_css_refs set were 1 at the moment of
- * deactivation, we succeeded.
- */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
- WARN_ON(atomic_read(&css->refcnt) < 0);
- atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
- if (ss->__DEPRECATED_clear_css_refs)
- failed |= css_refcnt(css) != 1;
- }
-
- /*
- * If succeeded, set REMOVED and put all the base refs; otherwise,
- * restore refcnts to positive values. Either way, all in-progress
- * css_tryget() will be released.
- */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
- if (!failed) {
- set_bit(CSS_REMOVED, &css->flags);
- css_put(css);
- } else {
- atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
- }
- }
-
- local_irq_restore(flags);
- return !failed;
-}
-
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
struct cgroup *parent;
- DEFINE_WAIT(wait);
struct cgroup_event *event, *tmp;
- int ret;
+ struct cgroup_subsys *ss;
/* the vfs holds both inode->i_mutex already */
-again:
mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
+ parent = cgrp->parent;
+ if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- mutex_unlock(&cgroup_mutex);
/*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ * Block new css_tryget() by deactivating refcnt and mark @cgrp
+ * removed. This makes future css_tryget() and child creation
+ * attempts fail thus maintaining the removal conditions verified
+ * above.
*/
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- /*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
- */
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
+ for_each_subsys(cgrp->root, ss) {
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+ WARN_ON(atomic_read(&css->refcnt) < 0);
+ atomic_add(CSS_DEACT_BIAS, &css->refcnt);
}
+ set_bit(CGRP_REMOVED, &cgrp->flags);
+ /*
+ * Tell subsystems to initiate destruction. pre_destroy() should be
+ * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix
+ * potential deadlock in pre_destroy") for details.
+ */
+ mutex_unlock(&cgroup_mutex);
+ for_each_subsys(cgrp->root, ss)
+ if (ss->pre_destroy)
+ ss->pre_destroy(cgrp);
mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
- }
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+ /*
+ * Put all the base refs. Each css holds an extra reference to the
+ * cgroup's dentry and cgroup removal proceeds regardless of css
+ * refs. On the last put of each css, whenever that may be, the
+ * extra dentry ref is put so that dentry destruction happens only
+ * after all css's are released.
+ */
+ for_each_subsys(cgrp->root, ss)
+ css_put(cgrp->subsys[ss->subsys_id]);
raw_spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/* delete this cgroup from parent->children */
- list_del_init(&cgrp->sibling);
+ list_del_rcu(&cgrp->sibling);
list_del_init(&cgrp->allcg_node);
ss->active = 1;
+ if (ss->post_create)
+ ss->post_create(&ss->root->top_cgroup);
+
/* this function shouldn't be used with modular subsystems, since they
* need to register a subsys_id, among other things */
BUG_ON(ss->module);
ss->active = 1;
+ if (ss->post_create)
+ ss->post_create(&ss->root->top_cgroup);
+
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
*
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU, cgroup_mutex or
- * threadgroup_change_begin(), so it might no longer be a valid
- * cgroup pointer. cgroup_attach_task() might have already changed
- * current->cgroups, allowing the previously referenced cgroup
- * group to be removed and freed.
- *
- * Outside the pointer validity we also need to process the css_set
- * inheritance between threadgoup_change_begin() and
- * threadgoup_change_end(), this way there is no leak in any process
- * wide migration performed by cgroup_attach_proc() that could otherwise
- * miss a thread because it is too early or too late in the fork stage.
+ * it was not made under the protection of RCU or cgroup_mutex, so it
+ * might no longer be a valid cgroup pointer. cgroup_attach_task() might
+ * have already changed current->cgroups, allowing the previously
+ * referenced cgroup group to be removed and freed.
*
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
void cgroup_fork(struct task_struct *child)
{
- /*
- * We don't need to task_lock() current because current->cgroups
- * can't be changed concurrently here. The parent obviously hasn't
- * exited and called cgroup_exit(), and we are synchronized against
- * cgroup migration through threadgroup_change_begin().
- */
+ task_lock(current);
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
+ task_unlock(current);
INIT_LIST_HEAD(&child->cg_list);
}
-/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- /*
- * forkexit callbacks are only supported for
- * builtin subsystems.
- */
- if (!ss || ss->module)
- continue;
-
- if (ss->fork)
- ss->fork(child);
- }
- }
-}
-
/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * calls the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ int i;
+
/*
* use_task_css_set_links is set to 1 before we walk the tasklist
* under the tasklist_lock and we read it here after we added the child
*/
if (use_task_css_set_links) {
write_lock(&css_set_lock);
- if (list_empty(&child->cg_list)) {
+ task_lock(child);
+ if (list_empty(&child->cg_list))
+ list_add(&child->cg_list, &child->cgroups->tasks);
+ task_unlock(child);
+ write_unlock(&css_set_lock);
+ }
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+
/*
- * It's safe to use child->cgroups without task_lock()
- * here because we are protected through
- * threadgroup_change_begin() against concurrent
- * css_set change in cgroup_task_migrate(). Also
- * the task can't exit at that point until
- * wake_up_new_task() is called, so we are protected
- * against cgroup_exit() setting child->cgroup to
- * init_css_set.
+ * fork/exit callbacks are supported only for
+ * builtin subsystems and we don't need further
+ * synchronization as they never go away.
*/
- list_add(&child->cg_list, &child->cgroups->tasks);
+ if (!ss || ss->module)
+ continue;
+
+ if (ss->fork)
+ ss->fork(child);
}
- write_unlock(&css_set_lock);
}
}
+
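For illustration, the shape of a ->fork() callback as dispatched by the
loop above; the name and body are placeholders (the freezer is an
in-tree user of this hook):

	static void example_fork(struct task_struct *task)
	{
		/*
		 * Runs after @task is linked on its css_set, so a
		 * concurrent cgroup_iter walk can no longer miss it.
		 */
	}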
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
/* Caller must verify that the css is not for root cgroup */
bool __css_tryget(struct cgroup_subsys_state *css)
{
- do {
- int v = css_refcnt(css);
+ while (true) {
+ int t, v;

- if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+ v = css_refcnt(css);
+ t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+ if (likely(t == v))
return true;
+ else if (t < 0)
+ return false;
cpu_relax();
- } while (!test_bit(CSS_REMOVED, &css->flags));
-
- return false;
+ }
}
EXPORT_SYMBOL_GPL(__css_tryget);
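For reference, the bias scheme the cmpxchg loop relies on; these are
assumed to match the definitions earlier in cgroup.c in this series. A
deactivated refcnt is negative, so the cmpxchg above observes t < 0 and
the tryget fails:

	#define CSS_DEACT_BIAS		INT_MIN

	/* current refcount with the deactivation bias, if any, taken out */
	static int css_refcnt(struct cgroup_subsys_state *css)
	{
		int v = atomic_read(&css->refcnt);

		return v >= 0 ? v : v - CSS_DEACT_BIAS;
	}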
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiter(cgrp);
break;
case 0:
- if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
- schedule_work(&css->dput_work);
+ schedule_work(&css->dput_work);
break;
}
rcu_read_unlock();