Merge branch 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 4 Aug 2014 17:11:28 +0000 (10:11 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 4 Aug 2014 17:11:28 +0000 (10:11 -0700)
Pull cgroup changes from Tejun Heo:
 "Mostly changes to get the v2 interface ready.  The core features are
  mostly ready now and I think it's reasonable to expect to drop the
  devel mask in one or two devel cycles at least for a subset of
  controllers.

   - cgroup added a controller dependency mechanism so that the block
     cgroup can depend on the memory cgroup.  This will be used to
     finally support IO provisioning for writeback traffic, which is
     currently being implemented (a minimal sketch of the new
     ->depends_on field follows this message).

   - The v2 interface now uses a separate cftype table so that the
     files for the new interface are explicitly declared in one place
     (->dfl_cftypes vs ->legacy_cftypes, also sketched after this
     message).  Each controller will review and add its files for the
     new interface explicitly.

   - cpuset is getting ready for hierarchical behavior in a style
     similar to the other controllers, so that an ancestor's
     configuration change doesn't irreversibly change the descendants'
     configurations and processes aren't silently migrated when a CPU
     or node goes down (a sketch of the new effective-mask rule
     precedes the kernel/cpuset.c diff below).

  All the changes are to the new interface and no behavior changed for
  the multiple hierarchies"
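
A minimal sketch of the first two points above, using a hypothetical controller
"foo".  All foo_* names and handlers are made up for illustration (a real
controller must also be listed in include/linux/cgroup_subsys.h); the
->dfl_cftypes, ->legacy_cftypes and ->depends_on fields are the ones
introduced or renamed by this series, mirroring the blkio and debug controller
hunks below:

	/* hypothetical controller "foo"; the foo_* handlers are assumed, not real */
	static struct cftype foo_dfl_files[] = {
		{
			.name = "foo.max",		/* illustrative v2-style knob */
			.seq_show = foo_max_show,	/* assumed handler */
			.write = foo_max_write,		/* assumed handler */
		},
		{ }	/* terminate */
	};

	static struct cftype foo_legacy_files[] = {
		{
			.name = "max",			/* illustrative v1-style knob */
			.read_u64 = foo_max_read_u64,	/* assumed handler */
			.write_u64 = foo_max_write_u64,	/* assumed handler */
		},
		{ }	/* terminate */
	};

	struct cgroup_subsys foo_cgrp_subsys = {
		.dfl_cftypes	= foo_dfl_files,	/* used only on the default hierarchy */
		.legacy_cftypes	= foo_legacy_files,	/* used only on legacy hierarchies */
		/*
		 * implicitly enable memcg wherever foo is enabled on the default
		 * hierarchy (the real blkio change below guards this with CONFIG_MEMCG)
		 */
		.depends_on	= 1 << memory_cgrp_id,
	};

If ->dfl_cftypes is left NULL, cgroup_init() in the kernel/cgroup.c diff below
adds the controller to cgrp_dfl_root_inhibit_ss_mask, i.e. it simply isn't
offered on the default hierarchy unless the cgroup__DEVEL__legacy_files_on_dfl
boot parameter is used.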

* 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (29 commits)
  cpuset: fix the WARN_ON() in update_nodemasks_hier()
  cgroup: initialize cgrp_dfl_root_inhibit_ss_mask from !->dfl_files test
  cgroup: make CFTYPE_ONLY_ON_DFL and CFTYPE_NO_ internal to cgroup core
  cgroup: distinguish the default and legacy hierarchies when handling cftypes
  cgroup: replace cgroup_add_cftypes() with cgroup_add_legacy_cftypes()
  cgroup: rename cgroup_subsys->base_cftypes to ->legacy_cftypes
  cgroup: split cgroup_base_files[] into cgroup_{dfl|legacy}_base_files[]
  cpuset: export effective masks to userspace
  cpuset: allow writing offlined masks to cpuset.cpus/mems
  cpuset: enable onlined cpu/node in effective masks
  cpuset: refactor cpuset_hotplug_update_tasks()
  cpuset: make cs->{cpus, mems}_allowed as user-configured masks
  cpuset: apply cs->effective_{cpus,mems}
  cpuset: initialize top_cpuset's configured masks at mount
  cpuset: use effective cpumask to build sched domains
  cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
  cpuset: update cs->effective_{cpus, mems} when config changes
  cpuset: update cpuset->effective_{cpus,mems} at hotplug
  cpuset: add cs->effective_cpus and cs->effective_mems
  cgroup: clean up sane_behavior handling
  ...

block/blk-cgroup.c
kernel/cgroup.c
kernel/cpuset.c
kernel/sched/core.c
mm/memcontrol.c

diff --combined block/blk-cgroup.c
@@@ -80,7 -80,7 +80,7 @@@ static struct blkcg_gq *blkg_alloc(stru
        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
 -      blkg->refcnt = 1;
 +      atomic_set(&blkg->refcnt, 1);
  
        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
        if (blkcg != &blkcg_root) {
@@@ -399,8 -399,11 +399,8 @@@ void __blkg_release_rcu(struct rcu_hea
  
        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
 -      if (blkg->parent) {
 -              spin_lock_irq(blkg->q->queue_lock);
 +      if (blkg->parent)
                blkg_put(blkg->parent);
 -              spin_unlock_irq(blkg->q->queue_lock);
 -      }
  
        blkg_free(blkg);
  }
@@@ -872,13 -875,6 +872,13 @@@ void blkcg_drain_queue(struct request_q
  {
        lockdep_assert_held(q->queue_lock);
  
 +      /*
 +       * @q could be exiting and already have destroyed all blkgs as
 +       * indicated by NULL root_blkg.  If so, don't confuse policies.
 +       */
 +      if (!q->root_blkg)
 +              return;
 +
        blk_throtl_drain(q);
  }
  
@@@ -928,7 -924,15 +928,15 @@@ struct cgroup_subsys blkio_cgrp_subsys 
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
-       .base_cftypes = blkcg_files,
+       .legacy_cftypes = blkcg_files,
+ #ifdef CONFIG_MEMCG
+       /*
+        * This ensures that, if available, memcg is automatically enabled
+        * together on the default hierarchy so that the owner cgroup can
+        * be retrieved from writeback pages.
+        */
+       .depends_on = 1 << memory_cgrp_id,
+ #endif
  };
  EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
  
@@@ -1097,7 -1101,7 +1105,7 @@@ EXPORT_SYMBOL_GPL(blkcg_deactivate_poli
   * Register @pol with blkcg core.  Might sleep and @pol may be modified on
   * successful registration.  Returns 0 on success and -errno on failure.
   */
 -int __init blkcg_policy_register(struct blkcg_policy *pol)
 +int blkcg_policy_register(struct blkcg_policy *pol)
  {
        int i, ret;
  
  
        /* everything is in place, add intf files for the new policy */
        if (pol->cftypes)
-               WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));
+               WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
+                                                 pol->cftypes));
        ret = 0;
  out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
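
blkcg_policy_register() losing its __init annotation in the hunk above
suggests that block policies can now be registered at module init, with their
cftypes landing on the legacy hierarchies via cgroup_add_legacy_cftypes().  A
minimal sketch under that assumption; all foo_* names are hypothetical and
only the ->cftypes field visible in this diff is shown:

	static struct cftype foo_blkcg_files[] = {
		{
			.name = "foo.weight",		/* illustrative knob */
			.read_u64 = foo_weight_read,	/* assumed handler */
			.write_u64 = foo_weight_write,	/* assumed handler */
		},
		{ }	/* terminate */
	};

	static struct blkcg_policy foo_blkcg_policy = {
		.cftypes	= foo_blkcg_files,	/* added as legacy cftypes on registration */
		/* per-policy pd_* callbacks omitted in this sketch */
	};

	static int __init foo_init(void)
	{
		return blkcg_policy_register(&foo_blkcg_policy);
	}

	static void __exit foo_exit(void)
	{
		blkcg_policy_unregister(&foo_blkcg_policy);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");
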
diff --combined kernel/cgroup.c
@@@ -149,12 -149,14 +149,14 @@@ struct cgroup_root cgrp_dfl_root
   */
  static bool cgrp_dfl_root_visible;
  
+ /*
+  * Set by the boot param of the same name; it makes subsystems with a NULL
+  * ->dfl_cftypes use their ->legacy_cftypes on the default hierarchy.
+  */
+ static bool cgroup_legacy_files_on_dfl;
  /* some controllers are not supported in the default hierarchy */
- static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
- #ifdef CONFIG_CGROUP_DEBUG
-       | (1 << debug_cgrp_id)
- #endif
-       ;
+ static unsigned int cgrp_dfl_root_inhibit_ss_mask;
  
  /* The list of hierarchy roots */
  
@@@ -180,13 -182,15 +182,15 @@@ static u64 css_serial_nr_next = 1
   */
  static int need_forkexit_callback __read_mostly;
  
- static struct cftype cgroup_base_files[];
+ static struct cftype cgroup_dfl_base_files[];
+ static struct cftype cgroup_legacy_base_files[];
  
  static void cgroup_put(struct cgroup *cgrp);
  static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned int ss_mask);
  static int cgroup_destroy_locked(struct cgroup *cgrp);
- static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+                     bool visible);
  static void css_release(struct percpu_ref *ref);
  static void kill_css(struct cgroup_subsys_state *css);
  static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@@ -1036,6 -1040,58 +1040,58 @@@ static void cgroup_put(struct cgroup *c
        css_put(&cgrp->self);
  }
  
+ /**
+  * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+  * @cgrp: the target cgroup
+  *
+  * On the default hierarchy, a subsystem may request other subsystems to be
+  * enabled together through its ->depends_on mask.  In such cases, more
+  * subsystems than specified in "cgroup.subtree_control" may be enabled.
+  *
+  * This function determines which subsystems need to be enabled given the
+  * current @cgrp->subtree_control and records it in
+  * @cgrp->child_subsys_mask.  The resulting mask is always a superset of
+  * @cgrp->subtree_control and follows the usual hierarchy rules.
+  */
+ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+ {
+       struct cgroup *parent = cgroup_parent(cgrp);
+       unsigned int cur_ss_mask = cgrp->subtree_control;
+       struct cgroup_subsys *ss;
+       int ssid;
+       lockdep_assert_held(&cgroup_mutex);
+       if (!cgroup_on_dfl(cgrp)) {
+               cgrp->child_subsys_mask = cur_ss_mask;
+               return;
+       }
+       while (true) {
+               unsigned int new_ss_mask = cur_ss_mask;
+               for_each_subsys(ss, ssid)
+                       if (cur_ss_mask & (1 << ssid))
+                               new_ss_mask |= ss->depends_on;
+               /*
+                * Mask out subsystems which aren't available.  This can
+                * happen only if some depended-upon subsystems were bound
+                * to non-default hierarchies.
+                */
+               if (parent)
+                       new_ss_mask &= parent->child_subsys_mask;
+               else
+                       new_ss_mask &= cgrp->root->subsys_mask;
+               if (new_ss_mask == cur_ss_mask)
+                       break;
+               cur_ss_mask = new_ss_mask;
+       }
+       cgrp->child_subsys_mask = cur_ss_mask;
+ }
  /**
   * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
   * @kn: the kernfs_node being serviced
@@@ -1208,12 -1264,15 +1264,15 @@@ static int rebind_subsystems(struct cgr
                up_write(&css_set_rwsem);
  
                src_root->subsys_mask &= ~(1 << ssid);
-               src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+               src_root->cgrp.subtree_control &= ~(1 << ssid);
+               cgroup_refresh_child_subsys_mask(&src_root->cgrp);
  
                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
-               if (dst_root != &cgrp_dfl_root)
-                       dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+               if (dst_root != &cgrp_dfl_root) {
+                       dst_root->cgrp.subtree_control |= 1 << ssid;
+                       cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+               }
  
                if (ss->bind)
                        ss->bind(css);
@@@ -1233,8 -1292,6 +1292,6 @@@ static int cgroup_show_options(struct s
        for_each_subsys(ss, ssid)
                if (root->subsys_mask & (1 << ssid))
                        seq_printf(seq, ",%s", ss->name);
-       if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
-               seq_puts(seq, ",sane_behavior");
        if (root->flags & CGRP_ROOT_NOPREFIX)
                seq_puts(seq, ",noprefix");
        if (root->flags & CGRP_ROOT_XATTR)
@@@ -1268,6 -1325,7 +1325,7 @@@ static int parse_cgroupfs_options(char 
        bool all_ss = false, one_ss = false;
        unsigned int mask = -1U;
        struct cgroup_subsys *ss;
+       int nr_opts = 0;
        int i;
  
  #ifdef CONFIG_CPUSETS
        memset(opts, 0, sizeof(*opts));
  
        while ((token = strsep(&o, ",")) != NULL) {
+               nr_opts++;
                if (!*token)
                        return -EINVAL;
                if (!strcmp(token, "none")) {
                        return -ENOENT;
        }
  
-       /* Consistency checks */
        if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
                pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-               if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
-                   opts->cpuset_clone_children || opts->release_agent ||
-                   opts->name) {
-                       pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+               if (nr_opts != 1) {
+                       pr_err("sane_behavior: no other mount options allowed\n");
                        return -EINVAL;
                }
-       } else {
-               /*
-                * If the 'all' option was specified select all the
-                * subsystems, otherwise if 'none', 'name=' and a subsystem
-                * name options were not specified, let's default to 'all'
-                */
-               if (all_ss || (!one_ss && !opts->none && !opts->name))
-                       for_each_subsys(ss, i)
-                               if (!ss->disabled)
-                                       opts->subsys_mask |= (1 << i);
-               /*
-                * We either have to specify by name or by subsystems. (So
-                * all empty hierarchies must have a name).
-                */
-               if (!opts->subsys_mask && !opts->name)
-                       return -EINVAL;
+               return 0;
        }
  
+       /*
+        * If the 'all' option was specified select all the subsystems,
+        * otherwise if 'none', 'name=' and a subsystem name options were
+        * not specified, let's default to 'all'
+        */
+       if (all_ss || (!one_ss && !opts->none && !opts->name))
+               for_each_subsys(ss, i)
+                       if (!ss->disabled)
+                               opts->subsys_mask |= (1 << i);
+       /*
+        * We either have to specify by name or by subsystems. (So all
+        * empty hierarchies must have a name).
+        */
+       if (!opts->subsys_mask && !opts->name)
+               return -EINVAL;
        /*
         * Option noprefix was introduced just for backward compatibility
         * with the old cpuset, so we allow noprefix only if mounting just
        if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
                return -EINVAL;
  
        /* Can't specify "none" and some subsystems */
        if (opts->subsys_mask && opts->none)
                return -EINVAL;
@@@ -1414,8 -1469,8 +1469,8 @@@ static int cgroup_remount(struct kernfs
        struct cgroup_sb_opts opts;
        unsigned int added_mask, removed_mask;
  
-       if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
-               pr_err("sane_behavior: remount is not allowed\n");
+       if (root == &cgrp_dfl_root) {
+               pr_err("remount is not allowed\n");
                return -EINVAL;
        }
  
        removed_mask = root->subsys_mask & ~opts.subsys_mask;
  
        /* Don't allow flags or name to change at remount */
-       if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+       if ((opts.flags ^ root->flags) ||
            (opts.name && strcmp(opts.name, root->name))) {
                pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
-                      opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
-                      root->flags & CGRP_ROOT_OPTION_MASK, root->name);
+                      opts.flags, opts.name ?: "", root->flags, root->name);
                ret = -EINVAL;
                goto out_unlock;
        }
@@@ -1563,6 -1617,7 +1617,7 @@@ static int cgroup_setup_root(struct cgr
  {
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
+       struct cftype *base_files;
        struct css_set *cset;
        int i, ret;
  
        }
        root_cgrp->kn = root->kf_root->kn;
  
-       ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+       if (root == &cgrp_dfl_root)
+               base_files = cgroup_dfl_base_files;
+       else
+               base_files = cgroup_legacy_base_files;
+       ret = cgroup_addrm_files(root_cgrp, base_files, true);
        if (ret)
                goto destroy_root;
  
@@@ -1638,7 -1698,7 +1698,7 @@@ destroy_root
  exit_root_id:
        cgroup_exit_root_id(root);
  cancel_ref:
 -      percpu_ref_cancel_init(&root_cgrp->self.refcnt);
 +      percpu_ref_exit(&root_cgrp->self.refcnt);
  out:
        free_cgrp_cset_links(&tmp_links);
        return ret;
@@@ -1648,13 -1708,10 +1708,13 @@@ static struct dentry *cgroup_mount(stru
                         int flags, const char *unused_dev_name,
                         void *data)
  {
 +      struct super_block *pinned_sb = NULL;
 +      struct cgroup_subsys *ss;
        struct cgroup_root *root;
        struct cgroup_sb_opts opts;
        struct dentry *dentry;
        int ret;
 +      int i;
        bool new_sb;
  
        /*
                goto out_unlock;
  
        /* look for a matching existing root */
-       if (!opts.subsys_mask && !opts.none && !opts.name) {
+       if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
                cgrp_dfl_root_visible = true;
                root = &cgrp_dfl_root;
                cgroup_get(&root->cgrp);
                goto out_unlock;
        }
  
 +      /*
 +       * Destruction of cgroup root is asynchronous, so subsystems may
 +       * still be dying after the previous unmount.  Let's drain the
 +       * dying subsystems.  We just need to ensure that the ones
 +       * unmounted previously finish dying and don't care about new ones
 +       * starting.  Testing ref liveliness is good enough.
 +       */
 +      for_each_subsys(ss, i) {
 +              if (!(opts.subsys_mask & (1 << i)) ||
 +                  ss->root == &cgrp_dfl_root)
 +                      continue;
 +
 +              if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
 +                      mutex_unlock(&cgroup_mutex);
 +                      msleep(10);
 +                      ret = restart_syscall();
 +                      goto out_free;
 +              }
 +              cgroup_put(&ss->root->cgrp);
 +      }
 +
        for_each_root(root) {
                bool name_match = false;
  
                        goto out_unlock;
                }
  
-               if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
-                       if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
-                               pr_err("sane_behavior: new mount options should match the existing superblock\n");
-                               ret = -EINVAL;
-                               goto out_unlock;
-                       } else {
-                               pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-                       }
-               }
+               if (root->flags ^ opts.flags)
+                       pr_warn("new mount options do not match the existing superblock, will be ignored\n");
  
                /*
 -               * A root's lifetime is governed by its root cgroup.
 -               * tryget_live failure indicate that the root is being
 -               * destroyed.  Wait for destruction to complete so that the
 -               * subsystems are free.  We can use wait_queue for the wait
 -               * but this path is super cold.  Let's just sleep for a bit
 -               * and retry.
 +               * We want to reuse @root whose lifetime is governed by its
 +               * ->cgrp.  Let's check whether @root is alive and keep it
 +               * that way.  As cgroup_kill_sb() can happen anytime, we
 +               * want to block it by pinning the sb so that @root doesn't
 +               * get killed before mount is complete.
 +               *
 +               * With the sb pinned, tryget_live can reliably indicate
 +               * whether @root can be reused.  If it's being killed,
 +               * drain it.  We can use wait_queue for the wait but this
 +               * path is super cold.  Let's just sleep a bit and retry.
                 */
 -              if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
 +              pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
 +              if (IS_ERR(pinned_sb) ||
 +                  !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                        mutex_unlock(&cgroup_mutex);
 +                      if (!IS_ERR_OR_NULL(pinned_sb))
 +                              deactivate_super(pinned_sb);
                        msleep(10);
                        ret = restart_syscall();
                        goto out_free;
@@@ -1802,16 -1823,6 +1855,16 @@@ out_free
                                CGROUP_SUPER_MAGIC, &new_sb);
        if (IS_ERR(dentry) || !new_sb)
                cgroup_put(&root->cgrp);
 +
 +      /*
 +       * If @pinned_sb, we're reusing an existing root and holding an
 +       * extra ref on its sb.  Mount is complete.  Put the extra ref.
 +       */
 +      if (pinned_sb) {
 +              WARN_ON(new_sb);
 +              deactivate_super(pinned_sb);
 +      }
 +
        return dentry;
  }
  
@@@ -2457,9 -2468,7 +2510,7 @@@ static int cgroup_release_agent_show(st
  
  static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
  {
-       struct cgroup *cgrp = seq_css(seq)->cgroup;
-       seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+       seq_puts(seq, "0\n");
        return 0;
  }
  
@@@ -2496,7 -2505,7 +2547,7 @@@ static int cgroup_controllers_show(stru
  {
        struct cgroup *cgrp = seq_css(seq)->cgroup;
  
-       cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+       cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
        return 0;
  }
  
@@@ -2505,7 -2514,7 +2556,7 @@@ static int cgroup_subtree_control_show(
  {
        struct cgroup *cgrp = seq_css(seq)->cgroup;
  
-       cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+       cgroup_print_ss_mask(seq, cgrp->subtree_control);
        return 0;
  }
  
@@@ -2611,6 -2620,7 +2662,7 @@@ static ssize_t cgroup_subtree_control_w
                                            loff_t off)
  {
        unsigned int enable = 0, disable = 0;
+       unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
        struct cgroup *cgrp, *child;
        struct cgroup_subsys *ss;
        char *tok;
  
        for_each_subsys(ss, ssid) {
                if (enable & (1 << ssid)) {
-                       if (cgrp->child_subsys_mask & (1 << ssid)) {
+                       if (cgrp->subtree_control & (1 << ssid)) {
                                enable &= ~(1 << ssid);
                                continue;
                        }
  
+                       /* unavailable or not enabled on the parent? */
+                       if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+                           (cgroup_parent(cgrp) &&
+                            !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+                               ret = -ENOENT;
+                               goto out_unlock;
+                       }
+                       /*
+                        * @ss is already enabled through dependency and
+                        * we'll just make it visible.  Skip draining.
+                        */
+                       if (cgrp->child_subsys_mask & (1 << ssid))
+                               continue;
                        /*
                         * Because css offlining is asynchronous, userland
                         * might try to re-enable the same controller while
  
                                return restart_syscall();
                        }
-                       /* unavailable or not enabled on the parent? */
-                       if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
-                           (cgroup_parent(cgrp) &&
-                            !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
-                               ret = -ENOENT;
-                               goto out_unlock;
-                       }
                } else if (disable & (1 << ssid)) {
-                       if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+                       if (!(cgrp->subtree_control & (1 << ssid))) {
                                disable &= ~(1 << ssid);
                                continue;
                        }
  
                        /* a child has it enabled? */
                        cgroup_for_each_live_child(child, cgrp) {
-                               if (child->child_subsys_mask & (1 << ssid)) {
+                               if (child->subtree_control & (1 << ssid)) {
                                        ret = -EBUSY;
                                        goto out_unlock;
                                }
        }
  
        /*
-        * Except for the root, child_subsys_mask must be zero for a cgroup
+        * Except for the root, subtree_control must be zero for a cgroup
         * with tasks so that child cgroups don't compete against tasks.
         */
        if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
        }
  
        /*
-        * Create csses for enables and update child_subsys_mask.  This
-        * changes cgroup_e_css() results which in turn makes the
-        * subsequent cgroup_update_dfl_csses() associate all tasks in the
-        * subtree to the updated csses.
+        * Update subsys masks and calculate what needs to be done.  More
+        * subsystems than specified may need to be enabled or disabled
+        * depending on subsystem dependencies.
+        */
+       cgrp->subtree_control |= enable;
+       cgrp->subtree_control &= ~disable;
+       old_ctrl = cgrp->child_subsys_mask;
+       cgroup_refresh_child_subsys_mask(cgrp);
+       new_ctrl = cgrp->child_subsys_mask;
+       css_enable = ~old_ctrl & new_ctrl;
+       css_disable = old_ctrl & ~new_ctrl;
+       enable |= css_enable;
+       disable |= css_disable;
+       /*
+        * Create new csses or make the existing ones visible.  A css is
+        * created invisible if it's being implicitly enabled through
+        * dependency.  An invisible css is made visible when the userland
+        * explicitly enables it.
         */
        for_each_subsys(ss, ssid) {
                if (!(enable & (1 << ssid)))
                        continue;
  
                cgroup_for_each_live_child(child, cgrp) {
-                       ret = create_css(child, ss);
+                       if (css_enable & (1 << ssid))
+                               ret = create_css(child, ss,
+                                       cgrp->subtree_control & (1 << ssid));
+                       else
+                               ret = cgroup_populate_dir(child, 1 << ssid);
                        if (ret)
                                goto err_undo_css;
                }
        }
  
-       cgrp->child_subsys_mask |= enable;
-       cgrp->child_subsys_mask &= ~disable;
+       /*
+        * At this point, cgroup_e_css() results reflect the new csses
+        * making the following cgroup_update_dfl_csses() properly update
+        * css associations of all tasks in the subtree.
+        */
        ret = cgroup_update_dfl_csses(cgrp);
        if (ret)
                goto err_undo_css;
  
-       /* all tasks are now migrated away from the old csses, kill them */
+       /*
+        * All tasks are migrated out of disabled csses.  Kill or hide
+        * them.  A css is hidden when the userland requests it to be
+        * disabled while other subsystems are still depending on it.  The
+        * css must not actively control resources and be in the vanilla
+        * state if it's made visible again later.  Controllers which may
+        * be depended upon should provide ->css_reset() for this purpose.
+        */
        for_each_subsys(ss, ssid) {
                if (!(disable & (1 << ssid)))
                        continue;
  
-               cgroup_for_each_live_child(child, cgrp)
-                       kill_css(cgroup_css(child, ss));
+               cgroup_for_each_live_child(child, cgrp) {
+                       struct cgroup_subsys_state *css = cgroup_css(child, ss);
+                       if (css_disable & (1 << ssid)) {
+                               kill_css(css);
+                       } else {
+                               cgroup_clear_dir(child, 1 << ssid);
+                               if (ss->css_reset)
+                                       ss->css_reset(css);
+                       }
+               }
        }
  
        kernfs_activate(cgrp->kn);
@@@ -2755,8 -2811,9 +2853,9 @@@ out_unlock
        return ret ?: nbytes;
  
  err_undo_css:
-       cgrp->child_subsys_mask &= ~enable;
-       cgrp->child_subsys_mask |= disable;
+       cgrp->subtree_control &= ~enable;
+       cgrp->subtree_control |= disable;
+       cgroup_refresh_child_subsys_mask(cgrp);
  
        for_each_subsys(ss, ssid) {
                if (!(enable & (1 << ssid)))
  
                cgroup_for_each_live_child(child, cgrp) {
                        struct cgroup_subsys_state *css = cgroup_css(child, ss);
-                       if (css)
+                       if (!css)
+                               continue;
+                       if (css_enable & (1 << ssid))
                                kill_css(css);
+                       else
+                               cgroup_clear_dir(child, 1 << ssid);
                }
        }
        goto out_unlock;
@@@ -2878,9 -2941,9 +2983,9 @@@ static int cgroup_rename(struct kernfs_
  
        /*
         * This isn't a proper migration and its usefulness is very
-        * limited.  Disallow if sane_behavior.
+        * limited.  Disallow on the default hierarchy.
         */
-       if (cgroup_sane_behavior(cgrp))
+       if (cgroup_on_dfl(cgrp))
                return -EPERM;
  
        /*
@@@ -2964,9 -3027,9 +3069,9 @@@ static int cgroup_addrm_files(struct cg
  
        for (cft = cfts; cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
-               if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+               if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
                        continue;
-               if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+               if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
                        continue;
                if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
                        continue;
@@@ -3024,6 -3087,9 +3129,9 @@@ static void cgroup_exit_cftypes(struct 
                        kfree(cft->kf_ops);
                cft->kf_ops = NULL;
                cft->ss = NULL;
+               /* revert flags set by cgroup core while adding @cfts */
+               cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
        }
  }
  
@@@ -3109,7 -3175,7 +3217,7 @@@ int cgroup_rm_cftypes(struct cftype *cf
   * function currently returns 0 as long as @cfts registration is successful
   * even if some file creation attempts on existing cgroups fail.
   */
- int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
  {
        int ret;
  
        return ret;
  }
  
+ /**
+  * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+  * @ss: target cgroup subsystem
+  * @cfts: zero-length name terminated array of cftypes
+  *
+  * Similar to cgroup_add_cftypes() but the added files are only used for
+  * the default hierarchy.
+  */
+ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ {
+       struct cftype *cft;
+       for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+               cft->flags |= __CFTYPE_ONLY_ON_DFL;
+       return cgroup_add_cftypes(ss, cfts);
+ }
+ /**
+  * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+  * @ss: target cgroup subsystem
+  * @cfts: zero-length name terminated array of cftypes
+  *
+  * Similar to cgroup_add_cftypes() but the added files are only used for
+  * the legacy hierarchies.
+  */
+ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ {
+       struct cftype *cft;
+       for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+               cft->flags |= __CFTYPE_NOT_ON_DFL;
+       return cgroup_add_cftypes(ss, cfts);
+ }
  /**
   * cgroup_task_count - count the number of tasks in a cgroup.
   * @cgrp: the cgroup in question
@@@ -3370,7 -3470,7 +3512,7 @@@ bool css_has_online_children(struct cgr
  
        rcu_read_lock();
        css_for_each_child(child, css) {
 -              if (css->flags & CSS_ONLINE) {
 +              if (child->flags & CSS_ONLINE) {
                        ret = true;
                        break;
                }
@@@ -3699,8 -3799,9 +3841,9 @@@ after
   *
   * All this extra complexity was caused by the original implementation
   * committing to an entirely unnecessary property.  In the long term, we
-  * want to do away with it.  Explicitly scramble sort order if
-  * sane_behavior so that no such expectation exists in the new interface.
+  * want to do away with it.  Explicitly scramble sort order if on the
+  * default hierarchy so that no such expectation exists in the new
+  * interface.
   *
   * Scrambling is done by swapping every two consecutive bits, which is
   * non-identity one-to-one mapping which disturbs sort order sufficiently.
@@@ -3715,7 -3816,7 +3858,7 @@@ static pid_t pid_fry(pid_t pid
  
  static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
  {
-       if (cgroup_sane_behavior(cgrp))
+       if (cgroup_on_dfl(cgrp))
                return pid_fry(pid);
        else
                return pid;
@@@ -3818,7 -3919,7 +3961,7 @@@ static int pidlist_array_load(struct cg
        css_task_iter_end(&it);
        length = n;
        /* now sort & (if procs) strip out duplicates */
-       if (cgroup_sane_behavior(cgrp))
+       if (cgroup_on_dfl(cgrp))
                sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
        else
                sort(array, length, sizeof(pid_t), cmppid, NULL);
@@@ -4040,7 -4141,8 +4183,8 @@@ static int cgroup_clone_children_write(
        return 0;
  }
  
- static struct cftype cgroup_base_files[] = {
+ /* cgroup core interface files for the default hierarchy */
+ static struct cftype cgroup_dfl_base_files[] = {
        {
                .name = "cgroup.procs",
                .seq_start = cgroup_pidlist_start,
                .write = cgroup_procs_write,
                .mode = S_IRUGO | S_IWUSR,
        },
-       {
-               .name = "cgroup.clone_children",
-               .flags = CFTYPE_INSANE,
-               .read_u64 = cgroup_clone_children_read,
-               .write_u64 = cgroup_clone_children_write,
-       },
-       {
-               .name = "cgroup.sane_behavior",
-               .flags = CFTYPE_ONLY_ON_ROOT,
-               .seq_show = cgroup_sane_behavior_show,
-       },
        {
                .name = "cgroup.controllers",
-               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+               .flags = CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_root_controllers_show,
        },
        {
                .name = "cgroup.controllers",
-               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_controllers_show,
        },
        {
                .name = "cgroup.subtree_control",
-               .flags = CFTYPE_ONLY_ON_DFL,
                .seq_show = cgroup_subtree_control_show,
                .write = cgroup_subtree_control_write,
        },
        {
                .name = "cgroup.populated",
-               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_populated_show,
        },
+       { }     /* terminate */
+ };
  
-       /*
-        * Historical crazy stuff.  These don't have "cgroup."  prefix and
-        * don't exist if sane_behavior.  If you're depending on these, be
-        * prepared to be burned.
-        */
+ /* cgroup core interface files for the legacy hierarchies */
+ static struct cftype cgroup_legacy_base_files[] = {
+       {
+               .name = "cgroup.procs",
+               .seq_start = cgroup_pidlist_start,
+               .seq_next = cgroup_pidlist_next,
+               .seq_stop = cgroup_pidlist_stop,
+               .seq_show = cgroup_pidlist_show,
+               .private = CGROUP_FILE_PROCS,
+               .write = cgroup_procs_write,
+               .mode = S_IRUGO | S_IWUSR,
+       },
+       {
+               .name = "cgroup.clone_children",
+               .read_u64 = cgroup_clone_children_read,
+               .write_u64 = cgroup_clone_children_write,
+       },
+       {
+               .name = "cgroup.sane_behavior",
+               .flags = CFTYPE_ONLY_ON_ROOT,
+               .seq_show = cgroup_sane_behavior_show,
+       },
        {
                .name = "tasks",
-               .flags = CFTYPE_INSANE,         /* use "procs" instead */
                .seq_start = cgroup_pidlist_start,
                .seq_next = cgroup_pidlist_next,
                .seq_stop = cgroup_pidlist_stop,
        },
        {
                .name = "notify_on_release",
-               .flags = CFTYPE_INSANE,
                .read_u64 = cgroup_read_notify_on_release,
                .write_u64 = cgroup_write_notify_on_release,
        },
        {
                .name = "release_agent",
-               .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+               .flags = CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_release_agent_show,
                .write = cgroup_release_agent_write,
                .max_write_len = PATH_MAX - 1,
@@@ -4175,8 -4282,6 +4324,8 @@@ static void css_free_work_fn(struct wor
                container_of(work, struct cgroup_subsys_state, destroy_work);
        struct cgroup *cgrp = css->cgroup;
  
 +      percpu_ref_exit(&css->refcnt);
 +
        if (css->ss) {
                /* css free path */
                if (css->parent)
@@@ -4316,12 -4421,14 +4465,14 @@@ static void offline_css(struct cgroup_s
   * create_css - create a cgroup_subsys_state
   * @cgrp: the cgroup new css will be associated with
   * @ss: the subsys of new css
+  * @visible: whether to create control knobs for the new css or not
   *
   * Create a new css associated with @cgrp - @ss pair.  On success, the new
-  * css is online and installed in @cgrp with all interface files created.
-  * Returns 0 on success, -errno on failure.
+  * css is online and installed in @cgrp with all interface files created if
+  * @visible.  Returns 0 on success, -errno on failure.
   */
- static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+                     bool visible)
  {
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
                goto err_free_percpu_ref;
        css->id = err;
  
-       err = cgroup_populate_dir(cgrp, 1 << ss->id);
-       if (err)
-               goto err_free_id;
+       if (visible) {
+               err = cgroup_populate_dir(cgrp, 1 << ss->id);
+               if (err)
+                       goto err_free_id;
+       }
  
        /* @css is ready to be brought online now, make it visible */
        list_add_tail_rcu(&css->sibling, &parent_css->children);
@@@ -4374,7 -4483,7 +4527,7 @@@ err_list_del
  err_free_id:
        cgroup_idr_remove(&ss->css_idr, css->id);
  err_free_percpu_ref:
 -      percpu_ref_cancel_init(&css->refcnt);
 +      percpu_ref_exit(&css->refcnt);
  err_free_css:
        call_rcu(&css->rcu_head, css_free_rcu_fn);
        return err;
@@@ -4387,6 -4496,7 +4540,7 @@@ static int cgroup_mkdir(struct kernfs_n
        struct cgroup_root *root;
        struct cgroup_subsys *ss;
        struct kernfs_node *kn;
+       struct cftype *base_files;
        int ssid, ret;
  
        parent = cgroup_kn_lock_live(parent_kn);
        if (ret)
                goto out_destroy;
  
-       ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+       if (cgroup_on_dfl(cgrp))
+               base_files = cgroup_dfl_base_files;
+       else
+               base_files = cgroup_legacy_base_files;
+       ret = cgroup_addrm_files(cgrp, base_files, true);
        if (ret)
                goto out_destroy;
  
        /* let's create and online css's */
        for_each_subsys(ss, ssid) {
                if (parent->child_subsys_mask & (1 << ssid)) {
-                       ret = create_css(cgrp, ss);
+                       ret = create_css(cgrp, ss,
+                                        parent->subtree_control & (1 << ssid));
                        if (ret)
                                goto out_destroy;
                }
  
        /*
         * On the default hierarchy, a child doesn't automatically inherit
-        * child_subsys_mask from the parent.  Each is configured manually.
+        * subtree_control from the parent.  Each is configured manually.
         */
-       if (!cgroup_on_dfl(cgrp))
-               cgrp->child_subsys_mask = parent->child_subsys_mask;
+       if (!cgroup_on_dfl(cgrp)) {
+               cgrp->subtree_control = parent->subtree_control;
+               cgroup_refresh_child_subsys_mask(cgrp);
+       }
  
        kernfs_activate(kn);
  
  out_free_id:
        cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
  out_cancel_ref:
 -      percpu_ref_cancel_init(&cgrp->self.refcnt);
 +      percpu_ref_exit(&cgrp->self.refcnt);
  out_free_cgrp:
        kfree(cgrp);
  out_unlock:
@@@ -4738,8 -4856,7 +4900,7 @@@ static void __init cgroup_init_subsys(s
   */
  int __init cgroup_init_early(void)
  {
-       static struct cgroup_sb_opts __initdata opts =
-               { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+       static struct cgroup_sb_opts __initdata opts;
        struct cgroup_subsys *ss;
        int i;
  
@@@ -4777,7 -4894,8 +4938,8 @@@ int __init cgroup_init(void
        unsigned long key;
        int ssid, err;
  
-       BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+       BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+       BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
  
        mutex_lock(&cgroup_mutex);
  
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
-               if (!ss->disabled) {
-                       cgrp_dfl_root.subsys_mask |= 1 << ss->id;
-                       WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+               if (ss->disabled)
+                       continue;
+               cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+               if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
+                       ss->dfl_cftypes = ss->legacy_cftypes;
+               if (!ss->dfl_cftypes)
+                       cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+               if (ss->dfl_cftypes == ss->legacy_cftypes) {
+                       WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+               } else {
+                       WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+                       WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
                }
        }
  
@@@ -5207,6 -5338,14 +5382,14 @@@ static int __init cgroup_disable(char *
  }
  __setup("cgroup_disable=", cgroup_disable);
  
+ static int __init cgroup_set_legacy_files_on_dfl(char *str)
+ {
+       printk("cgroup: using legacy files on the default hierarchy\n");
+       cgroup_legacy_files_on_dfl = true;
+       return 0;
+ }
+ __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
  /**
   * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
   * @dentry: directory dentry of interest
@@@ -5401,6 -5540,6 +5584,6 @@@ static struct cftype debug_files[] =  
  struct cgroup_subsys debug_cgrp_subsys = {
        .css_alloc = debug_css_alloc,
        .css_free = debug_css_free,
-       .base_cftypes = debug_files,
+       .legacy_cftypes = debug_files,
  };
  #endif /* CONFIG_CGROUP_DEBUG */
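
The cpuset changes below revolve around one rule for the new
effective_cpus/effective_mems fields: the effective mask is the
user-configured mask restricted by the parent's effective mask, and it falls
back to the parent's effective mask when that intersection is empty.  A
minimal sketch of that rule (the helper name is made up; the real propagation
happens in update_cpumasks_hier() and update_nodemasks_hier() below):

	/* illustrative helper only -- the real logic lives in update_cpumasks_hier() */
	static void compute_effective_cpus(struct cpuset *cs, struct cpumask *new_cpus)
	{
		struct cpuset *parent = parent_cs(cs);

		/* effective = user-configured mask & parent's effective mask ... */
		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);

		/* ... and if that ends up empty, inherit the parent's effective mask */
		if (cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);
	}

On legacy hierarchies the WARN_ON()s in the hunks below check that this still
collapses to effective == configured.
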
diff --combined kernel/cpuset.c
@@@ -76,8 -76,34 +76,34 @@@ struct cpuset 
        struct cgroup_subsys_state css;
  
        unsigned long flags;            /* "unsigned long" so bitops work */
-       cpumask_var_t cpus_allowed;     /* CPUs allowed to tasks in cpuset */
-       nodemask_t mems_allowed;        /* Memory Nodes allowed to tasks */
+       /*
+        * On default hierarchy:
+        *
+        * The user-configured masks can only be changed by writing to
+        * cpuset.cpus and cpuset.mems, and won't be limited by the
+        * parent masks.
+        *
+        * The effective masks are the real masks that apply to the tasks
+        * in the cpuset. They may be changed if the configured masks are
+        * changed or hotplug happens.
+        *
+        * effective_mask == configured_mask & parent's effective_mask,
+        * and if it ends up empty, it will inherit the parent's mask.
+        *
+        *
+        * On legacy hierarchy:
+        *
+        * The user-configured masks are always the same as the effective masks.
+        */
+       /* user-configured CPUs and Memory Nodes allowed to tasks */
+       cpumask_var_t cpus_allowed;
+       nodemask_t mems_allowed;
+       /* effective CPUs and Memory Nodes allowed to tasks */
+       cpumask_var_t effective_cpus;
+       nodemask_t effective_mems;
  
        /*
         * This is old Memory Nodes tasks took on.
@@@ -307,9 -333,9 +333,9 @@@ static struct file_system_type cpuset_f
   */
  static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  {
-       while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+       while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
                cs = parent_cs(cs);
-       cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+       cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
  }
  
  /*
   */
  static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
  {
-       while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+       while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
                cs = parent_cs(cs);
-       nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+       nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
  }
  
  /*
@@@ -376,13 -402,20 +402,20 @@@ static struct cpuset *alloc_trial_cpuse
        if (!trial)
                return NULL;
  
-       if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
-               kfree(trial);
-               return NULL;
-       }
-       cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+       if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+               goto free_cs;
+       if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+               goto free_cpus;
  
+       cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+       cpumask_copy(trial->effective_cpus, cs->effective_cpus);
        return trial;
+ free_cpus:
+       free_cpumask_var(trial->cpus_allowed);
+ free_cs:
+       kfree(trial);
+       return NULL;
  }
  
  /**
   */
  static void free_trial_cpuset(struct cpuset *trial)
  {
+       free_cpumask_var(trial->effective_cpus);
        free_cpumask_var(trial->cpus_allowed);
        kfree(trial);
  }
@@@ -436,9 -470,9 +470,9 @@@ static int validate_change(struct cpuse
  
        par = parent_cs(cur);
  
-       /* We must be a subset of our parent cpuset */
+       /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
-       if (!is_cpuset_subset(trial, par))
+       if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
                goto out;
  
        /*
@@@ -480,11 -514,11 +514,11 @@@ out
  #ifdef CONFIG_SMP
  /*
   * Helper routine for generate_sched_domains().
-  * Do cpusets a, b have overlapping cpus_allowed masks?
+  * Do cpusets a, b have overlapping effective cpus_allowed masks?
   */
  static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
  {
-       return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+       return cpumask_intersects(a->effective_cpus, b->effective_cpus);
  }
  
  static void
@@@ -601,7 -635,7 +635,7 @@@ static int generate_sched_domains(cpuma
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
-               cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+               cpumask_copy(doms[0], top_cpuset.effective_cpus);
  
                goto done;
        }
@@@ -705,7 -739,7 +739,7 @@@ restart
                        struct cpuset *b = csa[j];
  
                        if (apn == b->pn) {
-                               cpumask_or(dp, dp, b->cpus_allowed);
+                               cpumask_or(dp, dp, b->effective_cpus);
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, b);
  
@@@ -757,7 -791,7 +791,7 @@@ static void rebuild_sched_domains_locke
         * passing doms with offlined cpu to partition_sched_domains().
         * Anyways, hotplug work item will rebuild sched domains.
         */
-       if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+       if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                goto out;
  
        /* Generate domain masks and attrs */
@@@ -781,45 -815,6 +815,6 @@@ void rebuild_sched_domains(void
        mutex_unlock(&cpuset_mutex);
  }
  
- /*
-  * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
-  * @cs: the cpuset in interest
-  *
-  * A cpuset's effective cpumask is the cpumask of the nearest ancestor
-  * with non-empty cpus. We use effective cpumask whenever:
-  * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
-  *   if the cpuset they reside in has no cpus)
-  * - we want to retrieve task_cs(tsk)'s cpus_allowed.
-  *
-  * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
-  * exception. See comments there.
-  */
- static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
- {
-       while (cpumask_empty(cs->cpus_allowed))
-               cs = parent_cs(cs);
-       return cs;
- }
- /*
-  * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
-  * @cs: the cpuset in interest
-  *
-  * A cpuset's effective nodemask is the nodemask of the nearest ancestor
-  * with non-empty memss. We use effective nodemask whenever:
-  * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
-  *   if the cpuset they reside in has no mems)
-  * - we want to retrieve task_cs(tsk)'s mems_allowed.
-  *
-  * Called with cpuset_mutex held.
-  */
- static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
- {
-       while (nodes_empty(cs->mems_allowed))
-               cs = parent_cs(cs);
-       return cs;
- }
  /**
   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
   */
  static void update_tasks_cpumask(struct cpuset *cs)
  {
-       struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
        struct css_task_iter it;
        struct task_struct *task;
  
        css_task_iter_start(&cs->css, &it);
        while ((task = css_task_iter_next(&it)))
-               set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+               set_cpus_allowed_ptr(task, cs->effective_cpus);
        css_task_iter_end(&it);
  }
  
  /*
-  * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
-  * @root_cs: the root cpuset of the hierarchy
-  * @update_root: update root cpuset or not?
+  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+  * @cs: the cpuset to consider
+  * @new_cpus: temp variable for calculating new effective_cpus
+  *
+  * When configured cpumask is changed, the effective cpumasks of this cpuset
+  * and all its descendants need to be updated.
   *
-  * This will update cpumasks of tasks in @root_cs and all other empty cpusets
-  * which take on cpumask of @root_cs.
+  * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
   *
   * Called with cpuset_mutex held
   */
- static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
  {
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
+       bool need_rebuild_sched_domains = false;
  
        rcu_read_lock();
-       cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-               if (cp == root_cs) {
-                       if (!update_root)
-                               continue;
-               } else {
-                       /* skip the whole subtree if @cp have some CPU */
-                       if (!cpumask_empty(cp->cpus_allowed)) {
-                               pos_css = css_rightmost_descendant(pos_css);
-                               continue;
-                       }
+       cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+               struct cpuset *parent = parent_cs(cp);
+               cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+               /*
+                * If it becomes empty, inherit the effective mask of the
+                * parent, which is guaranteed to have some CPUs.
+                */
+               if (cpumask_empty(new_cpus))
+                       cpumask_copy(new_cpus, parent->effective_cpus);
+               /* Skip the whole subtree if the cpumask remains the same. */
+               if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+                       pos_css = css_rightmost_descendant(pos_css);
+                       continue;
                }
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();
  
+               mutex_lock(&callback_mutex);
+               cpumask_copy(cp->effective_cpus, new_cpus);
+               mutex_unlock(&callback_mutex);
+               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+                       !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
                update_tasks_cpumask(cp);
  
+               /*
+                * If the effective cpumask of any non-empty cpuset is changed,
+                * we need to rebuild sched domains.
+                */
+               if (!cpumask_empty(cp->cpus_allowed) &&
+                   is_sched_load_balance(cp))
+                       need_rebuild_sched_domains = true;
                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();
+       if (need_rebuild_sched_domains)
+               rebuild_sched_domains_locked();
  }
  
  /**
@@@ -889,7 -911,6 +911,6 @@@ static int update_cpumask(struct cpuse
                          const char *buf)
  {
        int retval;
-       int is_load_balanced;
  
        /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
        if (cs == &top_cpuset)
                if (retval < 0)
                        return retval;
  
-               if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+               if (!cpumask_subset(trialcs->cpus_allowed,
+                                   top_cpuset.cpus_allowed))
                        return -EINVAL;
        }
  
        if (retval < 0)
                return retval;
  
-       is_load_balanced = is_sched_load_balance(trialcs);
        mutex_lock(&callback_mutex);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
        mutex_unlock(&callback_mutex);
  
-       update_tasks_cpumask_hier(cs, true);
-       if (is_load_balanced)
-               rebuild_sched_domains_locked();
+       /* use trialcs->cpus_allowed as a temp variable */
+       update_cpumasks_hier(cs, trialcs->cpus_allowed);
        return 0;
  }
  
@@@ -951,15 -969,13 +969,13 @@@ static void cpuset_migrate_mm(struct mm
                                                        const nodemask_t *to)
  {
        struct task_struct *tsk = current;
-       struct cpuset *mems_cs;
  
        tsk->mems_allowed = *to;
  
        do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
  
        rcu_read_lock();
-       mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-       guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+       guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
        rcu_read_unlock();
  }
  
@@@ -1028,13 -1044,12 +1044,12 @@@ static void *cpuset_being_rebound
  static void update_tasks_nodemask(struct cpuset *cs)
  {
        static nodemask_t newmems;      /* protected by cpuset_mutex */
-       struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
        struct css_task_iter it;
        struct task_struct *task;
  
        cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
  
-       guarantee_online_mems(mems_cs, &newmems);
+       guarantee_online_mems(cs, &newmems);
  
        /*
         * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
  }
  
  /*
-  * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
-  * @cs: the root cpuset of the hierarchy
-  * @update_root: update the root cpuset or not?
+  * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+  * @cs: the cpuset to consider
+  * @new_mems: a temp variable for calculating new effective_mems
   *
-  * This will update nodemasks of tasks in @root_cs and all other empty cpusets
-  * which take on nodemask of @root_cs.
+  * When the configured nodemask is changed, the effective nodemasks of this
+  * cpuset and all its descendants need to be updated.
+  *
+  * On the legacy hierarchy, effective_mems will be the same as mems_allowed.
   *
   * Called with cpuset_mutex held
   */
- static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
  {
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
  
        rcu_read_lock();
-       cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-               if (cp == root_cs) {
-                       if (!update_root)
-                               continue;
-               } else {
-                       /* skip the whole subtree if @cp have some CPU */
-                       if (!nodes_empty(cp->mems_allowed)) {
-                               pos_css = css_rightmost_descendant(pos_css);
-                               continue;
-                       }
+       cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+               struct cpuset *parent = parent_cs(cp);
+               nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+               /*
+                * If it becomes empty, inherit the effective mask of the
+                * parent, which is guaranteed to have some MEMs.
+                */
+               if (nodes_empty(*new_mems))
+                       *new_mems = parent->effective_mems;
+               /* Skip the whole subtree if the nodemask remains the same. */
+               if (nodes_equal(*new_mems, cp->effective_mems)) {
+                       pos_css = css_rightmost_descendant(pos_css);
+                       continue;
                }
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();
  
+               mutex_lock(&callback_mutex);
+               cp->effective_mems = *new_mems;
+               mutex_unlock(&callback_mutex);
+               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+                       !nodes_equal(cp->mems_allowed, cp->effective_mems));
                update_tasks_nodemask(cp);
  
                rcu_read_lock();
@@@ -1156,8 -1187,8 +1187,8 @@@ static int update_nodemask(struct cpuse
                        goto done;
  
                if (!nodes_subset(trialcs->mems_allowed,
-                               node_states[N_MEMORY])) {
-                       retval =  -EINVAL;
+                                 top_cpuset.mems_allowed)) {
+                       retval = -EINVAL;
                        goto done;
                }
        }
        cs->mems_allowed = trialcs->mems_allowed;
        mutex_unlock(&callback_mutex);
  
-       update_tasks_nodemask_hier(cs, true);
+       /* use trialcs->mems_allowed as a temp variable */
+       update_nodemasks_hier(cs, &trialcs->mems_allowed);
  done:
        return retval;
  }
  
  int current_cpuset_is_being_rebound(void)
  {
 -      return task_cs(current) == cpuset_being_rebound;
 +      int ret;
 +
 +      rcu_read_lock();
 +      ret = task_cs(current) == cpuset_being_rebound;
 +      rcu_read_unlock();
 +
 +      return ret;
  }
  
  static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@@ -1389,12 -1415,9 +1421,9 @@@ static int cpuset_can_attach(struct cgr
  
        mutex_lock(&cpuset_mutex);
  
-       /*
-        * We allow to move tasks into an empty cpuset if sane_behavior
-        * flag is set.
-        */
+       /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
-       if (!cgroup_sane_behavior(css->cgroup) &&
+       if (!cgroup_on_dfl(css->cgroup) &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;
  
@@@ -1452,8 -1475,6 +1481,6 @@@ static void cpuset_attach(struct cgroup
        struct task_struct *leader = cgroup_taskset_first(tset);
        struct cpuset *cs = css_cs(css);
        struct cpuset *oldcs = cpuset_attach_old_cs;
-       struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
-       struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
  
        mutex_lock(&cpuset_mutex);
  
        if (cs == &top_cpuset)
                cpumask_copy(cpus_attach, cpu_possible_mask);
        else
-               guarantee_online_cpus(cpus_cs, cpus_attach);
+               guarantee_online_cpus(cs, cpus_attach);
  
-       guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+       guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
  
        cgroup_taskset_for_each(task, tset) {
                /*
         * Change mm, possibly for multiple threads in a threadgroup. This is
         * expensive and may sleep.
         */
-       cpuset_attach_nodemask_to = cs->mems_allowed;
+       cpuset_attach_nodemask_to = cs->effective_mems;
        mm = get_task_mm(leader);
        if (mm) {
-               struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
                mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
  
                /*
                 * mm from.
                 */
                if (is_memory_migrate(cs)) {
-                       cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+                       cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                          &cpuset_attach_nodemask_to);
                }
                mmput(mm);
@@@ -1516,6 -1535,8 +1541,8 @@@ typedef enum 
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
+       FILE_EFFECTIVE_CPULIST,
+       FILE_EFFECTIVE_MEMLIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
@@@ -1623,17 -1644,7 +1650,17 @@@ static ssize_t cpuset_write_resmask(str
         * resources, wait for the previously scheduled operations before
         * proceeding, so that we don't keep removing tasks added
         * after execution capability is restored.
 +       *
 +       * cpuset_hotplug_work calls back into cgroup core via
 +       * cgroup_transfer_tasks() and waiting for it from a cgroupfs
 +       * operation like this one can lead to a deadlock through kernfs
 +       * active_ref protection.  Let's break the protection.  Losing the
 +       * protection is okay as we check whether @cs is online after
 +       * grabbing cpuset_mutex anyway.  This only happens on the legacy
 +       * hierarchies.
         */
 +      css_get(&cs->css);
 +      kernfs_break_active_protection(of->kn);
        flush_work(&cpuset_hotplug_work);
  
        mutex_lock(&cpuset_mutex);
        free_trial_cpuset(trialcs);
  out_unlock:
        mutex_unlock(&cpuset_mutex);
 +      kernfs_unbreak_active_protection(of->kn);
 +      css_put(&cs->css);
        return retval ?: nbytes;
  }
  
@@@ -1694,6 -1703,12 +1721,12 @@@ static int cpuset_common_seq_show(struc
        case FILE_MEMLIST:
                s += nodelist_scnprintf(s, count, cs->mems_allowed);
                break;
+       case FILE_EFFECTIVE_CPULIST:
+               s += cpulist_scnprintf(s, count, cs->effective_cpus);
+               break;
+       case FILE_EFFECTIVE_MEMLIST:
+               s += nodelist_scnprintf(s, count, cs->effective_mems);
+               break;
        default:
                ret = -EINVAL;
                goto out_unlock;
@@@ -1778,6 -1793,18 +1811,18 @@@ static struct cftype files[] = 
                .private = FILE_MEMLIST,
        },
  
+       {
+               .name = "effective_cpus",
+               .seq_show = cpuset_common_seq_show,
+               .private = FILE_EFFECTIVE_CPULIST,
+       },
+       {
+               .name = "effective_mems",
+               .seq_show = cpuset_common_seq_show,
+               .private = FILE_EFFECTIVE_MEMLIST,
+       },
        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
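
The two new read-only entries simply expose the computed masks to userspace. A minimal sketch of reading them, assuming a legacy cpuset mount at /sys/fs/cgroup/cpuset and a child group named "mygroup" (both paths are assumptions for illustration, not something defined by this patch):

#include <stdio.h>

/*
 * Print one cpuset file.  The mount point and group name used in main()
 * are assumptions for illustration; adjust them to the local setup.
 */
static void print_file(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	print_file("/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_cpus");
	print_file("/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_mems");
	return 0;
}
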
@@@ -1869,18 -1896,26 +1914,26 @@@ cpuset_css_alloc(struct cgroup_subsys_s
        cs = kzalloc(sizeof(*cs), GFP_KERNEL);
        if (!cs)
                return ERR_PTR(-ENOMEM);
-       if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
-               kfree(cs);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+               goto free_cs;
+       if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+               goto free_cpus;
  
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        cpumask_clear(cs->cpus_allowed);
        nodes_clear(cs->mems_allowed);
+       cpumask_clear(cs->effective_cpus);
+       nodes_clear(cs->effective_mems);
        fmeter_init(&cs->fmeter);
        cs->relax_domain_level = -1;
  
        return &cs->css;
+ free_cpus:
+       free_cpumask_var(cs->cpus_allowed);
+ free_cs:
+       kfree(cs);
+       return ERR_PTR(-ENOMEM);
  }
  
  static int cpuset_css_online(struct cgroup_subsys_state *css)
  
        cpuset_inc();
  
+       mutex_lock(&callback_mutex);
+       if (cgroup_on_dfl(cs->css.cgroup)) {
+               cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+               cs->effective_mems = parent->effective_mems;
+       }
+       mutex_unlock(&callback_mutex);
        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                goto out_unlock;
  
@@@ -1962,20 -2004,40 +2022,40 @@@ static void cpuset_css_free(struct cgro
  {
        struct cpuset *cs = css_cs(css);
  
+       free_cpumask_var(cs->effective_cpus);
        free_cpumask_var(cs->cpus_allowed);
        kfree(cs);
  }
  
+ static void cpuset_bind(struct cgroup_subsys_state *root_css)
+ {
+       mutex_lock(&cpuset_mutex);
+       mutex_lock(&callback_mutex);
+       if (cgroup_on_dfl(root_css->cgroup)) {
+               cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+               top_cpuset.mems_allowed = node_possible_map;
+       } else {
+               cpumask_copy(top_cpuset.cpus_allowed,
+                            top_cpuset.effective_cpus);
+               top_cpuset.mems_allowed = top_cpuset.effective_mems;
+       }
+       mutex_unlock(&callback_mutex);
+       mutex_unlock(&cpuset_mutex);
+ }
+
  struct cgroup_subsys cpuset_cgrp_subsys = {
-       .css_alloc = cpuset_css_alloc,
-       .css_online = cpuset_css_online,
-       .css_offline = cpuset_css_offline,
-       .css_free = cpuset_css_free,
-       .can_attach = cpuset_can_attach,
-       .cancel_attach = cpuset_cancel_attach,
-       .attach = cpuset_attach,
-       .base_cftypes = files,
-       .early_init = 1,
+       .css_alloc      = cpuset_css_alloc,
+       .css_online     = cpuset_css_online,
+       .css_offline    = cpuset_css_offline,
+       .css_free       = cpuset_css_free,
+       .can_attach     = cpuset_can_attach,
+       .cancel_attach  = cpuset_cancel_attach,
+       .attach         = cpuset_attach,
+       .bind           = cpuset_bind,
+       .legacy_cftypes = files,
+       .early_init     = 1,
  };
  
  /**
@@@ -1990,9 -2052,13 +2070,13 @@@ int __init cpuset_init(void
  
        if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
                BUG();
+       if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+               BUG();
  
        cpumask_setall(top_cpuset.cpus_allowed);
        nodes_setall(top_cpuset.mems_allowed);
+       cpumask_setall(top_cpuset.effective_cpus);
+       nodes_setall(top_cpuset.effective_mems);
  
        fmeter_init(&top_cpuset.fmeter);
        set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@@ -2035,6 -2101,66 +2119,66 @@@ static void remove_tasks_in_empty_cpuse
        }
  }
  
+ static void
+ hotplug_update_tasks_legacy(struct cpuset *cs,
+                           struct cpumask *new_cpus, nodemask_t *new_mems,
+                           bool cpus_updated, bool mems_updated)
+ {
+       bool is_empty;
+       mutex_lock(&callback_mutex);
+       cpumask_copy(cs->cpus_allowed, new_cpus);
+       cpumask_copy(cs->effective_cpus, new_cpus);
+       cs->mems_allowed = *new_mems;
+       cs->effective_mems = *new_mems;
+       mutex_unlock(&callback_mutex);
+       /*
+        * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+        * as the tasks will be migrated to an ancestor.
+        */
+       if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+               update_tasks_cpumask(cs);
+       if (mems_updated && !nodes_empty(cs->mems_allowed))
+               update_tasks_nodemask(cs);
+       is_empty = cpumask_empty(cs->cpus_allowed) ||
+                  nodes_empty(cs->mems_allowed);
+       mutex_unlock(&cpuset_mutex);
+       /*
+        * Move tasks to the nearest ancestor with execution resources.
+        * This is a full cgroup operation which will also call back into
+        * cpuset, so it should be done outside any lock.
+        */
+       if (is_empty)
+               remove_tasks_in_empty_cpuset(cs);
+       mutex_lock(&cpuset_mutex);
+ }
+
+ static void
+ hotplug_update_tasks(struct cpuset *cs,
+                    struct cpumask *new_cpus, nodemask_t *new_mems,
+                    bool cpus_updated, bool mems_updated)
+ {
+       if (cpumask_empty(new_cpus))
+               cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+       if (nodes_empty(*new_mems))
+               *new_mems = parent_cs(cs)->effective_mems;
+       mutex_lock(&callback_mutex);
+       cpumask_copy(cs->effective_cpus, new_cpus);
+       cs->effective_mems = *new_mems;
+       mutex_unlock(&callback_mutex);
+       if (cpus_updated)
+               update_tasks_cpumask(cs);
+       if (mems_updated)
+               update_tasks_nodemask(cs);
+ }
+
  /**
   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
   * @cs: cpuset in interest
   */
  static void cpuset_hotplug_update_tasks(struct cpuset *cs)
  {
-       static cpumask_t off_cpus;
-       static nodemask_t off_mems;
-       bool is_empty;
-       bool sane = cgroup_sane_behavior(cs->css.cgroup);
+       static cpumask_t new_cpus;
+       static nodemask_t new_mems;
+       bool cpus_updated;
+       bool mems_updated;
  retry:
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
  
                goto retry;
        }
  
-       cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
-       nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-       mutex_lock(&callback_mutex);
-       cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-       mutex_unlock(&callback_mutex);
-       /*
-        * If sane_behavior flag is set, we need to update tasks' cpumask
-        * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
-        * call update_tasks_cpumask() if the cpuset becomes empty, as
-        * the tasks in it will be migrated to an ancestor.
-        */
-       if ((sane && cpumask_empty(cs->cpus_allowed)) ||
-           (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
-               update_tasks_cpumask(cs);
+       cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+       nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
  
-       mutex_lock(&callback_mutex);
-       nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-       mutex_unlock(&callback_mutex);
-       /*
-        * If sane_behavior flag is set, we need to update tasks' nodemask
-        * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
-        * call update_tasks_nodemask() if the cpuset becomes empty, as
-        * the tasks in it will be migratd to an ancestor.
-        */
-       if ((sane && nodes_empty(cs->mems_allowed)) ||
-           (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
-               update_tasks_nodemask(cs);
+       cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+       mems_updated = !nodes_equal(new_mems, cs->effective_mems);
  
-       is_empty = cpumask_empty(cs->cpus_allowed) ||
-               nodes_empty(cs->mems_allowed);
+       if (cgroup_on_dfl(cs->css.cgroup))
+               hotplug_update_tasks(cs, &new_cpus, &new_mems,
+                                    cpus_updated, mems_updated);
+       else
+               hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+                                           cpus_updated, mems_updated);
  
        mutex_unlock(&cpuset_mutex);
-       /*
-        * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
-        *
-        * Otherwise move tasks to the nearest ancestor with execution
-        * resources.  This is full cgroup operation which will
-        * also call back into cpuset.  Should be done outside any lock.
-        */
-       if (!sane && is_empty)
-               remove_tasks_in_empty_cpuset(cs);
  }
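
The refactored hotplug path makes the difference between the two hierarchies explicit: on the default hierarchy only the effective masks are recomputed (falling back to the parent when everything goes offline) and tasks stay where they are, while on the legacy hierarchy the configured masks themselves are overwritten and tasks are moved out of a cpuset that becomes empty. A rough userspace model of the two rules, with invented names and 64-bit words standing in for cpumasks:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustration only; this is not a kernel structure. */
struct cs_model {
	uint64_t cpus_allowed;		/* user-configured mask */
	uint64_t effective_cpus;	/* what tasks actually run on */
};

/*
 * Rough model of cpuset_hotplug_update_tasks(): the default-hierarchy path
 * only recomputes effective_cpus and falls back to the parent, while the
 * legacy path also overwrites cpus_allowed, losing the configuration.
 */
static void hotplug_model(struct cs_model *cs, uint64_t parent_effective,
			  bool on_dfl)
{
	uint64_t new_cpus = cs->cpus_allowed & parent_effective;

	if (on_dfl) {
		cs->effective_cpus = new_cpus ? new_cpus : parent_effective;
	} else {
		cs->cpus_allowed = new_cpus;
		cs->effective_cpus = new_cpus;
		/* if new_cpus is empty, tasks would move to an ancestor */
	}
}

int main(void)
{
	struct cs_model legacy = { .cpus_allowed = 0x30, .effective_cpus = 0x30 };
	struct cs_model dfl = legacy;

	/* CPUs 4-5 go offline; the parent's effective mask is CPUs 0-3. */
	hotplug_model(&legacy, 0x0f, false);
	hotplug_model(&dfl, 0x0f, true);

	printf("legacy: allowed=%#llx effective=%#llx\n",
	       (unsigned long long)legacy.cpus_allowed,
	       (unsigned long long)legacy.effective_cpus);
	printf("dfl:    allowed=%#llx effective=%#llx\n",
	       (unsigned long long)dfl.cpus_allowed,
	       (unsigned long long)dfl.effective_cpus);
	return 0;
}
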
  
  /**
@@@ -2132,6 -2226,7 +2244,7 @@@ static void cpuset_hotplug_workfn(struc
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
+       bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
  
        mutex_lock(&cpuset_mutex);
  
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];
  
-       cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-       mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+       cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+       mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
  
        /* synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
                mutex_lock(&callback_mutex);
-               cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+               if (!on_dfl)
+                       cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+               cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                mutex_unlock(&callback_mutex);
                /* we don't mess with cpumasks of tasks in top_cpuset */
        }
        /* synchronize mems_allowed to N_MEMORY */
        if (mems_updated) {
                mutex_lock(&callback_mutex);
-               top_cpuset.mems_allowed = new_mems;
+               if (!on_dfl)
+                       top_cpuset.mems_allowed = new_mems;
+               top_cpuset.effective_mems = new_mems;
                mutex_unlock(&callback_mutex);
                update_tasks_nodemask(&top_cpuset);
        }
@@@ -2228,6 -2327,9 +2345,9 @@@ void __init cpuset_init_smp(void
        top_cpuset.mems_allowed = node_states[N_MEMORY];
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
  
+       cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+       top_cpuset.effective_mems = node_states[N_MEMORY];
        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
  }
  
  
  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
  {
-       struct cpuset *cpus_cs;
        mutex_lock(&callback_mutex);
        rcu_read_lock();
-       cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
-       guarantee_online_cpus(cpus_cs, pmask);
+       guarantee_online_cpus(task_cs(tsk), pmask);
        rcu_read_unlock();
        mutex_unlock(&callback_mutex);
  }
  
  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
  {
-       struct cpuset *cpus_cs;
        rcu_read_lock();
-       cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
-       do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+       do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
        rcu_read_unlock();
  
        /*
@@@ -2299,13 -2395,11 +2413,11 @@@ void cpuset_init_current_mems_allowed(v
  
  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
  {
-       struct cpuset *mems_cs;
        nodemask_t mask;
  
        mutex_lock(&callback_mutex);
        rcu_read_lock();
-       mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-       guarantee_online_mems(mems_cs, &mask);
+       guarantee_online_mems(task_cs(tsk), &mask);
        rcu_read_unlock();
        mutex_unlock(&callback_mutex);
  
diff --combined kernel/sched/core.c
@@@ -4147,6 -4147,7 +4147,6 @@@ static void __cond_resched(void
  
  int __sched _cond_resched(void)
  {
 -      rcu_cond_resched();
        if (should_resched()) {
                __cond_resched();
                return 1;
@@@ -4165,15 -4166,18 +4165,15 @@@ EXPORT_SYMBOL(_cond_resched)
   */
  int __cond_resched_lock(spinlock_t *lock)
  {
 -      bool need_rcu_resched = rcu_should_resched();
        int resched = should_resched();
        int ret = 0;
  
        lockdep_assert_held(lock);
  
 -      if (spin_needbreak(lock) || resched || need_rcu_resched) {
 +      if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
                if (resched)
                        __cond_resched();
 -              else if (unlikely(need_rcu_resched))
 -                      rcu_resched();
                else
                        cpu_relax();
                ret = 1;
@@@ -4187,6 -4191,7 +4187,6 @@@ int __sched __cond_resched_softirq(void
  {
        BUG_ON(!in_softirq());
  
 -      rcu_cond_resched();  /* BH disabled OK, just recording QSes. */
        if (should_resched()) {
                local_bh_enable();
                __cond_resched();
@@@ -8083,7 -8088,7 +8083,7 @@@ struct cgroup_subsys cpu_cgrp_subsys = 
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
-       .base_cftypes   = cpu_files,
+       .legacy_cftypes = cpu_files,
        .early_init     = 1,
  };
  
diff --combined mm/memcontrol.c
@@@ -5415,12 -5415,8 +5415,12 @@@ static int mem_cgroup_oom_notify_cb(str
  {
        struct mem_cgroup_eventfd_list *ev;
  
 +      spin_lock(&memcg_oom_lock);
 +
        list_for_each_entry(ev, &memcg->oom_notify, list)
                eventfd_signal(ev->eventfd, 1);
 +
 +      spin_unlock(&memcg_oom_lock);
        return 0;
  }
  
@@@ -6007,7 -6003,6 +6007,6 @@@ static struct cftype mem_cgroup_files[
        },
        {
                .name = "use_hierarchy",
-               .flags = CFTYPE_INSANE,
                .write_u64 = mem_cgroup_hierarchy_write,
                .read_u64 = mem_cgroup_hierarchy_read,
        },
@@@ -6411,6 -6406,29 +6410,29 @@@ static void mem_cgroup_css_free(struct 
        __mem_cgroup_free(memcg);
  }
  
+ /**
+  * mem_cgroup_css_reset - reset the states of a mem_cgroup
+  * @css: the target css
+  *
+  * Reset the states of the mem_cgroup associated with @css.  This is
+  * Reset the states of the mem_cgroup associated with @css.  This is
+  * invoked when userland requests disabling the controller on the default
+  * hierarchy but the memcg is still pinned through a dependency.  The
+  * memcg should stop applying policies and revert to the vanilla state,
+  * as it may be made visible again.
+  * The current implementation only resets the essential configurations.
+  * This needs to be expanded to cover all the visible parts.
+  */
+ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       mem_cgroup_resize_limit(memcg, ULLONG_MAX);
+       mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
+       memcg_update_kmem_limit(memcg, ULLONG_MAX);
+       res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
+ }
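
In userspace terms, the reset above amounts to lifting the memory limits back to "unlimited" so a disabled-but-pinned memcg no longer constrains anything. A hedged sketch of the equivalent manual action on the legacy memory hierarchy; the cgroup path is an assumption for illustration and is not introduced by this patch:

#include <stdio.h>

/*
 * Rough userspace analogue of mem_cgroup_css_reset(): lift the memory
 * limit back to "unlimited".  The cgroup path is an assumption; adjust
 * it to the local mount point and group name.
 */
int main(void)
{
	const char *path =
		"/sys/fs/cgroup/memory/mygroup/memory.limit_in_bytes";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Writing -1 removes the limit on the legacy memory controller. */
	fprintf(f, "-1\n");
	fclose(f);
	return 0;
}
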
  #ifdef CONFIG_MMU
  /* Handlers for move charge at task migration. */
  #define PRECHARGE_COUNT_AT_ONCE       256
@@@ -7005,16 -7023,17 +7027,17 @@@ static void mem_cgroup_move_task(struc
  
  /*
   * Cgroup retains root cgroups across [un]mount cycles making it necessary
-  * to verify sane_behavior flag on each mount attempt.
+  * to verify whether we're attached to the default hierarchy on each mount
+  * attempt.
   */
  static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
  {
        /*
-        * use_hierarchy is forced with sane_behavior.  cgroup core
+        * use_hierarchy is forced on the default hierarchy.  cgroup core
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
-       if (cgroup_sane_behavior(root_css->cgroup))
+       if (cgroup_on_dfl(root_css->cgroup))
                mem_cgroup_from_css(root_css)->use_hierarchy = true;
  }
  
@@@ -7023,11 -7042,12 +7046,12 @@@ struct cgroup_subsys memory_cgrp_subsy
        .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
        .css_free = mem_cgroup_css_free,
+       .css_reset = mem_cgroup_css_reset,
        .can_attach = mem_cgroup_can_attach,
        .cancel_attach = mem_cgroup_cancel_attach,
        .attach = mem_cgroup_move_task,
        .bind = mem_cgroup_bind,
-       .base_cftypes = mem_cgroup_files,
+       .legacy_cftypes = mem_cgroup_files,
        .early_init = 0,
  };
  
@@@ -7044,7 -7064,8 +7068,8 @@@ __setup("swapaccount=", enable_swap_acc
  
  static void __init memsw_file_init(void)
  {
-       WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
+       WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+                                         memsw_cgroup_files));
  }
  
  static void __init enable_swap_cgroup(void)