Merge branch 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 4 Aug 2014 17:11:28 +0000 (10:11 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 4 Aug 2014 17:11:28 +0000 (10:11 -0700)
Pull cgroup changes from Tejun Heo:
 "Mostly changes to get the v2 interface ready.  The core features are
  mostly ready now and I think it's reasonable to expect to drop the
  devel mask in one or two devel cycles at least for a subset of
  controllers.

   - cgroup added a controller dependency mechanism so that the block
     cgroup can depend on the memory cgroup.  This will be used to
     finally support IO provisioning for writeback traffic, which is
     currently being implemented (a minimal sketch of the new
     ->depends_on field follows this message).

   - The v2 interface now uses a separate cftype table so that the
     files for the new interface are explicitly declared in one place
     (->dfl_cftypes vs ->legacy_cftypes, also sketched after this
     message).  Each controller will review and add its files for the
     new interface explicitly.

   - cpuset is getting ready for hierarchical behavior in a style
     similar to the other controllers, so that an ancestor's
     configuration change doesn't irreversibly change the descendants'
     configurations and processes aren't silently migrated when a CPU
     or node goes down (a sketch of the new effective-mask rule
     precedes the kernel/cpuset.c diff below).

  All the changes are to the new interface and no behavior changed for
  the multiple hierarchies"
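
A minimal sketch of the first two points above, using a hypothetical controller
"foo".  All foo_* names and handlers are made up for illustration (a real
controller must also be listed in include/linux/cgroup_subsys.h); the
->dfl_cftypes, ->legacy_cftypes and ->depends_on fields are the ones
introduced or renamed by this series, mirroring the blkio and debug controller
hunks below:

	/* hypothetical controller "foo"; the foo_* handlers are assumed, not real */
	static struct cftype foo_dfl_files[] = {
		{
			.name = "foo.max",		/* illustrative v2-style knob */
			.seq_show = foo_max_show,	/* assumed handler */
			.write = foo_max_write,		/* assumed handler */
		},
		{ }	/* terminate */
	};

	static struct cftype foo_legacy_files[] = {
		{
			.name = "max",			/* illustrative v1-style knob */
			.read_u64 = foo_max_read_u64,	/* assumed handler */
			.write_u64 = foo_max_write_u64,	/* assumed handler */
		},
		{ }	/* terminate */
	};

	struct cgroup_subsys foo_cgrp_subsys = {
		.dfl_cftypes	= foo_dfl_files,	/* used only on the default hierarchy */
		.legacy_cftypes	= foo_legacy_files,	/* used only on legacy hierarchies */
		/*
		 * implicitly enable memcg wherever foo is enabled on the default
		 * hierarchy (the real blkio change below guards this with CONFIG_MEMCG)
		 */
		.depends_on	= 1 << memory_cgrp_id,
	};

If ->dfl_cftypes is left NULL, cgroup_init() in the kernel/cgroup.c diff below
adds the controller to cgrp_dfl_root_inhibit_ss_mask, i.e. it simply isn't
offered on the default hierarchy unless the cgroup__DEVEL__legacy_files_on_dfl
boot parameter is used.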

* 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (29 commits)
  cpuset: fix the WARN_ON() in update_nodemasks_hier()
  cgroup: initialize cgrp_dfl_root_inhibit_ss_mask from !->dfl_files test
  cgroup: make CFTYPE_ONLY_ON_DFL and CFTYPE_NO_ internal to cgroup core
  cgroup: distinguish the default and legacy hierarchies when handling cftypes
  cgroup: replace cgroup_add_cftypes() with cgroup_add_legacy_cftypes()
  cgroup: rename cgroup_subsys->base_cftypes to ->legacy_cftypes
  cgroup: split cgroup_base_files[] into cgroup_{dfl|legacy}_base_files[]
  cpuset: export effective masks to userspace
  cpuset: allow writing offlined masks to cpuset.cpus/mems
  cpuset: enable onlined cpu/node in effective masks
  cpuset: refactor cpuset_hotplug_update_tasks()
  cpuset: make cs->{cpus, mems}_allowed as user-configured masks
  cpuset: apply cs->effective_{cpus,mems}
  cpuset: initialize top_cpuset's configured masks at mount
  cpuset: use effective cpumask to build sched domains
  cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
  cpuset: update cs->effective_{cpus, mems} when config changes
  cpuset: update cpuset->effective_{cpus,mems} at hotplug
  cpuset: add cs->effective_cpus and cs->effective_mems
  cgroup: clean up sane_behavior handling
  ...

block/blk-cgroup.c
kernel/cgroup.c
kernel/cpuset.c
kernel/sched/core.c
mm/memcontrol.c

diff --combined block/blk-cgroup.c
@@@ -80,7 -80,7 +80,7 @@@ static struct blkcg_gq *blkg_alloc(stru
        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
 -      blkg->refcnt = 1;
 +      atomic_set(&blkg->refcnt, 1);
  
        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
        if (blkcg != &blkcg_root) {
@@@ -399,8 -399,11 +399,8 @@@ void __blkg_release_rcu(struct rcu_hea
  
        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
 -      if (blkg->parent) {
 -              spin_lock_irq(blkg->q->queue_lock);
 +      if (blkg->parent)
                blkg_put(blkg->parent);
 -              spin_unlock_irq(blkg->q->queue_lock);
 -      }
  
        blkg_free(blkg);
  }
@@@ -872,13 -875,6 +872,13 @@@ void blkcg_drain_queue(struct request_q
  {
        lockdep_assert_held(q->queue_lock);
  
 +      /*
 +       * @q could be exiting and already have destroyed all blkgs as
 +       * indicated by NULL root_blkg.  If so, don't confuse policies.
 +       */
 +      if (!q->root_blkg)
 +              return;
 +
        blk_throtl_drain(q);
  }
  
@@@ -928,7 -924,15 +928,15 @@@ struct cgroup_subsys blkio_cgrp_subsys 
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
-       .base_cftypes = blkcg_files,
+       .legacy_cftypes = blkcg_files,
+ #ifdef CONFIG_MEMCG
+       /*
+        * This ensures that, if available, memcg is automatically enabled
+        * together on the default hierarchy so that the owner cgroup can
+        * be retrieved from writeback pages.
+        */
+       .depends_on = 1 << memory_cgrp_id,
+ #endif
  };
  EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
  
@@@ -1097,7 -1101,7 +1105,7 @@@ EXPORT_SYMBOL_GPL(blkcg_deactivate_poli
   * Register @pol with blkcg core.  Might sleep and @pol may be modified on
   * successful registration.  Returns 0 on success and -errno on failure.
   */
 -int __init blkcg_policy_register(struct blkcg_policy *pol)
 +int blkcg_policy_register(struct blkcg_policy *pol)
  {
        int i, ret;
  
  
        /* everything is in place, add intf files for the new policy */
        if (pol->cftypes)
-               WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));
+               WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
+                                                 pol->cftypes));
        ret = 0;
  out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
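
blkcg_policy_register() losing its __init annotation in the hunk above
suggests that block policies can now be registered at module init, with their
cftypes landing on the legacy hierarchies via cgroup_add_legacy_cftypes().  A
minimal sketch under that assumption; all foo_* names are hypothetical and
only the ->cftypes field visible in this diff is shown:

	static struct cftype foo_blkcg_files[] = {
		{
			.name = "foo.weight",		/* illustrative knob */
			.read_u64 = foo_weight_read,	/* assumed handler */
			.write_u64 = foo_weight_write,	/* assumed handler */
		},
		{ }	/* terminate */
	};

	static struct blkcg_policy foo_blkcg_policy = {
		.cftypes	= foo_blkcg_files,	/* added as legacy cftypes on registration */
		/* per-policy pd_* callbacks omitted in this sketch */
	};

	static int __init foo_init(void)
	{
		return blkcg_policy_register(&foo_blkcg_policy);
	}

	static void __exit foo_exit(void)
	{
		blkcg_policy_unregister(&foo_blkcg_policy);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");
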
diff --combined kernel/cgroup.c
@@@ -149,12 -149,14 +149,14 @@@ struct cgroup_root cgrp_dfl_root
   */
  static bool cgrp_dfl_root_visible;
  
+ /*
+  * Set by the boot param of the same name; it makes subsystems with a NULL
+  * ->dfl_cftypes use their ->legacy_cftypes on the default hierarchy.
+  */
+ static bool cgroup_legacy_files_on_dfl;
  /* some controllers are not supported in the default hierarchy */
- static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
- #ifdef CONFIG_CGROUP_DEBUG
-       | (1 << debug_cgrp_id)
- #endif
-       ;
+ static unsigned int cgrp_dfl_root_inhibit_ss_mask;
  
  /* The list of hierarchy roots */
  
@@@ -180,13 -182,15 +182,15 @@@ static u64 css_serial_nr_next = 1
   */
  static int need_forkexit_callback __read_mostly;
  
- static struct cftype cgroup_base_files[];
+ static struct cftype cgroup_dfl_base_files[];
+ static struct cftype cgroup_legacy_base_files[];
  
  static void cgroup_put(struct cgroup *cgrp);
  static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned int ss_mask);
  static int cgroup_destroy_locked(struct cgroup *cgrp);
- static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+                     bool visible);
  static void css_release(struct percpu_ref *ref);
  static void kill_css(struct cgroup_subsys_state *css);
  static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@@ -1036,6 -1040,58 +1040,58 @@@ static void cgroup_put(struct cgroup *c
        css_put(&cgrp->self);
  }
  
+ /**
+  * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+  * @cgrp: the target cgroup
+  *
+  * On the default hierarchy, a subsystem may request other subsystems to be
+  * enabled together through its ->depends_on mask.  In such cases, more
+  * subsystems than specified in "cgroup.subtree_control" may be enabled.
+  *
+  * This function determines which subsystems need to be enabled given the
+  * current @cgrp->subtree_control and records it in
+  * @cgrp->child_subsys_mask.  The resulting mask is always a superset of
+  * @cgrp->subtree_control and follows the usual hierarchy rules.
+  */
+ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+ {
+       struct cgroup *parent = cgroup_parent(cgrp);
+       unsigned int cur_ss_mask = cgrp->subtree_control;
+       struct cgroup_subsys *ss;
+       int ssid;
+       lockdep_assert_held(&cgroup_mutex);
+       if (!cgroup_on_dfl(cgrp)) {
+               cgrp->child_subsys_mask = cur_ss_mask;
+               return;
+       }
+       while (true) {
+               unsigned int new_ss_mask = cur_ss_mask;
+               for_each_subsys(ss, ssid)
+                       if (cur_ss_mask & (1 << ssid))
+                               new_ss_mask |= ss->depends_on;
+               /*
+                * Mask out subsystems which aren't available.  This can
+                * happen only if some depended-upon subsystems were bound
+                * to non-default hierarchies.
+                */
+               if (parent)
+                       new_ss_mask &= parent->child_subsys_mask;
+               else
+                       new_ss_mask &= cgrp->root->subsys_mask;
+               if (new_ss_mask == cur_ss_mask)
+                       break;
+               cur_ss_mask = new_ss_mask;
+       }
+       cgrp->child_subsys_mask = cur_ss_mask;
+ }
  /**
   * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
   * @kn: the kernfs_node being serviced
@@@ -1208,12 -1264,15 +1264,15 @@@ static int rebind_subsystems(struct cgr
                up_write(&css_set_rwsem);
  
                src_root->subsys_mask &= ~(1 << ssid);
-               src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+               src_root->cgrp.subtree_control &= ~(1 << ssid);
+               cgroup_refresh_child_subsys_mask(&src_root->cgrp);
  
                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
-               if (dst_root != &cgrp_dfl_root)
-                       dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+               if (dst_root != &cgrp_dfl_root) {
+                       dst_root->cgrp.subtree_control |= 1 << ssid;
+                       cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+               }
  
                if (ss->bind)
                        ss->bind(css);
@@@ -1233,8 -1292,6 +1292,6 @@@ static int cgroup_show_options(struct s
        for_each_subsys(ss, ssid)
                if (root->subsys_mask & (1 << ssid))
                        seq_printf(seq, ",%s", ss->name);
-       if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
-               seq_puts(seq, ",sane_behavior");
        if (root->flags & CGRP_ROOT_NOPREFIX)
                seq_puts(seq, ",noprefix");
        if (root->flags & CGRP_ROOT_XATTR)
@@@ -1268,6 -1325,7 +1325,7 @@@ static int parse_cgroupfs_options(char 
        bool all_ss = false, one_ss = false;
        unsigned int mask = -1U;
        struct cgroup_subsys *ss;
+       int nr_opts = 0;
        int i;
  
  #ifdef CONFIG_CPUSETS
        memset(opts, 0, sizeof(*opts));
  
        while ((token = strsep(&o, ",")) != NULL) {
+               nr_opts++;
                if (!*token)
                        return -EINVAL;
                if (!strcmp(token, "none")) {
                        return -ENOENT;
        }
  
-       /* Consistency checks */
        if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
                pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-               if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
-                   opts->cpuset_clone_children || opts->release_agent ||
-                   opts->name) {
-                       pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+               if (nr_opts != 1) {
+                       pr_err("sane_behavior: no other mount options allowed\n");
                        return -EINVAL;
                }
-       } else {
-               /*
-                * If the 'all' option was specified select all the
-                * subsystems, otherwise if 'none', 'name=' and a subsystem
-                * name options were not specified, let's default to 'all'
-                */
-               if (all_ss || (!one_ss && !opts->none && !opts->name))
-                       for_each_subsys(ss, i)
-                               if (!ss->disabled)
-                                       opts->subsys_mask |= (1 << i);
-               /*
-                * We either have to specify by name or by subsystems. (So
-                * all empty hierarchies must have a name).
-                */
-               if (!opts->subsys_mask && !opts->name)
-                       return -EINVAL;
+               return 0;
        }
  
+       /*
+        * If the 'all' option was specified select all the subsystems,
+        * otherwise if 'none', 'name=' and a subsystem name options were
+        * not specified, let's default to 'all'
+        */
+       if (all_ss || (!one_ss && !opts->none && !opts->name))
+               for_each_subsys(ss, i)
+                       if (!ss->disabled)
+                               opts->subsys_mask |= (1 << i);
+       /*
+        * We either have to specify by name or by subsystems. (So all
+        * empty hierarchies must have a name).
+        */
+       if (!opts->subsys_mask && !opts->name)
+               return -EINVAL;
        /*
         * Option noprefix was introduced just for backward compatibility
         * with the old cpuset, so we allow noprefix only if mounting just
        if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
                return -EINVAL;
  
        /* Can't specify "none" and some subsystems */
        if (opts->subsys_mask && opts->none)
                return -EINVAL;
@@@ -1414,8 -1469,8 +1469,8 @@@ static int cgroup_remount(struct kernfs
        struct cgroup_sb_opts opts;
        unsigned int added_mask, removed_mask;
  
-       if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
-               pr_err("sane_behavior: remount is not allowed\n");
+       if (root == &cgrp_dfl_root) {
+               pr_err("remount is not allowed\n");
                return -EINVAL;
        }
  
        removed_mask = root->subsys_mask & ~opts.subsys_mask;
  
        /* Don't allow flags or name to change at remount */
-       if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+       if ((opts.flags ^ root->flags) ||
            (opts.name && strcmp(opts.name, root->name))) {
                pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
-                      opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
-                      root->flags & CGRP_ROOT_OPTION_MASK, root->name);
+                      opts.flags, opts.name ?: "", root->flags, root->name);
                ret = -EINVAL;
                goto out_unlock;
        }
@@@ -1563,6 -1617,7 +1617,7 @@@ static int cgroup_setup_root(struct cgr
  {
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
+       struct cftype *base_files;
        struct css_set *cset;
        int i, ret;
  
        }
        root_cgrp->kn = root->kf_root->kn;
  
-       ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+       if (root == &cgrp_dfl_root)
+               base_files = cgroup_dfl_base_files;
+       else
+               base_files = cgroup_legacy_base_files;
+       ret = cgroup_addrm_files(root_cgrp, base_files, true);
        if (ret)
                goto destroy_root;
  
@@@ -1638,7 -1698,7 +1698,7 @@@ destroy_root
  exit_root_id:
        cgroup_exit_root_id(root);
  cancel_ref:
 -      percpu_ref_cancel_init(&root_cgrp->self.refcnt);
 +      percpu_ref_exit(&root_cgrp->self.refcnt);
  out:
        free_cgrp_cset_links(&tmp_links);
        return ret;
@@@ -1648,13 -1708,10 +1708,13 @@@ static struct dentry *cgroup_mount(stru
                         int flags, const char *unused_dev_name,
                         void *data)
  {
 +      struct super_block *pinned_sb = NULL;
 +      struct cgroup_subsys *ss;
        struct cgroup_root *root;
        struct cgroup_sb_opts opts;
        struct dentry *dentry;
        int ret;
 +      int i;
        bool new_sb;
  
        /*
                goto out_unlock;
  
        /* look for a matching existing root */
-       if (!opts.subsys_mask && !opts.none && !opts.name) {
+       if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
                cgrp_dfl_root_visible = true;
                root = &cgrp_dfl_root;
                cgroup_get(&root->cgrp);
                goto out_unlock;
        }
  
 +      /*
 +       * Destruction of cgroup root is asynchronous, so subsystems may
 +       * still be dying after the previous unmount.  Let's drain the
 +       * dying subsystems.  We just need to ensure that the ones
 +       * unmounted previously finish dying and don't care about new ones
 +       * starting.  Testing ref liveliness is good enough.
 +       */
 +      for_each_subsys(ss, i) {
 +              if (!(opts.subsys_mask & (1 << i)) ||
 +                  ss->root == &cgrp_dfl_root)
 +                      continue;
 +
 +              if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
 +                      mutex_unlock(&cgroup_mutex);
 +                      msleep(10);
 +                      ret = restart_syscall();
 +                      goto out_free;
 +              }
 +              cgroup_put(&ss->root->cgrp);
 +      }
 +
        for_each_root(root) {
                bool name_match = false;
  
                        goto out_unlock;
                }
  
-               if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
-                       if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
-                               pr_err("sane_behavior: new mount options should match the existing superblock\n");
-                               ret = -EINVAL;
-                               goto out_unlock;
-                       } else {
-                               pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-                       }
-               }
+               if (root->flags ^ opts.flags)
+                       pr_warn("new mount options do not match the existing superblock, will be ignored\n");
  
                /*
 -               * A root's lifetime is governed by its root cgroup.
 -               * tryget_live failure indicate that the root is being
 -               * destroyed.  Wait for destruction to complete so that the
 -               * subsystems are free.  We can use wait_queue for the wait
 -               * but this path is super cold.  Let's just sleep for a bit
 -               * and retry.
 +               * We want to reuse @root whose lifetime is governed by its
 +               * ->cgrp.  Let's check whether @root is alive and keep it
 +               * that way.  As cgroup_kill_sb() can happen anytime, we
 +               * want to block it by pinning the sb so that @root doesn't
 +               * get killed before mount is complete.
 +               *
 +               * With the sb pinned, tryget_live can reliably indicate
 +               * whether @root can be reused.  If it's being killed,
 +               * drain it.  We can use wait_queue for the wait but this
 +               * path is super cold.  Let's just sleep a bit and retry.
                 */
 -              if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
 +              pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
 +              if (IS_ERR(pinned_sb) ||
 +                  !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                        mutex_unlock(&cgroup_mutex);
 +                      if (!IS_ERR_OR_NULL(pinned_sb))
 +                              deactivate_super(pinned_sb);
                        msleep(10);
                        ret = restart_syscall();
                        goto out_free;
@@@ -1802,16 -1823,6 +1855,16 @@@ out_free
                                CGROUP_SUPER_MAGIC, &new_sb);
        if (IS_ERR(dentry) || !new_sb)
                cgroup_put(&root->cgrp);
 +
 +      /*
 +       * If @pinned_sb, we're reusing an existing root and holding an
 +       * extra ref on its sb.  Mount is complete.  Put the extra ref.
 +       */
 +      if (pinned_sb) {
 +              WARN_ON(new_sb);
 +              deactivate_super(pinned_sb);
 +      }
 +
        return dentry;
  }
  
@@@ -2457,9 -2468,7 +2510,7 @@@ static int cgroup_release_agent_show(st
  
  static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
  {
-       struct cgroup *cgrp = seq_css(seq)->cgroup;
-       seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+       seq_puts(seq, "0\n");
        return 0;
  }
  
@@@ -2496,7 -2505,7 +2547,7 @@@ static int cgroup_controllers_show(stru
  {
        struct cgroup *cgrp = seq_css(seq)->cgroup;
  
-       cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+       cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
        return 0;
  }
  
@@@ -2505,7 -2514,7 +2556,7 @@@ static int cgroup_subtree_control_show(
  {
        struct cgroup *cgrp = seq_css(seq)->cgroup;
  
-       cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+       cgroup_print_ss_mask(seq, cgrp->subtree_control);
        return 0;
  }
  
@@@ -2611,6 -2620,7 +2662,7 @@@ static ssize_t cgroup_subtree_control_w
                                            loff_t off)
  {
        unsigned int enable = 0, disable = 0;
+       unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
        struct cgroup *cgrp, *child;
        struct cgroup_subsys *ss;
        char *tok;
  
        for_each_subsys(ss, ssid) {
                if (enable & (1 << ssid)) {
-                       if (cgrp->child_subsys_mask & (1 << ssid)) {
+                       if (cgrp->subtree_control & (1 << ssid)) {
                                enable &= ~(1 << ssid);
                                continue;
                        }
  
+                       /* unavailable or not enabled on the parent? */
+                       if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+                           (cgroup_parent(cgrp) &&
+                            !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+                               ret = -ENOENT;
+                               goto out_unlock;
+                       }
+                       /*
+                        * @ss is already enabled through dependency and
+                        * we'll just make it visible.  Skip draining.
+                        */
+                       if (cgrp->child_subsys_mask & (1 << ssid))
+                               continue;
                        /*
                         * Because css offlining is asynchronous, userland
                         * might try to re-enable the same controller while
  
                                return restart_syscall();
                        }
-                       /* unavailable or not enabled on the parent? */
-                       if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
-                           (cgroup_parent(cgrp) &&
-                            !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
-                               ret = -ENOENT;
-                               goto out_unlock;
-                       }
                } else if (disable & (1 << ssid)) {
-                       if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+                       if (!(cgrp->subtree_control & (1 << ssid))) {
                                disable &= ~(1 << ssid);
                                continue;
                        }
  
                        /* a child has it enabled? */
                        cgroup_for_each_live_child(child, cgrp) {
-                               if (child->child_subsys_mask & (1 << ssid)) {
+                               if (child->subtree_control & (1 << ssid)) {
                                        ret = -EBUSY;
                                        goto out_unlock;
                                }
        }
  
        /*
-        * Except for the root, child_subsys_mask must be zero for a cgroup
+        * Except for the root, subtree_control must be zero for a cgroup
         * with tasks so that child cgroups don't compete against tasks.
         */
        if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
        }
  
        /*
-        * Create csses for enables and update child_subsys_mask.  This
-        * changes cgroup_e_css() results which in turn makes the
-        * subsequent cgroup_update_dfl_csses() associate all tasks in the
-        * subtree to the updated csses.
+        * Update subsys masks and calculate what needs to be done.  More
+        * subsystems than specified may need to be enabled or disabled
+        * depending on subsystem dependencies.
+        */
+       cgrp->subtree_control |= enable;
+       cgrp->subtree_control &= ~disable;
+       old_ctrl = cgrp->child_subsys_mask;
+       cgroup_refresh_child_subsys_mask(cgrp);
+       new_ctrl = cgrp->child_subsys_mask;
+       css_enable = ~old_ctrl & new_ctrl;
+       css_disable = old_ctrl & ~new_ctrl;
+       enable |= css_enable;
+       disable |= css_disable;
+       /*
+        * Create new csses or make the existing ones visible.  A css is
+        * created invisible if it's being implicitly enabled through
+        * dependency.  An invisible css is made visible when the userland
+        * explicitly enables it.
         */
        for_each_subsys(ss, ssid) {
                if (!(enable & (1 << ssid)))
                        continue;
  
                cgroup_for_each_live_child(child, cgrp) {
-                       ret = create_css(child, ss);
+                       if (css_enable & (1 << ssid))
+                               ret = create_css(child, ss,
+                                       cgrp->subtree_control & (1 << ssid));
+                       else
+                               ret = cgroup_populate_dir(child, 1 << ssid);
                        if (ret)
                                goto err_undo_css;
                }
        }
  
-       cgrp->child_subsys_mask |= enable;
-       cgrp->child_subsys_mask &= ~disable;
+       /*
+        * At this point, cgroup_e_css() results reflect the new csses
+        * making the following cgroup_update_dfl_csses() properly update
+        * css associations of all tasks in the subtree.
+        */
        ret = cgroup_update_dfl_csses(cgrp);
        if (ret)
                goto err_undo_css;
  
-       /* all tasks are now migrated away from the old csses, kill them */
+       /*
+        * All tasks are migrated out of disabled csses.  Kill or hide
+        * them.  A css is hidden when the userland requests it to be
+        * disabled while other subsystems are still depending on it.  The
+        * css must not actively control resources and be in the vanilla
+        * state if it's made visible again later.  Controllers which may
+        * be depended upon should provide ->css_reset() for this purpose.
+        */
        for_each_subsys(ss, ssid) {
                if (!(disable & (1 << ssid)))
                        continue;
  
-               cgroup_for_each_live_child(child, cgrp)
-                       kill_css(cgroup_css(child, ss));
+               cgroup_for_each_live_child(child, cgrp) {
+                       struct cgroup_subsys_state *css = cgroup_css(child, ss);
+                       if (css_disable & (1 << ssid)) {
+                               kill_css(css);
+                       } else {
+                               cgroup_clear_dir(child, 1 << ssid);
+                               if (ss->css_reset)
+                                       ss->css_reset(css);
+                       }
+               }
        }
  
        kernfs_activate(cgrp->kn);
@@@ -2755,8 -2811,9 +2853,9 @@@ out_unlock
        return ret ?: nbytes;
  
  err_undo_css:
-       cgrp->child_subsys_mask &= ~enable;
-       cgrp->child_subsys_mask |= disable;
+       cgrp->subtree_control &= ~enable;
+       cgrp->subtree_control |= disable;
+       cgroup_refresh_child_subsys_mask(cgrp);
  
        for_each_subsys(ss, ssid) {
                if (!(enable & (1 << ssid)))
  
                cgroup_for_each_live_child(child, cgrp) {
                        struct cgroup_subsys_state *css = cgroup_css(child, ss);
-                       if (css)
+                       if (!css)
+                               continue;
+                       if (css_enable & (1 << ssid))
                                kill_css(css);
+                       else
+                               cgroup_clear_dir(child, 1 << ssid);
                }
        }
        goto out_unlock;
@@@ -2878,9 -2941,9 +2983,9 @@@ static int cgroup_rename(struct kernfs_
  
        /*
         * This isn't a proper migration and its usefulness is very
-        * limited.  Disallow if sane_behavior.
+        * limited.  Disallow on the default hierarchy.
         */
-       if (cgroup_sane_behavior(cgrp))
+       if (cgroup_on_dfl(cgrp))
                return -EPERM;
  
        /*
@@@ -2964,9 -3027,9 +3069,9 @@@ static int cgroup_addrm_files(struct cg
  
        for (cft = cfts; cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
-               if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+               if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
                        continue;
-               if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+               if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
                        continue;
                if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
                        continue;
@@@ -3024,6 -3087,9 +3129,9 @@@ static void cgroup_exit_cftypes(struct 
                        kfree(cft->kf_ops);
                cft->kf_ops = NULL;
                cft->ss = NULL;
+               /* revert flags set by cgroup core while adding @cfts */
+               cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
        }
  }
  
@@@ -3109,7 -3175,7 +3217,7 @@@ int cgroup_rm_cftypes(struct cftype *cf
   * function currently returns 0 as long as @cfts registration is successful
   * even if some file creation attempts on existing cgroups fail.
   */
- int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
  {
        int ret;
  
        return ret;
  }
  
+ /**
+  * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+  * @ss: target cgroup subsystem
+  * @cfts: zero-length name terminated array of cftypes
+  *
+  * Similar to cgroup_add_cftypes() but the added files are only used for
+  * the default hierarchy.
+  */
+ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ {
+       struct cftype *cft;
+       for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+               cft->flags |= __CFTYPE_ONLY_ON_DFL;
+       return cgroup_add_cftypes(ss, cfts);
+ }
+ /**
+  * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+  * @ss: target cgroup subsystem
+  * @cfts: zero-length name terminated array of cftypes
+  *
+  * Similar to cgroup_add_cftypes() but the added files are only used for
+  * the legacy hierarchies.
+  */
+ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ {
+       struct cftype *cft;
+       for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+               cft->flags |= __CFTYPE_NOT_ON_DFL;
+       return cgroup_add_cftypes(ss, cfts);
+ }
  /**
   * cgroup_task_count - count the number of tasks in a cgroup.
   * @cgrp: the cgroup in question
@@@ -3370,7 -3470,7 +3512,7 @@@ bool css_has_online_children(struct cgr
  
        rcu_read_lock();
        css_for_each_child(child, css) {
 -              if (css->flags & CSS_ONLINE) {
 +              if (child->flags & CSS_ONLINE) {
                        ret = true;
                        break;
                }
@@@ -3699,8 -3799,9 +3841,9 @@@ after
   *
   * All this extra complexity was caused by the original implementation
   * committing to an entirely unnecessary property.  In the long term, we
-  * want to do away with it.  Explicitly scramble sort order if
-  * sane_behavior so that no such expectation exists in the new interface.
+  * want to do away with it.  Explicitly scramble sort order if on the
+  * default hierarchy so that no such expectation exists in the new
+  * interface.
   *
   * Scrambling is done by swapping every two consecutive bits, which is
   * non-identity one-to-one mapping which disturbs sort order sufficiently.
@@@ -3715,7 -3816,7 +3858,7 @@@ static pid_t pid_fry(pid_t pid
  
  static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
  {
-       if (cgroup_sane_behavior(cgrp))
+       if (cgroup_on_dfl(cgrp))
                return pid_fry(pid);
        else
                return pid;
@@@ -3818,7 -3919,7 +3961,7 @@@ static int pidlist_array_load(struct cg
        css_task_iter_end(&it);
        length = n;
        /* now sort & (if procs) strip out duplicates */
-       if (cgroup_sane_behavior(cgrp))
+       if (cgroup_on_dfl(cgrp))
                sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
        else
                sort(array, length, sizeof(pid_t), cmppid, NULL);
@@@ -4040,7 -4141,8 +4183,8 @@@ static int cgroup_clone_children_write(
        return 0;
  }
  
- static struct cftype cgroup_base_files[] = {
+ /* cgroup core interface files for the default hierarchy */
+ static struct cftype cgroup_dfl_base_files[] = {
        {
                .name = "cgroup.procs",
                .seq_start = cgroup_pidlist_start,
                .write = cgroup_procs_write,
                .mode = S_IRUGO | S_IWUSR,
        },
-       {
-               .name = "cgroup.clone_children",
-               .flags = CFTYPE_INSANE,
-               .read_u64 = cgroup_clone_children_read,
-               .write_u64 = cgroup_clone_children_write,
-       },
-       {
-               .name = "cgroup.sane_behavior",
-               .flags = CFTYPE_ONLY_ON_ROOT,
-               .seq_show = cgroup_sane_behavior_show,
-       },
        {
                .name = "cgroup.controllers",
-               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+               .flags = CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_root_controllers_show,
        },
        {
                .name = "cgroup.controllers",
-               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_controllers_show,
        },
        {
                .name = "cgroup.subtree_control",
-               .flags = CFTYPE_ONLY_ON_DFL,
                .seq_show = cgroup_subtree_control_show,
                .write = cgroup_subtree_control_write,
        },
        {
                .name = "cgroup.populated",
-               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_populated_show,
        },
+       { }     /* terminate */
+ };
  
-       /*
-        * Historical crazy stuff.  These don't have "cgroup."  prefix and
-        * don't exist if sane_behavior.  If you're depending on these, be
-        * prepared to be burned.
-        */
+ /* cgroup core interface files for the legacy hierarchies */
+ static struct cftype cgroup_legacy_base_files[] = {
+       {
+               .name = "cgroup.procs",
+               .seq_start = cgroup_pidlist_start,
+               .seq_next = cgroup_pidlist_next,
+               .seq_stop = cgroup_pidlist_stop,
+               .seq_show = cgroup_pidlist_show,
+               .private = CGROUP_FILE_PROCS,
+               .write = cgroup_procs_write,
+               .mode = S_IRUGO | S_IWUSR,
+       },
+       {
+               .name = "cgroup.clone_children",
+               .read_u64 = cgroup_clone_children_read,
+               .write_u64 = cgroup_clone_children_write,
+       },
+       {
+               .name = "cgroup.sane_behavior",
+               .flags = CFTYPE_ONLY_ON_ROOT,
+               .seq_show = cgroup_sane_behavior_show,
+       },
        {
                .name = "tasks",
-               .flags = CFTYPE_INSANE,         /* use "procs" instead */
                .seq_start = cgroup_pidlist_start,
                .seq_next = cgroup_pidlist_next,
                .seq_stop = cgroup_pidlist_stop,
        },
        {
                .name = "notify_on_release",
-               .flags = CFTYPE_INSANE,
                .read_u64 = cgroup_read_notify_on_release,
                .write_u64 = cgroup_write_notify_on_release,
        },
        {
                .name = "release_agent",
-               .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+               .flags = CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_release_agent_show,
                .write = cgroup_release_agent_write,
                .max_write_len = PATH_MAX - 1,
@@@ -4175,8 -4282,6 +4324,8 @@@ static void css_free_work_fn(struct wor
                container_of(work, struct cgroup_subsys_state, destroy_work);
        struct cgroup *cgrp = css->cgroup;
  
 +      percpu_ref_exit(&css->refcnt);
 +
        if (css->ss) {
                /* css free path */
                if (css->parent)
@@@ -4316,12 -4421,14 +4465,14 @@@ static void offline_css(struct cgroup_s
   * create_css - create a cgroup_subsys_state
   * @cgrp: the cgroup new css will be associated with
   * @ss: the subsys of new css
+  * @visible: whether to create control knobs for the new css or not
   *
   * Create a new css associated with @cgrp - @ss pair.  On success, the new
-  * css is online and installed in @cgrp with all interface files created.
-  * Returns 0 on success, -errno on failure.
+  * css is online and installed in @cgrp with all interface files created if
+  * @visible.  Returns 0 on success, -errno on failure.
   */
- static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+                     bool visible)
  {
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
                goto err_free_percpu_ref;
        css->id = err;
  
-       err = cgroup_populate_dir(cgrp, 1 << ss->id);
-       if (err)
-               goto err_free_id;
+       if (visible) {
+               err = cgroup_populate_dir(cgrp, 1 << ss->id);
+               if (err)
+                       goto err_free_id;
+       }
  
        /* @css is ready to be brought online now, make it visible */
        list_add_tail_rcu(&css->sibling, &parent_css->children);
@@@ -4374,7 -4483,7 +4527,7 @@@ err_list_del
  err_free_id:
        cgroup_idr_remove(&ss->css_idr, css->id);
  err_free_percpu_ref:
 -      percpu_ref_cancel_init(&css->refcnt);
 +      percpu_ref_exit(&css->refcnt);
  err_free_css:
        call_rcu(&css->rcu_head, css_free_rcu_fn);
        return err;
@@@ -4387,6 -4496,7 +4540,7 @@@ static int cgroup_mkdir(struct kernfs_n
        struct cgroup_root *root;
        struct cgroup_subsys *ss;
        struct kernfs_node *kn;
+       struct cftype *base_files;
        int ssid, ret;
  
        parent = cgroup_kn_lock_live(parent_kn);
        if (ret)
                goto out_destroy;
  
-       ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+       if (cgroup_on_dfl(cgrp))
+               base_files = cgroup_dfl_base_files;
+       else
+               base_files = cgroup_legacy_base_files;
+       ret = cgroup_addrm_files(cgrp, base_files, true);
        if (ret)
                goto out_destroy;
  
        /* let's create and online css's */
        for_each_subsys(ss, ssid) {
                if (parent->child_subsys_mask & (1 << ssid)) {
-                       ret = create_css(cgrp, ss);
+                       ret = create_css(cgrp, ss,
+                                        parent->subtree_control & (1 << ssid));
                        if (ret)
                                goto out_destroy;
                }
  
        /*
         * On the default hierarchy, a child doesn't automatically inherit
-        * child_subsys_mask from the parent.  Each is configured manually.
+        * subtree_control from the parent.  Each is configured manually.
         */
-       if (!cgroup_on_dfl(cgrp))
-               cgrp->child_subsys_mask = parent->child_subsys_mask;
+       if (!cgroup_on_dfl(cgrp)) {
+               cgrp->subtree_control = parent->subtree_control;
+               cgroup_refresh_child_subsys_mask(cgrp);
+       }
  
        kernfs_activate(kn);
  
  out_free_id:
        cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
  out_cancel_ref:
 -      percpu_ref_cancel_init(&cgrp->self.refcnt);
 +      percpu_ref_exit(&cgrp->self.refcnt);
  out_free_cgrp:
        kfree(cgrp);
  out_unlock:
@@@ -4738,8 -4856,7 +4900,7 @@@ static void __init cgroup_init_subsys(s
   */
  int __init cgroup_init_early(void)
  {
-       static struct cgroup_sb_opts __initdata opts =
-               { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+       static struct cgroup_sb_opts __initdata opts;
        struct cgroup_subsys *ss;
        int i;
  
@@@ -4777,7 -4894,8 +4938,8 @@@ int __init cgroup_init(void
        unsigned long key;
        int ssid, err;
  
-       BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+       BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+       BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
  
        mutex_lock(&cgroup_mutex);
  
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
-               if (!ss->disabled) {
-                       cgrp_dfl_root.subsys_mask |= 1 << ss->id;
-                       WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+               if (ss->disabled)
+                       continue;
+               cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+               if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
+                       ss->dfl_cftypes = ss->legacy_cftypes;
+               if (!ss->dfl_cftypes)
+                       cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+               if (ss->dfl_cftypes == ss->legacy_cftypes) {
+                       WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+               } else {
+                       WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+                       WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
                }
        }
  
@@@ -5207,6 -5338,14 +5382,14 @@@ static int __init cgroup_disable(char *
  }
  __setup("cgroup_disable=", cgroup_disable);
  
+ static int __init cgroup_set_legacy_files_on_dfl(char *str)
+ {
+       printk("cgroup: using legacy files on the default hierarchy\n");
+       cgroup_legacy_files_on_dfl = true;
+       return 0;
+ }
+ __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
  /**
   * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
   * @dentry: directory dentry of interest
@@@ -5401,6 -5540,6 +5584,6 @@@ static struct cftype debug_files[] =  
  struct cgroup_subsys debug_cgrp_subsys = {
        .css_alloc = debug_css_alloc,
        .css_free = debug_css_free,
-       .base_cftypes = debug_files,
+       .legacy_cftypes = debug_files,
  };
  #endif /* CONFIG_CGROUP_DEBUG */
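
The cpuset changes below revolve around one rule for the new
effective_cpus/effective_mems fields: the effective mask is the
user-configured mask restricted by the parent's effective mask, and it falls
back to the parent's effective mask when that intersection is empty.  A
minimal sketch of that rule (the helper name is made up; the real propagation
happens in update_cpumasks_hier() and update_nodemasks_hier() below):

	/* illustrative helper only -- the real logic lives in update_cpumasks_hier() */
	static void compute_effective_cpus(struct cpuset *cs, struct cpumask *new_cpus)
	{
		struct cpuset *parent = parent_cs(cs);

		/* effective = user-configured mask & parent's effective mask ... */
		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);

		/* ... and if that ends up empty, inherit the parent's effective mask */
		if (cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);
	}

On legacy hierarchies the WARN_ON()s in the hunks below check that this still
collapses to effective == configured.
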
diff --combined kernel/cpuset.c
@@@ -76,8 -76,34 +76,34 @@@ struct cpuset 
        struct cgroup_subsys_state css;
  
        unsigned long flags;            /* "unsigned long" so bitops work */
-       cpumask_var_t cpus_allowed;     /* CPUs allowed to tasks in cpuset */
-       nodemask_t mems_allowed;        /* Memory Nodes allowed to tasks */
+       /*
+        * On default hierarchy:
+        *
+        * The user-configured masks can only be changed by writing to
+        * cpuset.cpus and cpuset.mems, and won't be limited by the
+        * parent masks.
+        *
+        * The effective masks are the real masks that apply to the tasks
+        * in the cpuset. They may be changed if the configured masks are
+        * changed or hotplug happens.
+        *
+        * effective_mask == configured_mask & parent's effective_mask,
+        * and if it ends up empty, it will inherit the parent's mask.
+        *
+        *
+        * On legacy hierarchy:
+        *
+        * The user-configured masks are always the same as the effective masks.
+        */
+       /* user-configured CPUs and Memory Nodes allowed to tasks */
+       cpumask_var_t cpus_allowed;
+       nodemask_t mems_allowed;
+       /* effective CPUs and Memory Nodes allowed to tasks */
+       cpumask_var_t effective_cpus;
+       nodemask_t effective_mems;
  
        /*
         * This is old Memory Nodes tasks took on.
@@@ -307,9 -333,9 +333,9 @@@ static struct file_system_type cpuset_f
   */
  static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  {
-       while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+       while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
                cs = parent_cs(cs);
-       cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+       cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
  }
  
  /*
   */
  static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
  {
-       while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+       while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
                cs = parent_cs(cs);
-       nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+       nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
  }
  
  /*
@@@ -376,13 -402,20 +402,20 @@@ static struct cpuset *alloc_trial_cpuse
        if (!trial)
                return NULL;
  
-       if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
-               kfree(trial);
-               return NULL;
-       }
-       cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+       if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+               goto free_cs;
+       if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+               goto free_cpus;
  
+       cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+       cpumask_copy(trial->effective_cpus, cs->effective_cpus);
        return trial;
+ free_cpus:
+       free_cpumask_var(trial->cpus_allowed);
+ free_cs:
+       kfree(trial);
+       return NULL;
  }
  
  /**
   */
  static void free_trial_cpuset(struct cpuset *trial)
  {
+       free_cpumask_var(trial->effective_cpus);
        free_cpumask_var(trial->cpus_allowed);
        kfree(trial);
  }
@@@ -436,9 -470,9 +470,9 @@@ static int validate_change(struct cpuse
  
        par = parent_cs(cur);
  
-       /* We must be a subset of our parent cpuset */
+       /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
-       if (!is_cpuset_subset(trial, par))
+       if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
                goto out;
  
        /*
@@@ -480,11 -514,11 +514,11 @@@ out
  #ifdef CONFIG_SMP
  /*
   * Helper routine for generate_sched_domains().
-  * Do cpusets a, b have overlapping cpus_allowed masks?
+  * Do cpusets a, b have overlapping effective cpus_allowed masks?
   */
  static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
  {
-       return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+       return cpumask_intersects(a->effective_cpus, b->effective_cpus);
  }
  
  static void
@@@ -601,7 -635,7 +635,7 @@@ static int generate_sched_domains(cpuma
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
-               cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+               cpumask_copy(doms[0], top_cpuset.effective_cpus);
  
                goto done;
        }
@@@ -705,7 -739,7 +739,7 @@@ restart
                        struct cpuset *b = csa[j];
  
                        if (apn == b->pn) {
-                               cpumask_or(dp, dp, b->cpus_allowed);
+                               cpumask_or(dp, dp, b->effective_cpus);
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, b);
  
@@@ -757,7 -791,7 +791,7 @@@ static void rebuild_sched_domains_locke
         * passing doms with offlined cpu to partition_sched_domains().
         * Anyways, hotplug work item will rebuild sched domains.
         */
-       if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+       if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                goto out;
  
        /* Generate domain masks and attrs */
@@@ -781,45 -815,6 +815,6 @@@ void rebuild_sched_domains(void
        mutex_unlock(&cpuset_mutex);
  }
  
- /*
-  * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
-  * @cs: the cpuset in interest
-  *
-  * A cpuset's effective cpumask is the cpumask of the nearest ancestor
-  * with non-empty cpus. We use effective cpumask whenever:
-  * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
-  *   if the cpuset they reside in has no cpus)
-  * - we want to retrieve task_cs(tsk)'s cpus_allowed.
-  *
-  * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
-  * exception. See comments there.
-  */
- static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
- {
-       while (cpumask_empty(cs->cpus_allowed))
-               cs = parent_cs(cs);
-       return cs;
- }
- /*
-  * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
-  * @cs: the cpuset in interest
-  *
-  * A cpuset's effective nodemask is the nodemask of the nearest ancestor
-  * with non-empty memss. We use effective nodemask whenever:
-  * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
-  *   if the cpuset they reside in has no mems)
-  * - we want to retrieve task_cs(tsk)'s mems_allowed.
-  *
-  * Called with cpuset_mutex held.
-  */
- static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
- {
-       while (nodes_empty(cs->mems_allowed))
-               cs = parent_cs(cs);
-       return cs;
- }
  /**
   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
   */
  static void update_tasks_cpumask(struct cpuset *cs)
  {
-       struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
        struct css_task_iter it;
        struct task_struct *task;
  
        css_task_iter_start(&cs->css, &it);
        while ((task = css_task_iter_next(&it)))
-               set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+               set_cpus_allowed_ptr(task, cs->effective_cpus);
        css_task_iter_end(&it);
  }
  
  /*
-  * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
-  * @root_cs: the root cpuset of the hierarchy
-  * @update_root: update root cpuset or not?
+  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+  * @cs: the cpuset to consider
+  * @new_cpus: temp variable for calculating new effective_cpus
+  *
+  * When configured cpumask is changed, the effective cpumasks of this cpuset
+  * and all its descendants need to be updated.
   *
-  * This will update cpumasks of tasks in @root_cs and all other empty cpusets
-  * which take on cpumask of @root_cs.
+  * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
   *
   * Called with cpuset_mutex held
   */
- static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
  {
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
+       bool need_rebuild_sched_domains = false;
  
        rcu_read_lock();
-       cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-               if (cp == root_cs) {
-                       if (!update_root)
-                               continue;
-               } else {
-                       /* skip the whole subtree if @cp have some CPU */
-                       if (!cpumask_empty(cp->cpus_allowed)) {
-                               pos_css = css_rightmost_descendant(pos_css);
-                               continue;
-                       }
+       cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+               struct cpuset *parent = parent_cs(cp);
+               cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+               /*
+                * If it becomes empty, inherit the effective mask of the
+                * parent, which is guaranteed to have some CPUs.
+                */
+               if (cpumask_empty(new_cpus))
+                       cpumask_copy(new_cpus, parent->effective_cpus);
+               /* Skip the whole subtree if the cpumask remains the same. */
+               if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+                       pos_css = css_rightmost_descendant(pos_css);
+                       continue;
                }
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();
  
+               mutex_lock(&callback_mutex);
+               cpumask_copy(cp->effective_cpus, new_cpus);
+               mutex_unlock(&callback_mutex);
+               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+                       !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
                update_tasks_cpumask(cp);
  
+               /*
+                * If the effective cpumask of any non-empty cpuset is changed,
+                * we need to rebuild sched domains.
+                */
+               if (!cpumask_empty(cp->cpus_allowed) &&
+                   is_sched_load_balance(cp))
+                       need_rebuild_sched_domains = true;
                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();
+       if (need_rebuild_sched_domains)
+               rebuild_sched_domains_locked();
  }
  
  /**
@@@ -889,7 -911,6 +911,6 @@@ static int update_cpumask(struct cpuse
                          const char *buf)
  {
        int retval;
-       int is_load_balanced;
  
        /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
        if (cs == &top_cpuset)
                if (retval < 0)
                        return retval;
  
-               if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+               if (!cpumask_subset(trialcs->cpus_allowed,
+                                   top_cpuset.cpus_allowed))
                        return -EINVAL;
        }
  
        if (retval < 0)
                return retval;
  
-       is_load_balanced = is_sched_load_balance(trialcs);
        mutex_lock(&callback_mutex);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
        mutex_unlock(&callback_mutex);
  
-       update_tasks_cpumask_hier(cs, true);
-       if (is_load_balanced)
-               rebuild_sched_domains_locked();
+       /* use trialcs->cpus_allowed as a temp variable */
+       update_cpumasks_hier(cs, trialcs->cpus_allowed);
        return 0;
  }
  
@@@ -951,15 -969,13 +969,13 @@@ static void cpuset_migrate_mm(struct mm
                                                        const nodemask_t *to)
  {
        struct task_struct *tsk = current;
-       struct cpuset *mems_cs;
  
        tsk->mems_allowed = *to;
  
        do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
  
        rcu_read_lock();
-       mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-       guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+       guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
        rcu_read_unlock();
  }
  
@@@ -1028,13 -1044,12 +1044,12 @@@ static void *cpuset_being_rebound
  static void update_tasks_nodemask(struct cpuset *cs)
  {
        static nodemask_t newmems;      /* protected by cpuset_mutex */
-       struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
        struct css_task_iter it;
        struct task_struct *task;
  
        cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
  
-       guarantee_online_mems(mems_cs, &newmems);
+       guarantee_online_mems(cs, &newmems);
  
        /*
         * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
  }
  
  /*
-  * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
-  * @cs: the root cpuset of the hierarchy
-  * @update_root: update the root cpuset or not?
+  * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+  * @cs: the cpuset to consider
+  * @new_mems: a temp variable for calculating new effective_mems
   *
-  * This will update nodemasks of tasks in @root_cs and all other empty cpusets
-  * which take on nodemask of @root_cs.
+  * When the configured nodemask is changed, the effective nodemasks of this
+  * cpuset and all its descendants need to be updated.
+  *
+  * On the legacy hierarchy, effective_mems will be the same as mems_allowed.
   *
   * Called with cpuset_mutex held
   */
- static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
  {
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
  
        rcu_read_lock();
-       cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-               if (cp == root_cs) {
-                       if (!update_root)
-                               continue;
-               } else {
-                       /* skip the whole subtree if @cp have some CPU */
-                       if (!nodes_empty(cp->mems_allowed)) {
-                               pos_css = css_rightmost_descendant(pos_css);
-                               continue;
-                       }
+       cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+               struct cpuset *parent = parent_cs(cp);
+               nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+               /*
+                * If it becomes empty, inherit the effective mask of the
+                * parent, which is guaranteed to have some MEMs.
+                */
+               if (nodes_empty(*new_mems))
+                       *new_mems = parent->effective_mems;
+               /* Skip the whole subtree if the nodemask remains the same. */
+               if (nodes_equal(*new_mems, cp->effective_mems)) {
+                       pos_css = css_rightmost_descendant(pos_css);
+                       continue;
                }
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();
  
+               mutex_lock(&callback_mutex);
+               cp->effective_mems = *new_mems;
+               mutex_unlock(&callback_mutex);
+               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+                       !nodes_equal(cp->mems_allowed, cp->effective_mems));
                update_tasks_nodemask(cp);
  
                rcu_read_lock();
@@@ -1156,8 -1187,8 +1187,8 @@@ static int update_nodemask(struct cpuse
                        goto done;
  
                if (!nodes_subset(trialcs->mems_allowed,
-                               node_states[N_MEMORY])) {
-                       retval =  -EINVAL;
+                                 top_cpuset.mems_allowed)) {
+                       retval = -EINVAL;
                        goto done;
                }
        }
        cs->mems_allowed = trialcs->mems_allowed;
        mutex_unlock(&callback_mutex);
  
-       update_tasks_nodemask_hier(cs, true);
+       /* use trialcs->mems_allowed as a temp variable */
+       update_nodemasks_hier(cs, &trialcs->mems_allowed);
  done:
        return retval;
  }
  
  int current_cpuset_is_being_rebound(void)
  {
 -      return task_cs(current) == cpuset_being_rebound;
 +      int ret;
 +
 +      rcu_read_lock();
 +      ret = task_cs(current) == cpuset_being_rebound;
 +      rcu_read_unlock();
 +
 +      return ret;
  }
  
  static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@@ -1389,12 -1415,9 +1421,9 @@@ static int cpuset_can_attach(struct cgr
  
        mutex_lock(&cpuset_mutex);
  
-       /*
-        * We allow to move tasks into an empty cpuset if sane_behavior
-        * flag is set.
-        */
+       /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
-       if (!cgroup_sane_behavior(css->cgroup) &&
+       if (!cgroup_on_dfl(css->cgroup) &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;
  
@@@ -1452,8 -1475,6 +1481,6 @@@ static void cpuset_attach(struct cgroup
        struct task_struct *leader = cgroup_taskset_first(tset);
        struct cpuset *cs = css_cs(css);
        struct cpuset *oldcs = cpuset_attach_old_cs;
-       struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
-       struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
  
        mutex_lock(&cpuset_mutex);
  
        if (cs == &top_cpuset)
                cpumask_copy(cpus_attach, cpu_possible_mask);
        else
-               guarantee_online_cpus(cpus_cs, cpus_attach);
+               guarantee_online_cpus(cs, cpus_attach);
  
-       guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+       guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
  
        cgroup_taskset_for_each(task, tset) {
                /*
         * Change mm, possibly for multiple threads in a threadgroup. This is
         * expensive and may sleep.
         */
-       cpuset_attach_nodemask_to = cs->mems_allowed;
+       cpuset_attach_nodemask_to = cs->effective_mems;
        mm = get_task_mm(leader);
        if (mm) {
-               struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
                mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
  
                /*
                 * mm from.
                 */
                if (is_memory_migrate(cs)) {
-                       cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+                       cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                          &cpuset_attach_nodemask_to);
                }
                mmput(mm);
@@@ -1516,6 -1535,8 +1541,8 @@@ typedef enum 
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
+       FILE_EFFECTIVE_CPULIST,
+       FILE_EFFECTIVE_MEMLIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
@@@ -1623,17 -1644,7 +1650,17 @@@ static ssize_t cpuset_write_resmask(str
         * resources, wait for the previously scheduled operations before
         * proceeding, so that we don't keep removing tasks added
         * after execution capability is restored.
 +       *
 +       * cpuset_hotplug_work calls back into cgroup core via
 +       * cgroup_transfer_tasks() and waiting for it from a cgroupfs
 +       * operation like this one can lead to a deadlock through kernfs
 +       * active_ref protection.  Let's break the protection.  Losing the
 +       * protection is okay as we check whether @cs is online after
 +       * grabbing cpuset_mutex anyway.  This only happens on the legacy
 +       * hierarchies.
         */
 +      css_get(&cs->css);
 +      kernfs_break_active_protection(of->kn);
        flush_work(&cpuset_hotplug_work);
  
        mutex_lock(&cpuset_mutex);
        free_trial_cpuset(trialcs);
  out_unlock:
        mutex_unlock(&cpuset_mutex);
 +      kernfs_unbreak_active_protection(of->kn);
 +      css_put(&cs->css);
        return retval ?: nbytes;
  }
  
@@@ -1694,6 -1703,12 +1721,12 @@@ static int cpuset_common_seq_show(struc
        case FILE_MEMLIST:
                s += nodelist_scnprintf(s, count, cs->mems_allowed);
                break;
+       case FILE_EFFECTIVE_CPULIST:
+               s += cpulist_scnprintf(s, count, cs->effective_cpus);
+               break;
+       case FILE_EFFECTIVE_MEMLIST:
+               s += nodelist_scnprintf(s, count, cs->effective_mems);
+               break;
        default:
                ret = -EINVAL;
                goto out_unlock;
@@@ -1778,6 -1793,18 +1811,18 @@@ static struct cftype files[] = 
                .private = FILE_MEMLIST,
        },
  
+       {
+               .name = "effective_cpus",
+               .seq_show = cpuset_common_seq_show,
+               .private = FILE_EFFECTIVE_CPULIST,
+       },
+       {
+               .name = "effective_mems",
+               .seq_show = cpuset_common_seq_show,
+               .private = FILE_EFFECTIVE_MEMLIST,
+       },
        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
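
The two new read-only entries simply expose the computed masks to userspace. A minimal sketch of reading them, assuming a legacy cpuset mount at /sys/fs/cgroup/cpuset and a child group named "mygroup" (both paths are assumptions for illustration, not something defined by this patch):

#include <stdio.h>

/*
 * Print one cpuset file.  The mount point and group name used in main()
 * are assumptions for illustration; adjust them to the local setup.
 */
static void print_file(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	print_file("/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_cpus");
	print_file("/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_mems");
	return 0;
}
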
@@@ -1869,18 -1896,26 +1914,26 @@@ cpuset_css_alloc(struct cgroup_subsys_s
        cs = kzalloc(sizeof(*cs), GFP_KERNEL);
        if (!cs)
                return ERR_PTR(-ENOMEM);
-       if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
-               kfree(cs);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+               goto free_cs;
+       if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+               goto free_cpus;
  
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        cpumask_clear(cs->cpus_allowed);
        nodes_clear(cs->mems_allowed);
+       cpumask_clear(cs->effective_cpus);
+       nodes_clear(cs->effective_mems);
        fmeter_init(&cs->fmeter);
        cs->relax_domain_level = -1;
  
        return &cs->css;
+ free_cpus:
+       free_cpumask_var(cs->cpus_allowed);
+ free_cs:
+       kfree(cs);
+       return ERR_PTR(-ENOMEM);
  }
  
  static int cpuset_css_online(struct cgroup_subsys_state *css)
  
        cpuset_inc();
  
+       mutex_lock(&callback_mutex);
+       if (cgroup_on_dfl(cs->css.cgroup)) {
+               cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+               cs->effective_mems = parent->effective_mems;
+       }
+       mutex_unlock(&callback_mutex);
        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                goto out_unlock;
  
@@@ -1962,20 -2004,40 +2022,40 @@@ static void cpuset_css_free(struct cgro
  {
        struct cpuset *cs = css_cs(css);
  
+       free_cpumask_var(cs->effective_cpus);
        free_cpumask_var(cs->cpus_allowed);
        kfree(cs);
  }
  
+ static void cpuset_bind(struct cgroup_subsys_state *root_css)
+ {
+       mutex_lock(&cpuset_mutex);
+       mutex_lock(&callback_mutex);
+       if (cgroup_on_dfl(root_css->cgroup)) {
+               cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+               top_cpuset.mems_allowed = node_possible_map;
+       } else {
+               cpumask_copy(top_cpuset.cpus_allowed,
+                            top_cpuset.effective_cpus);
+               top_cpuset.mems_allowed = top_cpuset.effective_mems;
+       }
+       mutex_unlock(&callback_mutex);
+       mutex_unlock(&cpuset_mutex);
+ }
+
  struct cgroup_subsys cpuset_cgrp_subsys = {
-       .css_alloc = cpuset_css_alloc,
-       .css_online = cpuset_css_online,
-       .css_offline = cpuset_css_offline,
-       .css_free = cpuset_css_free,
-       .can_attach = cpuset_can_attach,
-       .cancel_attach = cpuset_cancel_attach,
-       .attach = cpuset_attach,
-       .base_cftypes = files,
-       .early_init = 1,
+       .css_alloc      = cpuset_css_alloc,
+       .css_online     = cpuset_css_online,
+       .css_offline    = cpuset_css_offline,
+       .css_free       = cpuset_css_free,
+       .can_attach     = cpuset_can_attach,
+       .cancel_attach  = cpuset_cancel_attach,
+       .attach         = cpuset_attach,
+       .bind           = cpuset_bind,
+       .legacy_cftypes = files,
+       .early_init     = 1,
  };
  
  /**
@@@ -1990,9 -2052,13 +2070,13 @@@ int __init cpuset_init(void
  
        if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
                BUG();
+       if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+               BUG();
  
        cpumask_setall(top_cpuset.cpus_allowed);
        nodes_setall(top_cpuset.mems_allowed);
+       cpumask_setall(top_cpuset.effective_cpus);
+       nodes_setall(top_cpuset.effective_mems);
  
        fmeter_init(&top_cpuset.fmeter);
        set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@@ -2035,6 -2101,66 +2119,66 @@@ static void remove_tasks_in_empty_cpuse
        }
  }
  
+ static void
+ hotplug_update_tasks_legacy(struct cpuset *cs,
+                           struct cpumask *new_cpus, nodemask_t *new_mems,
+                           bool cpus_updated, bool mems_updated)
+ {
+       bool is_empty;
+       mutex_lock(&callback_mutex);
+       cpumask_copy(cs->cpus_allowed, new_cpus);
+       cpumask_copy(cs->effective_cpus, new_cpus);
+       cs->mems_allowed = *new_mems;
+       cs->effective_mems = *new_mems;
+       mutex_unlock(&callback_mutex);
+       /*
+        * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+        * as the tasks will be migrated to an ancestor.
+        */
+       if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+               update_tasks_cpumask(cs);
+       if (mems_updated && !nodes_empty(cs->mems_allowed))
+               update_tasks_nodemask(cs);
+       is_empty = cpumask_empty(cs->cpus_allowed) ||
+                  nodes_empty(cs->mems_allowed);
+       mutex_unlock(&cpuset_mutex);
+       /*
+        * Move tasks to the nearest ancestor with execution resources.
+        * This is a full cgroup operation which will also call back into
+        * cpuset, so it should be done outside any lock.
+        */
+       if (is_empty)
+               remove_tasks_in_empty_cpuset(cs);
+       mutex_lock(&cpuset_mutex);
+ }
+
+ static void
+ hotplug_update_tasks(struct cpuset *cs,
+                    struct cpumask *new_cpus, nodemask_t *new_mems,
+                    bool cpus_updated, bool mems_updated)
+ {
+       if (cpumask_empty(new_cpus))
+               cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+       if (nodes_empty(*new_mems))
+               *new_mems = parent_cs(cs)->effective_mems;
+       mutex_lock(&callback_mutex);
+       cpumask_copy(cs->effective_cpus, new_cpus);
+       cs->effective_mems = *new_mems;
+       mutex_unlock(&callback_mutex);
+       if (cpus_updated)
+               update_tasks_cpumask(cs);
+       if (mems_updated)
+               update_tasks_nodemask(cs);
+ }
+
  /**
   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
   * @cs: cpuset in interest
   */
  static void cpuset_hotplug_update_tasks(struct cpuset *cs)
  {
-       static cpumask_t off_cpus;
-       static nodemask_t off_mems;
-       bool is_empty;
-       bool sane = cgroup_sane_behavior(cs->css.cgroup);
+       static cpumask_t new_cpus;
+       static nodemask_t new_mems;
+       bool cpus_updated;
+       bool mems_updated;
  retry:
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
  
                goto retry;
        }
  
-       cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
-       nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-       mutex_lock(&callback_mutex);
-       cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-       mutex_unlock(&callback_mutex);
-       /*
-        * If sane_behavior flag is set, we need to update tasks' cpumask
-        * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
-        * call update_tasks_cpumask() if the cpuset becomes empty, as
-        * the tasks in it will be migrated to an ancestor.
-        */
-       if ((sane && cpumask_empty(cs->cpus_allowed)) ||
-           (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
-               update_tasks_cpumask(cs);
+       cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+       nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
  
-       mutex_lock(&callback_mutex);
-       nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-       mutex_unlock(&callback_mutex);
-       /*
-        * If sane_behavior flag is set, we need to update tasks' nodemask
-        * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
-        * call update_tasks_nodemask() if the cpuset becomes empty, as
-        * the tasks in it will be migratd to an ancestor.
-        */
-       if ((sane && nodes_empty(cs->mems_allowed)) ||
-           (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
-               update_tasks_nodemask(cs);
+       cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+       mems_updated = !nodes_equal(new_mems, cs->effective_mems);
  
-       is_empty = cpumask_empty(cs->cpus_allowed) ||
-               nodes_empty(cs->mems_allowed);
+       if (cgroup_on_dfl(cs->css.cgroup))
+               hotplug_update_tasks(cs, &new_cpus, &new_mems,
+                                    cpus_updated, mems_updated);
+       else
+               hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+                                           cpus_updated, mems_updated);
  
        mutex_unlock(&cpuset_mutex);
-       /*
-        * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
-        *
-        * Otherwise move tasks to the nearest ancestor with execution
-        * resources.  This is full cgroup operation which will
-        * also call back into cpuset.  Should be done outside any lock.
-        */
-       if (!sane && is_empty)
-               remove_tasks_in_empty_cpuset(cs);
  }
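
The refactored hotplug path makes the difference between the two hierarchies explicit: on the default hierarchy only the effective masks are recomputed (falling back to the parent when everything goes offline) and tasks stay where they are, while on the legacy hierarchy the configured masks themselves are overwritten and tasks are moved out of a cpuset that becomes empty. A rough userspace model of the two rules, with invented names and 64-bit words standing in for cpumasks:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustration only; this is not a kernel structure. */
struct cs_model {
	uint64_t cpus_allowed;		/* user-configured mask */
	uint64_t effective_cpus;	/* what tasks actually run on */
};

/*
 * Rough model of cpuset_hotplug_update_tasks(): the default-hierarchy path
 * only recomputes effective_cpus and falls back to the parent, while the
 * legacy path also overwrites cpus_allowed, losing the configuration.
 */
static void hotplug_model(struct cs_model *cs, uint64_t parent_effective,
			  bool on_dfl)
{
	uint64_t new_cpus = cs->cpus_allowed & parent_effective;

	if (on_dfl) {
		cs->effective_cpus = new_cpus ? new_cpus : parent_effective;
	} else {
		cs->cpus_allowed = new_cpus;
		cs->effective_cpus = new_cpus;
		/* if new_cpus is empty, tasks would move to an ancestor */
	}
}

int main(void)
{
	struct cs_model legacy = { .cpus_allowed = 0x30, .effective_cpus = 0x30 };
	struct cs_model dfl = legacy;

	/* CPUs 4-5 go offline; the parent's effective mask is CPUs 0-3. */
	hotplug_model(&legacy, 0x0f, false);
	hotplug_model(&dfl, 0x0f, true);

	printf("legacy: allowed=%#llx effective=%#llx\n",
	       (unsigned long long)legacy.cpus_allowed,
	       (unsigned long long)legacy.effective_cpus);
	printf("dfl:    allowed=%#llx effective=%#llx\n",
	       (unsigned long long)dfl.cpus_allowed,
	       (unsigned long long)dfl.effective_cpus);
	return 0;
}
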
  
  /**
@@@ -2132,6 -2226,7 +2244,7 @@@ static void cpuset_hotplug_workfn(struc
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
+       bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
  
        mutex_lock(&cpuset_mutex);
  
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];
  
-       cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-       mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+       cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+       mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
  
        /* synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
                mutex_lock(&callback_mutex);
-               cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+               if (!on_dfl)
+                       cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+               cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                mutex_unlock(&callback_mutex);
                /* we don't mess with cpumasks of tasks in top_cpuset */
        }
        /* synchronize mems_allowed to N_MEMORY */
        if (mems_updated) {
                mutex_lock(&callback_mutex);
-               top_cpuset.mems_allowed = new_mems;
+               if (!on_dfl)
+                       top_cpuset.mems_allowed = new_mems;
+               top_cpuset.effective_mems = new_mems;
                mutex_unlock(&callback_mutex);
                update_tasks_nodemask(&top_cpuset);
        }
@@@ -2228,6 -2327,9 +2345,9 @@@ void __init cpuset_init_smp(void
        top_cpuset.mems_allowed = node_states[N_MEMORY];
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
  
+       cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+       top_cpuset.effective_mems = node_states[N_MEMORY];
        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
  }
  
  
  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
  {
-       struct cpuset *cpus_cs;
        mutex_lock(&callback_mutex);
        rcu_read_lock();
-       cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
-       guarantee_online_cpus(cpus_cs, pmask);
+       guarantee_online_cpus(task_cs(tsk), pmask);
        rcu_read_unlock();
        mutex_unlock(&callback_mutex);
  }
  
  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
  {
-       struct cpuset *cpus_cs;
        rcu_read_lock();
-       cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
-       do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+       do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
        rcu_read_unlock();
  
        /*
@@@ -2299,13 -2395,11 +2413,11 @@@ void cpuset_init_current_mems_allowed(v
  
  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
  {
-       struct cpuset *mems_cs;
        nodemask_t mask;
  
        mutex_lock(&callback_mutex);
        rcu_read_lock();
-       mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-       guarantee_online_mems(mems_cs, &mask);
+       guarantee_online_mems(task_cs(tsk), &mask);
        rcu_read_unlock();
        mutex_unlock(&callback_mutex);
  
diff --combined kernel/sched/core.c
@@@ -4147,6 -4147,7 +4147,6 @@@ static void __cond_resched(void
  
  int __sched _cond_resched(void)
  {
 -      rcu_cond_resched();
        if (should_resched()) {
                __cond_resched();
                return 1;
@@@ -4165,15 -4166,18 +4165,15 @@@ EXPORT_SYMBOL(_cond_resched)
   */
  int __cond_resched_lock(spinlock_t *lock)
  {
 -      bool need_rcu_resched = rcu_should_resched();
        int resched = should_resched();
        int ret = 0;
  
        lockdep_assert_held(lock);
  
 -      if (spin_needbreak(lock) || resched || need_rcu_resched) {
 +      if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
                if (resched)
                        __cond_resched();
 -              else if (unlikely(need_rcu_resched))
 -                      rcu_resched();
                else
                        cpu_relax();
                ret = 1;
@@@ -4187,6 -4191,7 +4187,6 @@@ int __sched __cond_resched_softirq(void
  {
        BUG_ON(!in_softirq());
  
 -      rcu_cond_resched();  /* BH disabled OK, just recording QSes. */
        if (should_resched()) {
                local_bh_enable();
                __cond_resched();
@@@ -8083,7 -8088,7 +8083,7 @@@ struct cgroup_subsys cpu_cgrp_subsys = 
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
-       .base_cftypes   = cpu_files,
+       .legacy_cftypes = cpu_files,
        .early_init     = 1,
  };
  
diff --combined mm/memcontrol.c
@@@ -5415,12 -5415,8 +5415,12 @@@ static int mem_cgroup_oom_notify_cb(str
  {
        struct mem_cgroup_eventfd_list *ev;
  
 +      spin_lock(&memcg_oom_lock);
 +
        list_for_each_entry(ev, &memcg->oom_notify, list)
                eventfd_signal(ev->eventfd, 1);
 +
 +      spin_unlock(&memcg_oom_lock);
        return 0;
  }
  
@@@ -6007,7 -6003,6 +6007,6 @@@ static struct cftype mem_cgroup_files[
        },
        {
                .name = "use_hierarchy",
-               .flags = CFTYPE_INSANE,
                .write_u64 = mem_cgroup_hierarchy_write,
                .read_u64 = mem_cgroup_hierarchy_read,
        },
@@@ -6411,6 -6406,29 +6410,29 @@@ static void mem_cgroup_css_free(struct 
        __mem_cgroup_free(memcg);
  }
  
+ /**
+  * mem_cgroup_css_reset - reset the states of a mem_cgroup
+  * @css: the target css
+  *
+  * Reset the states of the mem_cgroup associated with @css.  This is
+  * Reset the states of the mem_cgroup associated with @css.  This is
+  * invoked when userland requests disabling the controller on the default
+  * hierarchy but the memcg is still pinned through a dependency.  The
+  * memcg should stop applying policies and revert to the vanilla state,
+  * as it may be made visible again.
+  * The current implementation only resets the essential configurations.
+  * This needs to be expanded to cover all the visible parts.
+  */
+ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       mem_cgroup_resize_limit(memcg, ULLONG_MAX);
+       mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
+       memcg_update_kmem_limit(memcg, ULLONG_MAX);
+       res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
+ }
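
In userspace terms, the reset above amounts to lifting the memory limits back to "unlimited" so a disabled-but-pinned memcg no longer constrains anything. A hedged sketch of the equivalent manual action on the legacy memory hierarchy; the cgroup path is an assumption for illustration and is not introduced by this patch:

#include <stdio.h>

/*
 * Rough userspace analogue of mem_cgroup_css_reset(): lift the memory
 * limit back to "unlimited".  The cgroup path is an assumption; adjust
 * it to the local mount point and group name.
 */
int main(void)
{
	const char *path =
		"/sys/fs/cgroup/memory/mygroup/memory.limit_in_bytes";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Writing -1 removes the limit on the legacy memory controller. */
	fprintf(f, "-1\n");
	fclose(f);
	return 0;
}
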
  #ifdef CONFIG_MMU
  /* Handlers for move charge at task migration. */
  #define PRECHARGE_COUNT_AT_ONCE       256
@@@ -7005,16 -7023,17 +7027,17 @@@ static void mem_cgroup_move_task(struc
  
  /*
   * Cgroup retains root cgroups across [un]mount cycles making it necessary
-  * to verify sane_behavior flag on each mount attempt.
+  * to verify whether we're attached to the default hierarchy on each mount
+  * attempt.
   */
  static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
  {
        /*
-        * use_hierarchy is forced with sane_behavior.  cgroup core
+        * use_hierarchy is forced on the default hierarchy.  cgroup core
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
-       if (cgroup_sane_behavior(root_css->cgroup))
+       if (cgroup_on_dfl(root_css->cgroup))
                mem_cgroup_from_css(root_css)->use_hierarchy = true;
  }
  
@@@ -7023,11 -7042,12 +7046,12 @@@ struct cgroup_subsys memory_cgrp_subsy
        .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
        .css_free = mem_cgroup_css_free,
+       .css_reset = mem_cgroup_css_reset,
        .can_attach = mem_cgroup_can_attach,
        .cancel_attach = mem_cgroup_cancel_attach,
        .attach = mem_cgroup_move_task,
        .bind = mem_cgroup_bind,
-       .base_cftypes = mem_cgroup_files,
+       .legacy_cftypes = mem_cgroup_files,
        .early_init = 0,
  };
  
@@@ -7044,7 -7064,8 +7068,8 @@@ __setup("swapaccount=", enable_swap_acc
  
  static void __init memsw_file_init(void)
  {
-       WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
+       WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+                                         memsw_cgroup_files));
  }
  
  static void __init enable_swap_cgroup(void)