cpuset: PF_SPREAD_PAGE and PF_SPREAD_SLAB should be atomic flags
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a416..52cb04c 100644
@@ -76,8 +76,34 @@ struct cpuset {
        struct cgroup_subsys_state css;
 
        unsigned long flags;            /* "unsigned long" so bitops work */
-       cpumask_var_t cpus_allowed;     /* CPUs allowed to tasks in cpuset */
-       nodemask_t mems_allowed;        /* Memory Nodes allowed to tasks */
+
+       /*
+        * On default hierarchy:
+        *
+        * The user-configured masks can only be changed by writing to
+        * cpuset.cpus and cpuset.mems, and won't be limited by the
+        * parent masks.
+        *
+        * The effective masks are the real masks that apply to the tasks
+        * in the cpuset. They may be changed if the configured masks are
+        * changed or hotplug happens.
+        *
+        * effective_mask == configured_mask & parent's effective_mask,
+        * and if it ends up empty, it will inherit the parent's mask.
+        *
+        *
+        * On legacy hierarchy:
+        *
+        * The user-configured masks are always the same as the effective masks.
+        */
+
+       /* user-configured CPUs and Memory Nodes allowed to tasks */
+       cpumask_var_t cpus_allowed;
+       nodemask_t mems_allowed;
+
+       /* effective CPUs and Memory Nodes allowed to tasks */
+       cpumask_var_t effective_cpus;
+       nodemask_t effective_mems;
 
        /*
         * This is old Memory Nodes tasks took on.
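(Illustration, not part of the patch: the rule the comment above describes, effective = configured & parent's effective, with an empty result falling back to the parent's effective mask, can be modelled in ordinary user-space C. All names below, such as toy_cpuset and toy_update_effective, are made up for the example; a plain unsigned long stands in for cpumask_var_t.)

#include <stdio.h>

/* Toy model: one unsigned long stands in for a cpumask_var_t. */
struct toy_cpuset {
	unsigned long cpus_allowed;	/* user-configured mask */
	unsigned long effective_cpus;	/* mask actually applied to tasks */
};

/*
 * Recompute a cpuset's effective mask from its configured mask and its
 * parent's effective mask, as described in the comment above.
 */
static void toy_update_effective(struct toy_cpuset *cs,
				 const struct toy_cpuset *parent)
{
	unsigned long new_cpus = cs->cpus_allowed & parent->effective_cpus;

	/* Empty intersection: inherit the parent's effective mask. */
	cs->effective_cpus = new_cpus ? new_cpus : parent->effective_cpus;
}

int main(void)
{
	struct toy_cpuset root  = { .cpus_allowed = 0xf, .effective_cpus = 0xf };
	struct toy_cpuset child = { .cpus_allowed = 0x30 };	/* disjoint from root */

	toy_update_effective(&child, &root);
	printf("child effective: 0x%lx\n", child.effective_cpus);	/* prints 0xf */
	return 0;
}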
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
-       while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+       while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
                cs = parent_cs(cs);
-       cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+       cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 }
 
 /*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
-       while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+       while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
                cs = parent_cs(cs);
-       nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+       nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 }
 
 /*
@@ -339,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
                                        struct task_struct *tsk)
 {
        if (is_spread_page(cs))
-               tsk->flags |= PF_SPREAD_PAGE;
+               task_set_spread_page(tsk);
        else
-               tsk->flags &= ~PF_SPREAD_PAGE;
+               task_clear_spread_page(tsk);
+
        if (is_spread_slab(cs))
-               tsk->flags |= PF_SPREAD_SLAB;
+               task_set_spread_slab(tsk);
        else
-               tsk->flags &= ~PF_SPREAD_SLAB;
+               task_clear_spread_slab(tsk);
 }
 
 /*
@@ -376,13 +403,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
        if (!trial)
                return NULL;
 
-       if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
-               kfree(trial);
-               return NULL;
-       }
-       cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+       if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+               goto free_cs;
+       if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+               goto free_cpus;
 
+       cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+       cpumask_copy(trial->effective_cpus, cs->effective_cpus);
        return trial;
+
+free_cpus:
+       free_cpumask_var(trial->cpus_allowed);
+free_cs:
+       kfree(trial);
+       return NULL;
 }
 
 /**
@@ -391,6 +425,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
  */
 static void free_trial_cpuset(struct cpuset *trial)
 {
+       free_cpumask_var(trial->effective_cpus);
        free_cpumask_var(trial->cpus_allowed);
        kfree(trial);
 }
@@ -436,9 +471,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 
        par = parent_cs(cur);
 
-       /* We must be a subset of our parent cpuset */
+       /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
-       if (!is_cpuset_subset(trial, par))
+       if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
                goto out;
 
        /*
@@ -480,11 +515,11 @@ out:
 #ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
  */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
-       return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+       return cpumask_intersects(a->effective_cpus, b->effective_cpus);
 }
 
 static void
@@ -601,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
-               cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+               cpumask_copy(doms[0], top_cpuset.effective_cpus);
 
                goto done;
        }
@@ -705,7 +740,7 @@ restart:
                        struct cpuset *b = csa[j];
 
                        if (apn == b->pn) {
-                               cpumask_or(dp, dp, b->cpus_allowed);
+                               cpumask_or(dp, dp, b->effective_cpus);
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, b);
 
@@ -757,7 +792,7 @@ static void rebuild_sched_domains_locked(void)
         * passing doms with offlined cpu to partition_sched_domains().
         * Anyways, hotplug work item will rebuild sched domains.
         */
-       if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+       if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                goto out;
 
        /* Generate domain masks and attrs */
@@ -781,45 +816,6 @@ void rebuild_sched_domains(void)
        mutex_unlock(&cpuset_mutex);
 }
 
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- *   if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
-       while (cpumask_empty(cs->cpus_allowed))
-               cs = parent_cs(cs);
-       return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- *   if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
-       while (nodes_empty(cs->mems_allowed))
-               cs = parent_cs(cs);
-       return cs;
-}
-
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +826,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
  */
 static void update_tasks_cpumask(struct cpuset *cs)
 {
-       struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
        struct css_task_iter it;
        struct task_struct *task;
 
        css_task_iter_start(&cs->css, &it);
        while ((task = css_task_iter_next(&it)))
-               set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+               set_cpus_allowed_ptr(task, cs->effective_cpus);
        css_task_iter_end(&it);
 }
 
 /*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When the configured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
  *
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
  *
  * Called with cpuset_mutex held
  */
-static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
+       bool need_rebuild_sched_domains = false;
 
        rcu_read_lock();
-       cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-               if (cp == root_cs) {
-                       if (!update_root)
-                               continue;
-               } else {
-                       /* skip the whole subtree if @cp have some CPU */
-                       if (!cpumask_empty(cp->cpus_allowed)) {
-                               pos_css = css_rightmost_descendant(pos_css);
-                               continue;
-                       }
+       cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+               struct cpuset *parent = parent_cs(cp);
+
+               cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+               /*
+                * If it becomes empty, inherit the effective mask of the
+                * parent, which is guaranteed to have some CPUs.
+                */
+               if (cpumask_empty(new_cpus))
+                       cpumask_copy(new_cpus, parent->effective_cpus);
+
+               /* Skip the whole subtree if the cpumask remains the same. */
+               if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+                       pos_css = css_rightmost_descendant(pos_css);
+                       continue;
                }
+
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();
 
+               mutex_lock(&callback_mutex);
+               cpumask_copy(cp->effective_cpus, new_cpus);
+               mutex_unlock(&callback_mutex);
+
+               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+                       !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
                update_tasks_cpumask(cp);
 
+               /*
+                * If the effective cpumask of any non-empty cpuset is changed,
+                * we need to rebuild sched domains.
+                */
+               if (!cpumask_empty(cp->cpus_allowed) &&
+                   is_sched_load_balance(cp))
+                       need_rebuild_sched_domains = true;
+
                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();
+
+       if (need_rebuild_sched_domains)
+               rebuild_sched_domains_locked();
 }
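(Illustration only: a minimal user-space sketch of the top-down propagation performed by update_cpumasks_hier(), with hypothetical names and none of the kernel's RCU, locking, css refcounting or subtree pruning. Children are laid out after their parent, standing in for the pre-order walk, and the return value plays the role of need_rebuild_sched_domains. update_nodemasks_hier() below follows the same pattern for memory nodes.)

#include <stdio.h>

/* Toy cpuset tree: each entry records its parent's index, -1 for the root. */
struct toy_cpuset {
	int parent;
	unsigned long cpus_allowed;	/* user-configured mask */
	unsigned long effective_cpus;	/* mask actually applied to tasks */
};

/*
 * Recompute effective masks top-down. Entries are assumed to appear after
 * their parent (a pre-order layout). Returns 1 if any effective mask
 * changed, i.e. when the sched domains would need rebuilding.
 */
static int toy_update_hier(struct toy_cpuset *cs, int n)
{
	int changed = 0;

	for (int i = 1; i < n; i++) {
		unsigned long new_cpus =
			cs[i].cpus_allowed & cs[cs[i].parent].effective_cpus;

		if (!new_cpus)	/* empty: inherit the parent's effective mask */
			new_cpus = cs[cs[i].parent].effective_cpus;
		if (new_cpus != cs[i].effective_cpus) {
			cs[i].effective_cpus = new_cpus;
			changed = 1;
		}
	}
	return changed;
}

int main(void)
{
	struct toy_cpuset cs[3] = {
		{ .parent = -1, .cpus_allowed = 0xff, .effective_cpus = 0xff },
		{ .parent = 0,  .cpus_allowed = 0x0f },
		{ .parent = 1,  .cpus_allowed = 0x03 },
	};

	if (toy_update_hier(cs, 3))
		printf("new effective masks: 0x%lx 0x%lx\n",
		       cs[1].effective_cpus, cs[2].effective_cpus);
	return 0;
}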
 
 /**
@@ -889,7 +912,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                          const char *buf)
 {
        int retval;
-       int is_load_balanced;
 
        /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
        if (cs == &top_cpuset)
@@ -908,7 +930,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                if (retval < 0)
                        return retval;
 
-               if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+               if (!cpumask_subset(trialcs->cpus_allowed,
+                                   top_cpuset.cpus_allowed))
                        return -EINVAL;
        }
 
@@ -920,16 +943,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
        if (retval < 0)
                return retval;
 
-       is_load_balanced = is_sched_load_balance(trialcs);
-
        mutex_lock(&callback_mutex);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
        mutex_unlock(&callback_mutex);
 
-       update_tasks_cpumask_hier(cs, true);
-
-       if (is_load_balanced)
-               rebuild_sched_domains_locked();
+       /* use trialcs->cpus_allowed as a temp variable */
+       update_cpumasks_hier(cs, trialcs->cpus_allowed);
        return 0;
 }
 
@@ -951,15 +970,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                        const nodemask_t *to)
 {
        struct task_struct *tsk = current;
-       struct cpuset *mems_cs;
 
        tsk->mems_allowed = *to;
 
        do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
        rcu_read_lock();
-       mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-       guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+       guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
        rcu_read_unlock();
 }
 
@@ -1028,13 +1045,12 @@ static void *cpuset_being_rebound;
 static void update_tasks_nodemask(struct cpuset *cs)
 {
        static nodemask_t newmems;      /* protected by cpuset_mutex */
-       struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
        struct css_task_iter it;
        struct task_struct *task;
 
        cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
 
-       guarantee_online_mems(mems_cs, &newmems);
+       guarantee_online_mems(cs, &newmems);
 
        /*
         * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1093,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
 }
 
 /*
- * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
- * @cs: the root cpuset of the hierarchy
- * @update_root: update the root cpuset or not?
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
  *
- * This will update nodemasks of tasks in @root_cs and all other empty cpusets
- * which take on nodemask of @root_cs.
+ * When the configured nodemask is changed, the effective nodemasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hierarchy, effective_mems will be the same as mems_allowed.
  *
  * Called with cpuset_mutex held
  */
-static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 {
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
 
        rcu_read_lock();
-       cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-               if (cp == root_cs) {
-                       if (!update_root)
-                               continue;
-               } else {
-                       /* skip the whole subtree if @cp have some CPU */
-                       if (!nodes_empty(cp->mems_allowed)) {
-                               pos_css = css_rightmost_descendant(pos_css);
-                               continue;
-                       }
+       cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+               struct cpuset *parent = parent_cs(cp);
+
+               nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+               /*
+                * If it becomes empty, inherit the effective mask of the
+                * parent, which is guaranteed to have some MEMs.
+                */
+               if (nodes_empty(*new_mems))
+                       *new_mems = parent->effective_mems;
+
+               /* Skip the whole subtree if the nodemask remains the same. */
+               if (nodes_equal(*new_mems, cp->effective_mems)) {
+                       pos_css = css_rightmost_descendant(pos_css);
+                       continue;
                }
+
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();
 
+               mutex_lock(&callback_mutex);
+               cp->effective_mems = *new_mems;
+               mutex_unlock(&callback_mutex);
+
+               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+                       !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
                update_tasks_nodemask(cp);
 
                rcu_read_lock();
@@ -1156,8 +1188,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                        goto done;
 
                if (!nodes_subset(trialcs->mems_allowed,
-                               node_states[N_MEMORY])) {
-                       retval =  -EINVAL;
+                                 top_cpuset.mems_allowed)) {
+                       retval = -EINVAL;
                        goto done;
                }
        }
@@ -1174,7 +1206,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
        cs->mems_allowed = trialcs->mems_allowed;
        mutex_unlock(&callback_mutex);
 
-       update_tasks_nodemask_hier(cs, true);
+       /* use trialcs->mems_allowed as a temp variable */
+       update_nodemasks_hier(cs, &trialcs->mems_allowed);
 done:
        return retval;
 }
@@ -1389,12 +1422,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
 
        mutex_lock(&cpuset_mutex);
 
-       /*
-        * We allow to move tasks into an empty cpuset if sane_behavior
-        * flag is set.
-        */
+       /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
-       if (!cgroup_sane_behavior(css->cgroup) &&
+       if (!cgroup_on_dfl(css->cgroup) &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;
 
@@ -1452,8 +1482,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
        struct task_struct *leader = cgroup_taskset_first(tset);
        struct cpuset *cs = css_cs(css);
        struct cpuset *oldcs = cpuset_attach_old_cs;
-       struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
-       struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
 
        mutex_lock(&cpuset_mutex);
 
@@ -1461,9 +1489,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
        if (cs == &top_cpuset)
                cpumask_copy(cpus_attach, cpu_possible_mask);
        else
-               guarantee_online_cpus(cpus_cs, cpus_attach);
+               guarantee_online_cpus(cs, cpus_attach);
 
-       guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+       guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 
        cgroup_taskset_for_each(task, tset) {
                /*
@@ -1480,11 +1508,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
         * Change mm, possibly for multiple threads in a threadgroup. This is
         * expensive and may sleep.
         */
-       cpuset_attach_nodemask_to = cs->mems_allowed;
+       cpuset_attach_nodemask_to = cs->effective_mems;
        mm = get_task_mm(leader);
        if (mm) {
-               struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
-
                mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 
                /*
@@ -1495,7 +1521,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
                 * mm from.
                 */
                if (is_memory_migrate(cs)) {
-                       cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+                       cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                          &cpuset_attach_nodemask_to);
                }
                mmput(mm);
@@ -1516,6 +1542,8 @@ typedef enum {
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
+       FILE_EFFECTIVE_CPULIST,
+       FILE_EFFECTIVE_MEMLIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
@@ -1694,6 +1722,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
        case FILE_MEMLIST:
                s += nodelist_scnprintf(s, count, cs->mems_allowed);
                break;
+       case FILE_EFFECTIVE_CPULIST:
+               s += cpulist_scnprintf(s, count, cs->effective_cpus);
+               break;
+       case FILE_EFFECTIVE_MEMLIST:
+               s += nodelist_scnprintf(s, count, cs->effective_mems);
+               break;
        default:
                ret = -EINVAL;
                goto out_unlock;
@@ -1778,6 +1812,18 @@ static struct cftype files[] = {
                .private = FILE_MEMLIST,
        },
 
+       {
+               .name = "effective_cpus",
+               .seq_show = cpuset_common_seq_show,
+               .private = FILE_EFFECTIVE_CPULIST,
+       },
+
+       {
+               .name = "effective_mems",
+               .seq_show = cpuset_common_seq_show,
+               .private = FILE_EFFECTIVE_MEMLIST,
+       },
+
        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
@@ -1869,18 +1915,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
        cs = kzalloc(sizeof(*cs), GFP_KERNEL);
        if (!cs)
                return ERR_PTR(-ENOMEM);
-       if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
-               kfree(cs);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+               goto free_cs;
+       if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+               goto free_cpus;
 
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        cpumask_clear(cs->cpus_allowed);
        nodes_clear(cs->mems_allowed);
+       cpumask_clear(cs->effective_cpus);
+       nodes_clear(cs->effective_mems);
        fmeter_init(&cs->fmeter);
        cs->relax_domain_level = -1;
 
        return &cs->css;
+
+free_cpus:
+       free_cpumask_var(cs->cpus_allowed);
+free_cs:
+       kfree(cs);
+       return ERR_PTR(-ENOMEM);
 }
 
 static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1903,6 +1957,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 
        cpuset_inc();
 
+       mutex_lock(&callback_mutex);
+       if (cgroup_on_dfl(cs->css.cgroup)) {
+               cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+               cs->effective_mems = parent->effective_mems;
+       }
+       mutex_unlock(&callback_mutex);
+
        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                goto out_unlock;
 
@@ -1962,20 +2023,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 {
        struct cpuset *cs = css_cs(css);
 
+       free_cpumask_var(cs->effective_cpus);
        free_cpumask_var(cs->cpus_allowed);
        kfree(cs);
 }
 
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+       mutex_lock(&cpuset_mutex);
+       mutex_lock(&callback_mutex);
+
+       if (cgroup_on_dfl(root_css->cgroup)) {
+               cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+               top_cpuset.mems_allowed = node_possible_map;
+       } else {
+               cpumask_copy(top_cpuset.cpus_allowed,
+                            top_cpuset.effective_cpus);
+               top_cpuset.mems_allowed = top_cpuset.effective_mems;
+       }
+
+       mutex_unlock(&callback_mutex);
+       mutex_unlock(&cpuset_mutex);
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
-       .css_alloc = cpuset_css_alloc,
-       .css_online = cpuset_css_online,
-       .css_offline = cpuset_css_offline,
-       .css_free = cpuset_css_free,
-       .can_attach = cpuset_can_attach,
-       .cancel_attach = cpuset_cancel_attach,
-       .attach = cpuset_attach,
-       .base_cftypes = files,
-       .early_init = 1,
+       .css_alloc      = cpuset_css_alloc,
+       .css_online     = cpuset_css_online,
+       .css_offline    = cpuset_css_offline,
+       .css_free       = cpuset_css_free,
+       .can_attach     = cpuset_can_attach,
+       .cancel_attach  = cpuset_cancel_attach,
+       .attach         = cpuset_attach,
+       .bind           = cpuset_bind,
+       .legacy_cftypes = files,
+       .early_init     = 1,
 };
 
 /**
@@ -1990,9 +2071,13 @@ int __init cpuset_init(void)
 
        if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
                BUG();
+       if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+               BUG();
 
        cpumask_setall(top_cpuset.cpus_allowed);
        nodes_setall(top_cpuset.mems_allowed);
+       cpumask_setall(top_cpuset.effective_cpus);
+       nodes_setall(top_cpuset.effective_mems);
 
        fmeter_init(&top_cpuset.fmeter);
        set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2120,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
        }
 }
 
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+                           struct cpumask *new_cpus, nodemask_t *new_mems,
+                           bool cpus_updated, bool mems_updated)
+{
+       bool is_empty;
+
+       mutex_lock(&callback_mutex);
+       cpumask_copy(cs->cpus_allowed, new_cpus);
+       cpumask_copy(cs->effective_cpus, new_cpus);
+       cs->mems_allowed = *new_mems;
+       cs->effective_mems = *new_mems;
+       mutex_unlock(&callback_mutex);
+
+       /*
+        * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+        * as the tasks will be migrated to an ancestor.
+        */
+       if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+               update_tasks_cpumask(cs);
+       if (mems_updated && !nodes_empty(cs->mems_allowed))
+               update_tasks_nodemask(cs);
+
+       is_empty = cpumask_empty(cs->cpus_allowed) ||
+                  nodes_empty(cs->mems_allowed);
+
+       mutex_unlock(&cpuset_mutex);
+
+       /*
+        * Move tasks to the nearest ancestor with execution resources.
+        * This is a full cgroup operation which will also call back into
+        * cpuset. Should be done outside any lock.
+        */
+       if (is_empty)
+               remove_tasks_in_empty_cpuset(cs);
+
+       mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+                    struct cpumask *new_cpus, nodemask_t *new_mems,
+                    bool cpus_updated, bool mems_updated)
+{
+       if (cpumask_empty(new_cpus))
+               cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+       if (nodes_empty(*new_mems))
+               *new_mems = parent_cs(cs)->effective_mems;
+
+       mutex_lock(&callback_mutex);
+       cpumask_copy(cs->effective_cpus, new_cpus);
+       cs->effective_mems = *new_mems;
+       mutex_unlock(&callback_mutex);
+
+       if (cpus_updated)
+               update_tasks_cpumask(cs);
+       if (mems_updated)
+               update_tasks_nodemask(cs);
+}
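(Illustration only, user-space C with made-up names: the essential difference between hotplug_update_tasks_legacy() and hotplug_update_tasks() above. On the legacy hierarchy the configured masks shrink along with the hardware and a cpuset that ends up empty loses its tasks to an ancestor; on the default hierarchy the configured masks are untouched and an empty intersection simply borrows the parent's effective mask.)

#include <stdbool.h>
#include <stdio.h>

struct toy_cpuset {
	unsigned long cpus_allowed;	/* user-configured mask */
	unsigned long effective_cpus;	/* mask actually applied to tasks */
};

/*
 * Handle hot-unplug for one cpuset whose parent's effective mask is already
 * up to date. Returns true if the cpuset ends up with no configured CPUs,
 * which on the legacy hierarchy means its tasks get migrated away.
 */
static bool toy_hotplug_update(struct toy_cpuset *cs,
			       const struct toy_cpuset *parent, bool on_dfl)
{
	unsigned long new_cpus = cs->cpus_allowed & parent->effective_cpus;

	if (on_dfl) {
		/* Default hierarchy: leave the configured mask alone. */
		cs->effective_cpus = new_cpus ? new_cpus
					      : parent->effective_cpus;
		return false;
	}

	/* Legacy hierarchy: both masks track the remaining hardware. */
	cs->cpus_allowed = new_cpus;
	cs->effective_cpus = new_cpus;
	return !new_cpus;
}

int main(void)
{
	struct toy_cpuset parent = { .cpus_allowed = 0x3, .effective_cpus = 0x3 };
	struct toy_cpuset child  = { .cpus_allowed = 0xc, .effective_cpus = 0xc };

	/* CPUs 2-3 went offline, so the parent's effective mask is 0x3: the
	 * child keeps running on the default hierarchy, but would be emptied
	 * on the legacy one. */
	printf("default: empty=%d\n", toy_hotplug_update(&child, &parent, true));
	printf("legacy:  empty=%d\n", toy_hotplug_update(&child, &parent, false));
	return 0;
}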
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
@@ -2045,11 +2190,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  */
 static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
-       static cpumask_t off_cpus;
-       static nodemask_t off_mems;
-       bool is_empty;
-       bool sane = cgroup_sane_behavior(cs->css.cgroup);
-
+       static cpumask_t new_cpus;
+       static nodemask_t new_mems;
+       bool cpus_updated;
+       bool mems_updated;
 retry:
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2064,51 +2208,20 @@ retry:
                goto retry;
        }
 
-       cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
-       nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
-       mutex_lock(&callback_mutex);
-       cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-       mutex_unlock(&callback_mutex);
-
-       /*
-        * If sane_behavior flag is set, we need to update tasks' cpumask
-        * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
-        * call update_tasks_cpumask() if the cpuset becomes empty, as
-        * the tasks in it will be migrated to an ancestor.
-        */
-       if ((sane && cpumask_empty(cs->cpus_allowed)) ||
-           (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
-               update_tasks_cpumask(cs);
+       cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+       nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
 
-       mutex_lock(&callback_mutex);
-       nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-       mutex_unlock(&callback_mutex);
-
-       /*
-        * If sane_behavior flag is set, we need to update tasks' nodemask
-        * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
-        * call update_tasks_nodemask() if the cpuset becomes empty, as
-        * the tasks in it will be migratd to an ancestor.
-        */
-       if ((sane && nodes_empty(cs->mems_allowed)) ||
-           (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
-               update_tasks_nodemask(cs);
+       cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+       mems_updated = !nodes_equal(new_mems, cs->effective_mems);
 
-       is_empty = cpumask_empty(cs->cpus_allowed) ||
-               nodes_empty(cs->mems_allowed);
+       if (cgroup_on_dfl(cs->css.cgroup))
+               hotplug_update_tasks(cs, &new_cpus, &new_mems,
+                                    cpus_updated, mems_updated);
+       else
+               hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+                                           cpus_updated, mems_updated);
 
        mutex_unlock(&cpuset_mutex);
-
-       /*
-        * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
-        *
-        * Otherwise move tasks to the nearest ancestor with execution
-        * resources.  This is full cgroup operation which will
-        * also call back into cpuset.  Should be done outside any lock.
-        */
-       if (!sane && is_empty)
-               remove_tasks_in_empty_cpuset(cs);
 }
 
 /**
@@ -2132,6 +2245,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
+       bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
 
        mutex_lock(&cpuset_mutex);
 
@@ -2139,13 +2253,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];
 
-       cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-       mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+       cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+       mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
        /* synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
                mutex_lock(&callback_mutex);
-               cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+               if (!on_dfl)
+                       cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+               cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                mutex_unlock(&callback_mutex);
                /* we don't mess with cpumasks of tasks in top_cpuset */
        }
@@ -2153,7 +2269,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
        /* synchronize mems_allowed to N_MEMORY */
        if (mems_updated) {
                mutex_lock(&callback_mutex);
-               top_cpuset.mems_allowed = new_mems;
+               if (!on_dfl)
+                       top_cpuset.mems_allowed = new_mems;
+               top_cpuset.effective_mems = new_mems;
                mutex_unlock(&callback_mutex);
                update_tasks_nodemask(&top_cpuset);
        }
@@ -2228,6 +2346,9 @@ void __init cpuset_init_smp(void)
        top_cpuset.mems_allowed = node_states[N_MEMORY];
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
 
+       cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+       top_cpuset.effective_mems = node_states[N_MEMORY];
+
        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
 }
 
@@ -2244,23 +2365,17 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-       struct cpuset *cpus_cs;
-
        mutex_lock(&callback_mutex);
        rcu_read_lock();
-       cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
-       guarantee_online_cpus(cpus_cs, pmask);
+       guarantee_online_cpus(task_cs(tsk), pmask);
        rcu_read_unlock();
        mutex_unlock(&callback_mutex);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
-       struct cpuset *cpus_cs;
-
        rcu_read_lock();
-       cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
-       do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+       do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
        rcu_read_unlock();
 
        /*
@@ -2299,13 +2414,11 @@ void cpuset_init_current_mems_allowed(void)
 
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
-       struct cpuset *mems_cs;
        nodemask_t mask;
 
        mutex_lock(&callback_mutex);
        rcu_read_lock();
-       mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-       guarantee_online_mems(mems_cs, &mask);
+       guarantee_online_mems(task_cs(tsk), &mask);
        rcu_read_unlock();
        mutex_unlock(&callback_mutex);