sched: fix cpu hotplug, cleanup
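
This diff to kernel/cpuset.c adds the CS_MEM_HARDWALL flag and its "mem_hardwall" control file, makes "cpus" and "mems" writes return -EINVAL when the requested CPUs or memory nodes are not online instead of silently masking them, switches "sched_relax_domain_level" to the s64 cgroup read/write handlers with range checking against SD_LV_MAX, consolidates the per-file cftype definitions into a single files[] array registered through cgroup_add_files(), rebuilds scheduler domains after a CPU/memory hotplug event, and renames nearest_exclusive_ancestor() to nearest_hardwall_ancestor() so GFP_KERNEL allocations may escape only to the nearest mem_exclusive or mem_hardwall ancestor.

As a rough illustration only (not part of the patch), the sketch below shows how the changed control files might be exercised from userspace; the mount point /dev/cpuset and the child cpuset name "mygroup" are assumptions for the example.

    /*
     * Illustrative userspace sketch, not part of this patch.
     * Assumes the cpuset filesystem is mounted at /dev/cpuset and that a
     * child cpuset named "mygroup" already exists (both are assumptions).
     */
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    static int cpuset_write(const char *path, const char *val)
    {
    	int fd = open(path, O_WRONLY);
    	ssize_t n;

    	if (fd < 0)
    		return -1;
    	n = write(fd, val, strlen(val));	/* kernel rejects bad input with -EINVAL */
    	close(fd);
    	return n < 0 ? -1 : 0;
    }

    int main(void)
    {
    	/* new flag from this patch: mark the cpuset as a hardwall for GFP_KERNEL allocations */
    	cpuset_write("/dev/cpuset/mygroup/mem_hardwall", "1");

    	/* now parsed via write_s64; values outside -1..SD_LV_MAX-1 are rejected */
    	cpuset_write("/dev/cpuset/mygroup/sched_relax_domain_level", "-1");

    	/* must be a subset of the online CPUs or the write fails with EINVAL */
    	cpuset_write("/dev/cpuset/mygroup/cpus", "0-1");

    	return 0;
    }
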
[pandora-kernel.git] / kernel / cpuset.c
index b557127..9fceb97 100644
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
 typedef enum {
        CS_CPU_EXCLUSIVE,
        CS_MEM_EXCLUSIVE,
+       CS_MEM_HARDWALL,
        CS_MEMORY_MIGRATE,
        CS_SCHED_LOAD_BALANCE,
        CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
        return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+       return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
 static inline int is_sched_load_balance(const struct cpuset *cs)
 {
        return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -791,8 +797,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
                retval = cpulist_parse(buf, trialcs.cpus_allowed);
                if (retval < 0)
                        return retval;
+
+               if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
+                       return -EINVAL;
        }
-       cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
        retval = validate_change(cs, &trialcs);
        if (retval < 0)
                return retval;
@@ -926,9 +934,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
                retval = nodelist_parse(buf, trialcs.mems_allowed);
                if (retval < 0)
                        goto done;
+
+               if (!nodes_subset(trialcs.mems_allowed,
+                               node_states[N_HIGH_MEMORY]))
+                       return -EINVAL;
        }
-       nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
-                                               node_states[N_HIGH_MEMORY]);
        oldmem = cs->mems_allowed;
        if (nodes_equal(oldmem, trialcs.mems_allowed)) {
                retval = 0;             /* Too easy - nothing to do */
@@ -1025,12 +1035,10 @@ int current_cpuset_is_being_rebound(void)
        return task_cs(current) == cpuset_being_rebound;
 }
 
-static int update_relax_domain_level(struct cpuset *cs, char *buf)
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
-       int val = simple_strtol(buf, NULL, 10);
-
-       if (val < 0)
-               val = -1;
+       if (val < -1 || val >= SD_LV_MAX)
+               return -EINVAL;
 
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
@@ -1042,12 +1050,9 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
 
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
- * bit:        the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *                             CS_SCHED_LOAD_BALANCE,
- *                             CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
- *                             CS_SPREAD_PAGE, CS_SPREAD_SLAB)
- * cs: the cpuset to update
- * buf:        the buffer where we read the 0 or 1
+ * bit:                the bit to update (see cpuset_flagbits_t)
+ * cs:         the cpuset to update
+ * turning_on:         whether the flag is being set or cleared
  *
  * Call with cgroup_mutex held.
  */
@@ -1228,6 +1233,7 @@ typedef enum {
        FILE_MEMLIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
+       FILE_MEM_HARDWALL,
        FILE_SCHED_LOAD_BALANCE,
        FILE_SCHED_RELAX_DOMAIN_LEVEL,
        FILE_MEMORY_PRESSURE_ENABLED,
@@ -1276,9 +1282,6 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
        case FILE_MEMLIST:
                retval = update_nodemask(cs, buffer);
                break;
-       case FILE_SCHED_RELAX_DOMAIN_LEVEL:
-               retval = update_relax_domain_level(cs, buffer);
-               break;
        default:
                retval = -EINVAL;
                goto out2;
@@ -1313,6 +1316,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
        case FILE_MEM_EXCLUSIVE:
                retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
                break;
+       case FILE_MEM_HARDWALL:
+               retval = update_flag(CS_MEM_HARDWALL, cs, val);
+               break;
        case FILE_SCHED_LOAD_BALANCE:
                retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
                break;
@@ -1341,6 +1347,30 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
        return retval;
 }
 
+static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
+{
+       int retval = 0;
+       struct cpuset *cs = cgroup_cs(cgrp);
+       cpuset_filetype_t type = cft->private;
+
+       cgroup_lock();
+
+       if (cgroup_is_removed(cgrp)) {
+               cgroup_unlock();
+               return -ENODEV;
+       }
+       switch (type) {
+       case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+               retval = update_relax_domain_level(cs, val);
+               break;
+       default:
+               retval = -EINVAL;
+               break;
+       }
+       cgroup_unlock();
+       return retval;
+}
+
 /*
  * These ascii lists should be read in a single call, by using a user
  * buffer large enough to hold the entire map.  If read in smaller
@@ -1399,9 +1429,6 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
        case FILE_MEMLIST:
                s += cpuset_sprintf_memlist(s, cs);
                break;
-       case FILE_SCHED_RELAX_DOMAIN_LEVEL:
-               s += sprintf(s, "%d", cs->relax_domain_level);
-               break;
        default:
                retval = -EINVAL;
                goto out;
@@ -1423,6 +1450,8 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
                return is_cpu_exclusive(cs);
        case FILE_MEM_EXCLUSIVE:
                return is_mem_exclusive(cs);
+       case FILE_MEM_HARDWALL:
+               return is_mem_hardwall(cs);
        case FILE_SCHED_LOAD_BALANCE:
                return is_sched_load_balance(cs);
        case FILE_MEMORY_MIGRATE:
@@ -1440,58 +1469,100 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
        }
 }
 
+static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
+{
+       struct cpuset *cs = cgroup_cs(cont);
+       cpuset_filetype_t type = cft->private;
+       switch (type) {
+       case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+               return cs->relax_domain_level;
+       default:
+               BUG();
+       }
+}
+
 
 /*
  * for the common functions, 'private' gives the type of file
  */
 
-static struct cftype cft_cpus = {
-       .name = "cpus",
-       .read = cpuset_common_file_read,
-       .write = cpuset_common_file_write,
-       .private = FILE_CPULIST,
-};
-
-static struct cftype cft_mems = {
-       .name = "mems",
-       .read = cpuset_common_file_read,
-       .write = cpuset_common_file_write,
-       .private = FILE_MEMLIST,
-};
-
-static struct cftype cft_cpu_exclusive = {
-       .name = "cpu_exclusive",
-       .read_u64 = cpuset_read_u64,
-       .write_u64 = cpuset_write_u64,
-       .private = FILE_CPU_EXCLUSIVE,
-};
-
-static struct cftype cft_mem_exclusive = {
-       .name = "mem_exclusive",
-       .read_u64 = cpuset_read_u64,
-       .write_u64 = cpuset_write_u64,
-       .private = FILE_MEM_EXCLUSIVE,
-};
-
-static struct cftype cft_sched_load_balance = {
-       .name = "sched_load_balance",
-       .read_u64 = cpuset_read_u64,
-       .write_u64 = cpuset_write_u64,
-       .private = FILE_SCHED_LOAD_BALANCE,
-};
-
-static struct cftype cft_sched_relax_domain_level = {
-       .name = "sched_relax_domain_level",
-       .read = cpuset_common_file_read,
-       .write = cpuset_common_file_write,
-       .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
-};
-
-static struct cftype cft_memory_migrate = {
-       .name = "memory_migrate",
-       .read_u64 = cpuset_read_u64,
-       .write_u64 = cpuset_write_u64,
-       .private = FILE_MEMORY_MIGRATE,
+static struct cftype files[] = {
+       {
+               .name = "cpus",
+               .read = cpuset_common_file_read,
+               .write = cpuset_common_file_write,
+               .private = FILE_CPULIST,
+       },
+
+       {
+               .name = "mems",
+               .read = cpuset_common_file_read,
+               .write = cpuset_common_file_write,
+               .private = FILE_MEMLIST,
+       },
+
+       {
+               .name = "cpu_exclusive",
+               .read_u64 = cpuset_read_u64,
+               .write_u64 = cpuset_write_u64,
+               .private = FILE_CPU_EXCLUSIVE,
+       },
+
+       {
+               .name = "mem_exclusive",
+               .read_u64 = cpuset_read_u64,
+               .write_u64 = cpuset_write_u64,
+               .private = FILE_MEM_EXCLUSIVE,
+       },
+
+       {
+               .name = "mem_hardwall",
+               .read_u64 = cpuset_read_u64,
+               .write_u64 = cpuset_write_u64,
+               .private = FILE_MEM_HARDWALL,
+       },
+
+       {
+               .name = "sched_load_balance",
+               .read_u64 = cpuset_read_u64,
+               .write_u64 = cpuset_write_u64,
+               .private = FILE_SCHED_LOAD_BALANCE,
+       },
+
+       {
+               .name = "sched_relax_domain_level",
+               .read_s64 = cpuset_read_s64,
+               .write_s64 = cpuset_write_s64,
+               .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+       },
+
+       {
+               .name = "memory_migrate",
+               .read_u64 = cpuset_read_u64,
+               .write_u64 = cpuset_write_u64,
+               .private = FILE_MEMORY_MIGRATE,
+       },
+
+       {
+               .name = "memory_pressure",
+               .read_u64 = cpuset_read_u64,
+               .write_u64 = cpuset_write_u64,
+               .private = FILE_MEMORY_PRESSURE,
+       },
+
+       {
+               .name = "memory_spread_page",
+               .read_u64 = cpuset_read_u64,
+               .write_u64 = cpuset_write_u64,
+               .private = FILE_SPREAD_PAGE,
+       },
+
+       {
+               .name = "memory_spread_slab",
+               .read_u64 = cpuset_read_u64,
+               .write_u64 = cpuset_write_u64,
+               .private = FILE_SPREAD_SLAB,
+       },
 };
 
 static struct cftype cft_memory_pressure_enabled = {
@@ -1501,57 +1572,18 @@ static struct cftype cft_memory_pressure_enabled = {
        .private = FILE_MEMORY_PRESSURE_ENABLED,
 };
 
-static struct cftype cft_memory_pressure = {
-       .name = "memory_pressure",
-       .read_u64 = cpuset_read_u64,
-       .write_u64 = cpuset_write_u64,
-       .private = FILE_MEMORY_PRESSURE,
-};
-
-static struct cftype cft_spread_page = {
-       .name = "memory_spread_page",
-       .read_u64 = cpuset_read_u64,
-       .write_u64 = cpuset_write_u64,
-       .private = FILE_SPREAD_PAGE,
-};
-
-static struct cftype cft_spread_slab = {
-       .name = "memory_spread_slab",
-       .read_u64 = cpuset_read_u64,
-       .write_u64 = cpuset_write_u64,
-       .private = FILE_SPREAD_SLAB,
-};
-
 static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 {
        int err;
 
-       if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss,
-                                       &cft_sched_relax_domain_level)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
-               return err;
-       if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
+       err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+       if (err)
                return err;
        /* memory_pressure_enabled is in root cpuset only */
-       if (err == 0 && !cont->parent)
+       if (!cont->parent)
                err = cgroup_add_file(cont, ss,
-                                        &cft_memory_pressure_enabled);
-       return 0;
+                                     &cft_memory_pressure_enabled);
+       return err;
 }
 
 /*
@@ -1858,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void)
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
        scan_for_empty_cpusets(&top_cpuset);
 
+       /*
+        * Scheduler destroys domains on hotplug events.
+        * Rebuild them based on the current settings.
+        */
+       rebuild_sched_domains();
+
        cgroup_unlock();
 }
 
@@ -1979,14 +2017,14 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 }
 
 /*
- * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset.  Call holding callback_mutex.
- * If no ancestor is mem_exclusive (an unusual configuration), then
- * returns the root cpuset.
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset.  Call holding
+ * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
  */
-static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 {
-       while (!is_mem_exclusive(cs) && cs->parent)
+       while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
                cs = cs->parent;
        return cs;
 }
@@ -2000,7 +2038,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * __GFP_THISNODE is set, yes, we can always allocate.  If zone
  * z's node is in our tasks mems_allowed, yes.  If it's not a
  * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * hardwalled cpuset ancestor to this tasks cpuset, yes.
  * If the task has been OOM killed and has access to memory reserves
  * as specified by the TIF_MEMDIE flag, yes.
  * Otherwise, no.
@@ -2023,7 +2061,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * and do not allow allocations outside the current tasks cpuset
  * unless the task has been OOM killed as is marked TIF_MEMDIE.
  * GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest enclosing mem_exclusive ancestor cpuset.
+ * nearest enclosing hardwalled ancestor cpuset.
  *
  * Scanning up parent cpusets requires callback_mutex.  The
  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2046,7 +2084,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  *     in_interrupt - any node ok (current task context irrelevant)
  *     GFP_ATOMIC   - any node ok
  *     TIF_MEMDIE   - any node ok
- *     GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ *     GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *     GFP_USER     - only nodes in current tasks mems allowed ok.
  *
  * Rule:
@@ -2083,7 +2121,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
        mutex_lock(&callback_mutex);
 
        task_lock(current);
-       cs = nearest_exclusive_ancestor(task_cs(current));
+       cs = nearest_hardwall_ancestor(task_cs(current));
        task_unlock(current);
 
        allowed = node_isset(node, cs->mems_allowed);