Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/drzeus/mmc

[pandora-kernel.git] / kernel / cpuset.c
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index f382b0f..57e6448 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -42,7 +42,6 @@
  #include <linux/seq_file.h>
  #include <linux/security.h>
  #include <linux/slab.h>
-#include <linux/smp_lock.h>
  #include <linux/spinlock.h>
  #include <linux/stat.h>
  #include <linux/string.h>
@@ -517,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf)
         envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
         envp[i] = NULL;
  
-       call_usermodehelper(argv[0], argv, envp, 0);
+       call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
         kfree(pathbuf);
  }
  
@@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf)
                 return -EACCES;
  
         trialcs = *cs;
-       retval = cpulist_parse(buf, trialcs.cpus_allowed);
-       if (retval < 0)
-               return retval;
+
+       /*
+        * We allow a cpuset's cpus_allowed to be empty; if it has attached
+        * tasks, we'll catch it later when we validate the change and return
+        * -ENOSPC.
+        */
+       if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
+               cpus_clear(trialcs.cpus_allowed);
+       } else {
+               retval = cpulist_parse(buf, trialcs.cpus_allowed);
+               if (retval < 0)
+                       return retval;
+       }
         cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
-       if (cpus_empty(trialcs.cpus_allowed))
+       /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
+       if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
                 return -ENOSPC;
         retval = validate_change(cs, &trialcs);
         if (retval < 0)
@@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf)
                 return -EACCES;
  
         trialcs = *cs;
-       retval = nodelist_parse(buf, trialcs.mems_allowed);
-       if (retval < 0)
-               goto done;
+
+       /*
+        * We allow a cpuset's mems_allowed to be empty; if it has attached
+        * tasks, we'll catch it later when we validate the change and return
+        * -ENOSPC.
+        */
+       if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
+               nodes_clear(trialcs.mems_allowed);
+       } else {
+               retval = nodelist_parse(buf, trialcs.mems_allowed);
+               if (retval < 0)
+                       goto done;
+       }
         nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
         oldmem = cs->mems_allowed;
         if (nodes_equal(oldmem, trialcs.mems_allowed)) {
                 retval = 0;             /* Too easy - nothing to do */
                 goto done;
         }
-       if (nodes_empty(trialcs.mems_allowed)) {
+       /* mems_allowed cannot be empty for a cpuset with attached tasks. */
+       if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
                 retval = -ENOSPC;
                 goto done;
         }
@@ -960,10 +981,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
                 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
                 if (!mmarray)
                         goto done;
-               write_lock_irq(&tasklist_lock);         /* block fork */
+               read_lock(&tasklist_lock);              /* block fork */
                 if (atomic_read(&cs->count) <= ntasks)
                         break;                          /* got enough */
-               write_unlock_irq(&tasklist_lock);       /* try again */
+               read_unlock(&tasklist_lock);            /* try again */
                 kfree(mmarray);
         }
  
@@ -985,7 +1006,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
                         continue;
                 mmarray[n++] = mm;
         } while_each_thread(g, p);
-       write_unlock_irq(&tasklist_lock);
+       read_unlock(&tasklist_lock);
  
         /*
          * Now that we've dropped the tasklist spinlock, we can
@@ -1661,9 +1682,9 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
  
         do_each_thread(g, p) {
                 if (p->cpuset == cs) {
-                       pidarray[n++] = p->pid;
                         if (unlikely(n == npids))
                                 goto array_full;
+                       pidarray[n++] = p->pid;
                 }
         } while_each_thread(g, p);
  
@@ -1751,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
  {
         struct ctr_struct *ctr = file->private_data;
  
-       if (*ppos + nbytes > ctr->bufsz)
-               nbytes = ctr->bufsz - *ppos;
-       if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
-               return -EFAULT;
-       *ppos += nbytes;
-       return nbytes;
+       return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
  }
  
  static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
@@ -2122,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void)
  static int cpuset_handle_cpuhp(struct notifier_block *nb,
                                 unsigned long phase, void *cpu)
  {
+       if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
+               return NOTIFY_DONE;
+
         common_cpu_mem_hotplug_unplug();
         return 0;
  }
@@ -2200,10 +2219,6 @@ void cpuset_fork(struct task_struct *child)
   * it is holding that mutex while calling check_for_release(),
   * which calls kmalloc(), so can't be called holding callback_mutex().
   *
- * We don't need to task_lock() this reference to tsk->cpuset,
- * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it, or task is a failed fork, never visible to attach_task.
- *
   * the_top_cpuset_hack:
   *
   *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2242,8 +2257,10 @@ void cpuset_exit(struct task_struct *tsk)
  {
         struct cpuset *cs;
  
+       task_lock(current);
         cs = tsk->cpuset;
         tsk->cpuset = &top_cpuset;      /* the_top_cpuset_hack - see above */
+       task_unlock(current);
  
         if (notify_on_release(cs)) {
                 char *pathbuf = NULL;
@@ -2351,6 +2368,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
   * z's node is in our tasks mems_allowed, yes.  If it's not a
   * __GFP_HARDWALL request and this zone's nodes is in the nearest
   * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * If the task has been OOM killed and has access to memory reserves
+ * as specified by the TIF_MEMDIE flag, yes.
   * Otherwise, no.
   *
   * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
@@ -2368,7 +2387,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
   * calls get to this routine, we should just shut up and say 'yes'.
   *
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
- * and do not allow allocations outside the current tasks cpuset.
+ * and do not allow allocations outside the current tasks cpuset
+ * unless the task has been OOM killed as is marked TIF_MEMDIE.
   * GFP_KERNEL allocations are not so marked, so can escape to the
   * nearest enclosing mem_exclusive ancestor cpuset.
   *
@@ -2392,6 +2412,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
   * affect that:
   *     in_interrupt - any node ok (current task context irrelevant)
   *     GFP_ATOMIC   - any node ok
+ *     TIF_MEMDIE   - any node ok
   *     GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
   *     GFP_USER     - only nodes in current tasks mems allowed ok.
   *
@@ -2413,6 +2434,12 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
         might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
         if (node_isset(node, current->mems_allowed))
                 return 1;
+       /*
+        * Allow tasks that have access to memory reserves because they have
+        * been OOM killed to get memory anywhere.
+        */
+       if (unlikely(test_thread_flag(TIF_MEMDIE)))
+               return 1;
         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
                 return 0;
  
@@ -2438,7 +2465,9 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
   *
   * If we're in interrupt, yes, we can always allocate.
   * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
- * z's node is in our tasks mems_allowed, yes.   Otherwise, no.
+ * z's node is in our tasks mems_allowed, yes.   If the task has been
+ * OOM killed and has access to memory reserves as specified by the
+ * TIF_MEMDIE flag, yes.  Otherwise, no.
   *
   * The __GFP_THISNODE placement logic is really handled elsewhere,
   * by forcibly using a zonelist starting at a specified node, and by
@@ -2462,6 +2491,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
         node = zone_to_nid(z);
         if (node_isset(node, current->mems_allowed))
                 return 1;
+        /*
+         * Allow tasks that have access to memory reserves because they have
+         * been OOM killed to get memory anywhere.
+         */
+        if (unlikely(test_thread_flag(TIF_MEMDIE)))
+                return 1;
         return 0;
  }