ipv4: avoid parallel route cache gc executions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ce..ffcf896 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list)
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
-static DEFINE_SPINLOCK(release_list_lock);
+static DEFINE_RAW_SPINLOCK(release_list_lock);
 static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
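On PREEMPT_RT a plain spinlock_t is turned into a sleeping lock, so it cannot be taken in contexts that must stay atomic; DEFINE_RAW_SPINLOCK keeps release_list_lock a genuine busy-waiting lock there. All spin_lock(&release_list_lock) / spin_unlock(&release_list_lock) sites later in this patch switch to raw_spin_lock() / raw_spin_unlock() to match this definition.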
@@ -361,12 +361,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
                struct cgroup *cgrp = link->cgrp;
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
+
+               /*
+                * We may not be holding cgroup_mutex, and if cgrp->count is
+                * dropped to 0 the cgroup can be destroyed at any time, hence
+                * rcu_read_lock is used to keep it alive.
+                */
+               rcu_read_lock();
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
+               rcu_read_unlock();
 
                kfree(link);
        }
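The rcu_read_lock() added above covers exactly the window in which the cgroup may lose its last reference: once cgrp->count reaches zero a concurrent rmdir can destroy the cgroup, so the structure has to stay valid until check_for_release() has finished looking at it. A minimal sketch of the same idiom, not taken from the patch and with made-up names (obj, obj_notify), assuming the object is freed through call_rcu()/kfree_rcu():

	/*
	 * Drop a reference, then peek at the object.  The RCU read-side
	 * section keeps it valid even after the last reference is gone,
	 * because the freer defers kfree() past a grace period.
	 */
	rcu_read_lock();
	if (atomic_dec_and_test(&obj->count))
		obj_notify(obj);	/* obj is still valid here */
	rcu_read_unlock();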
@@ -1175,10 +1183,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
        /*
         * If the 'all' option was specified select all the subsystems,
-        * otherwise 'all, 'none' and a subsystem name options were not
-        * specified, let's default to 'all'
+        * otherwise if 'none', 'name=' and a subsystem name options
+        * were not specified, let's default to 'all'
         */
-       if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+       if (all_ss || (!one_ss && !opts->none && !opts->name)) {
                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss == NULL)
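With the corrected condition, the default-to-all branch is taken only when an explicit 'all' was given, or when no subsystem, no 'none' and no 'name=' option was supplied at all. The practical effect is that an existing named hierarchy can be mounted again by name alone, e.g. mount -t cgroup -o name=foo xxx /mnt, without the kernel silently trying to bind every subsystem to it, which previously made such a mount fail.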
@@ -1803,9 +1811,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
         * trading it for newcg is protected by cgroup_mutex, we're safe to drop
         * it here; it will be freed under RCU.
         */
-       put_css_set(oldcg);
-
        set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+       put_css_set(oldcg);
        return 0;
 }
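The reordering matters because put_css_set() can drop the last reference on the old css_set and end up running check_for_release() on oldcgrp; if CGRP_RELEASABLE were set only afterwards, that check could miss the flag and a notify_on_release notification could be lost. In short, state that the release path reads must be published before the put that may trigger it:

	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);	/* visible to check_for_release() */
	put_css_set(oldcg);				/* may drop the last ref and check */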
 
@@ -2022,12 +2029,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
        if (!group)
                return -ENOMEM;
        /* pre-allocate to guarantee space while iterating in rcu read-side. */
-       retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+       retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
        if (retval)
                goto out_free_group_list;
 
        /* prevent changes to the threadgroup list while we take a snapshot. */
-       rcu_read_lock();
+       read_lock(&tasklist_lock);
        if (!thread_group_leader(leader)) {
                /*
                 * a race with de_thread from another thread's exec() may strip
@@ -2036,7 +2043,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
                 * throw this task away and try again (from cgroup_procs_write);
                 * this is "double-double-toil-and-trouble-check locking".
                 */
-               rcu_read_unlock();
+               read_unlock(&tasklist_lock);
                retval = -EAGAIN;
                goto out_free_group_list;
        }
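Two fixes meet in this function: flex_array_prealloc() takes a start index and a number of elements, so backing slots 0..group_size-1 requires passing group_size (the old group_size - 1 left the last slot unbacked), and the thread-group snapshot is now taken under read_lock(&tasklist_lock) instead of rcu_read_lock(), which keeps the thread group stable for the walk rather than merely making the walk safe against freed memory. A simplified sketch of the allocation pattern, with error handling and refcounting trimmed, using the flex_array API of this kernel generation:

	struct flex_array *group;
	struct task_struct *tsk = leader;
	int i = 0, retval;

	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
				 GFP_KERNEL);
	if (!group)
		return -ENOMEM;
	/* back all group_size slots up front; later puts must not sleep */
	retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
	if (retval)
		goto out_free_group_list;

	read_lock(&tasklist_lock);	/* keep the thread group stable */
	do {
		/* copies the pointer value into slot i; no allocation here */
		flex_array_put(group, i++, &tsk, GFP_ATOMIC);
	} while_each_thread(leader, tsk);
	group_size = i;
	read_unlock(&tasklist_lock);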
@@ -2057,7 +2064,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
        } while_each_thread(leader, tsk);
        /* remember the number of threads in the array for later. */
        group_size = i;
-       rcu_read_unlock();
+       read_unlock(&tasklist_lock);
 
        /*
         * step 1: check that we can legitimately attach to the cgroup.
@@ -2098,11 +2105,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
                        continue;
                /* get old css_set pointer */
                task_lock(tsk);
-               if (tsk->flags & PF_EXITING) {
-                       /* ignore this task if it's going away */
-                       task_unlock(tsk);
-                       continue;
-               }
                oldcg = tsk->cgroups;
                get_css_set(oldcg);
                task_unlock(tsk);
@@ -2135,14 +2137,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
                oldcgrp = task_cgroup_from_root(tsk, root);
                if (cgrp == oldcgrp)
                        continue;
-               /* attach each task to each subsystem */
-               for_each_subsys(root, ss) {
-                       if (ss->attach_task)
-                               ss->attach_task(cgrp, tsk);
-               }
                /* if the thread is PF_EXITING, it can just get skipped. */
                retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
-               BUG_ON(retval != 0 && retval != -ESRCH);
+               if (retval == 0) {
+                       /* attach each task to each subsystem */
+                       for_each_subsys(root, ss) {
+                               if (ss->attach_task)
+                                       ss->attach_task(cgrp, tsk);
+                       }
+               } else {
+                       BUG_ON(retval != -ESRCH);
+               }
        }
        /* nothing is sensitive to fork() after this point. */
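Note the interaction with the previous hunk: the early PF_EXITING skip could be dropped because cgroup_task_migrate() itself detects an exiting task and returns -ESRCH, and with this reordering the per-subsystem ->attach_task() callbacks only run for tasks that were actually moved; any other failure still trips the BUG_ON().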
 
@@ -2639,9 +2644,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
                dentry->d_fsdata = cgrp;
                inc_nlink(parent->d_inode);
                rcu_assign_pointer(cgrp->dentry, dentry);
-               dget(dentry);
        }
-       dput(dentry);
 
        return error;
 }
@@ -2782,9 +2785,14 @@ static void cgroup_enable_task_cg_lists(void)
                 * We should check if the process is exiting, otherwise
                 * it will race with cgroup_exit() in that the list
                 * entry won't be deleted though the process has exited.
+                * Do it while holding siglock so that we don't end up
+                * racing against cgroup_exit().
                 */
+               spin_lock_irq(&p->sighand->siglock);
                if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
                        list_add(&p->cg_list, &p->cgroups->tasks);
+               spin_unlock_irq(&p->sighand->siglock);
+
                task_unlock(p);
        } while_each_thread(g, p);
        write_unlock(&css_set_lock);
@@ -3501,6 +3509,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                                      const char *buffer)
 {
        struct cgroup_event *event = NULL;
+       struct cgroup *cgrp_cfile;
        unsigned int efd, cfd;
        struct file *efile = NULL;
        struct file *cfile = NULL;
@@ -3556,6 +3565,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                goto fail;
        }
 
+       /*
+        * The file to be monitored must be in the same cgroup as
+        * cgroup.event_control is.
+        */
+       cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+       if (cgrp_cfile != cgrp) {
+               ret = -EINVAL;
+               goto fail;
+       }
+
        if (!event->cft->register_event || !event->cft->unregister_event) {
                ret = -EINVAL;
                goto fail;
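__d_cgrp() maps a cgroup directory dentry to its struct cgroup (stored in d_fsdata), so the new test compares the cgroup that owns the monitored file with the cgroup whose cgroup.event_control received the write, rejecting cross-cgroup registrations. For context, this interface is driven from userspace roughly as in the sketch below (a memcg usage-threshold event; the v1 mount point and group name are assumptions); after this change, a cfd opened in a different cgroup directory makes the write fail with EINVAL:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		const char *dir = "/sys/fs/cgroup/memory/mygrp"; /* assumed v1 mount */
		char path[256], cmd[64];
		int efd, cfd, ctl;

		efd = eventfd(0, 0);				/* notification fd */
		snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", dir);
		cfd = open(path, O_RDONLY);			/* file to monitor */
		snprintf(path, sizeof(path), "%s/cgroup.event_control", dir);
		ctl = open(path, O_WRONLY);
		if (efd < 0 || cfd < 0 || ctl < 0)
			return 1;
		/* format: "<event_fd> <control_fd> <args>", threshold = 128 MiB */
		snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, cfd, 128ULL << 20);
		if (write(ctl, cmd, strlen(cmd)) < 0)
			perror("cgroup.event_control");
		return 0;
	}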
@@ -3852,6 +3871,11 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
        struct cgroup *c_parent = dentry->d_parent->d_fsdata;
 
+       /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
+        */
+       if (strchr(dentry->d_name.name, '\n'))
+               return -EINVAL;
+
        /* the vfs holds inode->i_mutex already */
        return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
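The concern is the record format of /proc/<pid>/cgroup, which uses one line per hierarchy of the form hierarchy-ID:subsystem-list:path, for example (IDs and controllers illustrative):

	4:cpuacct,cpu:/mygrp
	1:memory:/

A cgroup directory whose name contains a newline would split its record across two lines, so such names are now rejected with -EINVAL at mkdir time.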
@@ -4014,11 +4038,11 @@ again:
        finish_wait(&cgroup_rmdir_waitq, &wait);
        clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
-       spin_lock(&release_list_lock);
+       raw_spin_lock(&release_list_lock);
        set_bit(CGRP_REMOVED, &cgrp->flags);
        if (!list_empty(&cgrp->release_list))
                list_del_init(&cgrp->release_list);
-       spin_unlock(&release_list_lock);
+       raw_spin_unlock(&release_list_lock);
 
        cgroup_lock_hierarchy(cgrp->root);
        /* delete this cgroup from parent->children */
@@ -4510,42 +4534,20 @@ void cgroup_fork(struct task_struct *child)
        INIT_LIST_HEAD(&child->cg_list);
 }
 
-/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
-       if (need_forkexit_callback) {
-               int i;
-               /*
-                * forkexit callbacks are only supported for builtin
-                * subsystems, and the builtin section of the subsys array is
-                * immutable, so we don't need to lock the subsys array here.
-                */
-               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-                       if (ss->fork)
-                               ss->fork(ss, child);
-               }
-       }
-}
-
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
  *
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks.  Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_iter_start() - to guarantee that the new task ends up on its
+ * list.
  */
 void cgroup_post_fork(struct task_struct *child)
 {
+       int i;
+
        if (use_task_css_set_links) {
                write_lock(&css_set_lock);
                task_lock(child);
@@ -4554,7 +4556,21 @@ void cgroup_post_fork(struct task_struct *child)
                task_unlock(child);
                write_unlock(&css_set_lock);
        }
+
+       /*
+        * Call ss->fork().  This must happen after @child is linked on
+        * css_set; otherwise, @child might change state between ->fork()
+        * and addition to css_set.
+        */
+       if (need_forkexit_callback) {
+               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+                       struct cgroup_subsys *ss = subsys[i];
+                       if (ss->fork)
+                               ss->fork(ss, child);
+               }
+       }
 }
+
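Running the fork callbacks from cgroup_post_fork() means ss->fork() now sees a child that is already on the task list and linked to its css_set, closing the window in which the task could change state between the callback and the css_set attachment. A hedged sketch of such a callback using this era's (ss, task) signature; the subsystem name, its state type and the counter are hypothetical:

	static void mysubsys_fork(struct cgroup_subsys *ss,
				  struct task_struct *task)
	{
		struct mysubsys_state *st;

		rcu_read_lock();
		/* the child is fully attached, so its css is meaningful here */
		st = container_of(task_subsys_state(task, mysubsys_subsys_id),
				  struct mysubsys_state, css);
		atomic_inc(&st->nr_forks);	/* made-up per-cgroup fork counter */
		rcu_read_unlock();
	}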
 /**
  * cgroup_exit - detach cgroup from exiting task
  * @tsk: pointer to task_struct of exiting process
@@ -4671,13 +4687,13 @@ static void check_for_release(struct cgroup *cgrp)
                 * already queued for a userspace notification, queue
                 * it now */
                int need_schedule_work = 0;
-               spin_lock(&release_list_lock);
+               raw_spin_lock(&release_list_lock);
                if (!cgroup_is_removed(cgrp) &&
                    list_empty(&cgrp->release_list)) {
                        list_add(&cgrp->release_list, &release_list);
                        need_schedule_work = 1;
                }
-               spin_unlock(&release_list_lock);
+               raw_spin_unlock(&release_list_lock);
                if (need_schedule_work)
                        schedule_work(&release_agent_work);
        }
@@ -4729,7 +4745,7 @@ static void cgroup_release_agent(struct work_struct *work)
 {
        BUG_ON(work != &release_agent_work);
        mutex_lock(&cgroup_mutex);
-       spin_lock(&release_list_lock);
+       raw_spin_lock(&release_list_lock);
        while (!list_empty(&release_list)) {
                char *argv[3], *envp[3];
                int i;
@@ -4738,7 +4754,7 @@ static void cgroup_release_agent(struct work_struct *work)
                                                    struct cgroup,
                                                    release_list);
                list_del_init(&cgrp->release_list);
-               spin_unlock(&release_list_lock);
+               raw_spin_unlock(&release_list_lock);
                pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
                if (!pathbuf)
                        goto continue_free;
@@ -4768,9 +4784,9 @@ static void cgroup_release_agent(struct work_struct *work)
  continue_free:
                kfree(pathbuf);
                kfree(agentbuf);
-               spin_lock(&release_list_lock);
+               raw_spin_lock(&release_list_lock);
        }
-       spin_unlock(&release_list_lock);
+       raw_spin_unlock(&release_list_lock);
        mutex_unlock(&cgroup_mutex);
 }
 
@@ -4880,9 +4896,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 
        rcu_assign_pointer(id->css, NULL);
        rcu_assign_pointer(css->id, NULL);
-       spin_lock(&ss->id_lock);
+       write_lock(&ss->id_lock);
        idr_remove(&ss->idr, id->id);
-       spin_unlock(&ss->id_lock);
+       write_unlock(&ss->id_lock);
        kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
@@ -4908,10 +4924,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
                error = -ENOMEM;
                goto err_out;
        }
-       spin_lock(&ss->id_lock);
+       write_lock(&ss->id_lock);
        /* Don't use 0. allocates an ID of 1-65535 */
        error = idr_get_new_above(&ss->idr, newid, 1, &myid);
-       spin_unlock(&ss->id_lock);
+       write_unlock(&ss->id_lock);
 
        /* Returns error when there are no free spaces for new ID.*/
        if (error) {
@@ -4926,9 +4942,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
        return newid;
 remove_idr:
        error = -ENOSPC;
-       spin_lock(&ss->id_lock);
+       write_lock(&ss->id_lock);
        idr_remove(&ss->idr, myid);
-       spin_unlock(&ss->id_lock);
+       write_unlock(&ss->id_lock);
 err_out:
        kfree(newid);
        return ERR_PTR(error);
@@ -4940,7 +4956,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 {
        struct css_id *newid;
 
-       spin_lock_init(&ss->id_lock);
+       rwlock_init(&ss->id_lock);
        idr_init(&ss->idr);
 
        newid = get_new_cssid(ss, 0);
@@ -5035,9 +5051,9 @@ css_get_next(struct cgroup_subsys *ss, int id,
                 * scan next entry from bitmap(tree), tmpid is updated after
                 * idr_get_next().
                 */
-               spin_lock(&ss->id_lock);
+               read_lock(&ss->id_lock);
                tmp = idr_get_next(&ss->idr, &tmpid);
-               spin_unlock(&ss->id_lock);
+               read_unlock(&ss->id_lock);
 
                if (!tmp)
                        break;
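The id_lock conversion splits its users into writers, get_new_cssid() and free_css_id(), which modify the idr and stay exclusive under write_lock(), and readers such as this css_get_next() scan, which only calls idr_get_next() and can now run concurrently under read_lock(); the scan is the frequent path (it backs hierarchical css iteration, used e.g. by memcg), while ID allocation and removal are comparatively rare.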