* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
+ * is called after synchronize_rcu(). But for safe use, css_tryget()
+ * should be used for avoiding race.
*/
struct cgroup_subsys_state __rcu *css;
/*
*/
static int need_forkexit_callback __read_mostly;
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+
#ifdef CONFIG_PROVE_LOCKING
int cgroup_lock_is_held(void)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-static int clone_children(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
return inode;
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- int ret = 0;
-
- for_each_subsys(cgrp->root, ss) {
- if (!ss->pre_destroy)
- continue;
-
- ret = ss->pre_destroy(cgrp);
- if (ret) {
- /* ->pre_destroy() failure is being deprecated */
- WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
- break;
- }
- }
-
- return ret;
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
* Release the subsystem state objects.
*/
for_each_subsys(cgrp->root, ss)
- ss->destroy(cgrp);
+ ss->css_free(cgrp);
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
remove_dir(dentry);
}
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
seq_puts(seq, ",xattr");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
- if (clone_children(&root->top_cgroup))
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
unsigned long subsys_mask;
unsigned long flags;
char *release_agent;
- bool clone_children;
+ bool cpuset_clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
continue;
}
if (!strcmp(token, "clone_children")) {
- opts->clone_children = true;
+ opts->cpuset_clone_children = true;
continue;
}
if (!strcmp(token, "xattr")) {
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->files);
INIT_LIST_HEAD(&cgrp->css_sets);
+ INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
- if (opts->clone_children)
- set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
+ if (opts->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
return root;
}
free_cg_links(&tmp_cg_links);
- BUG_ON(!list_empty(&root_cgrp->sibling));
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
BUG_ON(root->number_of_cgroups != 1);
BUG_ON(!list_empty(&cgrp->children));
- BUG_ON(!list_empty(&cgrp->sibling));
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
+ struct dentry *dentry = cgrp->dentry;
char *start;
- struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
+
+ rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
+ "cgroup_path() called without proper locking");
if (!dentry || cgrp == dummytop) {
/*
return 0;
}
- start = buf + buflen;
+ start = buf + buflen - 1;
- *--start = '\0';
+ *start = '\0';
for (;;) {
int len = dentry->d_name.len;
if (!cgrp)
break;
- dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
+ dentry = cgrp->dentry;
if (!cgrp->parent)
continue;
if (--start < buf)
* trading it for newcg is protected by cgroup_mutex, we're safe to drop
* it here; it will be freed under RCU.
*/
- put_css_set(oldcg);
-
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ put_css_set(oldcg);
}
/**
}
synchronize_rcu();
-
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
out:
if (retval) {
for_each_subsys(root, ss) {
* step 5: success! and cleanup
*/
synchronize_rcu();
- cgroup_wakeup_rmdir_waiter(cgrp);
retval = 0;
out_put_css_set_refs:
if (retval) {
/* start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
+ inc_nlink(dentry->d_parent->d_inode);
- /* start with the directory inode held, so that we can
- * populate it without racing with another mkdir */
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ /*
+ * Control reaches here with cgroup_mutex held.
+ * @inode->i_mutex should nest outside cgroup_mutex but we
+ * want to populate it immediately without releasing
+ * cgroup_mutex. As @inode isn't visible to anyone else
+ * yet, trylock will always succeed without affecting
+ * lockdep checks.
+ */
+ WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
} else if (S_ISREG(mode)) {
inode->i_size = 0;
inode->i_fop = &cgroup_file_operations;
return 0;
}
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- * ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- umode_t mode)
-{
- struct dentry *parent;
- int error = 0;
-
- parent = cgrp->parent->dentry;
- error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
- if (!error) {
- dentry->d_fsdata = cgrp;
- inc_nlink(parent->d_inode);
- rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
- }
- dput(dentry);
-
- return error;
-}
-
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
write_unlock(&css_set_lock);
}
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre(). Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ *
+ * Warns once if rcu_read_lock_held() is false on entry. @cgroup itself
+ * is never returned: the walk starts at its first child, and %NULL is
+ * returned either immediately when @cgroup has no children or once the
+ * climb back towards the root of the walk reaches @cgroup.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+ struct cgroup *cgroup)
+{
+ struct cgroup *next;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* if first iteration, pretend we just visited @cgroup */
+ if (!pos) {
+ if (list_empty(&cgroup->children))
+ return NULL;
+ pos = cgroup;
+ }
+
+ /* visit the first child if one exists */
+ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+ if (next)
+ return next;
+
+ /*
+ * no child, visit my or the closest ancestor's next sibling; a
+ * ->sibling.next that leads back to the parent's ->children list
+ * head means there is no further sibling at this level, so move
+ * up one level and try again
+ */
+ do {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup,
+ sibling);
+ if (&next->sibling != &pos->parent->children)
+ return next;
+
+ pos = pos->parent;
+ } while (pos != cgroup);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last; /* deepest cgroup reached so far */
+
+ do { /* keep following first-child links until there is none */
+ last = pos;
+ pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+ sibling);
+ } while (pos);
+
+ return last; /* the original @pos when it has no children */
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post(). Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ *
+ * Warns once if rcu_read_lock_held() is false on entry. @cgroup itself
+ * is never returned: %NULL is returned on the first call when @cgroup has
+ * no children, and later when the next node to visit would be @cgroup.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+ struct cgroup *cgroup)
+{
+ struct cgroup *next;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /*
+ * if first iteration, visit the leftmost descendant; the helper
+ * hands back @cgroup itself when there are no children at all
+ */
+ if (!pos) {
+ next = cgroup_leftmost_descendant(cgroup);
+ return next != cgroup ? next : NULL;
+ }
+
+ /*
+ * if there's an unvisited sibling, visit its leftmost descendant;
+ * ->sibling.next leading back to the parent's ->children list head
+ * means @pos was the last sibling
+ */
+ next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+ if (&next->sibling != &pos->parent->children)
+ return cgroup_leftmost_descendant(next);
+
+ /* no sibling left, visit parent unless that is @cgroup itself */
+ next = pos->parent;
+ return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
__acquires(css_set_lock)
{
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
struct cftype *cft)
{
- return clone_children(cgrp);
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
}
static int cgroup_clone_children_write(struct cgroup *cgrp,
u64 val)
{
if (val)
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
else
- clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
return 0;
}
css->flags = 0;
css->id = NULL;
if (cgrp == dummytop)
- set_bit(CSS_ROOT, &css->flags);
+ css->flags |= CSS_ROOT;
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;
/*
- * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
- * which is put on the last css_put(). dput() requires process
- * context, which css_put() may be called without. @css->dput_work
- * will be used to invoke dput() asynchronously from css_put().
+ * css holds an extra ref to @cgrp->dentry which is put on the last
+ * css_put(). dput() requires process context, which css_put() may
+ * be called without. @css->dput_work will be used to invoke
+ * dput() asynchronously from css_put().
*/
INIT_WORK(&css->dput_work, css_dput_fn);
- if (ss->__DEPRECATED_clear_css_refs)
- set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
+}
+
+/*
+ * Invoke ->css_online(), if the subsystem provides one, on a new CSS and
+ * set CSS_ONLINE on it when the callback is absent or returns 0. Returns
+ * the callback's return value, or 0 when there is no callback.
+ * cgroup_mutex must be held by the caller.
+ */
+static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (ss->css_online)
+ ret = ss->css_online(cgrp);
+ if (!ret)
+ cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+ return ret;
+}
+
+/*
+ * If the CSS is online, invoke ->css_offline() on it (when the subsystem
+ * provides one) and clear CSS_ONLINE; do nothing if it is not online.
+ * Entered and left with cgroup_mutex held, but the mutex is dropped
+ * around the callback.
+ */
+static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!(css->flags & CSS_ONLINE))
+ return;
+
+ /*
+ * css_offline() should be called with cgroup_mutex unlocked. See
+ * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
+ * details. This temporary unlocking should go away once
+ * cgroup_mutex is unexported from controllers.
+ */
+ if (ss->css_offline) {
+ mutex_unlock(&cgroup_mutex);
+ ss->css_offline(cgrp);
+ mutex_lock(&cgroup_mutex);
+ }
+
+ cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
}
/*
if (!cgrp)
return -ENOMEM;
+ /*
+ * Only live parents can have children. Note that the liveliness
+ * check isn't strictly necessary because cgroup_mkdir() and
+ * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+ * anyway so that locking is contained inside cgroup proper and we
+ * don't get nasty surprises if we ever grow another caller.
+ */
+ if (!cgroup_lock_live_group(parent)) {
+ err = -ENODEV;
+ goto err_free_cgrp;
+ }
+
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* fs */
atomic_inc(&sb->s_active);
- mutex_lock(&cgroup_mutex);
-
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- if (clone_children(parent))
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css;
- css = ss->create(cgrp);
+ css = ss->css_alloc(cgrp);
if (IS_ERR(css)) {
err = PTR_ERR(css);
- goto err_destroy;
+ goto err_free_all;
}
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
- goto err_destroy;
+ goto err_free_all;
}
- /* At error, ->destroy() callback has to free assigned ID. */
- if (clone_children(parent) && ss->post_clone)
+ /* At error, ->css_free() callback has to free assigned ID. */
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags) &&
+ ss->post_clone)
ss->post_clone(cgrp);
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
}
}
- list_add(&cgrp->sibling, &cgrp->parent->children);
- root->number_of_cgroups++;
-
- err = cgroup_create_dir(cgrp, dentry, mode);
+ /*
+ * Create directory. cgroup_create_file() returns with the new
+ * directory locked on success so that it can be populated without
+ * dropping cgroup_mutex.
+ */
+ err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
if (err < 0)
- goto err_remove;
+ goto err_free_all;
+ lockdep_assert_held(&dentry->d_inode->i_mutex);
- /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
- for_each_subsys(root, ss)
- if (!ss->__DEPRECATED_clear_css_refs)
- dget(dentry);
+ /* allocation complete, commit to creation */
+ dentry->d_fsdata = cgrp;
+ cgrp->dentry = dentry;
+ list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
+ root->number_of_cgroups++;
- /* The cgroup directory was pre-locked for us */
- BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
+ /* each css holds a ref to the cgroup's dentry */
+ for_each_subsys(root, ss)
+ dget(dentry);
- list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+ /* creation succeeded, notify subsystems */
+ for_each_subsys(root, ss) {
+ err = online_css(ss, cgrp);
+ if (err)
+ goto err_destroy;
+ }
err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
- /* If err < 0, we have a half-filled directory - oh well ;) */
+ if (err)
+ goto err_destroy;
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
- err_remove:
-
- list_del(&cgrp->sibling);
- root->number_of_cgroups--;
-
- err_destroy:
-
+err_free_all:
for_each_subsys(root, ss) {
if (cgrp->subsys[ss->subsys_id])
- ss->destroy(cgrp);
+ ss->css_free(cgrp);
}
-
mutex_unlock(&cgroup_mutex);
-
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
-
+err_free_cgrp:
kfree(cgrp);
return err;
+
+err_destroy:
+ cgroup_destroy_locked(cgrp);
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ return err;
}
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return 0;
}
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct dentry *d = cgrp->dentry;
+ struct cgroup *parent = cgrp->parent;
+ DEFINE_WAIT(wait);
+ struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
- local_irq_save(flags);
+ lockdep_assert_held(&d->d_inode->i_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+ return -EBUSY;
/*
- * Block new css_tryget() by deactivating refcnt. If all refcnts
- * for subsystems w/ clear_css_refs set were 1 at the moment of
- * deactivation, we succeeded.
+ * Block new css_tryget() by deactivating refcnt and mark @cgrp
+ * removed. This makes future css_tryget() and child creation
+ * attempts fail thus maintaining the removal conditions verified
+ * above.
*/
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
WARN_ON(atomic_read(&css->refcnt) < 0);
atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
- if (ss->__DEPRECATED_clear_css_refs)
- failed |= css_refcnt(css) != 1;
}
+ set_bit(CGRP_REMOVED, &cgrp->flags);
- /*
- * If succeeded, set REMOVED and put all the base refs; otherwise,
- * restore refcnts to positive values. Either way, all in-progress
- * css_tryget() will be released.
- */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
- if (!failed) {
- set_bit(CSS_REMOVED, &css->flags);
- css_put(css);
- } else {
- atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
- }
- }
-
- local_irq_restore(flags);
- return !failed;
-}
-
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-{
- struct cgroup *cgrp = dentry->d_fsdata;
- struct dentry *d;
- struct cgroup *parent;
- DEFINE_WAIT(wait);
- struct cgroup_event *event, *tmp;
- int ret;
-
- /* the vfs holds both inode->i_mutex already */
-again:
- mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- mutex_unlock(&cgroup_mutex);
-
- /*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
- */
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ /* tell subsystems to initiate destruction */
+ for_each_subsys(cgrp->root, ss)
+ offline_css(ss, cgrp);
/*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
+ * Put all the base refs. Each css holds an extra reference to the
+ * cgroup's dentry and cgroup removal proceeds regardless of css
+ * refs. On the last put of each css, whenever that may be, the
+ * extra dentry ref is put so that dentry destruction happens only
+ * after all css's are released.
*/
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
- }
-
- mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
- }
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ for_each_subsys(cgrp->root, ss)
+ css_put(cgrp->subsys[ss->subsys_id]);
raw_spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/* delete this cgroup from parent->children */
- list_del_init(&cgrp->sibling);
-
+ list_del_rcu(&cgrp->sibling);
list_del_init(&cgrp->allcg_node);
- d = dget(cgrp->dentry);
-
+ dget(d);
cgroup_d_remove_dir(d);
dput(d);
}
spin_unlock(&cgrp->event_list_lock);
- mutex_unlock(&cgroup_mutex);
return 0;
}
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex); /* cgroup_destroy_locked() asserts it is held */
+ ret = cgroup_destroy_locked(dentry->d_fsdata);
+ mutex_unlock(&cgroup_mutex);
+
+ return ret; /* result of cgroup_destroy_locked(), e.g. -EBUSY */
+}
+
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
INIT_LIST_HEAD(&ss->cftsets);
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ mutex_lock(&cgroup_mutex);
+
/* init base cftset */
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
- css = ss->create(dummytop);
+ css = ss->css_alloc(dummytop);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_cgroup_css(css, ss, dummytop);
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+ init_css_set.subsys[ss->subsys_id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
BUG_ON(!list_empty(&init_task.tasks));
ss->active = 1;
+ BUG_ON(online_css(ss, dummytop));
+
+ mutex_unlock(&cgroup_mutex);
/* this function shouldn't be used with modular subsystems, since they
* need to register a subsys_id, among other things */
*/
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
- int i;
struct cgroup_subsys_state *css;
+ int i, ret;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
- ss->create == NULL || ss->destroy == NULL)
+ ss->css_alloc == NULL || ss->css_free == NULL)
return -EINVAL;
/*
subsys[ss->subsys_id] = ss;
/*
- * no ss->create seems to need anything important in the ss struct, so
- * this can happen first (i.e. before the rootnode attachment).
+ * no ss->css_alloc seems to need anything important in the ss
+ * struct, so this can happen first (i.e. before the rootnode
+ * attachment).
*/
- css = ss->create(dummytop);
+ css = ss->css_alloc(dummytop);
if (IS_ERR(css)) {
/* failure case - need to deassign the subsys[] slot. */
subsys[ss->subsys_id] = NULL;
init_cgroup_css(css, ss, dummytop);
/* init_idr must be after init_cgroup_css because it sets css->id. */
if (ss->use_id) {
- int ret = cgroup_init_idr(ss, css);
- if (ret) {
- dummytop->subsys[ss->subsys_id] = NULL;
- ss->destroy(dummytop);
- subsys[ss->subsys_id] = NULL;
- mutex_unlock(&cgroup_mutex);
- return ret;
- }
+ ret = cgroup_init_idr(ss, css);
+ if (ret)
+ goto err_unload;
}
/*
write_unlock(&css_set_lock);
ss->active = 1;
+ ret = online_css(ss, dummytop);
+ if (ret)
+ goto err_unload;
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
+
+err_unload:
+ mutex_unlock(&cgroup_mutex);
+ /* @ss can't be mounted here as try_module_get() would fail */
+ cgroup_unload_subsys(ss);
+ return ret;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
BUG_ON(ss->root != &rootnode);
mutex_lock(&cgroup_mutex);
+
+ offline_css(ss, dummytop);
+ ss->active = 0;
+
+ if (ss->use_id) {
+ idr_remove_all(&ss->idr);
+ idr_destroy(&ss->idr);
+ }
+
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
struct css_set *cg = link->cg;
hlist_del(&cg->hlist);
- BUG_ON(!cg->subsys[ss->subsys_id]);
cg->subsys[ss->subsys_id] = NULL;
hhead = css_set_hash(cg->subsys);
hlist_add_head(&cg->hlist, hhead);
write_unlock(&css_set_lock);
/*
- * remove subsystem's css from the dummytop and free it - need to free
- * before marking as null because ss->destroy needs the cgrp->subsys
- * pointer to find their state. note that this also takes care of
- * freeing the css_id.
+ * remove subsystem's css from the dummytop and free it - need to
+ * free before marking as null because ss->css_free needs the
+ * cgrp->subsys pointer to find their state. note that this also
+ * takes care of freeing the css_id.
*/
- ss->destroy(dummytop);
+ ss->css_free(dummytop);
dummytop->subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
BUG_ON(!ss->name);
BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->create);
- BUG_ON(!ss->destroy);
+ BUG_ON(!ss->css_alloc);
+ BUG_ON(!ss->css_free);
if (ss->subsys_id != i) {
printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
ss->name, ss->subsys_id);
*
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU, cgroup_mutex or
- * threadgroup_change_begin(), so it might no longer be a valid
- * cgroup pointer. cgroup_attach_task() might have already changed
- * current->cgroups, allowing the previously referenced cgroup
- * group to be removed and freed.
- *
- * Outside the pointer validity we also need to process the css_set
- * inheritance between threadgoup_change_begin() and
- * threadgoup_change_end(), this way there is no leak in any process
- * wide migration performed by cgroup_attach_proc() that could otherwise
- * miss a thread because it is too early or too late in the fork stage.
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * it might no longer be a valid cgroup pointer. cgroup_attach_task()
+ * might have already changed current->cgroups, allowing the previously
+ * referenced cgroup group to be removed and freed.
*
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
void cgroup_fork(struct task_struct *child)
{
- /*
- * We don't need to task_lock() current because current->cgroups
- * can't be changed concurrently here. The parent obviously hasn't
- * exited and called cgroup_exit(), and we are synchronized against
- * cgroup migration through threadgroup_change_begin().
- */
+ task_lock(current);
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
+ task_unlock(current);
INIT_LIST_HEAD(&child->cg_list);
}
-/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- /*
- * forkexit callbacks are only supported for
- * builtin subsystems.
- */
- if (!ss || ss->module)
- continue;
-
- if (ss->fork)
- ss->fork(child);
- }
- }
-}
-
/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ int i;
+
/*
* use_task_css_set_links is set to 1 before we walk the tasklist
* under the tasklist_lock and we read it here after we added the child
*/
if (use_task_css_set_links) {
write_lock(&css_set_lock);
- if (list_empty(&child->cg_list)) {
+ task_lock(child);
+ if (list_empty(&child->cg_list))
+ list_add(&child->cg_list, &child->cgroups->tasks);
+ task_unlock(child);
+ write_unlock(&css_set_lock);
+ }
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+
/*
- * It's safe to use child->cgroups without task_lock()
- * here because we are protected through
- * threadgroup_change_begin() against concurrent
- * css_set change in cgroup_task_migrate(). Also
- * the task can't exit at that point until
- * wake_up_new_task() is called, so we are protected
- * against cgroup_exit() setting child->cgroup to
- * init_css_set.
+ * fork/exit callbacks are supported only for
+ * builtin subsystems and we don't need further
+ * synchronization as they never go away.
*/
- list_add(&child->cg_list, &child->cgroups->tasks);
+ if (!ss || ss->module)
+ continue;
+
+ if (ss->fork)
+ ss->fork(child);
}
- write_unlock(&css_set_lock);
}
}
+
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
/* Caller must verify that the css is not for root cgroup */
bool __css_tryget(struct cgroup_subsys_state *css)
{
- do {
- int v = css_refcnt(css);
+ while (true) {
+ int t, v;
- if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+ v = css_refcnt(css);
+ t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+ if (likely(t == v))
return true;
+ else if (t < 0)
+ return false;
cpu_relax();
- } while (!test_bit(CSS_REMOVED, &css->flags));
-
- return false;
+ }
}
EXPORT_SYMBOL_GPL(__css_tryget);
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiter(cgrp);
break;
case 0:
- if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
- schedule_work(&css->dput_work);
+ schedule_work(&css->dput_work);
break;
}
rcu_read_unlock();
}
#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
return css;
}
-static void debug_destroy(struct cgroup *cont)
+static void debug_css_free(struct cgroup *cont)
{
kfree(cont->subsys[debug_subsys_id]);
}
struct cgroup_subsys debug_subsys = {
.name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
.subsys_id = debug_subsys_id,
.base_cftypes = debug_files,
};