1 #ifndef _LINUX_CGROUP_H
2 #define _LINUX_CGROUP_H
6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
11 #include <linux/sched.h>
12 #include <linux/cpumask.h>
13 #include <linux/nodemask.h>
14 #include <linux/rcupdate.h>
15 #include <linux/cgroupstats.h>
16 #include <linux/prio_heap.h>
17 #include <linux/rwsem.h>
18 #include <linux/idr.h>
/* Core cgroup entry points (CONFIG_CGROUPS=y). */
extern int cgroup_init_early(void);
extern int cgroup_init(void);
/* Acquire/release the global cgroup mutex; _is_held() backs lockdep checks. */
extern void cgroup_lock(void);
extern int cgroup_lock_is_held(void);
/* Take the cgroup lock only if @cgrp is still live; returns true on success. */
extern bool cgroup_lock_live_group(struct cgroup *cgrp);
extern void cgroup_unlock(void);
/* Hooks invoked by the core kernel on task fork/exit. */
extern void cgroup_fork(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
/* Fill @stats for the cgroup directory identified by @dentry. */
extern int cgroupstats_build(struct cgroupstats *stats,
			     struct dentry *dentry);
/* Register/unregister a modular subsystem at runtime. */
extern int cgroup_load_subsys(struct cgroup_subsys *ss);
extern void cgroup_unload_subsys(struct cgroup_subsys *ss);

/* File operations for /proc/<pid>/cgroup. */
extern const struct file_operations proc_cgroup_operations;
/* Define the enumeration of all builtin cgroup subsystems */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
/* cgroup_subsys.h expands SUBSYS(x) once per builtin subsystem */
#include <linux/cgroup_subsys.h>
	CGROUP_BUILTIN_SUBSYS_COUNT

/*
 * This define indicates the maximum number of subsystems that can be loaded
 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep
 * track of all of them.
 */
#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long))
/* Per-subsystem/per-cgroup state maintained by the system. */
struct cgroup_subsys_state {
	/*
	 * The cgroup that this subsystem is attached to. Useful
	 * for subsystems that want to know about the cgroup
	 * hierarchy structure.
	 */
	struct cgroup *cgroup;

	/*
	 * State maintained by the cgroup system to allow subsystems
	 * to be "busy". Should be accessed via css_get(),
	 * css_tryget() and css_put().
	 */

	/* ID for this css, if possible */
	struct css_id __rcu *id;

/* bits in struct cgroup_subsys_state flags field */
	CSS_ROOT, /* This CSS is the root of the subsystem */
	CSS_REMOVED, /* This CSS is dead */
/* Caller must verify that the css is not for root cgroup */
static inline void __css_get(struct cgroup_subsys_state *css, int count)
	/* unconditional refcount bump; root-cgroup check is the caller's job */
	atomic_add(count, &css->refcnt);

/*
 * Call css_get() to hold a reference on the css; it can be used
 * for a reference obtained via:
 * - an existing ref-counted reference to the css
 * - task->cgroups for a locked task
 */
static inline void css_get(struct cgroup_subsys_state *css)
	/* We don't need to reference count the root state */
	if (!test_bit(CSS_ROOT, &css->flags))

/* True once the css has been marked dead (CSS_REMOVED set). */
static inline bool css_is_removed(struct cgroup_subsys_state *css)
	return test_bit(CSS_REMOVED, &css->flags);

/*
 * Call css_tryget() to take a reference on a css if your existing
 * (known-valid) reference isn't already ref-counted. Returns false if
 * the css has been destroyed.
 */
static inline bool css_tryget(struct cgroup_subsys_state *css)
	/* root css is never refcounted, so a "reference" always succeeds */
	if (test_bit(CSS_ROOT, &css->flags))
	/* spin until the refcount is nonzero (live) or the css is removed */
	while (!atomic_inc_not_zero(&css->refcnt)) {
		if (test_bit(CSS_REMOVED, &css->flags))

/*
 * css_put() should be called to release a reference taken by
 * css_get() or css_tryget()
 */
extern void __css_put(struct cgroup_subsys_state *css, int count);
static inline void css_put(struct cgroup_subsys_state *css)
	/* mirror of css_get(): root css carries no refcount to drop */
	if (!test_bit(CSS_ROOT, &css->flags))
/* bits in struct cgroup flags field */
	/* Control Group is dead */
	/*
	 * Control Group has previously had a child cgroup or a task,
	 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
	 */
	/* Control Group requires release notifications to userspace */
	CGRP_NOTIFY_ON_RELEASE,
	/*
	 * A thread in rmdir() is waiting for this cgroup.
	 */
	/*
	 * Clone cgroup values when creating a new child cgroup
	 */

/* which pidlist file are we talking about? */
enum cgroup_filetype {
/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted. doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* how many elements the above list has */
	/* how many files are using the current array */
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* protects the other fields */
	struct rw_semaphore mutex;
	/* fields of struct cgroup */
	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * count users of this cgroup. >0 means busy, but doesn't
	 * necessarily indicate the number of tasks in the cgroup
	 */

	/*
	 * We link our 'sibling' struct into our parent's 'children'.
	 * Our children link their 'sibling' into our 'children'.
	 */
	struct list_head sibling;	/* my parent's children */
	struct list_head children;	/* my children */

	struct cgroup *parent;		/* my parent */
	struct dentry __rcu *dentry;	/* cgroup fs entry, RCU protected */

	/* Private pointers for each registered subsystem */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

	/* hierarchy this cgroup belongs to, and its topmost cgroup */
	struct cgroupfs_root *root;
	struct cgroup *top_cgroup;

	/*
	 * List of cg_cgroup_links pointing at css_sets with
	 * tasks in this cgroup. Protected by css_set_lock
	 */
	struct list_head css_sets;

	/*
	 * Linked list running through all cgroups that can
	 * potentially be reaped by the release agent. Protected by
	 * release_list_lock
	 */
	struct list_head release_list;

	/*
	 * list of pidlists, up to two for each namespace (one for procs, one
	 * for tasks); created on demand.
	 */
	struct list_head pidlists;
	struct mutex pidlist_mutex;

	/* For RCU-protected deletion */
	struct rcu_head rcu_head;

	/* List of events which userspace want to receive */
	struct list_head event_list;
	spinlock_t event_list_lock;
/*
 * A css_set is a structure holding pointers to a set of
 * cgroup_subsys_state objects. This saves space in the task struct
 * object and speeds up fork()/exit(), since a single inc/dec and a
 * list_add()/del() can bump the reference count on the entire cgroup
 * set for a task.
 */

	/* Reference count */

	/*
	 * List running through all cgroup groups in the same hash
	 * slot. Protected by css_set_lock
	 */
	struct hlist_node hlist;

	/*
	 * List running through all tasks using this cgroup
	 * group. Protected by css_set_lock
	 */
	struct list_head tasks;

	/*
	 * List of cg_cgroup_link objects on link chains from
	 * cgroups referenced from this css_set. Protected by
	 * css_set_lock
	 */
	struct list_head cg_links;

	/*
	 * Set of subsystem states, one for each subsystem. This array
	 * is immutable after creation apart from the init_css_set
	 * during subsystem registration (at boot time) and modular subsystem
	 * loading and unloading.
	 */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

	/* For RCU-protected deletion */
	struct rcu_head rcu_head;
/*
 * cgroup_map_cb is an abstract callback API for reporting map-valued
 * control files
 */
struct cgroup_map_cb {
	/* emit one key/value pair into the map output; returns 0 or error */
	int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
/*
 * struct cftype: handler definitions for cgroup control files
 *
 * When reading/writing to a file:
 *	- the cgroup to use is file->f_dentry->d_parent->d_fsdata
 *	- the 'cftype' of the file is file->f_dentry->d_fsdata
 */

#define MAX_CFTYPE_NAME 64
	/*
	 * By convention, the name should begin with the name of the
	 * subsystem, followed by a period
	 */
	char name[MAX_CFTYPE_NAME];
	/*
	 * If not 0, file mode is set to this value, otherwise it will
	 * be figured out automatically
	 */
	/*
	 * If non-zero, defines the maximum length of string that can
	 * be passed to write_string; defaults to 64
	 */
	size_t max_write_len;

	int (*open)(struct inode *inode, struct file *file);
	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
			char __user *buf, size_t nbytes, loff_t *ppos);
	/*
	 * read_u64() is a shortcut for the common case of returning a
	 * single integer. Use it in place of read()
	 */
	u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft);
	/*
	 * read_s64() is a signed version of read_u64()
	 */
	s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft);
	/*
	 * read_map() is used for defining a map of key/value
	 * pairs. It should call cb->fill(cb, key, value) for each
	 * entry. The key/value pairs (and their ordering) should not
	 * change between reboots.
	 */
	int (*read_map)(struct cgroup *cont, struct cftype *cft,
			struct cgroup_map_cb *cb);
	/*
	 * read_seq_string() is used for outputting a simple sequence
	 * of strings.
	 */
	int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,

	ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
			 const char __user *buf, size_t nbytes, loff_t *ppos);
	/*
	 * write_u64() is a shortcut for the common case of accepting
	 * a single integer (as parsed by simple_strtoull) from
	 * userspace. Use in place of write(); return 0 or error.
	 */
	int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val);
	/*
	 * write_s64() is a signed version of write_u64()
	 */
	int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val);
	/*
	 * write_string() is passed a nul-terminated kernelspace
	 * buffer of maximum length determined by max_write_len.
	 * Returns 0 or -ve error code.
	 */
	int (*write_string)(struct cgroup *cgrp, struct cftype *cft,
	/*
	 * trigger() callback can be used to get some kick from the
	 * userspace, when the actual string written is not important
	 * at all. The private field can be used to determine the
	 * kick type for multiplexing.
	 */
	int (*trigger)(struct cgroup *cgrp, unsigned int event);

	int (*release)(struct inode *inode, struct file *file);

	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to the cftype. Implement it if
	 * you want to provide this functionality. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct cgroup *cgrp, struct cftype *cft,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace
	 * closes the eventfd or on cgroup removing.
	 * This callback must be implemented, if you want provide
	 * notification functionality.
	 */
	void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft,
				 struct eventfd_ctx *eventfd);
/* Context passed to cgroup_scan_tasks(); see the iteration comment below. */
struct cgroup_scanner {
	/* optional filter; called under css_set_lock by cgroup_scan_tasks() */
	int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
	/* called (without css_set_lock) for each task that passes the filter */
	void (*process_task)(struct task_struct *p,
			     struct cgroup_scanner *scan);
	struct ptr_heap *heap;
/*
 * Add a new file to the given cgroup directory. Should only be
 * called by subsystems from within a populate() method
 */
int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
		    const struct cftype *cft);

/*
 * Add a set of new files to the given cgroup directory. Should
 * only be called by subsystems from within a populate() method
 */
int cgroup_add_files(struct cgroup *cgrp,
		     struct cgroup_subsys *subsys,
		     const struct cftype cft[],

/* Nonzero if @cgrp has been deleted. */
int cgroup_is_removed(const struct cgroup *cgrp);

/* Write the filesystem path of @cgrp into @buf (at most @buflen bytes). */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);

/* Number of tasks currently attached to @cgrp. */
int cgroup_task_count(const struct cgroup *cgrp);

/* Return true if cgrp is a descendant of the task's cgroup */
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);

/*
 * When the subsys has to access css and may add permanent refcnt to css,
 * it should take care of racy conditions with rmdir(). Following set of
 * functions, is for stop/restart rmdir if necessary.
 * Because these will call css_get/put, "css" should be alive css.
 *
 *  cgroup_exclude_rmdir();
 *    ...do some jobs which may access arbitrary empty cgroup
 *  cgroup_release_and_wakeup_rmdir();
 *
 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
 */
void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
/*
 * Control Group subsystem type.
 * See Documentation/cgroups/cgroups.txt for details
 */

struct cgroup_subsys {
	/* css lifetime: allocate / veto-teardown / free for a cgroup */
	struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
					      struct cgroup *cgrp);
	int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
	void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
	/*
	 * task-attach path: the int-returning can_attach* callbacks may
	 * reject the move; cancel_attach undoes a rejected attempt.
	 */
	int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
			  struct task_struct *tsk);
	int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
	void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct task_struct *tsk);
	void (*pre_attach)(struct cgroup *cgrp);
	void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
	void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
		       struct cgroup *old_cgrp, struct task_struct *tsk);
	/* task lifetime notifications */
	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
		     struct cgroup *old_cgrp, struct task_struct *task);
	/* create this subsystem's control files in @cgrp's directory */
	int (*populate)(struct cgroup_subsys *ss,
			struct cgroup *cgrp);
	void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
	void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);

	/*
	 * True if this subsys uses ID. ID is not available before cgroup_init()
	 * (not available in early_init time.)
	 */

#define MAX_CGROUP_TYPE_NAMELEN 32

	/*
	 * Protects sibling/children links of cgroups in this
	 * hierarchy, plus protects which hierarchy (or none) the
	 * subsystem is a part of (i.e. root/sibling). To avoid
	 * potential deadlocks, the following operations should not be
	 * undertaken while holding any hierarchy_mutex:
	 *
	 * - allocating memory
	 * - initiating hotplug events
	 */
	struct mutex hierarchy_mutex;
	struct lock_class_key subsys_key;

	/*
	 * Link to parent, and list entry in parent's children.
	 * Protected by this->hierarchy_mutex and cgroup_lock()
	 */
	struct cgroupfs_root *root;
	struct list_head sibling;
	/* used when use_id == true */

	/* should be defined only by modular subsystems */
	struct module *module;
/* Declare one "extern struct cgroup_subsys <name>_subsys" per subsystem. */
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
#include <linux/cgroup_subsys.h>

/* Return the css installed in @cgrp for the subsystem @subsys_id. */
static inline struct cgroup_subsys_state *cgroup_subsys_state(
	struct cgroup *cgrp, int subsys_id)
	return cgrp->subsys[subsys_id];
/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * A task's css_set is RCU protected, initialized and exited while holding
 * task_lock(), and can only be modified while holding both cgroup_mutex
 * and task_lock() while the task is alive. This macro verifies that the
 * caller is inside proper critical section and returns @task's css_set.
 *
 * The caller can also specify additional allowed conditions via @__c, such
 * as locks used during the cgroup_subsys::attach() methods.
 */
#define task_css_set_check(task, __c) \
	rcu_dereference_check((task)->cgroups, \
			      lockdep_is_held(&(task)->alloc_lock) || \
			      cgroup_lock_is_held() || (__c))

/**
 * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
 * @task: the target task
 * @subsys_id: the target subsystem ID
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
 * synchronization rules are the same as task_css_set_check().
 */
#define task_subsys_state_check(task, subsys_id, __c) \
	task_css_set_check((task), (__c))->subsys[(subsys_id)]
/**
 * task_css_set - obtain a task's css_set
 * @task: the task to obtain css_set for
 *
 * See task_css_set_check().
 */
static inline struct css_set *task_css_set(struct task_struct *task)
	return task_css_set_check(task, false);

/**
 * task_subsys_state - obtain css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * See task_subsys_state_check().
 */
static inline struct cgroup_subsys_state *
task_subsys_state(struct task_struct *task, int subsys_id)
	return task_subsys_state_check(task, subsys_id, false);

/* Return the cgroup @task belongs to in the hierarchy of @subsys_id. */
static inline struct cgroup* task_cgroup(struct task_struct *task,
	return task_subsys_state(task, subsys_id)->cgroup;
/* A cgroup_iter should be treated as an opaque object */
	struct list_head *cg_link;	/* current position in cgrp->css_sets */
	struct list_head *task;		/* current position in the task list */

/*
 * To iterate across the tasks in a cgroup:
 *
 * 1) call cgroup_iter_start to initialize an iterator
 *
 * 2) call cgroup_iter_next() to retrieve member tasks until it
 *    returns NULL or until you want to end the iteration
 *
 * 3) call cgroup_iter_end() to destroy the iterator.
 *
 * Or, call cgroup_scan_tasks() to iterate through every task in a
 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling
 * the test_task() callback, but not while calling the process_task()
 * callback.
 */
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it);
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
				     struct cgroup_iter *it);
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);

/* Attach @tsk to the same cgroups as the calling task, in every hierarchy. */
static inline int cgroup_attach_task_current_cg(struct task_struct *tsk)
	return cgroup_attach_task_all(current, tsk);
/*
 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
 * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
 * CSS ID is assigned at cgroup allocation (create) automatically
 * and removed when subsys calls free_css_id() function. This is because
 * the lifetime of cgroup_subsys_state is subsys's matter.
 *
 * Looking up and scanning function should be called under rcu_read_lock().
 * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls.
 * But the css returned by this routine can be "not populated yet" or "being
 * destroyed". The caller should check css and cgroup's status.
 */

/*
 * Typically Called at ->destroy(), or somewhere the subsys frees
 * cgroup_subsys_state.
 */
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);

/* Find a cgroup_subsys_state which has given ID */
struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);

/*
 * Get a cgroup whose id is greater than or equal to id under tree of root.
 * Returning a cgroup_subsys_state or NULL.
 */
struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
		struct cgroup_subsys_state *root, int *foundid);

/* Returns true if root is ancestor of cg */
bool css_is_ancestor(struct cgroup_subsys_state *cg,
		     const struct cgroup_subsys_state *root);

/* Get id and depth of css */
unsigned short css_id(struct cgroup_subsys_state *css);
unsigned short css_depth(struct cgroup_subsys_state *css);
/* Resolve the css of subsystem @id from an open cgroupfs directory @f. */
struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
#else /* !CONFIG_CGROUPS */

/* With cgroups compiled out, all hooks collapse to no-ops. */
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}

static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
static inline int cgroupstats_build(struct cgroupstats *stats,
				    struct dentry *dentry)

/* No cgroups - nothing to do */
static inline int cgroup_attach_task_all(struct task_struct *from,
					 struct task_struct *t)
static inline int cgroup_attach_task_current_cg(struct task_struct *t)

#endif /* !CONFIG_CGROUPS */
697 #endif /* _LINUX_CGROUP_H */