include/linux/cgroup.h

   1 #ifndef _LINUX_CGROUP_H
   2 #define _LINUX_CGROUP_H
   3 /*
   4  *  cgroup interface
   5  *
   6  *  Copyright (C) 2003 BULL SA
   7  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
   8  *
   9  */
  10
  11 #include <linux/sched.h>
  12 #include <linux/cpumask.h>
  13 #include <linux/nodemask.h>
  14 #include <linux/rcupdate.h>
  15 #include <linux/cgroupstats.h>
  16 #include <linux/prio_heap.h>
  17 #include <linux/rwsem.h>
  18 #include <linux/idr.h>
  19
  20 #ifdef CONFIG_CGROUPS
  21 /* Forward declarations so pointers to these types can be used below. */
  22 struct cgroupfs_root;
  23 struct cgroup_subsys;   /* defined later in this header */
  24 struct inode;
  25 struct cgroup;          /* defined later in this header */
  26 struct css_id;
  27 /* Core cgroup entry points; stubs for some exist when !CONFIG_CGROUPS. */
  28 extern int cgroup_init_early(void);
  29 extern int cgroup_init(void);
  30 extern void cgroup_lock(void);
  31 extern int cgroup_lock_is_held(void);   /* see task_subsys_state() */
  32 extern bool cgroup_lock_live_group(struct cgroup *cgrp);
  33 extern void cgroup_unlock(void);
  34 extern void cgroup_fork(struct task_struct *p);
  35 extern void cgroup_fork_callbacks(struct task_struct *p);
  36 extern void cgroup_post_fork(struct task_struct *p);
  37 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
  38 extern int cgroupstats_build(struct cgroupstats *stats,
  39                                 struct dentry *dentry);
  40
  41 extern const struct file_operations proc_cgroup_operations;
  42
  43 /* Enumerate the builtin subsystems: each SUBSYS(x) becomes x_subsys_id. */
  44 #define SUBSYS(_x) _x ## _subsys_id,
  45 enum cgroup_subsys_id {
  46 #include <linux/cgroup_subsys.h>
  47         CGROUP_BUILTIN_SUBSYS_COUNT     /* last enumerator: counts the above */
  48 };
  49 #undef SUBSYS
  50 /*
  51  * Maximum number of subsystems that can be loaded at once: the number of
  52  * bits in an unsigned long. We limit to this many since cgroupfs_root
  53  * has subsys_bits to keep track of all of them.
  54  */
  55 #define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long))
  56
  57 /* Per-subsystem/per-cgroup state maintained by the system. */
  58 struct cgroup_subsys_state {
  59         /*
  60          * The cgroup that this subsystem state is attached to. Useful
  61          * for subsystems that want to know about the cgroup
  62          * hierarchy structure.
  63          */
  64         struct cgroup *cgroup;
  65
  66         /*
  67          * Reference count maintained by the cgroup system to allow
  68          * subsystems to be "busy". Should be accessed via css_get(),
  69          * css_tryget() and css_put(), which all skip the root css.
  70          */
  71
  72         atomic_t refcnt;
  73         /* CSS_ROOT / CSS_REMOVED bits, read with test_bit() */
  74         unsigned long flags;
  75         /* ID for this css, if possible (see cgroup_subsys.use_id) */
  76         struct css_id *id;
  77 };
  78
  79 /* Bit numbers (not masks) in struct cgroup_subsys_state flags field */
  80 enum {
  81         CSS_ROOT, /* This CSS is the root of the subsystem */
  82         CSS_REMOVED, /* This CSS is dead (see css_is_removed()) */
  83 };
  84
  85 /* Add @count refs to @css. Caller must verify css is not for root cgroup */
  86 static inline void __css_get(struct cgroup_subsys_state *css, int count)
  87 {
  88         atomic_add(count, &css->refcnt);
  89 }
  90
  91 /*
  92  * css_get() takes one reference on @css. It may be used on a css
  93  * that was reached through:
  94  * - an existing ref-counted reference to the css
  95  * - task->cgroups for a locked task
  96  */
  97 static inline void css_get(struct cgroup_subsys_state *css)
  98 {
  99         /* The root state is never reference counted */
 100         if (test_bit(CSS_ROOT, &css->flags))
 101                 return;
 102         __css_get(css, 1);
 103 }
 104 /* Return true if @css has been marked dead (CSS_REMOVED is set). */
 105 static inline bool css_is_removed(struct cgroup_subsys_state *css)
 106 {
 107         return test_bit(CSS_REMOVED, &css->flags);
 108 }
 109
 110 /*
 111  * css_tryget() takes a reference on a css whose existing (known-valid)
 112  * pointer isn't ref-counted. Returns false if the css was destroyed;
 113  * spins while the count is zero but CSS_REMOVED is not yet set.
 114  */
 115 static inline bool css_tryget(struct cgroup_subsys_state *css)
 116 {
 117         if (test_bit(CSS_ROOT, &css->flags))
 118                 return true;
 119         for (;;) {
 120                 if (atomic_inc_not_zero(&css->refcnt))
 121                         return true;
 122                 if (test_bit(CSS_REMOVED, &css->flags))
 123                         return false;
 124                 cpu_relax();
 125         }
 126 }
 127
 128 /*
 129  * css_put() drops one reference previously taken by css_get() or
 130  * css_tryget(). The root css is not reference counted and is skipped.
 131  */
 132 extern void __css_put(struct cgroup_subsys_state *css, int count);
 133 static inline void css_put(struct cgroup_subsys_state *css)
 134 {
 135         if (test_bit(CSS_ROOT, &css->flags))
 136                 return;
 137         __css_put(css, 1);
 138 }
 139
 140 /* Bit numbers in the struct cgroup flags field (used with bitops) */
 141 enum {
 142         /* Control Group is dead */
 143         CGRP_REMOVED,
 144         /*
 145          * Control Group has previously had a child cgroup or a task,
 146          * but no longer does (only if CGRP_NOTIFY_ON_RELEASE is set)
 147          */
 148         CGRP_RELEASABLE,
 149         /* Control Group requires release notifications to userspace */
 150         CGRP_NOTIFY_ON_RELEASE,
 151         /*
 152          * A thread in rmdir() is waiting for this cgroup.
 153          */
 154         CGRP_WAIT_ON_RMDIR,
 155 };
 156
 157 /* Which pidlist file ("procs" or "tasks") are we talking about? */
 158 enum cgroup_filetype {
 159         CGROUP_FILE_PROCS,      /* the "procs" file */
 160         CGROUP_FILE_TASKS,      /* the "tasks" file */
 161 };
 162
 163 /*
 164  * A pidlist is a list of pids that virtually represents the contents of one
 165  * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 166  * a pair (one each for procs, tasks) for each pid namespace that's relevant
 167  * to the cgroup.
 168  */
 169 struct cgroup_pidlist {
 170         /*
 171          * Lookup key (file type, pid namespace) used to find which pidlist
 172          * is wanted. Doesn't change as long as this pidlist stays in the list.
 173          */
 174         struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
 175         /* array of ids (pid_t values) */
 176         pid_t *list;
 177         /* how many elements the above array has */
 178         int length;
 179         /* how many files are using the current array */
 180         int use_count;
 181         /* list entry: each pidlist is stored in a list by its cgroup */
 182         struct list_head links;
 183         /* pointer to the cgroup we belong to, for list removal purposes */
 184         struct cgroup *owner;
 185         /* protects the other fields (note: an rw_semaphore, not a mutex) */
 186         struct rw_semaphore mutex;
 187 };
 188 /* A control group: one node (parent/children links) in a hierarchy. */
 189 struct cgroup {
 190         unsigned long flags;            /* CGRP_* bits; long so bitops work */
 191
 192         /*
 193          * Count of users of this cgroup. >0 means busy, but doesn't
 194          * necessarily indicate the number of tasks in the cgroup.
 195          */
 196         atomic_t count;
 197
 198         /*
 199          * We link our 'sibling' struct into our parent's 'children'.
 200          * Our children link their 'sibling' into our 'children'.
 201          */
 202         struct list_head sibling;       /* my parent's children */
 203         struct list_head children;      /* my children */
 204
 205         struct cgroup *parent;          /* my parent */
 206         struct dentry *dentry;          /* cgroup fs entry, RCU protected */
 207
 208         /* Private pointers for each registered subsystem */
 209         struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 210
 211         struct cgroupfs_root *root;     /* presumably our hierarchy; confirm */
 212         struct cgroup *top_cgroup;      /* presumably its top cgroup; confirm */
 213
 214         /*
 215          * List of cg_cgroup_links pointing at css_sets with
 216          * tasks in this cgroup. Protected by css_set_lock.
 217          */
 218         struct list_head css_sets;
 219
 220         /*
 221          * Linked list running through all cgroups that can
 222          * potentially be reaped by the release agent. Protected by
 223          * release_list_lock.
 224          */
 225         struct list_head release_list;
 226
 227         /*
 228          * List of pidlists, up to two for each namespace (one for procs, one
 229          * for tasks); created on demand.
 230          */
 231         struct list_head pidlists;
 232         struct mutex pidlist_mutex;     /* presumably guards pidlists; confirm */
 233
 234         /* For RCU-protected deletion */
 235         struct rcu_head rcu_head;
 236 };
 237
 238 /*
 239  * A css_set is a structure holding pointers to a set of
 240  * cgroup_subsys_state objects. This saves space in the task struct
 241  * object and speeds up fork()/exit(), since a single inc/dec and a
 242  * list_add()/del() can bump the reference count on the entire cgroup
 243  * set for a task.
 244  */
 245
 246 struct css_set {
 247
 248         /* Reference count */
 249         atomic_t refcount;
 250
 251         /*
 252          * List running through all css_sets in the same hash
 253          * slot. Protected by css_set_lock.
 254          */
 255         struct hlist_node hlist;
 256
 257         /*
 258          * List running through all tasks using this css_set.
 259          * Protected by css_set_lock.
 260          */
 261         struct list_head tasks;
 262
 263         /*
 264          * List of cg_cgroup_link objects on link chains from
 265          * cgroups referenced from this css_set. Protected by
 266          * css_set_lock.
 267          */
 268         struct list_head cg_links;
 269
 270         /*
 271          * Set of subsystem states, one for each subsystem. This array
 272          * is immutable after creation apart from the init_css_set
 273          * during subsystem registration (at boot time).
 274          */
 275         struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 276
 277         /* For RCU-protected deletion */
 278         struct rcu_head rcu_head;
 279 };
 280
 281 /*
 282  * cgroup_map_cb is an abstract callback API for reporting map-valued
 283  * control files: a read_map() handler calls cb->fill() once per entry.
 284  */
 285
 286 struct cgroup_map_cb {
 287         int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
 288         void *state;    /* opaque; reachable from fill() through @cb */
 289 };
 290
 291 /*
 292  * struct cftype: handler definitions for cgroup control files
 293  *
 294  * When reading/writing to a file:
 295  *      - the cgroup to use is file->f_dentry->d_parent->d_fsdata
 296  *      - the 'cftype' of the file is file->f_dentry->d_fsdata
 297  */
 298
 299 #define MAX_CFTYPE_NAME 64
 300 struct cftype {
 301         /*
 302          * By convention, the name should begin with the name of the
 303          * subsystem, followed by a period.
 304          */
 305         char name[MAX_CFTYPE_NAME];
 306         int private;    /* handler-defined value; see trigger() below */
 307         /*
 308          * If not 0, file mode is set to this value, otherwise it will
 309          * be figured out automatically.
 310          */
 311         mode_t mode;
 312
 313         /*
 314          * If non-zero, defines the maximum length of string that can
 315          * be passed to write_string; defaults to 64.
 316          */
 317         size_t max_write_len;
 318         /* Generic open/read hooks; typed read_*() shortcuts follow. */
 319         int (*open)(struct inode *inode, struct file *file);
 320         ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 321                         struct file *file,
 322                         char __user *buf, size_t nbytes, loff_t *ppos);
 323         /*
 324          * read_u64() is a shortcut for the common case of returning a
 325          * single integer. Use it in place of read().
 326          */
 327         u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft);
 328         /*
 329          * read_s64() is a signed version of read_u64().
 330          */
 331         s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft);
 332         /*
 333          * read_map() is used for defining a map of key/value
 334          * pairs. It should call cb->fill(cb, key, value) for each
 335          * entry. The key/value pairs (and their ordering) should not
 336          * change between reboots.
 337          */
 338         int (*read_map)(struct cgroup *cont, struct cftype *cft,
 339                         struct cgroup_map_cb *cb);
 340         /*
 341          * read_seq_string() is used for outputting a simple sequence
 342          * using seqfile.
 343          */
 344         int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,
 345                                struct seq_file *m);
 346
 347         ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
 348                          struct file *file,
 349                          const char __user *buf, size_t nbytes, loff_t *ppos);
 350
 351         /*
 352          * write_u64() is a shortcut for the common case of accepting
 353          * a single integer (as parsed by simple_strtoull) from
 354          * userspace. Use in place of write(); return 0 or error.
 355          */
 356         int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val);
 357         /*
 358          * write_s64() is a signed version of write_u64().
 359          */
 360         int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val);
 361
 362         /*
 363          * write_string() is passed a nul-terminated kernelspace
 364          * buffer of maximum length determined by max_write_len.
 365          * Returns 0 or a negative error code.
 366          */
 367         int (*write_string)(struct cgroup *cgrp, struct cftype *cft,
 368                             const char *buffer);
 369         /*
 370          * The trigger() callback can be used to get some kick from
 371          * userspace, when the actual string written is not important
 372          * at all. The private field can be used to determine the
 373          * kick type for multiplexing.
 374          */
 375         int (*trigger)(struct cgroup *cgrp, unsigned int event);
 376
 377         int (*release)(struct inode *inode, struct file *file);
 378 };
 379 /* Argument block for cgroup_scan_tasks(); both callbacks receive it. */
 380 struct cgroup_scanner {
 381         struct cgroup *cg;      /* presumably the cgroup to scan; confirm */
 382         int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
 383         void (*process_task)(struct task_struct *p,
 384                         struct cgroup_scanner *scan);
 385         struct ptr_heap *heap;  /* NOTE(review): usage not visible here */
 386         void *data;             /* opaque; callbacks can reach it via @scan */
 387 };
 388
 389 /*
 390  * Add a new file, described by @cft, to the given cgroup directory.
 391  * Should only be called by subsystems from within a populate() method.
 392  */
 393 int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 394                        const struct cftype *cft);
 395
 396 /*
 397  * Add a set of new files (the @cft array) to the given cgroup directory.
 398  * Should only be called by subsystems from within a populate() method.
 399  */
 400 int cgroup_add_files(struct cgroup *cgrp,
 401                         struct cgroup_subsys *subsys,
 402                         const struct cftype cft[],
 403                         int count);     /* presumably entries in cft[] */
 404 /* NOTE(review): presumably tests CGRP_REMOVED on @cgrp; confirm */
 405 int cgroup_is_removed(const struct cgroup *cgrp);
 406 /* NOTE(review): presumably writes @cgrp's path into @buf; confirm */
 407 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
 408 /* NOTE(review): presumably the number of tasks in @cgrp; confirm */
 409 int cgroup_task_count(const struct cgroup *cgrp);
 410
 411 /* Return true if cgrp is a descendant of the task's cgroup */
 412 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
 413
 414 /*
 415  * When a subsys has to access a css and may add a permanent refcnt to it,
 416  * it should take care of races with rmdir(). The following pair of
 417  * functions stops and restarts rmdir if necessary.
 418  * Because these will call css_get/put, "css" must be an alive css.
 419  *
 420  *  cgroup_exclude_rmdir();
 421  *  ...do some jobs which may access arbitrary empty cgroup
 422  *  cgroup_release_and_wakeup_rmdir();
 423  *
 424  *  When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
 425  *  the remover sleeps until cgroup_release_and_wakeup_rmdir() wakes it up.
 426  */
 427
 428 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
 429 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
 430
 431 /*
 432  * Control Group subsystem type: one subsystem's callbacks and bookkeeping.
 433  * See Documentation/cgroups/cgroups.txt for details.
 434  */
 435
 436 struct cgroup_subsys {
 437         struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
 438                                                   struct cgroup *cgrp);
 439         int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 440         void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 441         int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 442                           struct task_struct *tsk, bool threadgroup);
 443         void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 444                           struct task_struct *tsk, bool threadgroup);
 445         void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 446                         struct cgroup *old_cgrp, struct task_struct *tsk,
 447                         bool threadgroup);
 448         void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
 449         void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
 450         int (*populate)(struct cgroup_subsys *ss,
 451                         struct cgroup *cgrp);
 452         void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 453         void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
 454         /* populate() above is where cgroup_add_file(s)() may be called */
 455         int subsys_id;  /* presumably an enum cgroup_subsys_id; confirm */
 456         int active;
 457         int disabled;
 458         int early_init;
 459         /*
 460          * True if this subsys uses css IDs. IDs are not available before
 461          * cgroup_init(), i.e. not at early_init time.
 462          */
 463         bool use_id;
 464 #define MAX_CGROUP_TYPE_NAMELEN 32
 465         const char *name;
 466
 467         /*
 468          * Protects sibling/children links of cgroups in this
 469          * hierarchy, plus protects which hierarchy (or none) the
 470          * subsystem is a part of (i.e. root/sibling).  To avoid
 471          * potential deadlocks, the following operations should not be
 472          * undertaken while holding any hierarchy_mutex:
 473          *
 474          * - allocating memory
 475          * - initiating hotplug events
 476          */
 477         struct mutex hierarchy_mutex;
 478         struct lock_class_key subsys_key;
 479
 480         /*
 481          * Link to parent, and list entry in parent's children.
 482          * Protected by this->hierarchy_mutex and cgroup_lock().
 483          */
 484         struct cgroupfs_root *root;
 485         struct list_head sibling;
 486         /* idr and its lock; used only when use_id == true */
 487         struct idr idr;
 488         spinlock_t id_lock;
 489 };
 490 /* Declare "extern struct cgroup_subsys x_subsys;" for every SUBSYS(x). */
 491 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
 492 #include <linux/cgroup_subsys.h>
 493 #undef SUBSYS
 494 /* Return @cgrp's state pointer for @subsys_id (index is not range-checked). */
 495 static inline struct cgroup_subsys_state *cgroup_subsys_state(
 496         struct cgroup *cgrp, int subsys_id)
 497 {
 498         return cgrp->subsys[subsys_id];
 499 }
 500 /* Return @task's state for @subsys_id; RCU read lock or cgroup_lock held. */
 501 static inline struct cgroup_subsys_state *task_subsys_state(
 502         struct task_struct *task, int subsys_id)
 503 {
 504         return rcu_dereference_check(task->cgroups->subsys[subsys_id],
 505                                      rcu_read_lock_held() ||
 506                                      cgroup_lock_is_held());
 507 }
 508 /* Return the cgroup that @task's state for @subsys_id is attached to. */
 509 static inline struct cgroup* task_cgroup(struct task_struct *task,
 510                                                int subsys_id)
 511 {
 512         return task_subsys_state(task, subsys_id)->cgroup;
 513 }
 514 /* NOTE(review): semantics are not visible in this header; confirm in .c */
 515 int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss,
 516                                                         char *nodename);
 517
 518 /* A cgroup_iter should be treated as an opaque object by its users */
 519 struct cgroup_iter {
 520         struct list_head *cg_link;      /* internal iteration state */
 521         struct list_head *task;         /* internal iteration state */
 522 };
 523
 524 /*
 525  * To iterate across the tasks in a cgroup:
 526  *
 527  * 1) call cgroup_iter_start() to initialize an iterator
 528  *
 529  * 2) call cgroup_iter_next() to retrieve member tasks until it
 530  *    returns NULL or until you want to end the iteration
 531  *
 532  * 3) call cgroup_iter_end() to destroy the iterator.
 533  *
 534  * Or, call cgroup_scan_tasks() to iterate through every task in a
 535  * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling
 536  * the test_task() callback, but not while calling the process_task()
 537  * callback.
 538  */
 539 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it);
 540 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 541                                         struct cgroup_iter *it);
 542 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 543 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 544 int cgroup_attach_task(struct cgroup *, struct task_struct *);
 545
 546 /*
 547  * A CSS ID is an ID for cgroup_subsys_state structs under a subsys. It only
 548  * works if cgroup_subsys.use_id == true. It can be used for lookup and scans.
 549  * A CSS ID is assigned automatically at cgroup allocation (create)
 550  * and removed when the subsys calls free_css_id(). This is because
 551  * the lifetime of a cgroup_subsys_state is the subsys's matter.
 552  *
 553  * Lookup and scanning functions should be called under rcu_read_lock().
 554  * Taking cgroup_mutex/hierarchy_mutex is not necessary for these calls.
 555  * But the css returned by this routine can be "not populated yet" or "being
 556  * destroyed". The caller should check the css's and cgroup's status.
 557  */
 558
 559 /*
 560  * Typically called at ->destroy(), or wherever the subsys frees its
 561  * cgroup_subsys_state.
 562  */
 563 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
 564
 565 /* Find the cgroup_subsys_state of subsystem @ss which has the given ID */
 566
 567 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
 568
 569 /*
 570  * Get a css whose ID is greater than or equal to @id under the tree of
 571  * @root. Returns a cgroup_subsys_state or NULL.
 572  */
 573 struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
 574                 struct cgroup_subsys_state *root, int *foundid);
 575
 576 /* Returns true if @root is an ancestor of @cg */
 577 bool css_is_ancestor(struct cgroup_subsys_state *cg,
 578                      const struct cgroup_subsys_state *root);
 579
 580 /* Get the ID and the depth of @css, respectively */
 581 unsigned short css_id(struct cgroup_subsys_state *css);
 582 unsigned short css_depth(struct cgroup_subsys_state *css);
 583
 584 #else /* !CONFIG_CGROUPS */
 585 /* Cgroups compiled out: init reports success, task hooks do nothing. */
 586 static inline int cgroup_init_early(void) { return 0; }
 587 static inline int cgroup_init(void) { return 0; }
 588 static inline void cgroup_fork(struct task_struct *p) {}
 589 static inline void cgroup_fork_callbacks(struct task_struct *p) {}
 590 static inline void cgroup_post_fork(struct task_struct *p) {}
 591 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 592 /* Locking is a no-op; a stats request always fails with -EINVAL. */
 593 static inline void cgroup_lock(void) {}
 594 static inline void cgroup_unlock(void) {}
 595 static inline int cgroupstats_build(struct cgroupstats *stats,
 596                                         struct dentry *dentry)
 597 {
 598         return -EINVAL;
 599 }
 600
 601 #endif /* !CONFIG_CGROUPS */
 602
 603 #endif /* _LINUX_CGROUP_H */