memcg: improve performance in moving charge
[pandora-kernel.git] / mm / memcontrol.c
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  */
19
20 #include <linux/res_counter.h>
21 #include <linux/memcontrol.h>
22 #include <linux/cgroup.h>
23 #include <linux/mm.h>
24 #include <linux/hugetlb.h>
25 #include <linux/pagemap.h>
26 #include <linux/smp.h>
27 #include <linux/page-flags.h>
28 #include <linux/backing-dev.h>
29 #include <linux/bit_spinlock.h>
30 #include <linux/rcupdate.h>
31 #include <linux/limits.h>
32 #include <linux/mutex.h>
33 #include <linux/rbtree.h>
34 #include <linux/slab.h>
35 #include <linux/swap.h>
36 #include <linux/spinlock.h>
37 #include <linux/fs.h>
38 #include <linux/seq_file.h>
39 #include <linux/vmalloc.h>
40 #include <linux/mm_inline.h>
41 #include <linux/page_cgroup.h>
42 #include <linux/cpu.h>
43 #include "internal.h"
44
45 #include <asm/uaccess.h>
46
47 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
48 #define MEM_CGROUP_RECLAIM_RETRIES      5
49 struct mem_cgroup *root_mem_cgroup __read_mostly;
50
51 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
52 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
53 int do_swap_account __read_mostly;
54 static int really_do_swap_account __initdata = 1; /* for remembering the boot option */
55 #else
56 #define do_swap_account         (0)
57 #endif
58
59 #define SOFTLIMIT_EVENTS_THRESH (1000)
60
61 /*
62  * Statistics for memory cgroup.
63  */
64 enum mem_cgroup_stat_index {
65         /*
66          * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
67          */
68         MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
69         MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
70         MEM_CGROUP_STAT_FILE_MAPPED,  /* # of mapped file pages */
71         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
72         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
73         MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
74         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
75
76         MEM_CGROUP_STAT_NSTATS,
77 };
78
79 struct mem_cgroup_stat_cpu {
80         s64 count[MEM_CGROUP_STAT_NSTATS];
81 } ____cacheline_aligned_in_smp;
82
83 struct mem_cgroup_stat {
84         struct mem_cgroup_stat_cpu cpustat[0];
85 };
86
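/*
 * Note (added for clarity, not in the original file): cpustat[] is a
 * zero-length array; the real per-cpu slots are allocated together with
 * struct mem_cgroup itself, one slot per possible cpu, which is why the
 * "stat" member must stay the last field of struct mem_cgroup (see the
 * comment at that member below).
 */
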
87 static inline void
88 __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
89                                 enum mem_cgroup_stat_index idx)
90 {
91         stat->count[idx] = 0;
92 }
93
94 static inline s64
95 __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
96                                 enum mem_cgroup_stat_index idx)
97 {
98         return stat->count[idx];
99 }
100
101 /*
102  * For accounting with irqs disabled, there is no need to bump the preempt count.
103  */
104 static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
105                 enum mem_cgroup_stat_index idx, int val)
106 {
107         stat->count[idx] += val;
108 }
109
110 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
111                 enum mem_cgroup_stat_index idx)
112 {
113         int cpu;
114         s64 ret = 0;
115         for_each_possible_cpu(cpu)
116                 ret += stat->cpustat[cpu].count[idx];
117         return ret;
118 }
119
120 static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
121 {
122         s64 ret;
123
124         ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
125         ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
126         return ret;
127 }
128
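/*
 * Note (added for clarity, not in the original file): "local" usage is
 * cache + rss of this memcg only, not including children;
 * mem_cgroup_hierarchical_reclaim() uses it to skip victims whose own
 * usage is zero.
 */
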
129 /*
130  * per-zone information in memory controller.
131  */
132 struct mem_cgroup_per_zone {
133         /*
134          * spin_lock to protect the per cgroup LRU
135          */
136         struct list_head        lists[NR_LRU_LISTS];
137         unsigned long           count[NR_LRU_LISTS];
138
139         struct zone_reclaim_stat reclaim_stat;
140         struct rb_node          tree_node;      /* RB tree node */
141         unsigned long long      usage_in_excess;/* Set to the value by which */
142                                                 /* the soft limit is exceeded*/
143         bool                    on_tree;
144         struct mem_cgroup       *mem;           /* Back pointer, we cannot */
145                                                 /* use container_of        */
146 };
147 /* Macro for accessing counter */
148 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
149
150 struct mem_cgroup_per_node {
151         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
152 };
153
154 struct mem_cgroup_lru_info {
155         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
156 };
157
158 /*
159  * Cgroups above their limits are maintained in a RB-Tree, independent of
160  * their hierarchy representation
161  */
162
163 struct mem_cgroup_tree_per_zone {
164         struct rb_root rb_root;
165         spinlock_t lock;
166 };
167
168 struct mem_cgroup_tree_per_node {
169         struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
170 };
171
172 struct mem_cgroup_tree {
173         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
174 };
175
176 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
177
178 /*
179  * The memory controller data structure. The memory controller controls both
180  * page cache and RSS per cgroup. We would eventually like to provide
181  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
182  * to help the administrator determine what knobs to tune.
183  *
184  * TODO: Add a water mark for the memory controller. Reclaim will begin when
185  * we hit the water mark. Maybe even add a low water mark, such that
186  * no reclaim occurs from a cgroup at its low water mark; this is
187  * a feature that will be implemented much later in the future.
188  */
189 struct mem_cgroup {
190         struct cgroup_subsys_state css;
191         /*
192          * the counter to account for memory usage
193          */
194         struct res_counter res;
195         /*
196          * the counter to account for mem+swap usage.
197          */
198         struct res_counter memsw;
199         /*
200          * Per cgroup active and inactive list, similar to the
201          * per zone LRU lists.
202          */
203         struct mem_cgroup_lru_info info;
204
205         /*
206          * protects reclaim-related members.
207          */
208         spinlock_t reclaim_param_lock;
209
210         int     prev_priority;  /* for recording reclaim priority */
211
212         /*
213          * While reclaiming in a hierarchy, we cache the last child we
214          * reclaimed from.
215          */
216         int last_scanned_child;
217         /*
218          * Should the accounting and control be hierarchical, per subtree?
219          */
220         bool use_hierarchy;
221         unsigned long   last_oom_jiffies;
222         atomic_t        refcnt;
223
224         unsigned int    swappiness;
225
226         /* set when res.limit == memsw.limit */
227         bool            memsw_is_minimum;
228
229         /*
230          * Should we move charges of a task when a task is moved into this
231          * mem_cgroup? And what type of charges should we move?
232          */
233         unsigned long   move_charge_at_immigrate;
234
235         /*
236          * statistics. This must be placed at the end of memcg.
237          */
238         struct mem_cgroup_stat stat;
239 };
240
241 /* Stuff for moving charges at task migration. */
242 /*
243  * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
244  * left-shifted bitmap of these types.
245  */
246 enum move_type {
247         MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
248         NR_MOVE_TYPE,
249 };
250
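/*
 * Illustrative sketch (hypothetical helper, not in the original file):
 * "move_charge_at_immigrate" is a left-shifted bitmap of enum move_type,
 * so a test for whether anon pages should be moved looks like this.
 */
static inline bool memcg_move_anon_sketch(struct mem_cgroup *mem)
{
        return mem->move_charge_at_immigrate & (1 << MOVE_CHARGE_TYPE_ANON);
}
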
251 /* "mc" and its members are protected by cgroup_mutex */
252 static struct move_charge_struct {
253         struct mem_cgroup *from;
254         struct mem_cgroup *to;
255         unsigned long precharge;
256         unsigned long moved_charge;
257 } mc;
258
259 /*
260  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
261  * limit reclaim to prevent infinite loops, if they ever occur.
262  */
263 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            (100)
264 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
265
266 enum charge_type {
267         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
268         MEM_CGROUP_CHARGE_TYPE_MAPPED,
269         MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
270         MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
271         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
272         MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
273         NR_CHARGE_TYPE,
274 };
275
276 /* used only here (for easier reading) */
277 #define PCGF_CACHE      (1UL << PCG_CACHE)
278 #define PCGF_USED       (1UL << PCG_USED)
279 #define PCGF_LOCK       (1UL << PCG_LOCK)
280 /* Not used, but added here for completeness */
281 #define PCGF_ACCT       (1UL << PCG_ACCT)
282
283 /* for encoding cft->private value on file */
284 #define _MEM                    (0)
285 #define _MEMSWAP                (1)
286 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
287 #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
288 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
289
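/*
 * Illustrative sketch (hypothetical helper, not in the original file):
 * cft->private packs a counter type and a res_counter attribute into one
 * int, e.g. MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT); a handler recovers the
 * two halves as below.
 */
static inline void memfile_unpack_sketch(int private, int *type, int *attr)
{
        *type = MEMFILE_TYPE(private);  /* _MEM or _MEMSWAP */
        *attr = MEMFILE_ATTR(private);  /* e.g. RES_USAGE or RES_LIMIT */
}
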
290 /*
291  * Reclaim flags for mem_cgroup_hierarchical_reclaim
292  */
293 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT   0x0
294 #define MEM_CGROUP_RECLAIM_NOSWAP       (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
295 #define MEM_CGROUP_RECLAIM_SHRINK_BIT   0x1
296 #define MEM_CGROUP_RECLAIM_SHRINK       (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
297 #define MEM_CGROUP_RECLAIM_SOFT_BIT     0x2
298 #define MEM_CGROUP_RECLAIM_SOFT         (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
299
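/*
 * Illustrative note (added for clarity, not in the original file): the
 * reclaim_options argument of mem_cgroup_hierarchical_reclaim() is an OR of
 * the flags above, e.g. a caller wanting soft-limit reclaim without touching
 * swap would pass MEM_CGROUP_RECLAIM_SOFT | MEM_CGROUP_RECLAIM_NOSWAP.
 */
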
300 static void mem_cgroup_get(struct mem_cgroup *mem);
301 static void mem_cgroup_put(struct mem_cgroup *mem);
302 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
303 static void drain_all_stock_async(void);
304
305 static struct mem_cgroup_per_zone *
306 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
307 {
308         return &mem->info.nodeinfo[nid]->zoneinfo[zid];
309 }
310
311 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
312 {
313         return &mem->css;
314 }
315
316 static struct mem_cgroup_per_zone *
317 page_cgroup_zoneinfo(struct page_cgroup *pc)
318 {
319         struct mem_cgroup *mem = pc->mem_cgroup;
320         int nid = page_cgroup_nid(pc);
321         int zid = page_cgroup_zid(pc);
322
323         if (!mem)
324                 return NULL;
325
326         return mem_cgroup_zoneinfo(mem, nid, zid);
327 }
328
329 static struct mem_cgroup_tree_per_zone *
330 soft_limit_tree_node_zone(int nid, int zid)
331 {
332         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
333 }
334
335 static struct mem_cgroup_tree_per_zone *
336 soft_limit_tree_from_page(struct page *page)
337 {
338         int nid = page_to_nid(page);
339         int zid = page_zonenum(page);
340
341         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
342 }
343
344 static void
345 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
346                                 struct mem_cgroup_per_zone *mz,
347                                 struct mem_cgroup_tree_per_zone *mctz,
348                                 unsigned long long new_usage_in_excess)
349 {
350         struct rb_node **p = &mctz->rb_root.rb_node;
351         struct rb_node *parent = NULL;
352         struct mem_cgroup_per_zone *mz_node;
353
354         if (mz->on_tree)
355                 return;
356
357         mz->usage_in_excess = new_usage_in_excess;
358         if (!mz->usage_in_excess)
359                 return;
360         while (*p) {
361                 parent = *p;
362                 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
363                                         tree_node);
364                 if (mz->usage_in_excess < mz_node->usage_in_excess)
365                         p = &(*p)->rb_left;
366                 /*
367                  * We can't avoid mem cgroups that are over their soft
368                  * limit by the same amount
369                  */
370                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
371                         p = &(*p)->rb_right;
372         }
373         rb_link_node(&mz->tree_node, parent, p);
374         rb_insert_color(&mz->tree_node, &mctz->rb_root);
375         mz->on_tree = true;
376 }
377
378 static void
379 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
380                                 struct mem_cgroup_per_zone *mz,
381                                 struct mem_cgroup_tree_per_zone *mctz)
382 {
383         if (!mz->on_tree)
384                 return;
385         rb_erase(&mz->tree_node, &mctz->rb_root);
386         mz->on_tree = false;
387 }
388
389 static void
390 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
391                                 struct mem_cgroup_per_zone *mz,
392                                 struct mem_cgroup_tree_per_zone *mctz)
393 {
394         spin_lock(&mctz->lock);
395         __mem_cgroup_remove_exceeded(mem, mz, mctz);
396         spin_unlock(&mctz->lock);
397 }
398
399 static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
400 {
401         bool ret = false;
402         int cpu;
403         s64 val;
404         struct mem_cgroup_stat_cpu *cpustat;
405
406         cpu = get_cpu();
407         cpustat = &mem->stat.cpustat[cpu];
408         val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
409         if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
410                 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
411                 ret = true;
412         }
413         put_cpu();
414         return ret;
415 }
416
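/*
 * Note (added for clarity, not in the original file): MEM_CGROUP_STAT_EVENTS
 * is bumped on every charge/uncharge, so this check fires roughly once per
 * SOFTLIMIT_EVENTS_THRESH (1000) events on a cpu; callers such as
 * __mem_cgroup_try_charge() then refresh the soft-limit RB-tree via
 * mem_cgroup_update_tree().
 */
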
417 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
418 {
419         unsigned long long excess;
420         struct mem_cgroup_per_zone *mz;
421         struct mem_cgroup_tree_per_zone *mctz;
422         int nid = page_to_nid(page);
423         int zid = page_zonenum(page);
424         mctz = soft_limit_tree_from_page(page);
425
426         /*
427          * Necessary to update all ancestors when hierarchy is used,
428          * because their event counter is not touched.
429          */
430         for (; mem; mem = parent_mem_cgroup(mem)) {
431                 mz = mem_cgroup_zoneinfo(mem, nid, zid);
432                 excess = res_counter_soft_limit_excess(&mem->res);
433                 /*
434                  * We have to update the tree if mz is on RB-tree or
435                  * mem is over its softlimit.
436                  */
437                 if (excess || mz->on_tree) {
438                         spin_lock(&mctz->lock);
439                         /* if on-tree, remove it */
440                         if (mz->on_tree)
441                                 __mem_cgroup_remove_exceeded(mem, mz, mctz);
442                         /*
443                          * Insert again. mz->usage_in_excess will be updated.
444                          * If excess is 0, no tree ops.
445                          */
446                         __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
447                         spin_unlock(&mctz->lock);
448                 }
449         }
450 }
451
452 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
453 {
454         int node, zone;
455         struct mem_cgroup_per_zone *mz;
456         struct mem_cgroup_tree_per_zone *mctz;
457
458         for_each_node_state(node, N_POSSIBLE) {
459                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
460                         mz = mem_cgroup_zoneinfo(mem, node, zone);
461                         mctz = soft_limit_tree_node_zone(node, zone);
462                         mem_cgroup_remove_exceeded(mem, mz, mctz);
463                 }
464         }
465 }
466
467 static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
468 {
469         return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
470 }
471
472 static struct mem_cgroup_per_zone *
473 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
474 {
475         struct rb_node *rightmost = NULL;
476         struct mem_cgroup_per_zone *mz;
477
478 retry:
479         mz = NULL;
480         rightmost = rb_last(&mctz->rb_root);
481         if (!rightmost)
482                 goto done;              /* Nothing to reclaim from */
483
484         mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
485         /*
486          * Remove the node now but someone else can add it back,
487          * we will add it back at the end of reclaim to its correct
488          * position in the tree.
489          */
490         __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
491         if (!res_counter_soft_limit_excess(&mz->mem->res) ||
492                 !css_tryget(&mz->mem->css))
493                 goto retry;
494 done:
495         return mz;
496 }
497
498 static struct mem_cgroup_per_zone *
499 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
500 {
501         struct mem_cgroup_per_zone *mz;
502
503         spin_lock(&mctz->lock);
504         mz = __mem_cgroup_largest_soft_limit_node(mctz);
505         spin_unlock(&mctz->lock);
506         return mz;
507 }
508
509 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
510                                          bool charge)
511 {
512         int val = (charge) ? 1 : -1;
513         struct mem_cgroup_stat *stat = &mem->stat;
514         struct mem_cgroup_stat_cpu *cpustat;
515         int cpu = get_cpu();
516
517         cpustat = &stat->cpustat[cpu];
518         __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
519         put_cpu();
520 }
521
522 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
523                                          struct page_cgroup *pc,
524                                          bool charge)
525 {
526         int val = (charge) ? 1 : -1;
527         struct mem_cgroup_stat *stat = &mem->stat;
528         struct mem_cgroup_stat_cpu *cpustat;
529         int cpu = get_cpu();
530
531         cpustat = &stat->cpustat[cpu];
532         if (PageCgroupCache(pc))
533                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
534         else
535                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
536
537         if (charge)
538                 __mem_cgroup_stat_add_safe(cpustat,
539                                 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
540         else
541                 __mem_cgroup_stat_add_safe(cpustat,
542                                 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
543         __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
544         put_cpu();
545 }
546
547 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
548                                         enum lru_list idx)
549 {
550         int nid, zid;
551         struct mem_cgroup_per_zone *mz;
552         u64 total = 0;
553
554         for_each_online_node(nid)
555                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
556                         mz = mem_cgroup_zoneinfo(mem, nid, zid);
557                         total += MEM_CGROUP_ZSTAT(mz, idx);
558                 }
559         return total;
560 }
561
562 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
563 {
564         return container_of(cgroup_subsys_state(cont,
565                                 mem_cgroup_subsys_id), struct mem_cgroup,
566                                 css);
567 }
568
569 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
570 {
571         /*
572          * mm_update_next_owner() may clear mm->owner to NULL
573          * if it races with swapoff, page migration, etc.
574          * So this can be called with p == NULL.
575          */
576         if (unlikely(!p))
577                 return NULL;
578
579         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
580                                 struct mem_cgroup, css);
581 }
582
583 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
584 {
585         struct mem_cgroup *mem = NULL;
586
587         if (!mm)
588                 return NULL;
589         /*
590          * Because we have no locks, mm->owner may be being moved to another
591          * cgroup. We use css_tryget() here even if this looks
592          * pessimistic (rather than adding locks here).
593          */
594         rcu_read_lock();
595         do {
596                 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
597                 if (unlikely(!mem))
598                         break;
599         } while (!css_tryget(&mem->css));
600         rcu_read_unlock();
601         return mem;
602 }
603
604 /*
605  * Call the callback function against all cgroups under the hierarchy tree.
606  */
607 static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
608                           int (*func)(struct mem_cgroup *, void *))
609 {
610         int found, ret, nextid;
611         struct cgroup_subsys_state *css;
612         struct mem_cgroup *mem;
613
614         if (!root->use_hierarchy)
615                 return (*func)(root, data);
616
617         nextid = 1;
618         do {
619                 ret = 0;
620                 mem = NULL;
621
622                 rcu_read_lock();
623                 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
624                                    &found);
625                 if (css && css_tryget(css))
626                         mem = container_of(css, struct mem_cgroup, css);
627                 rcu_read_unlock();
628
629                 if (mem) {
630                         ret = (*func)(mem, data);
631                         css_put(&mem->css);
632                 }
633                 nextid = found + 1;
634         } while (!ret && css);
635
636         return ret;
637 }
638
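/*
 * Note (added for clarity, not in the original file): when use_hierarchy is
 * set, the walk visits root and every descendant by css id; otherwise only
 * root. A non-zero return value from the callback stops the walk and is
 * propagated to the caller (see mem_cgroup_count_children_cb() below for a
 * trivial callback).
 */
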
639 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
640 {
641         return (mem == root_mem_cgroup);
642 }
643
644 /*
645  * Following LRU functions are allowed to be used without PCG_LOCK.
646  * Operations are called by routines of the global LRU independently of memcg.
647  * What we have to take care of here is the validity of pc->mem_cgroup.
648  *
649  * Changes to pc->mem_cgroup happen when
650  * 1. charge
651  * 2. moving account
652  * In the typical case, "charge" is done before add-to-lru. The exception is
653  * SwapCache, which is added to the LRU before charge.
654  * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
655  * When moving an account, the page is not on the LRU. It's isolated.
656  */
657
658 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
659 {
660         struct page_cgroup *pc;
661         struct mem_cgroup_per_zone *mz;
662
663         if (mem_cgroup_disabled())
664                 return;
665         pc = lookup_page_cgroup(page);
666         /* can happen while we handle swapcache. */
667         if (!TestClearPageCgroupAcctLRU(pc))
668                 return;
669         VM_BUG_ON(!pc->mem_cgroup);
670         /*
671          * We don't check PCG_USED bit. It's cleared when the "page" is finally
672          * removed from global LRU.
673          */
674         mz = page_cgroup_zoneinfo(pc);
675         MEM_CGROUP_ZSTAT(mz, lru) -= 1;
676         if (mem_cgroup_is_root(pc->mem_cgroup))
677                 return;
678         VM_BUG_ON(list_empty(&pc->lru));
679         list_del_init(&pc->lru);
680         return;
681 }
682
683 void mem_cgroup_del_lru(struct page *page)
684 {
685         mem_cgroup_del_lru_list(page, page_lru(page));
686 }
687
688 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
689 {
690         struct mem_cgroup_per_zone *mz;
691         struct page_cgroup *pc;
692
693         if (mem_cgroup_disabled())
694                 return;
695
696         pc = lookup_page_cgroup(page);
697         /*
698          * The Used bit is set without atomic ops but after smp_wmb();
699          * to make pc->mem_cgroup visible, insert smp_rmb() here.
700          */
701         smp_rmb();
702         /* unused or root page is not rotated. */
703         if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
704                 return;
705         mz = page_cgroup_zoneinfo(pc);
706         list_move(&pc->lru, &mz->lists[lru]);
707 }
708
709 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
710 {
711         struct page_cgroup *pc;
712         struct mem_cgroup_per_zone *mz;
713
714         if (mem_cgroup_disabled())
715                 return;
716         pc = lookup_page_cgroup(page);
717         VM_BUG_ON(PageCgroupAcctLRU(pc));
718         /*
719          * The Used bit is set without atomic ops but after smp_wmb();
720          * to make pc->mem_cgroup visible, insert smp_rmb() here.
721          */
722         smp_rmb();
723         if (!PageCgroupUsed(pc))
724                 return;
725
726         mz = page_cgroup_zoneinfo(pc);
727         MEM_CGROUP_ZSTAT(mz, lru) += 1;
728         SetPageCgroupAcctLRU(pc);
729         if (mem_cgroup_is_root(pc->mem_cgroup))
730                 return;
731         list_add(&pc->lru, &mz->lists[lru]);
732 }
733
734 /*
735  * While handling SwapCache, pc->mem_cgroup may be changed while it's linked to
736  * the LRU because the page may be reused after it's fully uncharged (because of
737  * SwapCache behavior). To handle that, unlink the page_cgroup from the LRU when
738  * charging it again. This function is only used to charge SwapCache. It's done
739  * under lock_page and it is expected that zone->lru_lock is never held.
740  */
741 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
742 {
743         unsigned long flags;
744         struct zone *zone = page_zone(page);
745         struct page_cgroup *pc = lookup_page_cgroup(page);
746
747         spin_lock_irqsave(&zone->lru_lock, flags);
748         /*
749          * Forget old LRU when this page_cgroup is *not* used. This Used bit
750          * is guarded by lock_page() because the page is SwapCache.
751          */
752         if (!PageCgroupUsed(pc))
753                 mem_cgroup_del_lru_list(page, page_lru(page));
754         spin_unlock_irqrestore(&zone->lru_lock, flags);
755 }
756
757 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
758 {
759         unsigned long flags;
760         struct zone *zone = page_zone(page);
761         struct page_cgroup *pc = lookup_page_cgroup(page);
762
763         spin_lock_irqsave(&zone->lru_lock, flags);
764         /* link when the page is linked to LRU but page_cgroup isn't */
765         if (PageLRU(page) && !PageCgroupAcctLRU(pc))
766                 mem_cgroup_add_lru_list(page, page_lru(page));
767         spin_unlock_irqrestore(&zone->lru_lock, flags);
768 }
769
770
771 void mem_cgroup_move_lists(struct page *page,
772                            enum lru_list from, enum lru_list to)
773 {
774         if (mem_cgroup_disabled())
775                 return;
776         mem_cgroup_del_lru_list(page, from);
777         mem_cgroup_add_lru_list(page, to);
778 }
779
780 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
781 {
782         int ret;
783         struct mem_cgroup *curr = NULL;
784
785         task_lock(task);
786         rcu_read_lock();
787         curr = try_get_mem_cgroup_from_mm(task->mm);
788         rcu_read_unlock();
789         task_unlock(task);
790         if (!curr)
791                 return 0;
792         /*
793          * We should check use_hierarchy of "mem", not "curr", because checking
794          * use_hierarchy of "curr" here would make this function return true if
795          * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the
796          * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem").
797          */
798         if (mem->use_hierarchy)
799                 ret = css_is_ancestor(&curr->css, &mem->css);
800         else
801                 ret = (curr == mem);
802         css_put(&curr->css);
803         return ret;
804 }
805
806 /*
807  * prev_priority control... this will be used in the memory reclaim path.
808  */
809 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
810 {
811         int prev_priority;
812
813         spin_lock(&mem->reclaim_param_lock);
814         prev_priority = mem->prev_priority;
815         spin_unlock(&mem->reclaim_param_lock);
816
817         return prev_priority;
818 }
819
820 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
821 {
822         spin_lock(&mem->reclaim_param_lock);
823         if (priority < mem->prev_priority)
824                 mem->prev_priority = priority;
825         spin_unlock(&mem->reclaim_param_lock);
826 }
827
828 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
829 {
830         spin_lock(&mem->reclaim_param_lock);
831         mem->prev_priority = priority;
832         spin_unlock(&mem->reclaim_param_lock);
833 }
834
835 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
836 {
837         unsigned long active;
838         unsigned long inactive;
839         unsigned long gb;
840         unsigned long inactive_ratio;
841
842         inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
843         active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
844
845         gb = (inactive + active) >> (30 - PAGE_SHIFT);
846         if (gb)
847                 inactive_ratio = int_sqrt(10 * gb);
848         else
849                 inactive_ratio = 1;
850
851         if (present_pages) {
852                 present_pages[0] = inactive;
853                 present_pages[1] = active;
854         }
855
856         return inactive_ratio;
857 }
858
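/*
 * Worked example (added for clarity, not in the original file): with 4KB
 * pages, a cgroup holding 4GB of anon memory has gb = 4, so
 * inactive_ratio = int_sqrt(40) = 6 and the inactive anon list is considered
 * low while inactive * 6 < active.
 */
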
859 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
860 {
861         unsigned long active;
862         unsigned long inactive;
863         unsigned long present_pages[2];
864         unsigned long inactive_ratio;
865
866         inactive_ratio = calc_inactive_ratio(memcg, present_pages);
867
868         inactive = present_pages[0];
869         active = present_pages[1];
870
871         if (inactive * inactive_ratio < active)
872                 return 1;
873
874         return 0;
875 }
876
877 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
878 {
879         unsigned long active;
880         unsigned long inactive;
881
882         inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
883         active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
884
885         return (active > inactive);
886 }
887
888 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
889                                        struct zone *zone,
890                                        enum lru_list lru)
891 {
892         int nid = zone->zone_pgdat->node_id;
893         int zid = zone_idx(zone);
894         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
895
896         return MEM_CGROUP_ZSTAT(mz, lru);
897 }
898
899 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
900                                                       struct zone *zone)
901 {
902         int nid = zone->zone_pgdat->node_id;
903         int zid = zone_idx(zone);
904         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
905
906         return &mz->reclaim_stat;
907 }
908
909 struct zone_reclaim_stat *
910 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
911 {
912         struct page_cgroup *pc;
913         struct mem_cgroup_per_zone *mz;
914
915         if (mem_cgroup_disabled())
916                 return NULL;
917
918         pc = lookup_page_cgroup(page);
919         /*
920          * The Used bit is set without atomic ops but after smp_wmb();
921          * to make pc->mem_cgroup visible, insert smp_rmb() here.
922          */
923         smp_rmb();
924         if (!PageCgroupUsed(pc))
925                 return NULL;
926
927         mz = page_cgroup_zoneinfo(pc);
928         if (!mz)
929                 return NULL;
930
931         return &mz->reclaim_stat;
932 }
933
934 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
935                                         struct list_head *dst,
936                                         unsigned long *scanned, int order,
937                                         int mode, struct zone *z,
938                                         struct mem_cgroup *mem_cont,
939                                         int active, int file)
940 {
941         unsigned long nr_taken = 0;
942         struct page *page;
943         unsigned long scan;
944         LIST_HEAD(pc_list);
945         struct list_head *src;
946         struct page_cgroup *pc, *tmp;
947         int nid = z->zone_pgdat->node_id;
948         int zid = zone_idx(z);
949         struct mem_cgroup_per_zone *mz;
950         int lru = LRU_FILE * file + active;
951         int ret;
952
953         BUG_ON(!mem_cont);
954         mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
955         src = &mz->lists[lru];
956
957         scan = 0;
958         list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
959                 if (scan >= nr_to_scan)
960                         break;
961
962                 page = pc->page;
963                 if (unlikely(!PageCgroupUsed(pc)))
964                         continue;
965                 if (unlikely(!PageLRU(page)))
966                         continue;
967
968                 scan++;
969                 ret = __isolate_lru_page(page, mode, file);
970                 switch (ret) {
971                 case 0:
972                         list_move(&page->lru, dst);
973                         mem_cgroup_del_lru(page);
974                         nr_taken++;
975                         break;
976                 case -EBUSY:
977                         /* we don't affect global LRU but rotate in our LRU */
978                         mem_cgroup_rotate_lru_list(page, page_lru(page));
979                         break;
980                 default:
981                         break;
982                 }
983         }
984
985         *scanned = scan;
986         return nr_taken;
987 }
988
989 #define mem_cgroup_from_res_counter(counter, member)    \
990         container_of(counter, struct mem_cgroup, member)
991
992 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
993 {
994         if (do_swap_account) {
995                 if (res_counter_check_under_limit(&mem->res) &&
996                         res_counter_check_under_limit(&mem->memsw))
997                         return true;
998         } else
999                 if (res_counter_check_under_limit(&mem->res))
1000                         return true;
1001         return false;
1002 }
1003
1004 static unsigned int get_swappiness(struct mem_cgroup *memcg)
1005 {
1006         struct cgroup *cgrp = memcg->css.cgroup;
1007         unsigned int swappiness;
1008
1009         /* root ? */
1010         if (cgrp->parent == NULL)
1011                 return vm_swappiness;
1012
1013         spin_lock(&memcg->reclaim_param_lock);
1014         swappiness = memcg->swappiness;
1015         spin_unlock(&memcg->reclaim_param_lock);
1016
1017         return swappiness;
1018 }
1019
1020 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1021 {
1022         int *val = data;
1023         (*val)++;
1024         return 0;
1025 }
1026
1027 /**
1028  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1029  * @memcg: The memory cgroup that went over limit
1030  * @p: Task that is going to be killed
1031  *
1032  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1033  * enabled
1034  */
1035 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1036 {
1037         struct cgroup *task_cgrp;
1038         struct cgroup *mem_cgrp;
1039         /*
1040          * Need a buffer in BSS, can't rely on allocations. The code relies
1041          * on the assumption that OOM is serialized for memory controller.
1042          * If this assumption is broken, revisit this code.
1043          */
1044         static char memcg_name[PATH_MAX];
1045         int ret;
1046
1047         if (!memcg || !p)
1048                 return;
1049
1050
1051         rcu_read_lock();
1052
1053         mem_cgrp = memcg->css.cgroup;
1054         task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1055
1056         ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1057         if (ret < 0) {
1058                 /*
1059                  * Unfortunately, we are unable to convert to a useful name,
1060                  * but we'll still print out the usage information.
1061                  */
1062                 rcu_read_unlock();
1063                 goto done;
1064         }
1065         rcu_read_unlock();
1066
1067         printk(KERN_INFO "Task in %s killed", memcg_name);
1068
1069         rcu_read_lock();
1070         ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1071         if (ret < 0) {
1072                 rcu_read_unlock();
1073                 goto done;
1074         }
1075         rcu_read_unlock();
1076
1077         /*
1078          * Continues from above, so we don't need a KERN_ level
1079          */
1080         printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1081 done:
1082
1083         printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1084                 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1085                 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1086                 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1087         printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1088                 "failcnt %llu\n",
1089                 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1090                 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1091                 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1092 }
1093
1094 /*
1095  * This function returns the number of memcgs under the hierarchy tree. Returns
1096  * 1 (self count) if there are no children.
1097  */
1098 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1099 {
1100         int num = 0;
1101         mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1102         return num;
1103 }
1104
1105 /*
1106  * Visit the first child (need not be the first child as per the ordering
1107  * of the cgroup list, since we track last_scanned_child) of @mem and use
1108  * that to reclaim free pages from.
1109  */
1110 static struct mem_cgroup *
1111 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1112 {
1113         struct mem_cgroup *ret = NULL;
1114         struct cgroup_subsys_state *css;
1115         int nextid, found;
1116
1117         if (!root_mem->use_hierarchy) {
1118                 css_get(&root_mem->css);
1119                 ret = root_mem;
1120         }
1121
1122         while (!ret) {
1123                 rcu_read_lock();
1124                 nextid = root_mem->last_scanned_child + 1;
1125                 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1126                                    &found);
1127                 if (css && css_tryget(css))
1128                         ret = container_of(css, struct mem_cgroup, css);
1129
1130                 rcu_read_unlock();
1131                 /* Updates scanning parameter */
1132                 spin_lock(&root_mem->reclaim_param_lock);
1133                 if (!css) {
1134                         /* this means start scan from ID:1 */
1135                         root_mem->last_scanned_child = 0;
1136                 } else
1137                         root_mem->last_scanned_child = found;
1138                 spin_unlock(&root_mem->reclaim_param_lock);
1139         }
1140
1141         return ret;
1142 }
1143
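/*
 * Note (added for clarity, not in the original file): the css-id walk wraps
 * back to ID:1 when css_get_next() returns NULL, so root_mem itself is
 * returned periodically; mem_cgroup_hierarchical_reclaim() uses
 * "victim == root_mem" to count full passes over the hierarchy.
 */
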
1144 /*
1145  * Scan the hierarchy if needed to reclaim memory. We remember the last child
1146  * we reclaimed from, so that we don't end up penalizing one child extensively
1147  * based on its position in the children list.
1148  *
1149  * root_mem is the original ancestor that we've been reclaiming from.
1150  *
1151  * We give up and return to the caller when we visit root_mem twice.
1152  * (other groups can be removed while we're walking....)
1153  *
1154  * If shrink==true, this returns immediately to avoid freeing too much.
1155  */
1156 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1157                                                 struct zone *zone,
1158                                                 gfp_t gfp_mask,
1159                                                 unsigned long reclaim_options)
1160 {
1161         struct mem_cgroup *victim;
1162         int ret, total = 0;
1163         int loop = 0;
1164         bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1165         bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1166         bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1167         unsigned long excess = mem_cgroup_get_excess(root_mem);
1168
1169         /* If memsw_is_minimum==1, swap-out is of no use. */
1170         if (root_mem->memsw_is_minimum)
1171                 noswap = true;
1172
1173         while (1) {
1174                 victim = mem_cgroup_select_victim(root_mem);
1175                 if (victim == root_mem) {
1176                         loop++;
1177                         if (loop >= 1)
1178                                 drain_all_stock_async();
1179                         if (loop >= 2) {
1180                                 /*
1181                                  * If we have not been able to reclaim
1182                                  * anything, it might be because there are
1183                                  * no reclaimable pages under this hierarchy
1184                                  */
1185                                 if (!check_soft || !total) {
1186                                         css_put(&victim->css);
1187                                         break;
1188                                 }
1189                                 /*
1190                                  * We want to do more targeted reclaim.
1191                                  * excess >> 2 is not too excessive, so as not
1192                                  * to reclaim too much, nor too little, so we
1193                                  * don't keep coming back to this cgroup
1194                                  */
1195                                 if (total >= (excess >> 2) ||
1196                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1197                                         css_put(&victim->css);
1198                                         break;
1199                                 }
1200                         }
1201                 }
1202                 if (!mem_cgroup_local_usage(&victim->stat)) {
1203                         /* this cgroup's local usage == 0 */
1204                         css_put(&victim->css);
1205                         continue;
1206                 }
1207                 /* we use swappiness of local cgroup */
1208                 if (check_soft)
1209                         ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1210                                 noswap, get_swappiness(victim), zone,
1211                                 zone->zone_pgdat->node_id);
1212                 else
1213                         ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1214                                                 noswap, get_swappiness(victim));
1215                 css_put(&victim->css);
1216                 /*
1217                  * When shrinking usage, we can't check whether we should stop
1218                  * here or reclaim more; it depends on the callers. last_scanned_child
1219                  * will be enough for keeping fairness under the tree.
1220                  */
1221                 if (shrink)
1222                         return ret;
1223                 total += ret;
1224                 if (check_soft) {
1225                         if (res_counter_check_under_soft_limit(&root_mem->res))
1226                                 return total;
1227                 } else if (mem_cgroup_check_under_limit(root_mem))
1228                         return 1 + total;
1229         }
1230         return total;
1231 }
1232
1233 bool mem_cgroup_oom_called(struct task_struct *task)
1234 {
1235         bool ret = false;
1236         struct mem_cgroup *mem;
1237         struct mm_struct *mm;
1238
1239         rcu_read_lock();
1240         mm = task->mm;
1241         if (!mm)
1242                 mm = &init_mm;
1243         mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
1244         if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
1245                 ret = true;
1246         rcu_read_unlock();
1247         return ret;
1248 }
1249
1250 static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
1251 {
1252         mem->last_oom_jiffies = jiffies;
1253         return 0;
1254 }
1255
1256 static void record_last_oom(struct mem_cgroup *mem)
1257 {
1258         mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
1259 }
1260
1261 /*
1262  * Currently used to update mapped file statistics, but the routine can be
1263  * generalized to update other statistics as well.
1264  */
1265 void mem_cgroup_update_file_mapped(struct page *page, int val)
1266 {
1267         struct mem_cgroup *mem;
1268         struct mem_cgroup_stat *stat;
1269         struct mem_cgroup_stat_cpu *cpustat;
1270         int cpu;
1271         struct page_cgroup *pc;
1272
1273         pc = lookup_page_cgroup(page);
1274         if (unlikely(!pc))
1275                 return;
1276
1277         lock_page_cgroup(pc);
1278         mem = pc->mem_cgroup;
1279         if (!mem)
1280                 goto done;
1281
1282         if (!PageCgroupUsed(pc))
1283                 goto done;
1284
1285         /*
1286          * Preemption is already disabled, we don't need get_cpu()
1287          */
1288         cpu = smp_processor_id();
1289         stat = &mem->stat;
1290         cpustat = &stat->cpustat[cpu];
1291
1292         __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1293 done:
1294         unlock_page_cgroup(pc);
1295 }
1296
1297 /*
1298  * size of first charge trial. "32" comes from vmscan.c's magic value.
1299  * TODO: maybe necessary to use big numbers in big irons.
1300  */
1301 #define CHARGE_SIZE     (32 * PAGE_SIZE)
1302 struct memcg_stock_pcp {
1303         struct mem_cgroup *cached; /* this is never the root cgroup */
1304         int charge;
1305         struct work_struct work;
1306 };
1307 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1308 static atomic_t memcg_drain_count;
1309
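/*
 * Note (added for clarity, not in the original file): a successful batched
 * res_counter_charge() in __mem_cgroup_try_charge() takes CHARGE_SIZE
 * (32 pages) at once; one page is used for the current charge and the
 * remaining 31 pages are parked in this cpu's stock via refill_stock(), so
 * the next 31 single-page charges for the same cgroup on this cpu are served
 * by consume_stock() without touching the res_counter.
 */
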
1310 /*
1311  * Try to consume stocked charge on this cpu. If successful, PAGE_SIZE is
1312  * consumed from the local stock and true is returned. If the stock is 0 or
1313  * holds charges from a cgroup which is not the current target, false is
1314  * returned; the stock will be refilled later.
1315  */
1316 static bool consume_stock(struct mem_cgroup *mem)
1317 {
1318         struct memcg_stock_pcp *stock;
1319         bool ret = true;
1320
1321         stock = &get_cpu_var(memcg_stock);
1322         if (mem == stock->cached && stock->charge)
1323                 stock->charge -= PAGE_SIZE;
1324         else /* need to call res_counter_charge */
1325                 ret = false;
1326         put_cpu_var(memcg_stock);
1327         return ret;
1328 }
1329
1330 /*
1331  * Return stocks cached in percpu to the res_counter and reset the cached information.
1332  */
1333 static void drain_stock(struct memcg_stock_pcp *stock)
1334 {
1335         struct mem_cgroup *old = stock->cached;
1336
1337         if (stock->charge) {
1338                 res_counter_uncharge(&old->res, stock->charge);
1339                 if (do_swap_account)
1340                         res_counter_uncharge(&old->memsw, stock->charge);
1341         }
1342         stock->cached = NULL;
1343         stock->charge = 0;
1344 }
1345
1346 /*
1347  * This must be called with preemption disabled or by a thread which is
1348  * pinned to the local cpu.
1349  */
1350 static void drain_local_stock(struct work_struct *dummy)
1351 {
1352         struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1353         drain_stock(stock);
1354 }
1355
1356 /*
1357  * Cache charges (val) obtained from the res_counter in the local per-cpu area.
1358  * They will be consumed by the consume_stock() function later.
1359  */
1360 static void refill_stock(struct mem_cgroup *mem, int val)
1361 {
1362         struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1363
1364         if (stock->cached != mem) { /* reset if necessary */
1365                 drain_stock(stock);
1366                 stock->cached = mem;
1367         }
1368         stock->charge += val;
1369         put_cpu_var(memcg_stock);
1370 }
1371
1372 /*
1373  * Tries to drain stocked charges on other cpus. This function is asynchronous
1374  * and just puts a work item per cpu to drain locally on each cpu. Callers can
1375  * expect some charges to come back to the res_counter later but cannot wait
1376  * for it.
1377  */
1378 static void drain_all_stock_async(void)
1379 {
1380         int cpu;
1381         /* This function schedules "drain" in an asynchronous way.
1382          * The result of "drain" is not directly handled by callers, so
1383          * if someone is already draining, we don't have to drain again.
1384          * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch
1385          * a race if there is one. We just do a loose check here.
1386          */
1387         if (atomic_read(&memcg_drain_count))
1388                 return;
1389         /* Notify other cpus that system-wide "drain" is running */
1390         atomic_inc(&memcg_drain_count);
1391         get_online_cpus();
1392         for_each_online_cpu(cpu) {
1393                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1394                 schedule_work_on(cpu, &stock->work);
1395         }
1396         put_online_cpus();
1397         atomic_dec(&memcg_drain_count);
1398         /* We don't wait for flush_work */
1399 }
1400
1401 /* This is a synchronous drain interface. */
1402 static void drain_all_stock_sync(void)
1403 {
1404         /* called when force_empty is called */
1405         atomic_inc(&memcg_drain_count);
1406         schedule_on_each_cpu(drain_local_stock);
1407         atomic_dec(&memcg_drain_count);
1408 }
1409
1410 static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1411                                         unsigned long action,
1412                                         void *hcpu)
1413 {
1414         int cpu = (unsigned long)hcpu;
1415         struct memcg_stock_pcp *stock;
1416
1417         if (action != CPU_DEAD)
1418                 return NOTIFY_OK;
1419         stock = &per_cpu(memcg_stock, cpu);
1420         drain_stock(stock);
1421         return NOTIFY_OK;
1422 }
1423
1424 /*
1425  * Unlike the exported interface, an "oom" parameter is added. If oom==true,
1426  * the oom-killer can be invoked.
1427  */
1428 static int __mem_cgroup_try_charge(struct mm_struct *mm,
1429                         gfp_t gfp_mask, struct mem_cgroup **memcg,
1430                         bool oom, struct page *page)
1431 {
1432         struct mem_cgroup *mem, *mem_over_limit;
1433         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1434         struct res_counter *fail_res;
1435         int csize = CHARGE_SIZE;
1436
1437         if (unlikely(test_thread_flag(TIF_MEMDIE))) {
1438                 /* Don't account this! */
1439                 *memcg = NULL;
1440                 return 0;
1441         }
1442
1443         /*
1444          * We always charge the cgroup the mm_struct belongs to.
1445          * The mm_struct's mem_cgroup changes on task migration if the
1446          * thread group leader migrates. It's possible that mm is not
1447          * set, if so charge the init_mm (happens for pagecache usage).
1448          */
1449         mem = *memcg;
1450         if (likely(!mem)) {
1451                 mem = try_get_mem_cgroup_from_mm(mm);
1452                 *memcg = mem;
1453         } else {
1454                 css_get(&mem->css);
1455         }
1456         if (unlikely(!mem))
1457                 return 0;
1458
1459         VM_BUG_ON(css_is_removed(&mem->css));
1460         if (mem_cgroup_is_root(mem))
1461                 goto done;
1462
1463         while (1) {
1464                 int ret = 0;
1465                 unsigned long flags = 0;
1466
1467                 if (consume_stock(mem))
1468                         goto charged;
1469
1470                 ret = res_counter_charge(&mem->res, csize, &fail_res);
1471                 if (likely(!ret)) {
1472                         if (!do_swap_account)
1473                                 break;
1474                         ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1475                         if (likely(!ret))
1476                                 break;
1477                         /* mem+swap counter fails */
1478                         res_counter_uncharge(&mem->res, csize);
1479                         flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1480                         mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1481                                                                         memsw);
1482                 } else
1483                         /* mem counter fails */
1484                         mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1485                                                                         res);
1486
1487                 /* reduce request size and retry */
1488                 if (csize > PAGE_SIZE) {
1489                         csize = PAGE_SIZE;
1490                         continue;
1491                 }
1492                 if (!(gfp_mask & __GFP_WAIT))
1493                         goto nomem;
1494
1495                 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1496                                                 gfp_mask, flags);
1497                 if (ret)
1498                         continue;
1499
1500                 /*
1501                  * try_to_free_mem_cgroup_pages() might not give us a full
1502                  * picture of reclaim. Some pages are reclaimed and might be
1503                  * moved to swap cache or just unmapped from the cgroup.
1504                  * Check the limit again to see if the reclaim reduced the
1505                  * current usage of the cgroup before giving up
1506                  *
1507                  */
1508                 if (mem_cgroup_check_under_limit(mem_over_limit))
1509                         continue;
1510
1511                 if (!nr_retries--) {
1512                         if (oom) {
1513                                 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
1514                                 record_last_oom(mem_over_limit);
1515                         }
1516                         goto nomem;
1517                 }
1518         }
1519         if (csize > PAGE_SIZE)
1520                 refill_stock(mem, csize - PAGE_SIZE);
1521 charged:
1522         /*
1523          * Insert ancestors (and ancestors' ancestors) into the soft-limit RB-tree
1524          * if they exceed their soft limit.
1525          */
1526         if (page && mem_cgroup_soft_limit_check(mem))
1527                 mem_cgroup_update_tree(mem, page);
1528 done:
1529         return 0;
1530 nomem:
1531         css_put(&mem->css);
1532         return -ENOMEM;
1533 }
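
/*
 * Illustrative, user-space model (example only, never part of this file) of
 * the per-cpu "stock" idea behind consume_stock()/refill_stock() used above:
 * one batched res_counter charge is split so that subsequent single-page
 * charges on the same CPU avoid touching the shared counter.  The batch size
 * below is an arbitrary example value, not necessarily what the kernel uses.
 */
#if 0
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE       4096L
#define EXAMPLE_BATCH           (32 * EXAMPLE_PAGE_SIZE)

static long shared_usage;       /* models mem->res usage */
static long cpu_stock;          /* models this CPU's cached surplus */

static void example_charge_one_page(void)
{
        if (cpu_stock >= EXAMPLE_PAGE_SIZE) {   /* consume_stock() path */
                cpu_stock -= EXAMPLE_PAGE_SIZE;
                return;
        }
        shared_usage += EXAMPLE_BATCH;          /* one shared-counter update */
        cpu_stock = EXAMPLE_BATCH - EXAMPLE_PAGE_SIZE;  /* refill_stock() keeps the rest */
}

int main(void)
{
        int i;

        for (i = 0; i < 64; i++)
                example_charge_one_page();
        /* 64 pages charged, but the shared counter was updated only twice */
        printf("usage=%ld stock=%ld\n", shared_usage, cpu_stock);
        return 0;
}
#endif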
1534
1535 /*
1536  * Sometimes we have to undo a charge we got by try_charge().
1537  * This function does that: it uncharges and puts the css refcount
1538  * gotten by try_charge().
1539  */
1540 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1541                                                         unsigned long count)
1542 {
1543         if (!mem_cgroup_is_root(mem)) {
1544                 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1545                 if (do_swap_account)
1546                         res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1547                 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1548                 WARN_ON_ONCE(count > INT_MAX);
1549                 __css_put(&mem->css, (int)count);
1550         }
1551         /* we don't need css_put for root */
1552 }
1553
1554 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1555 {
1556         __mem_cgroup_cancel_charge(mem, 1);
1557 }
1558
1559 /*
1560  * A helper function to get a mem_cgroup from an ID. Must be called under
1561  * rcu_read_lock(). The caller must check css_is_removed() or similar if
1562  * that is a concern (dropping a refcnt from swap can happen against a
1563  * removed memcg).
1564  */
1565 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1566 {
1567         struct cgroup_subsys_state *css;
1568
1569         /* ID 0 is unused ID */
1570         if (!id)
1571                 return NULL;
1572         css = css_lookup(&mem_cgroup_subsys, id);
1573         if (!css)
1574                 return NULL;
1575         return container_of(css, struct mem_cgroup, css);
1576 }
1577
1578 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1579 {
1580         struct mem_cgroup *mem = NULL;
1581         struct page_cgroup *pc;
1582         unsigned short id;
1583         swp_entry_t ent;
1584
1585         VM_BUG_ON(!PageLocked(page));
1586
1587         pc = lookup_page_cgroup(page);
1588         lock_page_cgroup(pc);
1589         if (PageCgroupUsed(pc)) {
1590                 mem = pc->mem_cgroup;
1591                 if (mem && !css_tryget(&mem->css))
1592                         mem = NULL;
1593         } else if (PageSwapCache(page)) {
1594                 ent.val = page_private(page);
1595                 id = lookup_swap_cgroup(ent);
1596                 rcu_read_lock();
1597                 mem = mem_cgroup_lookup(id);
1598                 if (mem && !css_tryget(&mem->css))
1599                         mem = NULL;
1600                 rcu_read_unlock();
1601         }
1602         unlock_page_cgroup(pc);
1603         return mem;
1604 }
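
/*
 * Illustrative sketch (example only, not part of this file): a caller of
 * try_get_mem_cgroup_from_page() must hold the page lock and must balance
 * the css_tryget() taken above with css_put() once it is done with the
 * returned memcg.  The helper name below is hypothetical.
 */
#if 0
static void example_inspect_page_memcg(struct page *page)
{
        struct mem_cgroup *mem;

        lock_page(page);
        mem = try_get_mem_cgroup_from_page(page);
        unlock_page(page);
        if (!mem)
                return;
        /* ... use "mem" ... */
        css_put(&mem->css);     /* drop the reference taken by css_tryget() */
}
#endif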
1605
1606 /*
1607  * Commit a charge got by __mem_cgroup_try_charge() and make the page_cgroup
1608  * USED. If it is already USED, uncharge and return.
1609  */
1610
1611 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1612                                      struct page_cgroup *pc,
1613                                      enum charge_type ctype)
1614 {
1615         /* try_charge() can return NULL in *memcg; handle that here. */
1616         if (!mem)
1617                 return;
1618
1619         lock_page_cgroup(pc);
1620         if (unlikely(PageCgroupUsed(pc))) {
1621                 unlock_page_cgroup(pc);
1622                 mem_cgroup_cancel_charge(mem);
1623                 return;
1624         }
1625
1626         pc->mem_cgroup = mem;
1627         /*
1628          * We access a page_cgroup asynchronously without lock_page_cgroup().
1629          * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1630          * is accessed after testing the USED bit. To make pc->mem_cgroup visible
1631          * before the USED bit, we need a memory barrier here.
1632          * See mem_cgroup_add_lru_list(), etc.
1633          */
1634         smp_wmb();
1635         switch (ctype) {
1636         case MEM_CGROUP_CHARGE_TYPE_CACHE:
1637         case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1638                 SetPageCgroupCache(pc);
1639                 SetPageCgroupUsed(pc);
1640                 break;
1641         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1642                 ClearPageCgroupCache(pc);
1643                 SetPageCgroupUsed(pc);
1644                 break;
1645         default:
1646                 break;
1647         }
1648
1649         mem_cgroup_charge_statistics(mem, pc, true);
1650
1651         unlock_page_cgroup(pc);
1652 }
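
/*
 * Illustrative sketch (example only, not part of this file) of the charge
 * protocol the helpers above implement: a successful __mem_cgroup_try_charge()
 * must be followed either by __mem_cgroup_commit_charge() or, on an error
 * path, by mem_cgroup_cancel_charge().  mem_cgroup_charge_common() and
 * mem_cgroup_move_parent() below are the real in-file users of this pattern;
 * example_prepare_fails() is a hypothetical failure point.
 */
#if 0
static int example_charge_page(struct page *page, struct mm_struct *mm,
                               gfp_t gfp_mask)
{
        struct page_cgroup *pc = lookup_page_cgroup(page);
        struct mem_cgroup *mem = NULL;
        int ret;

        ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
        if (ret || !mem)
                return ret;

        if (example_prepare_fails(page)) {              /* hypothetical */
                mem_cgroup_cancel_charge(mem);          /* undo charge, put css ref */
                return -ENOMEM;
        }
        __mem_cgroup_commit_charge(mem, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
        return 0;
}
#endif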
1653
1654 /**
1655  * __mem_cgroup_move_account - move account of the page
1656  * @pc: page_cgroup of the page.
1657  * @from: mem_cgroup which the page is moved from.
1658  * @to: mem_cgroup which the page is moved to. @from != @to.
1659  * @uncharge: whether we should call uncharge and css_put against @from.
1660  *
1661  * The caller must confirm following.
1662  * - page is not on LRU (isolate_page() is useful.)
1663  * - the pc is locked, used, and ->mem_cgroup points to @from.
1664  *
1665  * This function doesn't do "charge" or css_get against the new cgroup; that
1666  * should be done by the caller (__mem_cgroup_try_charge would be useful).
1667  * If @uncharge is true, this function does "uncharge" from the old cgroup;
1668  * if @uncharge is false, the caller is responsible for the "uncharge".
1669  */
1670
1671 static void __mem_cgroup_move_account(struct page_cgroup *pc,
1672         struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1673 {
1674         struct page *page;
1675         int cpu;
1676         struct mem_cgroup_stat *stat;
1677         struct mem_cgroup_stat_cpu *cpustat;
1678
1679         VM_BUG_ON(from == to);
1680         VM_BUG_ON(PageLRU(pc->page));
1681         VM_BUG_ON(!PageCgroupLocked(pc));
1682         VM_BUG_ON(!PageCgroupUsed(pc));
1683         VM_BUG_ON(pc->mem_cgroup != from);
1684
1685         page = pc->page;
1686         if (page_mapped(page) && !PageAnon(page)) {
1687                 cpu = smp_processor_id();
1688                 /* Update mapped_file data for mem_cgroup "from" */
1689                 stat = &from->stat;
1690                 cpustat = &stat->cpustat[cpu];
1691                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1692                                                 -1);
1693
1694                 /* Update mapped_file data for mem_cgroup "to" */
1695                 stat = &to->stat;
1696                 cpustat = &stat->cpustat[cpu];
1697                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1698                                                 1);
1699         }
1700         mem_cgroup_charge_statistics(from, pc, false);
1701         if (uncharge)
1702                 /* This is not "cancel", but cancel_charge does all we need. */
1703                 mem_cgroup_cancel_charge(from);
1704
1705         /* caller should have done css_get */
1706         pc->mem_cgroup = to;
1707         mem_cgroup_charge_statistics(to, pc, true);
1708         /*
1709          * We charge against "to", which may not have any tasks. Then "to"
1710          * can be under rmdir(). But in the current implementation, the callers
1711          * of this function are just force_empty() and move charge, so it's
1712          * guaranteed that "to" is never removed. So we don't check rmdir
1713          * status here.
1714          */
1715 }
1716
1717 /*
1718  * Check whether @pc is valid for moving the account and call
1719  * __mem_cgroup_move_account().
1720  */
1721 static int mem_cgroup_move_account(struct page_cgroup *pc,
1722                 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1723 {
1724         int ret = -EINVAL;
1725         lock_page_cgroup(pc);
1726         if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1727                 __mem_cgroup_move_account(pc, from, to, uncharge);
1728                 ret = 0;
1729         }
1730         unlock_page_cgroup(pc);
1731         return ret;
1732 }
1733
1734 /*
1735  * move charges to its parent.
1736  */
1737
1738 static int mem_cgroup_move_parent(struct page_cgroup *pc,
1739                                   struct mem_cgroup *child,
1740                                   gfp_t gfp_mask)
1741 {
1742         struct page *page = pc->page;
1743         struct cgroup *cg = child->css.cgroup;
1744         struct cgroup *pcg = cg->parent;
1745         struct mem_cgroup *parent;
1746         int ret;
1747
1748         /* Is ROOT ? */
1749         if (!pcg)
1750                 return -EINVAL;
1751
1752         ret = -EBUSY;
1753         if (!get_page_unless_zero(page))
1754                 goto out;
1755         if (isolate_lru_page(page))
1756                 goto put;
1757
1758         parent = mem_cgroup_from_cont(pcg);
1759         ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1760         if (ret || !parent)
1761                 goto put_back;
1762
1763         ret = mem_cgroup_move_account(pc, child, parent, true);
1764         if (ret)
1765                 mem_cgroup_cancel_charge(parent);
1766 put_back:
1767         putback_lru_page(page);
1768 put:
1769         put_page(page);
1770 out:
1771         return ret;
1772 }
1773
1774 /*
1775  * Charge the memory controller for page usage.
1776  * Return
1777  * 0 if the charge was successful
1778  * < 0 if the cgroup is over its limit
1779  */
1780 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1781                                 gfp_t gfp_mask, enum charge_type ctype,
1782                                 struct mem_cgroup *memcg)
1783 {
1784         struct mem_cgroup *mem;
1785         struct page_cgroup *pc;
1786         int ret;
1787
1788         pc = lookup_page_cgroup(page);
1789         /* can happen at boot */
1790         if (unlikely(!pc))
1791                 return 0;
1792         prefetchw(pc);
1793
1794         mem = memcg;
1795         ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1796         if (ret || !mem)
1797                 return ret;
1798
1799         __mem_cgroup_commit_charge(mem, pc, ctype);
1800         return 0;
1801 }
1802
1803 int mem_cgroup_newpage_charge(struct page *page,
1804                               struct mm_struct *mm, gfp_t gfp_mask)
1805 {
1806         if (mem_cgroup_disabled())
1807                 return 0;
1808         if (PageCompound(page))
1809                 return 0;
1810         /*
1811          * If already mapped, we don't have to account.
1812          * If it is page cache, page->mapping has an address_space.
1813          * But page->mapping may hold a stale anon_vma pointer; detect that
1814          * with the PageAnon() check. A newly-mapped anon page's page->mapping
1815          * is NULL.
1816          */
1817         if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1818                 return 0;
1819         if (unlikely(!mm))
1820                 mm = &init_mm;
1821         return mem_cgroup_charge_common(page, mm, gfp_mask,
1822                                 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1823 }
1824
1825 static void
1826 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1827                                         enum charge_type ctype);
1828
1829 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1830                                 gfp_t gfp_mask)
1831 {
1832         struct mem_cgroup *mem = NULL;
1833         int ret;
1834
1835         if (mem_cgroup_disabled())
1836                 return 0;
1837         if (PageCompound(page))
1838                 return 0;
1839         /*
1840          * Corner case handling. This is usually called from add_to_page_cache(),
1841          * but some filesystems (shmem) precharge the page before calling it
1842          * and then call add_to_page_cache() with GFP_NOWAIT.
1843          *
1844          * In the GFP_NOWAIT case the page may already be charged before
1845          * add_to_page_cache() (see shmem.c), so check it here and avoid charging
1846          * twice. (It works, but at a slightly larger cost.)
1847          * And when the page is SwapCache, swap information should be taken
1848          * into account. This is under lock_page() now.
1849          */
1850         if (!(gfp_mask & __GFP_WAIT)) {
1851                 struct page_cgroup *pc;
1852
1853
1854                 pc = lookup_page_cgroup(page);
1855                 if (!pc)
1856                         return 0;
1857                 lock_page_cgroup(pc);
1858                 if (PageCgroupUsed(pc)) {
1859                         unlock_page_cgroup(pc);
1860                         return 0;
1861                 }
1862                 unlock_page_cgroup(pc);
1863         }
1864
1865         if (unlikely(!mm && !mem))
1866                 mm = &init_mm;
1867
1868         if (page_is_file_cache(page))
1869                 return mem_cgroup_charge_common(page, mm, gfp_mask,
1870                                 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1871
1872         /* shmem */
1873         if (PageSwapCache(page)) {
1874                 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1875                 if (!ret)
1876                         __mem_cgroup_commit_charge_swapin(page, mem,
1877                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
1878         } else
1879                 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1880                                         MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1881
1882         return ret;
1883 }
1884
1885 /*
1886  * During swap-in (try_charge -> commit or cancel) the page is locked.
1887  * When try_charge() returns successfully, one refcnt on the memcg, not tied
1888  * to a struct page_cgroup, is acquired. This refcnt is consumed by
1889  * "commit()" or dropped by "cancel()".
1890  */
1891 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1892                                  struct page *page,
1893                                  gfp_t mask, struct mem_cgroup **ptr)
1894 {
1895         struct mem_cgroup *mem;
1896         int ret;
1897
1898         if (mem_cgroup_disabled())
1899                 return 0;
1900
1901         if (!do_swap_account)
1902                 goto charge_cur_mm;
1903         /*
1904          * A racing thread's fault, or swapoff, may have already updated
1905          * the pte, and even removed page from swap cache: in those cases
1906          * do_swap_page()'s pte_same() test will fail; but there's also a
1907          * KSM case which does need to charge the page.
1908          */
1909         if (!PageSwapCache(page))
1910                 goto charge_cur_mm;
1911         mem = try_get_mem_cgroup_from_page(page);
1912         if (!mem)
1913                 goto charge_cur_mm;
1914         *ptr = mem;
1915         ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1916         /* drop extra refcnt from tryget */
1917         css_put(&mem->css);
1918         return ret;
1919 charge_cur_mm:
1920         if (unlikely(!mm))
1921                 mm = &init_mm;
1922         return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1923 }
1924
1925 static void
1926 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1927                                         enum charge_type ctype)
1928 {
1929         struct page_cgroup *pc;
1930
1931         if (mem_cgroup_disabled())
1932                 return;
1933         if (!ptr)
1934                 return;
1935         cgroup_exclude_rmdir(&ptr->css);
1936         pc = lookup_page_cgroup(page);
1937         mem_cgroup_lru_del_before_commit_swapcache(page);
1938         __mem_cgroup_commit_charge(ptr, pc, ctype);
1939         mem_cgroup_lru_add_after_commit_swapcache(page);
1940         /*
1941          * Now the swap entry is in memory, which means this page may be
1942          * counted both as mem and swap (a double count).
1943          * Fix it by uncharging from memsw. Basically, this SwapCache is stable
1944          * under lock_page(), but reuse_swap_page() in do_swap_page() (memory.c)
1945          * may call delete_from_swap_cache() before we reach here.
1946          */
1947         if (do_swap_account && PageSwapCache(page)) {
1948                 swp_entry_t ent = {.val = page_private(page)};
1949                 unsigned short id;
1950                 struct mem_cgroup *memcg;
1951
1952                 id = swap_cgroup_record(ent, 0);
1953                 rcu_read_lock();
1954                 memcg = mem_cgroup_lookup(id);
1955                 if (memcg) {
1956                         /*
1957                          * This recorded memcg can be an obsolete one, so avoid
1958                          * calling css_tryget().
1959                          */
1960                         if (!mem_cgroup_is_root(memcg))
1961                                 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1962                         mem_cgroup_swap_statistics(memcg, false);
1963                         mem_cgroup_put(memcg);
1964                 }
1965                 rcu_read_unlock();
1966         }
1967         /*
1968          * At swapin, we may charge against a cgroup which has no tasks,
1969          * so rmdir()->pre_destroy() can be called while we do this charge.
1970          * In that case, we need to call pre_destroy() again. Check it here.
1971          */
1972         cgroup_release_and_wakeup_rmdir(&ptr->css);
1973 }
1974
1975 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1976 {
1977         __mem_cgroup_commit_charge_swapin(page, ptr,
1978                                         MEM_CGROUP_CHARGE_TYPE_MAPPED);
1979 }
1980
1981 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1982 {
1983         if (mem_cgroup_disabled())
1984                 return;
1985         if (!mem)
1986                 return;
1987         mem_cgroup_cancel_charge(mem);
1988 }
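
/*
 * Illustrative sketch (example only, not part of this file) of how a swap-in
 * path such as do_swap_page() in mm/memory.c is expected to use the three
 * hooks above while the page is locked: try_charge before the pte is set up,
 * then either commit or cancel depending on whether the fault succeeds.
 * example_map_page_fails() is a hypothetical failure point; the rest of the
 * fault path is elided.
 */
#if 0
static int example_swapin_fault(struct mm_struct *mm, struct page *page,
                                gfp_t gfp_mask)
{
        struct mem_cgroup *ptr = NULL;

        if (mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &ptr))
                return -ENOMEM;

        if (example_map_page_fails(page)) {             /* hypothetical */
                mem_cgroup_cancel_charge_swapin(ptr);   /* drop charge and css ref */
                return -ENOMEM;
        }
        mem_cgroup_commit_charge_swapin(page, ptr);     /* page_cgroup becomes USED */
        return 0;
}
#endif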
1989
1990 static void
1991 __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
1992 {
1993         struct memcg_batch_info *batch = NULL;
1994         bool uncharge_memsw = true;
1995         /* If swapout, usage of swap doesn't decrease */
1996         if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1997                 uncharge_memsw = false;
1998         /*
1999          * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2000          * In those cases, pages freed continuously can be expected to be in
2001          * the same cgroup and we have a chance to coalesce uncharges.
2002          * But we uncharge one by one if this task is being killed by OOM
2003          * (TIF_MEMDIE) because we want to uncharge as soon as possible.
2004          */
2005         if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
2006                 goto direct_uncharge;
2007
2008         batch = &current->memcg_batch;
2009         /*
2010          * Usually we do css_get() when we remember a memcg pointer.
2011          * But in this case, we keep res->usage until the end of a series of
2012          * uncharges, so it's ok to ignore the memcg's refcnt.
2013          */
2014         if (!batch->memcg)
2015                 batch->memcg = mem;
2016         /*
2017          * In the typical case, batch->memcg == mem. This means we can
2018          * merge a series of uncharges into a single res_counter uncharge.
2019          * If not, we uncharge the res_counter one by one.
2020          */
2021         if (batch->memcg != mem)
2022                 goto direct_uncharge;
2023         /* remember freed charge and uncharge it later */
2024         batch->bytes += PAGE_SIZE;
2025         if (uncharge_memsw)
2026                 batch->memsw_bytes += PAGE_SIZE;
2027         return;
2028 direct_uncharge:
2029         res_counter_uncharge(&mem->res, PAGE_SIZE);
2030         if (uncharge_memsw)
2031                 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2032         return;
2033 }
2034
2035 /*
2036  * uncharge if !page_mapped(page)
2037  */
2038 static struct mem_cgroup *
2039 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2040 {
2041         struct page_cgroup *pc;
2042         struct mem_cgroup *mem = NULL;
2043         struct mem_cgroup_per_zone *mz;
2044
2045         if (mem_cgroup_disabled())
2046                 return NULL;
2047
2048         if (PageSwapCache(page))
2049                 return NULL;
2050
2051         /*
2052          * Check if our page_cgroup is valid
2053          */
2054         pc = lookup_page_cgroup(page);
2055         if (unlikely(!pc || !PageCgroupUsed(pc)))
2056                 return NULL;
2057
2058         lock_page_cgroup(pc);
2059
2060         mem = pc->mem_cgroup;
2061
2062         if (!PageCgroupUsed(pc))
2063                 goto unlock_out;
2064
2065         switch (ctype) {
2066         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2067         case MEM_CGROUP_CHARGE_TYPE_DROP:
2068                 if (page_mapped(page))
2069                         goto unlock_out;
2070                 break;
2071         case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2072                 if (!PageAnon(page)) {  /* Shared memory */
2073                         if (page->mapping && !page_is_file_cache(page))
2074                                 goto unlock_out;
2075                 } else if (page_mapped(page)) /* Anon */
2076                                 goto unlock_out;
2077                 break;
2078         default:
2079                 break;
2080         }
2081
2082         if (!mem_cgroup_is_root(mem))
2083                 __do_uncharge(mem, ctype);
2084         if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2085                 mem_cgroup_swap_statistics(mem, true);
2086         mem_cgroup_charge_statistics(mem, pc, false);
2087
2088         ClearPageCgroupUsed(pc);
2089         /*
2090          * pc->mem_cgroup is not cleared here. It will be accessed when it's
2091          * freed from the LRU. This is safe because an uncharged page is expected
2092          * not to be reused (it is freed soon). The exception is SwapCache, which
2093          * is handled by special functions.
2094          */
2095
2096         mz = page_cgroup_zoneinfo(pc);
2097         unlock_page_cgroup(pc);
2098
2099         if (mem_cgroup_soft_limit_check(mem))
2100                 mem_cgroup_update_tree(mem, page);
2101         /* at swapout, this memcg will be accessed to record to swap */
2102         if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2103                 css_put(&mem->css);
2104
2105         return mem;
2106
2107 unlock_out:
2108         unlock_page_cgroup(pc);
2109         return NULL;
2110 }
2111
2112 void mem_cgroup_uncharge_page(struct page *page)
2113 {
2114         /* early check. */
2115         if (page_mapped(page))
2116                 return;
2117         if (page->mapping && !PageAnon(page))
2118                 return;
2119         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2120 }
2121
2122 void mem_cgroup_uncharge_cache_page(struct page *page)
2123 {
2124         VM_BUG_ON(page_mapped(page));
2125         VM_BUG_ON(page->mapping);
2126         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2127 }
2128
2129 /*
2130  * uncharge_start/uncharge_end is called in unmap_page_range/invalidate/truncate.
2131  * In those cases, pages are freed continuously and we can expect that they
2132  * are in the same memcg. Each of these callers itself limits the number of
2133  * pages freed at once, so uncharge_start/end() is called over a bounded batch.
2134  * This may be called multiple (nested) times in a context.
2135  */
2136
2137 void mem_cgroup_uncharge_start(void)
2138 {
2139         current->memcg_batch.do_batch++;
2140         /* Nesting is allowed. */
2141         if (current->memcg_batch.do_batch == 1) {
2142                 current->memcg_batch.memcg = NULL;
2143                 current->memcg_batch.bytes = 0;
2144                 current->memcg_batch.memsw_bytes = 0;
2145         }
2146 }
2147
2148 void mem_cgroup_uncharge_end(void)
2149 {
2150         struct memcg_batch_info *batch = &current->memcg_batch;
2151
2152         if (!batch->do_batch)
2153                 return;
2154
2155         batch->do_batch--;
2156         if (batch->do_batch) /* If stacked, do nothing. */
2157                 return;
2158
2159         if (!batch->memcg)
2160                 return;
2161         /*
2162          * This "batch->memcg" is valid without any css_get/put etc.
2163          * because we hide the charges behind us.
2164          */
2165         if (batch->bytes)
2166                 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2167         if (batch->memsw_bytes)
2168                 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2169         /* forget this pointer (for sanity check) */
2170         batch->memcg = NULL;
2171 }
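
/*
 * Illustrative sketch (example only, not part of this file) of the batching
 * described above: a truncate/unmap style loop brackets its per-page
 * uncharges with uncharge_start()/uncharge_end() so the res_counter is
 * updated once for the whole batch instead of once per page.  The page
 * array and count here are hypothetical.
 */
#if 0
static void example_drop_pages(struct page **pages, int nr)
{
        int i;

        mem_cgroup_uncharge_start();
        for (i = 0; i < nr; i++)
                mem_cgroup_uncharge_cache_page(pages[i]);
        mem_cgroup_uncharge_end();      /* coalesced res_counter uncharge happens here */
}
#endif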
2172
2173 #ifdef CONFIG_SWAP
2174 /*
2175  * Called after __delete_from_swap_cache() to drop the "page" account.
2176  * The memcg information is recorded in the swap_cgroup of "ent".
2177  */
2178 void
2179 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2180 {
2181         struct mem_cgroup *memcg;
2182         int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2183
2184         if (!swapout) /* this was a swap cache but the swap is unused ! */
2185                 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2186
2187         memcg = __mem_cgroup_uncharge_common(page, ctype);
2188
2189         /* record memcg information */
2190         if (do_swap_account && swapout && memcg) {
2191                 swap_cgroup_record(ent, css_id(&memcg->css));
2192                 mem_cgroup_get(memcg);
2193         }
2194         if (swapout && memcg)
2195                 css_put(&memcg->css);
2196 }
2197 #endif
2198
2199 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2200 /*
2201  * Called from swap_entry_free(). Remove the record in swap_cgroup and
2202  * uncharge the "memsw" account.
2203  */
2204 void mem_cgroup_uncharge_swap(swp_entry_t ent)
2205 {
2206         struct mem_cgroup *memcg;
2207         unsigned short id;
2208
2209         if (!do_swap_account)
2210                 return;
2211
2212         id = swap_cgroup_record(ent, 0);
2213         rcu_read_lock();
2214         memcg = mem_cgroup_lookup(id);
2215         if (memcg) {
2216                 /*
2217                  * We uncharge this because swap is freed.
2218                  * This memcg can be an obsolete one. We avoid calling css_tryget().
2219                  */
2220                 if (!mem_cgroup_is_root(memcg))
2221                         res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2222                 mem_cgroup_swap_statistics(memcg, false);
2223                 mem_cgroup_put(memcg);
2224         }
2225         rcu_read_unlock();
2226 }
2227 #endif
2228
2229 /*
2230  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2231  * page belongs to.
2232  */
2233 int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2234 {
2235         struct page_cgroup *pc;
2236         struct mem_cgroup *mem = NULL;
2237         int ret = 0;
2238
2239         if (mem_cgroup_disabled())
2240                 return 0;
2241
2242         pc = lookup_page_cgroup(page);
2243         lock_page_cgroup(pc);
2244         if (PageCgroupUsed(pc)) {
2245                 mem = pc->mem_cgroup;
2246                 css_get(&mem->css);
2247         }
2248         unlock_page_cgroup(pc);
2249
2250         if (mem) {
2251                 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2252                                                 page);
2253                 css_put(&mem->css);
2254         }
2255         *ptr = mem;
2256         return ret;
2257 }
2258
2259 /* remove redundant charge if migration failed */
2260 void mem_cgroup_end_migration(struct mem_cgroup *mem,
2261                 struct page *oldpage, struct page *newpage)
2262 {
2263         struct page *target, *unused;
2264         struct page_cgroup *pc;
2265         enum charge_type ctype;
2266
2267         if (!mem)
2268                 return;
2269         cgroup_exclude_rmdir(&mem->css);
2270         /* at migration success, oldpage->mapping is NULL. */
2271         if (oldpage->mapping) {
2272                 target = oldpage;
2273                 unused = NULL;
2274         } else {
2275                 target = newpage;
2276                 unused = oldpage;
2277         }
2278
2279         if (PageAnon(target))
2280                 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2281         else if (page_is_file_cache(target))
2282                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2283         else
2284                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2285
2286         /* unused page is not on radix-tree now. */
2287         if (unused)
2288                 __mem_cgroup_uncharge_common(unused, ctype);
2289
2290         pc = lookup_page_cgroup(target);
2291         /*
2292          * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup,
2293          * so double-counting is effectively avoided.
2294          */
2295         __mem_cgroup_commit_charge(mem, pc, ctype);
2296
2297         /*
2298          * Both of oldpage and newpage are still under lock_page().
2299          * Both oldpage and newpage are still under lock_page(),
2300          * so we don't have to care about races in the radix-tree.
2301          *
2302          * There is a case for !page_mapped(). At the start of
2303          * migration, oldpage was mapped. But now, it's zapped.
2304          * But we know *target* page is not freed/reused under us.
2305          * mem_cgroup_uncharge_page() does all necessary checks.
2306          */
2307         if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2308                 mem_cgroup_uncharge_page(target);
2309         /*
2310          * At migration, we may charge against a cgroup which has no tasks,
2311          * so rmdir()->pre_destroy() can be called while we do this charge.
2312          * In that case, we need to call pre_destroy() again. Check it here.
2313          */
2314         cgroup_release_and_wakeup_rmdir(&mem->css);
2315 }
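
/*
 * Illustrative sketch (example only, not part of this file) of the migration
 * protocol around the two helpers above: the charge is prepared against the
 * old page's memcg before the copy, and end_migration() commits it to
 * whichever page survived (and uncharges the unused one).  The real caller is
 * the page-migration code; the copy step is elided here.
 */
#if 0
static int example_migrate_one_page(struct page *oldpage, struct page *newpage)
{
        struct mem_cgroup *mem = NULL;
        int ret;

        ret = mem_cgroup_prepare_migration(oldpage, &mem);
        if (ret)
                return ret;
        /* ... copy contents and switch mappings; this may succeed or fail ... */
        mem_cgroup_end_migration(mem, oldpage, newpage);
        return 0;
}
#endif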
2316
2317 /*
2318  * A call to try to shrink memory usage on charge failure at shmem's swapin.
2319  * Calling hierarchical_reclaim is not enough because we should update
2320  * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2321  * Moreover, considering the hierarchy, we should reclaim from the mem_over_limit,
2322  * not from the memcg which this page would be charged to.
2323  * try_charge_swapin does all of this work properly.
2324  */
2325 int mem_cgroup_shmem_charge_fallback(struct page *page,
2326                             struct mm_struct *mm,
2327                             gfp_t gfp_mask)
2328 {
2329         struct mem_cgroup *mem = NULL;
2330         int ret;
2331
2332         if (mem_cgroup_disabled())
2333                 return 0;
2334
2335         ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2336         if (!ret)
2337                 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
2338
2339         return ret;
2340 }
2341
2342 static DEFINE_MUTEX(set_limit_mutex);
2343
2344 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2345                                 unsigned long long val)
2346 {
2347         int retry_count;
2348         u64 memswlimit;
2349         int ret = 0;
2350         int children = mem_cgroup_count_children(memcg);
2351         u64 curusage, oldusage;
2352
2353         /*
2354          * To keep hierarchical_reclaim simple, how long we should retry
2355          * depends on the caller. We set our retry count to be a function
2356          * of the number of children which we should visit in this loop.
2357          */
2358         retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2359
2360         oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2361
2362         while (retry_count) {
2363                 if (signal_pending(current)) {
2364                         ret = -EINTR;
2365                         break;
2366                 }
2367                 /*
2368                  * Rather than hiding all of this in some function, I do it in an
2369                  * open-coded manner so you can see what it really does.
2370                  * We have to guarantee mem->res.limit <= mem->memsw.limit.
2371                  */
2372                 mutex_lock(&set_limit_mutex);
2373                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2374                 if (memswlimit < val) {
2375                         ret = -EINVAL;
2376                         mutex_unlock(&set_limit_mutex);
2377                         break;
2378                 }
2379                 ret = res_counter_set_limit(&memcg->res, val);
2380                 if (!ret) {
2381                         if (memswlimit == val)
2382                                 memcg->memsw_is_minimum = true;
2383                         else
2384                                 memcg->memsw_is_minimum = false;
2385                 }
2386                 mutex_unlock(&set_limit_mutex);
2387
2388                 if (!ret)
2389                         break;
2390
2391                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2392                                                 MEM_CGROUP_RECLAIM_SHRINK);
2393                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2394                 /* Usage is reduced ? */
2395                 if (curusage >= oldusage)
2396                         retry_count--;
2397                 else
2398                         oldusage = curusage;
2399         }
2400
2401         return ret;
2402 }
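
/*
 * Worked example (illustrative, assuming the usual memcg control-file names):
 * if memory.memsw.limit_in_bytes is currently 512M, writing 1G to
 * memory.limit_in_bytes fails with -EINVAL at the "memswlimit < val" check
 * above; the memsw limit has to be raised first, so that
 * mem->res.limit <= mem->memsw.limit always holds.
 */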
2403
2404 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2405                                         unsigned long long val)
2406 {
2407         int retry_count;
2408         u64 memlimit, oldusage, curusage;
2409         int children = mem_cgroup_count_children(memcg);
2410         int ret = -EBUSY;
2411
2412         /* see mem_cgroup_resize_limit() */
2413         retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2414         oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2415         while (retry_count) {
2416                 if (signal_pending(current)) {
2417                         ret = -EINTR;
2418                         break;
2419                 }
2420                 /*
2421                  * Rather than hiding all of this in some function, I do it in an
2422                  * open-coded manner so you can see what it really does.
2423                  * We have to guarantee mem->res.limit <= mem->memsw.limit.
2424                  */
2425                 mutex_lock(&set_limit_mutex);
2426                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2427                 if (memlimit > val) {
2428                         ret = -EINVAL;
2429                         mutex_unlock(&set_limit_mutex);
2430                         break;
2431                 }
2432                 ret = res_counter_set_limit(&memcg->memsw, val);
2433                 if (!ret) {
2434                         if (memlimit == val)
2435                                 memcg->memsw_is_minimum = true;
2436                         else
2437                                 memcg->memsw_is_minimum = false;
2438                 }
2439                 mutex_unlock(&set_limit_mutex);
2440
2441                 if (!ret)
2442                         break;
2443
2444                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2445                                                 MEM_CGROUP_RECLAIM_NOSWAP |
2446                                                 MEM_CGROUP_RECLAIM_SHRINK);
2447                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2448                 /* Usage is reduced ? */
2449                 if (curusage >= oldusage)
2450                         retry_count--;
2451                 else
2452                         oldusage = curusage;
2453         }
2454         return ret;
2455 }
2456
2457 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2458                                                 gfp_t gfp_mask, int nid,
2459                                                 int zid)
2460 {
2461         unsigned long nr_reclaimed = 0;
2462         struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2463         unsigned long reclaimed;
2464         int loop = 0;
2465         struct mem_cgroup_tree_per_zone *mctz;
2466         unsigned long long excess;
2467
2468         if (order > 0)
2469                 return 0;
2470
2471         mctz = soft_limit_tree_node_zone(nid, zid);
2472         /*
2473          * This loop can run for a while, especially if mem_cgroups continuously
2474          * keep exceeding their soft limit and putting the system under
2475          * pressure.
2476          */
2477         do {
2478                 if (next_mz)
2479                         mz = next_mz;
2480                 else
2481                         mz = mem_cgroup_largest_soft_limit_node(mctz);
2482                 if (!mz)
2483                         break;
2484
2485                 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2486                                                 gfp_mask,
2487                                                 MEM_CGROUP_RECLAIM_SOFT);
2488                 nr_reclaimed += reclaimed;
2489                 spin_lock(&mctz->lock);
2490
2491                 /*
2492                  * If we failed to reclaim anything from this memory cgroup
2493                  * it is time to move on to the next cgroup
2494                  */
2495                 next_mz = NULL;
2496                 if (!reclaimed) {
2497                         do {
2498                                 /*
2499                                  * Loop until we find yet another one.
2500                                  *
2501                                  * By the time we get the soft_limit lock
2502                                  * again, someone might have added the
2503                                  * group back on the RB tree. Iterate to
2504                                  * make sure we get a different mem.
2505                                  * mem_cgroup_largest_soft_limit_node returns
2506                                  * NULL if no other cgroup is present on
2507                                  * the tree
2508                                  */
2509                                 next_mz =
2510                                 __mem_cgroup_largest_soft_limit_node(mctz);
2511                                 if (next_mz == mz) {
2512                                         css_put(&next_mz->mem->css);
2513                                         next_mz = NULL;
2514                                 } else /* next_mz == NULL or other memcg */
2515                                         break;
2516                         } while (1);
2517                 }
2518                 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2519                 excess = res_counter_soft_limit_excess(&mz->mem->res);
2520                 /*
2521                  * One school of thought says that we should not add
2522                  * back the node to the tree if reclaim returns 0.
2523                  * But our reclaim could return 0, simply because due
2524                  * to priority we are exposing a smaller subset of
2525                  * memory to reclaim from. Consider this as a longer
2526                  * term TODO.
2527                  */
2528                 /* If excess == 0, no tree ops */
2529                 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2530                 spin_unlock(&mctz->lock);
2531                 css_put(&mz->mem->css);
2532                 loop++;
2533                 /*
2534                  * Could not reclaim anything and there are no more
2535                  * mem cgroups to try or we seem to be looping without
2536                  * reclaiming anything.
2537                  */
2538                 if (!nr_reclaimed &&
2539                         (next_mz == NULL ||
2540                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2541                         break;
2542         } while (!nr_reclaimed);
2543         if (next_mz)
2544                 css_put(&next_mz->mem->css);
2545         return nr_reclaimed;
2546 }
2547
2548 /*
2549  * This routine traverses the page_cgroups in the given list and drops them all.
2550  * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
2551  */
2552 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2553                                 int node, int zid, enum lru_list lru)
2554 {
2555         struct zone *zone;
2556         struct mem_cgroup_per_zone *mz;
2557         struct page_cgroup *pc, *busy;
2558         unsigned long flags, loop;
2559         struct list_head *list;
2560         int ret = 0;
2561
2562         zone = &NODE_DATA(node)->node_zones[zid];
2563         mz = mem_cgroup_zoneinfo(mem, node, zid);
2564         list = &mz->lists[lru];
2565
2566         loop = MEM_CGROUP_ZSTAT(mz, lru);
2567         /* give some margin against EBUSY etc... */
2568         loop += 256;
2569         busy = NULL;
2570         while (loop--) {
2571                 ret = 0;
2572                 spin_lock_irqsave(&zone->lru_lock, flags);
2573                 if (list_empty(list)) {
2574                         spin_unlock_irqrestore(&zone->lru_lock, flags);
2575                         break;
2576                 }
2577                 pc = list_entry(list->prev, struct page_cgroup, lru);
2578                 if (busy == pc) {
2579                         list_move(&pc->lru, list);
2580                         busy = NULL;
2581                         spin_unlock_irqrestore(&zone->lru_lock, flags);
2582                         continue;
2583                 }
2584                 spin_unlock_irqrestore(&zone->lru_lock, flags);
2585
2586                 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
2587                 if (ret == -ENOMEM)
2588                         break;
2589
2590                 if (ret == -EBUSY || ret == -EINVAL) {
2591                         /* found lock contention or "pc" is obsolete. */
2592                         busy = pc;
2593                         cond_resched();
2594                 } else
2595                         busy = NULL;
2596         }
2597
2598         if (!ret && !list_empty(list))
2599                 return -EBUSY;
2600         return ret;
2601 }
2602
2603 /*
2604  * Make the mem_cgroup's charge 0 if there is no task.
2605  * This enables deleting this mem_cgroup.
2606  */
2607 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2608 {
2609         int ret;
2610         int node, zid, shrink;
2611         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2612         struct cgroup *cgrp = mem->css.cgroup;
2613
2614         css_get(&mem->css);
2615
2616         shrink = 0;
2617         /* should free all ? */
2618         if (free_all)
2619                 goto try_to_free;
2620 move_account:
2621         do {
2622                 ret = -EBUSY;
2623                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2624                         goto out;
2625                 ret = -EINTR;
2626                 if (signal_pending(current))
2627                         goto out;
2628                 /* This is for making all *used* pages be on the LRU. */
2629                 lru_add_drain_all();
2630                 drain_all_stock_sync();
2631                 ret = 0;
2632                 for_each_node_state(node, N_HIGH_MEMORY) {
2633                         for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
2634                                 enum lru_list l;
2635                                 for_each_lru(l) {
2636                                         ret = mem_cgroup_force_empty_list(mem,
2637                                                         node, zid, l);
2638                                         if (ret)
2639                                                 break;
2640                                 }
2641                         }
2642                         if (ret)
2643                                 break;
2644                 }
2645                 /* it seems parent cgroup doesn't have enough mem */
2646                 if (ret == -ENOMEM)
2647                         goto try_to_free;
2648                 cond_resched();
2649         /* "ret" should also be checked to ensure all lists are empty. */
2650         } while (mem->res.usage > 0 || ret);
2651 out:
2652         css_put(&mem->css);
2653         return ret;
2654
2655 try_to_free:
2656         /* returns EBUSY if there is a task or if we come here twice. */
2657         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
2658                 ret = -EBUSY;
2659                 goto out;
2660         }
2661         /* we call try-to-free pages to make this cgroup empty */
2662         lru_add_drain_all();
2663         /* try to free all pages in this cgroup */
2664         shrink = 1;
2665         while (nr_retries && mem->res.usage > 0) {
2666                 int progress;
2667
2668                 if (signal_pending(current)) {
2669                         ret = -EINTR;
2670                         goto out;
2671                 }
2672                 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
2673                                                 false, get_swappiness(mem));
2674                 if (!progress) {
2675                         nr_retries--;
2676                         /* maybe some writeback is necessary */
2677                         congestion_wait(BLK_RW_ASYNC, HZ/10);
2678                 }
2679
2680         }
2681         lru_add_drain();
2682         /* try move_account...there may be some *locked* pages. */
2683         goto move_account;
2684 }
2685
2686 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
2687 {
2688         return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
2689 }
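
/*
 * Illustrative note: this trigger is what backs the memory.force_empty
 * control file in the memcg interface; writing to it asks the kernel to drop
 * every charge from a cgroup that has no tasks, first by reclaiming pages and
 * then by moving any remaining charges to the parent, as implemented by
 * mem_cgroup_force_empty() above.
 */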
2690
2691
2692 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
2693 {
2694         return mem_cgroup_from_cont(cont)->use_hierarchy;
2695 }
2696
2697 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2698                                         u64 val)
2699 {
2700         int retval = 0;
2701         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2702         struct cgroup *parent = cont->parent;
2703         struct mem_cgroup *parent_mem = NULL;
2704
2705         if (parent)
2706                 parent_mem = mem_cgroup_from_cont(parent);
2707
2708         cgroup_lock();
2709         /*
2710          * If parent's use_hierarchy is set, we can't make any modifications
2711          * in the child subtrees. If it is unset, then the change can
2712          * occur, provided the current cgroup has no children.
2713          *
2714          * For the root cgroup, parent_mem is NULL; we allow the value to be
2715          * set if there are no children.
2716          */
2717         if ((!parent_mem || !parent_mem->use_hierarchy) &&
2718                                 (val == 1 || val == 0)) {
2719                 if (list_empty(&cont->children))
2720                         mem->use_hierarchy = val;
2721                 else
2722                         retval = -EBUSY;
2723         } else
2724                 retval = -EINVAL;
2725         cgroup_unlock();
2726
2727         return retval;
2728 }
2729
2730 struct mem_cgroup_idx_data {
2731         s64 val;
2732         enum mem_cgroup_stat_index idx;
2733 };
2734
2735 static int
2736 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2737 {
2738         struct mem_cgroup_idx_data *d = data;
2739         d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2740         return 0;
2741 }
2742
2743 static void
2744 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2745                                 enum mem_cgroup_stat_index idx, s64 *val)
2746 {
2747         struct mem_cgroup_idx_data d;
2748         d.idx = idx;
2749         d.val = 0;
2750         mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2751         *val = d.val;
2752 }
2753
2754 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2755 {
2756         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2757         u64 idx_val, val;
2758         int type, name;
2759
2760         type = MEMFILE_TYPE(cft->private);
2761         name = MEMFILE_ATTR(cft->private);
2762         switch (type) {
2763         case _MEM:
2764                 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2765                         mem_cgroup_get_recursive_idx_stat(mem,
2766                                 MEM_CGROUP_STAT_CACHE, &idx_val);
2767                         val = idx_val;
2768                         mem_cgroup_get_recursive_idx_stat(mem,
2769                                 MEM_CGROUP_STAT_RSS, &idx_val);
2770                         val += idx_val;
2771                         val <<= PAGE_SHIFT;
2772                 } else
2773                         val = res_counter_read_u64(&mem->res, name);
2774                 break;
2775         case _MEMSWAP:
2776                 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2777                         mem_cgroup_get_recursive_idx_stat(mem,
2778                                 MEM_CGROUP_STAT_CACHE, &idx_val);
2779                         val = idx_val;
2780                         mem_cgroup_get_recursive_idx_stat(mem,
2781                                 MEM_CGROUP_STAT_RSS, &idx_val);
2782                         val += idx_val;
2783                         mem_cgroup_get_recursive_idx_stat(mem,
2784                                 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2785                         val += idx_val;
2786                         val <<= PAGE_SHIFT;
2787                 } else
2788                         val = res_counter_read_u64(&mem->memsw, name);
2789                 break;
2790         default:
2791                 BUG();
2792                 break;
2793         }
2794         return val;
2795 }
2796 /*
2797  * The users of this function are...
2798  * RES_LIMIT and RES_SOFT_LIMIT.
2799  */
2800 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2801                             const char *buffer)
2802 {
2803         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2804         int type, name;
2805         unsigned long long val;
2806         int ret;
2807
2808         type = MEMFILE_TYPE(cft->private);
2809         name = MEMFILE_ATTR(cft->private);
2810         switch (name) {
2811         case RES_LIMIT:
2812                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2813                         ret = -EINVAL;
2814                         break;
2815                 }
2816                 /* This function does all the necessary parsing; reuse it. */
2817                 ret = res_counter_memparse_write_strategy(buffer, &val);
2818                 if (ret)
2819                         break;
2820                 if (type == _MEM)
2821                         ret = mem_cgroup_resize_limit(memcg, val);
2822                 else
2823                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
2824                 break;
2825         case RES_SOFT_LIMIT:
2826                 ret = res_counter_memparse_write_strategy(buffer, &val);
2827                 if (ret)
2828                         break;
2829                 /*
2830                  * For memsw, soft limits are hard to implement in terms
2831                  * of semantics. For now, we support soft limits only for
2832                  * control without swap.
2833                  */
2834                 if (type == _MEM)
2835                         ret = res_counter_set_soft_limit(&memcg->res, val);
2836                 else
2837                         ret = -EINVAL;
2838                 break;
2839         default:
2840                 ret = -EINVAL; /* should be BUG() ? */
2841                 break;
2842         }
2843         return ret;
2844 }
2845
2846 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
2847                 unsigned long long *mem_limit, unsigned long long *memsw_limit)
2848 {
2849         struct cgroup *cgroup;
2850         unsigned long long min_limit, min_memsw_limit, tmp;
2851
2852         min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2853         min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2854         cgroup = memcg->css.cgroup;
2855         if (!memcg->use_hierarchy)
2856                 goto out;
2857
2858         while (cgroup->parent) {
2859                 cgroup = cgroup->parent;
2860                 memcg = mem_cgroup_from_cont(cgroup);
2861                 if (!memcg->use_hierarchy)
2862                         break;
2863                 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
2864                 min_limit = min(min_limit, tmp);
2865                 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2866                 min_memsw_limit = min(min_memsw_limit, tmp);
2867         }
2868 out:
2869         *mem_limit = min_limit;
2870         *memsw_limit = min_memsw_limit;
2871         return;
2872 }
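
/*
 * Worked example (illustrative): with use_hierarchy enabled, a cgroup whose
 * own limit is 2G sitting under a parent limited to 1G reports
 * hierarchical_memory_limit = 1G, because the loop above takes the minimum
 * limit along the path toward the root.
 */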
2873
2874 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2875 {
2876         struct mem_cgroup *mem;
2877         int type, name;
2878
2879         mem = mem_cgroup_from_cont(cont);
2880         type = MEMFILE_TYPE(event);
2881         name = MEMFILE_ATTR(event);
2882         switch (name) {
2883         case RES_MAX_USAGE:
2884                 if (type == _MEM)
2885                         res_counter_reset_max(&mem->res);
2886                 else
2887                         res_counter_reset_max(&mem->memsw);
2888                 break;
2889         case RES_FAILCNT:
2890                 if (type == _MEM)
2891                         res_counter_reset_failcnt(&mem->res);
2892                 else
2893                         res_counter_reset_failcnt(&mem->memsw);
2894                 break;
2895         }
2896
2897         return 0;
2898 }
2899
2900 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
2901                                         struct cftype *cft)
2902 {
2903         return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
2904 }
2905
2906 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
2907                                         struct cftype *cft, u64 val)
2908 {
2909         struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
2910
2911         if (val >= (1 << NR_MOVE_TYPE))
2912                 return -EINVAL;
2913         /*
2914          * We check this value several times in both can_attach() and
2915          * attach(), so we need the cgroup lock to prevent this value from being
2916          * inconsistent.
2917          */
2918         cgroup_lock();
2919         mem->move_charge_at_immigrate = val;
2920         cgroup_unlock();
2921
2922         return 0;
2923 }
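
/*
 * Illustrative note: this pair of handlers backs the
 * memory.move_charge_at_immigrate control file. The value written is a
 * bitmask of charge types to move (values >= 1 << NR_MOVE_TYPE are rejected
 * above), and it is consulted at task-attach time in can_attach()/attach()
 * under the cgroup lock, as the comment above explains.
 */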
2924
2925
2926 /* For reading statistics */
2927 enum {
2928         MCS_CACHE,
2929         MCS_RSS,
2930         MCS_FILE_MAPPED,
2931         MCS_PGPGIN,
2932         MCS_PGPGOUT,
2933         MCS_SWAP,
2934         MCS_INACTIVE_ANON,
2935         MCS_ACTIVE_ANON,
2936         MCS_INACTIVE_FILE,
2937         MCS_ACTIVE_FILE,
2938         MCS_UNEVICTABLE,
2939         NR_MCS_STAT,
2940 };
2941
2942 struct mcs_total_stat {
2943         s64 stat[NR_MCS_STAT];
2944 };
2945
2946 struct {
2947         char *local_name;
2948         char *total_name;
2949 } memcg_stat_strings[NR_MCS_STAT] = {
2950         {"cache", "total_cache"},
2951         {"rss", "total_rss"},
2952         {"mapped_file", "total_mapped_file"},
2953         {"pgpgin", "total_pgpgin"},
2954         {"pgpgout", "total_pgpgout"},
2955         {"swap", "total_swap"},
2956         {"inactive_anon", "total_inactive_anon"},
2957         {"active_anon", "total_active_anon"},
2958         {"inactive_file", "total_inactive_file"},
2959         {"active_file", "total_active_file"},
2960         {"unevictable", "total_unevictable"}
2961 };
2962
2963
2964 static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2965 {
2966         struct mcs_total_stat *s = data;
2967         s64 val;
2968
2969         /* per cpu stat */
2970         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2971         s->stat[MCS_CACHE] += val * PAGE_SIZE;
2972         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2973         s->stat[MCS_RSS] += val * PAGE_SIZE;
2974         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
2975         s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
2976         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2977         s->stat[MCS_PGPGIN] += val;
2978         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2979         s->stat[MCS_PGPGOUT] += val;
2980         if (do_swap_account) {
2981                 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2982                 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2983         }
2984
2985         /* per zone stat */
2986         val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2987         s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2988         val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2989         s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2990         val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2991         s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2992         val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2993         s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2994         val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2995         s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2996         return 0;
2997 }
2998
2999 static void
3000 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3001 {
3002         mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
3003 }
3004
3005 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3006                                  struct cgroup_map_cb *cb)
3007 {
3008         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3009         struct mcs_total_stat mystat;
3010         int i;
3011
3012         memset(&mystat, 0, sizeof(mystat));
3013         mem_cgroup_get_local_stat(mem_cont, &mystat);
3014
3015         for (i = 0; i < NR_MCS_STAT; i++) {
3016                 if (i == MCS_SWAP && !do_swap_account)
3017                         continue;
3018                 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3019         }
3020
3021         /* Hierarchical information */
3022         {
3023                 unsigned long long limit, memsw_limit;
3024                 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3025                 cb->fill(cb, "hierarchical_memory_limit", limit);
3026                 if (do_swap_account)
3027                         cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3028         }
3029
3030         memset(&mystat, 0, sizeof(mystat));
3031         mem_cgroup_get_total_stat(mem_cont, &mystat);
3032         for (i = 0; i < NR_MCS_STAT; i++) {
3033                 if (i == MCS_SWAP && !do_swap_account)
3034                         continue;
3035                 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3036         }
3037
3038 #ifdef CONFIG_DEBUG_VM
3039         cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3040
3041         {
3042                 int nid, zid;
3043                 struct mem_cgroup_per_zone *mz;
3044                 unsigned long recent_rotated[2] = {0, 0};
3045                 unsigned long recent_scanned[2] = {0, 0};
3046
3047                 for_each_online_node(nid)
3048                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3049                                 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3050
3051                                 recent_rotated[0] +=
3052                                         mz->reclaim_stat.recent_rotated[0];
3053                                 recent_rotated[1] +=
3054                                         mz->reclaim_stat.recent_rotated[1];
3055                                 recent_scanned[0] +=
3056                                         mz->reclaim_stat.recent_scanned[0];
3057                                 recent_scanned[1] +=
3058                                         mz->reclaim_stat.recent_scanned[1];
3059                         }
3060                 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3061                 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3062                 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3063                 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3064         }
3065 #endif
3066
3067         return 0;
3068 }
3069
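     /*
      * "memory.swappiness" tunes reclaim inside this memcg analogously to
      * the global vm.swappiness; valid values are 0..100.  A sketch of the
      * intended usage, assuming a cgroupfs mount at /cgroup:
      *
      *   echo 30 > /cgroup/<group>/memory.swappiness
      */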
3070 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3071 {
3072         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3073
3074         return get_swappiness(memcg);
3075 }
3076
3077 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3078                                        u64 val)
3079 {
3080         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3081         struct mem_cgroup *parent;
3082
3083         if (val > 100)
3084                 return -EINVAL;
3085
3086         if (cgrp->parent == NULL)
3087                 return -EINVAL;
3088
3089         parent = mem_cgroup_from_cont(cgrp->parent);
3090
3091         cgroup_lock();
3092
3093         /* If under hierarchy, only empty-root can set this value */
3094         if ((parent->use_hierarchy) ||
3095             (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3096                 cgroup_unlock();
3097                 return -EINVAL;
3098         }
3099
3100         spin_lock(&memcg->reclaim_param_lock);
3101         memcg->swappiness = val;
3102         spin_unlock(&memcg->reclaim_param_lock);
3103
3104         cgroup_unlock();
3105
3106         return 0;
3107 }
3108
3109
3110 static struct cftype mem_cgroup_files[] = {
3111         {
3112                 .name = "usage_in_bytes",
3113                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3114                 .read_u64 = mem_cgroup_read,
3115         },
3116         {
3117                 .name = "max_usage_in_bytes",
3118                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3119                 .trigger = mem_cgroup_reset,
3120                 .read_u64 = mem_cgroup_read,
3121         },
3122         {
3123                 .name = "limit_in_bytes",
3124                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3125                 .write_string = mem_cgroup_write,
3126                 .read_u64 = mem_cgroup_read,
3127         },
3128         {
3129                 .name = "soft_limit_in_bytes",
3130                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3131                 .write_string = mem_cgroup_write,
3132                 .read_u64 = mem_cgroup_read,
3133         },
3134         {
3135                 .name = "failcnt",
3136                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3137                 .trigger = mem_cgroup_reset,
3138                 .read_u64 = mem_cgroup_read,
3139         },
3140         {
3141                 .name = "stat",
3142                 .read_map = mem_control_stat_show,
3143         },
3144         {
3145                 .name = "force_empty",
3146                 .trigger = mem_cgroup_force_empty_write,
3147         },
3148         {
3149                 .name = "use_hierarchy",
3150                 .write_u64 = mem_cgroup_hierarchy_write,
3151                 .read_u64 = mem_cgroup_hierarchy_read,
3152         },
3153         {
3154                 .name = "swappiness",
3155                 .read_u64 = mem_cgroup_swappiness_read,
3156                 .write_u64 = mem_cgroup_swappiness_write,
3157         },
3158         {
3159                 .name = "move_charge_at_immigrate",
3160                 .read_u64 = mem_cgroup_move_charge_read,
3161                 .write_u64 = mem_cgroup_move_charge_write,
3162         },
3163 };
3164
3165 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3166 static struct cftype memsw_cgroup_files[] = {
3167         {
3168                 .name = "memsw.usage_in_bytes",
3169                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3170                 .read_u64 = mem_cgroup_read,
3171         },
3172         {
3173                 .name = "memsw.max_usage_in_bytes",
3174                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3175                 .trigger = mem_cgroup_reset,
3176                 .read_u64 = mem_cgroup_read,
3177         },
3178         {
3179                 .name = "memsw.limit_in_bytes",
3180                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3181                 .write_string = mem_cgroup_write,
3182                 .read_u64 = mem_cgroup_read,
3183         },
3184         {
3185                 .name = "memsw.failcnt",
3186                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3187                 .trigger = mem_cgroup_reset,
3188                 .read_u64 = mem_cgroup_read,
3189         },
3190 };
3191
3192 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3193 {
3194         if (!do_swap_account)
3195                 return 0;
3196         return cgroup_add_files(cont, ss, memsw_cgroup_files,
3197                                 ARRAY_SIZE(memsw_cgroup_files));
3198 };
3199 #else
3200 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3201 {
3202         return 0;
3203 }
3204 #endif
3205
3206 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3207 {
3208         struct mem_cgroup_per_node *pn;
3209         struct mem_cgroup_per_zone *mz;
3210         enum lru_list l;
3211         int zone, tmp = node;
3212         /*
3213          * This routine is called against possible nodes.
3214          * But it's a BUG to call kmalloc() against an offline node.
3215          *
3216          * TODO: this routine can waste much memory for nodes which will
3217          *       never be onlined. It would be better to use a memory
3218          *       hotplug callback function.
3219          */
3220         if (!node_state(node, N_NORMAL_MEMORY))
3221                 tmp = -1;
3222         pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3223         if (!pn)
3224                 return 1;
3225
3226         mem->info.nodeinfo[node] = pn;
3227         memset(pn, 0, sizeof(*pn));
3228
3229         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3230                 mz = &pn->zoneinfo[zone];
3231                 for_each_lru(l)
3232                         INIT_LIST_HEAD(&mz->lists[l]);
3233                 mz->usage_in_excess = 0;
3234                 mz->on_tree = false;
3235                 mz->mem = mem;
3236         }
3237         return 0;
3238 }
3239
3240 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3241 {
3242         kfree(mem->info.nodeinfo[node]);
3243 }
3244
3245 static int mem_cgroup_size(void)
3246 {
3247         int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
3248         return sizeof(struct mem_cgroup) + cpustat_size;
3249 }
3250
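     /*
      * struct mem_cgroup is allocated together with one mem_cgroup_stat_cpu
      * slot per possible CPU id (see mem_cgroup_size()).  Once that exceeds
      * a page we fall back to vmalloc(); __mem_cgroup_free() mirrors the
      * choice with kfree()/vfree().
      */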
3251 static struct mem_cgroup *mem_cgroup_alloc(void)
3252 {
3253         struct mem_cgroup *mem;
3254         int size = mem_cgroup_size();
3255
3256         if (size < PAGE_SIZE)
3257                 mem = kmalloc(size, GFP_KERNEL);
3258         else
3259                 mem = vmalloc(size);
3260
3261         if (mem)
3262                 memset(mem, 0, size);
3263         return mem;
3264 }
3265
3266 /*
3267  * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
3268  * (Scanning them all at force_empty would be too costly...)
3269  *
3270  * Instead of clearing all references at force_empty, we remember
3271  * the number of references from swap_cgroup and free the mem_cgroup
3272  * when it drops to 0.
3273  *
3274  * Removal of the cgroup itself succeeds regardless of refs from swap.
3275  */
3276
3277 static void __mem_cgroup_free(struct mem_cgroup *mem)
3278 {
3279         int node;
3280
3281         mem_cgroup_remove_from_trees(mem);
3282         free_css_id(&mem_cgroup_subsys, &mem->css);
3283
3284         for_each_node_state(node, N_POSSIBLE)
3285                 free_mem_cgroup_per_zone_info(mem, node);
3286
3287         if (mem_cgroup_size() < PAGE_SIZE)
3288                 kfree(mem);
3289         else
3290                 vfree(mem);
3291 }
3292
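     /*
      * Reference counting for the mem_cgroup itself (see the comment
      * above): the final mem_cgroup_put() frees the structure and drops
      * the reference taken on the hierarchical parent at creation time.
      */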
3293 static void mem_cgroup_get(struct mem_cgroup *mem)
3294 {
3295         atomic_inc(&mem->refcnt);
3296 }
3297
3298 static void mem_cgroup_put(struct mem_cgroup *mem)
3299 {
3300         if (atomic_dec_and_test(&mem->refcnt)) {
3301                 struct mem_cgroup *parent = parent_mem_cgroup(mem);
3302                 __mem_cgroup_free(mem);
3303                 if (parent)
3304                         mem_cgroup_put(parent);
3305         }
3306 }
3307
3308 /*
3309  * Returns the parent mem_cgroup in the hierarchy, or NULL if there is none.
3310  */
3311 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
3312 {
3313         if (!mem->res.parent)
3314                 return NULL;
3315         return mem_cgroup_from_res_counter(mem->res.parent, res);
3316 }
3317
3318 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3319 static void __init enable_swap_cgroup(void)
3320 {
3321         if (!mem_cgroup_disabled() && really_do_swap_account)
3322                 do_swap_account = 1;
3323 }
3324 #else
3325 static void __init enable_swap_cgroup(void)
3326 {
3327 }
3328 #endif
3329
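     /*
      * Allocate the per-node/per-zone rb-trees used to track memcgs which
      * exceed their soft limit.  Returns 1 on allocation failure.
      */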
3330 static int mem_cgroup_soft_limit_tree_init(void)
3331 {
3332         struct mem_cgroup_tree_per_node *rtpn;
3333         struct mem_cgroup_tree_per_zone *rtpz;
3334         int tmp, node, zone;
3335
3336         for_each_node_state(node, N_POSSIBLE) {
3337                 tmp = node;
3338                 if (!node_state(node, N_NORMAL_MEMORY))
3339                         tmp = -1;
3340                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3341                 if (!rtpn)
3342                         return 1;
3343
3344                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3345
3346                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3347                         rtpz = &rtpn->rb_tree_per_zone[zone];
3348                         rtpz->rb_root = RB_ROOT;
3349                         spin_lock_init(&rtpz->lock);
3350                 }
3351         }
3352         return 0;
3353 }
3354
3355 static struct cgroup_subsys_state * __ref
3356 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3357 {
3358         struct mem_cgroup *mem, *parent;
3359         long error = -ENOMEM;
3360         int node;
3361
3362         mem = mem_cgroup_alloc();
3363         if (!mem)
3364                 return ERR_PTR(error);
3365
3366         for_each_node_state(node, N_POSSIBLE)
3367                 if (alloc_mem_cgroup_per_zone_info(mem, node))
3368                         goto free_out;
3369
3370         /* root ? */
3371         if (cont->parent == NULL) {
3372                 int cpu;
3373                 enable_swap_cgroup();
3374                 parent = NULL;
3375                 root_mem_cgroup = mem;
3376                 if (mem_cgroup_soft_limit_tree_init())
3377                         goto free_out;
3378                 for_each_possible_cpu(cpu) {
3379                         struct memcg_stock_pcp *stock =
3380                                                 &per_cpu(memcg_stock, cpu);
3381                         INIT_WORK(&stock->work, drain_local_stock);
3382                 }
3383                 hotcpu_notifier(memcg_stock_cpu_callback, 0);
3384
3385         } else {
3386                 parent = mem_cgroup_from_cont(cont->parent);
3387                 mem->use_hierarchy = parent->use_hierarchy;
3388         }
3389
3390         if (parent && parent->use_hierarchy) {
3391                 res_counter_init(&mem->res, &parent->res);
3392                 res_counter_init(&mem->memsw, &parent->memsw);
3393                 /*
3394                  * We increment refcnt of the parent to ensure that we can
3395                  * safely access it on res_counter_charge/uncharge.
3396                  * This refcnt will be decremented when freeing this
3397                  * mem_cgroup(see mem_cgroup_put).
3398                  */
3399                 mem_cgroup_get(parent);
3400         } else {
3401                 res_counter_init(&mem->res, NULL);
3402                 res_counter_init(&mem->memsw, NULL);
3403         }
3404         mem->last_scanned_child = 0;
3405         spin_lock_init(&mem->reclaim_param_lock);
3406
3407         if (parent)
3408                 mem->swappiness = get_swappiness(parent);
3409         atomic_set(&mem->refcnt, 1);
3410         mem->move_charge_at_immigrate = 0;
3411         return &mem->css;
3412 free_out:
3413         __mem_cgroup_free(mem);
3414         root_mem_cgroup = NULL;
3415         return ERR_PTR(error);
3416 }
3417
3418 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
3419                                         struct cgroup *cont)
3420 {
3421         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3422
3423         return mem_cgroup_force_empty(mem, false);
3424 }
3425
3426 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
3427                                 struct cgroup *cont)
3428 {
3429         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3430
3431         mem_cgroup_put(mem);
3432 }
3433
3434 static int mem_cgroup_populate(struct cgroup_subsys *ss,
3435                                 struct cgroup *cont)
3436 {
3437         int ret;
3438
3439         ret = cgroup_add_files(cont, ss, mem_cgroup_files,
3440                                 ARRAY_SIZE(mem_cgroup_files));
3441
3442         if (!ret)
3443                 ret = register_memsw_files(cont, ss);
3444         return ret;
3445 }
3446
3447 /* Handlers for move charge at task migration. */
3448 #define PRECHARGE_COUNT_AT_ONCE 256
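     /*
      * Charge "count" pages to mc.to up front.  Try one bulk res_counter
      * charge first; if that fails, fall back to charging page by page,
      * rescheduling every PRECHARGE_COUNT_AT_ONCE pages.  The accumulated
      * precharge is consumed by the move itself or cancelled by
      * mem_cgroup_clear_mc().
      */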
3449 static int mem_cgroup_do_precharge(unsigned long count)
3450 {
3451         int ret = 0;
3452         int batch_count = PRECHARGE_COUNT_AT_ONCE;
3453         struct mem_cgroup *mem = mc.to;
3454
3455         if (mem_cgroup_is_root(mem)) {
3456                 mc.precharge += count;
3457                 /* we don't need css_get for root */
3458                 return ret;
3459         }
3460         /* try to charge at once */
3461         if (count > 1) {
3462                 struct res_counter *dummy;
3463                 /*
3464                  * "mem" cannot be under rmdir(): cgroup_lock_live_cgroup()
3465                  * has already checked that it is not removed, and we are
3466                  * still under the same cgroup_mutex. So we can postpone
3467                  * css_get().
3468                  */
3469                 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
3470                         goto one_by_one;
3471                 if (do_swap_account && res_counter_charge(&mem->memsw,
3472                                                 PAGE_SIZE * count, &dummy)) {
3473                         res_counter_uncharge(&mem->res, PAGE_SIZE * count);
3474                         goto one_by_one;
3475                 }
3476                 mc.precharge += count;
3477                 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
3478                 WARN_ON_ONCE(count > INT_MAX);
3479                 __css_get(&mem->css, (int)count);
3480                 return ret;
3481         }
3482 one_by_one:
3483         /* fall back to one by one charge */
3484         while (count--) {
3485                 if (signal_pending(current)) {
3486                         ret = -EINTR;
3487                         break;
3488                 }
3489                 if (!batch_count--) {
3490                         batch_count = PRECHARGE_COUNT_AT_ONCE;
3491                         cond_resched();
3492                 }
3493                 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem,
3494                                                                 false, NULL);
3495                 if (ret || !mem)
3496                         /* mem_cgroup_clear_mc() will do uncharge later */
3497                         return -ENOMEM;
3498                 mc.precharge++;
3499         }
3500         return ret;
3501 }
3502
3503 /**
3504  * is_target_pte_for_mc - check whether a pte is a valid target for move charge
3505  * @vma: the vma the pte to be checked belongs to
3506  * @addr: the address corresponding to the pte to be checked
3507  * @ptent: the pte to be checked
3508  * @target: the pointer in which the target page will be stored (can be NULL)
3509  *
3510  * Returns
3511  *   0 (MC_TARGET_NONE): the pte is not a target for move charge.
3512  *   1 (MC_TARGET_PAGE): the page corresponding to this pte is a target for
3513  *     move charge. If @target is not NULL, the page is stored in target->page
3514  *     with an extra refcount taken (callers should handle it).
3515  *
3516  * Called with pte lock held.
3517  */
3518 /* We add a new member later. */
3519 union mc_target {
3520         struct page     *page;
3521 };
3522
3523 /* We add a new type later. */
3524 enum mc_target_type {
3525         MC_TARGET_NONE, /* not used */
3526         MC_TARGET_PAGE,
3527 };
3528
3529 static int is_target_pte_for_mc(struct vm_area_struct *vma,
3530                 unsigned long addr, pte_t ptent, union mc_target *target)
3531 {
3532         struct page *page;
3533         struct page_cgroup *pc;
3534         int ret = 0;
3535         bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
3536                                         &mc.to->move_charge_at_immigrate);
3537
3538         if (!pte_present(ptent))
3539                 return 0;
3540
3541         page = vm_normal_page(vma, addr, ptent);
3542         if (!page || !page_mapped(page))
3543                 return 0;
3544         /*
3545          * TODO: We don't move charges of file pages (including shmem/tmpfs)
3546          * for now.
3547          */
3548         if (!move_anon || !PageAnon(page))
3549                 return 0;
3550         /*
3551          * TODO: We don't move charges of shared pages (mapped by multiple
3552          * processes) for now.
3553          */
3554         if (page_mapcount(page) > 1)
3555                 return 0;
3556         if (!get_page_unless_zero(page))
3557                 return 0;
3558
3559         pc = lookup_page_cgroup(page);
3560         /*
3561          * Do only a loose check without the page_cgroup lock;
3562          * mem_cgroup_move_account() checks whether the pc is valid under the lock.
3563          */
3564         if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
3565                 ret = MC_TARGET_PAGE;
3566                 if (target)
3567                         target->page = page;
3568         }
3569
3570         if (!ret || !target)
3571                 put_page(page);
3572
3573         return ret;
3574 }
3575
3576 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
3577                                         unsigned long addr, unsigned long end,
3578                                         struct mm_walk *walk)
3579 {
3580         struct vm_area_struct *vma = walk->private;
3581         pte_t *pte;
3582         spinlock_t *ptl;
3583
3584         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
3585         for (; addr != end; pte++, addr += PAGE_SIZE)
3586                 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
3587                         mc.precharge++; /* increment precharge temporarily */
3588         pte_unmap_unlock(pte - 1, ptl);
3589         cond_resched();
3590
3591         return 0;
3592 }
3593
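     /*
      * Walk the VMAs of "mm" (skipping hugetlb and shared mappings) and
      * count the ptes whose charges would be moved; the result is the
      * number of pages we must precharge to mc.to.
      */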
3594 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
3595 {
3596         unsigned long precharge;
3597         struct vm_area_struct *vma;
3598
3599         down_read(&mm->mmap_sem);
3600         for (vma = mm->mmap; vma; vma = vma->vm_next) {
3601                 struct mm_walk mem_cgroup_count_precharge_walk = {
3602                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
3603                         .mm = mm,
3604                         .private = vma,
3605                 };
3606                 if (is_vm_hugetlb_page(vma))
3607                         continue;
3608                 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
3609                 if (vma->vm_flags & VM_SHARED)
3610                         continue;
3611                 walk_page_range(vma->vm_start, vma->vm_end,
3612                                         &mem_cgroup_count_precharge_walk);
3613         }
3614         up_read(&mm->mmap_sem);
3615
3616         precharge = mc.precharge;
3617         mc.precharge = 0;
3618
3619         return precharge;
3620 }
3621
3622 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
3623 {
3624         return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
3625 }
3626
3627 static void mem_cgroup_clear_mc(void)
3628 {
3629         /* we must uncharge all the leftover precharges from mc.to */
3630         if (mc.precharge) {
3631                 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
3632                 mc.precharge = 0;
3633         }
3634         /*
3635          * we didn't uncharge from mc.from in mem_cgroup_move_account(), so
3636          * we must uncharge here.
3637          */
3638         if (mc.moved_charge) {
3639                 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
3640                 mc.moved_charge = 0;
3641         }
3642         mc.from = NULL;
3643         mc.to = NULL;
3644 }
3645
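     /*
      * cgroup ->can_attach() handler: if charge moving is enabled on the
      * destination, set up the move context (mc) and precharge mc.to for
      * every page that will be moved.  Any failure is rolled back via
      * mem_cgroup_clear_mc().
      */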
3646 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
3647                                 struct cgroup *cgroup,
3648                                 struct task_struct *p,
3649                                 bool threadgroup)
3650 {
3651         int ret = 0;
3652         struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
3653
3654         if (mem->move_charge_at_immigrate) {
3655                 struct mm_struct *mm;
3656                 struct mem_cgroup *from = mem_cgroup_from_task(p);
3657
3658                 VM_BUG_ON(from == mem);
3659
3660                 mm = get_task_mm(p);
3661                 if (!mm)
3662                         return 0;
3663                 /* We move charges only when we move the owner of the mm */
3664                 if (mm->owner == p) {
3665                         VM_BUG_ON(mc.from);
3666                         VM_BUG_ON(mc.to);
3667                         VM_BUG_ON(mc.precharge);
3668                         VM_BUG_ON(mc.moved_charge);
3669                         mc.from = from;
3670                         mc.to = mem;
3671                         mc.precharge = 0;
3672                         mc.moved_charge = 0;
3673
3674                         ret = mem_cgroup_precharge_mc(mm);
3675                         if (ret)
3676                                 mem_cgroup_clear_mc();
3677                 }
3678                 mmput(mm);
3679         }
3680         return ret;
3681 }
3682
3683 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
3684                                 struct cgroup *cgroup,
3685                                 struct task_struct *p,
3686                                 bool threadgroup)
3687 {
3688         mem_cgroup_clear_mc();
3689 }
3690
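     /*
      * Move the charge of every target pte in [addr, end) from mc.from to
      * mc.to, consuming one precharge per page.  If the precharges run out
      * mid-range, drop the pte lock, precharge one more page and retry
      * from the address where we stopped.
      */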
3691 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
3692                                 unsigned long addr, unsigned long end,
3693                                 struct mm_walk *walk)
3694 {
3695         int ret = 0;
3696         struct vm_area_struct *vma = walk->private;
3697         pte_t *pte;
3698         spinlock_t *ptl;
3699
3700 retry:
3701         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
3702         for (; addr != end; addr += PAGE_SIZE) {
3703                 pte_t ptent = *(pte++);
3704                 union mc_target target;
3705                 int type;
3706                 struct page *page;
3707                 struct page_cgroup *pc;
3708
3709                 if (!mc.precharge)
3710                         break;
3711
3712                 type = is_target_pte_for_mc(vma, addr, ptent, &target);
3713                 switch (type) {
3714                 case MC_TARGET_PAGE:
3715                         page = target.page;
3716                         if (isolate_lru_page(page))
3717                                 goto put;
3718                         pc = lookup_page_cgroup(page);
3719                         if (!mem_cgroup_move_account(pc,
3720                                                 mc.from, mc.to, false)) {
3721                                 mc.precharge--;
3722                                 /* we uncharge from mc.from later. */
3723                                 mc.moved_charge++;
3724                         }
3725                         putback_lru_page(page);
3726 put:                    /* is_target_pte_for_mc() gets the page */
3727                         put_page(page);
3728                         break;
3729                 default:
3730                         break;
3731                 }
3732         }
3733         pte_unmap_unlock(pte - 1, ptl);
3734         cond_resched();
3735
3736         if (addr != end) {
3737                 /*
3738                  * We have consumed all the precharges we got in can_attach().
3739                  * We try charging one by one, but don't do any additional
3740                  * charges to mc.to if we have failed to charge once in the
3741                  * attach() phase.
3742                  */
3743                 ret = mem_cgroup_do_precharge(1);
3744                 if (!ret)
3745                         goto retry;
3746         }
3747
3748         return ret;
3749 }
3750
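     /*
      * Walk every VMA of "mm" (skipping hugetlb and shared mappings) and
      * move the charges of target pages from mc.from to mc.to, using the
      * precharges taken in can_attach().
      */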
3751 static void mem_cgroup_move_charge(struct mm_struct *mm)
3752 {
3753         struct vm_area_struct *vma;
3754
3755         lru_add_drain_all();
3756         down_read(&mm->mmap_sem);
3757         for (vma = mm->mmap; vma; vma = vma->vm_next) {
3758                 int ret;
3759                 struct mm_walk mem_cgroup_move_charge_walk = {
3760                         .pmd_entry = mem_cgroup_move_charge_pte_range,
3761                         .mm = mm,
3762                         .private = vma,
3763                 };
3764                 if (is_vm_hugetlb_page(vma))
3765                         continue;
3766                 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
3767                 if (vma->vm_flags & VM_SHARED)
3768                         continue;
3769                 ret = walk_page_range(vma->vm_start, vma->vm_end,
3770                                                 &mem_cgroup_move_charge_walk);
3771                 if (ret)
3772                         /*
3773                          * This means we have consumed all precharges and failed
3774                          * to do an additional charge. Just abandon here.
3775                          */
3776                         break;
3777         }
3778         up_read(&mm->mmap_sem);
3779 }
3780
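     /*
      * cgroup ->attach() handler: perform the actual charge moving, if it
      * was set up in can_attach(), and then tear down the move context.
      */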
3781 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3782                                 struct cgroup *cont,
3783                                 struct cgroup *old_cont,
3784                                 struct task_struct *p,
3785                                 bool threadgroup)
3786 {
3787         struct mm_struct *mm;
3788
3789         if (!mc.to)
3790                 /* no need to move charge */
3791                 return;
3792
3793         mm = get_task_mm(p);
3794         if (mm) {
3795                 mem_cgroup_move_charge(mm);
3796                 mmput(mm);
3797         }
3798         mem_cgroup_clear_mc();
3799 }
3800
3801 struct cgroup_subsys mem_cgroup_subsys = {
3802         .name = "memory",
3803         .subsys_id = mem_cgroup_subsys_id,
3804         .create = mem_cgroup_create,
3805         .pre_destroy = mem_cgroup_pre_destroy,
3806         .destroy = mem_cgroup_destroy,
3807         .populate = mem_cgroup_populate,
3808         .can_attach = mem_cgroup_can_attach,
3809         .cancel_attach = mem_cgroup_cancel_attach,
3810         .attach = mem_cgroup_move_task,
3811         .early_init = 0,
3812         .use_id = 1,
3813 };
3814
3815 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3816
3817 static int __init disable_swap_account(char *s)
3818 {
3819         really_do_swap_account = 0;
3820         return 1;
3821 }
3822 __setup("noswapaccount", disable_swap_account);
3823 #endif