diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e..5f84d23 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -205,6 +205,50 @@ struct mem_cgroup_eventfd_list {
 static void mem_cgroup_threshold(struct mem_cgroup *mem);
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
 
+enum {
+       SCAN_BY_LIMIT,
+       SCAN_BY_SYSTEM,
+       NR_SCAN_CONTEXT,
+       SCAN_BY_SHRINK, /* not recorded now */
+};
+
+enum {
+       SCAN,
+       SCAN_ANON,
+       SCAN_FILE,
+       ROTATE,
+       ROTATE_ANON,
+       ROTATE_FILE,
+       FREED,
+       FREED_ANON,
+       FREED_FILE,
+       ELAPSED,
+       NR_SCANSTATS,
+};
+
+struct scanstat {
+       spinlock_t      lock;
+       unsigned long   stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
+       unsigned long   rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
+};
+
+const char *scanstat_string[NR_SCANSTATS] = {
+       "scanned_pages",
+       "scanned_anon_pages",
+       "scanned_file_pages",
+       "rotated_pages",
+       "rotated_anon_pages",
+       "rotated_file_pages",
+       "freed_pages",
+       "freed_anon_pages",
+       "freed_file_pages",
+       "elapsed_ns",
+};
+#define SCANSTAT_WORD_LIMIT    "_by_limit"
+#define SCANSTAT_WORD_SYSTEM   "_by_system"
+#define SCANSTAT_WORD_HIERARCHY        "_under_hierarchy"
+
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
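
The hunk above gives each memcg a scanstat table indexed by scan context and statistic: stats[][] accumulates scans where the group was the reclaim victim, rootstats[][] accumulates scans done while it was the hierarchy root, and scanstat_string[] plus the _by_limit/_by_system/_under_hierarchy suffixes form the key names exported through memory.vmscan_stat further down. A minimal userspace sketch, not part of the patch, of that indexing and name composition (sample values are made up):

/*
 * Illustration only, not part of the patch: how stats[context][item] is
 * indexed and how the exported key names are composed from
 * scanstat_string[] and the suffix macros above.
 */
#include <stdio.h>

enum { SCAN_BY_LIMIT, SCAN_BY_SYSTEM, NR_SCAN_CONTEXT };
enum { SCAN, SCAN_ANON, SCAN_FILE, ROTATE, ROTATE_ANON, ROTATE_FILE,
       FREED, FREED_ANON, FREED_FILE, ELAPSED, NR_SCANSTATS };

static const char *scanstat_string[NR_SCANSTATS] = {
	"scanned_pages", "scanned_anon_pages", "scanned_file_pages",
	"rotated_pages", "rotated_anon_pages", "rotated_file_pages",
	"freed_pages", "freed_anon_pages", "freed_file_pages",
	"elapsed_ns",
};

#define SCANSTAT_WORD_LIMIT	"_by_limit"

int main(void)
{
	unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS] = { { 0 } };
	char key[64];
	int i;

	stats[SCAN_BY_LIMIT][SCAN] = 128;	/* pretend one limit-triggered scan */
	stats[SCAN_BY_LIMIT][FREED] = 96;

	for (i = 0; i < NR_SCANSTATS; i++) {
		snprintf(key, sizeof(key), "%s" SCANSTAT_WORD_LIMIT,
			 scanstat_string[i]);
		printf("%s %lu\n", key, stats[SCAN_BY_LIMIT][i]);
	}
	return 0;
}
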
@@ -246,10 +290,13 @@ struct mem_cgroup {
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;
-       atomic_t        oom_lock;
+
+       bool            oom_lock;
+       atomic_t        under_oom;
+
        atomic_t        refcnt;
 
-       unsigned int    swappiness;
+       int     swappiness;
        /* OOM-Killer disable */
        int             oom_kill_disable;
 
@@ -267,7 +314,8 @@ struct mem_cgroup {
 
        /* For oom notifier event fd */
        struct list_head oom_notify;
-
+       /* For recording LRU-scan statistics */
+       struct scanstat scanstat;
        /*
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
@@ -636,27 +684,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        preempt_enable();
 }
 
-static unsigned long
-mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+unsigned long
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
+                       unsigned int lru_mask)
 {
        struct mem_cgroup_per_zone *mz;
+       enum lru_list l;
+       unsigned long ret = 0;
+
+       mz = mem_cgroup_zoneinfo(mem, nid, zid);
+
+       for_each_lru(l) {
+               if (BIT(l) & lru_mask)
+                       ret += MEM_CGROUP_ZSTAT(mz, l);
+       }
+       return ret;
+}
+
+static unsigned long
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
+                       int nid, unsigned int lru_mask)
+{
        u64 total = 0;
        int zid;
 
-       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-               mz = mem_cgroup_zoneinfo(mem, nid, zid);
-               total += MEM_CGROUP_ZSTAT(mz, idx);
-       }
+       for (zid = 0; zid < MAX_NR_ZONES; zid++)
+               total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
+
        return total;
 }
-static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
-                                       enum lru_list idx)
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
+                       unsigned int lru_mask)
 {
        int nid;
        u64 total = 0;
 
-       for_each_online_node(nid)
-               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
+       for_each_node_state(nid, N_HIGH_MEMORY)
+               total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
        return total;
 }
 
@@ -1043,6 +1108,21 @@ void mem_cgroup_move_lists(struct page *page,
        mem_cgroup_add_lru_list(page, to);
 }
 
+/*
+ * Checks whether given mem is same or in the root_mem's
+ * hierarchy subtree
+ */
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
+               struct mem_cgroup *mem)
+{
+       if (root_mem != mem) {
+               return (root_mem->use_hierarchy &&
+                       css_is_ancestor(&mem->css, &root_mem->css));
+       }
+
+       return true;
+}
+
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
        int ret;
@@ -1062,10 +1142,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
         * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
         * hierarchy(even if use_hierarchy is disabled in "mem").
         */
-       if (mem->use_hierarchy)
-               ret = css_is_ancestor(&curr->css, &mem->css);
-       else
-               ret = (curr == mem);
+       ret = mem_cgroup_same_or_subtree(mem, curr);
        css_put(&curr->css);
        return ret;
 }
@@ -1077,8 +1154,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
        unsigned long gb;
        unsigned long inactive_ratio;
 
-       inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
-       active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
+       inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
+       active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
 
        gb = (inactive + active) >> (30 - PAGE_SHIFT);
        if (gb)
@@ -1117,109 +1194,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
        unsigned long active;
        unsigned long inactive;
 
-       inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
-       active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
+       inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
+       active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
 
        return (active > inactive);
 }
 
-unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
-                                               struct zone *zone,
-                                               enum lru_list lru)
-{
-       int nid = zone_to_nid(zone);
-       int zid = zone_idx(zone);
-       struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
-       return MEM_CGROUP_ZSTAT(mz, lru);
-}
-
-static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-       unsigned long ret;
-
-       ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
-               mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
-
-       return ret;
-}
-
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-       unsigned long ret;
-
-       ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-               mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-       return ret;
-}
-
-#if MAX_NUMNODES > 1
-static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
-{
-       u64 total = 0;
-       int nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY)
-               total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
-
-       return total;
-}
-
-static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
-{
-       u64 total = 0;
-       int nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY)
-               total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
-
-       return total;
-}
-
-static unsigned long
-mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
-{
-       return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
-}
-
-static unsigned long
-mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
-{
-       u64 total = 0;
-       int nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY)
-               total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
-
-       return total;
-}
-
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-       enum lru_list l;
-       u64 total = 0;
-
-       for_each_lru(l)
-               total += mem_cgroup_get_zonestat_node(memcg, nid, l);
-
-       return total;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
-{
-       u64 total = 0;
-       int nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY)
-               total += mem_cgroup_node_nr_lru_pages(memcg, nid);
-
-       return total;
-}
-#endif /* CONFIG_NUMA */
-
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
                                                      struct zone *zone)
 {
@@ -1329,7 +1309,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
        return margin >> PAGE_SHIFT;
 }
 
-static unsigned int get_swappiness(struct mem_cgroup *memcg)
+int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
        struct cgroup *cgrp = memcg->css.cgroup;
 
@@ -1401,10 +1381,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
        to = mc.to;
        if (!from)
                goto unlock;
-       if (from == mem || to == mem
-           || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
-           || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
-               ret = true;
+
+       ret = mem_cgroup_same_or_subtree(mem, from)
+               || mem_cgroup_same_or_subtree(mem, to);
 unlock:
        spin_unlock(&mc.lock);
        return ret;
@@ -1576,11 +1555,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
                int nid, bool noswap)
 {
-       if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+       if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
                return true;
        if (noswap || !total_swap_pages)
                return false;
-       if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+       if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
                return true;
        return false;
 
@@ -1700,6 +1679,44 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
 }
 #endif
 
+static void __mem_cgroup_record_scanstat(unsigned long *stats,
+                          struct memcg_scanrecord *rec)
+{
+
+       stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
+       stats[SCAN_ANON] += rec->nr_scanned[0];
+       stats[SCAN_FILE] += rec->nr_scanned[1];
+
+       stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
+       stats[ROTATE_ANON] += rec->nr_rotated[0];
+       stats[ROTATE_FILE] += rec->nr_rotated[1];
+
+       stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
+       stats[FREED_ANON] += rec->nr_freed[0];
+       stats[FREED_FILE] += rec->nr_freed[1];
+
+       stats[ELAPSED] += rec->elapsed;
+}
+
+static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
+{
+       struct mem_cgroup *mem;
+       int context = rec->context;
+
+       if (context >= NR_SCAN_CONTEXT)
+               return;
+
+       mem = rec->mem;
+       spin_lock(&mem->scanstat.lock);
+       __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
+       spin_unlock(&mem->scanstat.lock);
+
+       mem = rec->root;
+       spin_lock(&mem->scanstat.lock);
+       __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
+       spin_unlock(&mem->scanstat.lock);
+}
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1724,15 +1741,25 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
        bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
        bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
        bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+       struct memcg_scanrecord rec;
        unsigned long excess;
-       unsigned long nr_scanned;
+       unsigned long scanned;
 
        excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
        /* If memsw_is_minimum==1, swap-out is of-no-use. */
-       if (!check_soft && root_mem->memsw_is_minimum)
+       if (!check_soft && !shrink && root_mem->memsw_is_minimum)
                noswap = true;
 
+       if (shrink)
+               rec.context = SCAN_BY_SHRINK;
+       else if (check_soft)
+               rec.context = SCAN_BY_SYSTEM;
+       else
+               rec.context = SCAN_BY_LIMIT;
+
+       rec.root = root_mem;
+
        while (1) {
                victim = mem_cgroup_select_victim(root_mem);
                if (victim == root_mem) {
@@ -1773,15 +1800,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                        css_put(&victim->css);
                        continue;
                }
+               rec.mem = victim;
+               rec.nr_scanned[0] = 0;
+               rec.nr_scanned[1] = 0;
+               rec.nr_rotated[0] = 0;
+               rec.nr_rotated[1] = 0;
+               rec.nr_freed[0] = 0;
+               rec.nr_freed[1] = 0;
+               rec.elapsed = 0;
                /* we use swappiness of local cgroup */
                if (check_soft) {
                        ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-                               noswap, get_swappiness(victim), zone,
-                               &nr_scanned);
-                       *total_scanned += nr_scanned;
+                               noswap, zone, &rec, &scanned);
+                       *total_scanned += scanned;
                } else
                        ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
-                                               noswap, get_swappiness(victim));
+                                               noswap, &rec);
+               mem_cgroup_record_scanstat(&rec);
                css_put(&victim->css);
                /*
                 * At shrinking usage, we can't check we should stop here or
@@ -1803,38 +1838,84 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
+ * Has to be called with memcg_oom_lock
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 {
-       int x, lock_count = 0;
-       struct mem_cgroup *iter;
+       int lock_count = -1;
+       struct mem_cgroup *iter, *failed = NULL;
+       bool cond = true;
+
+       for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+               bool locked = iter->oom_lock;
 
-       for_each_mem_cgroup_tree(iter, mem) {
-               x = atomic_inc_return(&iter->oom_lock);
-               lock_count = max(x, lock_count);
+               iter->oom_lock = true;
+               if (lock_count == -1)
+                       lock_count = iter->oom_lock;
+               else if (lock_count != locked) {
+                       /*
+                        * this subtree of our hierarchy is already locked
+                        * so we cannot give a lock.
+                        */
+                       lock_count = 0;
+                       failed = iter;
+                       cond = false;
+               }
        }
 
-       if (lock_count == 1)
-               return true;
-       return false;
+       if (!failed)
+               goto done;
+
+       /*
+        * OK, we failed to lock the whole subtree so we have to clean up
+        * what we set up to the failing subtree
+        */
+       cond = true;
+       for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+               if (iter == failed) {
+                       cond = false;
+                       continue;
+               }
+               iter->oom_lock = false;
+       }
+done:
+       return lock_count;
 }
 
+/*
+ * Has to be called with memcg_oom_lock
+ */
 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
        struct mem_cgroup *iter;
 
+       for_each_mem_cgroup_tree(iter, mem)
+               iter->oom_lock = false;
+       return 0;
+}
+
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
+{
+       struct mem_cgroup *iter;
+
+       for_each_mem_cgroup_tree(iter, mem)
+               atomic_inc(&iter->under_oom);
+}
+
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
+{
+       struct mem_cgroup *iter;
+
        /*
         * When a new child is created while the hierarchy is under oom,
         * mem_cgroup_oom_lock() may not be called. We have to use
         * atomic_add_unless() here.
         */
        for_each_mem_cgroup_tree(iter, mem)
-               atomic_add_unless(&iter->oom_lock, -1, 0);
-       return 0;
+               atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
-
-static DEFINE_MUTEX(memcg_oom_mutex);
+static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
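
This hunk turns oom_lock into a plain bool guarded by the new memcg_oom_lock spinlock, keeps a separate under_oom counter for the waiter/notification side, and makes mem_cgroup_oom_lock() roll back the marks it already set when it hits a subtree that is locked. A rough standalone sketch of that lock-or-unwind shape, with a flat array standing in for for_each_mem_cgroup_tree(); the bookkeeping in the real function differs in detail:

/*
 * Illustration only: "mark everything in the walk, and on finding an
 * already-locked member roll back what we marked" -- the shape of the
 * new mem_cgroup_oom_lock()/mem_cgroup_oom_unlock() pair.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_GROUPS 4

static bool oom_lock[NR_GROUPS];

static bool subtree_oom_lock(void)
{
	int i, failed = -1;

	for (i = 0; i < NR_GROUPS; i++) {
		if (oom_lock[i]) {		/* someone already holds it */
			failed = i;
			break;
		}
		oom_lock[i] = true;
	}
	if (failed < 0)
		return true;

	/* clean up the part of the walk we did manage to mark */
	for (i = 0; i < failed; i++)
		oom_lock[i] = false;
	return false;
}

int main(void)
{
	oom_lock[2] = true;			/* pretend a sibling is mid-OOM */
	printf("lock attempt: %s\n", subtree_oom_lock() ? "got it" : "busy");
	oom_lock[2] = false;
	printf("lock attempt: %s\n", subtree_oom_lock() ? "got it" : "busy");
	return 0;
}
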
@@ -1845,25 +1926,20 @@ struct oom_wait_info {
 static int memcg_oom_wake_function(wait_queue_t *wait,
        unsigned mode, int sync, void *arg)
 {
-       struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
+       struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
+                         *oom_wait_mem;
        struct oom_wait_info *oom_wait_info;
 
        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+       oom_wait_mem = oom_wait_info->mem;
 
-       if (oom_wait_info->mem == wake_mem)
-               goto wakeup;
-       /* if no hierarchy, no match */
-       if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
-               return 0;
        /*
         * Both of oom_wait_info->mem and wake_mem are stable under us.
         * Then we can use css_is_ancestor without taking care of RCU.
         */
-       if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
-           !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
+       if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
+                       && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
                return 0;
-
-wakeup:
        return autoremove_wake_function(wait, mode, sync, arg);
 }
 
@@ -1875,7 +1951,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 
 static void memcg_oom_recover(struct mem_cgroup *mem)
 {
-       if (mem && atomic_read(&mem->oom_lock))
+       if (mem && atomic_read(&mem->under_oom))
                memcg_wakeup_oom(mem);
 }
 
@@ -1893,8 +1969,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
        owait.wait.private = current;
        INIT_LIST_HEAD(&owait.wait.task_list);
        need_to_kill = true;
+       mem_cgroup_mark_under_oom(mem);
+
        /* At first, try to OOM lock hierarchy under mem.*/
-       mutex_lock(&memcg_oom_mutex);
+       spin_lock(&memcg_oom_lock);
        locked = mem_cgroup_oom_lock(mem);
        /*
         * Even if signal_pending(), we can't quit charge() loop without
@@ -1906,7 +1984,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
                need_to_kill = false;
        if (locked)
                mem_cgroup_oom_notify(mem);
-       mutex_unlock(&memcg_oom_mutex);
+       spin_unlock(&memcg_oom_lock);
 
        if (need_to_kill) {
                finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1915,10 +1993,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
                schedule();
                finish_wait(&memcg_oom_waitq, &owait.wait);
        }
-       mutex_lock(&memcg_oom_mutex);
-       mem_cgroup_oom_unlock(mem);
+       spin_lock(&memcg_oom_lock);
+       if (locked)
+               mem_cgroup_oom_unlock(mem);
        memcg_wakeup_oom(mem);
-       mutex_unlock(&memcg_oom_mutex);
+       spin_unlock(&memcg_oom_lock);
+
+       mem_cgroup_unmark_under_oom(mem);
 
        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
                return false;
@@ -2011,7 +2092,6 @@ struct memcg_stock_pcp {
 #define FLUSHING_CACHED_CHARGE (0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -2079,19 +2159,14 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
 }
 
 /*
- * Tries to drain stocked charges in other cpus. This function is asynchronous
- * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back to res_counter later but cannot wait for
- * it.
+ * Drains all per-CPU charge caches for given root_mem resp. subtree
+ * of the hierarchy under it. sync flag says whether we should block
+ * until the work is done.
  */
-static void drain_all_stock_async(struct mem_cgroup *root_mem)
+static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
 {
        int cpu, curcpu;
-       /*
-        * If someone calls draining, avoid adding more kworker runs.
-        */
-       if (!mutex_trylock(&percpu_charge_mutex))
-               return;
+
        /* Notify other cpus that system-wide "drain" is running */
        get_online_cpus();
        /*
@@ -2105,34 +2180,48 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem)
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                struct mem_cgroup *mem;
 
-               if (cpu == curcpu)
-                       continue;
-
                mem = stock->cached;
-               if (!mem)
+               if (!mem || !stock->nr_pages)
+                       continue;
+               if (!mem_cgroup_same_or_subtree(root_mem, mem))
                        continue;
-               if (mem != root_mem) {
-                       if (!root_mem->use_hierarchy)
-                               continue;
-                       /* check whether "mem" is under tree of "root_mem" */
-                       if (!css_is_ancestor(&mem->css, &root_mem->css))
-                               continue;
+               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+                       if (cpu == curcpu)
+                               drain_local_stock(&stock->work);
+                       else
+                               schedule_work_on(cpu, &stock->work);
                }
-               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
-                       schedule_work_on(cpu, &stock->work);
        }
+
+       if (!sync)
+               goto out;
+
+       for_each_online_cpu(cpu) {
+               struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+               if (mem_cgroup_same_or_subtree(root_mem, stock->cached) &&
+                               test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+                       flush_work(&stock->work);
+       }
+out:
        put_online_cpus();
-       mutex_unlock(&percpu_charge_mutex);
-       /* We don't wait for flush_work */
+}
+
+/*
+ * Tries to drain stocked charges in other cpus. This function is asynchronous
+ * and just put a work per cpu for draining localy on each cpu. Caller can
+ * expects some charges will be back to res_counter later but cannot wait for
+ * it.
+ */
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
+{
+       drain_all_stock(root_mem, false);
 }
 
 /* This is a synchronous drain interface. */
-static void drain_all_stock_sync(void)
+static void drain_all_stock_sync(struct mem_cgroup *root_mem)
 {
        /* called when force_empty is called */
-       mutex_lock(&percpu_charge_mutex);
-       schedule_on_each_cpu(drain_local_stock);
-       mutex_unlock(&percpu_charge_mutex);
+       drain_all_stock(root_mem, true);
 }
 
 /*
@@ -3780,7 +3869,7 @@ move_account:
                        goto out;
                /* This is for making all *used* pages to be on LRU. */
                lru_add_drain_all();
-               drain_all_stock_sync();
+               drain_all_stock_sync(mem);
                ret = 0;
                mem_cgroup_start_move(mem);
                for_each_node_state(node, N_HIGH_MEMORY) {
@@ -3819,14 +3908,18 @@ try_to_free:
        /* try to free all pages in this cgroup */
        shrink = 1;
        while (nr_retries && mem->res.usage > 0) {
+               struct memcg_scanrecord rec;
                int progress;
 
                if (signal_pending(current)) {
                        ret = -EINTR;
                        goto out;
                }
+               rec.context = SCAN_BY_SHRINK;
+               rec.mem = mem;
+               rec.root = mem;
                progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
-                                               false, get_swappiness(mem));
+                                               false, &rec);
                if (!progress) {
                        nr_retries--;
                        /* maybe some writeback is necessary */
@@ -4152,15 +4245,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
        s->stat[MCS_PGMAJFAULT] += val;
 
        /* per zone stat */
-       val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
        s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
-       val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
        s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
-       val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
        s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
-       val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
        s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
-       val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
        s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
 }
 
@@ -4182,35 +4275,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
        struct cgroup *cont = m->private;
        struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
 
-       total_nr = mem_cgroup_nr_lru_pages(mem_cont);
+       total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
        seq_printf(m, "total=%lu", total_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
+               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
+       file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
        seq_printf(m, "file=%lu", file_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
+               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+                               LRU_ALL_FILE);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
+       anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
        seq_printf(m, "anon=%lu", anon_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
+               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+                               LRU_ALL_ANON);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
+       unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
        seq_printf(m, "unevictable=%lu", unevictable_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
-                                                                       nid);
+               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+                               BIT(LRU_UNEVICTABLE));
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
@@ -4288,7 +4383,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 
-       return get_swappiness(memcg);
+       return mem_cgroup_swappiness(memcg);
 }
 
 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
@@ -4578,15 +4673,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
        if (!event)
                return -ENOMEM;
 
-       mutex_lock(&memcg_oom_mutex);
+       spin_lock(&memcg_oom_lock);
 
        event->eventfd = eventfd;
        list_add(&event->list, &memcg->oom_notify);
 
        /* already in OOM ? */
-       if (atomic_read(&memcg->oom_lock))
+       if (atomic_read(&memcg->under_oom))
                eventfd_signal(eventfd, 1);
-       mutex_unlock(&memcg_oom_mutex);
+       spin_unlock(&memcg_oom_lock);
 
        return 0;
 }
@@ -4600,7 +4695,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 
        BUG_ON(type != _OOM_TYPE);
 
-       mutex_lock(&memcg_oom_mutex);
+       spin_lock(&memcg_oom_lock);
 
        list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
                if (ev->eventfd == eventfd) {
@@ -4609,7 +4704,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
                }
        }
 
-       mutex_unlock(&memcg_oom_mutex);
+       spin_unlock(&memcg_oom_lock);
 }
 
 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -4619,7 +4714,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 
        cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
 
-       if (atomic_read(&mem->oom_lock))
+       if (atomic_read(&mem->under_oom))
                cb->fill(cb, "under_oom", 1);
        else
                cb->fill(cb, "under_oom", 0);
@@ -4668,6 +4763,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 }
 #endif /* CONFIG_NUMA */
 
+static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
+                               struct cftype *cft,
+                               struct cgroup_map_cb *cb)
+{
+       struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+       char string[64];
+       int i;
+
+       for (i = 0; i < NR_SCANSTATS; i++) {
+               strcpy(string, scanstat_string[i]);
+               strcat(string, SCANSTAT_WORD_LIMIT);
+               cb->fill(cb, string,  mem->scanstat.stats[SCAN_BY_LIMIT][i]);
+       }
+
+       for (i = 0; i < NR_SCANSTATS; i++) {
+               strcpy(string, scanstat_string[i]);
+               strcat(string, SCANSTAT_WORD_SYSTEM);
+               cb->fill(cb, string,  mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
+       }
+
+       for (i = 0; i < NR_SCANSTATS; i++) {
+               strcpy(string, scanstat_string[i]);
+               strcat(string, SCANSTAT_WORD_LIMIT);
+               strcat(string, SCANSTAT_WORD_HIERARCHY);
+               cb->fill(cb, string,  mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
+       }
+       for (i = 0; i < NR_SCANSTATS; i++) {
+               strcpy(string, scanstat_string[i]);
+               strcat(string, SCANSTAT_WORD_SYSTEM);
+               strcat(string, SCANSTAT_WORD_HIERARCHY);
+               cb->fill(cb, string,  mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
+       }
+       return 0;
+}
+
+static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
+                               unsigned int event)
+{
+       struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+       spin_lock(&mem->scanstat.lock);
+       memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
+       memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
+       spin_unlock(&mem->scanstat.lock);
+       return 0;
+}
+
+
 static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage_in_bytes",
@@ -4738,6 +4881,11 @@ static struct cftype mem_cgroup_files[] = {
                .mode = S_IRUGO,
        },
 #endif
+       {
+               .name = "vmscan_stat",
+               .read_map = mem_cgroup_vmscan_stat_read,
+               .trigger = mem_cgroup_reset_vmscan_stat,
+       },
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4997,10 +5145,11 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        INIT_LIST_HEAD(&mem->oom_notify);
 
        if (parent)
-               mem->swappiness = get_swappiness(parent);
+               mem->swappiness = mem_cgroup_swappiness(parent);
        atomic_set(&mem->refcnt, 1);
        mem->move_charge_at_immigrate = 0;
        mutex_init(&mem->thresholds_lock);
+       spin_lock_init(&mem->scanstat.lock);
        return &mem->css;
 free_out:
        __mem_cgroup_free(mem);
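
For completeness, a hedged userspace sketch of how the new file behaves: reads produce "name value" lines built from scanstat_string[] plus the suffixes, and any write reaches the .trigger handler mem_cgroup_reset_vmscan_stat(), which zeroes both tables. The cgroup mount point and group name below are assumptions, not taken from the patch:

/*
 * Illustration only: reading and resetting memory.vmscan_stat from
 * userspace. Only the file name and its reset-on-write behaviour follow
 * from the cftype entry added above; the path is hypothetical.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/mygroup/memory.vmscan_stat";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "scanned_pages_by_limit 12345" */
	fclose(f);

	/* any write hits the trigger handler and clears the counters */
	f = fopen(path, "w");
	if (f) {
		fputs("0\n", f);
		fclose(f);
	}
	return 0;
}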