mm: fix typo in refill_stock() comment
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a82464b..6e8533e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,8 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #define do_swap_account                (0)
 #endif
 
-#define SOFTLIMIT_EVENTS_THRESH (1000)
-#define THRESHOLDS_EVENTS_THRESH (100)
+/*
+ * The per-memcg event counter is incremented at every pagein/pageout. This
+ * counter is used to trigger some periodic events. This is straightforward
+ * and better than using jiffies etc. to handle periodic memcg events.
+ *
+ * These values will be used as !((event) & ((1 << (thresh)) - 1))
+ */
+#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
+#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
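
As an illustration of the mask check described above: the expression fires once
every 2^thresh increments of the counter. A stand-alone userspace demo of that
behaviour (hypothetical, not part of the patch):

	/* Demo of the !((event) & ((1 << (thresh)) - 1)) trigger. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long event;
		int hits = 0;

		for (event = 1; event <= 1024; event++)
			if (!(event & ((1UL << 7) - 1)))	/* THRESHOLDS_EVENTS_THRESH */
				hits++;
		printf("fired %d times in 1024 events\n", hits);	/* prints 8: once in 128 */
		return 0;
	}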
 
 /*
  * Statistics for memory cgroup.
@@ -79,64 +86,15 @@ enum mem_cgroup_stat_index {
        MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
        MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
        MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-       MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
-                                       used by soft limit implementation */
-       MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
-                                       used by threshold implementation */
+       MEM_CGROUP_EVENTS,      /* incremented at every pagein/pageout */
 
        MEM_CGROUP_STAT_NSTATS,
 };
 
 struct mem_cgroup_stat_cpu {
        s64 count[MEM_CGROUP_STAT_NSTATS];
-} ____cacheline_aligned_in_smp;
-
-struct mem_cgroup_stat {
-       struct mem_cgroup_stat_cpu cpustat[0];
 };
 
-static inline void
-__mem_cgroup_stat_set_safe(struct mem_cgroup_stat_cpu *stat,
-                               enum mem_cgroup_stat_index idx, s64 val)
-{
-       stat->count[idx] = val;
-}
-
-static inline s64
-__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
-                               enum mem_cgroup_stat_index idx)
-{
-       return stat->count[idx];
-}
-
-/*
- * For accounting under irq disable, no need for increment preempt count.
- */
-static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
-               enum mem_cgroup_stat_index idx, int val)
-{
-       stat->count[idx] += val;
-}
-
-static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
-               enum mem_cgroup_stat_index idx)
-{
-       int cpu;
-       s64 ret = 0;
-       for_each_possible_cpu(cpu)
-               ret += stat->cpustat[cpu].count[idx];
-       return ret;
-}
-
-static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
-{
-       s64 ret;
-
-       ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
-       ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
-       return ret;
-}
-
 /*
  * per-zone information in memory controller.
  */
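
The block removed above kept a variable-length per-CPU array at the tail of
struct mem_cgroup together with open-coded accessors; the rest of this patch
switches to the kernel's generic percpu allocator. A condensed sketch of the
resulting pattern, pieced together from the hunks below (not a verbatim
excerpt):

	/* Allocation: one mem_cgroup_stat_cpu per possible CPU. */
	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);

	/* Fast path: update only the local CPU's slot. */
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], 1);

	/* Slow path: fold all CPUs together when a total is needed. */
	s64 total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += per_cpu(mem->stat->count[MEM_CGROUP_STAT_RSS], cpu);
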
@@ -200,7 +158,6 @@ struct mem_cgroup_threshold_ary {
        struct mem_cgroup_threshold entries[0];
 };
 
-static bool mem_cgroup_threshold_check(struct mem_cgroup *mem);
 static void mem_cgroup_threshold(struct mem_cgroup *mem);
 
 /*
@@ -246,7 +203,7 @@ struct mem_cgroup {
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;
-       unsigned long   last_oom_jiffies;
+       atomic_t        oom_lock;
        atomic_t        refcnt;
 
        unsigned int    swappiness;
@@ -270,9 +227,9 @@ struct mem_cgroup {
        unsigned long   move_charge_at_immigrate;
 
        /*
-        * statistics. This must be placed at the end of memcg.
+        * percpu counter.
         */
-       struct mem_cgroup_stat stat;
+       struct mem_cgroup_stat_cpu *stat;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -438,24 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
        spin_unlock(&mctz->lock);
 }
 
-static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
-{
-       bool ret = false;
-       int cpu;
-       s64 val;
-       struct mem_cgroup_stat_cpu *cpustat;
-
-       cpu = get_cpu();
-       cpustat = &mem->stat.cpustat[cpu];
-       val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_SOFTLIMIT);
-       if (unlikely(val < 0)) {
-               __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT,
-                               SOFTLIMIT_EVENTS_THRESH);
-               ret = true;
-       }
-       put_cpu();
-       return ret;
-}
 
 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
 {
@@ -549,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
        return mz;
 }
 
+static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
+               enum mem_cgroup_stat_index idx)
+{
+       int cpu;
+       s64 val = 0;
+
+       for_each_possible_cpu(cpu)
+               val += per_cpu(mem->stat->count[idx], cpu);
+       return val;
+}
+
+static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+{
+       s64 ret;
+
+       ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
+       ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
+       return ret;
+}
+
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
                                         bool charge)
 {
        int val = (charge) ? 1 : -1;
-       struct mem_cgroup_stat *stat = &mem->stat;
-       struct mem_cgroup_stat_cpu *cpustat;
-       int cpu = get_cpu();
-
-       cpustat = &stat->cpustat[cpu];
-       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
-       put_cpu();
+       this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -567,26 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                         bool charge)
 {
        int val = (charge) ? 1 : -1;
-       struct mem_cgroup_stat *stat = &mem->stat;
-       struct mem_cgroup_stat_cpu *cpustat;
-       int cpu = get_cpu();
 
-       cpustat = &stat->cpustat[cpu];
+       preempt_disable();
+
        if (PageCgroupCache(pc))
-               __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
+               __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
        else
-               __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
+               __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
 
        if (charge)
-               __mem_cgroup_stat_add_safe(cpustat,
-                               MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+               __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
        else
-               __mem_cgroup_stat_add_safe(cpustat,
-                               MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
-       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
-       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
+               __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+       __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
 
-       put_cpu();
+       preempt_enable();
 }
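
Two percpu idioms appear in the hunk above: this_cpu_add() in
mem_cgroup_swap_statistics() is safe on its own because it handles preemption
internally, while mem_cgroup_charge_statistics() batches several updates under
one explicit preempt_disable()/preempt_enable() pair and uses the cheaper
__this_cpu variants. Side by side (a sketch drawn from the code above):

	/* Single update: this_cpu_add() deals with preemption by itself. */
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);

	/* Batched updates: disable preemption once, then use __this_cpu ops. */
	preempt_disable();
	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
	preempt_enable();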
 
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -604,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
        return total;
 }
 
+static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
+{
+       s64 val;
+
+       val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
+
+       return !(val & ((1 << event_mask_shift) - 1));
+}
+
+/*
+ * Check events in order.
+ */
+static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
+{
+       /* the threshold event is triggered at a finer grain than the soft limit */
+       if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
+               mem_cgroup_threshold(mem);
+               if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
+                       mem_cgroup_update_tree(mem, page);
+       }
+}
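
Since 1 << SOFTLIMIT_EVENTS_THRESH (1024) is a multiple of
1 << THRESHOLDS_EVENTS_THRESH (128), every counter value that satisfies the
soft-limit mask also satisfies the threshold mask, so nesting the second check
inside the first loses no soft-limit events. A tiny stand-alone check of that
property (hypothetical, for illustration only):

	#include <assert.h>

	int main(void)
	{
		unsigned long event;

		for (event = 0; event < (1UL << 20); event++)
			if (!(event & ((1UL << 10) - 1)))		/* soft-limit event */
				assert(!(event & ((1UL << 7) - 1)));	/* threshold event too */
		return 0;
	}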
+
 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
        return container_of(cgroup_subsys_state(cont,
@@ -1244,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                                }
                        }
                }
-               if (!mem_cgroup_local_usage(&victim->stat)) {
+               if (!mem_cgroup_local_usage(victim)) {
                        /* this cgroup's local usage == 0 */
                        css_put(&victim->css);
                        continue;
@@ -1275,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
        return total;
 }
 
-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
 {
-       bool ret = false;
-       struct mem_cgroup *mem;
-       struct mm_struct *mm;
+       int *val = (int *)data;
+       int x;
+       /*
+        * Logically, we could stop scanning as soon as we find a memcg
+        * that is already locked. But considering unlock operations and
+        * the creation/removal of memcgs, scanning them all is simpler.
+        */
+       x = atomic_inc_return(&mem->oom_lock);
+       *val = max(x, *val);
+       return 0;
+}
+/*
+ * Check whether the OOM killer is already running under our hierarchy.
+ * If someone else is already running it, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+       int lock_count = 0;
 
-       rcu_read_lock();
-       mm = task->mm;
-       if (!mm)
-               mm = &init_mm;
-       mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-       if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-               ret = true;
-       rcu_read_unlock();
-       return ret;
+       mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+
+       if (lock_count == 1)
+               return true;
+       return false;
 }
 
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
 {
-       mem->last_oom_jiffies = jiffies;
+       /*
+        * When a new child is created while the hierarchy is under oom,
+        * mem_cgroup_oom_lock() may not be called. We have to use
+        * atomic_add_unless() here.
+        */
+       atomic_add_unless(&mem->oom_lock, -1, 0);
        return 0;
 }
 
-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+{
+       mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 {
-       mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+       DEFINE_WAIT(wait);
+       bool locked;
+
+       /* At first, try to take the OOM lock on the hierarchy under mem. */
+       mutex_lock(&memcg_oom_mutex);
+       locked = mem_cgroup_oom_lock(mem);
+       /*
+        * Even if signal_pending(), we can't quit charge() loop without
+        * accounting. So, an UNINTERRUPTIBLE sleep would be appropriate. But
+        * SIGKILL under OOM should always be welcome, so use TASK_KILLABLE here.
+        */
+       if (!locked)
+               prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+       mutex_unlock(&memcg_oom_mutex);
+
+       if (locked)
+               mem_cgroup_out_of_memory(mem, mask);
+       else {
+               schedule();
+               finish_wait(&memcg_oom_waitq, &wait);
+       }
+       mutex_lock(&memcg_oom_mutex);
+       mem_cgroup_oom_unlock(mem);
+       /*
+        * Here, we use a global waitq. Would a more fine-grained waitq be better?
+        * Assume the following hierarchy.
+        * A/
+        *   01
+        *   02
+        * Assume OOM happens in both A and 01 at the same time. They are
+        * mutually exclusive by the lock. (A kill in 01 helps A.)
+        * If we used a per-memcg waitq, we would have to wake up waiters on A
+        * and 02 in addition to waiters on 01. We use the global waitq to avoid
+        * that mess; it should not be a big problem.
+        * (And a task may be moved to other groups while it's waiting for OOM.)
+        */
+       wake_up_all(&memcg_oom_waitq);
+       mutex_unlock(&memcg_oom_mutex);
+
+       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+               return false;
+       /* Give chance to dying process */
+       schedule_timeout(1);
+       return true;
 }
 
 /*
@@ -1310,9 +1351,6 @@ static void record_last_oom(struct mem_cgroup *mem)
 void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
        struct mem_cgroup *mem;
-       struct mem_cgroup_stat *stat;
-       struct mem_cgroup_stat_cpu *cpustat;
-       int cpu;
        struct page_cgroup *pc;
 
        pc = lookup_page_cgroup(page);
@@ -1328,13 +1366,10 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
                goto done;
 
        /*
-        * Preemption is already disabled, we don't need get_cpu()
+        * Preemption is already disabled. We can use __this_cpu_xxx
         */
-       cpu = smp_processor_id();
-       stat = &mem->stat;
-       cpustat = &stat->cpustat[cpu];
+       __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
 
-       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
 done:
        unlock_page_cgroup(pc);
 }
@@ -1400,7 +1435,7 @@ static void drain_local_stock(struct work_struct *dummy)
 
 /*
  * Cache charges(val) which is from res_counter, to local per_cpu area.
- * This will be consumed by consumt_stock() function, later.
+ * This will be consumed by consume_stock() function, later.
  */
 static void refill_stock(struct mem_cgroup *mem, int val)
 {
@@ -1471,19 +1506,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-                       gfp_t gfp_mask, struct mem_cgroup **memcg,
-                       bool oom, struct page *page)
+                       gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
        struct mem_cgroup *mem, *mem_over_limit;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct res_counter *fail_res;
        int csize = CHARGE_SIZE;
 
-       if (unlikely(test_thread_flag(TIF_MEMDIE))) {
-               /* Don't account this! */
-               *memcg = NULL;
-               return 0;
-       }
+       /*
+        * Unlike the global VM's OOM kill, we're not under a system-level
+        * memory shortage here. So, allow a dying process to proceed in
+        * addition to a MEMDIE process.
+        */
+       if (unlikely(test_thread_flag(TIF_MEMDIE)
+                    || fatal_signal_pending(current)))
+               goto bypass;
 
        /*
         * We always charge the cgroup the mm_struct belongs to.
@@ -1510,7 +1547,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                unsigned long flags = 0;
 
                if (consume_stock(mem))
-                       goto charged;
+                       goto done;
 
                ret = res_counter_charge(&mem->res, csize, &fail_res);
                if (likely(!ret)) {
@@ -1596,29 +1633,27 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                }
 
                if (!nr_retries--) {
-                       if (oom) {
-                               mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-                               record_last_oom(mem_over_limit);
+                       if (!oom)
+                               goto nomem;
+                       if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+                               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+                               continue;
                        }
-                       goto nomem;
+                       /* When we reach here, the current task is dying. */
+                       css_put(&mem->css);
+                       goto bypass;
                }
        }
        if (csize > PAGE_SIZE)
                refill_stock(mem, csize - PAGE_SIZE);
-charged:
-       /*
-        * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-        * if they exceeds softlimit.
-        */
-       if (page && mem_cgroup_soft_limit_check(mem))
-               mem_cgroup_update_tree(mem, page);
 done:
-       if (mem_cgroup_threshold_check(mem))
-               mem_cgroup_threshold(mem);
        return 0;
 nomem:
        css_put(&mem->css);
        return -ENOMEM;
+bypass:
+       *memcg = NULL;
+       return 0;
 }
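
With the new bypass label, a zero return from __mem_cgroup_try_charge() no
longer guarantees a charged memcg: *memcg may come back NULL for a dying or
MEMDIE task. Callers therefore test both the return value and the pointer,
roughly like this (a sketch of the calling convention inside a hypothetical
caller):

	struct mem_cgroup *mem = NULL;
	int ret;

	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret)
		return ret;	/* -ENOMEM after retries: the charge really failed */
	if (!mem)
		return 0;	/* bypassed: dying/MEMDIE task, nothing was charged */
	/* ... commit the charge against "mem" ... */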
 
 /*
@@ -1738,6 +1773,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
        mem_cgroup_charge_statistics(mem, pc, true);
 
        unlock_page_cgroup(pc);
+       /*
+        * "charge_statistics" updated the event counter, so check it now.
+        * This inserts the ancestor (and the ancestor's ancestors) into the
+        * soft-limit RB-tree if they exceed their soft limit.
+        */
+       memcg_check_events(mem, pc->page);
 }
 
 /**
@@ -1761,9 +1802,6 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
        struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
        struct page *page;
-       int cpu;
-       struct mem_cgroup_stat *stat;
-       struct mem_cgroup_stat_cpu *cpustat;
 
        VM_BUG_ON(from == to);
        VM_BUG_ON(PageLRU(pc->page));
@@ -1773,18 +1811,11 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 
        page = pc->page;
        if (page_mapped(page) && !PageAnon(page)) {
-               cpu = smp_processor_id();
-               /* Update mapped_file data for mem_cgroup "from" */
-               stat = &from->stat;
-               cpustat = &stat->cpustat[cpu];
-               __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
-                                               -1);
-
-               /* Update mapped_file data for mem_cgroup "to" */
-               stat = &to->stat;
-               cpustat = &stat->cpustat[cpu];
-               __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
-                                               1);
+               /* Update mapped_file data for mem_cgroup */
+               preempt_disable();
+               __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+               preempt_enable();
        }
        mem_cgroup_charge_statistics(from, pc, false);
        if (uncharge)
@@ -1817,6 +1848,11 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
                ret = 0;
        }
        unlock_page_cgroup(pc);
+       /*
+        * Check events for both the source and the destination memcg.
+        */
+       memcg_check_events(to, pc->page);
+       memcg_check_events(from, pc->page);
        return ret;
 }
 
@@ -1845,7 +1881,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
                goto put;
 
        parent = mem_cgroup_from_cont(pcg);
-       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
+       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
        if (ret || !parent)
                goto put_back;
 
@@ -1881,7 +1917,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
        prefetchw(pc);
 
        mem = memcg;
-       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
+       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
        if (ret || !mem)
                return ret;
 
@@ -2001,14 +2037,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
        if (!mem)
                goto charge_cur_mm;
        *ptr = mem;
-       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
+       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
        /* drop extra refcnt from tryget */
        css_put(&mem->css);
        return ret;
 charge_cur_mm:
        if (unlikely(!mm))
                mm = &init_mm;
-       return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
+       return __mem_cgroup_try_charge(mm, mask, ptr, true);
 }
 
 static void
@@ -2185,10 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        mz = page_cgroup_zoneinfo(pc);
        unlock_page_cgroup(pc);
 
-       if (mem_cgroup_soft_limit_check(mem))
-               mem_cgroup_update_tree(mem, page);
-       if (mem_cgroup_threshold_check(mem))
-               mem_cgroup_threshold(mem);
+       memcg_check_events(mem, page);
        /* at swapout, this memcg will be accessed to record to swap */
        if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
                css_put(&mem->css);
@@ -2397,8 +2430,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
        unlock_page_cgroup(pc);
 
        if (mem) {
-               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
-                                               page);
+               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
                css_put(&mem->css);
        }
        *ptr = mem;
@@ -2885,7 +2917,7 @@ static int
 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
 {
        struct mem_cgroup_idx_data *d = data;
-       d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+       d->val += mem_cgroup_read_stat(mem, d->idx);
        return 0;
 }
 
@@ -3134,18 +3166,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
        s64 val;
 
        /* per cpu stat */
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
        s->stat[MCS_CACHE] += val * PAGE_SIZE;
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
        s->stat[MCS_RSS] += val * PAGE_SIZE;
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
        s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
        s->stat[MCS_PGPGIN] += val;
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
        s->stat[MCS_PGPGOUT] += val;
        if (do_swap_account) {
-               val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+               val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
                s->stat[MCS_SWAP] += val * PAGE_SIZE;
        }
 
@@ -3273,25 +3305,6 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
        return 0;
 }
 
-static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
-{
-       bool ret = false;
-       int cpu;
-       s64 val;
-       struct mem_cgroup_stat_cpu *cpustat;
-
-       cpu = get_cpu();
-       cpustat = &mem->stat.cpustat[cpu];
-       val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
-       if (unlikely(val < 0)) {
-               __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
-                               THRESHOLDS_EVENTS_THRESH);
-               ret = true;
-       }
-       put_cpu();
-       return ret;
-}
-
 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 {
        struct mem_cgroup_threshold_ary *t;
@@ -3428,12 +3441,6 @@ static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
                }
        }
 
-       /*
-        * We need to increment refcnt to be sure that all thresholds
-        * will be unregistered before calling __mem_cgroup_free()
-        */
-       mem_cgroup_get(memcg);
-
        if (type == _MEM)
                rcu_assign_pointer(memcg->thresholds, thresholds_new);
        else
@@ -3527,9 +3534,6 @@ assign:
        /* To be sure that nobody uses thresholds before freeing it */
        synchronize_rcu();
 
-       for (i = 0; i < thresholds->size - size; i++)
-               mem_cgroup_put(memcg);
-
        kfree(thresholds);
 unlock:
        mutex_unlock(&memcg->thresholds_lock);
@@ -3676,17 +3680,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
        kfree(mem->info.nodeinfo[node]);
 }
 
-static int mem_cgroup_size(void)
-{
-       int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
-       return sizeof(struct mem_cgroup) + cpustat_size;
-}
-
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
        struct mem_cgroup *mem;
-       int size = mem_cgroup_size();
+       int size = sizeof(struct mem_cgroup);
 
+       /* Can be very big if MAX_NUMNODES is very big */
        if (size < PAGE_SIZE)
                mem = kmalloc(size, GFP_KERNEL);
        else
@@ -3694,6 +3693,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 
        if (mem)
                memset(mem, 0, size);
+       if (!mem)
+               return NULL;
+
+       mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+       if (!mem->stat) {
+               if (size < PAGE_SIZE)
+                       kfree(mem);
+               else
+                       vfree(mem);
+               mem = NULL;
+       }
        return mem;
 }
 
@@ -3718,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
        for_each_node_state(node, N_POSSIBLE)
                free_mem_cgroup_per_zone_info(mem, node);
 
-       if (mem_cgroup_size() < PAGE_SIZE)
+       free_percpu(mem->stat);
+       if (sizeof(struct mem_cgroup) < PAGE_SIZE)
                kfree(mem);
        else
                vfree(mem);
@@ -3930,8 +3938,7 @@ one_by_one:
                        batch_count = PRECHARGE_COUNT_AT_ONCE;
                        cond_resched();
                }
-               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem,
-                                                               false, NULL);
+               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
                if (ret || !mem)
                        /* mem_cgroup_clear_mc() will do uncharge later */
                        return -ENOMEM;