memory controller: soft limit reclaim on contention
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90f0b13..011aba6 100644
@@ -139,6 +139,8 @@ struct mem_cgroup_per_zone {
        unsigned long long      usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        bool                    on_tree;
+       struct mem_cgroup       *mem;           /* Back pointer, we cannot */
+                                               /* use container_of        */
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)      ((mz)->count[(idx)])
@@ -228,6 +230,13 @@ struct mem_cgroup {
        struct mem_cgroup_stat stat;
 };
 
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim() and in
+ * mem_cgroup_soft_limit_reclaim(), used during soft limit reclaim to
+ * prevent infinite loops, if they ever occur.
+ */
+#define        MEM_CGROUP_MAX_RECLAIM_LOOPS            (100)
+#define        MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
+
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -259,6 +268,8 @@ enum charge_type {
 #define MEM_CGROUP_RECLAIM_NOSWAP      (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT  0x1
 #define MEM_CGROUP_RECLAIM_SHRINK      (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT    0x2
+#define MEM_CGROUP_RECLAIM_SOFT                (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
@@ -299,7 +310,7 @@ soft_limit_tree_from_page(struct page *page)
 }
 
 static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
                                struct mem_cgroup_per_zone *mz,
                                struct mem_cgroup_tree_per_zone *mctz)
 {
@@ -311,7 +322,6 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
                return;
 
        mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
-       spin_lock(&mctz->lock);
        while (*p) {
                parent = *p;
                mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -328,6 +338,26 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
        rb_link_node(&mz->tree_node, parent, p);
        rb_insert_color(&mz->tree_node, &mctz->rb_root);
        mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       if (!mz->on_tree)
+               return;
+       rb_erase(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = false;
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       spin_lock(&mctz->lock);
+       __mem_cgroup_insert_exceeded(mem, mz, mctz);
        spin_unlock(&mctz->lock);
 }
 
@@ -337,8 +367,7 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
                                struct mem_cgroup_tree_per_zone *mctz)
 {
        spin_lock(&mctz->lock);
-       rb_erase(&mz->tree_node, &mctz->rb_root);
-       mz->on_tree = false;
+       __mem_cgroup_remove_exceeded(mem, mz, mctz);
        spin_unlock(&mctz->lock);
 }
 
@@ -408,6 +437,47 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
        }
 }
 
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+       return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
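+/*
+ * Find the zone's worst soft limit offender: the tree is keyed by
+ * usage_in_excess, so the rightmost node is the memcg exceeding its
+ * soft limit by the largest amount.  Caller must hold mctz->lock.
+ */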
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct rb_node *rightmost = NULL;
+       struct mem_cgroup_per_zone *mz = NULL;
+
+retry:
+       rightmost = rb_last(&mctz->rb_root);
+       if (!rightmost)
+               goto done;              /* Nothing to reclaim from */
+
+       mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+       /*
+        * Remove the node now but someone else can add it back;
+        * we will add it back at the end of reclaim to its correct
+        * position in the tree.
+        */
+       __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
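+       /*
+        * Skip this memcg if it no longer exceeds its soft limit or if
+        * it is going away (css_tryget() fails), and pick the next
+        * largest offender instead.
+        */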
+       if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+               !css_tryget(&mz->mem->css))
+               goto retry;
+done:
+       return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct mem_cgroup_per_zone *mz;
+
+       spin_lock(&mctz->lock);
+       mz = __mem_cgroup_largest_soft_limit_node(mctz);
+       spin_unlock(&mctz->lock);
+       return mz;
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                         struct page_cgroup *pc,
                                         bool charge)
@@ -1037,6 +1107,7 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * If shrink==true, this returns immediately to avoid freeing too much.
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+                                               struct zone *zone,
                                                gfp_t gfp_mask,
                                                unsigned long reclaim_options)
 {
@@ -1045,23 +1116,53 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
        int loop = 0;
        bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
        bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
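+       /*
+        * check_soft selects soft limit reclaim: reclaim is then
+        * restricted to @zone and stops once enough of the soft limit
+        * excess has been reclaimed.
+        */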
+       bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+       unsigned long excess = mem_cgroup_get_excess(root_mem);
 
        /* If memsw_is_minimum==1, swap-out is of-no-use. */
        if (root_mem->memsw_is_minimum)
                noswap = true;
 
-       while (loop < 2) {
+       while (1) {
                victim = mem_cgroup_select_victim(root_mem);
-               if (victim == root_mem)
+               if (victim == root_mem) {
                        loop++;
+                       if (loop >= 2) {
+                               /*
+                                * If we have not been able to reclaim
+                                * anything, it might be because there are
+                                * no reclaimable pages under this hierarchy
+                                */
+                               if (!check_soft || !total) {
+                                       css_put(&victim->css);
+                                       break;
+                               }
+                               /*
+                                * We want to do more targeted reclaim.
+                                * excess >> 2 is not too large, so we do not
+                                * reclaim too much, nor so small that we keep
+                                * coming back to reclaim from this cgroup.
+                                */
+                               if (total >= (excess >> 2) ||
+                                       (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+                                       css_put(&victim->css);
+                                       break;
+                               }
+                       }
+               }
                if (!mem_cgroup_local_usage(&victim->stat)) {
                        /* this cgroup's local usage == 0 */
                        css_put(&victim->css);
                        continue;
                }
                /* we use swappiness of local cgroup */
-               ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-                                                  get_swappiness(victim));
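+               /*
+                * For soft limit reclaim, restrict reclaim to the zone
+                * passed in; otherwise reclaim from the whole memcg.
+                */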
+               if (check_soft)
+                       ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+                               noswap, get_swappiness(victim), zone,
+                               zone->zone_pgdat->node_id);
+               else
+                       ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+                                               noswap, get_swappiness(victim));
                css_put(&victim->css);
                /*
                 * At shrinking usage, we can't check we should stop here or
@@ -1071,7 +1172,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                if (shrink)
                        return ret;
                total += ret;
-               if (mem_cgroup_check_under_limit(root_mem))
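+               /*
+                * Soft limit reclaim is done once usage drops back
+                * below the soft limit.
+                */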
+               if (check_soft) {
+                       if (res_counter_check_under_soft_limit(&root_mem->res))
+                               return total;
+               } else if (mem_cgroup_check_under_limit(root_mem))
                        return 1 + total;
        }
        return total;
@@ -1206,8 +1310,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                if (!(gfp_mask & __GFP_WAIT))
                        goto nomem;
 
-               ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-                                                       flags);
+               ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+                                               gfp_mask, flags);
                if (ret)
                        continue;
 
@@ -2018,8 +2122,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
 
-               progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
-                                                  MEM_CGROUP_RECLAIM_SHRINK);
+               progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+                                               GFP_KERNEL,
+                                               MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
                /* Usage is reduced ? */
                if (curusage >= oldusage)
@@ -2071,7 +2176,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
 
-               mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
+               mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
                                                MEM_CGROUP_RECLAIM_NOSWAP |
                                                MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
@@ -2084,6 +2189,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
        return ret;
 }
 
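+/*
+ * Reclaim pages for @zone from the memcgs that currently exceed their
+ * soft limit by the largest amount, as tracked in the per-node-zone
+ * soft limit tree.
+ */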
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+                                               gfp_t gfp_mask, int nid,
+                                               int zid)
+{
+       unsigned long nr_reclaimed = 0;
+       struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+       unsigned long reclaimed;
+       int loop = 0;
+       struct mem_cgroup_tree_per_zone *mctz;
+
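+       /* Soft limit reclaim is only attempted for order-0 allocations. */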
+       if (order > 0)
+               return 0;
+
+       mctz = soft_limit_tree_node_zone(nid, zid);
+       /*
+        * This loop can run a while, especially if mem_cgroups continuously
+        * keep exceeding their soft limit and putting the system under
+        * pressure.
+        */
+       do {
+               if (next_mz)
+                       mz = next_mz;
+               else
+                       mz = mem_cgroup_largest_soft_limit_node(mctz);
+               if (!mz)
+                       break;
+
+               reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+                                               gfp_mask,
+                                               MEM_CGROUP_RECLAIM_SOFT);
+               nr_reclaimed += reclaimed;
+               spin_lock(&mctz->lock);
+
+               /*
+                * If we failed to reclaim anything from this memory cgroup,
+                * it is time to move on to the next cgroup.
+                */
+               next_mz = NULL;
+               if (!reclaimed) {
+                       do {
+                               /*
+                                * Loop until we find yet another one.
+                                *
+                                * By the time we get the soft_limit lock
+                                * again, someone might have added the
+                                * group back on the RB tree. Iterate to
+                                * make sure we get a different mem.
+                                * mem_cgroup_largest_soft_limit_node returns
+                                * NULL if no other cgroup is present on
+                                * the tree
+                                */
+                               next_mz =
+                               __mem_cgroup_largest_soft_limit_node(mctz);
+                               if (next_mz == mz) {
+                                       css_put(&next_mz->mem->css);
+                                       next_mz = NULL;
+                               } else /* next_mz == NULL or other memcg */
+                                       break;
+                       } while (1);
+               }
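+               /*
+                * Refresh the excess value and take the node off the
+                * tree again in case it was re-added while we dropped
+                * the lock for reclaim.
+                */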
+               mz->usage_in_excess =
+                       res_counter_soft_limit_excess(&mz->mem->res);
+               __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+               /*
+                * One school of thought says that we should not add
+                * back the node to the tree if reclaim returns 0.
+                * But our reclaim could return 0 simply because, due
+                * to priority, we are exposing a smaller subset of
+                * memory to reclaim from. Consider this as a longer
+                * term TODO.
+                */
+               if (mz->usage_in_excess)
+                       __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+               spin_unlock(&mctz->lock);
+               css_put(&mz->mem->css);
+               loop++;
+               /*
+                * Could not reclaim anything and there are no more
+                * mem cgroups to try or we seem to be looping without
+                * reclaiming anything.
+                */
+               if (!nr_reclaimed &&
+                       (next_mz == NULL ||
+                       loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+                       break;
+       } while (!nr_reclaimed);
+       if (next_mz)
+               css_put(&next_mz->mem->css);
+       return nr_reclaimed;
+}
+
 /*
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2686,6 +2882,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
                for_each_lru(l)
                        INIT_LIST_HEAD(&mz->lists[l]);
                mz->usage_in_excess = 0;
+               mz->on_tree = false;
+               mz->mem = mem;
        }
        return 0;
 }