memcg: reclaim memory from nodes in round-robin order

author Ying Han <yinghan@google.com>

Thu, 26 May 2011 23:25:33 +0000 (16:25 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 27 May 2011 00:12:35 +0000 (17:12 -0700)
author Ying Han <yinghan@google.com>
Thu, 26 May 2011 23:25:33 +0000 (16:25 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2011 00:12:35 +0000 (17:12 -0700)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 0629121..1605211 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -106,6 +106,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
   */
  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
  unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
                                        struct zone *zone,
                                        enum lru_list lru);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index fc62c71..1520efd 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -231,6 +231,11 @@ struct mem_cgroup {
          * reclaimed from.
          */
         int last_scanned_child;
+       int last_scanned_node;
+#if MAX_NUMNODES > 1
+       nodemask_t      scan_nodes;
+       unsigned long   next_scan_node_update;
+#endif
         /*
          * Should the accounting and control be hierarchical, per subtree?
          */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
         preempt_enable();
  }
  
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+       struct mem_cgroup_per_zone *mz;
+       u64 total = 0;  /* running sum of the idx LRU counter for node nid */
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {      /* every zone of nid */
+               mz = mem_cgroup_zoneinfo(mem, nid, zid);
+               total += MEM_CGROUP_ZSTAT(mz, idx);
+       }
+       return total;   /* narrowed from u64 to the unsigned long return type */
+}
  static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
                                         enum lru_list idx)
  {
-       int nid, zid;
-       struct mem_cgroup_per_zone *mz;
+       int nid;
         u64 total = 0;
  
         for_each_online_node(nid)
-               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-                       mz = mem_cgroup_zoneinfo(mem, nid, zid);
-                       total += MEM_CGROUP_ZSTAT(mz, idx);
-               }
+               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
         return total;
  }
  
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
         return ret;
  }
  
+#if MAX_NUMNODES > 1
+
+/*
+ * Rebuild mem->scan_nodes: nodes where this memcg has file LRU pages (or
+ * anon LRU pages, when total_swap_pages != 0). Updating it on every call is
+ * not worth it - even with an empty or stale mask we can start from some node
+ * and traverse all nodes via the zonelist. So update it loosely, every 10 secs.
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+       int nid;
+
+       if (time_after(mem->next_scan_node_update, jiffies))
+               return; /* next scheduled update is still in the future */
+
+       mem->next_scan_node_update = jiffies + 10*HZ;
+       /* start from every N_HIGH_MEMORY node, then clear the unused ones */
+       mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+                       continue;       /* node has file LRU pages: keep it */
+
+               if (total_swap_pages &&
+                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+                       continue;       /* anon LRU pages and swap pages: keep it */
+               node_clear(nid, mem->scan_nodes);
+       }
+}
+
+/*
+ * Select the node where we start reclaim from. Because all we need is to
+ * reduce the usage counter, starting from anywhere is OK. Reclaiming
+ * memory from the current node has pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node
+ * which we'll use or we've used. So, it may make the LRU bad. And if several
+ * threads hit limits, they will contend on a node. But freeing from a remote
+ * node means more cost for memory reclaim because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+       int node;
+
+       mem_cgroup_may_update_nodemask(mem);
+       node = mem->last_scanned_node;
+
+       node = next_node(node, mem->scan_nodes);
+       if (node == MAX_NUMNODES)       /* nothing after the last pick: wrap */
+               node = first_node(mem->scan_nodes);
+       /*
+        * We call this when we hit the limit, not when pages are added to LRU.
+        * The LRUs may hold no pages because all pages are UNEVICTABLE or the
+        * memcg is too small and none of its pages are on an LRU. In that
+        * case, we use the current node.
+        */
+       if (unlikely(node == MAX_NUMNODES))
+               node = numa_node_id();
+
+       mem->last_scanned_node = node;  /* remembered for the next call */
+       return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+       return 0;       /* MAX_NUMNODES <= 1 build: always pick node 0 */
+}
+#endif
+
  /*
   * Scan the hierarchy if needed to reclaim memory. We remember the last child
   * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                 res_counter_init(&mem->memsw, NULL);
         }
         mem->last_scanned_child = 0;
+       mem->last_scanned_node = MAX_NUMNODES;
         INIT_LIST_HEAD(&mem->oom_notify);
  
         if (parent)
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 884ae08..b087587 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,6 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
  {
         struct zonelist *zonelist;
         unsigned long nr_reclaimed;
+       int nid;
         struct scan_control sc = {
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
@@ -2242,7 +2243,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                 .gfp_mask = sc.gfp_mask,
         };
  
-       zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+       /*
+        * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+        * care where the pages come from. So the node where we start the
+        * scan does not need to be the current node.
+        */
+       nid = mem_cgroup_select_victim_node(mem_cont);
+
+       zonelist = NODE_DATA(nid)->node_zonelists;
  
         trace_mm_vmscan_memcg_reclaim_begin(0,
                                             sc.may_writepage,
author	Ying Han <yinghan@google.com>
	Thu, 26 May 2011 23:25:33 +0000 (16:25 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 27 May 2011 00:12:35 +0000 (17:12 -0700)
include/linux/memcontrol.h		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history