Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/lrg/voltage-2.6
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e37c44d..e013b8e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
        MEM_CGROUP_TARGET_SOFTLIMIT,
+       MEM_CGROUP_TARGET_NUMAINFO,
        MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
        long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
        int last_scanned_node;
 #if MAX_NUMNODES > 1
        nodemask_t      scan_nodes;
-       unsigned long   next_scan_node_update;
+       atomic_t        numainfo_events;
+       atomic_t        numainfo_updating;
 #endif
        /*
         * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
        return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-       long ret;
-
-       ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-       ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-       return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
                                         bool charge)
 {
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
        case MEM_CGROUP_TARGET_SOFTLIMIT:
                next = val + SOFTLIMIT_EVENTS_TARGET;
                break;
+       case MEM_CGROUP_TARGET_NUMAINFO:
+               next = val + NUMAINFO_EVENTS_TARGET;
+               break;
        default:
                return;
        }
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
                mem_cgroup_threshold(mem);
                __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
                if (unlikely(__memcg_event_check(mem,
-                       MEM_CGROUP_TARGET_SOFTLIMIT))){
+                            MEM_CGROUP_TARGET_SOFTLIMIT))) {
                        mem_cgroup_update_tree(mem, page);
                        __mem_cgroup_target_update(mem,
-                               MEM_CGROUP_TARGET_SOFTLIMIT);
+                                                  MEM_CGROUP_TARGET_SOFTLIMIT);
                }
+#if MAX_NUMNODES > 1
+               if (unlikely(__memcg_event_check(mem,
+                       MEM_CGROUP_TARGET_NUMAINFO))) {
+                       atomic_inc(&mem->numainfo_events);
+                       __mem_cgroup_target_update(mem,
+                               MEM_CGROUP_TARGET_NUMAINFO);
+               }
+#endif
        }
 }
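The hunk above extends memcg's event machinery: a running per-memcg event count is compared against a per-target firing point, and whenever a target fires its firing point is advanced by that target's step (128 events for thresholds, 1024 for soft-limit and NUMA-info updates); the NUMAINFO check is nested inside the cheaper THRESH check. A minimal userspace sketch of that pattern follows; the names and the single non-atomic counter are illustrative only and do not correspond to the kernel's data structures.

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical analogue of the memcg event targets. */
    enum event_target { TARGET_THRESH, TARGET_SOFTLIMIT, TARGET_NUMAINFO, NTARGETS };

    static const unsigned long target_step[NTARGETS] = { 128, 1024, 1024 };

    struct event_state {
        unsigned long events;          /* running event count */
        unsigned long next[NTARGETS];  /* firing point per target */
    };

    /* Has the running count reached this target's firing point? */
    static bool event_check(const struct event_state *s, enum event_target t)
    {
        return s->events >= s->next[t];
    }

    /* Advance the firing point by the target's step. */
    static void target_update(struct event_state *s, enum event_target t)
    {
        s->next[t] = s->events + target_step[t];
    }

    int main(void)
    {
        struct event_state s = { 0, { 128, 1024, 1024 } };

        for (unsigned long i = 1; i <= 4096; i++) {
            s.events = i;
            if (!event_check(&s, TARGET_THRESH))
                continue;
            /* cheap per-128-events work would go here */
            target_update(&s, TARGET_THRESH);
            if (event_check(&s, TARGET_NUMAINFO)) {
                printf("numainfo refresh hint at event %lu\n", i);
                target_update(&s, TARGET_NUMAINFO);
            }
        }
        return 0;
    }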
 
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
        return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
                                                        int nid)
 {
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
        return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+                                                       int nid)
+{
+       unsigned long ret;
+
+       ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+               mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+       return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
        u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
        return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-       unsigned long ret;
-
-       ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-               mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-       return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
        u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
        return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable - check a node for reclaimable pages
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap: specify true here if the user wants file-only information.
+ *
+ * This function checks whether the specified memcg contains any
+ * reclaimable pages on the given node. It returns true if there are
+ * any reclaimable pages there, and false otherwise.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+               int nid, bool noswap)
+{
+       if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+               return true;
+       if (noswap || !total_swap_pages)
+               return false;
+       if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+               return true;
+       return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
        int nid;
-
-       if (time_after(mem->next_scan_node_update, jiffies))
+       /*
+        * numainfo_events > 0 means there have been at least
+        * NUMAINFO_EVENTS_TARGET pagein/pageout events since the last update.
+        */
+       if (!atomic_read(&mem->numainfo_events))
+               return;
+       if (atomic_inc_return(&mem->numainfo_updating) > 1)
                return;
 
-       mem->next_scan_node_update = jiffies + 10*HZ;
        /* make a nodemask where this memcg uses memory from */
        mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
        for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-                       continue;
-
-               if (total_swap_pages &&
-                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-                       continue;
-               node_clear(nid, mem->scan_nodes);
+               if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+                       node_clear(nid, mem->scan_nodes);
        }
+
+       atomic_set(&mem->numainfo_events, 0);
+       atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
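With next_scan_node_update gone, the jiffies-based rate limit is replaced by two counters: numainfo_events records that enough page-in/page-out activity has happened to justify a refresh, and numainfo_updating lets exactly one caller rebuild scan_nodes while concurrent callers return immediately (atomic_inc_return() > 1). A standalone sketch of that gate, using C11 atomics as a stand-in for the kernel's atomic_t; the structure and function names below are hypothetical.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Hypothetical analogue of the memcg NUMA-info fields. */
    struct scan_state {
        atomic_int events;    /* pending activity since the last refresh */
        atomic_int updating;  /* non-zero while a refresh is in progress */
    };

    /* Returns true if this caller performed the refresh. */
    static bool maybe_refresh(struct scan_state *s)
    {
        /* No activity since the last refresh: the cached mask is still good. */
        if (atomic_load(&s->events) == 0)
            return false;

        /* Only the caller that moves 'updating' off zero does the work. */
        if (atomic_fetch_add(&s->updating, 1) > 0)
            return false;

        /* ... rebuild the cached node mask here ... */

        atomic_store(&s->events, 0);
        atomic_store(&s->updating, 0);
        return true;
    }

    int main(void)
    {
        struct scan_state s = { 3, 0 };  /* pretend three event batches are pending */
        return maybe_refresh(&s) ? 0 : 1;
    }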
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
        return node;
 }
 
+/*
+ * Check whether any node contains reclaimable pages for this memcg.
+ * For a quick scan, we make use of scan_nodes. This allows us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information, so we need to double-check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+       int nid;
+
+       /*
+        * Quick check: make use of scan_nodes so that we can
+        * skip unused nodes.
+        */
+       if (!nodes_empty(mem->scan_nodes)) {
+               for (nid = first_node(mem->scan_nodes);
+                    nid < MAX_NUMNODES;
+                    nid = next_node(nid, mem->scan_nodes)) {
+
+                       if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+                               return true;
+               }
+       }
+       /*
+        * Check rest of nodes.
+        */
+       for_each_node_state(nid, N_HIGH_MEMORY) {
+               if (node_isset(nid, mem->scan_nodes))
+                       continue;
+               if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+                       return true;
+       }
+       return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
        return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+       return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
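mem_cgroup_reclaimable() is therefore a two-phase scan: a quick pass over the cached scan_nodes mask, then a second pass over the remaining N_HIGH_MEMORY nodes, because the cache is refreshed lazily and may miss nodes that only recently gained reclaimable pages. A toy illustration of the same check order over a plain bitmask; the predicate and masks below are made up for the example.

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_NODES 8

    /* Hypothetical per-node predicate standing in for
     * test_mem_cgroup_node_reclaimable(). */
    static bool node_reclaimable(int nid)
    {
        return nid == 6;
    }

    /* Quick pass over the cached mask, then double-check the rest. */
    static bool any_reclaimable(unsigned int online_mask, unsigned int cached_mask)
    {
        for (int nid = 0; nid < MAX_NODES; nid++)
            if ((cached_mask & (1u << nid)) && node_reclaimable(nid))
                return true;

        for (int nid = 0; nid < MAX_NODES; nid++) {
            if (cached_mask & (1u << nid))
                continue;  /* already covered by the quick pass */
            if ((online_mask & (1u << nid)) && node_reclaimable(nid))
                return true;
        }
        return false;
    }

    int main(void)
    {
        /* Node 6 is online but missing from the stale cached mask. */
        printf("reclaimable: %d\n", any_reclaimable(0xffu, 0x03u));
        return 0;
    }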
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
        excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
        /* If memsw_is_minimum==1, swap-out is of-no-use. */
-       if (root_mem->memsw_is_minimum)
+       if (!check_soft && root_mem->memsw_is_minimum)
                noswap = true;
 
        while (1) {
                victim = mem_cgroup_select_victim(root_mem);
                if (victim == root_mem) {
                        loop++;
-                       if (loop >= 1)
-                               drain_all_stock_async();
+                       /*
+                        * We do not drain per-cpu cached charges during
+                        * soft limit reclaim because global reclaim doesn't
+                        * care about charges. It tries to free some memory,
+                        * and returning charges would not free any.
+                        */
+                       if (!check_soft && loop >= 1)
+                               drain_all_stock_async(root_mem);
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                                }
                        }
                }
-               if (!mem_cgroup_local_usage(victim)) {
+               if (!mem_cgroup_reclaimable(victim, noswap)) {
                        /* this cgroup's local usage == 0 */
                        css_put(&victim->css);
                        continue;
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
        struct mem_cgroup *cached; /* this never be root cgroup */
        unsigned int nr_pages;
        struct work_struct work;
+       unsigned long flags;
+#define FLUSHING_CACHED_CHARGE (0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
        struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
        drain_stock(stock);
+       clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
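FLUSHING_CACHED_CHARGE pairs with the scheduling change further down: drain_all_stock_async() queues the per-cpu work only when it wins the test_and_set_bit(), and drain_local_stock() clears the bit once the stock is emptied, so the same CPU's drain is never queued twice. A rough userspace analogue of that set-before-queue / clear-after-run handshake, with C11 atomics standing in for the kernel bitops and purely illustrative names:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define FLUSHING 0x1u

    /* Hypothetical analogue of one per-cpu stock. */
    struct stock {
        atomic_uint flags;
        unsigned int nr_pages;   /* cached charges, illustrative only */
    };

    /* Queue side: only the caller that sets the flag schedules the work. */
    static bool try_queue_drain(struct stock *st)
    {
        unsigned int old = atomic_fetch_or(&st->flags, FLUSHING);
        return (old & FLUSHING) == 0;   /* true: we own the pending drain */
    }

    /* Worker side: drain, then allow the next queuing attempt. */
    static void drain_work(struct stock *st)
    {
        st->nr_pages = 0;                        /* give back cached charges */
        atomic_fetch_and(&st->flags, ~FLUSHING);
    }

    int main(void)
    {
        struct stock st = { 0, 32 };

        printf("first queue:  %d\n", try_queue_drain(&st));  /* 1 */
        printf("second queue: %d\n", try_queue_drain(&st));  /* 0: already pending */
        drain_work(&st);
        printf("after drain:  %d\n", try_queue_drain(&st));  /* 1 again */
        return 0;
    }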
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-       int cpu;
-       /* This function is for scheduling "drain" in asynchronous way.
-        * The result of "drain" is not directly handled by callers. Then,
-        * if someone is calling drain, we don't have to call drain more.
-        * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-        * there is a race. We just do loose check here.
+       int cpu, curcpu;
+       /*
+        * If someone is already draining, avoid adding more kworker runs.
         */
-       if (atomic_read(&memcg_drain_count))
+       if (!mutex_trylock(&percpu_charge_mutex))
                return;
        /* Notify other cpus that system-wide "drain" is running */
-       atomic_inc(&memcg_drain_count);
        get_online_cpus();
+       /*
+        * Get a hint for avoiding draining charges on the current cpu,
+        * which must be exhausted by our charging.  It is not required that
+        * this be a precise check, so we use raw_smp_processor_id() instead of
+        * getcpu()/putcpu().
+        */
+       curcpu = raw_smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-               schedule_work_on(cpu, &stock->work);
+               struct mem_cgroup *mem;
+
+               if (cpu == curcpu)
+                       continue;
+
+               mem = stock->cached;
+               if (!mem)
+                       continue;
+               if (mem != root_mem) {
+                       if (!root_mem->use_hierarchy)
+                               continue;
+                       /* check whether "mem" is under tree of "root_mem" */
+                       if (!css_is_ancestor(&mem->css, &root_mem->css))
+                               continue;
+               }
+               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+                       schedule_work_on(cpu, &stock->work);
        }
        put_online_cpus();
-       atomic_dec(&memcg_drain_count);
+       mutex_unlock(&percpu_charge_mutex);
        /* We don't wait for flush_work */
 }
 
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
        /* called when force_empty is called */
-       atomic_inc(&memcg_drain_count);
+       mutex_lock(&percpu_charge_mutex);
        schedule_on_each_cpu(drain_local_stock);
-       atomic_dec(&memcg_drain_count);
+       mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
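Replacing memcg_drain_count with percpu_charge_mutex gives the two drain paths different behaviour from one primitive: the async path trylocks and simply returns if another drain is already in flight (it never waits for the flush), while the force_empty path takes the mutex and drains synchronously. A small pthread sketch of that trylock-vs-lock split; the function names are invented for the example.

    #include <pthread.h>
    #include <stdio.h>

    /* Hypothetical stand-in for percpu_charge_mutex. */
    static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void drain_everything(void)
    {
        /* ... schedule or run the per-CPU drain work here ... */
    }

    /* Async flavour: best effort, never blocks behind another drainer. */
    static void drain_async(void)
    {
        if (pthread_mutex_trylock(&drain_mutex) != 0)
            return;                      /* somebody is already draining */
        drain_everything();
        pthread_mutex_unlock(&drain_mutex);
    }

    /* Sync flavour: the caller really needs the drain done, so it waits. */
    static void drain_sync(void)
    {
        pthread_mutex_lock(&drain_mutex);
        drain_everything();
        pthread_mutex_unlock(&drain_mutex);
    }

    int main(void)
    {
        drain_async();
        drain_sync();
        puts("done");
        return 0;
    }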
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
        {
                .name = "numa_stat",
                .open = mem_control_numa_stat_open,
+               .mode = S_IRUGO,
        },
 #endif
 };