Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/lrg/voltage-2.6
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e37c44d..e013b8e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
        MEM_CGROUP_TARGET_SOFTLIMIT,
+       MEM_CGROUP_TARGET_NUMAINFO,
        MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
        long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
        int last_scanned_node;
 #if MAX_NUMNODES > 1
        nodemask_t      scan_nodes;
-       unsigned long   next_scan_node_update;
+       atomic_t        numainfo_events;
+       atomic_t        numainfo_updating;
 #endif
        /*
         * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
        return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-       long ret;
-
-       ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-       ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-       return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
                                         bool charge)
 {
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
        case MEM_CGROUP_TARGET_SOFTLIMIT:
                next = val + SOFTLIMIT_EVENTS_TARGET;
                break;
+       case MEM_CGROUP_TARGET_NUMAINFO:
+               next = val + NUMAINFO_EVENTS_TARGET;
+               break;
        default:
                return;
        }
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
                mem_cgroup_threshold(mem);
                __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
                if (unlikely(__memcg_event_check(mem,
-                       MEM_CGROUP_TARGET_SOFTLIMIT))){
+                            MEM_CGROUP_TARGET_SOFTLIMIT))) {
                        mem_cgroup_update_tree(mem, page);
                        __mem_cgroup_target_update(mem,
-                               MEM_CGROUP_TARGET_SOFTLIMIT);
+                                                  MEM_CGROUP_TARGET_SOFTLIMIT);
                }
+#if MAX_NUMNODES > 1
+               if (unlikely(__memcg_event_check(mem,
+                       MEM_CGROUP_TARGET_NUMAINFO))) {
+                       atomic_inc(&mem->numainfo_events);
+                       __mem_cgroup_target_update(mem,
+                               MEM_CGROUP_TARGET_NUMAINFO);
+               }
+#endif
        }
 }
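The hunk above extends memcg's event machinery: a running per-memcg event count is compared against a per-target firing point, and whenever a target fires its firing point is advanced by that target's step (128 events for thresholds, 1024 for soft-limit and NUMA-info updates); the NUMAINFO check is nested inside the cheaper THRESH check. A minimal userspace sketch of that pattern follows; the names and the single non-atomic counter are illustrative only and do not correspond to the kernel's data structures.

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical analogue of the memcg event targets. */
    enum event_target { TARGET_THRESH, TARGET_SOFTLIMIT, TARGET_NUMAINFO, NTARGETS };

    static const unsigned long target_step[NTARGETS] = { 128, 1024, 1024 };

    struct event_state {
        unsigned long events;          /* running event count */
        unsigned long next[NTARGETS];  /* firing point per target */
    };

    /* Has the running count reached this target's firing point? */
    static bool event_check(const struct event_state *s, enum event_target t)
    {
        return s->events >= s->next[t];
    }

    /* Advance the firing point by the target's step. */
    static void target_update(struct event_state *s, enum event_target t)
    {
        s->next[t] = s->events + target_step[t];
    }

    int main(void)
    {
        struct event_state s = { 0, { 128, 1024, 1024 } };

        for (unsigned long i = 1; i <= 4096; i++) {
            s.events = i;
            if (!event_check(&s, TARGET_THRESH))
                continue;
            /* cheap per-128-events work would go here */
            target_update(&s, TARGET_THRESH);
            if (event_check(&s, TARGET_NUMAINFO)) {
                printf("numainfo refresh hint at event %lu\n", i);
                target_update(&s, TARGET_NUMAINFO);
            }
        }
        return 0;
    }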
 
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
        return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
                                                        int nid)
 {
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
        return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+                                                       int nid)
+{
+       unsigned long ret;
+
+       ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+               mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+       return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
        u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
        return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-       unsigned long ret;
-
-       ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-               mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-       return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
        u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
        return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable - check a node for reclaimable pages
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap: specify true here if the user wants file-only information.
+ *
+ * This function checks whether the specified memcg contains any
+ * reclaimable pages on the given node. It returns true if there are
+ * any reclaimable pages there, and false otherwise.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+               int nid, bool noswap)
+{
+       if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+               return true;
+       if (noswap || !total_swap_pages)
+               return false;
+       if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+               return true;
+       return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
        int nid;
-
-       if (time_after(mem->next_scan_node_update, jiffies))
+       /*
+        * numainfo_events > 0 means there have been at least
+        * NUMAINFO_EVENTS_TARGET pagein/pageout events since the last update.
+        */
+       if (!atomic_read(&mem->numainfo_events))
+               return;
+       if (atomic_inc_return(&mem->numainfo_updating) > 1)
                return;
 
-       mem->next_scan_node_update = jiffies + 10*HZ;
        /* make a nodemask where this memcg uses memory from */
        mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
        for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-                       continue;
-
-               if (total_swap_pages &&
-                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-                       continue;
-               node_clear(nid, mem->scan_nodes);
+               if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+                       node_clear(nid, mem->scan_nodes);
        }
+
+       atomic_set(&mem->numainfo_events, 0);
+       atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
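With next_scan_node_update gone, the jiffies-based rate limit is replaced by two counters: numainfo_events records that enough page-in/page-out activity has happened to justify a refresh, and numainfo_updating lets exactly one caller rebuild scan_nodes while concurrent callers return immediately (atomic_inc_return() > 1). A standalone sketch of that gate, using C11 atomics as a stand-in for the kernel's atomic_t; the structure and function names below are hypothetical.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Hypothetical analogue of the memcg NUMA-info fields. */
    struct scan_state {
        atomic_int events;    /* pending activity since the last refresh */
        atomic_int updating;  /* non-zero while a refresh is in progress */
    };

    /* Returns true if this caller performed the refresh. */
    static bool maybe_refresh(struct scan_state *s)
    {
        /* No activity since the last refresh: the cached mask is still good. */
        if (atomic_load(&s->events) == 0)
            return false;

        /* Only the caller that moves 'updating' off zero does the work. */
        if (atomic_fetch_add(&s->updating, 1) > 0)
            return false;

        /* ... rebuild the cached node mask here ... */

        atomic_store(&s->events, 0);
        atomic_store(&s->updating, 0);
        return true;
    }

    int main(void)
    {
        struct scan_state s = { 3, 0 };  /* pretend three event batches are pending */
        return maybe_refresh(&s) ? 0 : 1;
    }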
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
        return node;
 }
 
+/*
+ * Check whether any node contains reclaimable pages for this memcg.
+ * For a quick scan, we make use of scan_nodes. This allows us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information, so we need to double-check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+       int nid;
+
+       /*
+        * Quick check: make use of scan_nodes so that we can
+        * skip unused nodes.
+        */
+       if (!nodes_empty(mem->scan_nodes)) {
+               for (nid = first_node(mem->scan_nodes);
+                    nid < MAX_NUMNODES;
+                    nid = next_node(nid, mem->scan_nodes)) {
+
+                       if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+                               return true;
+               }
+       }
+       /*
+        * Check rest of nodes.
+        */
+       for_each_node_state(nid, N_HIGH_MEMORY) {
+               if (node_isset(nid, mem->scan_nodes))
+                       continue;
+               if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+                       return true;
+       }
+       return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
        return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+       return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
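mem_cgroup_reclaimable() is therefore a two-phase scan: a quick pass over the cached scan_nodes mask, then a second pass over the remaining N_HIGH_MEMORY nodes, because the cache is refreshed lazily and may miss nodes that only recently gained reclaimable pages. A toy illustration of the same check order over a plain bitmask; the predicate and masks below are made up for the example.

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_NODES 8

    /* Hypothetical per-node predicate standing in for
     * test_mem_cgroup_node_reclaimable(). */
    static bool node_reclaimable(int nid)
    {
        return nid == 6;
    }

    /* Quick pass over the cached mask, then double-check the rest. */
    static bool any_reclaimable(unsigned int online_mask, unsigned int cached_mask)
    {
        for (int nid = 0; nid < MAX_NODES; nid++)
            if ((cached_mask & (1u << nid)) && node_reclaimable(nid))
                return true;

        for (int nid = 0; nid < MAX_NODES; nid++) {
            if (cached_mask & (1u << nid))
                continue;  /* already covered by the quick pass */
            if ((online_mask & (1u << nid)) && node_reclaimable(nid))
                return true;
        }
        return false;
    }

    int main(void)
    {
        /* Node 6 is online but missing from the stale cached mask. */
        printf("reclaimable: %d\n", any_reclaimable(0xffu, 0x03u));
        return 0;
    }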
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
        excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
        /* If memsw_is_minimum==1, swap-out is of-no-use. */
-       if (root_mem->memsw_is_minimum)
+       if (!check_soft && root_mem->memsw_is_minimum)
                noswap = true;
 
        while (1) {
                victim = mem_cgroup_select_victim(root_mem);
                if (victim == root_mem) {
                        loop++;
-                       if (loop >= 1)
-                               drain_all_stock_async();
+                       /*
+                        * We do not drain per-cpu cached charges during
+                        * soft limit reclaim because global reclaim doesn't
+                        * care about charges. It tries to free some memory,
+                        * and returning charges would not free any.
+                        */
+                       if (!check_soft && loop >= 1)
+                               drain_all_stock_async(root_mem);
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                                }
                        }
                }
-               if (!mem_cgroup_local_usage(victim)) {
+               if (!mem_cgroup_reclaimable(victim, noswap)) {
                        /* this cgroup's local usage == 0 */
                        css_put(&victim->css);
                        continue;
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
        struct mem_cgroup *cached; /* this never be root cgroup */
        unsigned int nr_pages;
        struct work_struct work;
+       unsigned long flags;
+#define FLUSHING_CACHED_CHARGE (0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
        struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
        drain_stock(stock);
+       clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
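FLUSHING_CACHED_CHARGE pairs with the scheduling change further down: drain_all_stock_async() queues the per-cpu work only when it wins the test_and_set_bit(), and drain_local_stock() clears the bit once the stock is emptied, so the same CPU's drain is never queued twice. A rough userspace analogue of that set-before-queue / clear-after-run handshake, with C11 atomics standing in for the kernel bitops and purely illustrative names:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define FLUSHING 0x1u

    /* Hypothetical analogue of one per-cpu stock. */
    struct stock {
        atomic_uint flags;
        unsigned int nr_pages;   /* cached charges, illustrative only */
    };

    /* Queue side: only the caller that sets the flag schedules the work. */
    static bool try_queue_drain(struct stock *st)
    {
        unsigned int old = atomic_fetch_or(&st->flags, FLUSHING);
        return (old & FLUSHING) == 0;   /* true: we own the pending drain */
    }

    /* Worker side: drain, then allow the next queuing attempt. */
    static void drain_work(struct stock *st)
    {
        st->nr_pages = 0;                        /* give back cached charges */
        atomic_fetch_and(&st->flags, ~FLUSHING);
    }

    int main(void)
    {
        struct stock st = { 0, 32 };

        printf("first queue:  %d\n", try_queue_drain(&st));  /* 1 */
        printf("second queue: %d\n", try_queue_drain(&st));  /* 0: already pending */
        drain_work(&st);
        printf("after drain:  %d\n", try_queue_drain(&st));  /* 1 again */
        return 0;
    }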
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-       int cpu;
-       /* This function is for scheduling "drain" in asynchronous way.
-        * The result of "drain" is not directly handled by callers. Then,
-        * if someone is calling drain, we don't have to call drain more.
-        * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-        * there is a race. We just do loose check here.
+       int cpu, curcpu;
+       /*
+        * If someone is already draining, avoid adding more kworker runs.
         */
-       if (atomic_read(&memcg_drain_count))
+       if (!mutex_trylock(&percpu_charge_mutex))
                return;
        /* Notify other cpus that system-wide "drain" is running */
-       atomic_inc(&memcg_drain_count);
        get_online_cpus();
+       /*
+        * Get a hint for avoiding draining charges on the current cpu,
+        * which must be exhausted by our charging.  It is not required that
+        * this be a precise check, so we use raw_smp_processor_id() instead of
+        * getcpu()/putcpu().
+        */
+       curcpu = raw_smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-               schedule_work_on(cpu, &stock->work);
+               struct mem_cgroup *mem;
+
+               if (cpu == curcpu)
+                       continue;
+
+               mem = stock->cached;
+               if (!mem)
+                       continue;
+               if (mem != root_mem) {
+                       if (!root_mem->use_hierarchy)
+                               continue;
+                       /* check whether "mem" is under tree of "root_mem" */
+                       if (!css_is_ancestor(&mem->css, &root_mem->css))
+                               continue;
+               }
+               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+                       schedule_work_on(cpu, &stock->work);
        }
        put_online_cpus();
-       atomic_dec(&memcg_drain_count);
+       mutex_unlock(&percpu_charge_mutex);
        /* We don't wait for flush_work */
 }
 
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
        /* called when force_empty is called */
-       atomic_inc(&memcg_drain_count);
+       mutex_lock(&percpu_charge_mutex);
        schedule_on_each_cpu(drain_local_stock);
-       atomic_dec(&memcg_drain_count);
+       mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
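Replacing memcg_drain_count with percpu_charge_mutex gives the two drain paths different behaviour from one primitive: the async path trylocks and simply returns if another drain is already in flight (it never waits for the flush), while the force_empty path takes the mutex and drains synchronously. A small pthread sketch of that trylock-vs-lock split; the function names are invented for the example.

    #include <pthread.h>
    #include <stdio.h>

    /* Hypothetical stand-in for percpu_charge_mutex. */
    static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void drain_everything(void)
    {
        /* ... schedule or run the per-CPU drain work here ... */
    }

    /* Async flavour: best effort, never blocks behind another drainer. */
    static void drain_async(void)
    {
        if (pthread_mutex_trylock(&drain_mutex) != 0)
            return;                      /* somebody is already draining */
        drain_everything();
        pthread_mutex_unlock(&drain_mutex);
    }

    /* Sync flavour: the caller really needs the drain done, so it waits. */
    static void drain_sync(void)
    {
        pthread_mutex_lock(&drain_mutex);
        drain_everything();
        pthread_mutex_unlock(&drain_mutex);
    }

    int main(void)
    {
        drain_async();
        drain_sync();
        puts("done");
        return 0;
    }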
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
        {
                .name = "numa_stat",
                .open = mem_control_numa_stat_open,
+               .mode = S_IRUGO,
        },
 #endif
 };