mmc: card: Skip secure erase on MoviNAND; causes unrecoverable corruption.
[pandora-kernel.git] / mm / vmscan.c
index 347b3ff..8d01243 100644 (file)
@@ -133,7 +133,7 @@ long vm_total_pages;        /* The total number of pages which the VM controls */
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
        return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
        cond_resched();
 
+       mem_cgroup_uncharge_start();
        while (!list_empty(page_list)) {
                enum page_references references;
                struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
                if (PageWriteback(page)) {
-                       nr_writeback++;
-                       unlock_page(page);
-                       goto keep;
+                       /*
+                        * memcg doesn't have any dirty pages throttling so we
+                        * could easily OOM just because too many pages are in
+                        * writeback and there is nothing else to reclaim.
+                        *
+                        * Check __GFP_IO, certainly because a loop driver
+                        * thread might enter reclaim, and deadlock if it waits
+                        * on a page for which it is needed to do the write
+                        * (loop masks off __GFP_IO|__GFP_FS for this reason);
+                        * but more thought would probably show more reasons.
+                        *
+                        * Don't require __GFP_FS, since we're not going into
+                        * the FS, just waiting on its writeback completion.
+                        * Worryingly, ext4 gfs2 and xfs allocate pages with
+                        * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+                        * testing may_enter_fs here is liable to OOM on them.
+                        */
+                       if (global_reclaim(sc) ||
+                           !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+                               /*
+                                * This is slightly racy - end_page_writeback()
+                                * might have just cleared PageReclaim, then
+                                * setting PageReclaim here ends up interpreted
+                                * as PageReadahead - but that does not matter
+                                * enough to care.  What we do want is for this
+                                * page to have PageReclaim set next time memcg
+                                * reclaim reaches the tests above, so it will
+                                * then wait_on_page_writeback() to avoid OOM;
+                                * and it's also appropriate in global reclaim.
+                                */
+                               SetPageReclaim(page);
+                               nr_writeback++;
+                               goto keep_locked;
+                       }
+                       wait_on_page_writeback(page);
                }
 
                references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
 
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
+       mem_cgroup_uncharge_end();
        *ret_nr_dirty += nr_dirty;
        *ret_nr_writeback += nr_writeback;
        return nr_reclaimed;
@@ -2112,6 +2146,83 @@ out:
        return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+       struct zone *zone;
+       unsigned long pfmemalloc_reserve = 0;
+       unsigned long free_pages = 0;
+       int i;
+       bool wmark_ok;
+
+       for (i = 0; i <= ZONE_NORMAL; i++) {
+               zone = &pgdat->node_zones[i];
+               pfmemalloc_reserve += min_wmark_pages(zone);
+               free_pages += zone_page_state(zone, NR_FREE_PAGES);
+       }
+
+       wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+       /* kswapd must be awake if processes are being throttled */
+       if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+               pgdat->classzone_idx = min(pgdat->classzone_idx,
+                                               (enum zone_type)ZONE_NORMAL);
+               wake_up_interruptible(&pgdat->kswapd_wait);
+       }
+
+       return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+                                       nodemask_t *nodemask)
+{
+       struct zone *zone;
+       int high_zoneidx = gfp_zone(gfp_mask);
+       pg_data_t *pgdat;
+
+       /*
+        * Kernel threads should not be throttled as they may be indirectly
+        * responsible for cleaning pages necessary for reclaim to make forward
+        * progress. kjournald for example may enter direct reclaim while
+        * committing a transaction where throttling it could force other
+        * processes to block on log_wait_commit().
+        */
+       if (current->flags & PF_KTHREAD)
+               return;
+
+       /* Check if the pfmemalloc reserves are ok */
+       first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+       pgdat = zone->zone_pgdat;
+       if (pfmemalloc_watermark_ok(pgdat))
+               return;
+
+       /* Account for the throttling */
+       count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+       /*
+        * If the caller cannot enter the filesystem, it's possible that it
+        * is due to the caller holding an FS lock or performing a journal
+        * transaction in the case of a filesystem like ext[3|4]. In this case,
+        * it is not safe to block on pfmemalloc_wait as kswapd could be
+        * blocked waiting on the same lock. Instead, throttle for up to a
+        * second before continuing.
+        */
+       if (!(gfp_mask & __GFP_FS)) {
+               wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+                       pfmemalloc_watermark_ok(pgdat), HZ);
+               return;
+       }
+
+       /* Throttle until kswapd wakes the process */
+       wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+               pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2131,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                .gfp_mask = sc.gfp_mask,
        };
 
+       throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+       /*
+        * Do not enter reclaim if fatal signal is pending. 1 is returned so
+        * that the page allocator does not consider triggering OOM
+        */
+       if (fatal_signal_pending(current))
+               return 1;
+
        trace_mm_vmscan_direct_reclaim_begin(order,
                                sc.may_writepage,
                                gfp_mask);
@@ -2142,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        return nr_reclaimed;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                                                gfp_t gfp_mask, bool noswap,
@@ -2275,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
        return balanced_pages >= (present_pages >> 2);
 }
 
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                                        int classzone_idx)
 {
        int i;
@@ -2285,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 
        /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
        if (remaining)
-               return true;
+               return false;
+
+       /*
+        * There is a potential race between when kswapd checks its watermarks
+        * and a process gets throttled. There is also a potential race if
+        * processes get throttled, kswapd wakes, a large process exits thereby
+        * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+        * is going to sleep, no process should be sleeping on pfmemalloc_wait
+        * so wake them now if necessary. If necessary, processes will wake
+        * kswapd and get throttled again
+        */
+       if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+               wake_up(&pgdat->pfmemalloc_wait);
+               return false;
+       }
 
        /* Check the watermark levels */
        for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
         * must be balanced
         */
        if (order)
-               return !pgdat_balanced(pgdat, balanced, classzone_idx);
+               return pgdat_balanced(pgdat, balanced, classzone_idx);
        else
-               return !all_zones_ok;
+               return all_zones_ok;
 }
 
 /*
@@ -2546,6 +2685,16 @@ loop_again:
                        }
 
                }
+
+               /*
+                * If the low watermark is met there is no need for processes
+                * to be throttled on pfmemalloc_wait as they should now be
+                * able to safely make forward progress. Wake them
+                */
+               if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+                               pfmemalloc_watermark_ok(pgdat))
+                       wake_up(&pgdat->pfmemalloc_wait);
+
                if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
                        break;          /* kswapd: all done */
                /*
@@ -2647,7 +2796,7 @@ out:
        }
 
        /*
-        * Return the order we were reclaiming at so sleeping_prematurely()
+        * Return the order we were reclaiming at so prepare_kswapd_sleep()
         * makes a decision on the order we were last reclaiming at. However,
         * if another caller entered the allocator slow path while kswapd
         * was awake, order will remain at the higher level
@@ -2667,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
        /* Try to sleep for a short interval */
-       if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                remaining = schedule_timeout(HZ/10);
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
-       if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
                /*