Merge branch 'for-linus' of git://git.infradead.org/users/vkoul/slave-dma
[pandora-kernel.git] / mm / vmscan.c
index 32c661d..e01ded3 100644 (file)
@@ -11,6 +11,8 @@
  *  Multiqueue VM started 5.8.00, Rik van Riel.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/gfp.h>
@@ -43,6 +45,7 @@
 #include <linux/sysctl.h>
 #include <linux/oom.h>
 #include <linux/prefetch.h>
+#include <linux/printk.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -83,6 +86,9 @@ struct scan_control {
        /* Scan (total_size >> priority) pages at once */
        int priority;
 
+       /* anon vs. file LRUs scanning "ratio" */
+       int swappiness;
+
        /*
         * The memory cgroup that hit its limit and as a result is the
         * primary target of this reclaim invocation.
@@ -324,7 +330,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
        else
                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
 
-       trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+       trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
        return freed;
 }
 
@@ -477,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                if (page_has_private(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
-                               printk("%s: orphaned page\n", __func__);
+                               pr_info("%s: orphaned page\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
@@ -1121,7 +1127,7 @@ keep:
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
 
-       free_hot_cold_page_list(&free_pages, 1);
+       free_hot_cold_page_list(&free_pages, true);
 
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
@@ -1438,6 +1444,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
        list_splice(&pages_to_free, page_list);
 }
 
+/*
+ * A kernel thread servicing a backing device through the page cache (such
+ * as nfsd for loop-back mounts) sets PF_LESS_THROTTLE and should throttle
+ * only if that device is congested.  Returns non-zero when current may
+ * throttle: flag clear, no current->backing_dev_info, or bdi congested.
+ */
+static int current_may_throttle(void)
+{
+       return !(current->flags & PF_LESS_THROTTLE) ||
+               current->backing_dev_info == NULL ||
+               bdi_write_congested(current->backing_dev_info);
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
@@ -1519,7 +1538,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
        spin_unlock_irq(&zone->lru_lock);
 
-       free_hot_cold_page_list(&page_list, 1);
+       free_hot_cold_page_list(&page_list, true);
 
        /*
         * If reclaim is isolating dirty pages under writeback, it implies
@@ -1554,19 +1573,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                 * If dirty pages are scanned that are not queued for IO, it
                 * implies that flushers are not keeping up. In this case, flag
                 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
-                * pages from reclaim context. It will forcibly stall in the
-                * next check.
+                * pages from reclaim context.
                 */
                if (nr_unqueued_dirty == nr_taken)
                        zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
 
                /*
-                * In addition, if kswapd scans pages marked marked for
-                * immediate reclaim and under writeback (nr_immediate), it
-                * implies that pages are cycling through the LRU faster than
+                * If kswapd scans pages marked for immediate
+                * reclaim and under writeback (nr_immediate), it implies
+                * that pages are cycling through the LRU faster than
                 * they are written so also forcibly stall.
                 */
-               if (nr_unqueued_dirty == nr_taken || nr_immediate)
+               if (nr_immediate && current_may_throttle())
                        congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
 
@@ -1575,7 +1593,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         * is congested. Allow kswapd to continue until it starts encountering
         * unqueued dirty pages or cycling through the LRU too quickly.
         */
-       if (!sc->hibernation_mode && !current_is_kswapd())
+       if (!sc->hibernation_mode && !current_is_kswapd() &&
+           current_may_throttle())
                wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1740,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&zone->lru_lock);
 
-       free_hot_cold_page_list(&l_hold, 1);
+       free_hot_cold_page_list(&l_hold, true);
 }
 
 #ifdef CONFIG_SWAP
@@ -1830,13 +1849,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
        return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
 }
 
-static int vmscan_swappiness(struct scan_control *sc)
-{
-       if (global_reclaim(sc))
-               return vm_swappiness;
-       return mem_cgroup_swappiness(sc->target_mem_cgroup);
-}
-
 enum scan_balance {
        SCAN_EQUAL,
        SCAN_FRACT,
@@ -1866,6 +1878,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        bool force_scan = false;
        unsigned long ap, fp;
        enum lru_list lru;
+       bool some_scanned;
+       int pass;
 
        /*
         * If the zone or memcg is small, nr[l] can be 0.  This
@@ -1895,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * using the memory controller's swap limit feature would be
         * too expensive.
         */
-       if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+       if (!global_reclaim(sc) && !sc->swappiness) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -1905,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * system is close to OOM, scan both anon and file equally
         * (unless the swappiness setting disagrees with swapping).
         */
-       if (!sc->priority && vmscan_swappiness(sc)) {
+       if (!sc->priority && sc->swappiness) {
                scan_balance = SCAN_EQUAL;
                goto out;
        }
@@ -1948,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * With swappiness at 100, anonymous and file have the same priority.
         * This scanning priority is essentially the inverse of IO cost.
         */
-       anon_prio = vmscan_swappiness(sc);
+       anon_prio = sc->swappiness;
        file_prio = 200 - anon_prio;
 
        /*
@@ -1989,39 +2003,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        fraction[1] = fp;
        denominator = ap + fp + 1;
 out:
-       for_each_evictable_lru(lru) {
-               int file = is_file_lru(lru);
-               unsigned long size;
-               unsigned long scan;
+       some_scanned = false;
+       /* Only use force_scan on second pass. */
+       for (pass = 0; !some_scanned && pass < 2; pass++) {
+               for_each_evictable_lru(lru) {
+                       int file = is_file_lru(lru);
+                       unsigned long size;
+                       unsigned long scan;
 
-               size = get_lru_size(lruvec, lru);
-               scan = size >> sc->priority;
+                       size = get_lru_size(lruvec, lru);
+                       scan = size >> sc->priority;
 
-               if (!scan && force_scan)
-                       scan = min(size, SWAP_CLUSTER_MAX);
+                       if (!scan && pass && force_scan)
+                               scan = min(size, SWAP_CLUSTER_MAX);
 
-               switch (scan_balance) {
-               case SCAN_EQUAL:
-                       /* Scan lists relative to size */
-                       break;
-               case SCAN_FRACT:
+                       switch (scan_balance) {
+                       case SCAN_EQUAL:
+                               /* Scan lists relative to size */
+                               break;
+                       case SCAN_FRACT:
+                               /*
+                                * Scan types proportional to swappiness and
+                                * their relative recent reclaim efficiency.
+                                */
+                               scan = div64_u64(scan * fraction[file],
+                                                       denominator);
+                               break;
+                       case SCAN_FILE:
+                       case SCAN_ANON:
+                               /* Scan one type exclusively */
+                               if ((scan_balance == SCAN_FILE) != file)
+                                       scan = 0;
+                               break;
+                       default:
+                               /* Look ma, no brain */
+                               BUG();
+                       }
+                       nr[lru] = scan;
                        /*
-                        * Scan types proportional to swappiness and
-                        * their relative recent reclaim efficiency.
+                        * Skip the second pass and don't force_scan,
+                        * if we found something to scan.
                         */
-                       scan = div64_u64(scan * fraction[file], denominator);
-                       break;
-               case SCAN_FILE:
-               case SCAN_ANON:
-                       /* Scan one type exclusively */
-                       if ((scan_balance == SCAN_FILE) != file)
-                               scan = 0;
-                       break;
-               default:
-                       /* Look ma, no brain */
-                       BUG();
+                       some_scanned |= !!scan;
                }
-               nr[lru] = scan;
        }
 }
 
@@ -2037,13 +2061,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
        unsigned long nr_reclaimed = 0;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        struct blk_plug plug;
-       bool scan_adjusted = false;
+       bool scan_adjusted;
 
        get_scan_count(lruvec, sc, nr);
 
        /* Record the original scan target for proportional adjustments later */
        memcpy(targets, nr, sizeof(nr));
 
+       /*
+        * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+        * event that can occur when there is little memory pressure e.g.
+        * multiple streaming readers/writers. Hence, we do not abort scanning
+        * when the requested number of pages are reclaimed when scanning at
+        * DEF_PRIORITY on the assumption that the fact we are direct
+        * reclaiming implies that kswapd is not keeping up and it is best to
+        * do a batch of work at once. For memcg reclaim one check is made to
+        * abort proportional reclaim if either the file or anon lru has already
+        * dropped to zero at the first pass.
+        */
+       scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+                        sc->priority == DEF_PRIORITY);
+
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
@@ -2063,18 +2101,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
                if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
                        continue;
 
-               /*
-                * For global direct reclaim, reclaim only the number of pages
-                * requested. Less care is taken to scan proportionally as it
-                * is more important to minimise direct reclaim stall latency
-                * than it is to properly age the LRU lists.
-                */
-               if (global_reclaim(sc) && !current_is_kswapd())
-                       break;
-
                /*
                 * For kswapd and memcg, reclaim at least the number of pages
-                * requested. Ensure that the anon and file LRUs shrink
+                * requested. Ensure that the anon and file LRUs are scanned
                 * proportionally what was requested by get_scan_count(). We
                 * stop reclaiming one LRU and reduce the amount scanning
                 * proportional to the original scan target.
@@ -2082,6 +2111,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
                nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
                nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
 
+               /*
+                * It's just vindictive to attack the larger once the smaller
+                * has gone to zero.  And given the way we stop scanning the
+                * smaller below, this makes sure that we only make one nudge
+                * towards proportionality once we've got nr_to_reclaim.
+                */
+               if (!nr_file || !nr_anon)
+                       break;
+
                if (nr_file > nr_anon) {
                        unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
                                                targets[LRU_ACTIVE_ANON] + 1;
@@ -2224,6 +2262,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 
                        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
+                       sc->swappiness = mem_cgroup_swappiness(memcg);
                        shrink_lruvec(lruvec, sc);
 
                        /*
@@ -2268,9 +2307,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
         * there is a buffer of free pages available to give compaction
         * a reasonable chance of completing and allocating the page
         */
-       balance_gap = min(low_wmark_pages(zone),
-               (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-                       KSWAPD_ZONE_BALANCE_GAP_RATIO);
+       balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+                       zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
        watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
        watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
 
@@ -2525,10 +2563,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
+               if (!populated_zone(zone))
+                       continue;
+
                pfmemalloc_reserve += min_wmark_pages(zone);
                free_pages += zone_page_state(zone, NR_FREE_PAGES);
        }
 
+       /* If there are no reserves (unexpected config) then do not throttle */
+       if (!pfmemalloc_reserve)
+               return true;
+
        wmark_ok = free_pages > pfmemalloc_reserve / 2;
 
        /* kswapd must be awake if processes are being throttled */
@@ -2553,9 +2598,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                        nodemask_t *nodemask)
 {
+       struct zoneref *z;
        struct zone *zone;
-       int high_zoneidx = gfp_zone(gfp_mask);
-       pg_data_t *pgdat;
+       pg_data_t *pgdat = NULL;
 
        /*
         * Kernel threads should not be throttled as they may be indirectly
@@ -2574,10 +2619,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
        if (fatal_signal_pending(current))
                goto out;
 
-       /* Check if the pfmemalloc reserves are ok */
-       first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
-       pgdat = zone->zone_pgdat;
-       if (pfmemalloc_watermark_ok(pgdat))
+       /*
+        * Check if the pfmemalloc reserves are ok by finding the first node
+        * with a usable ZONE_NORMAL or lower zone. The expectation is that
+        * GFP_KERNEL will be required for allocating network buffers when
+        * swapping over the network so ZONE_HIGHMEM is unusable.
+        *
+        * Throttling is based on the first usable node and throttled processes
+        * wait on a queue until kswapd makes progress and wakes them. There
+        * is an affinity then between processes waking up and where reclaim
+        * progress has been made assuming the process wakes on the same node.
+        * More importantly, processes running on remote nodes will not compete
+        * for remote pfmemalloc reserves and processes on different nodes
+        * should make reasonable progress.
+        */
+       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                       gfp_mask, nodemask) {
+               if (zone_idx(zone) > ZONE_NORMAL)
+                       continue;
+
+               /* Throttle based on the first usable node */
+               pgdat = zone->zone_pgdat;
+               if (pfmemalloc_watermark_ok(pgdat))
+                       goto out;
+               break;
+       }
+
+       /* If no zone was usable by the allocation flags then do not throttle */
+       if (!pgdat)
                goto out;
 
        /* Account for the throttling */
@@ -2660,6 +2729,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                .may_swap = !noswap,
                .order = 0,
                .priority = 0,
+               .swappiness = mem_cgroup_swappiness(memcg),
                .target_mem_cgroup = memcg,
        };
        struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
@@ -2891,9 +2961,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
         * high wmark plus a "gap" where the gap is either the low
         * watermark or 1% of the zone, whichever is smaller.
         */
-       balance_gap = min(low_wmark_pages(zone),
-               (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-               KSWAPD_ZONE_BALANCE_GAP_RATIO);
+       balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+                       zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
 
        /*
         * If there is no low memory pressure or the zone is balanced then no
@@ -3302,7 +3371,10 @@ static int kswapd(void *p)
                }
        }
 
+       tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
        current->reclaim_state = NULL;
+       lockdep_clear_current_reclaim_state();
+
        return 0;
 }
 
@@ -3422,7 +3494,7 @@ int kswapd_run(int nid)
 
 /*
  * Called by memory hotplug when all memory in a node is offlined.  Caller must
- * hold lock_memory_hotplug().
+ * hold mem_hotplug_begin/end().
  */
 void kswapd_stop(int nid)
 {