mm: vmscan: throttle reclaim if encountering too many dirty pages under writeback
[pandora-kernel.git] / mm / vmscan.c
index b55699c..7b0573f 100644 (file)
@@ -495,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                        return PAGE_ACTIVATE;
                }
 
-               /*
-                * Wait on writeback if requested to. This happens when
-                * direct reclaiming a large contiguous area and the
-                * first attempt to free a range of pages fails.
-                */
-               if (PageWriteback(page) &&
-                   (sc->reclaim_mode & RECLAIM_MODE_SYNC))
-                       wait_on_page_writeback(page);
-
                if (!PageWriteback(page)) {
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
@@ -759,7 +750,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
                                      struct zone *zone,
-                                     struct scan_control *sc)
+                                     struct scan_control *sc,
+                                     int priority,
+                                     unsigned long *ret_nr_dirty,
+                                     unsigned long *ret_nr_writeback)
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
@@ -767,6 +761,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
        unsigned long nr_dirty = 0;
        unsigned long nr_congested = 0;
        unsigned long nr_reclaimed = 0;
+       unsigned long nr_writeback = 0;
 
        cond_resched();
 
@@ -803,13 +798,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
                if (PageWriteback(page)) {
+                       nr_writeback++;
                        /*
-                        * Synchronous reclaim is performed in two passes,
-                        * first an asynchronous pass over the list to
-                        * start parallel writeback, and a second synchronous
-                        * pass to wait for the IO to complete.  Wait here
-                        * for any page for which writeback has already
-                        * started.
+                        * Synchronous reclaim cannot queue pages for
+                        * writeback due to the possibility of stack overflow
+                        * but if it encounters a page under writeback, wait
+                        * for the IO to complete.
                         */
                        if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
                            may_enter_fs)
@@ -865,6 +859,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (PageDirty(page)) {
                        nr_dirty++;
 
+                       /*
+                        * Only kswapd can writeback filesystem pages to
+                        * avoid risk of stack overflow but do not writeback
+                        * unless under significant pressure.
+                        */
+                       if (page_is_file_cache(page) &&
+                                       (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+                               inc_zone_page_state(page, NR_VMSCAN_WRITE_SKIP);
+                               goto keep_locked;
+                       }
+
                        if (references == PAGEREF_RECLAIM_CLEAN)
                                goto keep_locked;
                        if (!may_enter_fs)
@@ -999,6 +1004,8 @@ keep_lumpy:
 
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
+       *ret_nr_dirty += nr_dirty;
+       *ret_nr_writeback += nr_writeback;
        return nr_reclaimed;
 }
 
@@ -1012,23 +1019,27 @@ keep_lumpy:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 {
+       bool all_lru_mode;
        int ret = -EINVAL;
 
        /* Only take pages on the LRU. */
        if (!PageLRU(page))
                return ret;
 
+       all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+               (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
        /*
         * When checking the active state, we need to be sure we are
         * dealing with comparible boolean values.  Take the logical not
         * of each.
         */
-       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+       if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
                return ret;
 
-       if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+       if (!all_lru_mode && !!page_is_file_cache(page) != file)
                return ret;
 
        /*
@@ -1041,6 +1052,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 
        ret = -EBUSY;
 
+       if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
+               return ret;
+
+       if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+               return ret;
+
        if (likely(get_page_unless_zero(page))) {
                /*
                 * Be careful not to clear PageLRU until after we're
@@ -1076,7 +1093,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                struct list_head *src, struct list_head *dst,
-               unsigned long *scanned, int order, int mode, int file)
+               unsigned long *scanned, int order, isolate_mode_t mode,
+               int file)
 {
        unsigned long nr_taken = 0;
        unsigned long nr_lumpy_taken = 0;
@@ -1201,8 +1219,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 static unsigned long isolate_pages_global(unsigned long nr,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
-                                       int mode, struct zone *z,
-                                       int active, int file)
+                                       isolate_mode_t mode,
+                                       struct zone *z, int active, int file)
 {
        int lru = LRU_BASE;
        if (active)
@@ -1394,7 +1412,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
 }
 
 /*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
  *
  * If we are direct reclaiming for contiguous pages and we do not reclaim
  * everything in the list, try again and wait for writeback IO to complete.
@@ -1416,7 +1434,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
        if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
                return false;
 
-       /* If we have relaimed everything on the isolated list, no stall */
+       /* If we have reclaimed everything on the isolated list, no stall */
        if (nr_freed == nr_taken)
                return false;
 
@@ -1448,6 +1466,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        unsigned long nr_taken;
        unsigned long nr_anon;
        unsigned long nr_file;
+       unsigned long nr_dirty = 0;
+       unsigned long nr_writeback = 0;
+       isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
 
        while (unlikely(too_many_isolated(zone, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1458,15 +1479,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        }
 
        set_reclaim_mode(priority, sc, false);
+       if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+               reclaim_mode |= ISOLATE_ACTIVE;
+
        lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
        spin_lock_irq(&zone->lru_lock);
 
        if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, 0, file);
+               nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1475,12 +1502,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                        __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                               nr_scanned);
        } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, sc->mem_cgroup,
-                       0, file);
+               nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone,
+                       sc->mem_cgroup, 0, file);
                /*
                 * mem_cgroup_isolate_pages() keeps track of
                 * scanned pages on its own.
@@ -1496,12 +1520,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
        spin_unlock_irq(&zone->lru_lock);
 
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+       nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+                                               &nr_dirty, &nr_writeback);
 
        /* Check if we should syncronously wait for writeback */
        if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                set_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+               nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+                                       priority, &nr_dirty, &nr_writeback);
        }
 
        local_irq_disable();
@@ -1511,6 +1537,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
        putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
 
+       /*
+        * If reclaim is isolating dirty pages under writeback, it implies
+        * that the long-lived page allocation rate is exceeding the page
+        * laundering rate. Either the global limits are not being effective
+        * at throttling processes due to the page distribution throughout
+        * zones or there is heavy usage of a slow backing device. The
+        * only option is to throttle from reclaim context which is not ideal
+        * as there is no guarantee the dirtying process is throttled in the
+        * same way balance_dirty_pages() manages.
+        *
+        * This scales the number of dirty pages that must be under writeback
+        * before throttling depending on priority. It is a simple backoff
+        * function that has the most effect in the range DEF_PRIORITY to
+        * DEF_PRIORITY-2, which is the priority at which reclaim is
+        * considered to be in trouble.
+        *
+        * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
+        * DEF_PRIORITY-1  50% must be PageWriteback
+        * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
+        * ...
+        * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+        *                     isolated page is PageWriteback
+        */
+       if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+               wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
                zone_idx(zone),
                nr_scanned, nr_reclaimed,
@@ -1582,19 +1634,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        struct page *page;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        unsigned long nr_rotated = 0;
+       isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
 
        lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
        spin_lock_irq(&zone->lru_lock);
        if (scanning_global_lru(sc)) {
                nr_taken = isolate_pages_global(nr_pages, &l_hold,
                                                &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                1, file);
                zone->pages_scanned += pgscanned;
        } else {
                nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
                                                &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                sc->mem_cgroup, 1, file);
                /*
                 * mem_cgroup_isolate_pages() keeps track of
@@ -1795,12 +1854,19 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
        enum lru_list l;
        int noswap = 0;
        bool force_scan = false;
-       unsigned long nr_force_scan[2];
 
-       /* kswapd does zone balancing and needs to scan this zone */
+       /*
+        * If the zone or memcg is small, nr[l] can be 0.  This
+        * results in no scanning on this priority and a potential
+        * priority drop.  Global direct reclaim can go to the next
+        * zone and tends to have no problems. Global kswapd is for
+        * zone balancing and it needs to scan a minimum amount. When
+        * reclaiming for a memcg, a priority drop can cause high
+        * latencies, so it's better to scan a minimum amount there as
+        * well.
+        */
        if (scanning_global_lru(sc) && current_is_kswapd())
                force_scan = true;
-       /* memcg may have small limit and need to avoid priority drop */
        if (!scanning_global_lru(sc))
                force_scan = true;
 
@@ -1810,8 +1876,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                fraction[0] = 0;
                fraction[1] = 1;
                denominator = 1;
-               nr_force_scan[0] = 0;
-               nr_force_scan[1] = SWAP_CLUSTER_MAX;
                goto out;
        }
 
@@ -1828,8 +1892,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                        fraction[0] = 1;
                        fraction[1] = 0;
                        denominator = 1;
-                       nr_force_scan[0] = SWAP_CLUSTER_MAX;
-                       nr_force_scan[1] = 0;
                        goto out;
                }
        }
@@ -1878,11 +1940,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
        fraction[0] = ap;
        fraction[1] = fp;
        denominator = ap + fp + 1;
-       if (force_scan) {
-               unsigned long scan = SWAP_CLUSTER_MAX;
-               nr_force_scan[0] = div64_u64(scan * ap, denominator);
-               nr_force_scan[1] = div64_u64(scan * fp, denominator);
-       }
 out:
        for_each_evictable_lru(l) {
                int file = is_file_lru(l);
@@ -1891,20 +1948,10 @@ out:
                scan = zone_nr_lru_pages(zone, sc, l);
                if (priority || noswap) {
                        scan >>= priority;
+                       if (!scan && force_scan)
+                               scan = SWAP_CLUSTER_MAX;
                        scan = div64_u64(scan * fraction[file], denominator);
                }
-
-               /*
-                * If zone is small or memcg is small, nr[l] can be 0.
-                * This results no-scan on this priority and priority drop down.
-                * For global direct reclaim, it can visit next zone and tend
-                * not to have problems. For global kswapd, it's for zone
-                * balancing and it need to scan a small amounts. When using
-                * memcg, priority drop can cause big latency. So, it's better
-                * to scan small amount. See may_noscan above.
-                */
-               if (!scan && force_scan)
-                       scan = nr_force_scan[file];
                nr[l] = scan;
        }
 }
@@ -1983,12 +2030,14 @@ static void shrink_zone(int priority, struct zone *zone,
        enum lru_list l;
        unsigned long nr_reclaimed, nr_scanned;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       struct blk_plug plug;
 
 restart:
        nr_reclaimed = 0;
        nr_scanned = sc->nr_scanned;
        get_scan_count(zone, sc, nr, priority);
 
+       blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
                for_each_evictable_lru(l) {
@@ -2012,6 +2061,7 @@ restart:
                if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                        break;
        }
+       blk_finish_plug(&plug);
        sc->nr_reclaimed += nr_reclaimed;
 
        /*