Merge branch 'msm-fix' of git://codeaurora.org/quic/kernel/davidb/linux-msm into...

[pandora-kernel.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index f54a05b..2880396 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -103,8 +103,11 @@ struct scan_control {
          */
         reclaim_mode_t reclaim_mode;
  
-       /* Which cgroup do we reclaim from */
-       struct mem_cgroup *mem_cgroup;
+       /*
+        * The memory cgroup that hit its limit and as a result is the
+        * primary target of this reclaim invocation.
+        */
+       struct mem_cgroup *target_mem_cgroup;
  
         /*
          * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -113,6 +116,11 @@ struct scan_control {
         nodemask_t      *nodemask;
  };
  
+struct mem_cgroup_zone {
+       struct mem_cgroup *mem_cgroup;
+       struct zone *zone;
+};
+
  #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  
  #ifdef ARCH_HAS_PREFETCH
@@ -153,28 +161,45 @@ static LIST_HEAD(shrinker_list);
  static DECLARE_RWSEM(shrinker_rwsem);
  
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
-#define scanning_global_lru(sc)        (!(sc)->mem_cgroup)
+static bool global_reclaim(struct scan_control *sc)
+{
+       return !sc->target_mem_cgroup;
+}
+
+static bool scanning_global_lru(struct mem_cgroup_zone *mz)
+{
+       return !mz->mem_cgroup;
+}
  #else
-#define scanning_global_lru(sc)        (1)
+static bool global_reclaim(struct scan_control *sc)
+{
+       return true;
+}
+
+static bool scanning_global_lru(struct mem_cgroup_zone *mz)
+{
+       return true;
+}
  #endif
  
-static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
-                                                 struct scan_control *sc)
+static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
  {
-       if (!scanning_global_lru(sc))
-               return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
  
-       return &zone->reclaim_stat;
+       return &mz->zone->reclaim_stat;
  }
  
-static unsigned long zone_nr_lru_pages(struct zone *zone,
-                               struct scan_control *sc, enum lru_list lru)
+static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
+                                      enum lru_list lru)
  {
-       if (!scanning_global_lru(sc))
-               return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
-                               zone_to_nid(zone), zone_idx(zone), BIT(lru));
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
+                                                   zone_to_nid(mz->zone),
+                                                   zone_idx(mz->zone),
+                                                   BIT(lru));
  
-       return zone_page_state(zone, NR_LRU_BASE + lru);
+       return zone_page_state(mz->zone, NR_LRU_BASE + lru);
  }
  
  
@@ -677,12 +702,13 @@ enum page_references {
  };
  
  static enum page_references page_check_references(struct page *page,
+                                                 struct mem_cgroup_zone *mz,
                                                   struct scan_control *sc)
  {
         int referenced_ptes, referenced_page;
         unsigned long vm_flags;
  
-       referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
+       referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
         referenced_page = TestClearPageReferenced(page);
  
         /* Lumpy reclaim - ignore references */
@@ -715,7 +741,13 @@ static enum page_references page_check_references(struct page *page,
                  */
                 SetPageReferenced(page);
  
-               if (referenced_page)
+               if (referenced_page || referenced_ptes > 1)
+                       return PAGEREF_ACTIVATE;
+
+               /*
+                * Activate file-backed executable pages after first usage.
+                */
+               if (vm_flags & VM_EXEC)
                         return PAGEREF_ACTIVATE;
  
                 return PAGEREF_KEEP;
@@ -728,29 +760,11 @@ static enum page_references page_check_references(struct page *page,
         return PAGEREF_RECLAIM;
  }
  
-static noinline_for_stack void free_page_list(struct list_head *free_pages)
-{
-       struct pagevec freed_pvec;
-       struct page *page, *tmp;
-
-       pagevec_init(&freed_pvec, 1);
-
-       list_for_each_entry_safe(page, tmp, free_pages, lru) {
-               list_del(&page->lru);
-               if (!pagevec_add(&freed_pvec, page)) {
-                       __pagevec_free(&freed_pvec);
-                       pagevec_reinit(&freed_pvec);
-               }
-       }
-
-       pagevec_free(&freed_pvec);
-}
-
  /*
   * shrink_page_list() returns the number of reclaimed pages
   */
  static unsigned long shrink_page_list(struct list_head *page_list,
-                                     struct zone *zone,
+                                     struct mem_cgroup_zone *mz,
                                       struct scan_control *sc,
                                       int priority,
                                       unsigned long *ret_nr_dirty,
@@ -781,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         goto keep;
  
                 VM_BUG_ON(PageActive(page));
-               VM_BUG_ON(page_zone(page) != zone);
+               VM_BUG_ON(page_zone(page) != mz->zone);
  
                 sc->nr_scanned++;
  
@@ -815,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         }
                 }
  
-               references = page_check_references(page, sc);
+               references = page_check_references(page, mz, sc);
                 switch (references) {
                 case PAGEREF_ACTIVATE:
                         goto activate_locked;
@@ -1006,10 +1020,10 @@ keep_lumpy:
          * back off and wait for congestion to clear because further reclaim
          * will encounter the same problem
          */
-       if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
-               zone_set_flag(zone, ZONE_CONGESTED);
+       if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
+               zone_set_flag(mz->zone, ZONE_CONGESTED);
  
-       free_page_list(&free_pages);
+       free_hot_cold_page_list(&free_pages, 1);
  
         list_splice(&ret_pages, page_list);
         count_vm_events(PGACTIVATE, pgactivate);
@@ -1061,8 +1075,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
  
         ret = -EBUSY;
  
-       if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
-               return ret;
+       /*
+        * To minimise LRU disruption, the caller can indicate that it only
+        * wants to isolate pages it will be able to operate on without
+        * blocking - clean pages for the most part.
+        *
+        * ISOLATE_CLEAN means that only clean pages should be isolated. This
+        * is used by reclaim when it cannot write to backing storage
+        *
+        * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
+        * that it is possible to migrate without blocking
+        */
+       if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+               /* All the caller can do on PageWriteback is block */
+               if (PageWriteback(page))
+                       return ret;
+
+               if (PageDirty(page)) {
+                       struct address_space *mapping;
+
+                       /* ISOLATE_CLEAN means only clean pages */
+                       if (mode & ISOLATE_CLEAN)
+                               return ret;
+
+                       /*
+                        * Only pages without mappings or that have a
+                        * ->migratepage callback are possible to migrate
+                        * without blocking
+                        */
+                       mapping = page_mapping(page);
+                       if (mapping && !mapping->a_ops->migratepage)
+                               return ret;
+               }
+       }
  
         if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
                 return ret;
@@ -1091,25 +1136,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
   * Appropriate locks must be held before calling this function.
   *
   * @nr_to_scan:        The number of pages to look through on the list.
- * @src:       The LRU list to pull pages off.
+ * @mz:                The mem_cgroup_zone to pull pages from.
   * @dst:       The temp list to put pages on to.
- * @scanned:   The number of pages that were scanned.
+ * @nr_scanned:        The number of pages that were scanned.
   * @order:     The caller's attempted allocation order
   * @mode:      One of the LRU isolation modes
+ * @active:    True [1] if isolating active pages
   * @file:      True [1] if isolating file [!anon] pages
   *
   * returns how many pages were moved onto *@dst.
   */
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
-               struct list_head *src, struct list_head *dst,
-               unsigned long *scanned, int order, isolate_mode_t mode,
-               int file)
+               struct mem_cgroup_zone *mz, struct list_head *dst,
+               unsigned long *nr_scanned, int order, isolate_mode_t mode,
+               int active, int file)
  {
+       struct lruvec *lruvec;
+       struct list_head *src;
         unsigned long nr_taken = 0;
         unsigned long nr_lumpy_taken = 0;
         unsigned long nr_lumpy_dirty = 0;
         unsigned long nr_lumpy_failed = 0;
         unsigned long scan;
+       int lru = LRU_BASE;
+
+       lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
+       if (active)
+               lru += LRU_ACTIVE;
+       if (file)
+               lru += LRU_FILE;
+       src = &lruvec->lists[lru];
  
         for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
                 struct page *page;
@@ -1125,15 +1181,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  
                 switch (__isolate_lru_page(page, mode, file)) {
                 case 0:
+                       mem_cgroup_lru_del(page);
                         list_move(&page->lru, dst);
-                       mem_cgroup_del_lru(page);
                         nr_taken += hpage_nr_pages(page);
                         break;
  
                 case -EBUSY:
                         /* else it is being freed elsewhere */
                         list_move(&page->lru, src);
-                       mem_cgroup_rotate_lru_list(page, page_lru(page));
                         continue;
  
                 default:
@@ -1178,18 +1233,22 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                          * anon page which don't already have a swap slot is
                          * pointless.
                          */
-                       if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+                       if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
                             !PageSwapCache(cursor_page))
                                 break;
  
                         if (__isolate_lru_page(cursor_page, mode, file) == 0) {
+                               unsigned int isolated_pages;
+
+                               mem_cgroup_lru_del(cursor_page);
                                 list_move(&cursor_page->lru, dst);
-                               mem_cgroup_del_lru(cursor_page);
-                               nr_taken += hpage_nr_pages(page);
-                               nr_lumpy_taken++;
+                               isolated_pages = hpage_nr_pages(cursor_page);
+                               nr_taken += isolated_pages;
+                               nr_lumpy_taken += isolated_pages;
                                 if (PageDirty(cursor_page))
-                                       nr_lumpy_dirty++;
+                                       nr_lumpy_dirty += isolated_pages;
                                 scan++;
+                               pfn += isolated_pages - 1;
                         } else {
                                 /*
                                  * Check if the page is freed already.
@@ -1215,57 +1274,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                         nr_lumpy_failed++;
         }
  
-       *scanned = scan;
+       *nr_scanned = scan;
  
         trace_mm_vmscan_lru_isolate(order,
                         nr_to_scan, scan,
                         nr_taken,
                         nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
-                       mode);
+                       mode, file);
         return nr_taken;
  }
  
-static unsigned long isolate_pages_global(unsigned long nr,
-                                       struct list_head *dst,
-                                       unsigned long *scanned, int order,
-                                       isolate_mode_t mode,
-                                       struct zone *z, int active, int file)
-{
-       int lru = LRU_BASE;
-       if (active)
-               lru += LRU_ACTIVE;
-       if (file)
-               lru += LRU_FILE;
-       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
-                                                               mode, file);
-}
-
-/*
- * clear_active_flags() is a helper for shrink_active_list(), clearing
- * any active bits from the pages in the list.
- */
-static unsigned long clear_active_flags(struct list_head *page_list,
-                                       unsigned int *count)
-{
-       int nr_active = 0;
-       int lru;
-       struct page *page;
-
-       list_for_each_entry(page, page_list, lru) {
-               int numpages = hpage_nr_pages(page);
-               lru = page_lru_base_type(page);
-               if (PageActive(page)) {
-                       lru += LRU_ACTIVE;
-                       ClearPageActive(page);
-                       nr_active += numpages;
-               }
-               if (count)
-                       count[lru] += numpages;
-       }
-
-       return nr_active;
-}
-
  /**
   * isolate_lru_page - tries to isolate a page from its LRU list
   * @page: page to isolate from its LRU list
@@ -1325,7 +1343,7 @@ static int too_many_isolated(struct zone *zone, int file,
         if (current_is_kswapd())
                 return 0;
  
-       if (!scanning_global_lru(sc))
+       if (!global_reclaim(sc))
                 return 0;
  
         if (file) {
@@ -1339,27 +1357,21 @@ static int too_many_isolated(struct zone *zone, int file,
         return isolated > inactive;
  }
  
-/*
- * TODO: Try merging with migrations version of putback_lru_pages
- */
  static noinline_for_stack void
-putback_lru_pages(struct zone *zone, struct scan_control *sc,
-                               unsigned long nr_anon, unsigned long nr_file,
-                               struct list_head *page_list)
+putback_inactive_pages(struct mem_cgroup_zone *mz,
+                      struct list_head *page_list)
  {
-       struct page *page;
-       struct pagevec pvec;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
-
-       pagevec_init(&pvec, 1);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+       struct zone *zone = mz->zone;
+       LIST_HEAD(pages_to_free);
  
         /*
          * Put back any unfreeable pages.
          */
-       spin_lock(&zone->lru_lock);
         while (!list_empty(page_list)) {
+               struct page *page = lru_to_page(page_list);
                 int lru;
-               page = lru_to_page(page_list);
+
                 VM_BUG_ON(PageLRU(page));
                 list_del(&page->lru);
                 if (unlikely(!page_evictable(page, NULL))) {
@@ -1376,30 +1388,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
                         int numpages = hpage_nr_pages(page);
                         reclaim_stat->recent_rotated[file] += numpages;
                 }
-               if (!pagevec_add(&pvec, page)) {
-                       spin_unlock_irq(&zone->lru_lock);
-                       __pagevec_release(&pvec);
-                       spin_lock_irq(&zone->lru_lock);
+               if (put_page_testzero(page)) {
+                       __ClearPageLRU(page);
+                       __ClearPageActive(page);
+                       del_page_from_lru_list(zone, page, lru);
+
+                       if (unlikely(PageCompound(page))) {
+                               spin_unlock_irq(&zone->lru_lock);
+                               (*get_compound_page_dtor(page))(page);
+                               spin_lock_irq(&zone->lru_lock);
+                       } else
+                               list_add(&page->lru, &pages_to_free);
                 }
         }
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
-       __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
  
-       spin_unlock_irq(&zone->lru_lock);
-       pagevec_release(&pvec);
+       /*
+        * To save our caller's stack, now use input list for pages to free.
+        */
+       list_splice(&pages_to_free, page_list);
  }
  
-static noinline_for_stack void update_isolated_counts(struct zone *zone,
-                                       struct scan_control *sc,
-                                       unsigned long *nr_anon,
-                                       unsigned long *nr_file,
-                                       struct list_head *isolated_list)
+static noinline_for_stack void
+update_isolated_counts(struct mem_cgroup_zone *mz,
+                      struct list_head *page_list,
+                      unsigned long *nr_anon,
+                      unsigned long *nr_file)
  {
-       unsigned long nr_active;
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+       struct zone *zone = mz->zone;
         unsigned int count[NR_LRU_LISTS] = { 0, };
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       unsigned long nr_active = 0;
+       struct page *page;
+       int lru;
+
+       /*
+        * Count pages and clear active flags
+        */
+       list_for_each_entry(page, page_list, lru) {
+               int numpages = hpage_nr_pages(page);
+               lru = page_lru_base_type(page);
+               if (PageActive(page)) {
+                       lru += LRU_ACTIVE;
+                       ClearPageActive(page);
+                       nr_active += numpages;
+               }
+               count[lru] += numpages;
+       }
  
-       nr_active = clear_active_flags(isolated_list, count);
         __count_vm_events(PGDEACTIVATE, nr_active);
  
         __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1413,8 +1448,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
  
         *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
         *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
-       __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
  
         reclaim_stat->recent_scanned[0] += *nr_anon;
         reclaim_stat->recent_scanned[1] += *nr_file;
@@ -1466,8 +1499,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
   * of reclaimed pages
   */
  static noinline_for_stack unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
-                       struct scan_control *sc, int priority, int file)
+shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
+                    struct scan_control *sc, int priority, int file)
  {
         LIST_HEAD(page_list);
         unsigned long nr_scanned;
@@ -1478,6 +1511,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
         unsigned long nr_dirty = 0;
         unsigned long nr_writeback = 0;
         isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
+       struct zone *zone = mz->zone;
  
         while (unlikely(too_many_isolated(zone, file, sc))) {
                 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1500,9 +1534,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
  
         spin_lock_irq(&zone->lru_lock);
  
-       if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_to_scan, &page_list,
-                       &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
+       nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list,
+                                    &nr_scanned, sc->order,
+                                    reclaim_mode, 0, file);
+       if (global_reclaim(sc)) {
                 zone->pages_scanned += nr_scanned;
                 if (current_is_kswapd())
                         __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1510,14 +1545,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                 else
                         __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                                nr_scanned);
-       } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
-                       &nr_scanned, sc->order, reclaim_mode, zone,
-                       sc->mem_cgroup, 0, file);
-               /*
-                * mem_cgroup_isolate_pages() keeps track of
-                * scanned pages on its own.
-                */
         }
  
         if (nr_taken == 0) {
@@ -1525,26 +1552,37 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                 return 0;
         }
  
-       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
+       update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
+
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
  
         spin_unlock_irq(&zone->lru_lock);
  
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+       nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
                                                 &nr_dirty, &nr_writeback);
  
         /* Check if we should syncronously wait for writeback */
         if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                 set_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+               nr_reclaimed += shrink_page_list(&page_list, mz, sc,
                                         priority, &nr_dirty, &nr_writeback);
         }
  
-       local_irq_disable();
+       spin_lock_irq(&zone->lru_lock);
+
         if (current_is_kswapd())
                 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
         __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
  
-       putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+       putback_inactive_pages(mz, &page_list);
+
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
+       spin_unlock_irq(&zone->lru_lock);
+
+       free_hot_cold_page_list(&page_list, 1);
  
         /*
          * If reclaim is isolating dirty pages under writeback, it implies
@@ -1600,30 +1638,47 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
  
  static void move_active_pages_to_lru(struct zone *zone,
                                      struct list_head *list,
+                                    struct list_head *pages_to_free,
                                      enum lru_list lru)
  {
         unsigned long pgmoved = 0;
-       struct pagevec pvec;
         struct page *page;
  
-       pagevec_init(&pvec, 1);
+       if (buffer_heads_over_limit) {
+               spin_unlock_irq(&zone->lru_lock);
+               list_for_each_entry(page, list, lru) {
+                       if (page_has_private(page) && trylock_page(page)) {
+                               if (page_has_private(page))
+                                       try_to_release_page(page, 0);
+                               unlock_page(page);
+                       }
+               }
+               spin_lock_irq(&zone->lru_lock);
+       }
  
         while (!list_empty(list)) {
+               struct lruvec *lruvec;
+
                 page = lru_to_page(list);
  
                 VM_BUG_ON(PageLRU(page));
                 SetPageLRU(page);
  
-               list_move(&page->lru, &zone->lru[lru].list);
-               mem_cgroup_add_lru_list(page, lru);
+               lruvec = mem_cgroup_lru_add_list(zone, page, lru);
+               list_move(&page->lru, &lruvec->lists[lru]);
                 pgmoved += hpage_nr_pages(page);
  
-               if (!pagevec_add(&pvec, page) || list_empty(list)) {
-                       spin_unlock_irq(&zone->lru_lock);
-                       if (buffer_heads_over_limit)
-                               pagevec_strip(&pvec);
-                       __pagevec_release(&pvec);
-                       spin_lock_irq(&zone->lru_lock);
+               if (put_page_testzero(page)) {
+                       __ClearPageLRU(page);
+                       __ClearPageActive(page);
+                       del_page_from_lru_list(zone, page, lru);
+
+                       if (unlikely(PageCompound(page))) {
+                               spin_unlock_irq(&zone->lru_lock);
+                               (*get_compound_page_dtor(page))(page);
+                               spin_lock_irq(&zone->lru_lock);
+                       } else
+                               list_add(&page->lru, pages_to_free);
                 }
         }
         __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1631,19 +1686,22 @@ static void move_active_pages_to_lru(struct zone *zone,
                 __count_vm_events(PGDEACTIVATE, pgmoved);
  }
  
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-                       struct scan_control *sc, int priority, int file)
+static void shrink_active_list(unsigned long nr_to_scan,
+                              struct mem_cgroup_zone *mz,
+                              struct scan_control *sc,
+                              int priority, int file)
  {
         unsigned long nr_taken;
-       unsigned long pgscanned;
+       unsigned long nr_scanned;
         unsigned long vm_flags;
         LIST_HEAD(l_hold);      /* The pages which were snipped off */
         LIST_HEAD(l_active);
         LIST_HEAD(l_inactive);
         struct page *page;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
         unsigned long nr_rotated = 0;
         isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
+       struct zone *zone = mz->zone;
  
         lru_add_drain();
  
@@ -1653,26 +1711,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                 reclaim_mode |= ISOLATE_CLEAN;
  
         spin_lock_irq(&zone->lru_lock);
-       if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_pages, &l_hold,
-                                               &pgscanned, sc->order,
-                                               reclaim_mode, zone,
-                                               1, file);
-               zone->pages_scanned += pgscanned;
-       } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
-                                               &pgscanned, sc->order,
-                                               reclaim_mode, zone,
-                                               sc->mem_cgroup, 1, file);
-               /*
-                * mem_cgroup_isolate_pages() keeps track of
-                * scanned pages on its own.
-                */
-       }
+
+       nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold,
+                                    &nr_scanned, sc->order,
+                                    reclaim_mode, 1, file);
+       if (global_reclaim(sc))
+               zone->pages_scanned += nr_scanned;
  
         reclaim_stat->recent_scanned[file] += nr_taken;
  
-       __count_zone_vm_events(PGREFILL, zone, pgscanned);
+       __count_zone_vm_events(PGREFILL, zone, nr_scanned);
         if (file)
                 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
         else
@@ -1690,7 +1738,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                         continue;
                 }
  
-               if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
+               if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
                         nr_rotated += hpage_nr_pages(page);
                         /*
                          * Identify referenced, file-backed active pages and
@@ -1723,12 +1771,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
          */
         reclaim_stat->recent_rotated[file] += nr_rotated;
  
-       move_active_pages_to_lru(zone, &l_active,
+       move_active_pages_to_lru(zone, &l_active, &l_hold,
                                                 LRU_ACTIVE + file * LRU_FILE);
-       move_active_pages_to_lru(zone, &l_inactive,
+       move_active_pages_to_lru(zone, &l_inactive, &l_hold,
                                                 LRU_BASE   + file * LRU_FILE);
         __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
         spin_unlock_irq(&zone->lru_lock);
+
+       free_hot_cold_page_list(&l_hold, 1);
  }
  
  #ifdef CONFIG_SWAP
@@ -1753,10 +1803,8 @@ static int inactive_anon_is_low_global(struct zone *zone)
   * Returns true if the zone does not have enough inactive anon pages,
   * meaning some active anon pages need to be deactivated.
   */
-static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
+static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
  {
-       int low;
-
         /*
          * If we don't have swap space, anonymous page deactivation
          * is pointless.
@@ -1764,15 +1812,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
         if (!total_swap_pages)
                 return 0;
  
-       if (scanning_global_lru(sc))
-               low = inactive_anon_is_low_global(zone);
-       else
-               low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
-       return low;
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
+                                                      mz->zone);
+
+       return inactive_anon_is_low_global(mz->zone);
  }
  #else
-static inline int inactive_anon_is_low(struct zone *zone,
-                                       struct scan_control *sc)
+static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
  {
         return 0;
  }
@@ -1790,8 +1837,7 @@ static int inactive_file_is_low_global(struct zone *zone)
  
  /**
   * inactive_file_is_low - check if file pages need to be deactivated
- * @zone: zone to check
- * @sc:   scan control of this context
+ * @mz: memory cgroup and zone to check
   *
   * When the system is doing streaming IO, memory pressure here
   * ensures that active file pages get deactivated, until more
@@ -1803,45 +1849,44 @@ static int inactive_file_is_low_global(struct zone *zone)
   * This uses a different ratio than the anonymous pages, because
   * the page cache uses a use-once replacement algorithm.
   */
-static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
+static int inactive_file_is_low(struct mem_cgroup_zone *mz)
  {
-       int low;
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
+                                                      mz->zone);
  
-       if (scanning_global_lru(sc))
-               low = inactive_file_is_low_global(zone);
-       else
-               low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
-       return low;
+       return inactive_file_is_low_global(mz->zone);
  }
  
-static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
-                               int file)
+static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
  {
         if (file)
-               return inactive_file_is_low(zone, sc);
+               return inactive_file_is_low(mz);
         else
-               return inactive_anon_is_low(zone, sc);
+               return inactive_anon_is_low(mz);
  }
  
  static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-       struct zone *zone, struct scan_control *sc, int priority)
+                                struct mem_cgroup_zone *mz,
+                                struct scan_control *sc, int priority)
  {
         int file = is_file_lru(lru);
  
         if (is_active_lru(lru)) {
-               if (inactive_list_is_low(zone, sc, file))
-                   shrink_active_list(nr_to_scan, zone, sc, priority, file);
+               if (inactive_list_is_low(mz, file))
+                       shrink_active_list(nr_to_scan, mz, sc, priority, file);
                 return 0;
         }
  
-       return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
+       return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
  }
  
-static int vmscan_swappiness(struct scan_control *sc)
+static int vmscan_swappiness(struct mem_cgroup_zone *mz,
+                            struct scan_control *sc)
  {
-       if (scanning_global_lru(sc))
+       if (global_reclaim(sc)) /* keyed on reclaim type, not on mz */
                 return vm_swappiness;
-       return mem_cgroup_swappiness(sc->mem_cgroup);
+       return mem_cgroup_swappiness(mz->mem_cgroup); /* limit reclaim */
  }
  
  /*
@@ -1852,15 +1897,15 @@ static int vmscan_swappiness(struct scan_control *sc)
   *
   * nr[0] = anon pages to scan; nr[1] = file pages to scan
   */
-static void get_scan_count(struct zone *zone, struct scan_control *sc,
-                                       unsigned long *nr, int priority)
+static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
+                          unsigned long *nr, int priority)
  {
         unsigned long anon, file, free;
         unsigned long anon_prio, file_prio;
         unsigned long ap, fp;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
         u64 fraction[2], denominator;
-       enum lru_list l;
+       enum lru_list lru;
         int noswap = 0;
         bool force_scan = false;
  
@@ -1874,9 +1919,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
          * latencies, so it's better to scan a minimum amount there as
          * well.
          */
-       if (scanning_global_lru(sc) && current_is_kswapd())
+       if (current_is_kswapd() && mz->zone->all_unreclaimable)
                 force_scan = true;
-       if (!scanning_global_lru(sc))
+       if (!global_reclaim(sc))
                 force_scan = true;
  
         /* If we have no swap space, do not bother scanning anon pages. */
@@ -1888,16 +1933,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                 goto out;
         }
  
-       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       anon  = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
+               zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
+       file  = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
+               zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
  
-       if (scanning_global_lru(sc)) {
-               free  = zone_page_state(zone, NR_FREE_PAGES);
+       if (global_reclaim(sc)) {
+               free  = zone_page_state(mz->zone, NR_FREE_PAGES);
                 /* If we have very few page cache pages,
                    force-scan anon pages. */
-               if (unlikely(file + free <= high_wmark_pages(zone))) {
+               if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
                         fraction[0] = 1;
                         fraction[1] = 0;
                         denominator = 1;
@@ -1909,8 +1954,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
          * With swappiness at 100, anonymous and file have the same priority.
          * This scanning priority is essentially the inverse of IO cost.
          */
-       anon_prio = vmscan_swappiness(sc);
-       file_prio = 200 - vmscan_swappiness(sc);
+       anon_prio = vmscan_swappiness(mz, sc);
+       file_prio = 200 - vmscan_swappiness(mz, sc);
  
         /*
          * OK, so we have swap space and a fair amount of page cache
@@ -1923,7 +1968,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
          *
          * anon in [0], file in [1]
          */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&mz->zone->lru_lock);
         if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
                 reclaim_stat->recent_scanned[0] /= 2;
                 reclaim_stat->recent_rotated[0] /= 2;
@@ -1944,24 +1989,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
  
         fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
         fp /= reclaim_stat->recent_rotated[1] + 1;
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&mz->zone->lru_lock);
  
         fraction[0] = ap;
         fraction[1] = fp;
         denominator = ap + fp + 1;
  out:
-       for_each_evictable_lru(l) {
-               int file = is_file_lru(l);
+       for_each_evictable_lru(lru) {
+               int file = is_file_lru(lru);
                 unsigned long scan;
  
-               scan = zone_nr_lru_pages(zone, sc, l);
+               scan = zone_nr_lru_pages(mz, lru);
                 if (priority || noswap) {
                         scan >>= priority;
                         if (!scan && force_scan)
                                 scan = SWAP_CLUSTER_MAX;
                         scan = div64_u64(scan * fraction[file], denominator);
                 }
-               nr[l] = scan;
+               nr[lru] = scan;
         }
  }
  
@@ -1972,7 +2017,7 @@ out:
   * back to the allocator and call try_to_compact_zone(), we ensure that
   * there are enough free pages for it to be likely successful
   */
-static inline bool should_continue_reclaim(struct zone *zone,
+static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
                                         unsigned long nr_reclaimed,
                                         unsigned long nr_scanned,
                                         struct scan_control *sc)
@@ -2012,14 +2057,15 @@ static inline bool should_continue_reclaim(struct zone *zone,
          * inactive lists are large enough, continue reclaiming
          */
         pages_for_compaction = (2UL << sc->order);
-       inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
-                               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+       if (nr_swap_pages > 0)
+               inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
         if (sc->nr_reclaimed < pages_for_compaction &&
                         inactive_lru_pages > pages_for_compaction)
                 return true;
  
         /* If compaction would go ahead or the allocation would succeed, stop */
-       switch (compaction_suitable(zone, sc->order)) {
+       switch (compaction_suitable(mz->zone, sc->order)) {
         case COMPACT_PARTIAL:
         case COMPACT_CONTINUE:
                 return false;
@@ -2031,12 +2077,12 @@ static inline bool should_continue_reclaim(struct zone *zone,
  /*
   * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
   */
-static void shrink_zone(int priority, struct zone *zone,
-                               struct scan_control *sc)
+static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
+                                  struct scan_control *sc)
  {
         unsigned long nr[NR_LRU_LISTS];
         unsigned long nr_to_scan;
-       enum lru_list l;
+       enum lru_list lru;
         unsigned long nr_reclaimed, nr_scanned;
         unsigned long nr_to_reclaim = sc->nr_to_reclaim;
         struct blk_plug plug;
@@ -2044,19 +2090,19 @@ static void shrink_zone(int priority, struct zone *zone,
  restart:
         nr_reclaimed = 0;
         nr_scanned = sc->nr_scanned;
-       get_scan_count(zone, sc, nr, priority);
+       get_scan_count(mz, sc, nr, priority);
  
         blk_start_plug(&plug);
         while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                         nr[LRU_INACTIVE_FILE]) {
-               for_each_evictable_lru(l) {
-                       if (nr[l]) {
+               for_each_evictable_lru(lru) {
+                       if (nr[lru]) {
                                 nr_to_scan = min_t(unsigned long,
-                                                  nr[l], SWAP_CLUSTER_MAX);
-                               nr[l] -= nr_to_scan;
+                                                  nr[lru], SWAP_CLUSTER_MAX);
+                               nr[lru] -= nr_to_scan;
  
-                               nr_reclaimed += shrink_list(l, nr_to_scan,
-                                                           zone, sc, priority);
+                               nr_reclaimed += shrink_list(lru, nr_to_scan,
+                                                           mz, sc, priority);
                         }
                 }
                 /*
@@ -2077,17 +2123,89 @@ restart:
          * Even if we did not try to evict anon pages at all, we want to
          * rebalance the anon lru active/inactive ratio.
          */
-       if (inactive_anon_is_low(zone, sc))
-               shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+       if (inactive_anon_is_low(mz))
+               shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
  
         /* reclaim/compaction might need reclaim to continue */
-       if (should_continue_reclaim(zone, nr_reclaimed,
+       if (should_continue_reclaim(mz, nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc))
                 goto restart;
  
         throttle_vm_writeout(sc->gfp_mask);
  }
  
+static void shrink_zone(int priority, struct zone *zone,
+                       struct scan_control *sc)
+{
+       struct mem_cgroup *root = sc->target_mem_cgroup; /* NULL if global */
+       struct mem_cgroup_reclaim_cookie reclaim = {
+               .zone = zone,
+               .priority = priority,
+       };
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_iter(root, NULL, &reclaim); /* first memcg */
+       do {
+               struct mem_cgroup_zone mz = {
+                       .mem_cgroup = memcg,
+                       .zone = zone,
+               };
+
+               shrink_mem_cgroup_zone(priority, &mz, sc);
+               /*
+                * Limit reclaim has historically picked one memcg and
+                * scanned it with decreasing priority levels until
+                * nr_to_reclaim had been reclaimed.  This priority
+                * cycle is thus over after a single memcg.
+                *
+                * Direct reclaim and kswapd, on the other hand, have
+                * to scan all memory cgroups to fulfill the overall
+                * scan target for the zone.
+                */
+               if (!global_reclaim(sc)) {
+                       mem_cgroup_iter_break(root, memcg); /* end walk early */
+                       break;
+               }
+               memcg = mem_cgroup_iter(root, memcg, &reclaim); /* next memcg */
+       } while (memcg); /* NULL from the iterator ends the walk */
+}
+
+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+       unsigned long balance_gap, watermark;
+       bool watermark_ok;
+
+       /* Do not consider compaction for orders reclaim is meant to satisfy */
+       if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+               return false;
+
+       /*
+        * Compaction takes time to run and other callers may consume the
+        * pages just freed.  Keep reclaiming until free pages exceed the
+        * high watermark plus a gap (capped at the low watermark) plus
+        * twice the request size, so compaction has a chance to succeed.
+        */
+       balance_gap = min(low_wmark_pages(zone),
+               (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+                       KSWAPD_ZONE_BALANCE_GAP_RATIO);
+       watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+       /*
+        * If compaction is deferred, reclaim up to a point where
+        * compaction will have a chance of success when re-enabled
+        */
+       if (compaction_deferred(zone))
+               return watermark_ok;
+
+       /* If compaction is not ready to start, keep reclaiming */
+       if (!compaction_suitable(zone, sc->order)) /* presumably 0 == skip */
+               return false;
+
+       return watermark_ok;
+}
+
  /*
   * This is the direct reclaim path, for page-allocating processes.  We only
   * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2105,8 +2223,9 @@ restart:
   * scan then give up on it.
   *
   * This function returns true if a zone is being reclaimed for a costly
- * high-order allocation and compaction is either ready to begin or deferred.
- * This indicates to the caller that it should retry the allocation or fail.
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
   */
  static bool shrink_zones(int priority, struct zonelist *zonelist,
                                         struct scan_control *sc)
@@ -2115,7 +2234,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
         struct zone *zone;
         unsigned long nr_soft_reclaimed;
         unsigned long nr_soft_scanned;
-       bool should_abort_reclaim = false;
+       bool aborted_reclaim = false;
  
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                         gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2125,7 +2244,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
                  * Take care memory controller reclaiming has small influence
                  * to global LRU.
                  */
-               if (scanning_global_lru(sc)) {
+               if (global_reclaim(sc)) {
                         if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                 continue;
                         if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -2140,10 +2259,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
                                  * noticable problem, like transparent huge page
                                  * allocations.
                                  */
-                               if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
-                                       (compaction_suitable(zone, sc->order) ||
-                                        compaction_deferred(zone))) {
-                                       should_abort_reclaim = true;
+                               if (compaction_ready(zone, sc)) {
+                                       aborted_reclaim = true;
                                         continue;
                                 }
                         }
@@ -2165,7 +2282,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
                 shrink_zone(priority, zone, sc);
         }
  
-       return should_abort_reclaim;
+       return aborted_reclaim;
  }
  
  static bool zone_reclaimable(struct zone *zone)
@@ -2219,25 +2336,25 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
         struct zoneref *z;
         struct zone *zone;
         unsigned long writeback_threshold;
+       bool aborted_reclaim;
  
         get_mems_allowed();
         delayacct_freepages_start();
  
-       if (scanning_global_lru(sc))
+       if (global_reclaim(sc))
                 count_vm_event(ALLOCSTALL);
  
         for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                 sc->nr_scanned = 0;
                 if (!priority)
-                       disable_swap_token(sc->mem_cgroup);
-               if (shrink_zones(priority, zonelist, sc))
-                       break;
+                       disable_swap_token(sc->target_mem_cgroup);
+               aborted_reclaim = shrink_zones(priority, zonelist, sc);
  
                 /*
                  * Don't shrink slabs when reclaiming memory from
                  * over limit cgroups
                  */
-               if (scanning_global_lru(sc)) {
+               if (global_reclaim(sc)) {
                         unsigned long lru_pages = 0;
                         for_each_zone_zonelist(zone, z, zonelist,
                                         gfp_zone(sc->gfp_mask)) {
@@ -2298,8 +2415,12 @@ out:
         if (oom_killer_disabled)
                 return 0;
  
+       /* Aborted reclaim to try compaction? don't OOM, then */
+       if (aborted_reclaim)
+               return 1;
+
         /* top priority shrink_zones still had more to do? don't OOM, then */
-       if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
+       if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
                 return 1;
  
         return 0;
@@ -2316,7 +2437,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                 .may_unmap = 1,
                 .may_swap = 1,
                 .order = order,
-               .mem_cgroup = NULL,
+               .target_mem_cgroup = NULL,
                 .nodemask = nodemask,
         };
         struct shrink_control shrink = {
@@ -2336,7 +2457,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  
-unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                                                 gfp_t gfp_mask, bool noswap,
                                                 struct zone *zone,
                                                 unsigned long *nr_scanned)
@@ -2348,7 +2469,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                 .may_unmap = 1,
                 .may_swap = !noswap,
                 .order = 0,
-               .mem_cgroup = mem,
+               .target_mem_cgroup = memcg,
+       };
+       struct mem_cgroup_zone mz = {
+               .mem_cgroup = memcg,
+               .zone = zone,
         };
  
         sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2365,7 +2490,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
          * will pick up pages from other mem cgroup's as well. We hack
          * the priority and make it zero.
          */
-       shrink_zone(0, zone, &sc);
+       shrink_mem_cgroup_zone(0, &mz, &sc);
  
         trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
  
@@ -2373,7 +2498,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
         return sc.nr_reclaimed;
  }
  
-unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                            gfp_t gfp_mask,
                                            bool noswap)
  {
@@ -2386,7 +2511,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                 .may_swap = !noswap,
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
                 .order = 0,
-               .mem_cgroup = mem_cont,
+               .target_mem_cgroup = memcg,
                 .nodemask = NULL, /* we don't care the placement */
                 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2400,7 +2525,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
          * take care of from where we get pages. So the node where we start the
          * scan does not need to be the current node.
          */
-       nid = mem_cgroup_select_victim_node(mem_cont);
+       nid = mem_cgroup_select_victim_node(memcg);
  
         zonelist = NODE_DATA(nid)->node_zonelists;
  
@@ -2416,6 +2541,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
  }
  #endif
  
+static void age_active_anon(struct zone *zone, struct scan_control *sc,
+                           int priority)
+{
+       struct mem_cgroup *memcg;
+
+       if (!total_swap_pages) /* no swap: aging anon is pointless */
+               return;
+
+       memcg = mem_cgroup_iter(NULL, NULL, NULL); /* NULL root, no cookie */
+       do {
+               struct mem_cgroup_zone mz = {
+                       .mem_cgroup = memcg,
+                       .zone = zone,
+               };
+
+               if (inactive_anon_is_low(&mz))
+                       shrink_active_list(SWAP_CLUSTER_MAX, &mz,
+                                          sc, priority, 0); /* 0: anon */
+
+               memcg = mem_cgroup_iter(NULL, memcg, NULL);
+       } while (memcg); /* NULL from the iterator ends the walk */
+}
+
  /*
   * pgdat_balanced is used when checking if a node is balanced for high-order
   * allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2536,7 +2684,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                  */
                 .nr_to_reclaim = ULONG_MAX,
                 .order = order,
-               .mem_cgroup = NULL,
+               .target_mem_cgroup = NULL,
         };
         struct shrink_control shrink = {
                 .gfp_mask = sc.gfp_mask,
@@ -2575,9 +2723,7 @@ loop_again:
                          * Do some background aging of the anon list, to give
                          * pages a chance to be referenced before reclaiming.
                          */
-                       if (inactive_anon_is_low(zone, &sc))
-                               shrink_active_list(SWAP_CLUSTER_MAX, zone,
-                                                       &sc, priority, 0);
+                       age_active_anon(zone, &sc, priority);
  
                         if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), 0, 0)) {
@@ -3366,16 +3512,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
   */
  static void check_move_unevictable_page(struct page *page, struct zone *zone)
  {
-       VM_BUG_ON(PageActive(page));
+       struct lruvec *lruvec;
  
+       VM_BUG_ON(PageActive(page));
  retry:
         ClearPageUnevictable(page);
         if (page_evictable(page, NULL)) {
                 enum lru_list l = page_lru_base_type(page);
  
                 __dec_zone_state(zone, NR_UNEVICTABLE);
-               list_move(&page->lru, &zone->lru[l].list);
-               mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
+               lruvec = mem_cgroup_lru_move_lists(zone, page,
+                                                  LRU_UNEVICTABLE, l);
+               list_move(&page->lru, &lruvec->lists[l]);
                 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
                 __count_vm_event(UNEVICTABLE_PGRESCUED);
         } else {
@@ -3383,8 +3531,9 @@ retry:
                  * rotate unevictable list
                  */
                 SetPageUnevictable(page);
-               list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
-               mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
+               lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE,
+                                                  LRU_UNEVICTABLE);
+               list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]);
                 if (page_evictable(page, NULL))
                         goto retry;
         }
@@ -3448,9 +3597,10 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
  static void warn_scan_unevictable_pages(void)
  {
         printk_once(KERN_WARNING
-                   "The scan_unevictable_pages sysctl/node-interface has been "
+                   "%s: The scan_unevictable_pages sysctl/node-interface has been "
                     "disabled for lack of a legitimate use case.  If you have "
-                   "one, please send an email to linux-mm@kvack.org.\n");
+                   "one, please send an email to linux-mm@kvack.org.\n",
+                   current->comm);
  }
  
  /*
@@ -3475,16 +3625,16 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
   * a specified node's per zone unevictable lists for evictable pages.
   */
  
-static ssize_t read_scan_unevictable_node(struct sys_device *dev,
-                                         struct sysdev_attribute *attr,
+static ssize_t read_scan_unevictable_node(struct device *dev,
+                                         struct device_attribute *attr,
                                           char *buf)
  {
         warn_scan_unevictable_pages();
         return sprintf(buf, "0\n");     /* always zero; should fit... */
  }
  
-static ssize_t write_scan_unevictable_node(struct sys_device *dev,
-                                          struct sysdev_attribute *attr,
+static ssize_t write_scan_unevictable_node(struct device *dev,
+                                          struct device_attribute *attr,
                                         const char *buf, size_t count)
  {
         warn_scan_unevictable_pages();
@@ -3492,17 +3642,17 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
  }
  
  
-static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
                         read_scan_unevictable_node,
                         write_scan_unevictable_node);
  
  int scan_unevictable_register_node(struct node *node)
  {
-       return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
+       return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
  }
  
  void scan_unevictable_unregister_node(struct node *node)
  {
-       sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
+       device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
  }
  #endif