diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b8ba3a..ba0fa4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/nmi.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -541,7 +542,7 @@ static inline void __free_one_page(struct page *page,
                combined_idx = buddy_idx & page_idx;
                higher_page = page + (combined_idx - page_idx);
                buddy_idx = __find_buddy_index(combined_idx, order + 1);
-               higher_buddy = page + (buddy_idx - combined_idx);
+               higher_buddy = higher_page + (buddy_idx - combined_idx);
                if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                        list_add_tail(&page->lru,
                                &zone->free_area[order].free_list[migratetype]);
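The one-line fix above is easiest to see with concrete indices. In this kernel, __find_buddy_index(idx, order) is idx ^ (1 << order), and the offset it yields is relative to combined_idx, so it has to be added to higher_page (the page at combined_idx), not to page. A minimal userspace sketch of the arithmetic, kernel structures omitted:

    #include <assert.h>

    /* Buddy index at a given order: flip the order bit, as
     * __find_buddy_index() does in this kernel. */
    static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
    {
            return page_idx ^ (1UL << order);
    }

    int main(void)
    {
            unsigned long page_idx = 6, order = 1;
            unsigned long buddy_idx = find_buddy_index(page_idx, order);          /* 4 */
            unsigned long combined_idx = buddy_idx & page_idx;                    /* 4 */
            /* higher_page lives at combined_idx; its buddy one order up: */
            unsigned long higher_idx = find_buddy_index(combined_idx, order + 1); /* 0 */

            /* Correct: higher_page + (higher_idx - combined_idx) -> index 0.
             * Broken:  page + (higher_idx - combined_idx)        -> index 2,
             * so page_is_buddy() was checked against the wrong page. */
            assert(combined_idx + (higher_idx - combined_idx) == 0);
            assert(page_idx + (higher_idx - combined_idx) == 2);
            return 0;
    }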
@@ -1124,9 +1125,14 @@ void drain_all_pages(void)
 
 #ifdef CONFIG_HIBERNATION
 
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT  (128*1024)
+
 void mark_free_pages(struct zone *zone)
 {
-       unsigned long pfn, max_zone_pfn;
+       unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
        unsigned long flags;
        int order, t;
        struct list_head *curr;
@@ -1141,6 +1147,11 @@ void mark_free_pages(struct zone *zone)
                if (pfn_valid(pfn)) {
                        struct page *page = pfn_to_page(pfn);
 
+                       if (!--page_count) {
+                               touch_nmi_watchdog();
+                               page_count = WD_PAGE_COUNT;
+                       }
+
                        if (!swsusp_page_is_forbidden(page))
                                swsusp_unset_page_free(page);
                }
@@ -1150,8 +1161,13 @@ void mark_free_pages(struct zone *zone)
                        unsigned long i;
 
                        pfn = page_to_pfn(list_entry(curr, struct page, lru));
-                       for (i = 0; i < (1UL << order); i++)
+                       for (i = 0; i < (1UL << order); i++) {
+                               if (!--page_count) {
+                                       touch_nmi_watchdog();
+                                       page_count = WD_PAGE_COUNT;
+                               }
                                swsusp_set_page_free(pfn_to_page(pfn + i));
+                       }
                }
        }
        spin_unlock_irqrestore(&zone->lock, flags);
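Both loops above apply the same countdown idiom: do the cheap per-page work, and reset a shared budget with touch_nmi_watchdog() before it hits zero, so walking a multi-gigabyte zone under zone->lock cannot trip the NMI or soft-lockup detectors. A standalone sketch of the idiom; touch_watchdog() and walk_pfns() are illustrative stand-ins, not kernel interfaces:

    #define WD_PAGE_COUNT (128 * 1024)

    static void touch_watchdog(void)
    {
            /* stand-in for touch_nmi_watchdog() */
    }

    static void walk_pfns(unsigned long nr_pages)
    {
            unsigned long page_count = WD_PAGE_COUNT;
            unsigned long pfn;

            for (pfn = 0; pfn < nr_pages; pfn++) {
                    /* Budget is pre-decremented: the watchdog is touched once
                     * every WD_PAGE_COUNT iterations, not on the first one. */
                    if (!--page_count) {
                            touch_watchdog();
                            page_count = WD_PAGE_COUNT;
                    }
                    /* ... per-page work (swsusp_unset_page_free() etc.) ... */
            }
    }

    int main(void)
    {
            walk_pfns(1UL << 22);   /* 4M pages, ~16GB of 4K pages */
            return 0;
    }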
@@ -1759,6 +1775,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
        if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
                return;
 
+       /*
+        * Walking all memory to count page types is very expensive and should
+        * be inhibited in non-blockable contexts.
+        */
+       if (!(gfp_mask & __GFP_WAIT))
+               filter |= SHOW_MEM_FILTER_PAGE_COUNT;
+
        /*
         * This documents exceptions given to allocations in certain
         * contexts that are allowed to allocate outside current's set
@@ -1886,13 +1909,19 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress,
-       bool sync_migration)
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
 {
        struct page *page;
 
-       if (!order || compaction_deferred(preferred_zone))
+       if (!order)
+               return NULL;
+
+       if (compaction_deferred(preferred_zone)) {
+               *deferred_compaction = true;
                return NULL;
+       }
 
        current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
@@ -1921,7 +1950,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                 * but not enough to satisfy watermarks.
                 */
                count_vm_event(COMPACTFAIL);
-               defer_compaction(preferred_zone);
+
+               /*
+                * As async compaction considers a subset of pageblocks, only
+                * defer if the failure was a sync compaction failure.
+                */
+               if (sync_migration)
+                       defer_compaction(preferred_zone);
 
                cond_resched();
        }
@@ -1933,8 +1968,9 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress,
-       bool sync_migration)
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
 {
        return NULL;
 }
@@ -2034,7 +2070,7 @@ static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
 
        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
        BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2043,20 +2079,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         * The caller may dip into page reserves a bit more if the caller
         * cannot run direct reclaim, or if the caller has realtime scheduling
         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
         */
        alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
-       if (!wait) {
+       if (atomic) {
                /*
-                * Not worth trying to allocate harder for
-                * __GFP_NOMEMALLOC even if it can't schedule.
+                * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+                * if it can't schedule.
                 */
-               if  (!(gfp_mask & __GFP_NOMEMALLOC))
+               if (!(gfp_mask & __GFP_NOMEMALLOC))
                        alloc_flags |= ALLOC_HARDER;
                /*
-                * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
-                * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+                * Ignore cpuset mems for GFP_ATOMIC rather than fail; see the
+                * comment for __cpuset_node_allowed_softwall().
                 */
                alloc_flags &= ~ALLOC_CPUSET;
        } else if (unlikely(rt_task(current)) && !in_interrupt())
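The net effect of the new test: an allocation is treated as atomic only when neither __GFP_WAIT nor __GFP_NO_KSWAPD is set, so a request that cannot wait but carries __GFP_NO_KSWAPD no longer gets ALLOC_HARDER or bypasses cpusets. A userspace model of the flag computation, with the realtime-task branch omitted; the constants are illustrative stand-ins, not the kernel's actual values:

    #include <stdio.h>

    /* Stand-in values; only the relationships matter here (ALLOC_HIGH is
     * deliberately equal to __GFP_HIGH, as the kernel's BUILD_BUG_ON asserts). */
    #define __GFP_WAIT        0x10u
    #define __GFP_HIGH        0x20u
    #define __GFP_NOMEMALLOC  0x10000u
    #define __GFP_NO_KSWAPD   0x400000u

    #define ALLOC_WMARK_MIN   0x00u
    #define ALLOC_HARDER      0x10u
    #define ALLOC_HIGH        0x20u
    #define ALLOC_CPUSET      0x40u

    static unsigned int gfp_to_alloc_flags(unsigned int gfp_mask)
    {
            unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
            int atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));

            alloc_flags |= gfp_mask & __GFP_HIGH;
            if (atomic) {
                    if (!(gfp_mask & __GFP_NOMEMALLOC))
                            alloc_flags |= ALLOC_HARDER;    /* dig deeper */
                    alloc_flags &= ~ALLOC_CPUSET;           /* ignore cpuset */
            }
            return alloc_flags;
    }

    int main(void)
    {
            /* GFP_ATOMIC (~ __GFP_HIGH, no __GFP_WAIT): harder, no cpuset. */
            printf("atomic:    %#x\n", gfp_to_alloc_flags(__GFP_HIGH));
            /* __GFP_NO_KSWAPD without __GFP_WAIT: no longer "atomic". */
            printf("no-kswapd: %#x\n", gfp_to_alloc_flags(__GFP_NO_KSWAPD));
            return 0;
    }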
@@ -2084,6 +2120,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        unsigned long pages_reclaimed = 0;
        unsigned long did_some_progress;
        bool sync_migration = false;
+       bool deferred_compaction = false;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -2164,12 +2201,22 @@ rebalance:
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress,
-                                       sync_migration);
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
        if (page)
                goto got_pg;
        sync_migration = true;
 
+       /*
+        * If compaction is deferred for high-order allocations, it is because
+        * sync compaction recently failed. If this is the case and the caller
+        * has requested the system not be heavily disrupted, fail the
+        * allocation now instead of entering direct reclaim.
+        */
+       if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+               goto nopage;
+
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
                                        zonelist, high_zoneidx,
@@ -2232,8 +2279,9 @@ rebalance:
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress,
-                                       sync_migration);
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
                if (page)
                        goto got_pg;
        }
@@ -2257,8 +2305,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct zone *preferred_zone;
-       struct page *page;
+       struct page *page = NULL;
        int migratetype = allocflags_to_migratetype(gfp_mask);
+       unsigned int cpuset_mems_cookie;
 
        gfp_mask &= gfp_allowed_mask;
 
@@ -2277,15 +2326,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
 
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
        /* The preferred zone is used for statistics later */
        first_zones_zonelist(zonelist, high_zoneidx,
                                nodemask ? : &cpuset_current_mems_allowed,
                                &preferred_zone);
-       if (!preferred_zone) {
-               put_mems_allowed();
-               return NULL;
-       }
+       if (!preferred_zone)
+               goto out;
 
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2295,9 +2344,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
-       put_mems_allowed();
 
        trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+       /*
+        * When updating a task's mems_allowed, it is possible to race with
+        * parallel threads in such a way that an allocation can fail while
+        * the mask is being updated. If a page allocation is about to fail,
+        * check if the cpuset changed during allocation and if so, retry.
+        */
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
        return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
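The retry_cpuset/cookie dance turns get_mems_allowed()/put_mems_allowed() into a seqlock-style read section: the cookie samples a sequence count before the allocation, and put_mems_allowed(cookie) reports whether mems_allowed changed underneath it, in which case a failed allocation is retried instead of being reported. The same idiom reappears in the skip_free_areas_node() hunk below. A minimal model with a bare sequence counter; the names and the elided memory barriers are simplifications of this sketch, not the kernel's seqcount API:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Writer increments around updates; an odd value means "in progress".
     * Real seqcounts also issue memory barriers, elided here. */
    static _Atomic unsigned int mems_seq;

    static unsigned int mems_read_begin(void)        /* ~ get_mems_allowed() */
    {
            unsigned int seq;

            do {
                    seq = atomic_load(&mems_seq);
            } while (seq & 1);                       /* wait out a writer */
            return seq;
    }

    static bool mems_read_retry(unsigned int cookie) /* ~ !put_mems_allowed() */
    {
            return atomic_load(&mems_seq) != cookie;
    }

    void *alloc_pages_model(void *(*try_alloc)(void))
    {
            unsigned int cookie;
            void *page;

            do {
                    cookie = mems_read_begin();
                    page = try_alloc();
                    /* Only a *failed* allocation that raced with a
                     * mems_allowed update is retried. */
            } while (!page && mems_read_retry(cookie));
            return page;
    }

    static void *always_fails(void) { return NULL; }

    int main(void)
    {
            return alloc_pages_model(always_fails) == NULL ? 0 : 1;
    }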
@@ -2521,13 +2580,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 bool skip_free_areas_node(unsigned int flags, int nid)
 {
        bool ret = false;
+       unsigned int cpuset_mems_cookie;
 
        if (!(flags & SHOW_MEM_FILTER_NODES))
                goto out;
 
-       get_mems_allowed();
-       ret = !node_isset(nid, cpuset_current_mems_allowed);
-       put_mems_allowed();
+       do {
+               cpuset_mems_cookie = get_mems_allowed();
+               ret = !node_isset(nid, cpuset_current_mems_allowed);
+       } while (!put_mems_allowed(cpuset_mems_cookie));
 out:
        return ret;
 }
@@ -3407,25 +3468,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                if (page_to_nid(page) != zone_to_nid(zone))
                        continue;
 
-               /* Blocks with reserved pages will never free, skip them. */
-               block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-               if (pageblock_is_reserved(pfn, block_end_pfn))
-                       continue;
-
                block_migratetype = get_pageblock_migratetype(page);
 
-               /* If this block is reserved, account for it */
-               if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-                       reserve--;
-                       continue;
-               }
+               /* Only test what is necessary when the reserves are not met */
+               if (reserve > 0) {
+                       /*
+                        * Blocks with reserved pages will never free, skip
+                        * them.
+                        */
+                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+                       if (pageblock_is_reserved(pfn, block_end_pfn))
+                               continue;
 
-               /* Suitable for reserving if this block is movable */
-               if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-                       set_pageblock_migratetype(page, MIGRATE_RESERVE);
-                       move_freepages_block(zone, page, MIGRATE_RESERVE);
-                       reserve--;
-                       continue;
+                       /* If this block is reserved, account for it */
+                       if (block_migratetype == MIGRATE_RESERVE) {
+                               reserve--;
+                               continue;
+                       }
+
+                       /* Suitable for reserving if this block is movable */
+                       if (block_migratetype == MIGRATE_MOVABLE) {
+                               set_pageblock_migratetype(page,
+                                                       MIGRATE_RESERVE);
+                               move_freepages_block(zone, page,
+                                                       MIGRATE_RESERVE);
+                               reserve--;
+                               continue;
+                       }
                }
 
                /*
@@ -4207,10 +4276,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
  * round what is now in bits to nearest long in bits, then return it in
  * bytes.
  */
-static unsigned long __init usemap_size(unsigned long zonesize)
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
 {
        unsigned long usemapsize;
 
+       zonesize += zone_start_pfn & (pageblock_nr_pages-1);
        usemapsize = roundup(zonesize, pageblock_nr_pages);
        usemapsize = usemapsize >> pageblock_order;
        usemapsize *= NR_PAGEBLOCK_BITS;
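The added `zonesize += zone_start_pfn & (pageblock_nr_pages-1)` pads the bitmap for zones that do not start on a pageblock boundary, so the usemap covers every pageblock the zone touches. A worked example in plain C, assuming pageblock_order == 10 (1024-page pageblocks) and 4 usemap bits per pageblock: a 2048-page zone starting at pfn 1536 straddles three pageblocks, which the old sizing undercounted as two.

    #include <stdio.h>

    #define PAGEBLOCK_ORDER     10UL
    #define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)
    #define NR_PAGEBLOCK_BITS   4UL

    static unsigned long roundup_to(unsigned long x, unsigned long to)
    {
            return ((x + to - 1) / to) * to;
    }

    static unsigned long usemap_size(unsigned long zone_start_pfn,
                                     unsigned long zonesize)
    {
            unsigned long usemapsize;

            /* Pad with the unaligned head so slot 0 of the map corresponds
             * to the rounded-down start of the zone. */
            zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);
            usemapsize = roundup_to(zonesize, PAGEBLOCK_NR_PAGES);
            usemapsize >>= PAGEBLOCK_ORDER;          /* pageblocks */
            usemapsize *= NR_PAGEBLOCK_BITS;         /* bits */

            return roundup_to(usemapsize, 8 * sizeof(unsigned long)) / 8;
    }

    int main(void)
    {
            /* Pageblocks touched: [1024,2047] [2048,3071] [3072,3583] */
            printf("%lu bytes\n", usemap_size(1536, 2048));
            return 0;
    }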
@@ -4220,40 +4290,41 @@ static unsigned long __init usemap_size(unsigned long zonesize)
 }
 
 static void __init setup_usemap(struct pglist_data *pgdat,
-                               struct zone *zone, unsigned long zonesize)
+                               struct zone *zone,
+                               unsigned long zone_start_pfn,
+                               unsigned long zonesize)
 {
-       unsigned long usemapsize = usemap_size(zonesize);
+       unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
        zone->pageblock_flags = NULL;
        if (usemapsize)
                zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
                                                                   usemapsize);
 }
 #else
-static inline void setup_usemap(struct pglist_data *pgdat,
-                               struct zone *zone, unsigned long zonesize) {}
+static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
+                               unsigned long zone_start_pfn, unsigned long zonesize) {}
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
-/* Return a sensible default order for the pageblock size. */
-static inline int pageblock_default_order(void)
-{
-       if (HPAGE_SHIFT > PAGE_SHIFT)
-               return HUGETLB_PAGE_ORDER;
-
-       return MAX_ORDER-1;
-}
-
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(unsigned int order)
+void __init set_pageblock_order(void)
 {
+       unsigned int order;
+
        /* Check that pageblock_nr_pages has not already been setup */
        if (pageblock_order)
                return;
 
+       if (HPAGE_SHIFT > PAGE_SHIFT)
+               order = HUGETLB_PAGE_ORDER;
+       else
+               order = MAX_ORDER - 1;
+
        /*
         * Assume the largest contiguous order of interest is a huge page.
-        * This value may be variable depending on boot parameters on IA64
+        * This value may be variable depending on boot parameters on IA64 and
+        * powerpc.
         */
        pageblock_order = order;
 }
@@ -4261,15 +4332,13 @@ static inline void __init set_pageblock_order(unsigned int order)
 
 /*
  * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * and pageblock_default_order() are unused as pageblock_order is set
- * at compile-time. See include/linux/pageblock-flags.h for the values of
- * pageblock_order based on the kernel config
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
  */
-static inline int pageblock_default_order(unsigned int order)
+void __init set_pageblock_order(void)
 {
-       return MAX_ORDER-1;
 }
-#define set_pageblock_order(x) do {} while (0)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
@@ -4357,8 +4426,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                if (!size)
                        continue;
 
-               set_pageblock_order(pageblock_default_order());
-               setup_usemap(pgdat, zone, size);
+               set_pageblock_order();
+               setup_usemap(pgdat, zone, zone_start_pfn, size);
                ret = init_currently_empty_zone(zone, zone_start_pfn,
                                                size, MEMMAP_EARLY);
                BUG_ON(ret);
@@ -5489,7 +5558,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
        pfn &= (PAGES_PER_SECTION-1);
        return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #else
-       pfn = pfn - zone->zone_start_pfn;
+       pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
        return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #endif /* CONFIG_SPARSEMEM */
 }
@@ -5608,6 +5677,17 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 bool is_pageblock_removable_nolock(struct page *page)
 {
        struct zone *zone = page_zone(page);
+       unsigned long pfn = page_to_pfn(page);
+
+       /*
+        * We have to be careful here because we are iterating over memory
+        * sections, which are not zone aware, so we might end up outside of
+        * the zone but still within the section.
+        */
+       if (!zone || zone->zone_start_pfn > pfn ||
+                       zone->zone_start_pfn + zone->spanned_pages <= pfn)
+               return false;
+
        return __count_immobile_pages(zone, page, 0);
 }
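The new guard is needed because memory sections and zones are delimited independently: a pfn can sit in a valid, present section while falling outside the [zone_start_pfn, zone_start_pfn + spanned_pages) span of the zone its struct page reports. A small model of the predicate; the struct and names are illustrative, not the kernel's:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct zone_span {
            unsigned long zone_start_pfn;
            unsigned long spanned_pages;
    };

    /* A pfn belongs to the zone only if it is inside the spanned range. */
    static bool pfn_within_zone(const struct zone_span *z, unsigned long pfn)
    {
            return z && pfn >= z->zone_start_pfn &&
                   pfn < z->zone_start_pfn + z->spanned_pages;
    }

    int main(void)
    {
            struct zone_span z = { .zone_start_pfn = 4096, .spanned_pages = 1024 };

            assert(pfn_within_zone(&z, 4096));    /* first pfn of the zone */
            assert(!pfn_within_zone(&z, 5120));   /* same section, past the zone */
            assert(!pfn_within_zone(NULL, 4096)); /* no zone at all */
            return 0;
    }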
 
@@ -5720,6 +5800,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
                zone->free_area[order].nr_free--;
                __mod_zone_page_state(zone, NR_FREE_PAGES,
                                      - (1UL << order));
+#ifdef CONFIG_HIGHMEM
+               if (PageHighMem(page))
+                       totalhigh_pages -= 1 << order;
+#endif
                for (i = 0; i < (1 << order); i++)
                        SetPageReserved((page+i));
                pfn += (1 << order);
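Offlined pages are pulled straight off the free lists and marked reserved, so every counter the normal free/allocate paths would maintain has to be adjusted by hand; the hunk adds the missing totalhigh_pages side of that bookkeeping. A toy model of the per-order accounting, with illustrative counters standing in for the kernel's:

    #include <assert.h>
    #include <stdbool.h>

    static unsigned long total_free_pages = 1024;
    static unsigned long totalhigh_pages  = 512;

    /* Remove a 2^order block from the allocator's books; if it lives in
     * highmem, the highmem total must shrink too (the added hunk). */
    static void account_offlined_block(unsigned int order, bool is_highmem)
    {
            total_free_pages -= 1UL << order;
            if (is_highmem)
                    totalhigh_pages -= 1UL << order;
    }

    int main(void)
    {
            account_offlined_block(3, true);    /* an order-3 highmem block */
            assert(total_free_pages == 1016 && totalhigh_pages == 504);
            return 0;
    }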