Merge x86-64 update from Andi
[pandora-kernel.git] / mm / page_alloc.c
index 259a71b..104e69c 100644 (file)
@@ -67,7 +67,6 @@ long nr_swap_pages;
 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
-EXPORT_SYMBOL(nr_swap_pages);
 
 /*
  * Used by page_zone() to look up the address of the struct zone whose
@@ -736,9 +735,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
                }
                local_irq_restore(flags);
                put_cpu();
-       }
-
-       if (page == NULL) {
+       } else {
                spin_lock_irqsave(&zone->lock, flags);
                page = __rmqueue(zone, order);
                spin_unlock_irqrestore(&zone->lock, flags);
@@ -758,20 +755,25 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
        return page;
 }
 
+#define ALLOC_NO_WATERMARKS    0x01 /* don't check watermarks at all */
+#define ALLOC_HARDER           0x02 /* try to alloc harder */
+#define ALLOC_HIGH             0x04 /* __GFP_HIGH set */
+#define ALLOC_CPUSET           0x08 /* check for correct cpuset */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-                     int classzone_idx, int can_try_harder, gfp_t gfp_high)
+                     int classzone_idx, int alloc_flags)
 {
        /* free_pages my go negative - that's OK */
        long min = mark, free_pages = z->free_pages - (1 << order) + 1;
        int o;
 
-       if (gfp_high)
+       if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
-       if (can_try_harder)
+       if (alloc_flags & ALLOC_HARDER)
                min -= min / 4;
 
        if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -789,14 +791,40 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
        return 1;
 }
 
-static inline int
-should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+               struct zonelist *zonelist, int alloc_flags)
 {
-       if (!z->reclaim_pages)
-               return 0;
-       if (gfp_mask & __GFP_NORECLAIM)
-               return 0;
-       return 1;
+       struct zone **z = zonelist->zones;
+       struct page *page = NULL;
+       int classzone_idx = zone_idx(*z);
+
+       /*
+        * Go through the zonelist once, looking for a zone with enough free.
+        * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+        */
+       do {
+               if ((alloc_flags & ALLOC_CPUSET) &&
+                               !cpuset_zone_allowed(*z, gfp_mask))
+                       continue;
+
+               if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+                       if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+                                   classzone_idx, alloc_flags))
+                               continue;
+               }
+
+               page = buffered_rmqueue(*z, order, gfp_mask);
+               if (page) {
+                       zone_statistics(zonelist, *z);
+                       break;
+               }
+       } while (*(++z) != NULL);
+       return page;
 }
 
 /*
@@ -807,105 +835,75 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
                struct zonelist *zonelist)
 {
        const gfp_t wait = gfp_mask & __GFP_WAIT;
-       struct zone **zones, *z;
+       struct zone **z;
        struct page *page;
        struct reclaim_state reclaim_state;
        struct task_struct *p = current;
-       int i;
-       int classzone_idx;
        int do_retry;
-       int can_try_harder;
+       int alloc_flags;
        int did_some_progress;
 
        might_sleep_if(wait);
 
-       /*
-        * The caller may dip into page reserves a bit more if the caller
-        * cannot run direct reclaim, or is the caller has realtime scheduling
-        * policy
-        */
-       can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+       z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
-       zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
-
-       if (unlikely(zones[0] == NULL)) {
+       if (unlikely(*z == NULL)) {
                /* Should this ever happen?? */
                return NULL;
        }
+restart:
+       page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+                               zonelist, ALLOC_CPUSET);
+       if (page)
+               goto got_pg;
 
-       classzone_idx = zone_idx(zones[0]);
+       do
+               wakeup_kswapd(*z, order);
+       while (*(++z));
 
-restart:
        /*
-        * Go through the zonelist once, looking for a zone with enough free.
-        * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+        * OK, we're below the kswapd watermark and have kicked background
+        * reclaim. Now things get more complex, so set up alloc_flags according
+        * to how we want to proceed.
+        *
+        * The caller may dip into page reserves a bit more if the caller
+        * cannot run direct reclaim, or if the caller has realtime scheduling
+        * policy.
         */
-       for (i = 0; (z = zones[i]) != NULL; i++) {
-               int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-               if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-                       continue;
-
-               /*
-                * If the zone is to attempt early page reclaim then this loop
-                * will try to reclaim pages and check the watermark a second
-                * time before giving up and falling back to the next zone.
-                */
-zone_reclaim_retry:
-               if (!zone_watermark_ok(z, order, z->pages_low,
-                                      classzone_idx, 0, 0)) {
-                       if (!do_reclaim)
-                               continue;
-                       else {
-                               zone_reclaim(z, gfp_mask, order);
-                               /* Only try reclaim once */
-                               do_reclaim = 0;
-                               goto zone_reclaim_retry;
-                       }
-               }
-
-               page = buffered_rmqueue(z, order, gfp_mask);
-               if (page)
-                       goto got_pg;
-       }
-
-       for (i = 0; (z = zones[i]) != NULL; i++)
-               wakeup_kswapd(z, order);
+       alloc_flags = 0;
+       if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+               alloc_flags |= ALLOC_HARDER;
+       if (gfp_mask & __GFP_HIGH)
+               alloc_flags |= ALLOC_HIGH;
+       if (wait)
+               alloc_flags |= ALLOC_CPUSET;
 
        /*
         * Go through the zonelist again. Let __GFP_HIGH and allocations
-        * coming from realtime tasks to go deeper into reserves
+        * coming from realtime tasks go deeper into reserves.
         *
         * This is the last chance, in general, before the goto nopage.
         * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
         * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
         */
-       for (i = 0; (z = zones[i]) != NULL; i++) {
-               if (!zone_watermark_ok(z, order, z->pages_min,
-                                      classzone_idx, can_try_harder,
-                                      gfp_mask & __GFP_HIGH))
-                       continue;
-
-               if (wait && !cpuset_zone_allowed(z, gfp_mask))
-                       continue;
-
-               page = buffered_rmqueue(z, order, gfp_mask);
-               if (page)
-                       goto got_pg;
-       }
+       page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+       if (page)
+               goto got_pg;
 
        /* This allocation should allow future memory freeing. */
 
        if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
                        && !in_interrupt()) {
                if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
                        /* go through the zonelist yet again, ignoring mins */
-                       for (i = 0; (z = zones[i]) != NULL; i++) {
-                               if (!cpuset_zone_allowed(z, gfp_mask))
-                                       continue;
-                               page = buffered_rmqueue(z, order, gfp_mask);
-                               if (page)
-                                       goto got_pg;
+                       page = get_page_from_freelist(gfp_mask, order,
+                               zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+                       if (page)
+                               goto got_pg;
+                       if (gfp_mask & __GFP_NOFAIL) {
+                               blk_congestion_wait(WRITE, HZ/50);
+                               goto nofail_alloc;
                        }
                }
                goto nopage;
@@ -923,7 +921,7 @@ rebalance:
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
-       did_some_progress = try_to_free_pages(zones, gfp_mask);
+       did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
 
        p->reclaim_state = NULL;
        p->flags &= ~PF_MEMALLOC;
@@ -931,19 +929,10 @@ rebalance:
        cond_resched();
 
        if (likely(did_some_progress)) {
-               for (i = 0; (z = zones[i]) != NULL; i++) {
-                       if (!zone_watermark_ok(z, order, z->pages_min,
-                                              classzone_idx, can_try_harder,
-                                              gfp_mask & __GFP_HIGH))
-                               continue;
-
-                       if (!cpuset_zone_allowed(z, gfp_mask))
-                               continue;
-
-                       page = buffered_rmqueue(z, order, gfp_mask);
-                       if (page)
-                               goto got_pg;
-               }
+               page = get_page_from_freelist(gfp_mask, order,
+                                               zonelist, alloc_flags);
+               if (page)
+                       goto got_pg;
        } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                /*
                 * Go through the zonelist yet one more time, keep
@@ -951,18 +940,10 @@ rebalance:
                 * a parallel oom killing, we must fail if we're still
                 * under heavy pressure.
                 */
-               for (i = 0; (z = zones[i]) != NULL; i++) {
-                       if (!zone_watermark_ok(z, order, z->pages_high,
-                                              classzone_idx, 0, 0))
-                               continue;
-
-                       if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-                               continue;
-
-                       page = buffered_rmqueue(z, order, gfp_mask);
-                       if (page)
-                               goto got_pg;
-               }
+               page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+                                               zonelist, ALLOC_CPUSET);
+               if (page)
+                       goto got_pg;
 
                out_of_memory(gfp_mask, order);
                goto restart;
@@ -995,9 +976,7 @@ nopage:
                dump_stack();
                show_mem();
        }
-       return NULL;
 got_pg:
-       zone_statistics(zonelist, z);
        return page;
 }
 
@@ -1334,7 +1313,7 @@ void show_free_areas(void)
                } else
                        printk("\n");
 
-               for_each_cpu(cpu) {
+               for_each_online_cpu(cpu) {
                        struct per_cpu_pageset *pageset;
 
                        pageset = zone_pcp(zone, cpu);
@@ -2426,13 +2405,18 @@ void setup_per_zone_pages_min(void)
        }
 
        for_each_zone(zone) {
+               unsigned long tmp;
                spin_lock_irqsave(&zone->lru_lock, flags);
+               tmp = (pages_min * zone->present_pages) / lowmem_pages;
                if (is_highmem(zone)) {
                        /*
-                        * Often, highmem doesn't need to reserve any pages.
-                        * But the pages_min/low/high values are also used for
-                        * batching up page reclaim activity so we need a
-                        * decent value here.
+                        * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+                        * need highmem pages, so cap pages_min to a small
+                        * value here.
+                        *
+                        * The (pages_high-pages_low) and (pages_low-pages_min)
+                        * deltas control asynch page reclaim, and so should
+                        * not be capped for highmem.
                         */
                        int min_pages;
 
@@ -2443,19 +2427,15 @@ void setup_per_zone_pages_min(void)
                                min_pages = 128;
                        zone->pages_min = min_pages;
                } else {
-                       /* if it's a lowmem zone, reserve a number of pages
+                       /*
+                        * If it's a lowmem zone, reserve a number of pages
                         * proportionate to the zone's size.
                         */
-                       zone->pages_min = (pages_min * zone->present_pages) /
-                                          lowmem_pages;
+                       zone->pages_min = tmp;
                }
 
-               /*
-                * When interpreting these watermarks, just keep in mind that:
-                * zone->pages_min == (zone->pages_min * 4) / 4;
-                */
-               zone->pages_low   = (zone->pages_min * 5) / 4;
-               zone->pages_high  = (zone->pages_min * 6) / 4;
+               zone->pages_low   = zone->pages_min + tmp / 4;
+               zone->pages_high  = zone->pages_min + tmp / 2;
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }
 }