mm: page allocator: calculate a better estimate of NR_FREE_PAGES when memory is low and kswapd is awake

author Christoph Lameter <cl@linux.com>

Thu, 9 Sep 2010 23:38:17 +0000 (16:38 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 10 Sep 2010 01:57:25 +0000 (18:57 -0700)
author Christoph Lameter <cl@linux.com>
Thu, 9 Sep 2010 23:38:17 +0000 (16:38 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 10 Sep 2010 01:57:25 +0000 (18:57 -0700)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 6e6e626..3984c4e 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -283,6 +283,13 @@ struct zone {
         /* zone watermarks, access with *_wmark_pages(zone) macros */
         unsigned long watermark[NR_WMARK];
  
+       /*
+        * When free pages are below this point, additional steps are taken
+        * when reading the number of free pages to avoid per-cpu counter
+        * drift allowing watermarks to be breached
+        */
+       unsigned long percpu_drift_mark;
+
         /*
          * We don't know if the memory that we're going to allocate will be freeable
          * or/and it will be released eventually, so to avoid totally wasting several
@@ -441,6 +448,12 @@ static inline int zone_is_oom_locked(const struct zone *zone)
         return test_bit(ZONE_OOM_LOCKED, &zone->flags);
  }
  
+#ifdef CONFIG_SMP
+unsigned long zone_nr_free_pages(struct zone *zone);
+#else /* !CONFIG_SMP: read the NR_FREE_PAGES counter directly */
+#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
+#endif /* CONFIG_SMP */
+
  /*
   * The "priority" of VM scanning is how much of the queues we will scan in one
   * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h

index 7f43ccd..eaaea37 100644 (file)
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -170,6 +170,28 @@ static inline unsigned long zone_page_state(struct zone *zone,
         return x;
  }
  
+/*
+ * Snapshot of a zone counter that folds in the per-cpu deltas not yet
+ * merged into zone->vm_stat[item], by walking every online cpu (SMP
+ * only). No synchronization is used, so the value is an estimate, not
+ * an exact count; a negative sum is clamped to 0 on SMP builds.
+ */
+static inline unsigned long zone_page_state_snapshot(struct zone *zone,
+                                       enum zone_stat_item item)
+{
+       long x = atomic_long_read(&zone->vm_stat[item]);
+
+#ifdef CONFIG_SMP
+       int cpu;
+       for_each_online_cpu(cpu)
+               x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
+}
+
  extern unsigned long global_reclaimable_pages(void);
  extern unsigned long zone_reclaimable_pages(struct zone *zone);
  
diff --git a/mm/mmzone.c b/mm/mmzone.c

index f5b7d17..e35bfb8 100644 (file)
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn,
         return 1;
  }
  #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+#ifdef CONFIG_SMP
+/* Return NR_FREE_PAGES, using the per-cpu snapshot when drift matters */
+unsigned long zone_nr_free_pages(struct zone *zone)
+{
+       unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
+       /*
+        * A zone whose kswapd is awake is assumed to be under memory
+        * pressure. Under pressure, per-cpu counter drift could let the
+        * min watermark be breached, potentially causing a live-lock.
+        * So when the cheap count is below percpu_drift_mark and kswapd
+        * is awake (no waiter on kswapd_wait), use the per-cpu snapshot.
+        */
+       if (nr_free_pages < zone->percpu_drift_mark &&
+                       !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+               return zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+       return nr_free_pages;
+}
+#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 452e2ba..b2d21e0 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1462,7 +1462,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  {
         /* free_pages may go negative - that's OK */
         long min = mark;
-       long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+       long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
         int o;
  
         if (alloc_flags & ALLOC_HIGH)
@@ -2424,7 +2424,7 @@ void show_free_areas(void)
                         " all_unreclaimable? %s"
                         "\n",
                         zone->name,
-                       K(zone_page_state(zone, NR_FREE_PAGES)),
+                       K(zone_nr_free_pages(zone)),
                         K(min_wmark_pages(zone)),
                         K(low_wmark_pages(zone)),
                         K(high_wmark_pages(zone)),
diff --git a/mm/vmstat.c b/mm/vmstat.c

index a8d6b59..355a9e6 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void)
         int threshold;
  
         for_each_populated_zone(zone) {
+               unsigned long max_drift, tolerate_drift;
+
                 threshold = calculate_threshold(zone);
  
                 for_each_online_cpu(cpu)
                         per_cpu_ptr(zone->pageset, cpu)->stat_threshold
                                                         = threshold;
+
+               /*
+                * Only set percpu_drift_mark if there is a danger that
+                * NR_FREE_PAGES reports the low watermark is ok when in fact
+                * the min watermark could be breached by an allocation
+                */
+               tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+               max_drift = num_online_cpus() * threshold;
+               if (max_drift > tolerate_drift)
+                       zone->percpu_drift_mark = high_wmark_pages(zone) +
+                                       max_drift;
         }
  }
  
@@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                    "\n        scanned  %lu"
                    "\n        spanned  %lu"
                    "\n        present  %lu",
-                  zone_page_state(zone, NR_FREE_PAGES),
+                  zone_nr_free_pages(zone),
                    min_wmark_pages(zone),
                    low_wmark_pages(zone),
                    high_wmark_pages(zone),
author	Christoph Lameter <cl@linux.com>
	Thu, 9 Sep 2010 23:38:17 +0000 (16:38 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 10 Sep 2010 01:57:25 +0000 (18:57 -0700)
include/linux/mmzone.h		patch \| blob \| history
include/linux/vmstat.h		patch \| blob \| history
mm/mmzone.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/vmstat.c		patch \| blob \| history