Merge branch 'for-linus' of git://git.o-hand.com/linux-rpurdie-leds
[pandora-kernel.git] / mm / hugetlb.c
index bb49ce5..41341c4 100644
@@ -14,6 +14,7 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/bootmem.h>
 #include <linux/sysfs.h>
 
 #include <asm/page.h>
@@ -30,9 +31,12 @@ static int max_hstate;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
 
+__initdata LIST_HEAD(huge_boot_pages);
+
 /* for command line parsing */
 static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
+static unsigned long __initdata default_hstate_size;
 
 #define for_each_hstate(h) \
        for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
@@ -338,13 +342,13 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 }
 
 /* Returns true if the VMA has associated reserve pages */
-static int vma_has_private_reserves(struct vm_area_struct *vma)
+static int vma_has_reserves(struct vm_area_struct *vma)
 {
        if (vma->vm_flags & VM_SHARED)
-               return 0;
-       if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
-               return 0;
-       return 1;
+               return 1;
+       if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+               return 1;
+       return 0;
 }
 
 static void clear_huge_page(struct page *page,
@@ -416,7 +420,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
         * have no page reserves. This check ensures that reservations are
         * not "stolen". The child may still get SIGKILLed
         */
-       if (!vma_has_private_reserves(vma) &&
+       if (!vma_has_reserves(vma) &&
                        h->free_huge_pages - h->resv_huge_pages == 0)
                return NULL;
 
@@ -489,7 +493,7 @@ static void free_huge_page(struct page *page)
        INIT_LIST_HEAD(&page->lru);
 
        spin_lock(&hugetlb_lock);
-       if (h->surplus_huge_pages_node[nid]) {
+       if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
                update_and_free_page(h, page);
                h->surplus_huge_pages--;
                h->surplus_huge_pages_node[nid]--;
@@ -550,6 +554,9 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
        struct page *page;
 
+       if (h->order >= MAX_ORDER)
+               return NULL;
+
        page = alloc_pages_node(nid,
                htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
                                                __GFP_REPEAT|__GFP_NOWARN,
@@ -565,6 +572,27 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
        return page;
 }
 
+/*
+ * Use a helper variable to find the next node and then
+ * copy it back to hugetlb_next_nid afterwards:
+ * otherwise there's a window in which a racer might
+ * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+ * But we don't need to use a spin_lock here: it really
+ * doesn't matter if occasionally a racer chooses the
+ * same nid as we do.  Move nid forward in the mask even
+ * if we just successfully allocated a hugepage so that
+ * the next caller gets hugepages on the next node.
+ */
+static int hstate_next_node(struct hstate *h)
+{
+       int next_nid;
+       next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+       if (next_nid == MAX_NUMNODES)
+               next_nid = first_node(node_online_map);
+       h->hugetlb_next_nid = next_nid;
+       return next_nid;
+}
+
 static int alloc_fresh_huge_page(struct hstate *h)
 {
        struct page *page;
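
The hstate_next_node() helper added above factors out the round-robin walk of
node_online_map that alloc_fresh_huge_page() previously open-coded (see the
next hunk). A minimal standalone sketch of the same pattern, assuming a plain
array of online node ids as a hypothetical stand-in for the kernel's node mask
helpers:

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's node mask machinery. */
#define MAX_NUMNODES 8
static const int node_online[MAX_NUMNODES] = { 1, 1, 0, 1, 0, 0, 0, 0 };

static int next_nid;   /* plays the role of h->hugetlb_next_nid */

/* Advance to the next online node, wrapping like hstate_next_node(). */
static int hstate_next_node_sketch(void)
{
        int nid = next_nid;

        do {
                nid = (nid + 1) % MAX_NUMNODES; /* wrap instead of first_node() */
        } while (!node_online[nid]);            /* assumes node 0 is online */

        next_nid = nid;
        return nid;
}

int main(void)
{
        /* The nid always moves forward, even after a successful allocation,
         * so consecutive callers spread their hugepages across nodes. */
        for (int i = 0; i < 6; i++)
                printf("allocate next hugepage on node %d\n",
                       hstate_next_node_sketch());
        return 0;
}
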
@@ -578,21 +606,7 @@ static int alloc_fresh_huge_page(struct hstate *h)
                page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
                if (page)
                        ret = 1;
-               /*
-                * Use a helper variable to find the next node and then
-                * copy it back to hugetlb_next_nid afterwards:
-                * otherwise there's a window in which a racer might
-                * pass invalid nid MAX_NUMNODES to alloc_pages_node.
-                * But we don't need to use a spin_lock here: it really
-                * doesn't matter if occasionally a racer chooses the
-                * same nid as we do.  Move nid forward in the mask even
-                * if we just successfully allocated a hugepage so that
-                * the next caller gets hugepages on the next node.
-                */
-               next_nid = next_node(h->hugetlb_next_nid, node_online_map);
-               if (next_nid == MAX_NUMNODES)
-                       next_nid = first_node(node_online_map);
-               h->hugetlb_next_nid = next_nid;
+               next_nid = hstate_next_node(h);
        } while (!page && h->hugetlb_next_nid != start_nid);
 
        if (ret)
@@ -609,6 +623,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
        struct page *page;
        unsigned int nid;
 
+       if (h->order >= MAX_ORDER)
+               return NULL;
+
        /*
         * Assume we will successfully allocate the surplus page to
         * prevent racing processes from causing the surplus to exceed
@@ -785,6 +802,10 @@ static void return_unused_surplus_pages(struct hstate *h,
        /* Uncommit the reservation */
        h->resv_huge_pages -= unused_resv_pages;
 
+       /* Cannot return gigantic pages currently */
+       if (h->order >= MAX_ORDER)
+               return;
+
        nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
        while (remaining_iterations-- && nr_pages) {
@@ -906,20 +927,68 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
        return page;
 }
 
-static void __init hugetlb_init_one_hstate(struct hstate *h)
+__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 {
-       unsigned long i;
+       struct huge_bootmem_page *m;
+       int nr_nodes = nodes_weight(node_online_map);
 
-       for (i = 0; i < MAX_NUMNODES; ++i)
-               INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+       while (nr_nodes) {
+               void *addr;
 
-       h->hugetlb_next_nid = first_node(node_online_map);
+               addr = __alloc_bootmem_node_nopanic(
+                               NODE_DATA(h->hugetlb_next_nid),
+                               huge_page_size(h), huge_page_size(h), 0);
+
+               if (addr) {
+                       /*
+                        * Use the beginning of the huge page to store the
+                        * huge_bootmem_page struct (until gather_bootmem
+                        * puts them into the mem_map).
+                        */
+                       m = addr;
+                       if (m)
+                               goto found;
+               }
+               hstate_next_node(h);
+               nr_nodes--;
+       }
+       return 0;
+
+found:
+       BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+       /* Put them into a private list first because mem_map is not up yet */
+       list_add(&m->list, &huge_boot_pages);
+       m->hstate = h;
+       return 1;
+}
+
+/* Put bootmem huge pages into the standard lists after mem_map is up */
+static void __init gather_bootmem_prealloc(void)
+{
+       struct huge_bootmem_page *m;
+
+       list_for_each_entry(m, &huge_boot_pages, list) {
+               struct page *page = virt_to_page(m);
+               struct hstate *h = m->hstate;
+               __ClearPageReserved(page);
+               WARN_ON(page_count(page) != 1);
+               prep_compound_page(page, h->order);
+               prep_new_huge_page(h, page, page_to_nid(page));
+       }
+}
+
+static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+{
+       unsigned long i;
 
        for (i = 0; i < h->max_huge_pages; ++i) {
-               if (!alloc_fresh_huge_page(h))
+               if (h->order >= MAX_ORDER) {
+                       if (!alloc_bootmem_huge_page(h))
+                               break;
+               } else if (!alloc_fresh_huge_page(h))
                        break;
        }
-       h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
+       h->max_huge_pages = i;
 }
 
 static void __init hugetlb_init_hstates(void)
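
alloc_bootmem_huge_page() above is declared __attribute__((weak)) so an
architecture can override it, and it reuses the beginning of each
early-allocated gigantic page to hold its huge_bootmem_page entry;
gather_bootmem_prealloc() later walks that private list once mem_map is
available. A simplified sketch of the same idea, with malloc() as a
hypothetical stand-in for __alloc_bootmem_node_nopanic() and a plain next
pointer instead of list_head:

#include <stdio.h>
#include <stdlib.h>

#define HUGE_SZ (2UL << 20)   /* a pretend 2 MB "huge page" for the sketch */

/* Bookkeeping record stored at the start of each early allocation,
 * in the spirit of struct huge_bootmem_page. */
struct boot_page {
        struct boot_page *next;
        unsigned long size;
};

static struct boot_page *boot_list;   /* plays the role of huge_boot_pages */

/* Early phase: grab memory and stash the record inside the page itself. */
static int alloc_boot_page_sketch(void)
{
        void *addr = malloc(HUGE_SZ);  /* stand-in for the bootmem allocator */
        struct boot_page *m;

        if (!addr)
                return 0;
        m = addr;              /* the record lives in the page it tracks */
        m->size = HUGE_SZ;
        m->next = boot_list;
        boot_list = m;
        return 1;
}

/* Later phase: walk the private list and hand each page to the real pool. */
static void gather_boot_pages_sketch(void)
{
        struct boot_page *m;

        for (m = boot_list; m; m = m->next)
                printf("registering %lu-byte page at %p\n", m->size, (void *)m);
}

int main(void)
{
        for (int i = 0; i < 3; i++)
                alloc_boot_page_sketch();
        gather_boot_pages_sketch();
        return 0;
}
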
@@ -927,19 +996,33 @@ static void __init hugetlb_init_hstates(void)
        struct hstate *h;
 
        for_each_hstate(h) {
-               hugetlb_init_one_hstate(h);
+               /* oversize hugepages were init'ed in early boot */
+               if (h->order < MAX_ORDER)
+                       hugetlb_hstate_alloc_pages(h);
        }
 }
 
+static char * __init memfmt(char *buf, unsigned long n)
+{
+       if (n >= (1UL << 30))
+               sprintf(buf, "%lu GB", n >> 30);
+       else if (n >= (1UL << 20))
+               sprintf(buf, "%lu MB", n >> 20);
+       else
+               sprintf(buf, "%lu KB", n >> 10);
+       return buf;
+}
+
 static void __init report_hugepages(void)
 {
        struct hstate *h;
 
        for_each_hstate(h) {
-               printk(KERN_INFO "Total HugeTLB memory allocated, "
-                               "%ld %dMB pages\n",
-                               h->free_huge_pages,
-                               1 << (h->order + PAGE_SHIFT - 20));
+               char buf[32];
+               printk(KERN_INFO "HugeTLB registered %s page size, "
+                                "pre-allocated %ld pages\n",
+                       memfmt(buf, huge_page_size(h)),
+                       h->free_huge_pages);
        }
 }
 
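
memfmt() above selects a KB/MB/GB unit by power-of-two thresholds so that
report_hugepages() can print each registered page size compactly. A standalone
copy of the helper, only to illustrate the expected output for a few common
huge page sizes:

#include <stdio.h>

/* Same formatting logic as the memfmt() helper above. */
static char *memfmt(char *buf, unsigned long n)
{
        if (n >= (1UL << 30))
                sprintf(buf, "%lu GB", n >> 30);
        else if (n >= (1UL << 20))
                sprintf(buf, "%lu MB", n >> 20);
        else
                sprintf(buf, "%lu KB", n >> 10);
        return buf;
}

int main(void)
{
        char buf[32];
        unsigned long sizes[] = { 64UL << 10, 2UL << 20, 16UL << 20, 1UL << 30 };

        for (int i = 0; i < 4; i++)
                printf("huge page size: %s\n", memfmt(buf, sizes[i]));
        /* prints: 64 KB, 2 MB, 16 MB, 1 GB */
        return 0;
}
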
@@ -949,6 +1032,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
 {
        int i;
 
+       if (h->order >= MAX_ORDER)
+               return;
+
        for (i = 0; i < MAX_NUMNODES; ++i) {
                struct page *page, *next;
                struct list_head *freel = &h->hugepage_freelists[i];
@@ -975,6 +1061,9 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 {
        unsigned long min_count, ret;
 
+       if (h->order >= MAX_ORDER)
+               return h->max_huge_pages;
+
        /*
         * Increase the pool size
         * First take pages out of surplus state.  Then make up the
@@ -1195,14 +1284,19 @@ static int __init hugetlb_init(void)
 {
        BUILD_BUG_ON(HPAGE_SHIFT == 0);
 
-       if (!size_to_hstate(HPAGE_SIZE)) {
-               hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
-               parsed_hstate->max_huge_pages = default_hstate_max_huge_pages;
+       if (!size_to_hstate(default_hstate_size)) {
+               default_hstate_size = HPAGE_SIZE;
+               if (!size_to_hstate(default_hstate_size))
+                       hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
        }
-       default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates;
+       default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+       if (default_hstate_max_huge_pages)
+               default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 
        hugetlb_init_hstates();
 
+       gather_bootmem_prealloc();
+
        report_hugepages();
 
        hugetlb_sysfs_init();
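
The hugetlb_init() changes above let default_hugepagesz= choose which
registered hstate becomes the default, falling back to the architecture's
HPAGE_SIZE (and registering it) when no valid default was given. A rough
sketch of that selection logic, using a hypothetical flat array of sizes in
place of hstates[] and size_to_hstate():

#include <stdio.h>

#define HPAGE_SIZE_SKETCH (2UL << 20)   /* assumed architecture default: 2 MB */

/* Registered huge page sizes, standing in for the hstates[] array. */
static unsigned long hstate_sizes[4];
static int nr_hstates;

static int size_to_index(unsigned long size)   /* like size_to_hstate() */
{
        for (int i = 0; i < nr_hstates; i++)
                if (hstate_sizes[i] == size)
                        return i;
        return -1;
}

static int add_hstate(unsigned long size)      /* like hugetlb_add_hstate() */
{
        hstate_sizes[nr_hstates] = size;
        return nr_hstates++;
}

int main(void)
{
        unsigned long default_size = 0;  /* 0: no default_hugepagesz= given */

        /* Same fallback as above: no usable default -> HPAGE_SIZE, which is
         * registered only if the command line did not already add it. */
        if (size_to_index(default_size) < 0) {
                default_size = HPAGE_SIZE_SKETCH;
                if (size_to_index(default_size) < 0)
                        add_hstate(default_size);
        }
        printf("default hstate index %d, size %lu bytes\n",
               size_to_index(default_size), default_size);
        return 0;
}
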
@@ -1215,6 +1309,8 @@ module_init(hugetlb_init);
 void __init hugetlb_add_hstate(unsigned order)
 {
        struct hstate *h;
+       unsigned long i;
+
        if (size_to_hstate(PAGE_SIZE << order)) {
                printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
                return;
@@ -1224,15 +1320,21 @@ void __init hugetlb_add_hstate(unsigned order)
        h = &hstates[max_hstate++];
        h->order = order;
        h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+       h->nr_huge_pages = 0;
+       h->free_huge_pages = 0;
+       for (i = 0; i < MAX_NUMNODES; ++i)
+               INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+       h->hugetlb_next_nid = first_node(node_online_map);
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
-       hugetlb_init_one_hstate(h);
+
        parsed_hstate = h;
 }
 
-static int __init hugetlb_setup(char *s)
+static int __init hugetlb_nrpages_setup(char *s)
 {
        unsigned long *mhp;
+       static unsigned long *last_mhp;
 
        /*
         * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
@@ -1243,12 +1345,35 @@ static int __init hugetlb_setup(char *s)
        else
                mhp = &parsed_hstate->max_huge_pages;
 
+       if (mhp == last_mhp) {
+               printk(KERN_WARNING "hugepages= specified twice without "
+                       "interleaving hugepagesz=, ignoring\n");
+               return 1;
+       }
+
        if (sscanf(s, "%lu", mhp) <= 0)
                *mhp = 0;
 
+       /*
+        * Global state is always initialized later in hugetlb_init.
+        * But we need to allocate >= MAX_ORDER hstates here early to still
+        * use the bootmem allocator.
+        */
+       if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+               hugetlb_hstate_alloc_pages(parsed_hstate);
+
+       last_mhp = mhp;
+
+       return 1;
+}
+__setup("hugepages=", hugetlb_nrpages_setup);
+
+static int __init hugetlb_default_setup(char *s)
+{
+       default_hstate_size = memparse(s, &s);
        return 1;
 }
-__setup("hugepages=", hugetlb_setup);
+__setup("default_hugepagesz=", hugetlb_default_setup);
 
 static unsigned int cpuset_mems_nr(unsigned int *array)
 {
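
hugetlb_default_setup() above uses memparse() so the command line can carry
suffixed sizes such as default_hugepagesz=1G, while hugetlb_nrpages_setup()
guards against hugepages= being given twice for the same hstate. A hedged,
simplified stand-in for the size parsing only (the real memparse() accepts
further suffixes and returns the end pointer to the caller):

#include <stdio.h>
#include <stdlib.h>

/* Simplified memparse()-style helper: number plus optional K/M/G suffix. */
static unsigned long long parse_size(const char *s)
{
        char *end;
        unsigned long long val = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': val <<= 30; break;
        case 'M': case 'm': val <<= 20; break;
        case 'K': case 'k': val <<= 10; break;
        }
        return val;
}

int main(void)
{
        /* e.g. booting with default_hugepagesz=1G hugepagesz=1G hugepages=4 */
        printf("default_hugepagesz=1G -> %llu bytes\n", parse_size("1G"));
        printf("hugepagesz=16M        -> %llu bytes\n", parse_size("16M"));
        return 0;
}
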
@@ -1427,8 +1552,10 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
                kref_put(&reservations->refs, resv_map_release);
 
-               if (reserve)
+               if (reserve) {
                        hugetlb_acct_memory(h, -reserve);
+                       hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+               }
        }
 }
 
@@ -1877,6 +2004,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return ret;
 }
 
+/* Can be overridden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+              pud_t *pud, int write)
+{
+       BUG();
+       return NULL;
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i,