Merge branch 'for-linus' of git://git.o-hand.com/linux-rpurdie-leds
[pandora-kernel.git] / mm / hugetlb.c
index bb49ce5..41341c4 100644
@@ -14,6 +14,7 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/bootmem.h>
 #include <linux/sysfs.h>
 
 #include <asm/page.h>
@@ -30,9 +31,12 @@ static int max_hstate;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
 
+__initdata LIST_HEAD(huge_boot_pages);
+
 /* for command line parsing */
 static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
+static unsigned long __initdata default_hstate_size;
 
 #define for_each_hstate(h) \
        for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
@@ -338,13 +342,13 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 }
 
 /* Returns true if the VMA has associated reserve pages */
-static int vma_has_private_reserves(struct vm_area_struct *vma)
+static int vma_has_reserves(struct vm_area_struct *vma)
 {
        if (vma->vm_flags & VM_SHARED)
-               return 0;
-       if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
-               return 0;
-       return 1;
+               return 1;
+       if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+               return 1;
+       return 0;
 }
 
 static void clear_huge_page(struct page *page,
@@ -416,7 +420,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
         * have no page reserves. This check ensures that reservations are
         * not "stolen". The child may still get SIGKILLed
         */
-       if (!vma_has_private_reserves(vma) &&
+       if (!vma_has_reserves(vma) &&
                        h->free_huge_pages - h->resv_huge_pages == 0)
                return NULL;
 
@@ -489,7 +493,7 @@ static void free_huge_page(struct page *page)
        INIT_LIST_HEAD(&page->lru);
 
        spin_lock(&hugetlb_lock);
-       if (h->surplus_huge_pages_node[nid]) {
+       if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
                update_and_free_page(h, page);
                h->surplus_huge_pages--;
                h->surplus_huge_pages_node[nid]--;
@@ -550,6 +554,9 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
        struct page *page;
 
+       if (h->order >= MAX_ORDER)
+               return NULL;
+
        page = alloc_pages_node(nid,
                htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
                                                __GFP_REPEAT|__GFP_NOWARN,
@@ -565,6 +572,27 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
        return page;
 }
 
+/*
+ * Use a helper variable to find the next node and then
+ * copy it back to hugetlb_next_nid afterwards:
+ * otherwise there's a window in which a racer might
+ * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+ * But we don't need to use a spin_lock here: it really
+ * doesn't matter if occasionally a racer chooses the
+ * same nid as we do.  Move nid forward in the mask even
+ * if we just successfully allocated a hugepage so that
+ * the next caller gets hugepages on the next node.
+ */
+static int hstate_next_node(struct hstate *h)
+{
+       int next_nid;
+       next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+       if (next_nid == MAX_NUMNODES)
+               next_nid = first_node(node_online_map);
+       h->hugetlb_next_nid = next_nid;
+       return next_nid;
+}
+
 static int alloc_fresh_huge_page(struct hstate *h)
 {
        struct page *page;
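
The hstate_next_node() helper added above factors out the round-robin walk of
node_online_map that alloc_fresh_huge_page() previously open-coded (see the
next hunk). A minimal standalone sketch of the same pattern, assuming a plain
array of online node ids as a hypothetical stand-in for the kernel's node mask
helpers:

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's node mask machinery. */
#define MAX_NUMNODES 8
static const int node_online[MAX_NUMNODES] = { 1, 1, 0, 1, 0, 0, 0, 0 };

static int next_nid;   /* plays the role of h->hugetlb_next_nid */

/* Advance to the next online node, wrapping like hstate_next_node(). */
static int hstate_next_node_sketch(void)
{
        int nid = next_nid;

        do {
                nid = (nid + 1) % MAX_NUMNODES; /* wrap instead of first_node() */
        } while (!node_online[nid]);            /* assumes node 0 is online */

        next_nid = nid;
        return nid;
}

int main(void)
{
        /* The nid always moves forward, even after a successful allocation,
         * so consecutive callers spread their hugepages across nodes. */
        for (int i = 0; i < 6; i++)
                printf("allocate next hugepage on node %d\n",
                       hstate_next_node_sketch());
        return 0;
}
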
@@ -578,21 +606,7 @@ static int alloc_fresh_huge_page(struct hstate *h)
                page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
                if (page)
                        ret = 1;
-               /*
-                * Use a helper variable to find the next node and then
-                * copy it back to hugetlb_next_nid afterwards:
-                * otherwise there's a window in which a racer might
-                * pass invalid nid MAX_NUMNODES to alloc_pages_node.
-                * But we don't need to use a spin_lock here: it really
-                * doesn't matter if occasionally a racer chooses the
-                * same nid as we do.  Move nid forward in the mask even
-                * if we just successfully allocated a hugepage so that
-                * the next caller gets hugepages on the next node.
-                */
-               next_nid = next_node(h->hugetlb_next_nid, node_online_map);
-               if (next_nid == MAX_NUMNODES)
-                       next_nid = first_node(node_online_map);
-               h->hugetlb_next_nid = next_nid;
+               next_nid = hstate_next_node(h);
        } while (!page && h->hugetlb_next_nid != start_nid);
 
        if (ret)
@@ -609,6 +623,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
        struct page *page;
        unsigned int nid;
 
+       if (h->order >= MAX_ORDER)
+               return NULL;
+
        /*
         * Assume we will successfully allocate the surplus page to
         * prevent racing processes from causing the surplus to exceed
@@ -785,6 +802,10 @@ static void return_unused_surplus_pages(struct hstate *h,
        /* Uncommit the reservation */
        h->resv_huge_pages -= unused_resv_pages;
 
+       /* Cannot return gigantic pages currently */
+       if (h->order >= MAX_ORDER)
+               return;
+
        nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
        while (remaining_iterations-- && nr_pages) {
@@ -906,20 +927,68 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
        return page;
 }
 
-static void __init hugetlb_init_one_hstate(struct hstate *h)
+__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 {
-       unsigned long i;
+       struct huge_bootmem_page *m;
+       int nr_nodes = nodes_weight(node_online_map);
 
-       for (i = 0; i < MAX_NUMNODES; ++i)
-               INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+       while (nr_nodes) {
+               void *addr;
 
-       h->hugetlb_next_nid = first_node(node_online_map);
+               addr = __alloc_bootmem_node_nopanic(
+                               NODE_DATA(h->hugetlb_next_nid),
+                               huge_page_size(h), huge_page_size(h), 0);
+
+               if (addr) {
+                       /*
+                        * Use the beginning of the huge page to store the
+                        * huge_bootmem_page struct (until gather_bootmem
+                        * puts them into the mem_map).
+                        */
+                       m = addr;
+                       if (m)
+                               goto found;
+               }
+               hstate_next_node(h);
+               nr_nodes--;
+       }
+       return 0;
+
+found:
+       BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+       /* Put them into a private list first because mem_map is not up yet */
+       list_add(&m->list, &huge_boot_pages);
+       m->hstate = h;
+       return 1;
+}
+
+/* Put bootmem huge pages into the standard lists after mem_map is up */
+static void __init gather_bootmem_prealloc(void)
+{
+       struct huge_bootmem_page *m;
+
+       list_for_each_entry(m, &huge_boot_pages, list) {
+               struct page *page = virt_to_page(m);
+               struct hstate *h = m->hstate;
+               __ClearPageReserved(page);
+               WARN_ON(page_count(page) != 1);
+               prep_compound_page(page, h->order);
+               prep_new_huge_page(h, page, page_to_nid(page));
+       }
+}
+
+static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+{
+       unsigned long i;
 
        for (i = 0; i < h->max_huge_pages; ++i) {
-               if (!alloc_fresh_huge_page(h))
+               if (h->order >= MAX_ORDER) {
+                       if (!alloc_bootmem_huge_page(h))
+                               break;
+               } else if (!alloc_fresh_huge_page(h))
                        break;
        }
-       h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
+       h->max_huge_pages = i;
 }
 
 static void __init hugetlb_init_hstates(void)
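
alloc_bootmem_huge_page() above is declared __attribute__((weak)) so an
architecture can override it, and it reuses the beginning of each
early-allocated gigantic page to hold its huge_bootmem_page entry;
gather_bootmem_prealloc() later walks that private list once mem_map is
available. A simplified sketch of the same idea, with malloc() as a
hypothetical stand-in for __alloc_bootmem_node_nopanic() and a plain next
pointer instead of list_head:

#include <stdio.h>
#include <stdlib.h>

#define HUGE_SZ (2UL << 20)   /* a pretend 2 MB "huge page" for the sketch */

/* Bookkeeping record stored at the start of each early allocation,
 * in the spirit of struct huge_bootmem_page. */
struct boot_page {
        struct boot_page *next;
        unsigned long size;
};

static struct boot_page *boot_list;   /* plays the role of huge_boot_pages */

/* Early phase: grab memory and stash the record inside the page itself. */
static int alloc_boot_page_sketch(void)
{
        void *addr = malloc(HUGE_SZ);  /* stand-in for the bootmem allocator */
        struct boot_page *m;

        if (!addr)
                return 0;
        m = addr;              /* the record lives in the page it tracks */
        m->size = HUGE_SZ;
        m->next = boot_list;
        boot_list = m;
        return 1;
}

/* Later phase: walk the private list and hand each page to the real pool. */
static void gather_boot_pages_sketch(void)
{
        struct boot_page *m;

        for (m = boot_list; m; m = m->next)
                printf("registering %lu-byte page at %p\n", m->size, (void *)m);
}

int main(void)
{
        for (int i = 0; i < 3; i++)
                alloc_boot_page_sketch();
        gather_boot_pages_sketch();
        return 0;
}
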
@@ -927,19 +996,33 @@ static void __init hugetlb_init_hstates(void)
        struct hstate *h;
 
        for_each_hstate(h) {
-               hugetlb_init_one_hstate(h);
+               /* oversize hugepages were init'ed in early boot */
+               if (h->order < MAX_ORDER)
+                       hugetlb_hstate_alloc_pages(h);
        }
 }
 
+static char * __init memfmt(char *buf, unsigned long n)
+{
+       if (n >= (1UL << 30))
+               sprintf(buf, "%lu GB", n >> 30);
+       else if (n >= (1UL << 20))
+               sprintf(buf, "%lu MB", n >> 20);
+       else
+               sprintf(buf, "%lu KB", n >> 10);
+       return buf;
+}
+
 static void __init report_hugepages(void)
 {
        struct hstate *h;
 
        for_each_hstate(h) {
-               printk(KERN_INFO "Total HugeTLB memory allocated, "
-                               "%ld %dMB pages\n",
-                               h->free_huge_pages,
-                               1 << (h->order + PAGE_SHIFT - 20));
+               char buf[32];
+               printk(KERN_INFO "HugeTLB registered %s page size, "
+                                "pre-allocated %ld pages\n",
+                       memfmt(buf, huge_page_size(h)),
+                       h->free_huge_pages);
        }
 }
 
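
memfmt() above selects a KB/MB/GB unit by power-of-two thresholds so that
report_hugepages() can print each registered page size compactly. A standalone
copy of the helper, only to illustrate the expected output for a few common
huge page sizes:

#include <stdio.h>

/* Same formatting logic as the memfmt() helper above. */
static char *memfmt(char *buf, unsigned long n)
{
        if (n >= (1UL << 30))
                sprintf(buf, "%lu GB", n >> 30);
        else if (n >= (1UL << 20))
                sprintf(buf, "%lu MB", n >> 20);
        else
                sprintf(buf, "%lu KB", n >> 10);
        return buf;
}

int main(void)
{
        char buf[32];
        unsigned long sizes[] = { 64UL << 10, 2UL << 20, 16UL << 20, 1UL << 30 };

        for (int i = 0; i < 4; i++)
                printf("huge page size: %s\n", memfmt(buf, sizes[i]));
        /* prints: 64 KB, 2 MB, 16 MB, 1 GB */
        return 0;
}
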
@@ -949,6 +1032,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
 {
        int i;
 
+       if (h->order >= MAX_ORDER)
+               return;
+
        for (i = 0; i < MAX_NUMNODES; ++i) {
                struct page *page, *next;
                struct list_head *freel = &h->hugepage_freelists[i];
@@ -975,6 +1061,9 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 {
        unsigned long min_count, ret;
 
+       if (h->order >= MAX_ORDER)
+               return h->max_huge_pages;
+
        /*
         * Increase the pool size
         * First take pages out of surplus state.  Then make up the
@@ -1195,14 +1284,19 @@ static int __init hugetlb_init(void)
 {
        BUILD_BUG_ON(HPAGE_SHIFT == 0);
 
-       if (!size_to_hstate(HPAGE_SIZE)) {
-               hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
-               parsed_hstate->max_huge_pages = default_hstate_max_huge_pages;
+       if (!size_to_hstate(default_hstate_size)) {
+               default_hstate_size = HPAGE_SIZE;
+               if (!size_to_hstate(default_hstate_size))
+                       hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
        }
-       default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates;
+       default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+       if (default_hstate_max_huge_pages)
+               default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 
        hugetlb_init_hstates();
 
+       gather_bootmem_prealloc();
+
        report_hugepages();
 
        hugetlb_sysfs_init();
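
The hugetlb_init() changes above let default_hugepagesz= choose which
registered hstate becomes the default, falling back to the architecture's
HPAGE_SIZE (and registering it) when no valid default was given. A rough
sketch of that selection logic, using a hypothetical flat array of sizes in
place of hstates[] and size_to_hstate():

#include <stdio.h>

#define HPAGE_SIZE_SKETCH (2UL << 20)   /* assumed architecture default: 2 MB */

/* Registered huge page sizes, standing in for the hstates[] array. */
static unsigned long hstate_sizes[4];
static int nr_hstates;

static int size_to_index(unsigned long size)   /* like size_to_hstate() */
{
        for (int i = 0; i < nr_hstates; i++)
                if (hstate_sizes[i] == size)
                        return i;
        return -1;
}

static int add_hstate(unsigned long size)      /* like hugetlb_add_hstate() */
{
        hstate_sizes[nr_hstates] = size;
        return nr_hstates++;
}

int main(void)
{
        unsigned long default_size = 0;  /* 0: no default_hugepagesz= given */

        /* Same fallback as above: no usable default -> HPAGE_SIZE, which is
         * registered only if the command line did not already add it. */
        if (size_to_index(default_size) < 0) {
                default_size = HPAGE_SIZE_SKETCH;
                if (size_to_index(default_size) < 0)
                        add_hstate(default_size);
        }
        printf("default hstate index %d, size %lu bytes\n",
               size_to_index(default_size), default_size);
        return 0;
}
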
@@ -1215,6 +1309,8 @@ module_init(hugetlb_init);
 void __init hugetlb_add_hstate(unsigned order)
 {
        struct hstate *h;
+       unsigned long i;
+
        if (size_to_hstate(PAGE_SIZE << order)) {
                printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
                return;
@@ -1224,15 +1320,21 @@ void __init hugetlb_add_hstate(unsigned order)
        h = &hstates[max_hstate++];
        h->order = order;
        h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+       h->nr_huge_pages = 0;
+       h->free_huge_pages = 0;
+       for (i = 0; i < MAX_NUMNODES; ++i)
+               INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+       h->hugetlb_next_nid = first_node(node_online_map);
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
-       hugetlb_init_one_hstate(h);
+
        parsed_hstate = h;
 }
 
-static int __init hugetlb_setup(char *s)
+static int __init hugetlb_nrpages_setup(char *s)
 {
        unsigned long *mhp;
+       static unsigned long *last_mhp;
 
        /*
         * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
@@ -1243,12 +1345,35 @@ static int __init hugetlb_setup(char *s)
        else
                mhp = &parsed_hstate->max_huge_pages;
 
+       if (mhp == last_mhp) {
+               printk(KERN_WARNING "hugepages= specified twice without "
+                       "interleaving hugepagesz=, ignoring\n");
+               return 1;
+       }
+
        if (sscanf(s, "%lu", mhp) <= 0)
                *mhp = 0;
 
+       /*
+        * Global state is always initialized later in hugetlb_init.
+        * But we need to allocate >= MAX_ORDER hstates here early to still
+        * use the bootmem allocator.
+        */
+       if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+               hugetlb_hstate_alloc_pages(parsed_hstate);
+
+       last_mhp = mhp;
+
+       return 1;
+}
+__setup("hugepages=", hugetlb_nrpages_setup);
+
+static int __init hugetlb_default_setup(char *s)
+{
+       default_hstate_size = memparse(s, &s);
        return 1;
 }
-__setup("hugepages=", hugetlb_setup);
+__setup("default_hugepagesz=", hugetlb_default_setup);
 
 static unsigned int cpuset_mems_nr(unsigned int *array)
 {
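
hugetlb_default_setup() above uses memparse() so the command line can carry
suffixed sizes such as default_hugepagesz=1G, while hugetlb_nrpages_setup()
guards against hugepages= being given twice for the same hstate. A hedged,
simplified stand-in for the size parsing only (the real memparse() accepts
further suffixes and returns the end pointer to the caller):

#include <stdio.h>
#include <stdlib.h>

/* Simplified memparse()-style helper: number plus optional K/M/G suffix. */
static unsigned long long parse_size(const char *s)
{
        char *end;
        unsigned long long val = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': val <<= 30; break;
        case 'M': case 'm': val <<= 20; break;
        case 'K': case 'k': val <<= 10; break;
        }
        return val;
}

int main(void)
{
        /* e.g. booting with default_hugepagesz=1G hugepagesz=1G hugepages=4 */
        printf("default_hugepagesz=1G -> %llu bytes\n", parse_size("1G"));
        printf("hugepagesz=16M        -> %llu bytes\n", parse_size("16M"));
        return 0;
}
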
@@ -1427,8 +1552,10 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
                kref_put(&reservations->refs, resv_map_release);
 
-               if (reserve)
+               if (reserve) {
                        hugetlb_acct_memory(h, -reserve);
+                       hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+               }
        }
 }
 
@@ -1877,6 +2004,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return ret;
 }
 
+/* Can be overridden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+              pud_t *pud, int write)
+{
+       BUG();
+       return NULL;
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i,