thp: add numa awareness to hugepage allocations
[pandora-kernel.git] / mm / huge_memory.c
index ae2bf08..f6559e7 100644 (file)
@@ -28,6 +28,7 @@
  */
 unsigned long transparent_hugepage_flags __read_mostly =
        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+       (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
 
 /* default scan 8*512 pte (or vmas) every 30 second */
@@ -85,6 +86,47 @@ struct khugepaged_scan {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+
+static int set_recommended_min_free_kbytes(void)
+{
+       struct zone *zone;
+       int nr_zones = 0;
+       unsigned long recommended_min;
+       extern int min_free_kbytes;
+
+       if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                     &transparent_hugepage_flags) &&
+           !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                     &transparent_hugepage_flags))
+               return 0;
+
+       for_each_populated_zone(zone)
+               nr_zones++;
+
+       /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+       recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+       /*
+        * Make sure that on average at least two pageblocks are almost free
+        * of another type, one for a migratetype to fall back to and a
+        * second to avoid subsequent fallbacks of other types. There are 3
+        * MIGRATE_TYPES we care about.
+        */
+       recommended_min += pageblock_nr_pages * nr_zones *
+                          MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+       /* don't ever allow reserving more than 5% of the lowmem */
+       recommended_min = min(recommended_min,
+                             (unsigned long) nr_free_buffer_pages() / 20);
+       recommended_min <<= (PAGE_SHIFT-10);
+
+       if (recommended_min > min_free_kbytes)
+               min_free_kbytes = recommended_min;
+       setup_per_zone_wmarks();
+       return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
 static int start_khugepaged(void)
 {
        int err = 0;
@@ -108,6 +150,8 @@ static int start_khugepaged(void)
                mutex_unlock(&khugepaged_mutex);
                if (wakeup)
                        wake_up_interruptible(&khugepaged_wait);
+
+               set_recommended_min_free_kbytes();
        } else
                /* wakeup to exit */
                wake_up_interruptible(&khugepaged_wait);
@@ -177,6 +221,13 @@ static ssize_t enabled_store(struct kobject *kobj,
                        ret = err;
        }
 
+       if (ret > 0 &&
+           (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                     &transparent_hugepage_flags) ||
+            test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                     &transparent_hugepage_flags)))
+               set_recommended_min_free_kbytes();
+
        return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -464,6 +515,8 @@ static int __init hugepage_init(void)
 
        start_khugepaged();
 
+       set_recommended_min_free_kbytes();
+
 out:
        return err;
 }
@@ -567,11 +620,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
        return ret;
 }
 
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+       return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+                                             struct vm_area_struct *vma,
+                                             unsigned long haddr)
+{
+       return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+                              HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
 static inline struct page *alloc_hugepage(int defrag)
 {
-       return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
+       return alloc_pages(alloc_hugepage_gfpmask(defrag),
                           HPAGE_PMD_ORDER);
 }
+#endif
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                               unsigned long address, pmd_t *pmd,
@@ -586,7 +654,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        return VM_FAULT_OOM;
                if (unlikely(khugepaged_enter(vma)))
                        return VM_FAULT_OOM;
-               page = alloc_hugepage(transparent_hugepage_defrag(vma));
+               page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                                         vma, haddr);
                if (unlikely(!page))
                        goto out;
                if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -809,7 +878,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow())
-               new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
+               new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                                             vma, haddr);
        else
                new_page = NULL;
 
@@ -923,6 +993,58 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        return ret;
 }
 
+int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long addr, unsigned long end,
+               unsigned char *vec)
+{
+       int ret = 0;
+
+       spin_lock(&vma->vm_mm->page_table_lock);
+       if (likely(pmd_trans_huge(*pmd))) {
+               ret = !pmd_trans_splitting(*pmd);
+               spin_unlock(&vma->vm_mm->page_table_lock);
+               if (unlikely(!ret))
+                       wait_split_huge_page(vma->anon_vma, pmd);
+               else {
+                       /*
+                        * All logical pages in the range are present
+                        * if backed by a huge page.
+                        */
+                       memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+               }
+       } else
+               spin_unlock(&vma->vm_mm->page_table_lock);
+
+       return ret;
+}
+
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long addr, pgprot_t newprot)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       int ret = 0;
+
+       spin_lock(&mm->page_table_lock);
+       if (likely(pmd_trans_huge(*pmd))) {
+               if (unlikely(pmd_trans_splitting(*pmd))) {
+                       spin_unlock(&mm->page_table_lock);
+                       wait_split_huge_page(vma->anon_vma, pmd);
+               } else {
+                       pmd_t entry;
+
+                       entry = pmdp_get_and_clear(mm, addr, pmd);
+                       entry = pmd_modify(entry, newprot);
+                       set_pmd_at(mm, addr, pmd, entry);
+                       spin_unlock(&vma->vm_mm->page_table_lock);
+                       flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+                       ret = 1;
+               }
+       } else
+               spin_unlock(&vma->vm_mm->page_table_lock);
+
+       return ret;
+}
+
 pmd_t *page_check_address_pmd(struct page *page,
                              struct mm_struct *mm,
                              unsigned long address,
@@ -1556,7 +1678,11 @@ static void collapse_huge_page(struct mm_struct *mm,
        unsigned long hstart, hend;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
        VM_BUG_ON(!*hpage);
+#else
+       VM_BUG_ON(*hpage);
+#endif
 
        /*
         * Prevent all access to pagetables with the exception of
@@ -1594,9 +1720,17 @@ static void collapse_huge_page(struct mm_struct *mm,
        if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
                goto out;
 
+#ifndef CONFIG_NUMA
        new_page = *hpage;
-       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+#else
+       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+       if (unlikely(!new_page)) {
+               *hpage = ERR_PTR(-ENOMEM);
                goto out;
+       }
+#endif
+       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+               goto out_put_page;
 
        anon_vma_lock(vma->anon_vma);
 
@@ -1625,7 +1759,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                spin_unlock(&mm->page_table_lock);
                anon_vma_unlock(vma->anon_vma);
                mem_cgroup_uncharge_page(new_page);
-               goto out;
+               goto out_put_page;
        }
 
        /*
@@ -1660,10 +1794,19 @@ static void collapse_huge_page(struct mm_struct *mm,
        mm->nr_ptes--;
        spin_unlock(&mm->page_table_lock);
 
+#ifndef CONFIG_NUMA
        *hpage = NULL;
+#endif
        khugepaged_pages_collapsed++;
 out:
        up_write(&mm->mmap_sem);
+       return;
+
+out_put_page:
+#ifdef CONFIG_NUMA
+       put_page(new_page);
+#endif
+       goto out;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1896,11 +2039,16 @@ static void khugepaged_do_scan(struct page **hpage)
        while (progress < pages) {
                cond_resched();
 
+#ifndef CONFIG_NUMA
                if (!*hpage) {
                        *hpage = alloc_hugepage(khugepaged_defrag());
                        if (unlikely(!*hpage))
                                break;
                }
+#else
+               if (IS_ERR(*hpage))
+                       break;
+#endif
 
                spin_lock(&khugepaged_mm_lock);
                if (!khugepaged_scan.mm_slot)
@@ -1915,37 +2063,55 @@ static void khugepaged_do_scan(struct page **hpage)
        }
 }
 
+static void khugepaged_alloc_sleep(void)
+{
+       DEFINE_WAIT(wait);
+       add_wait_queue(&khugepaged_wait, &wait);
+       schedule_timeout_interruptible(
+               msecs_to_jiffies(
+                       khugepaged_alloc_sleep_millisecs));
+       remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
 static struct page *khugepaged_alloc_hugepage(void)
 {
        struct page *hpage;
 
        do {
                hpage = alloc_hugepage(khugepaged_defrag());
-               if (!hpage) {
-                       DEFINE_WAIT(wait);
-                       add_wait_queue(&khugepaged_wait, &wait);
-                       schedule_timeout_interruptible(
-                               msecs_to_jiffies(
-                                       khugepaged_alloc_sleep_millisecs));
-                       remove_wait_queue(&khugepaged_wait, &wait);
-               }
+               if (!hpage)
+                       khugepaged_alloc_sleep();
        } while (unlikely(!hpage) &&
                 likely(khugepaged_enabled()));
        return hpage;
 }
+#endif
 
 static void khugepaged_loop(void)
 {
        struct page *hpage;
 
+#ifdef CONFIG_NUMA
+       hpage = NULL;
+#endif
        while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
                hpage = khugepaged_alloc_hugepage();
                if (unlikely(!hpage))
                        break;
+#else
+               if (IS_ERR(hpage)) {
+                       khugepaged_alloc_sleep();
+                       hpage = NULL;
+               }
+#endif
 
                khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
                if (hpage)
                        put_page(hpage);
+#endif
                if (khugepaged_has_work()) {
                        DEFINE_WAIT(wait);
                        if (!khugepaged_scan_sleep_millisecs)