docs/sysfs: show() methods should use scnprintf().
[pandora-kernel.git] / mm / huge_memory.c
index 7b55fe0..e187454 100644 (file)
@@ -15,6 +15,8 @@
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
  * allocations.
  */
 unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+       (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+       (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
 
 /* default scan 8*512 pte (or vmas) every 30 second */
@@ -85,6 +93,47 @@ struct khugepaged_scan {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+
+static int set_recommended_min_free_kbytes(void)
+{
+       struct zone *zone;
+       int nr_zones = 0;
+       unsigned long recommended_min;
+       extern int min_free_kbytes;
+
+       if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                     &transparent_hugepage_flags) &&
+           !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                     &transparent_hugepage_flags))
+               return 0;
+
+       for_each_populated_zone(zone)
+               nr_zones++;
+
+       /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+       recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+       /*
+        * Make sure that on average at least two pageblocks are almost free
+        * of another type, one for a migratetype to fall back to and a
+        * second to avoid subsequent fallbacks of other types There are 3
+        * MIGRATE_TYPES we care about.
+        */
+       recommended_min += pageblock_nr_pages * nr_zones *
+                          MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+       /* don't ever allow to reserve more than 5% of the lowmem */
+       recommended_min = min(recommended_min,
+                             (unsigned long) nr_free_buffer_pages() / 20);
+       recommended_min <<= (PAGE_SHIFT-10);
+
+       if (recommended_min > min_free_kbytes)
+               min_free_kbytes = recommended_min;
+       setup_per_zone_wmarks();
+       return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
 static int start_khugepaged(void)
 {
        int err = 0;
@@ -108,6 +157,8 @@ static int start_khugepaged(void)
                mutex_unlock(&khugepaged_mutex);
                if (wakeup)
                        wake_up_interruptible(&khugepaged_wait);
+
+               set_recommended_min_free_kbytes();
        } else
                /* wakeup to exit */
                wake_up_interruptible(&khugepaged_wait);
@@ -177,6 +228,13 @@ static ssize_t enabled_store(struct kobject *kobj,
                        ret = err;
        }
 
+       if (ret > 0 &&
+           (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+                     &transparent_hugepage_flags) ||
+            test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+                     &transparent_hugepage_flags)))
+               set_recommended_min_free_kbytes();
+
        return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -431,7 +489,15 @@ static int __init hugepage_init(void)
        int err;
 #ifdef CONFIG_SYSFS
        static struct kobject *hugepage_kobj;
+#endif
+
+       err = -EINVAL;
+       if (!has_transparent_hugepage()) {
+               transparent_hugepage_flags = 0;
+               goto out;
+       }
 
+#ifdef CONFIG_SYSFS
        err = -ENOMEM;
        hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
        if (unlikely(!hugepage_kobj)) {
@@ -462,8 +528,18 @@ static int __init hugepage_init(void)
                goto out;
        }
 
+       /*
+        * By default disable transparent hugepages on smaller systems,
+        * where the extra memory used could hurt more than TLB overhead
+        * is likely to save.  The admin can still enable it through /sys.
+        */
+       if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+               transparent_hugepage_flags = 0;
+
        start_khugepaged();
 
+       set_recommended_min_free_kbytes();
+
 out:
        return err;
 }
@@ -567,11 +643,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
        return ret;
 }
 
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+       return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+                                             struct vm_area_struct *vma,
+                                             unsigned long haddr)
+{
+       return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+                              HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
 static inline struct page *alloc_hugepage(int defrag)
 {
-       return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
+       return alloc_pages(alloc_hugepage_gfpmask(defrag),
                           HPAGE_PMD_ORDER);
 }
+#endif
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                               unsigned long address, pmd_t *pmd,
@@ -586,7 +677,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        return VM_FAULT_OOM;
                if (unlikely(khugepaged_enter(vma)))
                        return VM_FAULT_OOM;
-               page = alloc_hugepage(transparent_hugepage_defrag(vma));
+               page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                                         vma, haddr);
                if (unlikely(!page))
                        goto out;
                if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -809,7 +901,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow())
-               new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
+               new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                                             vma, haddr);
        else
                new_page = NULL;
 
@@ -1000,8 +1093,16 @@ pmd_t *page_check_address_pmd(struct page *page,
                goto out;
        if (pmd_page(*pmd) != page)
                goto out;
-       VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
-                 pmd_trans_splitting(*pmd));
+       /*
+        * split_vma() may create temporary aliased mappings. There is
+        * no risk as long as all huge pmd are found and have their
+        * splitting bit set before __split_huge_page_refcount
+        * runs. Finding the same huge pmd more than once during the
+        * same rmap walk is not a problem.
+        */
+       if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+           pmd_trans_splitting(*pmd))
+               goto out;
        if (pmd_trans_huge(*pmd)) {
                VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
                          !pmd_trans_splitting(*pmd));
@@ -1043,6 +1144,7 @@ static void __split_huge_page_refcount(struct page *page)
        int i;
        unsigned long head_index = page->index;
        struct zone *zone = page_zone(page);
+       int zonestat;
 
        /* prevent PageLRU to go away from under us, and freeze lru stats */
        spin_lock_irq(&zone->lru_lock);
@@ -1101,12 +1203,23 @@ static void __split_huge_page_refcount(struct page *page)
                BUG_ON(!PageDirty(page_tail));
                BUG_ON(!PageSwapBacked(page_tail));
 
+               mem_cgroup_split_huge_fixup(page, page_tail);
+
                lru_add_page_tail(zone, page, page_tail);
        }
 
        __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
        __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
 
+       /*
+        * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
+        * so adjust those appropriately if this page is on the LRU.
+        */
+       if (PageLRU(page)) {
+               zonestat = NR_LRU_BASE + page_lru(page);
+               __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
+       }
+
        ClearPageCompound(page);
        compound_unlock(page);
        spin_unlock_irq(&zone->lru_lock);
@@ -1278,18 +1391,49 @@ out:
        return ret;
 }
 
-int hugepage_madvise(unsigned long *vm_flags)
+int hugepage_madvise(struct vm_area_struct *vma,
+                    unsigned long *vm_flags, int advice)
 {
-       /*
-        * Be somewhat over-protective like KSM for now!
-        */
-       if (*vm_flags & (VM_HUGEPAGE | VM_SHARED  | VM_MAYSHARE   |
-                        VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
-                        VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
-                        VM_MIXEDMAP | VM_SAO))
-               return -EINVAL;
-
-       *vm_flags |= VM_HUGEPAGE;
+       switch (advice) {
+       case MADV_HUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & (VM_HUGEPAGE |
+                                VM_SHARED   | VM_MAYSHARE   |
+                                VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
+                                VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+                                VM_MIXEDMAP | VM_SAO))
+                       return -EINVAL;
+               *vm_flags &= ~VM_NOHUGEPAGE;
+               *vm_flags |= VM_HUGEPAGE;
+               /*
+                * If the vma become good for khugepaged to scan,
+                * register it here without waiting a page fault that
+                * may not happen any time soon.
+                */
+               if (unlikely(khugepaged_enter_vma_merge(vma)))
+                       return -ENOMEM;
+               break;
+       case MADV_NOHUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & (VM_NOHUGEPAGE |
+                                VM_SHARED   | VM_MAYSHARE   |
+                                VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
+                                VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+                                VM_MIXEDMAP | VM_SAO))
+                       return -EINVAL;
+               *vm_flags &= ~VM_HUGEPAGE;
+               *vm_flags |= VM_NOHUGEPAGE;
+               /*
+                * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+                * this vma even if we leave the mm registered in khugepaged if
+                * it got registered before VM_NOHUGEPAGE was set.
+                */
+               break;
+       }
 
        return 0;
 }
@@ -1541,7 +1685,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                VM_BUG_ON(PageLRU(page));
 
                /* If there is no mapped pte young don't collapse the page */
-               if (pte_young(pteval))
+               if (pte_young(pteval) || PageReferenced(page) ||
+                   mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = 1;
        }
        if (unlikely(!referenced))
@@ -1594,9 +1739,9 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 static void collapse_huge_page(struct mm_struct *mm,
                               unsigned long address,
-                              struct page **hpage)
+                              struct page **hpage,
+                              struct vm_area_struct *vma)
 {
-       struct vm_area_struct *vma;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd, _pmd;
@@ -1608,7 +1753,36 @@ static void collapse_huge_page(struct mm_struct *mm,
        unsigned long hstart, hend;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
        VM_BUG_ON(!*hpage);
+       new_page = *hpage;
+#else
+       VM_BUG_ON(*hpage);
+       /*
+        * Allocate the page while the vma is still valid and under
+        * the mmap_sem read mode so there is no memory allocation
+        * later when we take the mmap_sem in write mode. This is more
+        * friendly behavior (OTOH it may actually hide bugs) to
+        * filesystems in userland with daemons allocating memory in
+        * the userland I/O paths.  Allocating memory with the
+        * mmap_sem in read mode is good idea also to allow greater
+        * scalability.
+        */
+       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+       if (unlikely(!new_page)) {
+               up_read(&mm->mmap_sem);
+               *hpage = ERR_PTR(-ENOMEM);
+               return;
+       }
+#endif
+       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+               up_read(&mm->mmap_sem);
+               put_page(new_page);
+               return;
+       }
+
+       /* after allocating the hugepage upgrade to mmap_sem write mode */
+       up_read(&mm->mmap_sem);
 
        /*
         * Prevent all access to pagetables with the exception of
@@ -1625,7 +1799,8 @@ static void collapse_huge_page(struct mm_struct *mm,
        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
                goto out;
 
-       if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+       if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+           (vma->vm_flags & VM_NOHUGEPAGE))
                goto out;
 
        /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
@@ -1646,10 +1821,6 @@ static void collapse_huge_page(struct mm_struct *mm,
        if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
                goto out;
 
-       new_page = *hpage;
-       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
-               goto out;
-
        anon_vma_lock(vma->anon_vma);
 
        pte = pte_offset_map(pmd, address);
@@ -1668,9 +1839,9 @@ static void collapse_huge_page(struct mm_struct *mm,
        spin_lock(ptl);
        isolated = __collapse_huge_page_isolate(vma, address, pte);
        spin_unlock(ptl);
-       pte_unmap(pte);
 
        if (unlikely(!isolated)) {
+               pte_unmap(pte);
                spin_lock(&mm->page_table_lock);
                BUG_ON(!pmd_none(*pmd));
                set_pmd_at(mm, address, pmd, _pmd);
@@ -1687,6 +1858,7 @@ static void collapse_huge_page(struct mm_struct *mm,
        anon_vma_unlock(vma->anon_vma);
 
        __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+       pte_unmap(pte);
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
        VM_BUG_ON(page_count(pgtable) != 1);
@@ -1712,10 +1884,19 @@ static void collapse_huge_page(struct mm_struct *mm,
        mm->nr_ptes--;
        spin_unlock(&mm->page_table_lock);
 
+#ifndef CONFIG_NUMA
        *hpage = NULL;
+#endif
        khugepaged_pages_collapsed++;
-out:
+out_up_write:
        up_write(&mm->mmap_sem);
+       return;
+
+out:
+#ifdef CONFIG_NUMA
+       put_page(new_page);
+#endif
+       goto out_up_write;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1767,17 +1948,17 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                /* cannot use mapcount: can't collapse if there's a gup pin */
                if (page_count(page) != 1)
                        goto out_unmap;
-               if (pte_young(pteval))
+               if (pte_young(pteval) || PageReferenced(page) ||
+                   mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = 1;
        }
        if (referenced)
                ret = 1;
 out_unmap:
        pte_unmap_unlock(pte, ptl);
-       if (ret) {
-               up_read(&mm->mmap_sem);
-               collapse_huge_page(mm, address, hpage);
-       }
+       if (ret)
+               /* collapse_huge_page will return with the mmap_sem released */
+               collapse_huge_page(mm, address, hpage, vma);
 out:
        return ret;
 }
@@ -1843,8 +2024,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                        break;
                }
 
-               if (!(vma->vm_flags & VM_HUGEPAGE) &&
-                   !khugepaged_always()) {
+               if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+                    !khugepaged_always()) ||
+                   (vma->vm_flags & VM_NOHUGEPAGE)) {
                        progress++;
                        continue;
                }
@@ -1948,11 +2130,19 @@ static void khugepaged_do_scan(struct page **hpage)
        while (progress < pages) {
                cond_resched();
 
+#ifndef CONFIG_NUMA
                if (!*hpage) {
                        *hpage = alloc_hugepage(khugepaged_defrag());
                        if (unlikely(!*hpage))
                                break;
                }
+#else
+               if (IS_ERR(*hpage))
+                       break;
+#endif
+
+               if (unlikely(kthread_should_stop() || freezing(current)))
+                       break;
 
                spin_lock(&khugepaged_mm_lock);
                if (!khugepaged_scan.mm_slot)
@@ -1967,37 +2157,58 @@ static void khugepaged_do_scan(struct page **hpage)
        }
 }
 
+static void khugepaged_alloc_sleep(void)
+{
+       DEFINE_WAIT(wait);
+       add_wait_queue(&khugepaged_wait, &wait);
+       schedule_timeout_interruptible(
+               msecs_to_jiffies(
+                       khugepaged_alloc_sleep_millisecs));
+       remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
 static struct page *khugepaged_alloc_hugepage(void)
 {
        struct page *hpage;
 
        do {
                hpage = alloc_hugepage(khugepaged_defrag());
-               if (!hpage) {
-                       DEFINE_WAIT(wait);
-                       add_wait_queue(&khugepaged_wait, &wait);
-                       schedule_timeout_interruptible(
-                               msecs_to_jiffies(
-                                       khugepaged_alloc_sleep_millisecs));
-                       remove_wait_queue(&khugepaged_wait, &wait);
-               }
+               if (!hpage)
+                       khugepaged_alloc_sleep();
        } while (unlikely(!hpage) &&
                 likely(khugepaged_enabled()));
        return hpage;
 }
+#endif
 
 static void khugepaged_loop(void)
 {
        struct page *hpage;
 
+#ifdef CONFIG_NUMA
+       hpage = NULL;
+#endif
        while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
                hpage = khugepaged_alloc_hugepage();
                if (unlikely(!hpage))
                        break;
+#else
+               if (IS_ERR(hpage)) {
+                       khugepaged_alloc_sleep();
+                       hpage = NULL;
+               }
+#endif
 
                khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
                if (hpage)
                        put_page(hpage);
+#endif
+               try_to_freeze();
+               if (unlikely(kthread_should_stop()))
+                       break;
                if (khugepaged_has_work()) {
                        DEFINE_WAIT(wait);
                        if (!khugepaged_scan_sleep_millisecs)
@@ -2008,8 +2219,8 @@ static void khugepaged_loop(void)
                                        khugepaged_scan_sleep_millisecs));
                        remove_wait_queue(&khugepaged_wait, &wait);
                } else if (khugepaged_enabled())
-                       wait_event_interruptible(khugepaged_wait,
-                                                khugepaged_wait_event());
+                       wait_event_freezable(khugepaged_wait,
+                                            khugepaged_wait_event());
        }
 }
 
@@ -2017,6 +2228,7 @@ static int khugepaged(void *none)
 {
        struct mm_slot *mm_slot;
 
+       set_freezable();
        set_user_nice(current, 19);
 
        /* serialize with start_khugepaged() */
@@ -2031,6 +2243,8 @@ static int khugepaged(void *none)
                mutex_lock(&khugepaged_mutex);
                if (!khugepaged_enabled())
                        break;
+               if (unlikely(kthread_should_stop()))
+                       break;
        }
 
        spin_lock(&khugepaged_mm_lock);
@@ -2065,3 +2279,71 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
        put_page(page);
        BUG_ON(pmd_trans_huge(*pmd));
 }
+
+static void split_huge_page_address(struct mm_struct *mm,
+                                   unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               return;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return;
+       /*
+        * Caller holds the mmap_sem write mode, so a huge pmd cannot
+        * materialize from under us.
+        */
+       split_huge_page_pmd(mm, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+                            unsigned long start,
+                            unsigned long end,
+                            long adjust_next)
+{
+       /*
+        * If the new start address isn't hpage aligned and it could
+        * previously contain an hugepage: check if we need to split
+        * an huge pmd.
+        */
+       if (start & ~HPAGE_PMD_MASK &&
+           (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+           (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+               split_huge_page_address(vma->vm_mm, start);
+
+       /*
+        * If the new end address isn't hpage aligned and it could
+        * previously contain an hugepage: check if we need to split
+        * an huge pmd.
+        */
+       if (end & ~HPAGE_PMD_MASK &&
+           (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+           (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+               split_huge_page_address(vma->vm_mm, end);
+
+       /*
+        * If we're also updating the vma->vm_next->vm_start, if the new
+        * vm_next->vm_start isn't page aligned and it could previously
+        * contain an hugepage: check if we need to split an huge pmd.
+        */
+       if (adjust_next > 0) {
+               struct vm_area_struct *next = vma->vm_next;
+               unsigned long nstart = next->vm_start;
+               nstart += adjust_next << PAGE_SHIFT;
+               if (nstart & ~HPAGE_PMD_MASK &&
+                   (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+                   (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+                       split_huge_page_address(next->vm_mm, nstart);
+       }
+}