hugetlbfs: add swap entry check in follow_hugetlb_page()
[pandora-kernel.git] mm/hugetlb.c

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c535b0..70b4733 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -538,8 +538,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
        struct zonelist *zonelist;
        struct zone *zone;
        struct zoneref *z;
+       unsigned int cpuset_mems_cookie;
 
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
        zonelist = huge_zonelist(vma, address,
                                        htlb_alloc_mask, &mpol, &nodemask);
        /*
@@ -566,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                        }
                }
        }
-err:
+
        mpol_cond_put(mpol);
-       put_mems_allowed();
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
        return page;
+
+err:
+       mpol_cond_put(mpol);
+       return NULL;
 }
 
 static void update_and_free_page(struct hstate *h, struct page *page)
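
[Annotation, not part of the patch] The two hunks above convert dequeue_huge_page_vma() to the cpuset mems_allowed retry pattern: get_mems_allowed() samples a sequence counter before the nodemask is consulted, and put_mems_allowed() reports whether the task's allowed nodes changed while the allocation ran, so a NULL page found against a stale mask is retried instead of being reported as failure. A minimal sketch of the pattern follows; alloc_from_current_nodemask() is a purely hypothetical stand-in for the zonelist walk.

/* Sketch only -- illustrates the retry pattern, not the kernel source. */
#include <linux/cpuset.h>
#include <linux/mm.h>

static struct page *alloc_from_current_nodemask(gfp_t gfp_mask);	/* hypothetical */

static struct page *dequeue_with_cpuset_retry(gfp_t gfp_mask)
{
	struct page *page;
	unsigned int cpuset_mems_cookie;

retry_cpuset:
	cpuset_mems_cookie = get_mems_allowed();
	page = alloc_from_current_nodemask(gfp_mask);

	/* Retry only if the allocation failed *and* mems_allowed changed. */
	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
		goto retry_cpuset;

	return page;
}
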
@@ -2085,8 +2092,12 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-       struct hstate *h = &default_hstate;
-       return h->nr_huge_pages * pages_per_huge_page(h);
+       struct hstate *h;
+       unsigned long nr_total_pages = 0;
+
+       for_each_hstate(h)
+               nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+       return nr_total_pages;
 }
 
 static int hugetlb_acct_memory(struct hstate *h, long delta)
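
[Annotation, not part of the patch] With the change above, hugetlb_total_pages() sums every registered huge page size instead of counting only the default hstate. A stand-alone illustration of the arithmetic, using made-up numbers and a simplified struct:

/* Stand-alone demo of the summation; struct hstate_demo is a simplified
 * stand-in for struct hstate and the numbers are invented. */
#include <stdio.h>

struct hstate_demo {
	unsigned long nr_huge_pages;
	unsigned int order;		/* huge page = (1 << order) base pages */
};

int main(void)
{
	/* e.g. 100 x 2MB pages (order 9) and 2 x 1GB pages (order 18),
	 * assuming 4KB base pages */
	struct hstate_demo hstates[] = { { 100, 9 }, { 2, 18 } };
	unsigned long nr_total_pages = 0;
	unsigned int i;

	for (i = 0; i < sizeof(hstates) / sizeof(hstates[0]); i++)
		nr_total_pages += hstates[i].nr_huge_pages *
				  (1UL << hstates[i].order);

	/* 100*512 + 2*262144 = 575488 PAGE_SIZE pages */
	printf("total: %lu PAGE_SIZE pages\n", nr_total_pages);
	return 0;
}
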
@@ -2375,6 +2386,25 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
        }
 }
 
+void __unmap_hugepage_range_final(struct vm_area_struct *vma,
+                         unsigned long start, unsigned long end,
+                         struct page *ref_page)
+{
+       __unmap_hugepage_range(vma, start, end, ref_page);
+
+       /*
+        * Clear this flag so that x86's huge_pmd_share page_table_shareable
+        * test will fail on a vma being torn down, and not grab a page table
+        * on its way out.  We're lucky that the flag has such an appropriate
+        * name, and can in fact be safely cleared here. We could clear it
+        * before the __unmap_hugepage_range above, but all that's necessary
+        * is to clear it before releasing the i_mmap_mutex. This works
+        * because in the context this is called, the VMA is about to be
+        * destroyed and the i_mmap_mutex is held.
+        */
+       vma->vm_flags &= ~VM_MAYSHARE;
+}
+
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end, struct page *ref_page)
 {
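
[Annotation, not part of the patch] __unmap_hugepage_range_final() is meant for final teardown only: the VMA is about to be destroyed and i_mmap_mutex is held, so clearing VM_MAYSHARE before the mutex is dropped keeps huge_pmd_share() from picking the dying VMA as a page-table-sharing candidate. A hypothetical caller sketch (teardown_hugetlb_vma() is not a real kernel function):

/* Sketch only -- hypothetical caller, not the kernel source. */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>

static void teardown_hugetlb_vma(struct vm_area_struct *vma)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	mutex_lock(&mapping->i_mmap_mutex);
	/* Final unmap: also clears VM_MAYSHARE before the mutex is released. */
	__unmap_hugepage_range_final(vma, vma->vm_start, vma->vm_end, NULL);
	mutex_unlock(&mapping->i_mmap_mutex);
}
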
@@ -2403,8 +2433,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
         * from page cache lookup which is in HPAGE_SIZE units.
         */
        address = address & huge_page_mask(h);
-       pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
-               + (vma->vm_pgoff >> PAGE_SHIFT);
+       pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+                       vma->vm_pgoff;
        mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
        /*
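
[Annotation, not part of the patch] vm_pgoff is already expressed in PAGE_SIZE units, so the extra ">> PAGE_SHIFT" the old code applied to it discarded the file offset for any mapping that does not start at file offset zero. A worked example with made-up values:

/* Stand-alone demo of the old vs. new pgoff arithmetic; all values invented. */
#include <stdio.h>

#define PAGE_SHIFT	12	/* assume 4KB base pages */

int main(void)
{
	unsigned long vm_start = 0x40000000UL;	/* hypothetical mapping start */
	unsigned long address  = 0x40400000UL;	/* faulting huge page, start + 4MB */
	unsigned long vm_pgoff = 512;		/* mapping begins 2MB into the file */
	unsigned long old_pgoff, new_pgoff;

	old_pgoff = ((address - vm_start) >> PAGE_SHIFT) + (vm_pgoff >> PAGE_SHIFT);
	new_pgoff = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

	/* old: 1024 + 0 = 1024 (wrong), new: 1024 + 512 = 1536 (correct) */
	printf("old=%lu new=%lu\n", old_pgoff, new_pgoff);
	return 0;
}
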
@@ -2859,7 +2889,17 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        break;
                }
 
-               if (absent ||
+               /*
+                * We need to call hugetlb_fault() both for hugepages under
+                * migration (in which case hugetlb_fault() waits for the
+                * migration to finish) and for hwpoisoned hugepages (in which
+                * case we must prevent the caller from accessing them). To
+                * cover both cases we check is_swap_pte() here instead of
+                * calling is_hugetlb_entry_migration() and
+                * is_hugetlb_entry_hwpoisoned(): a page can never be followed
+                * correctly through any kind of swap entry.
+                */
+               if (absent || is_swap_pte(huge_ptep_get(pte)) ||
                    ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
                        int ret;
 
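
[Annotation, not part of the patch] A non-present, non-none huge PTE is a swap-format entry, i.e. either a migration entry or an hwpoison entry; in both cases follow_hugetlb_page() must go through hugetlb_fault() instead of taking a reference on a page it cannot follow. The predicate the new condition implements, spelled out as a hypothetical helper (huge_pte_needs_fault() does not exist in the kernel; the real code also treats a missing page table as absent):

/* Sketch only -- restates the condition above, not the kernel source. */
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>

static bool huge_pte_needs_fault(pte_t entry, unsigned int flags)
{
	if (huge_pte_none(entry))
		return true;		/* absent: fault it in (or fail) */
	if (is_swap_pte(entry))
		return true;		/* migration or hwpoison entry */
	if ((flags & FOLL_WRITE) && !pte_write(entry))
		return true;		/* write requested, PTE is read-only */
	return false;			/* safe to take a reference on the page */
}
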
@@ -2932,9 +2972,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
                }
        }
        spin_unlock(&mm->page_table_lock);
-       mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+       /*
+        * Must flush the TLB before releasing i_mmap_mutex: x86's
+        * huge_pmd_unshare may have cleared our pud entry and done a
+        * put_page on the page table. Once i_mmap_mutex is released,
+        * another task can do the final put_page, after which the page
+        * table may be reused and filled with junk.
+        */
        flush_tlb_range(vma, start, end);
+       mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
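
[Annotation, not part of the patch] The same ordering rule applies to any path that may have unshared or freed hugetlb page tables: flush the TLB for the affected range before releasing the lock that keeps those page tables alive. A compact, simplified sketch of that shape (hypothetical function, PTE rewrite elided):

/* Sketch only -- generic shape of the rule, not the kernel source. */
#include <linux/fs.h>
#include <linux/mm.h>
#include <asm/tlbflush.h>

static void change_hugetlb_protection_locked(struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	mutex_lock(&mapping->i_mmap_mutex);
	/* ... rewrite PTEs; huge_pmd_unshare() may drop a shared pud here ... */
	flush_tlb_range(vma, start, end);	/* flush while the tables are still pinned */
	mutex_unlock(&mapping->i_mmap_mutex);	/* only now may the final put_page() happen */
}
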