Merge with /pub/scm/linux/kernel/git/torvalds/linux-2.6.git
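
The mm/hugetlb.c part of this merge brings in the hugetlb demand-faulting
rework: the prefault path (hugetlb_prefault() and zap_hugepage_range()) is
removed, and huge pages are instead allocated at fault time through the new
find_lock_huge_page() helper and a fleshed-out hugetlb_fault().
follow_hugetlb_page() now calls hugetlb_fault() itself when it finds no pte
rather than failing, unmap_hugepage_range() takes mm->page_table_lock
internally (flushing the TLB only after dropping it), hugetlb_init() returns
early when HPAGE_SHIFT is 0 (presumably when the architecture reports no
hugepage support at runtime), and the EXPORT_SYMBOL of hugetlb_total_pages()
is dropped.
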
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ea0826f..728e9bd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -103,6 +103,9 @@ static int __init hugetlb_init(void)
        unsigned long i;
        struct page *page;
 
+       if (HPAGE_SHIFT == 0)
+               return 0;
+
        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&hugepage_freelists[i]);
 
@@ -234,7 +237,6 @@ unsigned long hugetlb_total_pages(void)
 {
        return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
 }
-EXPORT_SYMBOL(hugetlb_total_pages);
 
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
@@ -314,15 +316,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
        BUG_ON(start & ~HPAGE_MASK);
        BUG_ON(end & ~HPAGE_MASK);
 
+       spin_lock(&mm->page_table_lock);
+
        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);
 
        for (address = start; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
-               if (! ptep)
-                       /* This can happen on truncate, or if an
-                        * mmap() is aborted due to an error before
-                        * the prefault */
+               if (!ptep)
                        continue;
 
                pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -333,94 +334,97 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                put_page(page);
                add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
        }
-       flush_tlb_range(vma, start, end);
-}
-
-void zap_hugepage_range(struct vm_area_struct *vma,
-                       unsigned long start, unsigned long length)
-{
-       struct mm_struct *mm = vma->vm_mm;
 
-       spin_lock(&mm->page_table_lock);
-       unmap_hugepage_range(vma, start, start + length);
        spin_unlock(&mm->page_table_lock);
+       flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+                       unsigned long idx)
 {
-       struct mm_struct *mm = current->mm;
-       unsigned long addr;
-       int ret = 0;
-
-       WARN_ON(!is_vm_hugetlb_page(vma));
-       BUG_ON(vma->vm_start & ~HPAGE_MASK);
-       BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-       hugetlb_prefault_arch_hook(mm);
-
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-               unsigned long idx;
-               pte_t *pte = huge_pte_alloc(mm, addr);
-               struct page *page;
-
-               if (!pte) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
+       struct page *page;
+       int err;
+       struct inode *inode = mapping->host;
+       unsigned long size;
+
+retry:
+       page = find_lock_page(mapping, idx);
+       if (page)
+               goto out;
+
+       /* Check to make sure the mapping hasn't been truncated */
+       size = i_size_read(inode) >> HPAGE_SHIFT;
+       if (idx >= size)
+               goto out;
+
+       if (hugetlb_get_quota(mapping))
+               goto out;
+       page = alloc_huge_page();
+       if (!page) {
+               hugetlb_put_quota(mapping);
+               goto out;
+       }
 
-               idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                       + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-               page = find_get_page(mapping, idx);
-               if (!page) {
-                       /* charge the fs quota first */
-                       if (hugetlb_get_quota(mapping)) {
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       page = alloc_huge_page();
-                       if (!page) {
-                               hugetlb_put_quota(mapping);
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-                       if (! ret) {
-                               unlock_page(page);
-                       } else {
-                               hugetlb_put_quota(mapping);
-                               free_huge_page(page);
-                               goto out;
-                       }
-               }
-               spin_lock(&mm->page_table_lock);
-               add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-               set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
-               spin_unlock(&mm->page_table_lock);
+       err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+       if (err) {
+               put_page(page);
+               hugetlb_put_quota(mapping);
+               if (err == -EEXIST)
+                       goto retry;
+               page = NULL;
        }
 out:
-       return ret;
+       return page;
 }
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully.  Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access)
 {
        int ret = VM_FAULT_SIGBUS;
+       unsigned long idx;
+       unsigned long size;
        pte_t *pte;
+       struct page *page;
+       struct address_space *mapping;
+
+       pte = huge_pte_alloc(mm, address);
+       if (!pte)
+               goto out;
+
+       mapping = vma->vm_file->f_mapping;
+       idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+               + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+       /*
+        * Use page lock to guard against racing truncation
+        * before we get page_table_lock.
+        */
+       page = find_lock_huge_page(mapping, idx);
+       if (!page)
+               goto out;
 
        spin_lock(&mm->page_table_lock);
-       pte = huge_pte_offset(mm, address);
-       if (pte && !pte_none(*pte))
-               ret = VM_FAULT_MINOR;
+       size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+       if (idx >= size)
+               goto backout;
+
+       ret = VM_FAULT_MINOR;
+       if (!pte_none(*pte))
+               goto backout;
+
+       add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+       set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
        spin_unlock(&mm->page_table_lock);
+       unlock_page(page);
+out:
        return ret;
+
+backout:
+       spin_unlock(&mm->page_table_lock);
+       hugetlb_put_quota(mapping);
+       unlock_page(page);
+       put_page(page);
+       goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -430,34 +434,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long vpfn, vaddr = *position;
        int remainder = *length;
 
-       BUG_ON(!is_vm_hugetlb_page(vma));
-
        vpfn = vaddr/PAGE_SIZE;
        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
+               pte_t *pte;
+               struct page *page;
 
-               if (pages) {
-                       pte_t *pte;
-                       struct page *page;
-
-                       /* Some archs (sparc64, sh*) have multiple
-                        * pte_ts to each hugepage.  We have to make
-                        * sure we get the first, for the page
-                        * indexing below to work. */
-                       pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-                       /* the hugetlb file might have been truncated */
-                       if (!pte || pte_none(*pte)) {
-                               remainder = 0;
-                               if (!i)
-                                       i = -EFAULT;
-                               break;
-                       }
+               /*
+                * Some archs (sparc64, sh*) have multiple pte_ts to
+                * each hugepage.  We have to make sure we get the
+                * first, for the page indexing below to work.
+                */
+               pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-                       page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+               if (!pte || pte_none(*pte)) {
+                       int ret;
 
-                       WARN_ON(!PageCompound(page));
+                       spin_unlock(&mm->page_table_lock);
+                       ret = hugetlb_fault(mm, vma, vaddr, 0);
+                       spin_lock(&mm->page_table_lock);
+                       if (ret == VM_FAULT_MINOR)
+                               continue;
+
+                       remainder = 0;
+                       if (!i)
+                               i = -EFAULT;
+                       break;
+               }
 
+               if (pages) {
+                       page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
                        get_page(page);
                        pages[i] = page;
                }
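
A note on the structure of find_lock_huge_page() above: it is the classic
optimistic lookup/allocate/insert idiom for populating a shared cache without
holding a lock across a potentially slow allocation. Look the page up first;
if it is missing, allocate one outside the lock, then try to insert it, and
if a racing thread won the insert (add_to_page_cache() returns -EEXIST), drop
our copy and retry the lookup so both faulters end up with the same page.
Below is a minimal userspace sketch of the same idiom, not part of the patch;
all names are hypothetical, and a single pthread mutex stands in for the
page-cache locking:

	#include <pthread.h>
	#include <stdlib.h>

	#define NSLOTS 64

	static void *slots[NSLOTS];
	static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Hypothetical analogue of find_lock_huge_page()'s retry loop */
	static void *get_or_alloc(unsigned long idx)
	{
		void *obj, *mine;

	retry:
		/* 1. Optimistic lookup, like find_lock_page() */
		pthread_mutex_lock(&cache_lock);
		obj = slots[idx % NSLOTS];
		pthread_mutex_unlock(&cache_lock);
		if (obj)
			return obj;

		/* 2. Allocate outside the lock, like alloc_huge_page() */
		mine = malloc(4096);
		if (!mine)
			return NULL;

		/* 3. Try to insert; a lost race is the -EEXIST case */
		pthread_mutex_lock(&cache_lock);
		if (slots[idx % NSLOTS]) {
			pthread_mutex_unlock(&cache_lock);
			free(mine);	/* someone inserted first: drop ours */
			goto retry;	/* pick up their copy via the lookup */
		}
		slots[idx % NSLOTS] = mine;
		pthread_mutex_unlock(&cache_lock);
		return mine;
	}

Retrying rather than failing on the lost race matters because the racing
thread's page is the one that ended up in the cache; the loser must map that
page, not its own discarded allocation.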