md: delay notification of 'active_idle' to the recovery thread

[pandora-kernel.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index fb5608a..2302d22 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -999,17 +999,15 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                 goto no_page_table;
  
         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
-       if (!ptep)
-               goto out;
  
         pte = *ptep;
         if (!pte_present(pte))
-               goto unlock;
+               goto no_page;
         if ((flags & FOLL_WRITE) && !pte_write(pte))
                 goto unlock;
         page = vm_normal_page(vma, address, pte);
         if (unlikely(!page))
-               goto unlock;
+               goto bad_page;
  
         if (flags & FOLL_GET)
                 get_page(page);
@@ -1024,6 +1022,15 @@ unlock:
  out:
         return page;
  
+bad_page:
+       pte_unmap_unlock(ptep, ptl);
+       return ERR_PTR(-EFAULT);
+
+no_page:
+       pte_unmap_unlock(ptep, ptl);
+       if (!pte_none(pte))
+               return page;
+       /* Fall through to ZERO_PAGE handling */
  no_page_table:
         /*
          * When core dumping an enormous anonymous area that nobody
@@ -1038,6 +1045,26 @@ no_page_table:
         return page;
  }
  
+/* Returns 1 if the FOLL_ANON optimization may be used on @vma, else 0. */
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+       /*
+        * We don't want to optimize FOLL_ANON for make_pages_present()
+        * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
+        * we want to get the page from the page tables to make sure
+        * that we serialize and update with any other user of that
+        * mapping.
+        */
+       if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
+               return 0;
+       /*
+        * A vma with a ->fault or ->nopfn routine is not an anonymous
+        * region; allow the optimization only when it has neither.
+        */
+       return !vma->vm_ops ||
+               (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+}
+
  int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 unsigned long start, int len, int write, int force,
                 struct page **pages, struct vm_area_struct **vmas)
@@ -1112,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 foll_flags = FOLL_TOUCH;
                 if (pages)
                         foll_flags |= FOLL_GET;
-               if (!write && !(vma->vm_flags & VM_LOCKED) &&
-                   (!vma->vm_ops || !vma->vm_ops->fault))
+               if (!write && use_zero_page(vma))
                         foll_flags |= FOLL_ANON;
  
                 do {
@@ -1125,7 +1151,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                          * be processed until returning to user space.
                          */
                         if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-                               return -ENOMEM;
+                               return i ? i : -ENOMEM;
  
                         if (write)
                                 foll_flags |= FOLL_WRITE;
@@ -1159,6 +1185,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  
                                 cond_resched();
                         }
+                       if (IS_ERR(page))
+                               return i ? i : PTR_ERR(page);
                         if (pages) {
                                 pages[i] = page;
  
@@ -1669,8 +1697,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         struct page *dirty_page = NULL;
  
         old_page = vm_normal_page(vma, address, orig_pte);
-       if (!old_page)
+       if (!old_page) {
+               /*
+                * VM_MIXEDMAP !pfn_valid() case
+                *
+                * We should not cow pages in a shared writeable mapping.
+                * Just mark the pages writable as we can't do any dirty
+                * accounting on raw pfn maps.
+                */
+               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+                                    (VM_WRITE|VM_SHARED))
+                       goto reuse;
                 goto gotten;
+       }
  
         /*
          * Take out anonymous pages first, anonymous shared vmas are
@@ -1723,6 +1762,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         }
  
         if (reuse) {
+reuse:
                 flush_cache_page(vma, address, pte_pfn(orig_pte));
                 entry = pte_mkyoung(orig_pte);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1757,7 +1797,6 @@ gotten:
         page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
         if (likely(pte_same(*page_table, orig_pte))) {
                 if (old_page) {
-                       page_remove_rmap(old_page, vma);
                         if (!PageAnon(old_page)) {
                                 dec_mm_counter(mm, file_rss);
                                 inc_mm_counter(mm, anon_rss);
@@ -1779,6 +1818,32 @@ gotten:
                 lru_cache_add_active(new_page);
                 page_add_new_anon_rmap(new_page, vma, address);
  
+               if (old_page) {
+                       /*
+                        * Only after switching the pte to the new page may
+                        * we remove the mapcount here. Otherwise another
+                        * process may come and find the rmap count decremented
+                        * before the pte is switched to the new page, and
+                        * "reuse" the old page writing into it while our pte
+                        * here still points into it and can be read by other
+                        * threads.
+                        *
+                        * The critical issue is to order this
+                        * page_remove_rmap with the ptep_clear_flush above.
+                        * Those stores are ordered by (if nothing else)
+                        * the barrier present in the atomic_add_negative
+                        * in page_remove_rmap.
+                        *
+                        * Then the TLB flush in ptep_clear_flush ensures that
+                        * no process can access the old page before the
+                        * decremented mapcount is visible. And the old page
+                        * cannot be reused until after the decremented
+                        * mapcount is visible. So transitively, TLBs to
+                        * old page will be flushed before it can be reused.
+                        */
+                       page_remove_rmap(old_page, vma);
+               }
+
                 /* Free the old page.. */
                 new_page = old_page;
                 ret |= VM_FAULT_WRITE;
@@ -2295,8 +2360,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         vmf.flags = flags;
         vmf.page = NULL;
  
-       BUG_ON(vma->vm_flags & VM_PFNMAP);
-
         ret = vma->vm_ops->fault(vma, &vmf);
         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                 return ret;