anon_vma_prepare: properly lock even newly allocated entries

[pandora-kernel.git] / mm / rmap.c
diff --git a/mm/rmap.c b/mm/rmap.c

index 94a5246..e8d639b 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -55,7 +55,33 @@
  
  struct kmem_cache *anon_vma_cachep;
  
-/* This must be called under the mmap_sem. */
+/**
+ * anon_vma_prepare - attach an anon_vma to a memory region
+ * @vma: the memory region in question
+ *
+ * This makes sure the memory mapping described by 'vma' has
+ * an 'anon_vma' attached to it, so that we can associate the
+ * anonymous pages mapped into it with that anon_vma.
+ *
+ * The common case will be that we already have one, but if
+ * not we either need to find an adjacent mapping that we
+ * can re-use the anon_vma from (very common when the only
+ * reason for splitting a vma has been mprotect()), or we
+ * allocate a new one.
+ *
+ * Anon-vma allocations are very subtle, because we may have
+ * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * and that may actually touch the spinlock even in the newly
+ * allocated vma (it depends on RCU to make sure that the
+ * anon_vma isn't actually destroyed).
+ *
+ * As a result, we need to do proper anon_vma locking even
+ * for the new allocation. At the same time, we do not want
+ * to do any locking for the common case of already having
+ * an anon_vma.
+ *
+ * This must be called with the mmap_sem held for reading.
+ */
  int anon_vma_prepare(struct vm_area_struct *vma)
  {
         struct anon_vma *anon_vma = vma->anon_vma;
@@ -63,20 +89,17 @@ int anon_vma_prepare(struct vm_area_struct *vma)
         might_sleep();
         if (unlikely(!anon_vma)) {
                 struct mm_struct *mm = vma->vm_mm;
-               struct anon_vma *allocated, *locked;
+               struct anon_vma *allocated;
  
                 anon_vma = find_mergeable_anon_vma(vma);
-               if (anon_vma) {
-                       allocated = NULL;
-                       locked = anon_vma;
-                       spin_lock(&locked->lock);
-               } else {
+               allocated = NULL;
+               if (!anon_vma) {
                         anon_vma = anon_vma_alloc();
                         if (unlikely(!anon_vma))
                                 return -ENOMEM;
                         allocated = anon_vma;
-                       locked = NULL;
                 }
+               spin_lock(&anon_vma->lock);
  
                 /* page_table_lock to protect against threads */
                 spin_lock(&mm->page_table_lock);
@@ -87,8 +110,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                 }
                 spin_unlock(&mm->page_table_lock);
  
-               if (locked)
-                       spin_unlock(&locked->lock);
+               spin_unlock(&anon_vma->lock);
                 if (unlikely(allocated))
                         anon_vma_free(allocated);
         }
@@ -224,10 +246,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  /*
   * Check that @page is mapped at @address into @mm.
   *
+ * If @sync is false, page_check_address may perform a racy check to avoid
+ * the page table lock when the pte is not present (helpful when reclaiming
+ * highly shared pages).
+ *
   * On success returns with pte mapped and locked.
   */
  pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-                         unsigned long address, spinlock_t **ptlp)
+                         unsigned long address, spinlock_t **ptlp, int sync)
  {
         pgd_t *pgd;
         pud_t *pud;
@@ -249,7 +275,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  
         pte = pte_offset_map(pmd, address);
         /* Make a quick check before getting the lock */
-       if (!pte_present(*pte)) {
+       if (!sync && !pte_present(*pte)) {
                 pte_unmap(pte);
                 return NULL;
         }
@@ -281,7 +307,7 @@ static int page_referenced_one(struct page *page,
         if (address == -EFAULT)
                 goto out;
  
-       pte = page_check_address(page, mm, address, &ptl);
+       pte = page_check_address(page, mm, address, &ptl, 0);
         if (!pte)
                 goto out;
  
@@ -422,7 +448,7 @@ int page_referenced(struct page *page, int is_locked,
                         referenced += page_referenced_anon(page, mem_cont);
                 else if (is_locked)
                         referenced += page_referenced_file(page, mem_cont);
-               else if (TestSetPageLocked(page))
+               else if (!trylock_page(page))
                         referenced++;
                 else {
                         if (page->mapping)
@@ -450,7 +476,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
         if (address == -EFAULT)
                 goto out;
  
-       pte = page_check_address(page, mm, address, &ptl);
+       pte = page_check_address(page, mm, address, &ptl, 1);
         if (!pte)
                 goto out;
  
@@ -659,23 +685,30 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
                 }
  
                 /*
-                * It would be tidy to reset the PageAnon mapping here,
-                * but that might overwrite a racing page_add_anon_rmap
-                * which increments mapcount after us but sets mapping
-                * before us: so leave the reset to free_hot_cold_page,
-                * and remember that it's only reliable while mapped.
-                * Leaving it set also helps swapoff to reinstate ptes
-                * faster for those pages still in swapcache.
+                * Now that the last pte has gone, s390 must transfer dirty
+                * flag from storage key to struct page.  We can usually skip
+                * this if the page is anon, so about to be freed; but perhaps
+                * not if it's in swapcache - there might be another pte slot
+                * containing the swap entry, but page not yet written to swap.
                  */
                 if ((!PageAnon(page) || PageSwapCache(page)) &&
                     page_test_dirty(page)) {
                         page_clear_dirty(page);
                         set_page_dirty(page);
                 }
-               mem_cgroup_uncharge_page(page);
  
+               mem_cgroup_uncharge_page(page);
                 __dec_zone_page_state(page,
-                               PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+                       PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+               /*
+                * It would be tidy to reset the PageAnon mapping here,
+                * but that might overwrite a racing page_add_anon_rmap
+                * which increments mapcount after us but sets mapping
+                * before us: so leave the reset to free_hot_cold_page,
+                * and remember that it's only reliable while mapped.
+                * Leaving it set also helps swapoff to reinstate ptes
+                * faster for those pages still in swapcache.
+                */
         }
  }
  
@@ -697,7 +730,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         if (address == -EFAULT)
                 goto out;
  
-       pte = page_check_address(page, mm, address, &ptl);
+       pte = page_check_address(page, mm, address, &ptl, 0);
         if (!pte)
                 goto out;