mm: protect set_page_dirty() from ongoing truncation
[pandora-kernel.git] mm/memory.c
index 829d437..759f915 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -205,10 +205,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
                return 1;
        }
 
+       if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+               return 0;
+
        batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
        if (!batch)
                return 0;
 
+       tlb->batch_count++;
        batch->next = NULL;
        batch->nr   = 0;
        batch->max  = MAX_GATHER_BATCH;
@@ -235,6 +239,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
        tlb->local.nr   = 0;
        tlb->local.max  = ARRAY_SIZE(tlb->__pages);
        tlb->active     = &tlb->local;
+       tlb->batch_count = 0;
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
        tlb->batch = NULL;
@@ -865,20 +870,20 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                if (!pte_file(pte)) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
 
-                       if (swap_duplicate(entry) < 0)
-                               return entry.val;
-
-                       /* make sure dst_mm is on swapoff's mmlist. */
-                       if (unlikely(list_empty(&dst_mm->mmlist))) {
-                               spin_lock(&mmlist_lock);
-                               if (list_empty(&dst_mm->mmlist))
-                                       list_add(&dst_mm->mmlist,
-                                                &src_mm->mmlist);
-                               spin_unlock(&mmlist_lock);
-                       }
-                       if (likely(!non_swap_entry(entry)))
+                       if (likely(!non_swap_entry(entry))) {
+                               if (swap_duplicate(entry) < 0)
+                                       return entry.val;
+
+                               /* make sure dst_mm is on swapoff's mmlist. */
+                               if (unlikely(list_empty(&dst_mm->mmlist))) {
+                                       spin_lock(&mmlist_lock);
+                                       if (list_empty(&dst_mm->mmlist))
+                                               list_add(&dst_mm->mmlist,
+                                                        &src_mm->mmlist);
+                                       spin_unlock(&mmlist_lock);
+                               }
                                rss[MM_SWAPENTS]++;
-                       else if (is_write_migration_entry(entry) &&
+                       } else if (is_write_migration_entry(entry) &&
                                        is_cow_mapping(vm_flags)) {
                                /*
                                 * COW mappings require pages in both parent
@@ -1173,8 +1178,10 @@ again:
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
                        force_flush = !__tlb_remove_page(tlb, page);
-                       if (force_flush)
+                       if (force_flush) {
+                               addr += PAGE_SIZE;
                                break;
+                       }
                        continue;
                }
                /*
@@ -1228,16 +1235,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
-                       if (next-addr != HPAGE_PMD_SIZE) {
+                       if (next - addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
                                split_huge_page_pmd(vma->vm_mm, pmd);
                        } else if (zap_huge_pmd(tlb, vma, pmd))
-                               continue;
+                               goto next;
                        /* fall through */
                }
-               if (pmd_none_or_clear_bad(pmd))
-                       continue;
+               /*
+                * Here there can be other concurrent MADV_DONTNEED or
+                * trans huge page faults running, and if the pmd is
+                * none or trans huge it can change under us. This is
+                * because MADV_DONTNEED holds the mmap_sem in read
+                * mode.
+                */
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+                       goto next;
                next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
                cond_resched();
        } while (pmd++, addr = next, addr != end);
 
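Note (not part of the patch): the check added above relies on pmd_none_or_trans_huge_or_clear_bad(), a helper defined upstream in include/asm-generic/pgtable.h and introduced by the same fix. Roughly, it takes one snapshot of the pmd and treats both "none" and "transparent huge" as "skip this range", so a pmd concurrently changing between those states under a read-held mmap_sem no longer trips pmd_bad()/pmd_clear_bad(). The sketch below illustrates that behaviour only; it is not the exact kernel implementation (the real helper is more careful about reading the pmd atomically on 32-bit PAE):

static inline int pmd_none_or_trans_huge_or_clear_bad_sketch(pmd_t *pmd)
{
        pmd_t pmdval = *pmd;

        barrier();      /* work on one stable snapshot of the pmd */
        if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
                return 1;       /* nothing to zap, or the THP code owns it */
        if (unlikely(pmd_bad(pmdval))) {
                pmd_clear_bad(pmd);
                return 1;
        }
        return 0;
}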
@@ -1350,8 +1365,11 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
                                 * Since no pte has actually been setup, it is
                                 * safe to do nothing in this case.
                                 */
-                               if (vma->vm_file)
-                                       unmap_hugepage_range(vma, start, end, NULL);
+                               if (vma->vm_file) {
+                                       mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+                                       __unmap_hugepage_range_final(vma, start, end, NULL);
+                                       mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+                               }
 
                                start = end;
                        } else
@@ -1836,12 +1854,17 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long address, unsigned int fault_flags)
 {
        struct vm_area_struct *vma;
+       vm_flags_t vm_flags;
        int ret;
 
        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start)
                return -EFAULT;
 
+       vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+       if (!(vm_flags & vma->vm_flags))
+               return -EFAULT;
+
        ret = handle_mm_fault(mm, vma, address, fault_flags);
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
@@ -2293,6 +2316,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+/**
+ * vm_iomap_memory - remap memory to userspace
+ * @vma: user vma to map to
+ * @start: start of area
+ * @len: size of area
+ *
+ * This is a simplified io_remap_pfn_range() for common driver use. The
+ * driver just needs to give us the physical memory range to be mapped,
+ * we'll figure out the rest from the vma information.
+ *
+ * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
+ * whatever write-combining details or similar.
+ */
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+       unsigned long vm_len, pfn, pages;
+
+       /* Check that the physical memory area passed in looks valid */
+       if (start + len < start)
+               return -EINVAL;
+       /*
+        * You *really* shouldn't map things that aren't page-aligned,
+        * but we've historically allowed it because IO memory might
+        * just have smaller alignment.
+        */
+       len += start & ~PAGE_MASK;
+       pfn = start >> PAGE_SHIFT;
+       pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
+       if (pfn + pages < pfn)
+               return -EINVAL;
+
+       /* We start the mapping 'vm_pgoff' pages into the area */
+       if (vma->vm_pgoff > pages)
+               return -EINVAL;
+       pfn += vma->vm_pgoff;
+       pages -= vma->vm_pgoff;
+
+       /* Can we fit all of the mapping? */
+       vm_len = vma->vm_end - vma->vm_start;
+       if (vm_len >> PAGE_SHIFT > pages)
+               return -EINVAL;
+
+       /* Ok, let it rip */
+       return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
+
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data)
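Usage note (not part of the patch): a driver's ->mmap() handler passes the helper its physical window and lets vm_iomap_memory() validate the vma offset and size against it. A minimal sketch, where "mydrv"/"mydev" and the mmio_start/mmio_len fields are hypothetical names for illustration:

#include <linux/fs.h>
#include <linux/mm.h>

struct mydrv_device {
        phys_addr_t mmio_start;         /* physical base of the device window */
        unsigned long mmio_len;         /* size of the window in bytes */
};

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct mydrv_device *mydev = file->private_data;

        /* Optionally tweak caching first, as the NOTE above suggests */
        vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

        /* vm_pgoff and vma size checks happen inside the helper */
        return vm_iomap_memory(vma, mydev->mmio_start, mydev->mmio_len);
}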
@@ -2591,17 +2661,24 @@ reuse:
                if (!dirty_page)
                        return ret;
 
-               /*
-                * Yes, Virginia, this is actually required to prevent a race
-                * with clear_page_dirty_for_io() from clearing the page dirty
-                * bit after it clear all dirty ptes, but before a racing
-                * do_wp_page installs a dirty pte.
-                *
-                * __do_fault is protected similarly.
-                */
                if (!page_mkwrite) {
-                       wait_on_page_locked(dirty_page);
-                       set_page_dirty_balance(dirty_page, page_mkwrite);
+                       struct address_space *mapping;
+                       int dirtied;
+
+                       lock_page(dirty_page);
+                       dirtied = set_page_dirty(dirty_page);
+                       VM_BUG_ON(PageAnon(dirty_page));
+                       mapping = dirty_page->mapping;
+                       unlock_page(dirty_page);
+
+                       if (dirtied && mapping) {
+                               /*
+                                * Some device drivers do not set page.mapping
+                                * but still dirty their pages
+                                */
+                               balance_dirty_pages_ratelimited(mapping);
+                       }
+
                }
                put_page(dirty_page);
                if (page_mkwrite) {
@@ -3047,7 +3124,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
                if (prev && prev->vm_end == address)
                        return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
 
-               expand_downwards(vma, address - PAGE_SIZE);
+               return expand_downwards(vma, address - PAGE_SIZE);
        }
        if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
                struct vm_area_struct *next = vma->vm_next;
@@ -3056,7 +3133,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
                if (next && next->vm_start == address + PAGE_SIZE)
                        return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
 
-               expand_upwards(vma, address + PAGE_SIZE);
+               return expand_upwards(vma, address + PAGE_SIZE);
        }
        return 0;
 }
@@ -3458,6 +3535,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (unlikely(is_vm_hugetlb_page(vma)))
                return hugetlb_fault(mm, vma, address, flags);
 
+retry:
        pgd = pgd_offset(mm, address);
        pud = pud_alloc(mm, pgd, address);
        if (!pud)
@@ -3471,13 +3549,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                                          pmd, flags);
        } else {
                pmd_t orig_pmd = *pmd;
+               int ret;
+
                barrier();
                if (pmd_trans_huge(orig_pmd)) {
                        if (flags & FAULT_FLAG_WRITE &&
                            !pmd_write(orig_pmd) &&
-                           !pmd_trans_splitting(orig_pmd))
-                               return do_huge_pmd_wp_page(mm, vma, address,
-                                                          pmd, orig_pmd);
+                           !pmd_trans_splitting(orig_pmd)) {
+                               ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+                                                         orig_pmd);
+                               /*
+                                * If COW results in an oom, the huge pmd will
+                                * have been split, so retry the fault on the
+                                * pte for a smaller charge.
+                                */
+                               if (unlikely(ret & VM_FAULT_OOM))
+                                       goto retry;
+                               return ret;
+                       }
                        return 0;
                }
        }