[pandora-kernel.git] mm/memory.c
diff --git a/mm/memory.c b/mm/memory.c
index 22dfa61..90cea22 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2586,6 +2586,38 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
                copy_user_highpage(dst, src, va, vma);
 }
 
+/*
+ * Notify the address space that the page is about to become writable so that
+ * it can prohibit this or wait for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can sleep if it needs to.
+ */
+static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+              unsigned long address)
+{
+       struct vm_fault vmf;
+       int ret;
+
+       vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+       vmf.pgoff = page->index;
+       vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+       vmf.page = page;
+
+       ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+               return ret;
+       if (unlikely(!(ret & VM_FAULT_LOCKED))) {
+               lock_page(page);
+               if (!page->mapping) {
+                       unlock_page(page);
+                       return 0; /* retry */
+               }
+               ret |= VM_FAULT_LOCKED;
+       } else
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
+       return ret;
+}
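
The contract this helper establishes, which both callers below rely on: VM_FAULT_ERROR and VM_FAULT_NOPAGE bits are passed straight back; a return of 0 means the page lost its mapping while it was unlocked (a truncate raced with the fault) and the whole fault must be retried; on success VM_FAULT_LOCKED is set and the page is returned locked. A caller-side sketch of that contract (the do_wp_page() hunk below has the same shape):

        tmp = do_page_mkwrite(vma, page, address);
        if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                page_cache_release(page);       /* drop the reference we hold */
                return tmp;                     /* 0 => retry the fault */
        }
        /* success: VM_FAULT_LOCKED is set and the page is locked */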
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -2668,42 +2700,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * get_user_pages(.write=1, .force=1).
                 */
                if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
-                       struct vm_fault vmf;
                        int tmp;
-
-                       vmf.virtual_address = (void __user *)(address &
-                                                               PAGE_MASK);
-                       vmf.pgoff = old_page->index;
-                       vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
-                       vmf.page = old_page;
-
-                       /*
-                        * Notify the address space that the page is about to
-                        * become writable so that it can prohibit this or wait
-                        * for the page to get into an appropriate state.
-                        *
-                        * We do this without the lock held, so that it can
-                        * sleep if it needs to.
-                        */
                        page_cache_get(old_page);
                        pte_unmap_unlock(page_table, ptl);
-
-                       tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
-                       if (unlikely(tmp &
-                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-                               ret = tmp;
-                               goto unwritable_page;
+                       tmp = do_page_mkwrite(vma, old_page, address);
+                       if (unlikely(!tmp || (tmp &
+                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+                               page_cache_release(old_page);
+                               return tmp;
                        }
-                       if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
-                               lock_page(old_page);
-                               if (!old_page->mapping) {
-                                       ret = 0; /* retry the fault */
-                                       unlock_page(old_page);
-                                       goto unwritable_page;
-                               }
-                       } else
-                               VM_BUG_ON_PAGE(!PageLocked(old_page), old_page);
-
                        /*
                         * Since we dropped the lock we need to revalidate
                         * the PTE as someone else may have changed it.  If
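
For orientation, this notification is reachable directly from userspace: the first store to a clean page of a shared file mapping takes a write fault, and a filesystem that implements ->page_mkwrite (ext4, for example) is consulted through do_page_mkwrite() before the PTE is made writable, whether the fault arrives here in do_wp_page() (page already mapped read-only) or through do_shared_fault() below. A minimal, illustrative program; file name and size are arbitrary:

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("mkwrite-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);
                if (fd < 0 || ftruncate(fd, 4096) < 0) {
                        perror("setup");
                        return 1;
                }
                char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                               MAP_SHARED, fd, 0);
                if (p == MAP_FAILED) {
                        perror("mmap");
                        return 1;
                }
                p[0] = 'x';              /* write fault -> ->page_mkwrite */
                msync(p, 4096, MS_SYNC); /* write the dirtied page back */
                munmap(p, 4096);
                close(fd);
                return 0;
        }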
@@ -2748,7 +2753,7 @@ reuse:
                 * bit after it clear all dirty ptes, but before a racing
                 * do_wp_page installs a dirty pte.
                 *
-                * __do_fault is protected similarly.
+                * do_shared_fault is protected similarly.
                 */
                if (!page_mkwrite) {
                        wait_on_page_locked(dirty_page);
@@ -2892,10 +2897,6 @@ oom:
        if (old_page)
                page_cache_release(old_page);
        return VM_FAULT_OOM;
-
-unwritable_page:
-       page_cache_release(old_page);
-       return ret;
 }
 
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -3286,53 +3287,11 @@ oom:
        return VM_FAULT_OOM;
 }
 
-/*
- * __do_fault() tries to create a new page mapping. It aggressively
- * tries to share with existing pages, but makes a separate copy if
- * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
- * the next page fault.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte neither mapped nor locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- */
-static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pmd_t *pmd,
-               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+static int __do_fault(struct vm_area_struct *vma, unsigned long address,
+               pgoff_t pgoff, unsigned int flags, struct page **page)
 {
-       pte_t *page_table;
-       spinlock_t *ptl;
-       struct page *page;
-       struct page *cow_page;
-       pte_t entry;
-       int anon = 0;
-       struct page *dirty_page = NULL;
        struct vm_fault vmf;
        int ret;
-       int page_mkwrite = 0;
-
-       /*
-        * If we do COW later, allocate page befor taking lock_page()
-        * on the file cache page. This will reduce lock holding time.
-        */
-       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-
-               if (unlikely(anon_vma_prepare(vma)))
-                       return VM_FAULT_OOM;
-
-               cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-               if (!cow_page)
-                       return VM_FAULT_OOM;
-
-               if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
-                       page_cache_release(cow_page);
-                       return VM_FAULT_OOM;
-               }
-       } else
-               cow_page = NULL;
 
        vmf.virtual_address = (void __user *)(address & PAGE_MASK);
        vmf.pgoff = pgoff;
@@ -3340,151 +3299,176 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        vmf.page = NULL;
 
        ret = vma->vm_ops->fault(vma, &vmf);
-       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
-                           VM_FAULT_RETRY)))
-               goto uncharge_out;
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               return ret;
 
        if (unlikely(PageHWPoison(vmf.page))) {
                if (ret & VM_FAULT_LOCKED)
                        unlock_page(vmf.page);
-               ret = VM_FAULT_HWPOISON;
                page_cache_release(vmf.page);
-               goto uncharge_out;
+               return VM_FAULT_HWPOISON;
        }
 
-       /*
-        * For consistency in subsequent calls, make the faulted page always
-        * locked.
-        */
        if (unlikely(!(ret & VM_FAULT_LOCKED)))
                lock_page(vmf.page);
        else
                VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
-       /*
-        * Should we do an early C-O-W break?
-        */
-       page = vmf.page;
-       if (flags & FAULT_FLAG_WRITE) {
-               if (!(vma->vm_flags & VM_SHARED)) {
-                       page = cow_page;
-                       anon = 1;
-                       copy_user_highpage(page, vmf.page, address, vma);
-                       __SetPageUptodate(page);
-               } else {
-                       /*
-                        * If the page will be shareable, see if the backing
-                        * address space wants to know that the page is about
-                        * to become writable
-                        */
-                       if (vma->vm_ops->page_mkwrite) {
-                               int tmp;
-
-                               unlock_page(page);
-                               vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
-                               tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
-                               if (unlikely(tmp &
-                                         (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-                                       ret = tmp;
-                                       goto unwritable_page;
-                               }
-                               if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
-                                       lock_page(page);
-                                       if (!page->mapping) {
-                                               ret = 0; /* retry the fault */
-                                               unlock_page(page);
-                                               goto unwritable_page;
-                                       }
-                               } else
-                                       VM_BUG_ON_PAGE(!PageLocked(page), page);
-                               page_mkwrite = 1;
-                       }
-               }
+       *page = vmf.page;
+       return ret;
+}
 
+static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+               struct page *page, pte_t *pte, bool write, bool anon)
+{
+       pte_t entry;
+
+       flush_icache_page(vma, page);
+       entry = mk_pte(page, vma->vm_page_prot);
+       if (write)
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
+               entry = pte_mksoft_dirty(entry);
+       if (anon) {
+               inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+               page_add_new_anon_rmap(page, vma, address);
+       } else {
+               inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
+               page_add_file_rmap(page);
        }
+       set_pte_at(vma->vm_mm, address, pte, entry);
 
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       /* no need to invalidate: a not-present page won't be cached */
+       update_mmu_cache(vma, address, pte);
+}
 
-       /*
-        * This silly early PAGE_DIRTY setting removes a race
-        * due to the bad i386 page protection. But it's valid
-        * for other architectures too.
-        *
-        * Note that if FAULT_FLAG_WRITE is set, we either now have
-        * an exclusive copy of the page, or this is a shared mapping,
-        * so we can make it writable and dirty to avoid having to
-        * handle that later.
-        */
-       /* Only go through if we didn't race with anybody else... */
-       if (likely(pte_same(*page_table, orig_pte))) {
-               flush_icache_page(vma, page);
-               entry = mk_pte(page, vma->vm_page_prot);
-               if (flags & FAULT_FLAG_WRITE)
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte))
-                       pte_mksoft_dirty(entry);
-               if (anon) {
-                       inc_mm_counter_fast(mm, MM_ANONPAGES);
-                       page_add_new_anon_rmap(page, vma, address);
-               } else {
-                       inc_mm_counter_fast(mm, MM_FILEPAGES);
-                       page_add_file_rmap(page);
-                       if (flags & FAULT_FLAG_WRITE) {
-                               dirty_page = page;
-                               get_page(dirty_page);
-                       }
-               }
-               set_pte_at(mm, address, page_table, entry);
+static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmd,
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+       struct page *fault_page;
+       spinlock_t *ptl;
+       pte_t *pte;
+       int ret;
 
-               /* no need to invalidate: a not-present page won't be cached */
-               update_mmu_cache(vma, address, page_table);
-       } else {
-               if (cow_page)
-                       mem_cgroup_uncharge_page(cow_page);
-               if (anon)
-                       page_cache_release(page);
-               else
-                       anon = 1; /* no anon but release faulted_page */
+       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               return ret;
+
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (unlikely(!pte_same(*pte, orig_pte))) {
+               pte_unmap_unlock(pte, ptl);
+               unlock_page(fault_page);
+               page_cache_release(fault_page);
+               return ret;
        }
+       do_set_pte(vma, address, fault_page, pte, false, false);
+       pte_unmap_unlock(pte, ptl);
+       unlock_page(fault_page);
+       return ret;
+}
 
-       pte_unmap_unlock(page_table, ptl);
+static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmd,
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+       struct page *fault_page, *new_page;
+       spinlock_t *ptl;
+       pte_t *pte;
+       int ret;
 
-       if (dirty_page) {
-               struct address_space *mapping = page->mapping;
-               int dirtied = 0;
+       if (unlikely(anon_vma_prepare(vma)))
+               return VM_FAULT_OOM;
 
-               if (set_page_dirty(dirty_page))
-                       dirtied = 1;
-               unlock_page(dirty_page);
-               put_page(dirty_page);
-               if ((dirtied || page_mkwrite) && mapping) {
-                       /*
-                        * Some device drivers do not set page.mapping but still
-                        * dirty their pages
-                        */
-                       balance_dirty_pages_ratelimited(mapping);
-               }
+       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+       if (!new_page)
+               return VM_FAULT_OOM;
 
-               /* file_update_time outside page_lock */
-               if (vma->vm_file && !page_mkwrite)
-                       file_update_time(vma->vm_file);
-       } else {
-               unlock_page(vmf.page);
-               if (anon)
-                       page_cache_release(vmf.page);
+       if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+               page_cache_release(new_page);
+               return VM_FAULT_OOM;
        }
 
-       return ret;
+       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               goto uncharge_out;
 
-unwritable_page:
-       page_cache_release(page);
+       copy_user_highpage(new_page, fault_page, address, vma);
+       __SetPageUptodate(new_page);
+
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (unlikely(!pte_same(*pte, orig_pte))) {
+               pte_unmap_unlock(pte, ptl);
+               unlock_page(fault_page);
+               page_cache_release(fault_page);
+               goto uncharge_out;
+       }
+       do_set_pte(vma, address, new_page, pte, true, true);
+       pte_unmap_unlock(pte, ptl);
+       unlock_page(fault_page);
+       page_cache_release(fault_page);
        return ret;
 uncharge_out:
-       /* fs's fault handler get error */
-       if (cow_page) {
-               mem_cgroup_uncharge_page(cow_page);
-               page_cache_release(cow_page);
+       mem_cgroup_uncharge_page(new_page);
+       page_cache_release(new_page);
+       return ret;
+}
+
+static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmd,
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+       struct page *fault_page;
+       struct address_space *mapping;
+       spinlock_t *ptl;
+       pte_t *pte;
+       int dirtied = 0;
+       int ret, tmp;
+
+       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               return ret;
+
+       /*
+        * Check if the backing address space wants to know that the page is
+        * about to become writable
+        */
+       if (vma->vm_ops->page_mkwrite) {
+               unlock_page(fault_page);
+               tmp = do_page_mkwrite(vma, fault_page, address);
+               if (unlikely(!tmp ||
+                               (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+                       page_cache_release(fault_page);
+                       return tmp;
+               }
        }
+
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (unlikely(!pte_same(*pte, orig_pte))) {
+               pte_unmap_unlock(pte, ptl);
+               unlock_page(fault_page);
+               page_cache_release(fault_page);
+               return ret;
+       }
+       do_set_pte(vma, address, fault_page, pte, true, false);
+       pte_unmap_unlock(pte, ptl);
+
+       if (set_page_dirty(fault_page))
+               dirtied = 1;
+       mapping = fault_page->mapping;
+       unlock_page(fault_page);
+       if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
+               /*
+                * Some device drivers do not set page.mapping but still
+                * dirty their pages
+                */
+               balance_dirty_pages_ratelimited(mapping);
+       }
+
+       /* file_update_time outside page_lock */
+       if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+               file_update_time(vma->vm_file);
+
        return ret;
 }
 
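
The user-visible effect of the do_cow_fault() path: a store through a MAP_PRIVATE file mapping is served from a freshly allocated anonymous copy, so the backing file never sees the write. A small illustrative program (file name arbitrary):

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[4] = "";
                int fd = open("cow-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);
                if (fd < 0 || write(fd, "old", 3) != 3 ||
                    ftruncate(fd, 4096) < 0)
                        return 1;
                char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE, fd, 0);
                if (p == MAP_FAILED)
                        return 1;
                p[0] = 'X';     /* first-touch write -> do_cow_fault() */
                if (pread(fd, buf, 3, 0) == 3)
                        printf("file still holds \"%s\" after the private write\n",
                               buf);
                munmap(p, 4096);
                close(fd);
                return 0;
        }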
@@ -3496,7 +3480,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
        pte_unmap(page_table);
-       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+       if (!(flags & FAULT_FLAG_WRITE))
+               return do_read_fault(mm, vma, address, pmd, pgoff, flags,
+                               orig_pte);
+       if (!(vma->vm_flags & VM_SHARED))
+               return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
+                               orig_pte);
+       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
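
Note that the split is decided by the access that faults, not by the mapping alone: a private page that is read first gets mapped read-only by do_read_fault(), and a later store is then broken out by do_wp_page(), two minor faults in total, whereas a first-touch store goes straight to do_cow_fault() in one. A rough userspace sketch using the process minor-fault counter (the counter can pick up unrelated faults, so treat it as illustrative only):

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <sys/resource.h>
        #include <unistd.h>

        static long minflt(void)
        {
                struct rusage ru;

                getrusage(RUSAGE_SELF, &ru);
                return ru.ru_minflt;
        }

        int main(void)
        {
                int fd = open("fault-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);
                if (fd < 0 || ftruncate(fd, 4096) < 0)
                        return 1;
                volatile char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                        MAP_PRIVATE, fd, 0);
                if (p == (volatile char *)MAP_FAILED)
                        return 1;
                long a = minflt();
                (void)p[0];     /* read access -> do_read_fault(), mapped RO */
                long b = minflt();
                p[0] = 'x';     /* later store -> do_wp_page() breaks COW */
                long c = minflt();
                printf("read: +%ld fault(s), write: +%ld fault(s)\n",
                       b - a, c - b);
                return 0;
        }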
 /*
@@ -3528,10 +3518,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        pgoff = pte_to_pgoff(orig_pte);
-       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+       if (!(flags & FAULT_FLAG_WRITE))
+               return do_read_fault(mm, vma, address, pmd, pgoff, flags,
+                               orig_pte);
+       if (!(vma->vm_flags & VM_SHARED))
+               return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
+                               orig_pte);
+       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
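
do_nonlinear_fault() serves VM_NONLINEAR mappings, in which remap_file_pages(2) has rearranged file offsets inside a single VMA and the offset must be recovered from the pte_file() encoding rather than computed from the faulting address (the syscall is native in this tree; later kernels deprecated and then emulated it). A sketch of how such a mapping is set up, with error handling trimmed:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                long pg = sysconf(_SC_PAGESIZE);
                int fd = open("nonlinear-demo", O_RDWR | O_CREAT | O_TRUNC,
                              0644);
                if (fd < 0 || ftruncate(fd, 4 * pg) < 0)
                        return 1;
                char *p = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
                               MAP_SHARED, fd, 0);
                if (p == MAP_FAILED)
                        return 1;
                /*
                 * Put file page 3 at the start of the VMA: the VMA becomes
                 * VM_NONLINEAR, and a later touch of p[0] resolves its
                 * pgoff through do_nonlinear_fault().
                 */
                if (remap_file_pages(p, pg, 0, 3, 0) < 0) {
                        perror("remap_file_pages");
                        return 1;
                }
                p[0] = 'z';
                munmap(p, 4 * pg);
                close(fd);
                return 0;
        }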
-int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
                                unsigned long addr, int page_nid,
                                int *flags)
 {
@@ -3546,7 +3542,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
        return mpol_misplaced(page, vma, addr);
 }
 
-int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
 {
        struct page *page = NULL;