[pandora-kernel.git] mm/memory.c
diff --git a/mm/memory.c b/mm/memory.c
index 7762b1d..4771663 100644
@@ -1403,6 +1403,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
        tlb_finish_mmu(&tlb, address, end);
        return end;
 }
+EXPORT_SYMBOL_GPL(zap_page_range);
 
 /**
  * zap_vma_ptes - remove ptes mapping the vma
@@ -1427,6 +1428,24 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
+                                       unsigned int flags)
+{
+       if (pte_write(pte))
+               return true;
+
+       /*
+        * Make sure that we are really following a COWed page. We do not
+        * have to care about exclusiveness of the page; we only want to
+        * ensure that the once-COWed page hasn't disappeared in the
+        * meantime and hasn't been merged into a KSM page.
+        */
+       if ((flags & FOLL_FORCE) && (flags & FOLL_COW))
+               return page && PageAnon(page) && !PageKsm(page);
+
+       return false;
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
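
Illustrative note, not part of the patch: the can_follow_write_pte() check above
is hit whenever get_user_pages() is asked to write through a read-only private
mapping with FOLL_FORCE, which is exactly what a write to /proc/<pid>/mem does.
A minimal userspace sketch of that path (assumes a Linux host with /proc
mounted; error handling trimmed):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        /* Read-only private mapping: a direct store here would SIGSEGV. */
        char *map = mmap(NULL, 4096, PROT_READ,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        int fd = open("/proc/self/mem", O_RDWR);

        if (map == MAP_FAILED || fd < 0)
                return 1;

        /*
         * The kernel services this write with FOLL_FORCE|FOLL_WRITE: it
         * breaks COW on the private page, and every later lookup of that
         * page has to pass can_follow_write_pte().
         */
        const char msg[] = "hello";
        if (pwrite(fd, msg, sizeof(msg), (off_t)(uintptr_t)map) < 0)
                perror("pwrite");

        printf("%s\n", map);    /* reads back the COWed anonymous copy */
        close(fd);
        return 0;
}
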
@@ -1509,10 +1528,13 @@ split_fallthrough:
        pte = *ptep;
        if (!pte_present(pte))
                goto no_page;
-       if ((flags & FOLL_WRITE) && !pte_write(pte))
-               goto unlock;
 
        page = vm_normal_page(vma, address, pte);
+       if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, flags)) {
+               pte_unmap_unlock(ptep, ptl);
+               return NULL;
+       }
+
        if (unlikely(!page)) {
                if ((flags & FOLL_DUMP) ||
                    !is_zero_pfn(pte_pfn(pte)))
@@ -1555,7 +1577,7 @@ split_fallthrough:
                        unlock_page(page);
                }
        }
-unlock:
+
        pte_unmap_unlock(ptep, ptl);
 out:
        return page;
@@ -1584,12 +1606,6 @@ no_page_table:
        return page;
 }
 
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
-       return stack_guard_page_start(vma, addr) ||
-              stack_guard_page_end(vma, addr+PAGE_SIZE);
-}
-
 /**
  * __get_user_pages() - pin user pages in memory
  * @tsk:       task_struct of target task
@@ -1740,11 +1756,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                int ret;
                                unsigned int fault_flags = 0;
 
-                               /* For mlock, just skip the stack guard page. */
-                               if (foll_flags & FOLL_MLOCK) {
-                                       if (stack_guard_page(vma, start))
-                                               goto next_page;
-                               }
                                if (foll_flags & FOLL_WRITE)
                                        fault_flags |= FAULT_FLAG_WRITE;
                                if (nonblocking)
@@ -1789,17 +1800,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                 * The VM_FAULT_WRITE bit tells us that
                                 * do_wp_page has broken COW when necessary,
                                 * even if maybe_mkwrite decided not to set
-                                * pte_write. We can thus safely do subsequent
-                                * page lookups as if they were reads. But only
-                                * do so when looping for pte_write is futile:
-                                * in some cases userspace may also be wanting
-                                * to write to the gotten user page, which a
-                                * read fault here might prevent (a readonly
-                                * page might get reCOWed by userspace write).
+                                * pte_write. We cannot simply drop FOLL_WRITE
+                                * here because the COWed page might be gone by
+                                * the time we do the subsequent page lookups.
                                 */
                                if ((ret & VM_FAULT_WRITE) &&
                                    !(vma->vm_flags & VM_WRITE))
-                                       foll_flags &= ~FOLL_WRITE;
+                                       foll_flags |= FOLL_COW;
 
                                cond_resched();
                        }
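
To make the FOLL_COW handling above concrete, here is a standalone model of the
decision the retry loop takes once do_wp_page() has reported VM_FAULT_WRITE in
a !VM_WRITE vma. This is a sketch only: the flag values are illustrative and
can_follow_write() is a cut-down stand-in for can_follow_write_pte().

#include <stdbool.h>
#include <stdio.h>

#define FOLL_WRITE      0x01            /* illustrative values */
#define FOLL_FORCE      0x10
#define FOLL_COW        0x4000
#define VM_WRITE        0x2
#define VM_FAULT_WRITE  0x8

/* Stand-in for can_follow_write_pte() on a write-protected pte. */
static bool can_follow_write(bool page_anon, unsigned int flags)
{
        if ((flags & FOLL_FORCE) && (flags & FOLL_COW))
                return page_anon;       /* only the COWed anon page passes */
        return false;
}

int main(void)
{
        unsigned int foll_flags = FOLL_WRITE | FOLL_FORCE;
        unsigned long vm_flags = 0;     /* read-only vma: no VM_WRITE */
        int ret = VM_FAULT_WRITE;       /* do_wp_page() broke COW */

        /* Old code dropped FOLL_WRITE here; new code remembers the COW. */
        if ((ret & VM_FAULT_WRITE) && !(vm_flags & VM_WRITE))
                foll_flags |= FOLL_COW;

        /* On the follow_page() retry the COWed anon page is accepted, but a
         * page that replaced it in the meantime is not, which forces another
         * COW fault instead of a silent write-through. */
        printf("anon COW page followed: %d\n", can_follow_write(true, foll_flags));
        printf("replacement page followed: %d\n", can_follow_write(false, foll_flags));
        return 0;
}
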
@@ -3104,40 +3111,6 @@ out_release:
        return ret;
 }
 
-/*
- * This is like a special single-page "expand_{down|up}wards()",
- * except we must first make sure that 'address{-|+}PAGE_SIZE'
- * doesn't hit another vma.
- */
-static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
-{
-       address &= PAGE_MASK;
-       if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
-               struct vm_area_struct *prev = vma->vm_prev;
-
-               /*
-                * Is there a mapping abutting this one below?
-                *
-                * That's only ok if it's the same stack mapping
-                * that has gotten split..
-                */
-               if (prev && prev->vm_end == address)
-                       return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
-
-               return expand_downwards(vma, address - PAGE_SIZE);
-       }
-       if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
-               struct vm_area_struct *next = vma->vm_next;
-
-               /* As VM_GROWSDOWN but s/below/above/ */
-               if (next && next->vm_start == address + PAGE_SIZE)
-                       return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
-
-               return expand_upwards(vma, address + PAGE_SIZE);
-       }
-       return 0;
-}
-
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3157,10 +3130,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;
 
-       /* Check if we need to add a guard page to the stack */
-       if (check_stack_guard_page(vma, address) < 0)
-               return VM_FAULT_SIGSEGV;
-
        /* Use the zero-page for reads */
        if (!(flags & FAULT_FLAG_WRITE)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
@@ -3558,8 +3527,9 @@ retry:
 
                barrier();
                if (pmd_trans_huge(orig_pmd)) {
-                       if (flags & FAULT_FLAG_WRITE &&
-                           !pmd_write(orig_pmd) &&
+                       unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+                       if (dirty && !pmd_write(orig_pmd) &&
                            !pmd_trans_splitting(orig_pmd)) {
                                ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                                                          orig_pmd);
@@ -3571,6 +3541,9 @@ retry:
                                if (unlikely(ret & VM_FAULT_OOM))
                                        goto retry;
                                return ret;
+                       } else {
+                               huge_pmd_set_accessed(mm, vma, address, pmd,
+                                                     orig_pmd, dirty);
                        }
                        return 0;
                }
@@ -3583,8 +3556,18 @@ retry:
         */
        if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
                return VM_FAULT_OOM;
-       /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd)))
+       /*
+        * If a huge pmd materialized under us just retry later.  Use
+        * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+        * didn't become pmd_trans_huge under us and then back to pmd_none, as
+        * a result of MADV_DONTNEED running immediately after a huge pmd fault
+        * in a different thread of this mm, in turn leading to a misleading
+        * pmd_trans_huge() retval.  All we have to ensure is that it is a
+        * regular pmd that we can walk with pte_offset_map() and we can do that
+        * through an atomic read in C, which is what pmd_trans_unstable()
+        * provides.
+        */
+       if (unlikely(pmd_trans_unstable(pmd)))
                return 0;
        /*
         * A regular pmd is established and it can't morph into a huge pmd
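
A toy model of why the snapshot-based test matters (sketch only: pmd_t is
reduced to an unsigned long, and the bad-pmd handling that the real
pmd_trans_unstable()/pmd_none_or_trans_huge_or_clear_bad() pair performs is
omitted):

#include <stdio.h>

typedef unsigned long pmd_t;

#define PMD_NONE        0UL
#define PMD_TRANS_HUGE  (1UL << 1)

static int model_pmd_none(pmd_t v)       { return v == PMD_NONE; }
static int model_pmd_trans_huge(pmd_t v) { return (v & PMD_TRANS_HUGE) != 0; }

/* Read the live entry once, then classify only that snapshot. */
static int model_pmd_trans_unstable(pmd_t *pmd)
{
        pmd_t val = *pmd;

        return model_pmd_none(val) || model_pmd_trans_huge(val);
}

int main(void)
{
        pmd_t pmd = PMD_TRANS_HUGE;

        /* MADV_DONTNEED in another thread zaps the huge pmd right after
         * the huge fault, before handle_mm_fault() re-checks it. */
        pmd = PMD_NONE;

        /* The old test only rejects a pmd that is still trans_huge, so
         * this now-none pmd would fall through to pte_offset_map(). */
        printf("old check retries: %d\n", model_pmd_trans_huge(pmd));

        /* The new test treats both none and trans_huge as unstable and
         * simply retries the fault. */
        printf("new check retries: %d\n", model_pmd_trans_unstable(&pmd));
        return 0;
}
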
@@ -3875,7 +3858,11 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                        vma = find_vma(mm, addr);
                        if (!vma || vma->vm_start > addr)
                                break;
-                       if (vma->vm_ops && vma->vm_ops->access)
+                       if ((vma->vm_flags & VM_PFNMAP) &&
+                           !(vma->vm_flags & VM_IO))
+                               ret = generic_access_phys(vma, addr, buf,
+                                                         len, write);
+                       if (ret <= 0 && vma->vm_ops && vma->vm_ops->access)
                                ret = vma->vm_ops->access(vma, addr, buf,
                                                          len, write);
                        if (ret <= 0)
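
For context, not part of the patch: __access_remote_vm() is what ultimately
services ptrace peeks and /proc/<pid>/mem I/O. The branch added above only
matters when get_user_pages() fails because the address sits in a VM_PFNMAP,
non-VM_IO mapping (typically inserted by a driver via remap_pfn_range()) that
has no ->access hook; plain userspace cannot create such a mapping, but the
sketch below shows how the function is reached in the first place:

#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static const char secret[] = "remote data";

int main(void)
{
        pid_t child = fork();

        if (child == 0) {               /* target process: just wait */
                pause();
                _exit(0);
        }

        /* Attach so the /proc/<pid>/mem read passes the ptrace check. */
        if (ptrace(PTRACE_ATTACH, child, NULL, NULL) < 0)
                return 1;
        waitpid(child, NULL, 0);

        char path[64], buf[sizeof(secret)] = "";
        snprintf(path, sizeof(path), "/proc/%d/mem", (int)child);

        /* This pread() is serviced by __access_remote_vm() on the
         * child's mm, at the same virtual address we see here. */
        int fd = open(path, O_RDONLY);
        if (fd >= 0 &&
            pread(fd, buf, sizeof(buf), (off_t)(uintptr_t)secret) > 0)
                printf("read from child: %s\n", buf);

        kill(child, SIGKILL);
        waitpid(child, NULL, 0);
        return 0;
}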