diff --git a/mm/memory.c b/mm/memory.c
index b2b8731..6325103 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,7 +47,7 @@
 #include <linux/pagemap.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/delayacct.h>
 #include <linux/init.h>
 #include <linux/writeback.h>
@@ -205,10 +205,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
                return 1;
        }
 
+       if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+               return 0;
+
        batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
        if (!batch)
                return 0;
 
+       tlb->batch_count++;
        batch->next = NULL;
        batch->nr   = 0;
        batch->max  = MAX_GATHER_BATCH;
@@ -235,6 +239,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
        tlb->local.nr   = 0;
        tlb->local.max  = ARRAY_SIZE(tlb->__pages);
        tlb->active     = &tlb->local;
+       tlb->batch_count = 0;
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
        tlb->batch = NULL;
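
Note: the two hunks above cap how many page batches one mmu_gather can queue before a TLB flush. The cap is defined next to MAX_GATHER_BATCH; a sketch of the pairing as in the matching mainline change to include/asm-generic/tlb.h (exact values assumed, they are not visible in this diff):

	/* sketch, not part of this diff: a batch page carries MAX_GATHER_BATCH
	 * page pointers, so the count cap bounds the pages gathered between
	 * flushes to roughly 10000, keeping zap latency finite on
	 * !CONFIG_PREEMPT kernels */
	#define MAX_GATHER_BATCH \
		((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))
	#define MAX_GATHER_BATCH_COUNT	(10000UL / MAX_GATHER_BATCH)
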
@@ -865,20 +870,20 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                if (!pte_file(pte)) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
 
-                       if (swap_duplicate(entry) < 0)
-                               return entry.val;
-
-                       /* make sure dst_mm is on swapoff's mmlist. */
-                       if (unlikely(list_empty(&dst_mm->mmlist))) {
-                               spin_lock(&mmlist_lock);
-                               if (list_empty(&dst_mm->mmlist))
-                                       list_add(&dst_mm->mmlist,
-                                                &src_mm->mmlist);
-                               spin_unlock(&mmlist_lock);
-                       }
-                       if (likely(!non_swap_entry(entry)))
+                       if (likely(!non_swap_entry(entry))) {
+                               if (swap_duplicate(entry) < 0)
+                                       return entry.val;
+
+                               /* make sure dst_mm is on swapoff's mmlist. */
+                               if (unlikely(list_empty(&dst_mm->mmlist))) {
+                                       spin_lock(&mmlist_lock);
+                                       if (list_empty(&dst_mm->mmlist))
+                                               list_add(&dst_mm->mmlist,
+                                                        &src_mm->mmlist);
+                                       spin_unlock(&mmlist_lock);
+                               }
                                rss[MM_SWAPENTS]++;
-                       else if (is_write_migration_entry(entry) &&
+                       } else if (is_write_migration_entry(entry) &&
                                        is_cow_mapping(vm_flags)) {
                                /*
                                 * COW mappings require pages in both parent
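
Note: the reorder above ensures swap_duplicate() and the mmlist hookup only run for true swap ptes; migration and HWPoison entries reuse the swp_entry_t encoding with types above MAX_SWAPFILES. For reference, the discriminator, paraphrased from include/linux/swapops.h:

	static inline int non_swap_entry(swp_entry_t entry)
	{
		/* special types (migration, HWPoison) sit above MAX_SWAPFILES */
		return swp_type(entry) >= MAX_SWAPFILES;
	}
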
@@ -1173,8 +1178,10 @@ again:
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
                        force_flush = !__tlb_remove_page(tlb, page);
-                       if (force_flush)
+                       if (force_flush) {
+                               addr += PAGE_SIZE;
                                break;
+                       }
                        continue;
                }
                /*
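
Note: the new `addr += PAGE_SIZE` matters because after the break `addr` does double duty: it bounds the TLB invalidation (the end is exclusive) and it is where the `again:` loop from the hunk header resumes. A condensed paraphrase of the zap_pte_range() epilogue that consumes it:

	/* condensed paraphrase, not the verbatim function tail */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu(tlb);
		if (addr != end)
			goto again;	/* resume past the just-unmapped page;
					 * without the bump its TLB entry would
					 * escape the exclusive flush range */
	}
	return addr;
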
@@ -1228,16 +1235,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
-                       if (next-addr != HPAGE_PMD_SIZE) {
+                       if (next - addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
                                split_huge_page_pmd(vma->vm_mm, pmd);
                        } else if (zap_huge_pmd(tlb, vma, pmd))
-                               continue;
+                               goto next;
                        /* fall through */
                }
-               if (pmd_none_or_clear_bad(pmd))
-                       continue;
+               /*
+                * Here there can be other concurrent MADV_DONTNEED or
+                * trans huge page faults running, and if the pmd is
+                * none or trans huge it can change under us. This is
+                * because MADV_DONTNEED holds the mmap_sem in read
+                * mode.
+                */
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+                       goto next;
                next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
                cond_resched();
        } while (pmd++, addr = next, addr != end);
 
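
Note: pmd_none_or_trans_huge_or_clear_bad() is what makes the MADV_DONTNEED race described above safe: unlike pmd_none_or_clear_bad(), it reads the pmd once into a local, so a concurrent none <-> trans-huge transition cannot trip pmd_bad() on a half-observed value. A paraphrase of the helper (body assumed from the matching mainline fix in asm-generic/pgtable.h):

	static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
	{
		pmd_t pmdval = *pmd;	/* one read; upstream adds barrier() */

		if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
			return 1;	/* skip: empty, or the THP code's business */
		if (unlikely(pmd_bad(pmdval))) {
			pmd_clear_bad(pmd);
			return 1;
		}
		return 0;
	}
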
@@ -1350,8 +1365,11 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
                                 * Since no pte has actually been setup, it is
                                 * safe to do nothing in this case.
                                 */
-                               if (vma->vm_file)
-                                       unmap_hugepage_range(vma, start, end, NULL);
+                               if (vma->vm_file) {
+                                       mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+                                       __unmap_hugepage_range_final(vma, start, end, NULL);
+                                       mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+                               }
 
                                start = end;
                        } else
@@ -1409,6 +1427,24 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
+                                       unsigned int flags)
+{
+       if (pte_write(pte))
+               return true;
+
+       /*
+        * Make sure that we are really following CoWed page. We do not really
+        * have to care about exclusiveness of the page because we only want
+        * to ensure that once COWed page hasn't disappeared in the meantime
+        * or it hasn't been merged to a KSM page.
+        */
+       if ((flags & FOLL_FORCE) && (flags & FOLL_COW))
+               return page && PageAnon(page) && !PageKsm(page);
+
+       return false;
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -1491,10 +1527,13 @@ split_fallthrough:
        pte = *ptep;
        if (!pte_present(pte))
                goto no_page;
-       if ((flags & FOLL_WRITE) && !pte_write(pte))
-               goto unlock;
 
        page = vm_normal_page(vma, address, pte);
+       if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, flags)) {
+               pte_unmap_unlock(ptep, ptl);
+               return NULL;
+       }
+
        if (unlikely(!page)) {
                if ((flags & FOLL_DUMP) ||
                    !is_zero_pfn(pte_pfn(pte)))
@@ -1537,7 +1576,7 @@ split_fallthrough:
                        unlock_page(page);
                }
        }
-unlock:
+
        pte_unmap_unlock(ptep, ptl);
 out:
        return page;
@@ -1566,12 +1605,6 @@ no_page_table:
        return page;
 }
 
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
-       return stack_guard_page_start(vma, addr) ||
-              stack_guard_page_end(vma, addr+PAGE_SIZE);
-}
-
 /**
  * __get_user_pages() - pin user pages in memory
  * @tsk:       task_struct of target task
@@ -1722,11 +1755,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                int ret;
                                unsigned int fault_flags = 0;
 
-                               /* For mlock, just skip the stack guard page. */
-                               if (foll_flags & FOLL_MLOCK) {
-                                       if (stack_guard_page(vma, start))
-                                               goto next_page;
-                               }
                                if (foll_flags & FOLL_WRITE)
                                        fault_flags |= FAULT_FLAG_WRITE;
                                if (nonblocking)
@@ -1749,7 +1777,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                                else
                                                        return -EFAULT;
                                        }
-                                       if (ret & VM_FAULT_SIGBUS)
+                                       if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                                                return i ? i : -EFAULT;
                                        BUG();
                                }
@@ -1771,17 +1799,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                 * The VM_FAULT_WRITE bit tells us that
                                 * do_wp_page has broken COW when necessary,
                                 * even if maybe_mkwrite decided not to set
-                                * pte_write. We can thus safely do subsequent
-                                * page lookups as if they were reads. But only
-                                * do so when looping for pte_write is futile:
-                                * in some cases userspace may also be wanting
-                                * to write to the gotten user page, which a
-                                * read fault here might prevent (a readonly
-                                * page might get reCOWed by userspace write).
+                                * pte_write. We cannot simply drop FOLL_WRITE
+                                * here because the COWed page might be gone by
+                                * the time we do the subsequent page lookups.
                                 */
                                if ((ret & VM_FAULT_WRITE) &&
                                    !(vma->vm_flags & VM_WRITE))
-                                       foll_flags &= ~FOLL_WRITE;
+                                       foll_flags |= FOLL_COW;
 
                                cond_resched();
                        }
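
Note: together with the can_follow_write_pte()/follow_page() hunks above, this closes the get_user_pages() COW write race (CVE-2016-5195, "Dirty COW"): instead of dropping FOLL_WRITE after the fault, the loop records that COW already happened. Schematically (paraphrase, not verbatim kernel code):

	/* 1. the fault breaks COW; the pte may still be read-only */
	foll_flags |= FOLL_COW;

	/* 2. the retried lookup honours FOLL_WRITE on a read-only pte only
	 *    while the COWed anonymous, non-KSM page is still in place;
	 *    if it was reclaimed or KSM-merged we fault again instead of
	 *    writing through a stale mapping. */
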
@@ -1836,19 +1860,24 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long address, unsigned int fault_flags)
 {
        struct vm_area_struct *vma;
+       vm_flags_t vm_flags;
        int ret;
 
        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start)
                return -EFAULT;
 
+       vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+       if (!(vm_flags & vma->vm_flags))
+               return -EFAULT;
+
        ret = handle_mm_fault(mm, vma, address, fault_flags);
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
                if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                        return -EHWPOISON;
-               if (ret & VM_FAULT_SIGBUS)
+               if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                        return -EFAULT;
                BUG();
        }
@@ -2293,6 +2322,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+/**
+ * vm_iomap_memory - remap memory to userspace
+ * @vma: user vma to map to
+ * @start: start of area
+ * @len: size of area
+ *
+ * This is a simplified io_remap_pfn_range() for common driver use. The
+ * driver just needs to give us the physical memory range to be mapped,
+ * we'll figure out the rest from the vma information.
+ *
+ * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
+ * whatever write-combining details or similar.
+ */
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+       unsigned long vm_len, pfn, pages;
+
+       /* Check that the physical memory area passed in looks valid */
+       if (start + len < start)
+               return -EINVAL;
+       /*
+        * You *really* shouldn't map things that aren't page-aligned,
+        * but we've historically allowed it because IO memory might
+        * just have smaller alignment.
+        */
+       len += start & ~PAGE_MASK;
+       pfn = start >> PAGE_SHIFT;
+       pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
+       if (pfn + pages < pfn)
+               return -EINVAL;
+
+       /* We start the mapping 'vm_pgoff' pages into the area */
+       if (vma->vm_pgoff > pages)
+               return -EINVAL;
+       pfn += vma->vm_pgoff;
+       pages -= vma->vm_pgoff;
+
+       /* Can we fit all of the mapping? */
+       vm_len = vma->vm_end - vma->vm_start;
+       if (vm_len >> PAGE_SHIFT > pages)
+               return -EINVAL;
+
+       /* Ok, let it rip */
+       return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
+
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data)
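
Note: a hedged usage sketch for the new helper: a hypothetical driver mmap handler (the my_dev names and fields are invented for illustration) that hands a fixed physical window to userspace and lets vm_iomap_memory() do the pgoff/length/alignment checking:

	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct my_dev *dev = file->private_data;	/* hypothetical */

		/* optional, per the NOTE above: adjust protections first */
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

		/* the helper validates vm_pgoff and the requested size */
		return vm_iomap_memory(vma, dev->phys_base, dev->phys_len);
	}
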
@@ -2591,17 +2667,24 @@ reuse:
                if (!dirty_page)
                        return ret;
 
-               /*
-                * Yes, Virginia, this is actually required to prevent a race
-                * with clear_page_dirty_for_io() from clearing the page dirty
-                * bit after it clear all dirty ptes, but before a racing
-                * do_wp_page installs a dirty pte.
-                *
-                * __do_fault is protected similarly.
-                */
                if (!page_mkwrite) {
-                       wait_on_page_locked(dirty_page);
-                       set_page_dirty_balance(dirty_page, page_mkwrite);
+                       struct address_space *mapping;
+                       int dirtied;
+
+                       lock_page(dirty_page);
+                       dirtied = set_page_dirty(dirty_page);
+                       VM_BUG_ON(PageAnon(dirty_page));
+                       mapping = dirty_page->mapping;
+                       unlock_page(dirty_page);
+
+                       if (dirtied && mapping) {
+                               /*
+                                * Some device drivers do not set page.mapping
+                                * but still dirty their pages
+                                */
+                               balance_dirty_pages_ratelimited(mapping);
+                       }
+
                }
                put_page(dirty_page);
                if (page_mkwrite) {
@@ -3027,40 +3110,6 @@ out_release:
        return ret;
 }
 
-/*
- * This is like a special single-page "expand_{down|up}wards()",
- * except we must first make sure that 'address{-|+}PAGE_SIZE'
- * doesn't hit another vma.
- */
-static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
-{
-       address &= PAGE_MASK;
-       if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
-               struct vm_area_struct *prev = vma->vm_prev;
-
-               /*
-                * Is there a mapping abutting this one below?
-                *
-                * That's only ok if it's the same stack mapping
-                * that has gotten split..
-                */
-               if (prev && prev->vm_end == address)
-                       return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
-
-               expand_downwards(vma, address - PAGE_SIZE);
-       }
-       if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
-               struct vm_area_struct *next = vma->vm_next;
-
-               /* As VM_GROWSDOWN but s/below/above/ */
-               if (next && next->vm_start == address + PAGE_SIZE)
-                       return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
-
-               expand_upwards(vma, address + PAGE_SIZE);
-       }
-       return 0;
-}
-
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3076,8 +3125,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        pte_unmap(page_table);
 
-       /* Check if we need to add a guard page to the stack */
-       if (check_stack_guard_page(vma, address) < 0)
+       /* File mapping without ->vm_ops ? */
+       if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;
 
        /* Use the zero-page for reads */
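
Note: the guard-page code removed here and in the hunks above (stack_guard_page(), the FOLL_MLOCK skip, check_stack_guard_page()) presumably follows the mainline stack-guard-gap rework: rather than faulting a dedicated guard page, the gap is enforced when a stack vma expands. Schematic paraphrase of the replacement in mm/mmap.c:

	/* pages kept free between a growing stack and its neighbour (1MB default) */
	unsigned long stack_guard_gap = 256UL << PAGE_SHIFT;

	/* expand_downwards()/expand_upwards() return -ENOMEM when growth would
	 * come within stack_guard_gap of the adjacent vma, so the fault path
	 * needs no per-address guard-page check any more. */
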
@@ -3335,6 +3384,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
        pte_unmap(page_table);
+       /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
+       if (!vma->vm_ops->fault)
+               return VM_FAULT_SIGBUS;
        return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
@@ -3393,11 +3445,9 @@ int handle_pte_fault(struct mm_struct *mm,
        entry = *pte;
        if (!pte_present(entry)) {
                if (pte_none(entry)) {
-                       if (vma->vm_ops) {
-                               if (likely(vma->vm_ops->fault))
-                                       return do_linear_fault(mm, vma, address,
+                       if (vma->vm_ops)
+                               return do_linear_fault(mm, vma, address,
                                                pte, pmd, flags, entry);
-                       }
                        return do_anonymous_page(mm, vma, address,
                                                 pte, pmd, flags);
                }
@@ -3458,6 +3508,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (unlikely(is_vm_hugetlb_page(vma)))
                return hugetlb_fault(mm, vma, address, flags);
 
+retry:
        pgd = pgd_offset(mm, address);
        pud = pud_alloc(mm, pgd, address);
        if (!pud)
@@ -3471,13 +3522,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                                          pmd, flags);
        } else {
                pmd_t orig_pmd = *pmd;
+               int ret;
+
                barrier();
                if (pmd_trans_huge(orig_pmd)) {
                        if (flags & FAULT_FLAG_WRITE &&
                            !pmd_write(orig_pmd) &&
-                           !pmd_trans_splitting(orig_pmd))
-                               return do_huge_pmd_wp_page(mm, vma, address,
-                                                          pmd, orig_pmd);
+                           !pmd_trans_splitting(orig_pmd)) {
+                               ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+                                                         orig_pmd);
+                               /*
+                                * If COW results in an oom, the huge pmd will
+                                * have been split, so retry the fault on the
+                                * pte for a smaller charge.
+                                */
+                               if (unlikely(ret & VM_FAULT_OOM))
+                                       goto retry;
+                               return ret;
+                       }
                        return 0;
                }
        }
@@ -3489,8 +3551,18 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
                return VM_FAULT_OOM;
-       /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd)))
+       /*
+        * If a huge pmd materialized under us just retry later.  Use
+        * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+        * didn't become pmd_trans_huge under us and then back to pmd_none, as
+        * a result of MADV_DONTNEED running immediately after a huge pmd fault
+        * in a different thread of this mm, in turn leading to a misleading
+        * pmd_trans_huge() retval.  All we have to ensure is that it is a
+        * regular pmd that we can walk with pte_offset_map() and we can do that
+        * through an atomic read in C, which is what pmd_trans_unstable()
+        * provides.
+        */
+       if (unlikely(pmd_trans_unstable(pmd)))
                return 0;
        /*
         * A regular pmd is established and it can't morph into a huge pmd
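
Note: pmd_trans_unstable() is a thin wrapper over the helper already used in zap_pmd_range() above; a paraphrase of its asm-generic/pgtable.h definition:

	static inline int pmd_trans_unstable(pmd_t *pmd)
	{
	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		return pmd_none_or_trans_huge_or_clear_bad(pmd);
	#else
		return 0;	/* without THP the pmd cannot change under us */
	#endif
	}
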
@@ -3742,7 +3814,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
        if (follow_phys(vma, addr, write, &prot, &phys_addr))
                return -EINVAL;
 
-       maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+       maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else