Merge branch 'semaphore' of git://git.kernel.org/pub/scm/linux/kernel/git/willy/misc
[pandora-kernel.git] / mm / memory.c
index d14b251..262e3eb 100644 (file)
@@ -61,6 +61,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  *
  * Must be called with pagetable lock held.
  */
-void free_pgd_range(struct mmu_gather **tlb,
+void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
 {
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
                return;
 
        start = addr;
-       pgd = pgd_offset((*tlb)->mm, addr);
+       pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+               free_pud_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long floor, unsigned long ceiling)
 {
        while (vma) {
@@ -899,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                        }
 
                        if (unlikely(is_vm_hugetlb_page(vma))) {
-                               unmap_hugepage_range(vma, start, end);
-                               zap_work -= (end - start) /
-                                               (HPAGE_SIZE / PAGE_SIZE);
+                               /*
+                                * It is undesirable to test vma->vm_file as it
+                                * should be non-null for valid hugetlb area.
+                                * However, vm_file will be NULL in the error
+                                * cleanup path of do_mmap_pgoff. When
+                                * hugetlbfs ->mmap method fails,
+                                * do_mmap_pgoff() nullifies vma->vm_file
+                                * before calling this function to clean up.
+                                * Since no pte has actually been setup, it is
+                                * safe to do nothing in this case.
+                                */
+                               if (vma->vm_file) {
+                                       unmap_hugepage_range(vma, start, end, NULL);
+                                       zap_work -= (end - start) /
+                                       pages_per_huge_page(hstate_vma(vma));
+                               }
+
                                start = end;
                        } else
                                start = unmap_page_range(*tlbp, vma,
@@ -982,19 +998,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                goto no_page_table;
 
        pud = pud_offset(pgd, address);
-       if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+       if (pud_none(*pud))
+               goto no_page_table;
+       if (pud_huge(*pud)) {
+               BUG_ON(flags & FOLL_GET);
+               page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+               goto out;
+       }
+       if (unlikely(pud_bad(*pud)))
                goto no_page_table;
-       
+
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                goto no_page_table;
-
        if (pmd_huge(*pmd)) {
                BUG_ON(flags & FOLL_GET);
                page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
                goto out;
        }
-
        if (unlikely(pmd_bad(*pmd)))
                goto no_page_table;
 
@@ -1058,11 +1079,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
        if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
                return 0;
        /*
-        * And if we have a fault or a nopfn routine, it's not an
-        * anonymous region.
+        * And if we have a fault routine, it's not an anonymous region.
         */
-       return !vma->vm_ops ||
-               (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+       return !vma->vm_ops || !vma->vm_ops->fault;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1151,7 +1170,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         * be processed until returning to user space.
                         */
                        if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-                               return -ENOMEM;
+                               return i ? i : -ENOMEM;
 
                        if (write)
                                foll_flags |= FOLL_WRITE;
@@ -1338,6 +1357,11 @@ out:
  *
  * This function should only be called from a vm_ops->fault handler, and
  * in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
  */
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
@@ -1548,6 +1572,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
        unsigned long next;
        int err;
 
+       BUG_ON(pud_huge(*pud));
+
        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
@@ -1697,8 +1723,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *dirty_page = NULL;
 
        old_page = vm_normal_page(vma, address, orig_pte);
-       if (!old_page)
+       if (!old_page) {
+               /*
+                * VM_MIXEDMAP !pfn_valid() case
+                *
+                * We should not cow pages in a shared writeable mapping.
+                * Just mark the pages writable as we can't do any dirty
+                * accounting on raw pfn maps.
+                */
+               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+                                    (VM_WRITE|VM_SHARED))
+                       goto reuse;
                goto gotten;
+       }
 
        /*
         * Take out anonymous pages first, anonymous shared vmas are
@@ -1751,6 +1788,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        if (reuse) {
+reuse:
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = pte_mkyoung(orig_pte);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2489,59 +2527,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-
-/*
- * do_no_pfn() tries to create a new page mapping for a page without
- * a struct_page backing it
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- *
- * It is expected that the ->nopfn handler always returns the same pfn
- * for a given virtual mapping.
- *
- * Mark this `noinline' to prevent it from bloating the main pagefault code.
- */
-static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
-                    unsigned long address, pte_t *page_table, pmd_t *pmd,
-                    int write_access)
-{
-       spinlock_t *ptl;
-       pte_t entry;
-       unsigned long pfn;
-
-       pte_unmap(page_table);
-       BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
-       BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-
-       pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
-
-       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
-
-       if (unlikely(pfn == NOPFN_OOM))
-               return VM_FAULT_OOM;
-       else if (unlikely(pfn == NOPFN_SIGBUS))
-               return VM_FAULT_SIGBUS;
-       else if (unlikely(pfn == NOPFN_REFAULT))
-               return 0;
-
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-       /* Only go through if we didn't race with anybody else... */
-       if (pte_none(*page_table)) {
-               entry = pfn_pte(pfn, vma->vm_page_prot);
-               if (write_access)
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               set_pte_at(mm, address, page_table, entry);
-       }
-       pte_unmap_unlock(page_table, ptl);
-       return 0;
-}
-
 /*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
@@ -2602,9 +2587,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
                                if (likely(vma->vm_ops->fault))
                                        return do_linear_fault(mm, vma, address,
                                                pte, pmd, write_access, entry);
-                               if (unlikely(vma->vm_ops->nopfn))
-                                       return do_no_pfn(mm, vma, address, pte,
-                                                        pmd, write_access);
                        }
                        return do_anonymous_page(mm, vma, address,
                                                 pte, pmd, write_access);
@@ -2792,6 +2774,86 @@ int in_gate_area_no_task(unsigned long addr)
 
 #endif /* __HAVE_ARCH_GATE_AREA */
 
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+/*
+ * follow_phys - resolve the physical address backing a virtual address
+ * in a VM_IO / VM_PFNMAP mapping by manually walking the page tables
+ * of vma->vm_mm (pgd -> pud -> pmd -> pte).
+ *
+ * On success, returns the page-aligned physical address of the PFN
+ * mapped at @address and stores the pte's protection bits in *@prot.
+ * Returns 0 when no present pte is found, or when @flags requests
+ * FOLL_WRITE but the pte is read-only.
+ *
+ * NOTE(review): a return value of 0 is ambiguous -- physical address 0
+ * is indistinguishable from "lookup failed". Presumably no caller ever
+ * maps physical page 0 through such a VMA; confirm against callers.
+ */
+static resource_size_t follow_phys(struct vm_area_struct *vma,
+                       unsigned long address, unsigned int flags,
+                       unsigned long *prot)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *ptep, pte;
+       spinlock_t *ptl;
+       resource_size_t phys_addr = 0;
+       struct mm_struct *mm = vma->vm_mm;
+
+       /* Only raw-pfn mappings are legal here; struct-page-backed
+        * memory must go through get_user_pages() instead. */
+       VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+
+       pgd = pgd_offset(mm, address);
+       if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+               goto no_page_table;
+
+       pud = pud_offset(pgd, address);
+       if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+               goto no_page_table;
+
+       pmd = pmd_offset(pud, address);
+       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+               goto no_page_table;
+
+       /* We cannot handle huge page PFN maps. Luckily they don't exist. */
+       if (pmd_huge(*pmd))
+               goto no_page_table;
+
+       /* Take the pte spinlock so the entry cannot change under us. */
+       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!ptep)
+               goto out;
+
+       pte = *ptep;
+       if (!pte_present(pte))
+               goto unlock;
+       /*
+        * NOTE(review): the only visible caller, generic_access_phys(),
+        * passes its raw 0/1 `write` argument as @flags. That matches
+        * this FOLL_WRITE test only if FOLL_WRITE happens to be bit 0 --
+        * verify against the FOLL_* definitions, or have the caller pass
+        * `write ? FOLL_WRITE : 0` explicitly.
+        */
+       if ((flags & FOLL_WRITE) && !pte_write(pte))
+               goto unlock;
+       phys_addr = pte_pfn(pte);
+       phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
+
+       *prot = pgprot_val(pte_pgprot(pte));
+
+unlock:
+       pte_unmap_unlock(ptep, ptl);
+out:
+       return phys_addr;
+no_page_table:
+       return 0;
+}
+
+/*
+ * generic_access_phys - ->access implementation for VM_IO / VM_PFNMAP
+ * VMAs: look up the physical page behind @addr, ioremap it, and copy
+ * @len bytes to (@write != 0) or from (@write == 0) the mapping.
+ *
+ * Returns @len on success, -EINVAL if the VMA is not a raw-pfn mapping
+ * or the address does not resolve to a present, suitably-writable pte.
+ *
+ * NOTE(review): ioremap_prot()'s return value is not checked -- a NULL
+ * maddr would be dereferenced by the memcpy below. Also, only one page
+ * is mapped but @len is not clamped to PAGE_SIZE - offset, so a request
+ * spanning a page boundary would access past the mapping; presumably
+ * the caller limits @len to within one page -- confirm at the call site.
+ */
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+                       void *buf, int len, int write)
+{
+       resource_size_t phys_addr;
+       unsigned long prot = 0;
+       void *maddr;
+       /* Byte offset of addr within its page. */
+       int offset = addr & (PAGE_SIZE-1);
+
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+               return -EINVAL;
+
+       /* NOTE(review): passes raw `write` where follow_phys() tests
+        * FOLL_WRITE; see the matching note in follow_phys(). */
+       phys_addr = follow_phys(vma, addr, write, &prot);
+
+       if (!phys_addr)
+               return -EINVAL;
+
+       /* Map the single backing page with the pte's own protections so
+        * cacheability attributes match the user mapping. */
+       maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+       if (write)
+               memcpy_toio(maddr + offset, buf, len);
+       else
+               memcpy_fromio(buf, maddr + offset, len);
+       iounmap(maddr);
+
+       return len;
+}
+#endif
+
 /*
  * Access another process' address space.
  * Source/target buffer must be kernel space,
@@ -2801,7 +2863,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 {
        struct mm_struct *mm;
        struct vm_area_struct *vma;
-       struct page *page;
        void *old_buf = buf;
 
        mm = get_task_mm(tsk);
@@ -2813,28 +2874,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
        while (len) {
                int bytes, ret, offset;
                void *maddr;
+               struct page *page = NULL;
 
                ret = get_user_pages(tsk, mm, addr, 1,
                                write, 1, &page, &vma);
-               if (ret <= 0)
-                       break;
-
-               bytes = len;
-               offset = addr & (PAGE_SIZE-1);
-               if (bytes > PAGE_SIZE-offset)
-                       bytes = PAGE_SIZE-offset;
-
-               maddr = kmap(page);
-               if (write) {
-                       copy_to_user_page(vma, page, addr,
-                                         maddr + offset, buf, bytes);
-                       set_page_dirty_lock(page);
+               if (ret <= 0) {
+                       /*
+                        * Check if this is a VM_IO | VM_PFNMAP VMA, which
+                        * we can access using slightly different code.
+                        */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+                       vma = find_vma(mm, addr);
+                       if (!vma)
+                               break;
+                       if (vma->vm_ops && vma->vm_ops->access)
+                               ret = vma->vm_ops->access(vma, addr, buf,
+                                                         len, write);
+                       if (ret <= 0)
+#endif
+                               break;
+                       bytes = ret;
                } else {
-                       copy_from_user_page(vma, page, addr,
-                                           buf, maddr + offset, bytes);
+                       bytes = len;
+                       offset = addr & (PAGE_SIZE-1);
+                       if (bytes > PAGE_SIZE-offset)
+                               bytes = PAGE_SIZE-offset;
+
+                       maddr = kmap(page);
+                       if (write) {
+                               copy_to_user_page(vma, page, addr,
+                                                 maddr + offset, buf, bytes);
+                               set_page_dirty_lock(page);
+                       } else {
+                               copy_from_user_page(vma, page, addr,
+                                                   buf, maddr + offset, bytes);
+                       }
+                       kunmap(page);
+                       page_cache_release(page);
                }
-               kunmap(page);
-               page_cache_release(page);
                len -= bytes;
                buf += bytes;
                addr += bytes;