Merge branch 'master'
author Jeff Garzik <jgarzik@pobox.com>
Mon, 31 Oct 2005 01:37:44 +0000 (20:37 -0500)
committer Jeff Garzik <jgarzik@pobox.com>
Mon, 31 Oct 2005 01:37:44 +0000 (20:37 -0500)
190 files changed:
Documentation/cachetlb.txt
Documentation/kernel-parameters.txt
Documentation/m68k/kernel-options.txt
arch/alpha/mm/numa.c
arch/alpha/mm/remap.c
arch/arm/kernel/signal.c
arch/arm/kernel/traps.c
arch/arm/mm/consistent.c
arch/arm/mm/fault-armv.c
arch/arm/mm/ioremap.c
arch/arm/mm/mm-armv.c
arch/arm/oprofile/backtrace.c
arch/arm26/mm/memc.c
arch/cris/arch-v32/mm/tlb.c
arch/cris/mm/ioremap.c
arch/frv/mm/dma-alloc.c
arch/frv/mm/pgalloc.c
arch/i386/kernel/vm86.c
arch/i386/mm/discontig.c
arch/i386/mm/init.c
arch/i386/mm/ioremap.c
arch/i386/mm/pgtable.c
arch/i386/oprofile/backtrace.c
arch/ia64/kernel/perfmon.c
arch/ia64/mm/discontig.c
arch/ia64/mm/fault.c
arch/ia64/mm/init.c
arch/ia64/mm/tlb.c
arch/m32r/mm/init.c
arch/m32r/mm/ioremap.c
arch/m68k/Kconfig
arch/m68k/atari/stram.c
arch/m68k/mm/kmap.c
arch/m68k/sun3x/dvma.c
arch/mips/kernel/irixelf.c
arch/mips/mm/ioremap.c
arch/parisc/kernel/cache.c
arch/parisc/kernel/pci-dma.c
arch/parisc/mm/init.c
arch/parisc/mm/ioremap.c
arch/ppc/kernel/dma-mapping.c
arch/ppc/mm/4xx_mmu.c
arch/ppc/mm/pgtable.c
arch/ppc64/kernel/vdso.c
arch/ppc64/mm/imalloc.c
arch/ppc64/mm/init.c
arch/s390/mm/ioremap.c
arch/sh/mm/fault.c
arch/sh/mm/hugetlbpage.c
arch/sh/mm/ioremap.c
arch/sh64/mm/cache.c
arch/sh64/mm/hugetlbpage.c
arch/sh64/mm/ioremap.c
arch/sparc/mm/generic.c
arch/sparc64/kernel/binfmt_aout32.c
arch/sparc64/mm/generic.c
arch/sparc64/mm/tlb.c
arch/um/include/tlb.h
arch/um/kernel/process_kern.c
arch/um/kernel/skas/mmu.c
arch/um/kernel/tt/tlb.c
arch/x86_64/ia32/ia32_aout.c
arch/x86_64/mm/ioremap.c
crypto/api.c
crypto/hmac.c
crypto/tcrypt.c
drivers/acpi/acpi_memhotplug.c
drivers/base/Makefile
drivers/base/init.c
drivers/base/memory.c [new file with mode: 0644]
drivers/md/dm-crypt.c
drivers/net/wireless/airo.c
drivers/pci/quirks.c
drivers/scsi/ahci.c
drivers/scsi/arm/scsi.h
drivers/scsi/ata_piix.c
drivers/scsi/libata-core.c
drivers/scsi/libata-scsi.c
drivers/scsi/libata.h
drivers/scsi/pdc_adma.c
drivers/scsi/sata_mv.c
drivers/scsi/sata_nv.c
drivers/scsi/sata_promise.c
drivers/scsi/sata_qstor.c
drivers/scsi/sata_sil.c
drivers/scsi/sata_sil24.c
drivers/scsi/sata_sis.c
drivers/scsi/sata_svw.c
drivers/scsi/sata_sx4.c
drivers/scsi/sata_uli.c
drivers/scsi/sata_via.c
drivers/scsi/sata_vsc.c
drivers/scsi/sg.c
drivers/scsi/st.c
drivers/usb/misc/usbtest.c
fs/afs/file.c
fs/binfmt_aout.c
fs/binfmt_elf.c
fs/binfmt_elf_fdpic.c
fs/binfmt_flat.c
fs/binfmt_som.c
fs/buffer.c
fs/compat.c
fs/direct-io.c
fs/exec.c
fs/hugetlbfs/inode.c
fs/jfs/jfs_metapage.c
fs/nfs/inode.c
fs/proc/array.c
fs/proc/task_mmu.c
fs/xfs/linux-2.6/xfs_buf.c
include/asm-alpha/barrier.h
include/asm-alpha/rwsem.h
include/asm-arm/tlb.h
include/asm-arm26/tlb.h
include/asm-generic/4level-fixup.h
include/asm-generic/pgtable.h
include/asm-generic/tlb.h
include/asm-i386/mmzone.h
include/asm-i386/pgtable.h
include/asm-i386/rwsem.h
include/asm-ia64/rwsem.h
include/asm-ia64/tlb.h
include/asm-m32r/mmzone.h
include/asm-parisc/cacheflush.h
include/asm-parisc/mmzone.h
include/asm-parisc/tlbflush.h
include/asm-ppc/rwsem.h
include/asm-ppc64/mmzone.h
include/asm-ppc64/pgtable.h
include/asm-ppc64/rwsem.h
include/asm-s390/rwsem.h
include/asm-sh/rwsem.h
include/asm-sparc64/rwsem.h
include/asm-sparc64/tlb.h
include/asm-um/pgtable.h
include/asm-x86_64/rwsem.h
include/linux/buffer_head.h
include/linux/hugetlb.h
include/linux/libata.h
include/linux/memory.h [new file with mode: 0644]
include/linux/memory_hotplug.h [new file with mode: 0644]
include/linux/mempolicy.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/rmap.h
include/linux/rwsem-spinlock.h
include/linux/scatterlist.h
include/linux/sched.h
include/linux/vmalloc.h
ipc/shm.c
kernel/acct.c
kernel/exit.c
kernel/fork.c
kernel/futex.c
kernel/kexec.c
kernel/power/swsusp.c
kernel/sched.c
kernel/timer.c
mm/Kconfig
mm/Makefile
mm/bootmem.c
mm/filemap.c
mm/filemap_xip.c
mm/fremap.c
mm/hugetlb.c
mm/madvise.c
mm/memory.c
mm/memory_hotplug.c [new file with mode: 0644]
mm/mempolicy.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/msync.c
mm/nommu.c
mm/page_alloc.c
mm/page_io.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/sparse.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/thrash.c
mm/vmalloc.c
mm/vmscan.c
net/ipv6/addrconf.c
net/sunrpc/auth_gss/gss_krb5_crypto.c
sound/core/pcm_native.c

index e132fb1..7eb715e 100644 (file)
@@ -49,9 +49,6 @@ changes occur:
        page table operations such as what happens during
        fork, and exec.
 
-       Platform developers note that generic code will always
-       invoke this interface without mm->page_table_lock held.
-
 3) void flush_tlb_range(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end)
 
@@ -72,9 +69,6 @@ changes occur:
        call flush_tlb_page (see below) for each entry which may be
        modified.
 
-       Platform developers note that generic code will always
-       invoke this interface with mm->page_table_lock held.
-
 4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
 
        This time we need to remove the PAGE_SIZE sized translation
@@ -93,9 +87,6 @@ changes occur:
 
        This is used primarily during fault processing.
 
-       Platform developers note that generic code will always
-       invoke this interface with mm->page_table_lock held.
-
 5) void flush_tlb_pgtables(struct mm_struct *mm,
                           unsigned long start, unsigned long end)
 
index 90766b7..5dffcfe 100644 (file)
@@ -1460,8 +1460,6 @@ running once the system is up.
        stifb=          [HW]
                        Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
 
-       stram_swap=     [HW,M68k]
-
        swiotlb=        [IA-64] Number of I/O TLB slabs
 
        switches=       [HW,M68k]
index e191baa..d5d3f06 100644 (file)
@@ -626,7 +626,7 @@ ignored (others aren't affected).
     can be performed in optimal order. Not all SCSI devices support
     tagged queuing (:-().
 
-4.6 switches=
+4.5 switches=
 -------------
 
 Syntax: switches=<list of switches>
@@ -661,28 +661,6 @@ correctly.
 earlier initialization ("ov_"-less) takes precedence. But the
 switching-off on reset still happens in this case.
 
-4.5) stram_swap=
-----------------
-
-Syntax: stram_swap=<do_swap>[,<max_swap>]
-
-  This option is available only if the kernel has been compiled with
-CONFIG_STRAM_SWAP enabled. Normally, the kernel then determines
-dynamically whether to actually use ST-RAM as swap space. (Currently,
-the fraction of ST-RAM must be less or equal 1/3 of total memory to
-enable this swapping.) You can override the kernel's decision by
-specifying this option. 1 for <do_swap> means always enable the swap,
-even if you have less alternate RAM. 0 stands for never swap to
-ST-RAM, even if it's small enough compared to the rest of memory.
-
-  If ST-RAM swapping is enabled, the kernel usually uses all free
-ST-RAM as swap "device". If the kernel resides in ST-RAM, the region
-allocated by it is obviously never used for swapping :-) You can also
-limit this amount by specifying the second parameter, <max_swap>, if
-you want to use parts of ST-RAM as normal system memory. <max_swap> is
-in kBytes and the number should be a multiple of 4 (otherwise: rounded
-down).
-
 5) Options for Amiga Only:
 ==========================
 
index c7481d5..6d52512 100644 (file)
@@ -371,6 +371,8 @@ show_mem(void)
        show_free_areas();
        printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_online_node(nid) {
+               unsigned long flags;
+               pgdat_resize_lock(NODE_DATA(nid), &flags);
                i = node_spanned_pages(nid);
                while (i-- > 0) {
                        struct page *page = nid_page_nr(nid, i);
@@ -384,6 +386,7 @@ show_mem(void)
                        else
                                shared += page_count(page) - 1;
                }
+               pgdat_resize_unlock(NODE_DATA(nid), &flags);
        }
        printk("%ld pages of RAM\n",total);
        printk("%ld free pages\n",free);
index 19817ad..a78356c 100644 (file)
@@ -2,7 +2,6 @@
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 
-/* called with the page_table_lock held */
 static inline void 
 remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, 
               unsigned long phys_addr, unsigned long flags)
@@ -31,7 +30,6 @@ remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
        } while (address && (address < end));
 }
 
-/* called with the page_table_lock held */
 static inline int 
 remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, 
               unsigned long phys_addr, unsigned long flags)
@@ -46,7 +44,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
        if (address >= end)
                BUG();
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, 
@@ -70,7 +68,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd;
                pmd = pmd_alloc(&init_mm, dir, address);
@@ -84,7 +81,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        return error;
 }
 
index a94d75f..a917e3d 100644 (file)
@@ -139,93 +139,33 @@ struct iwmmxt_sigframe {
        unsigned long   storage[0x98/4];
 };
 
-static int page_present(struct mm_struct *mm, void __user *uptr, int wr)
-{
-       unsigned long addr = (unsigned long)uptr;
-       pgd_t *pgd = pgd_offset(mm, addr);
-       if (pgd_present(*pgd)) {
-               pmd_t *pmd = pmd_offset(pgd, addr);
-               if (pmd_present(*pmd)) {
-                       pte_t *pte = pte_offset_map(pmd, addr);
-                       return (pte_present(*pte) && (!wr || pte_write(*pte)));
-               }
-       }
-       return 0;
-}
-
-static int copy_locked(void __user *uptr, void *kptr, size_t size, int write,
-                      void (*copyfn)(void *, void __user *))
-{
-       unsigned char v, __user *userptr = uptr;
-       int err = 0;
-
-       do {
-               struct mm_struct *mm;
-
-               if (write) {
-                       __put_user_error(0, userptr, err);
-                       __put_user_error(0, userptr + size - 1, err);
-               } else {
-                       __get_user_error(v, userptr, err);
-                       __get_user_error(v, userptr + size - 1, err);
-               }
-
-               if (err)
-                       break;
-
-               mm = current->mm;
-               spin_lock(&mm->page_table_lock);
-               if (page_present(mm, userptr, write) &&
-                   page_present(mm, userptr + size - 1, write)) {
-                       copyfn(kptr, uptr);
-               } else
-                       err = 1;
-               spin_unlock(&mm->page_table_lock);
-       } while (err);
-
-       return err;
-}
-
 static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-       int err = 0;
+       char kbuf[sizeof(*frame) + 8];
+       struct iwmmxt_sigframe *kframe;
 
        /* the iWMMXt context must be 64 bit aligned */
-       WARN_ON((unsigned long)frame & 7);
-
-       __put_user_error(IWMMXT_MAGIC0, &frame->magic0, err);
-       __put_user_error(IWMMXT_MAGIC1, &frame->magic1, err);
-
-       /*
-        * iwmmxt_task_copy() doesn't check user permissions.
-        * Let's do a dummy write on the upper boundary to ensure
-        * access to user mem is OK all way up.
-        */
-       err |= copy_locked(&frame->storage, current_thread_info(),
-                          sizeof(frame->storage), 1, iwmmxt_task_copy);
-       return err;
+       kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+       kframe->magic0 = IWMMXT_MAGIC0;
+       kframe->magic1 = IWMMXT_MAGIC1;
+       iwmmxt_task_copy(current_thread_info(), &kframe->storage);
+       return __copy_to_user(frame, kframe, sizeof(*frame));
 }
 
 static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-       unsigned long magic0, magic1;
-       int err = 0;
+       char kbuf[sizeof(*frame) + 8];
+       struct iwmmxt_sigframe *kframe;
 
-       /* the iWMMXt context is 64 bit aligned */
-       WARN_ON((unsigned long)frame & 7);
-
-       /*
-        * Validate iWMMXt context signature.
-        * Also, iwmmxt_task_restore() doesn't check user permissions.
-        * Let's do a dummy write on the upper boundary to ensure
-        * access to user mem is OK all way up.
-        */
-       __get_user_error(magic0, &frame->magic0, err);
-       __get_user_error(magic1, &frame->magic1, err);
-       if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1)
-               err = copy_locked(&frame->storage, current_thread_info(),
-                                 sizeof(frame->storage), 0, iwmmxt_task_restore);
-       return err;
+       /* the iWMMXt context must be 64 bit aligned */
+       kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+       if (__copy_from_user(kframe, frame, sizeof(*frame)))
+               return -1;
+       if (kframe->magic0 != IWMMXT_MAGIC0 ||
+           kframe->magic1 != IWMMXT_MAGIC1)
+               return -1;
+       iwmmxt_task_restore(current_thread_info(), &kframe->storage);
+       return 0;
 }
 
 #endif
index baa0960..66e5a05 100644 (file)
@@ -483,29 +483,33 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
                unsigned long addr = regs->ARM_r2;
                struct mm_struct *mm = current->mm;
                pgd_t *pgd; pmd_t *pmd; pte_t *pte;
+               spinlock_t *ptl;
 
                regs->ARM_cpsr &= ~PSR_C_BIT;
-               spin_lock(&mm->page_table_lock);
+               down_read(&mm->mmap_sem);
                pgd = pgd_offset(mm, addr);
                if (!pgd_present(*pgd))
                        goto bad_access;
                pmd = pmd_offset(pgd, addr);
                if (!pmd_present(*pmd))
                        goto bad_access;
-               pte = pte_offset_map(pmd, addr);
-               if (!pte_present(*pte) || !pte_write(*pte))
+               pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+               if (!pte_present(*pte) || !pte_write(*pte)) {
+                       pte_unmap_unlock(pte, ptl);
                        goto bad_access;
+               }
                val = *(unsigned long *)addr;
                val -= regs->ARM_r0;
                if (val == 0) {
                        *(unsigned long *)addr = regs->ARM_r1;
                        regs->ARM_cpsr |= PSR_C_BIT;
                }
-               spin_unlock(&mm->page_table_lock);
+               pte_unmap_unlock(pte, ptl);
+               up_read(&mm->mmap_sem);
                return val;
 
                bad_access:
-               spin_unlock(&mm->page_table_lock);
+               up_read(&mm->mmap_sem);
                /* simulate a write access fault */
                do_DataAbort(addr, 15 + (1 << 11), regs);
                return -1;
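The conversion above shows the pattern used throughout this merge: a user page-table walk runs under down_read(&mm->mmap_sem) and the per-table lock returned by pte_offset_map_lock(), instead of the single mm->page_table_lock. A minimal illustrative sketch of that pattern (pte_is_writable is a hypothetical helper mirroring the two-level ARM walk above, not part of the patch):

#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/pgtable.h>

/* Hypothetical example: test whether addr is mapped writable, taking the
 * split pte lock via pte_offset_map_lock() rather than page_table_lock. */
static int pte_is_writable(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;
        int writable = 0;

        down_read(&mm->mmap_sem);
        pgd = pgd_offset(mm, addr);
        if (pgd_present(*pgd)) {
                pmd = pmd_offset(pgd, addr);    /* two-level walk, as in the ARM code above */
                if (pmd_present(*pmd)) {
                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                        writable = pte_present(*pte) && pte_write(*pte);
                        pte_unmap_unlock(pte, ptl);
                }
        }
        up_read(&mm->mmap_sem);
        return writable;
}
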
index 82f4d5e..47b0b76 100644 (file)
@@ -397,8 +397,6 @@ static int __init consistent_init(void)
        pte_t *pte;
        int ret = 0;
 
-       spin_lock(&init_mm.page_table_lock);
-
        do {
                pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
                pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -409,7 +407,7 @@ static int __init consistent_init(void)
                }
                WARN_ON(!pmd_none(*pmd));
 
-               pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE);
+               pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
                if (!pte) {
                        printk(KERN_ERR "%s: no pte tables\n", __func__);
                        ret = -ENOMEM;
@@ -419,8 +417,6 @@ static int __init consistent_init(void)
                consistent_pte = pte;
        } while (0);
 
-       spin_unlock(&init_mm.page_table_lock);
-
        return ret;
 }
 
index be4ab3d..7fc1b35 100644 (file)
@@ -26,6 +26,11 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE;
 /*
  * We take the easy way out of this problem - we make the
  * PTE uncacheable.  However, we leave the write buffer on.
+ *
+ * Note that the pte lock held when calling update_mmu_cache must also
+ * guard the pte (somewhere else in the same mm) that we modify here.
+ * Therefore those configurations which might call adjust_pte (those
+ * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
 static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
 {
@@ -127,7 +132,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page);
  *  2. If we have multiple shared mappings of the same space in
  *     an object, we need to deal with the cache aliasing issues.
  *
- * Note that the page_table_lock will be held.
+ * Note that the pte lock will be held.
  */
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
index 6fb1258..0f128c2 100644 (file)
@@ -75,7 +75,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 
        pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags);
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, pgprot);
@@ -97,7 +97,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
        phys_addr -= address;
        dir = pgd_offset(&init_mm, address);
        BUG_ON(address >= end);
-       spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
                if (!pmd) {
@@ -114,7 +113,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
                dir++;
        } while (address && (address < end));
 
-       spin_unlock(&init_mm.page_table_lock);
        flush_cache_vmap(start, end);
        return err;
 }
index 61bc2fa..1221fdd 100644 (file)
@@ -179,11 +179,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
        clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));
 
        if (!vectors_high()) {
-               /*
-                * This lock is here just to satisfy pmd_alloc and pte_lock
-                */
-               spin_lock(&mm->page_table_lock);
-
                /*
                 * On ARM, first page must always be allocated since it
                 * contains the machine vectors.
@@ -201,23 +196,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
                set_pte(new_pte, *init_pte);
                pte_unmap_nested(init_pte);
                pte_unmap(new_pte);
-
-               spin_unlock(&mm->page_table_lock);
        }
 
        return new_pgd;
 
 no_pte:
-       spin_unlock(&mm->page_table_lock);
        pmd_free(new_pmd);
-       free_pages((unsigned long)new_pgd, 2);
-       return NULL;
-
 no_pmd:
-       spin_unlock(&mm->page_table_lock);
        free_pages((unsigned long)new_pgd, 2);
-       return NULL;
-
 no_pgd:
        return NULL;
 }
@@ -243,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
        pte = pmd_page(*pmd);
        pmd_clear(pmd);
        dec_page_state(nr_page_table_pages);
+       pte_lock_deinit(pte);
        pte_free(pte);
        pmd_free(pmd);
 free:
index df35c45..7c22c12 100644 (file)
@@ -49,42 +49,22 @@ static struct frame_tail* kernel_backtrace(struct frame_tail *tail)
 
 static struct frame_tail* user_backtrace(struct frame_tail *tail)
 {
-       struct frame_tail buftail;
+       struct frame_tail buftail[2];
 
-       /* hardware pte might not be valid due to dirty/accessed bit emulation
-        * so we use copy_from_user and benefit from exception fixups */
-       if (copy_from_user(&buftail, tail, sizeof(struct frame_tail)))
+       /* Also check accessibility of one struct frame_tail beyond */
+       if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+               return NULL;
+       if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail)))
                return NULL;
 
-       oprofile_add_trace(buftail.lr);
+       oprofile_add_trace(buftail[0].lr);
 
        /* frame pointers should strictly progress back up the stack
         * (towards higher addresses) */
-       if (tail >= buftail.fp)
+       if (tail >= buftail[0].fp)
                return NULL;
 
-       return buftail.fp-1;
-}
-
-/* Compare two addresses and see if they're on the same page */
-#define CMP_ADDR_EQUAL(x,y,offset) ((((unsigned long) x) >> PAGE_SHIFT) \
-       == ((((unsigned long) y) + offset) >> PAGE_SHIFT))
-
-/* check that the page(s) containing the frame tail are present */
-static int pages_present(struct frame_tail *tail)
-{
-       struct mm_struct * mm = current->mm;
-
-       if (!check_user_page_readable(mm, (unsigned long)tail))
-               return 0;
-
-       if (CMP_ADDR_EQUAL(tail, tail, 8))
-               return 1;
-
-       if (!check_user_page_readable(mm, ((unsigned long)tail) + 8))
-               return 0;
-
-       return 1;
+       return buftail[0].fp-1;
 }
 
 /*
@@ -118,7 +98,6 @@ static int valid_kernel_stack(struct frame_tail *tail, struct pt_regs *regs)
 void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
        struct frame_tail *tail;
-       unsigned long last_address = 0;
 
        tail = ((struct frame_tail *) regs->ARM_fp) - 1;
 
@@ -132,13 +111,6 @@ void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
                return;
        }
 
-       while (depth-- && tail && !((unsigned long) tail & 3)) {
-               if ((!CMP_ADDR_EQUAL(last_address, tail, 0)
-                       || !CMP_ADDR_EQUAL(last_address, tail, 8))
-                               && !pages_present(tail))
-                       return;
-               last_address = (unsigned long) tail;
+       while (depth-- && tail && !((unsigned long) tail & 3))
                tail = user_backtrace(tail);
-       }
 }
-
index 8e8a2bb..34def63 100644 (file)
@@ -78,12 +78,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
        if (!new_pgd)
                goto no_pgd;
 
-       /*
-        * This lock is here just to satisfy pmd_alloc and pte_lock
-         * FIXME: I bet we could avoid taking it pretty much altogether
-        */
-       spin_lock(&mm->page_table_lock);
-
        /*
         * On ARM, first page must always be allocated since it contains
         * the machine vectors.
@@ -92,7 +86,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
        if (!new_pmd)
                goto no_pmd;
 
-       new_pte = pte_alloc_kernel(mm, new_pmd, 0);
+       new_pte = pte_alloc_map(mm, new_pmd, 0);
        if (!new_pte)
                goto no_pte;
 
@@ -101,6 +95,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
        init_pte = pte_offset(init_pmd, 0);
 
        set_pte(new_pte, *init_pte);
+       pte_unmap(new_pte);
 
        /*
         * the page table entries are zeroed
@@ -112,23 +107,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
        memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR,
                (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t));
 
-       spin_unlock(&mm->page_table_lock);
-
        /* update MEMC tables */
        cpu_memc_update_all(new_pgd);
        return new_pgd;
 
 no_pte:
-       spin_unlock(&mm->page_table_lock);
        pmd_free(new_pmd);
-       free_pgd_slow(new_pgd);
-       return NULL;
-
 no_pmd:
-       spin_unlock(&mm->page_table_lock);
        free_pgd_slow(new_pgd);
-       return NULL;
-
 no_pgd:
        return NULL;
 }
index 8233406..b08a28b 100644 (file)
@@ -175,6 +175,8 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
        return 0;
 }
 
+static DEFINE_SPINLOCK(mmu_context_lock);
+
 /* Called in schedule() just before actually doing the switch_to. */
 void
 switch_mm(struct mm_struct *prev, struct mm_struct *next,
@@ -183,10 +185,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
        int cpu = smp_processor_id();
 
        /* Make sure there is a MMU context. */
-       spin_lock(&next->page_table_lock);
+       spin_lock(&mmu_context_lock);
        get_mmu_context(next);
        cpu_set(cpu, next->cpu_vm_mask);
-       spin_unlock(&next->page_table_lock);
+       spin_unlock(&mmu_context_lock);
 
        /*
         * Remember the pgd for the fault handlers. Keep a seperate copy of it
index ebba11e..a92ac98 100644 (file)
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
        if (address >= end)
                BUG();
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, prot);
@@ -74,7 +74,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pud_t *pud;
                pmd_t *pmd;
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return error;
 }
index cfc4f97..342823a 100644 (file)
@@ -55,21 +55,18 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot)
        pte_t *pte;
        int err = -ENOMEM;
 
-       spin_lock(&init_mm.page_table_lock);
-
        /* Use upper 10 bits of VA to index the first level map */
        pge = pgd_offset_k(va);
        pue = pud_offset(pge, va);
        pme = pmd_offset(pue, va);
 
        /* Use middle 10 bits of VA to index the second-level map */
-       pte = pte_alloc_kernel(&init_mm, pme, va);
+       pte = pte_alloc_kernel(pme, va);
        if (pte != 0) {
                err = 0;
                set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot));
        }
 
-       spin_unlock(&init_mm.page_table_lock);
        return err;
 }
 
index 4eaec0f..2c67dfe 100644 (file)
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
        if (pgd_list)
                pgd_list->private = (unsigned long) &page->index;
        pgd_list = page;
-       page->private = (unsigned long) &pgd_list;
+       set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
        struct page *next, **pprev, *page = virt_to_page(pgd);
        next = (struct page *) page->index;
-       pprev = (struct page **) page->private;
+       pprev = (struct page **)page_private(page);
        *pprev = next;
        if (next)
                next->private = (unsigned long) pprev;
index 16b4850..fc19935 100644 (file)
@@ -134,17 +134,16 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
        return ret;
 }
 
-static void mark_screen_rdonly(struct task_struct * tsk)
+static void mark_screen_rdonly(struct mm_struct *mm)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *pte, *mapped;
+       pte_t *pte;
+       spinlock_t *ptl;
        int i;
 
-       preempt_disable();
-       spin_lock(&tsk->mm->page_table_lock);
-       pgd = pgd_offset(tsk->mm, 0xA0000);
+       pgd = pgd_offset(mm, 0xA0000);
        if (pgd_none_or_clear_bad(pgd))
                goto out;
        pud = pud_offset(pgd, 0xA0000);
@@ -153,16 +152,14 @@ static void mark_screen_rdonly(struct task_struct * tsk)
        pmd = pmd_offset(pud, 0xA0000);
        if (pmd_none_or_clear_bad(pmd))
                goto out;
-       pte = mapped = pte_offset_map(pmd, 0xA0000);
+       pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
        for (i = 0; i < 32; i++) {
                if (pte_present(*pte))
                        set_pte(pte, pte_wrprotect(*pte));
                pte++;
        }
-       pte_unmap(mapped);
+       pte_unmap_unlock(pte, ptl);
 out:
-       spin_unlock(&tsk->mm->page_table_lock);
-       preempt_enable();
        flush_tlb();
 }
 
@@ -306,7 +303,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 
        tsk->thread.screen_bitmap = info->screen_bitmap;
        if (info->flags & VM86_SCREEN_BITMAP)
-               mark_screen_rdonly(tsk);
+               mark_screen_rdonly(tsk->mm);
        __asm__ __volatile__(
                "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
                "movl %0,%%esp\n\t"
index 244d8ec..c4af963 100644 (file)
@@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 
 extern unsigned long find_max_low_pfn(void);
 extern void find_max_pfn(void);
-extern void one_highpage_init(struct page *, int, int);
+extern void add_one_highpage_init(struct page *, int, int);
 
 extern struct e820map e820;
 extern unsigned long init_pg_tables_end;
@@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int bad_ppro)
                        if (!pfn_valid(node_pfn))
                                continue;
                        page = pfn_to_page(node_pfn);
-                       one_highpage_init(page, node_pfn, bad_ppro);
+                       add_one_highpage_init(page, node_pfn, bad_ppro);
                }
        }
        totalram_pages += totalhigh_pages;
index 2ebaf75..542d929 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/efi.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -266,17 +267,46 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
        pkmap_page_table = pte; 
 }
 
-void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
+void __devinit free_new_highpage(struct page *page)
+{
+       set_page_count(page, 1);
+       __free_page(page);
+       totalhigh_pages++;
+}
+
+void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
        if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
                ClearPageReserved(page);
-               set_page_count(page, 1);
-               __free_page(page);
-               totalhigh_pages++;
+               free_new_highpage(page);
        } else
                SetPageReserved(page);
 }
 
+static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+{
+       free_new_highpage(page);
+       totalram_pages++;
+#ifdef CONFIG_FLATMEM
+       max_mapnr = max(pfn, max_mapnr);
+#endif
+       num_physpages++;
+       return 0;
+}
+
+/*
+ * Not currently handling the NUMA case.
+ * Assuming single node and all memory that
+ * has been added dynamically that would be
+ * onlined here is in HIGHMEM
+ */
+void online_page(struct page *page)
+{
+       ClearPageReserved(page);
+       add_one_highpage_hotplug(page, page_to_pfn(page));
+}
+
+
 #ifdef CONFIG_NUMA
 extern void set_highmem_pages_init(int);
 #else
@@ -284,7 +314,7 @@ static void __init set_highmem_pages_init(int bad_ppro)
 {
        int pfn;
        for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
-               one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+               add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
        totalram_pages += totalhigh_pages;
 }
 #endif /* CONFIG_FLATMEM */
@@ -615,6 +645,28 @@ void __init mem_init(void)
 #endif
 }
 
+/*
+ * this is for the non-NUMA, single node SMP system case.
+ * Specifically, in the case of x86, we will always add
+ * memory to the highmem for now.
+ */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+int add_memory(u64 start, u64 size)
+{
+       struct pglist_data *pgdata = &contig_page_data;
+       struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+
+       return __add_pages(zone, start_pfn, nr_pages);
+}
+
+int remove_memory(u64 start, u64 size)
+{
+       return -EINVAL;
+}
+#endif
+
 kmem_cache_t *pgd_cache;
 kmem_cache_t *pmd_cache;
 
index f379b8d..5d09de8 100644 (file)
@@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
        unsigned long pfn;
 
        pfn = phys_addr >> PAGE_SHIFT;
-       pte = pte_alloc_kernel(&init_mm, pmd, addr);
+       pte = pte_alloc_kernel(pmd, addr);
        if (!pte)
                return -ENOMEM;
        do {
@@ -87,14 +87,12 @@ static int ioremap_page_range(unsigned long addr,
        flush_cache_all();
        phys_addr -= addr;
        pgd = pgd_offset_k(addr);
-       spin_lock(&init_mm.page_table_lock);
        do {
                next = pgd_addr_end(addr, end);
                err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return err;
 }
index dcdce2c..9db3242 100644 (file)
@@ -31,11 +31,13 @@ void show_mem(void)
        pg_data_t *pgdat;
        unsigned long i;
        struct page_state ps;
+       unsigned long flags;
 
        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_pgdat(pgdat) {
+               pgdat_resize_lock(pgdat, &flags);
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pgdat_page_nr(pgdat, i);
                        total++;
@@ -48,6 +50,7 @@ void show_mem(void)
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
+               pgdat_resize_unlock(pgdat, &flags);
        }
        printk(KERN_INFO "%d pages of RAM\n", total);
        printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
@@ -188,19 +191,19 @@ static inline void pgd_list_add(pgd_t *pgd)
        struct page *page = virt_to_page(pgd);
        page->index = (unsigned long)pgd_list;
        if (pgd_list)
-               pgd_list->private = (unsigned long)&page->index;
+               set_page_private(pgd_list, (unsigned long)&page->index);
        pgd_list = page;
-       page->private = (unsigned long)&pgd_list;
+       set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
        struct page *next, **pprev, *page = virt_to_page(pgd);
        next = (struct page *)page->index;
-       pprev = (struct page **)page->private;
+       pprev = (struct page **)page_private(page);
        *pprev = next;
        if (next)
-               next->private = (unsigned long)pprev;
+               set_page_private(next, (unsigned long)pprev);
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
index 65dfd2e..21654be 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/ptrace.h>
+#include <asm/uaccess.h>
 
 struct frame_head {
        struct frame_head * ebp;
@@ -21,26 +22,22 @@ struct frame_head {
 static struct frame_head *
 dump_backtrace(struct frame_head * head)
 {
-       oprofile_add_trace(head->ret);
+       struct frame_head bufhead[2];
 
-       /* frame pointers should strictly progress back up the stack
-        * (towards higher addresses) */
-       if (head >= head->ebp)
+       /* Also check accessibility of one struct frame_head beyond */
+       if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
+               return NULL;
+       if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
                return NULL;
 
-       return head->ebp;
-}
-
-/* check that the page(s) containing the frame head are present */
-static int pages_present(struct frame_head * head)
-{
-       struct mm_struct * mm = current->mm;
+       oprofile_add_trace(bufhead[0].ret);
 
-       /* FIXME: only necessary once per page */
-       if (!check_user_page_readable(mm, (unsigned long)head))
-               return 0;
+       /* frame pointers should strictly progress back up the stack
+        * (towards higher addresses) */
+       if (head >= bufhead[0].ebp)
+               return NULL;
 
-       return check_user_page_readable(mm, (unsigned long)(head + 1));
+       return bufhead[0].ebp;
 }
 
 /*
@@ -97,15 +94,6 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
                return;
        }
 
-#ifdef CONFIG_SMP
-       if (!spin_trylock(&current->mm->page_table_lock))
-               return;
-#endif
-
-       while (depth-- && head && pages_present(head))
+       while (depth-- && head)
                head = dump_backtrace(head);
-
-#ifdef CONFIG_SMP
-       spin_unlock(&current->mm->page_table_lock);
-#endif
 }
index d71731e..f7dfc10 100644 (file)
@@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
        insert_vm_struct(mm, vma);
 
        mm->total_vm  += size >> PAGE_SHIFT;
-       vm_stat_account(vma);
+       vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
+                                                       vma_pages(vma));
        up_write(&task->mm->mmap_sem);
 
        /*
index a3788fb..a88cdb7 100644 (file)
@@ -555,9 +555,13 @@ void show_mem(void)
        show_free_areas();
        printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_pgdat(pgdat) {
-               unsigned long present = pgdat->node_present_pages;
+               unsigned long present;
+               unsigned long flags;
                int shared = 0, cached = 0, reserved = 0;
+
                printk("Node ID: %d\n", pgdat->node_id);
+               pgdat_resize_lock(pgdat, &flags);
+               present = pgdat->node_present_pages;
                for(i = 0; i < pgdat->node_spanned_pages; i++) {
                        struct page *page;
                        if (pfn_valid(pgdat->node_start_pfn + i))
@@ -571,6 +575,7 @@ void show_mem(void)
                        else if (page_count(page))
                                shared += page_count(page)-1;
                }
+               pgdat_resize_unlock(pgdat, &flags);
                total_present += present;
                total_reserved += reserved;
                total_cached += cached;
index 3c32af9..af7eb08 100644 (file)
 
 extern void die (char *, struct pt_regs *, long);
 
-/*
- * This routine is analogous to expand_stack() but instead grows the
- * register backing store (which grows towards higher addresses).
- * Since the register backing store is access sequentially, we
- * disallow growing the RBS by more than a page at a time.  Note that
- * the VM_GROWSUP flag can be set on any VM area but that's fine
- * because the total process size is still limited by RLIMIT_STACK and
- * RLIMIT_AS.
- */
-static inline long
-expand_backing_store (struct vm_area_struct *vma, unsigned long address)
-{
-       unsigned long grow;
-
-       grow = PAGE_SIZE >> PAGE_SHIFT;
-       if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur
-           || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur))
-               return -ENOMEM;
-       vma->vm_end += PAGE_SIZE;
-       vma->vm_mm->total_vm += grow;
-       if (vma->vm_flags & VM_LOCKED)
-               vma->vm_mm->locked_vm += grow;
-       __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
-       return 0;
-}
-
 /*
  * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
  * (inside region 5, on ia64) and that page is present.
@@ -185,7 +159,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
                if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
                    || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
                        goto bad_area;
-               if (expand_backing_store(vma, address))
+               /*
+                * Since the register backing store is accessed sequentially,
+                * we disallow growing it by more than a page at a time.
+                */
+               if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+                       goto bad_area;
+               if (expand_upwards(vma, address))
                        goto bad_area;
        }
        goto good_area;
index 98246ac..e3215ba 100644 (file)
@@ -158,7 +158,7 @@ ia64_init_addr_space (void)
                vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
                vma->vm_end = vma->vm_start + PAGE_SIZE;
                vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
-               vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
+               vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT;
                down_write(&current->mm->mmap_sem);
                if (insert_vm_struct(current->mm, vma)) {
                        up_write(&current->mm->mmap_sem);
@@ -275,26 +275,21 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 
        pgd = pgd_offset_k(address);            /* note: this is NOT pgd_offset()! */
 
-       spin_lock(&init_mm.page_table_lock);
        {
                pud = pud_alloc(&init_mm, pgd, address);
                if (!pud)
                        goto out;
-
                pmd = pmd_alloc(&init_mm, pud, address);
                if (!pmd)
                        goto out;
-               pte = pte_alloc_map(&init_mm, pmd, address);
+               pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        goto out;
-               if (!pte_none(*pte)) {
-                       pte_unmap(pte);
+               if (!pte_none(*pte))
                        goto out;
-               }
                set_pte(pte, mk_pte(page, pgprot));
-               pte_unmap(pte);
        }
-  out: spin_unlock(&init_mm.page_table_lock);
+  out:
        /* no need for flush_tlb */
        return page;
 }
index c93e0f2..c79a9b9 100644 (file)
@@ -158,10 +158,12 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long
 # ifdef CONFIG_SMP
        platform_global_tlb_purge(mm, start, end, nbits);
 # else
+       preempt_disable();
        do {
                ia64_ptcl(start, (nbits<<2));
                start += (1UL << nbits);
        } while (start < end);
+       preempt_enable();
 # endif
 
        ia64_srlz_i();                  /* srlz.i implies srlz.d */
index d9a40b1..6facf15 100644 (file)
@@ -48,6 +48,8 @@ void show_mem(void)
        show_free_areas();
        printk("Free swap:       %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_pgdat(pgdat) {
+               unsigned long flags;
+               pgdat_resize_lock(pgdat, &flags);
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pgdat_page_nr(pgdat, i);
                        total++;
@@ -60,6 +62,7 @@ void show_mem(void)
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
+               pgdat_resize_unlock(pgdat, &flags);
        }
        printk("%d pages of RAM\n", total);
        printk("%d pages of HIGHMEM\n",highmem);
@@ -150,10 +153,14 @@ int __init reservedpages_count(void)
        int reservedpages, nid, i;
 
        reservedpages = 0;
-       for_each_online_node(nid)
+       for_each_online_node(nid) {
+               unsigned long flags;
+               pgdat_resize_lock(NODE_DATA(nid), &flags);
                for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
                        if (PageReserved(nid_page_nr(nid, i)))
                                reservedpages++;
+               pgdat_resize_unlock(NODE_DATA(nid), &flags);
+       }
 
        return reservedpages;
 }
index 70c5905..a151849 100644 (file)
@@ -67,7 +67,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
        if (address >= end)
                BUG();
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -90,7 +90,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd;
                pmd = pmd_alloc(&init_mm, dir, address);
@@ -104,7 +103,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return error;
 }
index ba960bb..1dd5d18 100644 (file)
@@ -388,33 +388,11 @@ config AMIGA_PCMCIA
          Include support in the kernel for pcmcia on Amiga 1200 and Amiga
          600. If you intend to use pcmcia cards say Y; otherwise say N.
 
-config STRAM_SWAP
-       bool "Support for ST-RAM as swap space"
-       depends on ATARI && BROKEN
-       ---help---
-         Some Atari 68k machines (including the 520STF and 1020STE) divide
-         their addressable memory into ST and TT sections.  The TT section
-         (up to 512MB) is the main memory; the ST section (up to 4MB) is
-         accessible to the built-in graphics board, runs slower, and is
-         present mainly for backward compatibility with older machines.
-
-         This enables support for using (parts of) ST-RAM as swap space,
-         instead of as normal system memory. This can first enhance system
-         performance if you have lots of alternate RAM (compared to the size
-         of ST-RAM), because executable code always will reside in faster
-         memory. ST-RAM will remain as ultra-fast swap space. On the other
-         hand, it allows much improved dynamic allocations of ST-RAM buffers
-         for device driver modules (e.g. floppy, ACSI, SLM printer, DMA
-         sound). The probability that such allocations at module load time
-         fail is drastically reduced.
-
 config STRAM_PROC
        bool "ST-RAM statistics in /proc"
        depends on ATARI
        help
-         Say Y here to report ST-RAM usage statistics in /proc/stram.  See
-         the help for CONFIG_STRAM_SWAP for discussion of ST-RAM and its
-         uses.
+         Say Y here to report ST-RAM usage statistics in /proc/stram.
 
 config HEARTBEAT
        bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40
index 5a3c106..22e0481 100644 (file)
 #include <linux/kdev_t.h>
 #include <linux/major.h>
 #include <linux/init.h>
-#include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
-#include <linux/shm.h>
 #include <linux/bootmem.h>
 #include <linux/mount.h>
 #include <linux/blkdev.h>
@@ -33,8 +31,6 @@
 #include <asm/io.h>
 #include <asm/semaphore.h>
 
-#include <linux/swapops.h>
-
 #undef DEBUG
 
 #ifdef DEBUG
@@ -49,8 +45,7 @@
 #include <linux/proc_fs.h>
 #endif
 
-/* Pre-swapping comments:
- *
+/*
  * ++roman:
  *
  * New version of ST-Ram buffer allocation. Instead of using the
  *
  */
 
-/*
- * New Nov 1997: Use ST-RAM as swap space!
- *
- * In the past, there were often problems with modules that require ST-RAM
- * buffers. Such drivers have to use __get_dma_pages(), which unfortunately
- * often isn't very successful in allocating more than 1 page :-( [1] The net
- * result was that most of the time you couldn't insmod such modules (ataflop,
- * ACSI, SCSI on Falcon, Atari internal framebuffer, not to speak of acsi_slm,
- * which needs a 1 MB buffer... :-).
- *
- * To overcome this limitation, ST-RAM can now be turned into a very
- * high-speed swap space. If a request for an ST-RAM buffer comes, the kernel
- * now tries to unswap some pages on that swap device to make some free (and
- * contiguous) space. This works much better in comparison to
- * __get_dma_pages(), since used swap pages can be selectively freed by either
- * moving them to somewhere else in swap space, or by reading them back into
- * system memory. Ok, there operation of unswapping isn't really cheap (for
- * each page, one has to go through the page tables of all processes), but it
- * doesn't happen that often (only when allocation ST-RAM, i.e. when loading a
- * module that needs ST-RAM). But it at least makes it possible to load such
- * modules!
- *
- * It could also be that overall system performance increases a bit due to
- * ST-RAM swapping, since slow ST-RAM isn't used anymore for holding data or
- * executing code in. It's then just a (very fast, compared to disk) back
- * storage for not-so-often needed data. (But this effect must be compared
- * with the loss of total memory...) Don't know if the effect is already
- * visible on a TT, where the speed difference between ST- and TT-RAM isn't
- * that dramatic, but it should on machines where TT-RAM is really much faster
- * (e.g. Afterburner).
- *
- *   [1]: __get_free_pages() does a fine job if you only want one page, but if
- * you want more (contiguous) pages, it can give you such a block only if
- * there's already a free one. The algorithm can't try to free buffers or swap
- * out something in order to make more free space, since all that page-freeing
- * mechanisms work "target-less", i.e. they just free something, but not in a
- * specific place. I.e., __get_free_pages() can't do anything to free
- * *adjacent* pages :-( This situation becomes even worse for DMA memory,
- * since the freeing algorithms are also blind to DMA capability of pages.
- */
-
-/* 1998-10-20: ++andreas
-   unswap_by_move disabled because it does not handle swapped shm pages.
-*/
-
-/* 2000-05-01: ++andreas
-   Integrated with bootmem.  Remove all traces of unswap_by_move.
-*/
-
-#ifdef CONFIG_STRAM_SWAP
-#define ALIGN_IF_SWAP(x)       PAGE_ALIGN(x)
-#else
-#define ALIGN_IF_SWAP(x)       (x)
-#endif
-
-/* get index of swap page at address 'addr' */
-#define SWAP_NR(addr)          (((addr) - swap_start) >> PAGE_SHIFT)
-
-/* get address of swap page #'nr' */
-#define SWAP_ADDR(nr)          (swap_start + ((nr) << PAGE_SHIFT))
-
-/* get number of pages for 'n' bytes (already page-aligned) */
-#define N_PAGES(n)                     ((n) >> PAGE_SHIFT)
-
-/* The following two numbers define the maximum fraction of ST-RAM in total
- * memory, below that the kernel would automatically use ST-RAM as swap
- * space. This decision can be overridden with stram_swap= */
-#define MAX_STRAM_FRACTION_NOM         1
-#define MAX_STRAM_FRACTION_DENOM       3
-
 /* Start and end (virtual) of ST-RAM */
 static void *stram_start, *stram_end;
 
@@ -164,10 +89,9 @@ typedef struct stram_block {
 } BLOCK;
 
 /* values for flags field */
-#define BLOCK_FREE             0x01    /* free structure in the BLOCKs pool */
+#define BLOCK_FREE     0x01    /* free structure in the BLOCKs pool */
 #define BLOCK_KMALLOCED        0x02    /* structure allocated by kmalloc() */
-#define BLOCK_GFP              0x08    /* block allocated with __get_dma_pages() */
-#define BLOCK_INSWAP   0x10    /* block allocated in swap space */
+#define BLOCK_GFP      0x08    /* block allocated with __get_dma_pages() */
 
 /* list of allocated blocks */
 static BLOCK *alloc_list;
@@ -179,60 +103,8 @@ static BLOCK *alloc_list;
 #define N_STATIC_BLOCKS        20
 static BLOCK static_blocks[N_STATIC_BLOCKS];
 
-#ifdef CONFIG_STRAM_SWAP
-/* max. number of bytes to use for swapping
- *  0 = no ST-RAM swapping
- * -1 = do swapping (to whole ST-RAM) if it's less than MAX_STRAM_FRACTION of
- *      total memory
- */
-static int max_swap_size = -1;
-
-/* start and end of swapping area */
-static void *swap_start, *swap_end;
-
-/* The ST-RAM's swap info structure */
-static struct swap_info_struct *stram_swap_info;
-
-/* The ST-RAM's swap type */
-static int stram_swap_type;
-
-/* Semaphore for get_stram_region.  */
-static DECLARE_MUTEX(stram_swap_sem);
-
-/* major and minor device number of the ST-RAM device; for the major, we use
- * the same as Amiga z2ram, which is really similar and impossible on Atari,
- * and for the minor a relatively odd number to avoid the user creating and
- * using that device. */
-#define        STRAM_MAJOR             Z2RAM_MAJOR
-#define        STRAM_MINOR             13
-
-/* Some impossible pointer value */
-#define MAGIC_FILE_P   (struct file *)0xffffdead
-
-#ifdef DO_PROC
-static unsigned stat_swap_read;
-static unsigned stat_swap_write;
-static unsigned stat_swap_force;
-#endif /* DO_PROC */
-
-#endif /* CONFIG_STRAM_SWAP */
-
 /***************************** Prototypes *****************************/
 
-#ifdef CONFIG_STRAM_SWAP
-static int swap_init(void *start_mem, void *swap_data);
-static void *get_stram_region( unsigned long n_pages );
-static void free_stram_region( unsigned long offset, unsigned long n_pages
-                              );
-static int in_some_region(void *addr);
-static unsigned long find_free_region( unsigned long n_pages, unsigned long
-                                      *total_free, unsigned long
-                                      *region_free );
-static void do_stram_request(request_queue_t *);
-static int stram_open( struct inode *inode, struct file *filp );
-static int stram_release( struct inode *inode, struct file *filp );
-static void reserve_region(void *start, void *end);
-#endif
 static BLOCK *add_region( void *addr, unsigned long size );
 static BLOCK *find_region( void *addr );
 static int remove_region( BLOCK *block );
@@ -279,84 +151,11 @@ void __init atari_stram_init(void)
  */
 void __init atari_stram_reserve_pages(void *start_mem)
 {
-#ifdef CONFIG_STRAM_SWAP
-       /* if max_swap_size is negative (i.e. no stram_swap= option given),
-        * determine at run time whether to use ST-RAM swapping */
-       if (max_swap_size < 0)
-               /* Use swapping if ST-RAM doesn't make up more than MAX_STRAM_FRACTION
-                * of total memory. In that case, the max. size is set to 16 MB,
-                * because ST-RAM can never be bigger than that.
-                * Also, never use swapping on a Hades, there's no separate ST-RAM in
-                * that machine. */
-               max_swap_size =
-                       (!MACH_IS_HADES &&
-                        (N_PAGES(stram_end-stram_start)*MAX_STRAM_FRACTION_DENOM <=
-                         ((unsigned long)high_memory>>PAGE_SHIFT)*MAX_STRAM_FRACTION_NOM)) ? 16*1024*1024 : 0;
-       DPRINTK( "atari_stram_reserve_pages: max_swap_size = %d\n", max_swap_size );
-#endif
-
        /* always reserve first page of ST-RAM, the first 2 kB are
         * supervisor-only! */
        if (!kernel_in_stram)
                reserve_bootmem (0, PAGE_SIZE);
 
-#ifdef CONFIG_STRAM_SWAP
-       {
-               void *swap_data;
-
-               start_mem = (void *) PAGE_ALIGN ((unsigned long) start_mem);
-               /* determine first page to use as swap: if the kernel is
-                  in TT-RAM, this is the first page of (usable) ST-RAM;
-                  otherwise just use the end of kernel data (= start_mem) */
-               swap_start = !kernel_in_stram ? stram_start + PAGE_SIZE : start_mem;
-               /* decrement by one page, rest of kernel assumes that first swap page
-                * is always reserved and maybe doesn't handle swp_entry == 0
-                * correctly */
-               swap_start -= PAGE_SIZE;
-               swap_end = stram_end;
-               if (swap_end-swap_start > max_swap_size)
-                       swap_end =  swap_start + max_swap_size;
-               DPRINTK( "atari_stram_reserve_pages: swapping enabled; "
-                                "swap=%p-%p\n", swap_start, swap_end);
-
-               /* reserve some amount of memory for maintainance of
-                * swapping itself: one page for each 2048 (PAGE_SIZE/2)
-                * swap pages. (2 bytes for each page) */
-               swap_data = start_mem;
-               start_mem += ((SWAP_NR(swap_end) + PAGE_SIZE/2 - 1)
-                             >> (PAGE_SHIFT-1)) << PAGE_SHIFT;
-               /* correct swap_start if necessary */
-               if (swap_start + PAGE_SIZE == swap_data)
-                       swap_start = start_mem - PAGE_SIZE;
-
-               if (!swap_init( start_mem, swap_data )) {
-                       printk( KERN_ERR "ST-RAM swap space initialization failed\n" );
-                       max_swap_size = 0;
-                       return;
-               }
-               /* reserve region for swapping meta-data */
-               reserve_region(swap_data, start_mem);
-               /* reserve swapping area itself */
-               reserve_region(swap_start + PAGE_SIZE, swap_end);
-
-               /*
-                * If the whole ST-RAM is used for swapping, there are no allocatable
-                * dma pages left. But unfortunately, some shared parts of the kernel
-                * (particularly the SCSI mid-level) call __get_dma_pages()
-                * unconditionally :-( These calls then fail, and scsi.c even doesn't
-                * check for NULL return values and just crashes. The quick fix for
-                * this (instead of doing much clean up work in the SCSI code) is to
-                * pretend all pages are DMA-able by setting mach_max_dma_address to
-                * ULONG_MAX. This doesn't change any functionality so far, since
-                * get_dma_pages() shouldn't be used on Atari anyway anymore (better
-                * use atari_stram_alloc()), and the Atari SCSI drivers don't need DMA
-                * memory. But unfortunately there's now no kind of warning (even not
-                * a NULL return value) if you use get_dma_pages() nevertheless :-(
-                * You just will get non-DMA-able memory...
-                */
-               mach_max_dma_address = 0xffffffff;
-       }
-#endif
 }
 
 void atari_stram_mem_init_hook (void)
@@ -367,7 +166,6 @@ void atari_stram_mem_init_hook (void)
 
 /*
  * This is the main public interface: somehow allocate an ST-RAM block
- * There are three strategies:
  *
  *  - If we're before mem_init(), we have to make a static allocation. The
  *    region is taken in the kernel data area (if the kernel is in ST-RAM) or
@@ -375,14 +173,9 @@ void atari_stram_mem_init_hook (void)
  *    rsvd_stram_* region. The ST-RAM is somewhere in the middle of kernel
  *    address space in the latter case.
  *
- *  - If mem_init() already has been called and ST-RAM swapping is enabled,
- *    try to get the memory from the (pseudo) swap-space, either free already
- *    or by moving some other pages out of the swap.
- *
- *  - If mem_init() already has been called, and ST-RAM swapping is not
- *    enabled, the only possibility is to try with __get_dma_pages(). This has
- *    the disadvantage that it's very hard to get more than 1 page, and it is
- *    likely to fail :-(
+ *  - If mem_init() already has been called, try with __get_dma_pages().
+ *    This has the disadvantage that it's very hard to get more than 1 page,
+ *    and it is likely to fail :-(
  *
  */
 void *atari_stram_alloc(long size, const char *owner)
@@ -393,27 +186,13 @@ void *atari_stram_alloc(long size, const char *owner)
 
        DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner);
 
-       size = ALIGN_IF_SWAP(size);
-       DPRINTK( "atari_stram_alloc: rounded size = %08lx\n", size );
-#ifdef CONFIG_STRAM_SWAP
-       if (max_swap_size) {
-               /* If swapping is active: make some free space in the swap
-                  "device". */
-               DPRINTK( "atari_stram_alloc: after mem_init, swapping ok, "
-                                "calling get_region\n" );
-               addr = get_stram_region( N_PAGES(size) );
-               flags = BLOCK_INSWAP;
-       }
-       else
-#endif
        if (!mem_init_done)
                return alloc_bootmem_low(size);
        else {
-               /* After mem_init() and no swapping: can only resort to
-                * __get_dma_pages() */
+               /* After mem_init(): can only resort to __get_dma_pages() */
                addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size));
                flags = BLOCK_GFP;
-               DPRINTK( "atari_stram_alloc: after mem_init, swapping off, "
+               DPRINTK( "atari_stram_alloc: after mem_init, "
                                 "get_pages=%p\n", addr );
        }
 
@@ -422,12 +201,7 @@ void *atari_stram_alloc(long size, const char *owner)
                        /* out of memory for BLOCK structure :-( */
                        DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- "
                                         "freeing again\n" );
-#ifdef CONFIG_STRAM_SWAP
-                       if (flags == BLOCK_INSWAP)
-                               free_stram_region( SWAP_NR(addr), N_PAGES(size) );
-                       else
-#endif
-                               free_pages((unsigned long)addr, get_order(size));
+                       free_pages((unsigned long)addr, get_order(size));
                        return( NULL );
                }
                block->owner = owner;
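
With swapping gone, the allocation path above and the matching free path in the next hunk are all that remains of the ST-RAM API. A minimal usage sketch, assuming a made-up driver name and buffer size; the only entry points used are atari_stram_alloc() and atari_stram_free() as shown in these hunks:

/* Hypothetical caller; "example-driver" and the two-page size are
 * illustrative only. */
static int example_claim_stram(void)
{
        void *buf;

        buf = atari_stram_alloc(2 * PAGE_SIZE, "example-driver");
        if (!buf)
                return -ENOMEM;         /* bootmem or __get_dma_pages() failed */

        /* ... point the DMA hardware at buf ... */

        atari_stram_free(buf);
        return 0;
}
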
@@ -451,25 +225,12 @@ void atari_stram_free( void *addr )
        DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, "
                         "flags=%02x\n", block, block->size, block->owner, block->flags );
 
-#ifdef CONFIG_STRAM_SWAP
-       if (!max_swap_size) {
-#endif
-               if (block->flags & BLOCK_GFP) {
-                       DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n",
-                               get_order(block->size));
-                       free_pages((unsigned long)addr, get_order(block->size));
-               }
-               else
-                       goto fail;
-#ifdef CONFIG_STRAM_SWAP
-       }
-       else if (block->flags & BLOCK_INSWAP) {
-               DPRINTK( "atari_stram_free: is swap-alloced\n" );
-               free_stram_region( SWAP_NR(block->start), N_PAGES(block->size) );
-       }
-       else
+       if (!(block->flags & BLOCK_GFP))
                goto fail;
-#endif
+
+       DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n",
+               get_order(block->size));
+       free_pages((unsigned long)addr, get_order(block->size));
        remove_region( block );
        return;
 
@@ -478,612 +239,6 @@ void atari_stram_free( void *addr )
                        "(called from %p)\n", addr, __builtin_return_address(0) );
 }
 
-
-#ifdef CONFIG_STRAM_SWAP
-
-\f
-/* ------------------------------------------------------------------------ */
-/*                                                Main Swapping Functions                                                      */
-/* ------------------------------------------------------------------------ */
-
-
-/*
- * Initialize ST-RAM swap device
- * (lots copied and modified from sys_swapon() in mm/swapfile.c)
- */
-static int __init swap_init(void *start_mem, void *swap_data)
-{
-       static struct dentry fake_dentry;
-       static struct vfsmount fake_vfsmnt;
-       struct swap_info_struct *p;
-       struct inode swap_inode;
-       unsigned int type;
-       void *addr;
-       int i, j, k, prev;
-
-       DPRINTK("swap_init(start_mem=%p, swap_data=%p)\n",
-               start_mem, swap_data);
-
-       /* need at least one page for swapping to (and this also isn't very
-        * much... :-) */
-       if (swap_end - swap_start < 2*PAGE_SIZE) {
-               printk( KERN_WARNING "stram_swap_init: swap space too small\n" );
-               return( 0 );
-       }
-
-       /* find free slot in swap_info */
-       for( p = swap_info, type = 0; type < nr_swapfiles; type++, p++ )
-               if (!(p->flags & SWP_USED))
-                       break;
-       if (type >= MAX_SWAPFILES) {
-               printk( KERN_WARNING "stram_swap_init: max. number of "
-                               "swap devices exhausted\n" );
-               return( 0 );
-       }
-       if (type >= nr_swapfiles)
-               nr_swapfiles = type+1;
-
-       stram_swap_info = p;
-       stram_swap_type = type;
-
-       /* fake some dir cache entries to give us some name in /dev/swaps */
-       fake_dentry.d_parent = &fake_dentry;
-       fake_dentry.d_name.name = "stram (internal)";
-       fake_dentry.d_name.len = 16;
-       fake_vfsmnt.mnt_parent = &fake_vfsmnt;
-
-       p->flags        = SWP_USED;
-       p->swap_file    = &fake_dentry;
-       p->swap_vfsmnt  = &fake_vfsmnt;
-       p->swap_map     = swap_data;
-       p->cluster_nr   = 0;
-       p->next         = -1;
-       p->prio         = 0x7ff0;       /* a rather high priority, but not the highest
-                                                                * to give the user a chance to override */
-
-       /* call stram_open() directly, avoids at least the overhead in
-        * constructing a dummy file structure... */
-       swap_inode.i_rdev = MKDEV( STRAM_MAJOR, STRAM_MINOR );
-       stram_open( &swap_inode, MAGIC_FILE_P );
-       p->max = SWAP_NR(swap_end);
-
-       /* initialize swap_map: set regions that are already allocated or belong
-        * to kernel data space to SWAP_MAP_BAD, otherwise to free */
-       j = 0; /* # of free pages */
-       k = 0; /* # of already allocated pages (from pre-mem_init stram_alloc()) */
-       p->lowest_bit = 0;
-       p->highest_bit = 0;
-       for( i = 1, addr = SWAP_ADDR(1); i < p->max;
-                i++, addr += PAGE_SIZE ) {
-               if (in_some_region( addr )) {
-                       p->swap_map[i] = SWAP_MAP_BAD;
-                       ++k;
-               }
-               else if (kernel_in_stram && addr < start_mem ) {
-                       p->swap_map[i] = SWAP_MAP_BAD;
-               }
-               else {
-                       p->swap_map[i] = 0;
-                       ++j;
-                       if (!p->lowest_bit) p->lowest_bit = i;
-                       p->highest_bit = i;
-               }
-       }
-       /* first page always reserved (and doesn't really belong to swap space) */
-       p->swap_map[0] = SWAP_MAP_BAD;
-
-       /* now swapping to this device ok */
-       p->pages = j + k;
-       swap_list_lock();
-       nr_swap_pages += j;
-       p->flags = SWP_WRITEOK;
-
-       /* insert swap space into swap_list */
-       prev = -1;
-       for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
-               if (p->prio >= swap_info[i].prio) {
-                       break;
-               }
-               prev = i;
-       }
-       p->next = i;
-       if (prev < 0) {
-               swap_list.head = swap_list.next = p - swap_info;
-       } else {
-               swap_info[prev].next = p - swap_info;
-       }
-       swap_list_unlock();
-
-       printk( KERN_INFO "Using %dk (%d pages) of ST-RAM as swap space.\n",
-                       p->pages << 2, p->pages );
-       return( 1 );
-}
-
-
-/*
- * The swap entry has been read in advance, and we return 1 to indicate
- * that the page has been used or is no longer needed.
- *
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited).  We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
- */
-static inline void unswap_pte(struct vm_area_struct * vma, unsigned long
-                             address, pte_t *dir, swp_entry_t entry,
-                             struct page *page)
-{
-       pte_t pte = *dir;
-
-       if (pte_none(pte))
-               return;
-       if (pte_present(pte)) {
-               /* If this entry is swap-cached, then page must already
-                   hold the right address for any copies in physical
-                   memory */
-               if (pte_page(pte) != page)
-                       return;
-               /* We will be removing the swap cache in a moment, so... */
-               set_pte(dir, pte_mkdirty(pte));
-               return;
-       }
-       if (pte_val(pte) != entry.val)
-               return;
-
-       DPRINTK("unswap_pte: replacing entry %08lx by new page %p",
-               entry.val, page);
-       set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-       swap_free(entry);
-       get_page(page);
-       inc_mm_counter(vma->vm_mm, rss);
-}
-
-static inline void unswap_pmd(struct vm_area_struct * vma, pmd_t *dir,
-                             unsigned long address, unsigned long size,
-                             unsigned long offset, swp_entry_t entry,
-                             struct page *page)
-{
-       pte_t * pte;
-       unsigned long end;
-
-       if (pmd_none(*dir))
-               return;
-       if (pmd_bad(*dir)) {
-               pmd_ERROR(*dir);
-               pmd_clear(dir);
-               return;
-       }
-       pte = pte_offset_kernel(dir, address);
-       offset += address & PMD_MASK;
-       address &= ~PMD_MASK;
-       end = address + size;
-       if (end > PMD_SIZE)
-               end = PMD_SIZE;
-       do {
-               unswap_pte(vma, offset+address-vma->vm_start, pte, entry, page);
-               address += PAGE_SIZE;
-               pte++;
-       } while (address < end);
-}
-
-static inline void unswap_pgd(struct vm_area_struct * vma, pgd_t *dir,
-                             unsigned long address, unsigned long size,
-                             swp_entry_t entry, struct page *page)
-{
-       pmd_t * pmd;
-       unsigned long offset, end;
-
-       if (pgd_none(*dir))
-               return;
-       if (pgd_bad(*dir)) {
-               pgd_ERROR(*dir);
-               pgd_clear(dir);
-               return;
-       }
-       pmd = pmd_offset(dir, address);
-       offset = address & PGDIR_MASK;
-       address &= ~PGDIR_MASK;
-       end = address + size;
-       if (end > PGDIR_SIZE)
-               end = PGDIR_SIZE;
-       do {
-               unswap_pmd(vma, pmd, address, end - address, offset, entry,
-                          page);
-               address = (address + PMD_SIZE) & PMD_MASK;
-               pmd++;
-       } while (address < end);
-}
-
-static void unswap_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-                      swp_entry_t entry, struct page *page)
-{
-       unsigned long start = vma->vm_start, end = vma->vm_end;
-
-       do {
-               unswap_pgd(vma, pgdir, start, end - start, entry, page);
-               start = (start + PGDIR_SIZE) & PGDIR_MASK;
-               pgdir++;
-       } while (start < end);
-}
-
-static void unswap_process(struct mm_struct * mm, swp_entry_t entry,
-                          struct page *page)
-{
-       struct vm_area_struct* vma;
-
-       /*
-        * Go through process' page directory.
-        */
-       if (!mm)
-               return;
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-               unswap_vma(vma, pgd, entry, page);
-       }
-}
-
-
-static int unswap_by_read(unsigned short *map, unsigned long max,
-                         unsigned long start, unsigned long n_pages)
-{
-       struct task_struct *p;
-       struct page *page;
-       swp_entry_t entry;
-       unsigned long i;
-
-       DPRINTK( "unswapping %lu..%lu by reading in\n",
-                        start, start+n_pages-1 );
-
-       for( i = start; i < start+n_pages; ++i ) {
-               if (map[i] == SWAP_MAP_BAD) {
-                       printk( KERN_ERR "get_stram_region: page %lu already "
-                                       "reserved??\n", i );
-                       continue;
-               }
-
-               if (map[i]) {
-                       entry = swp_entry(stram_swap_type, i);
-                       DPRINTK("unswap: map[i=%lu]=%u nr_swap=%ld\n",
-                               i, map[i], nr_swap_pages);
-
-                       swap_device_lock(stram_swap_info);
-                       map[i]++;
-                       swap_device_unlock(stram_swap_info);
-                       /* Get a page for the entry, using the existing
-                          swap cache page if there is one.  Otherwise,
-                          get a clean page and read the swap into it. */
-                       page = read_swap_cache_async(entry, NULL, 0);
-                       if (!page) {
-                               swap_free(entry);
-                               return -ENOMEM;
-                       }
-                       read_lock(&tasklist_lock);
-                       for_each_process(p)
-                               unswap_process(p->mm, entry, page);
-                       read_unlock(&tasklist_lock);
-                       shmem_unuse(entry, page);
-                       /* Now get rid of the extra reference to the
-                          temporary page we've been using. */
-                       if (PageSwapCache(page))
-                               delete_from_swap_cache(page);
-                       __free_page(page);
-       #ifdef DO_PROC
-                       stat_swap_force++;
-       #endif
-               }
-
-               DPRINTK( "unswap: map[i=%lu]=%u nr_swap=%ld\n",
-                                i, map[i], nr_swap_pages );
-               swap_list_lock();
-               swap_device_lock(stram_swap_info);
-               map[i] = SWAP_MAP_BAD;
-               if (stram_swap_info->lowest_bit == i)
-                       stram_swap_info->lowest_bit++;
-               if (stram_swap_info->highest_bit == i)
-                       stram_swap_info->highest_bit--;
-               --nr_swap_pages;
-               swap_device_unlock(stram_swap_info);
-               swap_list_unlock();
-       }
-
-       return 0;
-}
-
-/*
- * reserve a region in ST-RAM swap space for an allocation
- */
-static void *get_stram_region( unsigned long n_pages )
-{
-       unsigned short *map = stram_swap_info->swap_map;
-       unsigned long max = stram_swap_info->max;
-       unsigned long start, total_free, region_free;
-       int err;
-       void *ret = NULL;
-
-       DPRINTK( "get_stram_region(n_pages=%lu)\n", n_pages );
-
-       down(&stram_swap_sem);
-
-       /* disallow writing to the swap device now */
-       stram_swap_info->flags = SWP_USED;
-
-       /* find a region of n_pages pages in the swap space including as much free
-        * pages as possible (and excluding any already-reserved pages). */
-       if (!(start = find_free_region( n_pages, &total_free, &region_free )))
-               goto end;
-       DPRINTK( "get_stram_region: region starts at %lu, has %lu free pages\n",
-                        start, region_free );
-
-       err = unswap_by_read(map, max, start, n_pages);
-       if (err)
-               goto end;
-
-       ret = SWAP_ADDR(start);
-  end:
-       /* allow using swap device again */
-       stram_swap_info->flags = SWP_WRITEOK;
-       up(&stram_swap_sem);
-       DPRINTK( "get_stram_region: returning %p\n", ret );
-       return( ret );
-}
-
-
-/*
- * free a reserved region in ST-RAM swap space
- */
-static void free_stram_region( unsigned long offset, unsigned long n_pages )
-{
-       unsigned short *map = stram_swap_info->swap_map;
-
-       DPRINTK( "free_stram_region(offset=%lu,n_pages=%lu)\n", offset, n_pages );
-
-       if (offset < 1 || offset + n_pages > stram_swap_info->max) {
-               printk( KERN_ERR "free_stram_region: Trying to free non-ST-RAM\n" );
-               return;
-       }
-
-       swap_list_lock();
-       swap_device_lock(stram_swap_info);
-       /* un-reserve the freed pages */
-       for( ; n_pages > 0; ++offset, --n_pages ) {
-               if (map[offset] != SWAP_MAP_BAD)
-                       printk( KERN_ERR "free_stram_region: Swap page %lu was not "
-                                       "reserved\n", offset );
-               map[offset] = 0;
-       }
-
-       /* update swapping meta-data */
-       if (offset < stram_swap_info->lowest_bit)
-               stram_swap_info->lowest_bit = offset;
-       if (offset+n_pages-1 > stram_swap_info->highest_bit)
-               stram_swap_info->highest_bit = offset+n_pages-1;
-       if (stram_swap_info->prio > swap_info[swap_list.next].prio)
-               swap_list.next = swap_list.head;
-       nr_swap_pages += n_pages;
-       swap_device_unlock(stram_swap_info);
-       swap_list_unlock();
-}
-
-\f
-/* ------------------------------------------------------------------------ */
-/*                                             Utility Functions for Swapping                                          */
-/* ------------------------------------------------------------------------ */
-
-
-/* is addr in some of the allocated regions? */
-static int in_some_region(void *addr)
-{
-       BLOCK *p;
-
-       for( p = alloc_list; p; p = p->next ) {
-               if (p->start <= addr && addr < p->start + p->size)
-                       return( 1 );
-       }
-       return( 0 );
-}
-
-
-static unsigned long find_free_region(unsigned long n_pages,
-                                     unsigned long *total_free,
-                                     unsigned long *region_free)
-{
-       unsigned short *map = stram_swap_info->swap_map;
-       unsigned long max = stram_swap_info->max;
-       unsigned long head, tail, max_start;
-       long nfree, max_free;
-
-       /* first scan the swap space for a suitable place for the allocation */
-       head = 1;
-       max_start = 0;
-       max_free = -1;
-       *total_free = 0;
-
-  start_over:
-       /* increment tail until final window size reached, and count free pages */
-       nfree = 0;
-       for( tail = head; tail-head < n_pages && tail < max; ++tail ) {
-               if (map[tail] == SWAP_MAP_BAD) {
-                       head = tail+1;
-                       goto start_over;
-               }
-               if (!map[tail]) {
-                       ++nfree;
-                       ++*total_free;
-               }
-       }
-       if (tail-head < n_pages)
-               goto out;
-       if (nfree > max_free) {
-               max_start = head;
-               max_free  = nfree;
-               if (max_free >= n_pages)
-                       /* don't need more free pages... :-) */
-                       goto out;
-       }
-
-       /* now shift the window and look for the area where as much pages as
-        * possible are free */
-       while( tail < max ) {
-               nfree -= (map[head++] == 0);
-               if (map[tail] == SWAP_MAP_BAD) {
-                       head = tail+1;
-                       goto start_over;
-               }
-               if (!map[tail]) {
-                       ++nfree;
-                       ++*total_free;
-               }
-               ++tail;
-               if (nfree > max_free) {
-                       max_start = head;
-                       max_free  = nfree;
-                       if (max_free >= n_pages)
-                               /* don't need more free pages... :-) */
-                               goto out;
-               }
-       }
-
-  out:
-       if (max_free < 0) {
-               printk( KERN_NOTICE "get_stram_region: ST-RAM too full or fragmented "
-                               "-- can't allocate %lu pages\n", n_pages );
-               return( 0 );
-       }
-
-       *region_free = max_free;
-       return( max_start );
-}
-
-
-/* setup parameters from command line */
-void __init stram_swap_setup(char *str, int *ints)
-{
-       if (ints[0] >= 1)
-               max_swap_size = ((ints[1] < 0 ? 0 : ints[1]) * 1024) & PAGE_MASK;
-}
-
-\f
-/* ------------------------------------------------------------------------ */
-/*                                                             ST-RAM device                                                           */
-/* ------------------------------------------------------------------------ */
-
-static int refcnt;
-
-static void do_stram_request(request_queue_t *q)
-{
-       struct request *req;
-
-       while ((req = elv_next_request(q)) != NULL) {
-               void *start = swap_start + (req->sector << 9);
-               unsigned long len = req->current_nr_sectors << 9;
-               if ((start + len) > swap_end) {
-                       printk( KERN_ERR "stram: bad access beyond end of device: "
-                                       "block=%ld, count=%d\n",
-                                       req->sector,
-                                       req->current_nr_sectors );
-                       end_request(req, 0);
-                       continue;
-               }
-
-               if (req->cmd == READ) {
-                       memcpy(req->buffer, start, len);
-#ifdef DO_PROC
-                       stat_swap_read += N_PAGES(len);
-#endif
-               }
-               else {
-                       memcpy(start, req->buffer, len);
-#ifdef DO_PROC
-                       stat_swap_write += N_PAGES(len);
-#endif
-               }
-               end_request(req, 1);
-       }
-}
-
-
-static int stram_open( struct inode *inode, struct file *filp )
-{
-       if (filp != MAGIC_FILE_P) {
-               printk( KERN_NOTICE "Only kernel can open ST-RAM device\n" );
-               return( -EPERM );
-       }
-       if (refcnt)
-               return( -EBUSY );
-       ++refcnt;
-       return( 0 );
-}
-
-static int stram_release( struct inode *inode, struct file *filp )
-{
-       if (filp != MAGIC_FILE_P) {
-               printk( KERN_NOTICE "Only kernel can close ST-RAM device\n" );
-               return( -EPERM );
-       }
-       if (refcnt > 0)
-               --refcnt;
-       return( 0 );
-}
-
-
-static struct block_device_operations stram_fops = {
-       .open =         stram_open,
-       .release =      stram_release,
-};
-
-static struct gendisk *stram_disk;
-static struct request_queue *stram_queue;
-static DEFINE_SPINLOCK(stram_lock);
-
-int __init stram_device_init(void)
-{
-       if (!MACH_IS_ATARI)
-               /* no point in initializing this, I hope */
-               return -ENXIO;
-
-       if (!max_swap_size)
-               /* swapping not enabled */
-               return -ENXIO;
-       stram_disk = alloc_disk(1);
-       if (!stram_disk)
-               return -ENOMEM;
-
-       if (register_blkdev(STRAM_MAJOR, "stram")) {
-               put_disk(stram_disk);
-               return -ENXIO;
-       }
-
-       stram_queue = blk_init_queue(do_stram_request, &stram_lock);
-       if (!stram_queue) {
-               unregister_blkdev(STRAM_MAJOR, "stram");
-               put_disk(stram_disk);
-               return -ENOMEM;
-       }
-
-       stram_disk->major = STRAM_MAJOR;
-       stram_disk->first_minor = STRAM_MINOR;
-       stram_disk->fops = &stram_fops;
-       stram_disk->queue = stram_queue;
-       sprintf(stram_disk->disk_name, "stram");
-       set_capacity(stram_disk, (swap_end - swap_start)/512);
-       add_disk(stram_disk);
-       return 0;
-}
-
-
-\f
-/* ------------------------------------------------------------------------ */
-/*                                                     Misc Utility Functions                                                  */
-/* ------------------------------------------------------------------------ */
-
-/* reserve a range of pages */
-static void reserve_region(void *start, void *end)
-{
-       reserve_bootmem (virt_to_phys(start), end - start);
-}
-
-#endif /* CONFIG_STRAM_SWAP */
-
 \f
 /* ------------------------------------------------------------------------ */
 /*                                                       Region Management                                                             */
@@ -1173,50 +328,9 @@ int get_stram_list( char *buf )
 {
        int len = 0;
        BLOCK *p;
-#ifdef CONFIG_STRAM_SWAP
-       int i;
-       unsigned short *map = stram_swap_info->swap_map;
-       unsigned long max = stram_swap_info->max;
-       unsigned free = 0, used = 0, rsvd = 0;
-#endif
 
-#ifdef CONFIG_STRAM_SWAP
-       if (max_swap_size) {
-               for( i = 1; i < max; ++i ) {
-                       if (!map[i])
-                               ++free;
-                       else if (map[i] == SWAP_MAP_BAD)
-                               ++rsvd;
-                       else
-                               ++used;
-               }
-               PRINT_PROC(
-                       "Total ST-RAM:      %8u kB\n"
-                       "Total ST-RAM swap: %8lu kB\n"
-                       "Free swap:         %8u kB\n"
-                       "Used swap:         %8u kB\n"
-                       "Allocated swap:    %8u kB\n"
-                       "Swap Reads:        %8u\n"
-                       "Swap Writes:       %8u\n"
-                       "Swap Forced Reads: %8u\n",
-                       (stram_end - stram_start) >> 10,
-                       (max-1) << (PAGE_SHIFT-10),
-                       free << (PAGE_SHIFT-10),
-                       used << (PAGE_SHIFT-10),
-                       rsvd << (PAGE_SHIFT-10),
-                       stat_swap_read,
-                       stat_swap_write,
-                       stat_swap_force );
-       }
-       else {
-#endif
-               PRINT_PROC( "ST-RAM swapping disabled\n" );
-               PRINT_PROC("Total ST-RAM:      %8u kB\n",
+       PRINT_PROC("Total ST-RAM:      %8u kB\n",
                           (stram_end - stram_start) >> 10);
-#ifdef CONFIG_STRAM_SWAP
-       }
-#endif
-
        PRINT_PROC( "Allocated regions:\n" );
        for( p = alloc_list; p; p = p->next ) {
                if (len + 50 >= PAGE_SIZE)
@@ -1227,8 +341,6 @@ int get_stram_list( char *buf )
                           p->owner);
                if (p->flags & BLOCK_GFP)
                        PRINT_PROC( "page-alloced)\n" );
-               else if (p->flags & BLOCK_INSWAP)
-                       PRINT_PROC( "in swap)\n" );
                else
                        PRINT_PROC( "??)\n" );
        }
index 5dcb3fa..fe2383e 100644 (file)
@@ -201,7 +201,7 @@ void *__ioremap(unsigned long physaddr, unsigned long size, int cacheflag)
                        virtaddr += PTRTREESIZE;
                        size -= PTRTREESIZE;
                } else {
-                       pte_dir = pte_alloc_kernel(&init_mm, pmd_dir, virtaddr);
+                       pte_dir = pte_alloc_kernel(pmd_dir, virtaddr);
                        if (!pte_dir) {
                                printk("ioremap: no mem for pte_dir\n");
                                return NULL;
index 32e55ad..117481e 100644 (file)
@@ -116,7 +116,7 @@ inline int dvma_map_cpu(unsigned long kaddr,
                        pte_t *pte;
                        unsigned long end3;
 
-                       if((pte = pte_alloc_kernel(&init_mm, pmd, vaddr)) == NULL) {
+                       if((pte = pte_alloc_kernel(pmd, vaddr)) == NULL) {
                                ret = -ENOMEM;
                                goto out;
                        }
index 99262fe..7ce34d4 100644 (file)
@@ -697,7 +697,6 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        /* Do this so that we can load the interpreter, if need be.  We will
         * change some of these later.
         */
-       set_mm_counter(current->mm, rss, 0);
        setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
        current->mm->start_stack = bprm->p;
 
index 9c44ca7..3101d1d 100644 (file)
@@ -55,7 +55,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
        if (address >= end)
                BUG();
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -77,7 +77,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pud_t *pud;
                pmd_t *pmd;
@@ -96,7 +95,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return error;
 }
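
The hunks above and the matching ones in the other ioremap implementations below all make the same two changes: pte_alloc_kernel() no longer takes &init_mm, and the explicit init_mm.page_table_lock critical section around the kernel page-table walk is dropped. A condensed sketch of the resulting idiom for mapping one kernel page, assuming an architecture like these where the pud level is folded; the function name is illustrative:

static int example_map_kernel_page(unsigned long vaddr, unsigned long pfn,
                                   pgprot_t prot)
{
        pgd_t *dir = pgd_offset_k(vaddr);
        pmd_t *pmd;
        pte_t *pte;

        /* No spin_lock(&init_mm.page_table_lock) around any of this now. */
        pmd = pmd_alloc(&init_mm, dir, vaddr);
        if (!pmd)
                return -ENOMEM;
        pte = pte_alloc_kernel(pmd, vaddr);     /* note: no &init_mm argument */
        if (!pte)
                return -ENOMEM;
        set_pte_at(&init_mm, vaddr, pte, pfn_pte(pfn, prot));
        return 0;
}
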
index e15f09e..a065349 100644 (file)
@@ -270,7 +270,6 @@ void flush_dcache_page(struct page *page)
        unsigned long offset;
        unsigned long addr;
        pgoff_t pgoff;
-       pte_t *pte;
        unsigned long pfn = page_to_pfn(page);
 
 
@@ -301,21 +300,16 @@ void flush_dcache_page(struct page *page)
                 * taking a page fault if the pte doesn't exist.
                 * This is just for speed.  If the page translation
                 * isn't there, there's no point exciting the
-                * nadtlb handler into a nullification frenzy */
-
-
-               if(!(pte = translation_exists(mpnt, addr)))
-                       continue;
-
-               /* make sure we really have this page: the private
+                * nadtlb handler into a nullification frenzy.
+                *
+                * Make sure we really have this page: the private
                 * mappings may cover this area but have COW'd this
-                * particular page */
-               if(pte_pfn(*pte) != pfn)
-                       continue;
-
-               __flush_cache_page(mpnt, addr);
-
-               break;
+                * particular page.
+                */
+               if (translation_exists(mpnt, addr, pfn)) {
+                       __flush_cache_page(mpnt, addr);
+                       break;
+               }
        }
        flush_dcache_mmap_unlock(mapping);
 }
index ae6213d..f94a02e 100644 (file)
@@ -114,7 +114,7 @@ static inline int map_pmd_uncached(pmd_t * pmd, unsigned long vaddr,
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, vaddr);
+               pte_t * pte = pte_alloc_kernel(pmd, vaddr);
                if (!pte)
                        return -ENOMEM;
                if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr))
index 2886ad7..29b998e 100644 (file)
@@ -505,7 +505,9 @@ void show_mem(void)
 
                for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
                        struct page *p;
+                       unsigned long flags;
 
+                       pgdat_resize_lock(NODE_DATA(i), &flags);
                        p = nid_page_nr(i, j) - node_start_pfn(i);
 
                        total++;
@@ -517,6 +519,7 @@ void show_mem(void)
                                free++;
                        else
                                shared += page_count(p) - 1;
+                       pgdat_resize_unlock(NODE_DATA(i), &flags);
                }
        }
 #endif
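
pgdat_resize_lock()/pgdat_resize_unlock() are new with memory hotplug: they pin a node's page range while it is walked, since hot-add can now grow it. This hunk takes the lock around each page lookup; the show_mem() and mem_init() hunks further down take it once per node, roughly as in this sketch (the helper and counter are illustrative):

static unsigned long example_count_reserved(void)
{
        struct pglist_data *pgdat;
        unsigned long i, flags, reserved = 0;

        for_each_pgdat(pgdat) {
                pgdat_resize_lock(pgdat, &flags);
                for (i = 0; i < pgdat->node_spanned_pages; i++) {
                        struct page *page = pgdat_page_nr(pgdat, i);

                        if (PageReserved(page))
                                reserved++;
                }
                pgdat_resize_unlock(pgdat, &flags);
        }
        return reserved;
}
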
index f2df502..5c7a1b3 100644 (file)
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
        if (address >= end)
                BUG();
        do {
-               pte_t * pte = pte_alloc_kernel(NULL, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -75,10 +75,9 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd;
-               pmd = pmd_alloc(dir, address);
+               pmd = pmd_alloc(&init_mm, dir, address);
                error = -ENOMEM;
                if (!pmd)
                        break;
@@ -89,7 +88,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return error;
 }
index 0f710d2..685fd0d 100644 (file)
@@ -335,8 +335,6 @@ static int __init dma_alloc_init(void)
        pte_t *pte;
        int ret = 0;
 
-       spin_lock(&init_mm.page_table_lock);
-
        do {
                pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
                pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -347,7 +345,7 @@ static int __init dma_alloc_init(void)
                }
                WARN_ON(!pmd_none(*pmd));
 
-               pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE);
+               pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
                if (!pte) {
                        printk(KERN_ERR "%s: no pte tables\n", __func__);
                        ret = -ENOMEM;
@@ -357,8 +355,6 @@ static int __init dma_alloc_init(void)
                consistent_pte = pte;
        } while (0);
 
-       spin_unlock(&init_mm.page_table_lock);
-
        return ret;
 }
 
index b7bcbc2..4d006aa 100644 (file)
@@ -110,13 +110,11 @@ unsigned long __init mmu_mapin_ram(void)
                pmd_t *pmdp;
                unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
-               spin_lock(&init_mm.page_table_lock);
                pmdp = pmd_offset(pgd_offset_k(v), v);
                pmd_val(*pmdp++) = val;
                pmd_val(*pmdp++) = val;
                pmd_val(*pmdp++) = val;
                pmd_val(*pmdp++) = val;
-               spin_unlock(&init_mm.page_table_lock);
 
                v += LARGE_PAGE_SIZE_16M;
                p += LARGE_PAGE_SIZE_16M;
@@ -127,10 +125,8 @@ unsigned long __init mmu_mapin_ram(void)
                pmd_t *pmdp;
                unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
-               spin_lock(&init_mm.page_table_lock);
                pmdp = pmd_offset(pgd_offset_k(v), v);
                pmd_val(*pmdp) = val;
-               spin_unlock(&init_mm.page_table_lock);
 
                v += LARGE_PAGE_SIZE_4M;
                p += LARGE_PAGE_SIZE_4M;
index 43505b1..6ea9185 100644 (file)
@@ -280,18 +280,16 @@ map_page(unsigned long va, phys_addr_t pa, int flags)
        pte_t *pg;
        int err = -ENOMEM;
 
-       spin_lock(&init_mm.page_table_lock);
        /* Use upper 10 bits of VA to index the first level map */
        pd = pmd_offset(pgd_offset_k(va), va);
        /* Use middle 10 bits of VA to index the second-level map */
-       pg = pte_alloc_kernel(&init_mm, pd, va);
+       pg = pte_alloc_kernel(pd, va);
        if (pg != 0) {
                err = 0;
                set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags)));
                if (mem_init_done)
                        flush_HPTE(0, va, pmd_val(*pd));
        }
-       spin_unlock(&init_mm.page_table_lock);
        return err;
 }
 
index efa985f..4aacf52 100644 (file)
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
                return NOPAGE_SIGBUS;
 
        /*
-        * Last page is systemcfg, special handling here, no get_page() a
-        * this is a reserved page
+        * Last page is systemcfg.
         */
        if ((vma->vm_end - address) <= PAGE_SIZE)
-               return virt_to_page(systemcfg);
+               pg = virt_to_page(systemcfg);
+       else
+               pg = virt_to_page(vbase + offset);
 
-       pg = virt_to_page(vbase + offset);
        get_page(pg);
        DBG(" ->page count: %d\n", page_count(pg));
 
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
         * gettimeofday will be totally dead. It's fine to use that for setting
         * breakpoints in the vDSO code pages though
         */
-       vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+       vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
        vma->vm_flags |= mm->def_flags;
        vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
        vma->vm_ops = &vdso_vmops;
@@ -603,6 +603,8 @@ void __init vdso_init(void)
                ClearPageReserved(pg);
                get_page(pg);
        }
+
+       get_page(virt_to_page(systemcfg));
 }
 
 int in_gate_area_no_task(unsigned long addr)
index c65b87b..f4ca29c 100644 (file)
@@ -300,12 +300,7 @@ void im_free(void * addr)
        for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
                if (tmp->addr == addr) {
                        *p = tmp->next;
-
-                       /* XXX: do we need the lock? */
-                       spin_lock(&init_mm.page_table_lock);
                        unmap_vm_area(tmp);
-                       spin_unlock(&init_mm.page_table_lock);
-
                        kfree(tmp);
                        up(&imlist_sem);
                        return;
index be64b15..e2bd777 100644 (file)
@@ -104,6 +104,8 @@ void show_mem(void)
        show_free_areas();
        printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_pgdat(pgdat) {
+               unsigned long flags;
+               pgdat_resize_lock(pgdat, &flags);
                for (i = 0; i < pgdat->node_spanned_pages; i++) {
                        page = pgdat_page_nr(pgdat, i);
                        total++;
@@ -114,6 +116,7 @@ void show_mem(void)
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
+               pgdat_resize_unlock(pgdat, &flags);
        }
        printk("%ld pages of RAM\n", total);
        printk("%ld reserved pages\n", reserved);
@@ -155,7 +158,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
        unsigned long vsid;
 
        if (mem_init_done) {
-               spin_lock(&init_mm.page_table_lock);
                pgdp = pgd_offset_k(ea);
                pudp = pud_alloc(&init_mm, pgdp, ea);
                if (!pudp)
@@ -163,12 +165,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
                pmdp = pmd_alloc(&init_mm, pudp, ea);
                if (!pmdp)
                        return -ENOMEM;
-               ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
+               ptep = pte_alloc_kernel(pmdp, ea);
                if (!ptep)
                        return -ENOMEM;
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
                                                          __pgprot(flags)));
-               spin_unlock(&init_mm.page_table_lock);
        } else {
                unsigned long va, vpn, hash, hpteg;
 
@@ -649,11 +650,14 @@ void __init mem_init(void)
 #endif
 
        for_each_pgdat(pgdat) {
+               unsigned long flags;
+               pgdat_resize_lock(pgdat, &flags);
                for (i = 0; i < pgdat->node_spanned_pages; i++) {
                        page = pgdat_page_nr(pgdat, i);
                        if (PageReserved(page))
                                reservedpages++;
                }
+               pgdat_resize_unlock(pgdat, &flags);
        }
 
        codesize = (unsigned long)&_etext - (unsigned long)&_stext;
@@ -867,3 +871,80 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
        return vma_prot;
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+       ClearPageReserved(page);
+       free_cold_page(page);
+       totalram_pages++;
+       num_physpages++;
+}
+
+/*
+ * This works only for the non-NUMA case.  Later, we'll need a lookup
+ * to convert from real physical addresses to nid, that doesn't use
+ * pfn_to_nid().
+ */
+int __devinit add_memory(u64 start, u64 size)
+{
+       struct pglist_data *pgdata = NODE_DATA(0);
+       struct zone *zone;
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+
+       /* this should work for most non-highmem platforms */
+       zone = pgdata->node_zones;
+
+       return __add_pages(zone, start_pfn, nr_pages);
+}
+
+/*
+ * This first pass only checks whether the remove request falls within
+ * the RMO, and refuses removal there.
+ */
+int __devinit remove_memory(u64 start, u64 size)
+{
+       struct zone *zone;
+       unsigned long start_pfn, end_pfn, nr_pages;
+
+       start_pfn = start >> PAGE_SHIFT;
+       nr_pages = size >> PAGE_SHIFT;
+       end_pfn = start_pfn + nr_pages;
+
+       printk("%s(): Attempting to remove memoy in range "
+                       "%lx to %lx\n", __func__, start, start+size);
+       /*
+        * check for range within RMO
+        */
+       zone = page_zone(pfn_to_page(start_pfn));
+
+       printk("%s(): memory will be removed from "
+                       "the %s zone\n", __func__, zone->name);
+
+       /*
+        * not handling removing memory ranges that
+        * overlap multiple zones yet
+        */
+       if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages))
+               goto overlap;
+
+       /* make sure it is NOT in RMO */
+       if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) {
+               printk("%s(): range to be removed must NOT be in RMO!\n",
+                       __func__);
+               goto in_rmo;
+       }
+
+       return __remove_pages(zone, start_pfn, nr_pages);
+
+overlap:
+       printk("%s(): memory range to be removed overlaps "
+               "multiple zones!!!\n", __func__);
+in_rmo:
+       return -1;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
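
online_page(), add_memory() and remove_memory() above are the arch half of the memory-hotplug interface; callers hand them a physical range discovered at runtime. A hedged sketch of such a caller; the start address and size are made up, and real callers take the range from firmware rather than constants:

static int __devinit example_hot_add(void)
{
        /* Hypothetical hot-add of a 128 MB range starting at 4 GB. */
        u64 start = 0x100000000ULL;
        u64 size  = 128ULL << 20;
        int err;

        err = add_memory(start, size);
        if (err)
                printk(KERN_ERR "example_hot_add: add_memory failed: %d\n", err);
        return err;
}
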
index c6c39d8..0f6e9ec 100644 (file)
@@ -58,7 +58,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
        if (address >= end)
                BUG();
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -80,7 +80,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd;
                pmd = pmd_alloc(&init_mm, dir, address);
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return 0;
 }
index 7abba21..775f86c 100644 (file)
@@ -194,10 +194,13 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
                               unsigned long address)
 {
        unsigned long addrmax = P4SEG;
-       pgd_t *dir;
+       pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;
        pte_t entry;
+       struct mm_struct *mm;
+       spinlock_t *ptl;
+       int ret = 1;
 
 #ifdef CONFIG_SH_KGDB
        if (kgdb_nofault && kgdb_bus_err_hook)
@@ -208,28 +211,28 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
        addrmax = P4SEG_STORE_QUE + 0x04000000;
 #endif
 
-       if (address >= P3SEG && address < addrmax)
-               dir = pgd_offset_k(address);
-       else if (address >= TASK_SIZE)
+       if (address >= P3SEG && address < addrmax) {
+               pgd = pgd_offset_k(address);
+               mm = NULL;
+       } else if (address >= TASK_SIZE)
                return 1;
-       else if (!current->mm)
+       else if (!(mm = current->mm))
                return 1;
        else
-               dir = pgd_offset(current->mm, address);
+               pgd = pgd_offset(mm, address);
 
-       pmd = pmd_offset(dir, address);
-       if (pmd_none(*pmd))
-               return 1;
-       if (pmd_bad(*pmd)) {
-               pmd_ERROR(*pmd);
-               pmd_clear(pmd);
+       pmd = pmd_offset(pgd, address);
+       if (pmd_none_or_clear_bad(pmd))
                return 1;
-       }
-       pte = pte_offset_kernel(pmd, address);
+       if (mm)
+               pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       else
+               pte = pte_offset_kernel(pmd, address);
+
        entry = *pte;
        if (pte_none(entry) || pte_not_present(entry)
            || (writeaccess && !pte_write(entry)))
-               return 1;
+               goto unlock;
 
        if (writeaccess)
                entry = pte_mkdirty(entry);
@@ -251,8 +254,11 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 
        set_pte(pte, entry);
        update_mmu_cache(NULL, address, entry);
-
-       return 0;
+       ret = 0;
+unlock:
+       if (mm)
+               pte_unmap_unlock(pte, ptl);
+       return ret;
 }
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
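
The sh fault path above is one of many sites in this merge converted from mm->page_table_lock to the per-page-table pte lock: pte_offset_map_lock() maps the pte page and takes the lock guarding it, and pte_unmap_unlock() releases both. A minimal sketch of the pairing for one user address, mirroring the code above where pgd_offset() feeds pmd_offset() directly; the helper name is illustrative:

static int example_pte_present(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;
        int present;

        pmd = pmd_offset(pgd, addr);
        if (pmd_none_or_clear_bad(pmd))
                return 0;

        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        present = pte_present(*pte);
        pte_unmap_unlock(pte, ptl);
        return present;
}
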
index 95bb1a6..6b7a768 100644 (file)
@@ -54,8 +54,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
-#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry)
 {
index 9f490c2..e794e27 100644 (file)
@@ -57,7 +57,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
        if (address >= end)
                BUG();
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -79,7 +79,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd;
                pmd = pmd_alloc(&init_mm, dir, address);
@@ -93,7 +92,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return error;
 }
index 3b87e25..c0c1b21 100644 (file)
@@ -584,32 +584,36 @@ static void sh64_dcache_purge_phy_page(unsigned long paddr)
        }
 }
 
-static void sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr)
+static void sh64_dcache_purge_user_pages(struct mm_struct *mm,
+                               unsigned long addr, unsigned long end)
 {
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;
        pte_t entry;
+       spinlock_t *ptl;
        unsigned long paddr;
 
-       /* NOTE : all the callers of this have mm->page_table_lock held, so the
-          following page table traversal is safe even on SMP/pre-emptible. */
-
-       if (!mm) return; /* No way to find physical address of page */
-       pgd = pgd_offset(mm, eaddr);
-       if (pgd_bad(*pgd)) return;
-
-       pmd = pmd_offset(pgd, eaddr);
-       if (pmd_none(*pmd) || pmd_bad(*pmd)) return;
-
-       pte = pte_offset_kernel(pmd, eaddr);
-       entry = *pte;
-       if (pte_none(entry) || !pte_present(entry)) return;
-
-       paddr = pte_val(entry) & PAGE_MASK;
-
-       sh64_dcache_purge_coloured_phy_page(paddr, eaddr);
-
+       if (!mm)
+               return; /* No way to find physical address of page */
+
+       pgd = pgd_offset(mm, addr);
+       if (pgd_bad(*pgd))
+               return;
+
+       pmd = pmd_offset(pgd, addr);
+       if (pmd_none(*pmd) || pmd_bad(*pmd))
+               return;
+
+       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       do {
+               entry = *pte;
+               if (pte_none(entry) || !pte_present(entry))
+                       continue;
+               paddr = pte_val(entry) & PAGE_MASK;
+               sh64_dcache_purge_coloured_phy_page(paddr, addr);
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+       pte_unmap_unlock(pte - 1, ptl);
 }
 /****************************************************************************/
 
@@ -668,7 +672,7 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
        int n_pages;
 
        n_pages = ((end - start) >> PAGE_SHIFT);
-       if (n_pages >= 64) {
+       if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) {
 #if 1
                sh64_dcache_purge_all();
 #else
@@ -707,20 +711,10 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
                }
 #endif
        } else {
-               /* 'Small' range */
-               unsigned long aligned_start;
-               unsigned long eaddr;
-               unsigned long last_page_start;
-
-               aligned_start = start & PAGE_MASK;
-               /* 'end' is 1 byte beyond the end of the range */
-               last_page_start = (end - 1) & PAGE_MASK;
-
-               eaddr = aligned_start;
-               while (eaddr <= last_page_start) {
-                       sh64_dcache_purge_user_page(mm, eaddr);
-                       eaddr += PAGE_SIZE;
-               }
+               /* Small range, covered by a single page table page */
+               start &= PAGE_MASK;     /* should already be so */
+               end = PAGE_ALIGN(end);  /* should already be so */
+               sh64_dcache_purge_user_pages(mm, start, end);
        }
        return;
 }
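
The new condition in the hunk above is a bit test: if start and end - 1 differ anywhere in the bits kept by PMD_MASK, the range spans more than one pte page, so the per-page purge (which maps a single pte page under its lock) cannot be used and the whole cache is purged instead. Making the test explicit as a helper; the name is illustrative:

/* True when [start, end) lies under a single pmd entry, i.e. one pte page.
 * 'end' is one byte beyond the range, as in the callers above. */
static inline int range_in_one_pmd(unsigned long start, unsigned long end)
{
        return ((start ^ (end - 1)) & PMD_MASK) == 0;
}
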
@@ -880,9 +874,7 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
           addresses from the user address space specified by mm, after writing
           back any dirty data.
 
-          Note(1), 'end' is 1 byte beyond the end of the range to flush.
-
-          Note(2), this is called with mm->page_table_lock held.*/
+          Note, 'end' is 1 byte beyond the end of the range to flush. */
 
        sh64_dcache_purge_user_range(mm, start, end);
        sh64_icache_inv_user_page_range(mm, start, end);
@@ -898,7 +890,7 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr, unsigned
           the I-cache must be searched too in case the page in question is
           both writable and being executed from (e.g. stack trampolines.)
 
-          Note(1), this is called with mm->page_table_lock held.
+          Note, this is called with pte lock held.
           */
 
        sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT);
index dcd9c8a..ed6a505 100644 (file)
@@ -54,41 +54,31 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
-#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
-
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-                        struct page *page, pte_t * page_table, int write_access)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+                    pte_t *ptep, pte_t entry)
 {
-       unsigned long i;
-       pte_t entry;
-
-       add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-
-       if (write_access)
-               entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
-                                                      vma->vm_page_prot)));
-       else
-               entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-       entry = pte_mkyoung(entry);
-       mk_pte_huge(entry);
+       int i;
 
        for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-               set_pte(page_table, entry);
-               page_table++;
-
+               set_pte_at(mm, addr, ptep, entry);
+               ptep++;
+               addr += PAGE_SIZE;
                pte_val(entry) += PAGE_SIZE;
        }
 }
 
-pte_t huge_ptep_get_and_clear(pte_t *ptep)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+                             pte_t *ptep)
 {
        pte_t entry;
+       int i;
 
        entry = *ptep;
 
        for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-               pte_clear(pte);
-               pte++;
+               pte_clear(mm, addr, ptep);
+               addr += PAGE_SIZE;
+               ptep++;
        }
 
        return entry;
@@ -106,79 +96,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
        return 0;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-                           struct vm_area_struct *vma)
-{
-       pte_t *src_pte, *dst_pte, entry;
-       struct page *ptepage;
-       unsigned long addr = vma->vm_start;
-       unsigned long end = vma->vm_end;
-       int i;
-
-       while (addr < end) {
-               dst_pte = huge_pte_alloc(dst, addr);
-               if (!dst_pte)
-                       goto nomem;
-               src_pte = huge_pte_offset(src, addr);
-               BUG_ON(!src_pte || pte_none(*src_pte));
-               entry = *src_pte;
-               ptepage = pte_page(entry);
-               get_page(ptepage);
-               for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-                       set_pte(dst_pte, entry);
-                       pte_val(entry) += PAGE_SIZE;
-                       dst_pte++;
-               }
-               add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-               addr += HPAGE_SIZE;
-       }
-       return 0;
-
-nomem:
-       return -ENOMEM;
-}
-
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                       struct page **pages, struct vm_area_struct **vmas,
-                       unsigned long *position, int *length, int i)
-{
-       unsigned long vaddr = *position;
-       int remainder = *length;
-
-       WARN_ON(!is_vm_hugetlb_page(vma));
-
-       while (vaddr < vma->vm_end && remainder) {
-               if (pages) {
-                       pte_t *pte;
-                       struct page *page;
-
-                       pte = huge_pte_offset(mm, vaddr);
-
-                       /* hugetlb should be locked, and hence, prefaulted */
-                       BUG_ON(!pte || pte_none(*pte));
-
-                       page = pte_page(*pte);
-
-                       WARN_ON(!PageCompound(page));
-
-                       get_page(page);
-                       pages[i] = page;
-               }
-
-               if (vmas)
-                       vmas[i] = vma;
-
-               vaddr += PAGE_SIZE;
-               --remainder;
-               ++i;
-       }
-
-       *length = remainder;
-       *position = vaddr;
-
-       return i;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
                              unsigned long address, int write)
 {
@@ -195,84 +112,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 {
        return NULL;
 }
-
-void unmap_hugepage_range(struct vm_area_struct *vma,
-                         unsigned long start, unsigned long end)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long address;
-       pte_t *pte;
-       struct page *page;
-       int i;
-
-       BUG_ON(start & (HPAGE_SIZE - 1));
-       BUG_ON(end & (HPAGE_SIZE - 1));
-
-       for (address = start; address < end; address += HPAGE_SIZE) {
-               pte = huge_pte_offset(mm, address);
-               BUG_ON(!pte);
-               if (pte_none(*pte))
-                       continue;
-               page = pte_page(*pte);
-               put_page(page);
-               for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-                       pte_clear(mm, address+(i*PAGE_SIZE), pte);
-                       pte++;
-               }
-       }
-       add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
-       flush_tlb_range(vma, start, end);
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-       struct mm_struct *mm = current->mm;
-       unsigned long addr;
-       int ret = 0;
-
-       BUG_ON(vma->vm_start & ~HPAGE_MASK);
-       BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-       spin_lock(&mm->page_table_lock);
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-               unsigned long idx;
-               pte_t *pte = huge_pte_alloc(mm, addr);
-               struct page *page;
-
-               if (!pte) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
-               if (!pte_none(*pte))
-                       continue;
-
-               idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                       + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-               page = find_get_page(mapping, idx);
-               if (!page) {
-                       /* charge the fs quota first */
-                       if (hugetlb_get_quota(mapping)) {
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       page = alloc_huge_page();
-                       if (!page) {
-                               hugetlb_put_quota(mapping);
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-                       if (! ret) {
-                               unlock_page(page);
-                       } else {
-                               hugetlb_put_quota(mapping);
-                               free_huge_page(page);
-                               goto out;
-                       }
-               }
-               set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
-       }
-out:
-       spin_unlock(&mm->page_table_lock);
-       return ret;
-}
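
With copy_hugetlb_page_range(), follow_hugetlb_page(), unmap_hugepage_range() and hugetlb_prefault() deleted above (their logic moves into common hugetlb code), the architecture keeps only the low-level hooks. A hypothetical caller-side sketch of the new set_huge_pte_at() entry point; the helper name and the exact pte construction are illustrative (the real common code also marks the entry as huge through an arch-specific macro):

        static void install_huge_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                      unsigned long addr, pte_t *ptep,
                                      struct page *page, int writable)
        {
                pte_t entry = mk_pte(page, vma->vm_page_prot);

                if (writable)
                        entry = pte_mkwrite(pte_mkdirty(entry));
                entry = pte_mkyoung(entry);
                /* arch hook added above: expands to one set_pte_at() per sub-page */
                set_huge_pte_at(mm, addr, ptep, entry);
        }
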
index f4003da..fb1866f 100644 (file)
@@ -79,7 +79,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
                BUG();
 
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -101,7 +101,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
                error = -ENOMEM;
@@ -115,7 +114,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return 0;
 }
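
The hunks above show the new convention for kernel mappings: pte_alloc_kernel() loses its mm argument, and callers no longer take init_mm.page_table_lock around the walk. A minimal sketch of mapping a single kernel page under that convention (illustrative only; a real ioremap keeps the range loop shown above):

        static int map_one_kernel_page(unsigned long address, unsigned long pfn,
                                       pgprot_t prot)
        {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                pte_t *pte;

                pud = pud_alloc(&init_mm, pgd, address);
                if (!pud)
                        return -ENOMEM;
                pmd = pmd_alloc(&init_mm, pud, address);
                if (!pmd)
                        return -ENOMEM;
                pte = pte_alloc_kernel(pmd, address);   /* no mm argument, no caller lock */
                if (!pte)
                        return -ENOMEM;
                set_pte_at(&init_mm, address, pte, pfn_pte(pfn, prot));
                return 0;
        }
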
index 20ccb95..9604893 100644 (file)
@@ -73,14 +73,16 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
        int space = GET_IOSPACE(pfn);
        unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+       /* See comment in mm/memory.c remap_pfn_range */
+       vma->vm_flags |= VM_IO | VM_RESERVED;
+
        prot = __pgprot(pg_iobits);
        offset -= from;
        dir = pgd_offset(mm, from);
        flush_cache_range(vma, beg, end);
 
-       spin_lock(&mm->page_table_lock);
        while (from < end) {
-               pmd_t *pmd = pmd_alloc(current->mm, dir, from);
+               pmd_t *pmd = pmd_alloc(mm, dir, from);
                error = -ENOMEM;
                if (!pmd)
                        break;
@@ -90,7 +92,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
                from = (from + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
-       spin_unlock(&mm->page_table_lock);
 
        flush_tlb_range(vma, beg, end);
        return error;
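
Two things change in io_remap_pfn_range() here: the vma is flagged VM_IO | VM_RESERVED inside the helper (mirroring remap_pfn_range() in mm/memory.c), and the mm->page_table_lock around the page-table walk goes away. A hypothetical driver .mmap sketch built on the new behaviour; foo_phys_base and the foo_ names are assumptions, not from this patch:

        static const unsigned long foo_phys_base = 0x80000000UL;        /* assumed example address */

        static int foo_mmap(struct file *file, struct vm_area_struct *vma)
        {
                unsigned long pfn = foo_phys_base >> PAGE_SHIFT;

                /* VM_IO | VM_RESERVED are now set inside io_remap_pfn_range() */
                return io_remap_pfn_range(vma, vma->vm_start, pfn,
                                          vma->vm_end - vma->vm_start,
                                          vma->vm_page_prot);
        }
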
index b2854ef..edf52d0 100644 (file)
@@ -241,7 +241,6 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        current->mm->brk = ex.a_bss +
                (current->mm->start_brk = N_BSSADDR(ex));
 
-       set_mm_counter(current->mm, rss, 0);
        current->mm->mmap = NULL;
        compute_creds(bprm);
        current->flags &= ~PF_FORKNOEXEC;
index c954d91..112c316 100644 (file)
@@ -127,14 +127,16 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
        int space = GET_IOSPACE(pfn);
        unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+       /* See comment in mm/memory.c remap_pfn_range */
+       vma->vm_flags |= VM_IO | VM_RESERVED;
+
        prot = __pgprot(pg_iobits);
        offset -= from;
        dir = pgd_offset(mm, from);
        flush_cache_range(vma, beg, end);
 
-       spin_lock(&mm->page_table_lock);
        while (from < end) {
-               pud_t *pud = pud_alloc(current->mm, dir, from);
+               pud_t *pud = pud_alloc(mm, dir, from);
                error = -ENOMEM;
                if (!pud)
                        break;
@@ -144,8 +146,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
                from = (from + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
-       flush_tlb_range(vma, beg, end);
-       spin_unlock(&mm->page_table_lock);
 
+       flush_tlb_range(vma, beg, end);
        return error;
 }
index 90ca99d..8b104be 100644 (file)
@@ -18,8 +18,7 @@
 
 /* Heavily inspired by the ppc64 code.  */
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) =
-       { NULL, 0, 0, 0, 0, 0, { 0 }, { NULL }, };
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = { 0, };
 
 void flush_tlb_pending(void)
 {
@@ -72,7 +71,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t
 
 no_cache_flush:
 
-       if (mp->tlb_frozen)
+       if (mp->fullmm)
                return;
 
        nr = mp->tlb_nr;
@@ -97,7 +96,7 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long
        unsigned long nr = mp->tlb_nr;
        long s = start, e = end, vpte_base;
 
-       if (mp->tlb_frozen)
+       if (mp->fullmm)
                return;
 
        /* If start is greater than end, that is a real problem.  */
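
The tlb_frozen field is renamed to fullmm, matching the name the asm-generic mmu_gather already uses for "this flush covers the whole address space". A simplified, assumed sketch of the idea; field and function names other than fullmm are illustrative:

        struct mmu_gather_sketch {
                struct mm_struct *mm;
                unsigned int fullmm;    /* non-zero: entire mm is being torn down */
        };

        static inline void tlb_gather_sketch(struct mmu_gather_sketch *tlb,
                                             struct mm_struct *mm,
                                             unsigned int full_mm_flush)
        {
                tlb->mm = mm;
                tlb->fullmm = full_mm_flush;    /* exit/exec: per-page flushes can be skipped */
        }
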
index 45d7da6..8efc1e0 100644 (file)
@@ -34,7 +34,6 @@ struct host_vm_op {
        } u;
 };
 
-extern void mprotect_kernel_vm(int w);
 extern void force_flush_all(void);
 extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
                              unsigned long end_addr, int force,
index 0d73cee..34b54a3 100644 (file)
@@ -222,6 +222,7 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
+       pte_t ptent;
 
        if(task->mm == NULL) 
                return(ERR_PTR(-EINVAL));
@@ -238,12 +239,13 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
                return(ERR_PTR(-EINVAL));
 
        pte = pte_offset_kernel(pmd, addr);
-       if(!pte_present(*pte)) 
+       ptent = *pte;
+       if(!pte_present(ptent))
                return(ERR_PTR(-EINVAL));
 
        if(pte_out != NULL)
-               *pte_out = *pte;
-       return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK));
+               *pte_out = ptent;
+       return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK));
 }
 
 char *current_cmd(void)
index 240143b..9e5e39c 100644 (file)
@@ -28,7 +28,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
        pmd_t *pmd;
        pte_t *pte;
 
-       spin_lock(&mm->page_table_lock);
        pgd = pgd_offset(mm, proc);
        pud = pud_alloc(mm, pgd, proc);
        if (!pud)
@@ -63,7 +62,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
        *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT));
        *pte = pte_mkexec(*pte);
        *pte = pte_wrprotect(*pte);
-       spin_unlock(&mm->page_table_lock);
        return(0);
 
  out_pmd:
@@ -71,7 +69,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
  out_pte:
        pmd_free(pmd);
  out:
-       spin_unlock(&mm->page_table_lock);
        return(-ENOMEM);
 }
 
@@ -147,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm)
 
        if(!proc_mm || !ptrace_faultinfo){
                free_page(mmu->id.stack);
+               pte_lock_deinit(virt_to_page(mmu->last_page_table));
                pte_free_kernel((pte_t *) mmu->last_page_table);
                 dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES
index f1d85db..ae6217c 100644 (file)
@@ -74,42 +74,6 @@ void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end)
                 atomic_inc(&vmchange_seq);
 }
 
-static void protect_vm_page(unsigned long addr, int w, int must_succeed)
-{
-       int err;
-
-       err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed);
-       if(err == 0) return;
-       else if((err == -EFAULT) || (err == -ENOMEM)){
-               flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-               protect_vm_page(addr, w, 1);
-       }
-       else panic("protect_vm_page : protect failed, errno = %d\n", err);
-}
-
-void mprotect_kernel_vm(int w)
-{
-       struct mm_struct *mm;
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       unsigned long addr;
-       
-       mm = &init_mm;
-       for(addr = start_vm; addr < end_vm;){
-               pgd = pgd_offset(mm, addr);
-               pud = pud_offset(pgd, addr);
-               pmd = pmd_offset(pud, addr);
-               if(pmd_present(*pmd)){
-                       pte = pte_offset_kernel(pmd, addr);
-                       if(pte_present(*pte)) protect_vm_page(addr, w, 0);
-                       addr += PAGE_SIZE;
-               }
-               else addr += PMD_SIZE;
-       }
-}
-
 void flush_tlb_kernel_vm_tt(void)
 {
         flush_tlb_kernel_range(start_vm, end_vm);
index 3e6780f..93c60f4 100644 (file)
@@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        current->mm->free_area_cache = TASK_UNMAPPED_BASE;
        current->mm->cached_hole_size = 0;
 
-       set_mm_counter(current->mm, rss, 0);
        current->mm->mmap = NULL;
        compute_creds(bprm);
        current->flags &= ~PF_FORKNOEXEC;
index 6972df4..ecf7acb 100644 (file)
@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
        if (address >= end)
                BUG();
        do {
-               pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+               pte_t * pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        return -ENOMEM;
                remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -105,7 +105,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
        flush_cache_all();
        if (address >= end)
                BUG();
-       spin_lock(&init_mm.page_table_lock);
        do {
                pud_t *pud;
                pud = pud_alloc(&init_mm, pgd, address);
@@ -119,7 +118,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgd++;
        } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
        flush_tlb_all();
        return error;
 }
index 959c4e5..40ae42e 100644 (file)
@@ -215,7 +215,10 @@ int crypto_register_alg(struct crypto_alg *alg)
        if (alg->cra_alignmask & (alg->cra_alignmask + 1))
                return -EINVAL;
 
-       if (alg->cra_alignmask > PAGE_SIZE)
+       if (alg->cra_alignmask & alg->cra_blocksize)
+               return -EINVAL;
+
+       if (alg->cra_blocksize > PAGE_SIZE)
                return -EINVAL;
        
        down_write(&crypto_alg_sem);
index da0456b..46120de 100644 (file)
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include "internal.h"
 
 static void hash_key(struct crypto_tfm *tfm, u8 *key, unsigned int keylen)
 {
        struct scatterlist tmp;
        
-       tmp.page = virt_to_page(key);
-       tmp.offset = offset_in_page(key);
-       tmp.length = keylen;
+       sg_set_buf(&tmp, key, keylen);
        crypto_digest_digest(tfm, &tmp, 1, key);
-               
 }
 
 int crypto_alloc_hmac_block(struct crypto_tfm *tfm)
@@ -69,9 +66,7 @@ void crypto_hmac_init(struct crypto_tfm *tfm, u8 *key, unsigned int *keylen)
        for (i = 0; i < crypto_tfm_alg_blocksize(tfm); i++)
                ipad[i] ^= 0x36;
 
-       tmp.page = virt_to_page(ipad);
-       tmp.offset = offset_in_page(ipad);
-       tmp.length = crypto_tfm_alg_blocksize(tfm);
+       sg_set_buf(&tmp, ipad, crypto_tfm_alg_blocksize(tfm));
        
        crypto_digest_init(tfm);
        crypto_digest_update(tfm, &tmp, 1);
@@ -103,16 +98,12 @@ void crypto_hmac_final(struct crypto_tfm *tfm, u8 *key,
        for (i = 0; i < crypto_tfm_alg_blocksize(tfm); i++)
                opad[i] ^= 0x5c;
 
-       tmp.page = virt_to_page(opad);
-       tmp.offset = offset_in_page(opad);
-       tmp.length = crypto_tfm_alg_blocksize(tfm);
+       sg_set_buf(&tmp, opad, crypto_tfm_alg_blocksize(tfm));
 
        crypto_digest_init(tfm);
        crypto_digest_update(tfm, &tmp, 1);
        
-       tmp.page = virt_to_page(out);
-       tmp.offset = offset_in_page(out);
-       tmp.length = crypto_tfm_alg_digestsize(tfm);
+       sg_set_buf(&tmp, out, crypto_tfm_alg_digestsize(tfm));
        
        crypto_digest_update(tfm, &tmp, 1);
        crypto_digest_final(tfm, out);
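
All of the open-coded page/offset/length assignments in this file are collapsed into sg_set_buf() from the new <linux/scatterlist.h> include. For reference, a sketch of what that helper amounts to for a lowmem buffer (suffixed _sketch to avoid suggesting this is the header's exact text):

        static inline void sg_set_buf_sketch(struct scatterlist *sg,
                                             const void *buf, unsigned int buflen)
        {
                sg->page   = virt_to_page(buf);
                sg->offset = offset_in_page(buf);
                sg->length = buflen;
        }

sg_init_one(), used later in the libata conversion, is the same assignment plus zeroing the entry first, which is why the explicit memset of qc->sgent can be dropped there.
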
index 6863941..53f4ee8 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
 #include <linux/highmem.h>
@@ -86,7 +86,6 @@ static void hexdump(unsigned char *buf, unsigned int len)
 static void test_hash(char *algo, struct hash_testvec *template,
                      unsigned int tcount)
 {
-       char *p;
        unsigned int i, j, k, temp;
        struct scatterlist sg[8];
        char result[64];
@@ -116,10 +115,7 @@ static void test_hash(char *algo, struct hash_testvec *template,
                printk("test %u:\n", i + 1);
                memset(result, 0, 64);
 
-               p = hash_tv[i].plaintext;
-               sg[0].page = virt_to_page(p);
-               sg[0].offset = offset_in_page(p);
-               sg[0].length = hash_tv[i].psize;
+               sg_set_buf(&sg[0], hash_tv[i].plaintext, hash_tv[i].psize);
 
                crypto_digest_init(tfm);
                if (tfm->crt_u.digest.dit_setkey) {
@@ -154,10 +150,8 @@ static void test_hash(char *algo, struct hash_testvec *template,
                                       hash_tv[i].plaintext + temp,
                                       hash_tv[i].tap[k]);
                                temp += hash_tv[i].tap[k];
-                               p = &xbuf[IDX[k]];
-                               sg[k].page = virt_to_page(p);
-                               sg[k].offset = offset_in_page(p);
-                               sg[k].length = hash_tv[i].tap[k];
+                               sg_set_buf(&sg[k], &xbuf[IDX[k]],
+                                           hash_tv[i].tap[k]);
                        }
 
                        crypto_digest_digest(tfm, sg, hash_tv[i].np, result);
@@ -179,7 +173,6 @@ static void test_hash(char *algo, struct hash_testvec *template,
 static void test_hmac(char *algo, struct hmac_testvec *template,
                      unsigned int tcount)
 {
-       char *p;
        unsigned int i, j, k, temp;
        struct scatterlist sg[8];
        char result[64];
@@ -210,11 +203,8 @@ static void test_hmac(char *algo, struct hmac_testvec *template,
                printk("test %u:\n", i + 1);
                memset(result, 0, sizeof (result));
 
-               p = hmac_tv[i].plaintext;
                klen = hmac_tv[i].ksize;
-               sg[0].page = virt_to_page(p);
-               sg[0].offset = offset_in_page(p);
-               sg[0].length = hmac_tv[i].psize;
+               sg_set_buf(&sg[0], hmac_tv[i].plaintext, hmac_tv[i].psize);
 
                crypto_hmac(tfm, hmac_tv[i].key, &klen, sg, 1, result);
 
@@ -243,10 +233,8 @@ static void test_hmac(char *algo, struct hmac_testvec *template,
                                       hmac_tv[i].plaintext + temp,
                                       hmac_tv[i].tap[k]);
                                temp += hmac_tv[i].tap[k];
-                               p = &xbuf[IDX[k]];
-                               sg[k].page = virt_to_page(p);
-                               sg[k].offset = offset_in_page(p);
-                               sg[k].length = hmac_tv[i].tap[k];
+                               sg_set_buf(&sg[k], &xbuf[IDX[k]],
+                                           hmac_tv[i].tap[k]);
                        }
 
                        crypto_hmac(tfm, hmac_tv[i].key, &klen, sg,
@@ -270,7 +258,7 @@ static void test_cipher(char *algo, int mode, int enc,
 {
        unsigned int ret, i, j, k, temp;
        unsigned int tsize;
-       char *p, *q;
+       char *q;
        struct crypto_tfm *tfm;
        char *key;
        struct cipher_testvec *cipher_tv;
@@ -330,10 +318,8 @@ static void test_cipher(char *algo, int mode, int enc,
                                        goto out;
                        }
 
-                       p = cipher_tv[i].input;
-                       sg[0].page = virt_to_page(p);
-                       sg[0].offset = offset_in_page(p);
-                       sg[0].length = cipher_tv[i].ilen;
+                       sg_set_buf(&sg[0], cipher_tv[i].input,
+                                  cipher_tv[i].ilen);
 
                        if (!mode) {
                                crypto_cipher_set_iv(tfm, cipher_tv[i].iv,
@@ -389,10 +375,8 @@ static void test_cipher(char *algo, int mode, int enc,
                                       cipher_tv[i].input + temp,
                                       cipher_tv[i].tap[k]);
                                temp += cipher_tv[i].tap[k];
-                               p = &xbuf[IDX[k]];
-                               sg[k].page = virt_to_page(p);
-                               sg[k].offset = offset_in_page(p);
-                               sg[k].length = cipher_tv[i].tap[k];
+                               sg_set_buf(&sg[k], &xbuf[IDX[k]],
+                                          cipher_tv[i].tap[k]);
                        }
 
                        if (!mode) {
@@ -431,14 +415,12 @@ out:
 static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p,
                               int blen, int sec)
 {
-       struct scatterlist sg[8];
+       struct scatterlist sg[1];
        unsigned long start, end;
        int bcount;
        int ret;
 
-       sg[0].page = virt_to_page(p);
-       sg[0].offset = offset_in_page(p);
-       sg[0].length = blen;
+       sg_set_buf(sg, p, blen);
 
        for (start = jiffies, end = start + sec * HZ, bcount = 0;
             time_before(jiffies, end); bcount++) {
@@ -459,14 +441,12 @@ static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p,
 static int test_cipher_cycles(struct crypto_tfm *tfm, int enc, char *p,
                              int blen)
 {
-       struct scatterlist sg[8];
+       struct scatterlist sg[1];
        unsigned long cycles = 0;
        int ret = 0;
        int i;
 
-       sg[0].page = virt_to_page(p);
-       sg[0].offset = offset_in_page(p);
-       sg[0].length = blen;
+       sg_set_buf(sg, p, blen);
 
        local_bh_disable();
        local_irq_disable();
@@ -709,9 +689,7 @@ static void test_crc32c(void)
        for (i = 0; i < NUMVEC; i++) {
                for (j = 0; j < VECSIZE; j++)
                        test_vec[i][j] = ++b;
-               sg[i].page = virt_to_page(test_vec[i]);
-               sg[i].offset = offset_in_page(test_vec[i]);
-               sg[i].length = VECSIZE;
+               sg_set_buf(&sg[i], test_vec[i], VECSIZE);
        }
 
        seed = SEEDTESTVAL;
index 01a1bd2..2143609 100644 (file)
@@ -200,8 +200,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
         * Note: Assume that this function returns zero on success
         */
        result = add_memory(mem_device->start_addr,
-                           (mem_device->end_addr - mem_device->start_addr) + 1,
-                           mem_device->read_write_attribute);
+                           (mem_device->end_addr - mem_device->start_addr) + 1);
        if (result) {
                ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n"));
                mem_device->state = MEMORY_INVALID_STATE;
@@ -259,7 +258,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
         * Ask the VM to offline this memory range.
         * Note: Assume that this function returns zero on success
         */
-       result = remove_memory(start, len, attr);
+       result = remove_memory(start, len);
        if (result) {
                ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n"));
                return_VALUE(result);
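
Both calls lose their read/write attribute argument. The prototypes this now assumes (declared in <linux/memory_hotplug.h>, which is not part of this hunk) reduce to:

        extern int add_memory(u64 start, u64 size);
        extern int remove_memory(u64 start, u64 size);
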
index 66d9c46..f12898d 100644 (file)
@@ -7,6 +7,7 @@ obj-y                   := core.o sys.o bus.o dd.o \
 obj-y                  += power/
 obj-$(CONFIG_FW_LOADER)        += firmware_class.o
 obj-$(CONFIG_NUMA)     += node.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 
 ifeq ($(CONFIG_DEBUG_DRIVER),y)
 EXTRA_CFLAGS += -DDEBUG
index 84e604e..c648914 100644 (file)
@@ -9,6 +9,7 @@
 
 #include <linux/device.h>
 #include <linux/init.h>
+#include <linux/memory.h>
 
 #include "base.h"
 
@@ -33,5 +34,6 @@ void __init driver_init(void)
        platform_bus_init();
        system_bus_init();
        cpu_dev_init();
+       memory_dev_init();
        attribute_container_init();
 }
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
new file mode 100644 (file)
index 0000000..b7ddd65
--- /dev/null
@@ -0,0 +1,452 @@
+/*
+ * drivers/base/memory.c - basic Memory class support
+ *
+ * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
+ *            Dave Hansen <haveblue@us.ibm.com>
+ *
+ * This file provides the necessary infrastructure to represent
+ * a SPARSEMEM-memory-model system's physical memory in /sysfs.
+ * All arch-independent code that assumes MEMORY_HOTPLUG requires
+ * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
+ */
+
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>       /* capable() */
+#include <linux/topology.h>
+#include <linux/device.h>
+#include <linux/memory.h>
+#include <linux/kobject.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define MEMORY_CLASS_NAME      "memory"
+
+static struct sysdev_class memory_sysdev_class = {
+       set_kset_name(MEMORY_CLASS_NAME),
+};
+EXPORT_SYMBOL(memory_sysdev_class);
+
+static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
+{
+       return MEMORY_CLASS_NAME;
+}
+
+static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+                       int num_envp, char *buffer, int buffer_size)
+{
+       int retval = 0;
+
+       return retval;
+}
+
+static struct kset_hotplug_ops memory_hotplug_ops = {
+       .name           = memory_hotplug_name,
+       .hotplug        = memory_hotplug,
+};
+
+static struct notifier_block *memory_chain;
+
+static int register_memory_notifier(struct notifier_block *nb)
+{
+        return notifier_chain_register(&memory_chain, nb);
+}
+
+static void unregister_memory_notifier(struct notifier_block *nb)
+{
+        notifier_chain_unregister(&memory_chain, nb);
+}
+
+/*
+ * register_memory - Setup a sysfs device for a memory block
+ */
+static int
+register_memory(struct memory_block *memory, struct mem_section *section,
+               struct node *root)
+{
+       int error;
+
+       memory->sysdev.cls = &memory_sysdev_class;
+       memory->sysdev.id = __section_nr(section);
+
+       error = sysdev_register(&memory->sysdev);
+
+       if (root && !error)
+               error = sysfs_create_link(&root->sysdev.kobj,
+                                         &memory->sysdev.kobj,
+                                         kobject_name(&memory->sysdev.kobj));
+
+       return error;
+}
+
+static void
+unregister_memory(struct memory_block *memory, struct mem_section *section,
+               struct node *root)
+{
+       BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
+       BUG_ON(memory->sysdev.id != __section_nr(section));
+
+       sysdev_unregister(&memory->sysdev);
+       if (root)
+               sysfs_remove_link(&root->sysdev.kobj,
+                                 kobject_name(&memory->sysdev.kobj));
+}
+
+/*
+ * use this as the physical section index that this memsection
+ * uses.
+ */
+
+static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
+{
+       struct memory_block *mem =
+               container_of(dev, struct memory_block, sysdev);
+       return sprintf(buf, "%08lx\n", mem->phys_index);
+}
+
+/*
+ * online, offline, going offline, etc.
+ */
+static ssize_t show_mem_state(struct sys_device *dev, char *buf)
+{
+       struct memory_block *mem =
+               container_of(dev, struct memory_block, sysdev);
+       ssize_t len = 0;
+
+       /*
+        * We can probably put these states in a nice little array
+        * so that they're not open-coded
+        */
+       switch (mem->state) {
+               case MEM_ONLINE:
+                       len = sprintf(buf, "online\n");
+                       break;
+               case MEM_OFFLINE:
+                       len = sprintf(buf, "offline\n");
+                       break;
+               case MEM_GOING_OFFLINE:
+                       len = sprintf(buf, "going-offline\n");
+                       break;
+               default:
+                       len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
+                                       mem->state);
+                       WARN_ON(1);
+                       break;
+       }
+
+       return len;
+}
+
+static inline int memory_notify(unsigned long val, void *v)
+{
+       return notifier_call_chain(&memory_chain, val, v);
+}
+
+/*
+ * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
+ * OK to have direct references to sparsemem variables in here.
+ */
+static int
+memory_block_action(struct memory_block *mem, unsigned long action)
+{
+       int i;
+       unsigned long psection;
+       unsigned long start_pfn, start_paddr;
+       struct page *first_page;
+       int ret;
+       int old_state = mem->state;
+
+       psection = mem->phys_index;
+       first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+
+       /*
+        * The probe routines leave the pages reserved, just
+        * as the bootmem code does.  Make sure they're still
+        * that way.
+        */
+       if (action == MEM_ONLINE) {
+               for (i = 0; i < PAGES_PER_SECTION; i++) {
+                       if (PageReserved(first_page+i))
+                               continue;
+
+                       printk(KERN_WARNING "section number %ld page number %d "
+                               "not reserved, was it already online? \n",
+                               psection, i);
+                       return -EBUSY;
+               }
+       }
+
+       switch (action) {
+               case MEM_ONLINE:
+                       start_pfn = page_to_pfn(first_page);
+                       ret = online_pages(start_pfn, PAGES_PER_SECTION);
+                       break;
+               case MEM_OFFLINE:
+                       mem->state = MEM_GOING_OFFLINE;
+                       memory_notify(MEM_GOING_OFFLINE, NULL);
+                       start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
+                       ret = remove_memory(start_paddr,
+                                           PAGES_PER_SECTION << PAGE_SHIFT);
+                       if (ret) {
+                               mem->state = old_state;
+                               break;
+                       }
+                       memory_notify(MEM_MAPPING_INVALID, NULL);
+                       break;
+               default:
+                       printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
+                                       __FUNCTION__, mem, action, action);
+                       WARN_ON(1);
+                       ret = -EINVAL;
+       }
+       /*
+        * For now, only notify on successful memory operations
+        */
+       if (!ret)
+               memory_notify(action, NULL);
+
+       return ret;
+}
+
+static int memory_block_change_state(struct memory_block *mem,
+               unsigned long to_state, unsigned long from_state_req)
+{
+       int ret = 0;
+       down(&mem->state_sem);
+
+       if (mem->state != from_state_req) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = memory_block_action(mem, to_state);
+       if (!ret)
+               mem->state = to_state;
+
+out:
+       up(&mem->state_sem);
+       return ret;
+}
+
+static ssize_t
+store_mem_state(struct sys_device *dev, const char *buf, size_t count)
+{
+       struct memory_block *mem;
+       unsigned int phys_section_nr;
+       int ret = -EINVAL;
+
+       mem = container_of(dev, struct memory_block, sysdev);
+       phys_section_nr = mem->phys_index;
+
+       if (!valid_section_nr(phys_section_nr))
+               goto out;
+
+       if (!strncmp(buf, "online", min((int)count, 6)))
+               ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+       else if(!strncmp(buf, "offline", min((int)count, 7)))
+               ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+out:
+       if (ret)
+               return ret;
+       return count;
+}
+
+/*
+ * phys_device is a bad name for this.  What I really want
+ * is a way to differentiate between memory ranges that
+ * are part of physical devices that constitute
+ * a complete removable unit or fru.
+ * i.e. do these ranges belong to the same physical device,
+ * s.t. if I offline all of these sections I can then
+ * remove the physical device?
+ */
+static ssize_t show_phys_device(struct sys_device *dev, char *buf)
+{
+       struct memory_block *mem =
+               container_of(dev, struct memory_block, sysdev);
+       return sprintf(buf, "%d\n", mem->phys_device);
+}
+
+static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
+static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
+
+#define mem_create_simple_file(mem, attr_name) \
+       sysdev_create_file(&mem->sysdev, &attr_##attr_name)
+#define mem_remove_simple_file(mem, attr_name) \
+       sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
+
+/*
+ * Block size attribute stuff
+ */
+static ssize_t
+print_block_size(struct class *class, char *buf)
+{
+       return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
+}
+
+static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
+
+static int block_size_init(void)
+{
+       sysfs_create_file(&memory_sysdev_class.kset.kobj,
+               &class_attr_block_size_bytes.attr);
+       return 0;
+}
+
+/*
+ * Some architectures will have custom drivers to do this, and
+ * will not need to do it from userspace.  The fake hot-add code
+ * as well as ppc64 will do all of their discovery in userspace
+ * and will require this interface.
+ */
+#ifdef CONFIG_ARCH_MEMORY_PROBE
+static ssize_t
+memory_probe_store(struct class *class, const char __user *buf, size_t count)
+{
+       u64 phys_addr;
+       int ret;
+
+       phys_addr = simple_strtoull(buf, NULL, 0);
+
+       ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+
+       if (ret)
+               count = ret;
+
+       return count;
+}
+static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
+
+static int memory_probe_init(void)
+{
+       sysfs_create_file(&memory_sysdev_class.kset.kobj,
+               &class_attr_probe.attr);
+       return 0;
+}
+#else
+#define memory_probe_init(...) do {} while (0)
+#endif
+
+/*
+ * Note that phys_device is optional.  It is here to allow for
+ * differentiation between which *physical* devices each
+ * section belongs to...
+ */
+
+static int add_memory_block(unsigned long node_id, struct mem_section *section,
+                    unsigned long state, int phys_device)
+{
+       struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+       int ret = 0;
+
+       if (!mem)
+               return -ENOMEM;
+
+       mem->phys_index = __section_nr(section);
+       mem->state = state;
+       init_MUTEX(&mem->state_sem);
+       mem->phys_device = phys_device;
+
+       ret = register_memory(mem, section, NULL);
+       if (!ret)
+               ret = mem_create_simple_file(mem, phys_index);
+       if (!ret)
+               ret = mem_create_simple_file(mem, state);
+       if (!ret)
+               ret = mem_create_simple_file(mem, phys_device);
+
+       return ret;
+}
+
+/*
+ * For now, we have a linear search to go find the appropriate
+ * memory_block corresponding to a particular phys_index. If
+ * this gets to be a real problem, we can always use a radix
+ * tree or something here.
+ *
+ * This could be made generic for all sysdev classes.
+ */
+static struct memory_block *find_memory_block(struct mem_section *section)
+{
+       struct kobject *kobj;
+       struct sys_device *sysdev;
+       struct memory_block *mem;
+       char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
+
+       /*
+        * This only works because we know that section == sysdev->id
+        * slightly redundant with sysdev_register()
+        */
+       sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
+
+       kobj = kset_find_obj(&memory_sysdev_class.kset, name);
+       if (!kobj)
+               return NULL;
+
+       sysdev = container_of(kobj, struct sys_device, kobj);
+       mem = container_of(sysdev, struct memory_block, sysdev);
+
+       return mem;
+}
+
+int remove_memory_block(unsigned long node_id, struct mem_section *section,
+               int phys_device)
+{
+       struct memory_block *mem;
+
+       mem = find_memory_block(section);
+       mem_remove_simple_file(mem, phys_index);
+       mem_remove_simple_file(mem, state);
+       mem_remove_simple_file(mem, phys_device);
+       unregister_memory(mem, section, NULL);
+
+       return 0;
+}
+
+/*
+ * need an interface for the VM to add new memory regions,
+ * but without onlining it.
+ */
+int register_new_memory(struct mem_section *section)
+{
+       return add_memory_block(0, section, MEM_OFFLINE, 0);
+}
+
+int unregister_memory_section(struct mem_section *section)
+{
+       if (!valid_section(section))
+               return -EINVAL;
+
+       return remove_memory_block(0, section, 0);
+}
+
+/*
+ * Initialize the sysfs support for memory devices...
+ */
+int __init memory_dev_init(void)
+{
+       unsigned int i;
+       int ret;
+
+       memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
+       ret = sysdev_class_register(&memory_sysdev_class);
+
+       /*
+        * Create entries for memory sections that were found
+        * during boot and have been initialized
+        */
+       for (i = 0; i < NR_MEM_SECTIONS; i++) {
+               if (!valid_section_nr(i))
+                       continue;
+               add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
+       }
+
+       memory_probe_init();
+       block_size_init();
+
+       return ret;
+}
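
The new driver exposes each SPARSEMEM section as a sysfs device and funnels online/offline transitions through memory_notify(). A hypothetical consumer sketch of that notifier chain; note that register_memory_notifier() is still static to this file in this version, so the final registration line only illustrates the intended flow:

        static int foo_memory_callback(struct notifier_block *nb,
                                       unsigned long action, void *arg)
        {
                switch (action) {
                case MEM_GOING_OFFLINE:
                        /* migrate or drop anything pinned in the section */
                        break;
                case MEM_ONLINE:
                case MEM_OFFLINE:
                default:
                        break;
                }
                return NOTIFY_OK;
        }

        static struct notifier_block foo_memory_nb = {
                .notifier_call = foo_memory_callback,
        };

        /* register_memory_notifier(&foo_memory_nb);  -- once the hook is exported */
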
index 28c1a62..cf66310 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <asm/atomic.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/page.h>
 
 #include "dm.h"
@@ -164,9 +164,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
                return -ENOMEM;
        }
 
-       sg.page = virt_to_page(cc->key);
-       sg.offset = offset_in_page(cc->key);
-       sg.length = cc->key_size;
+       sg_set_buf(&sg, cc->key, cc->key_size);
        crypto_digest_digest(hash_tfm, &sg, 1, salt);
        crypto_free_tfm(hash_tfm);
 
@@ -207,14 +205,12 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
-       struct scatterlist sg = { NULL, };
+       struct scatterlist sg;
 
        memset(iv, 0, cc->iv_size);
        *(u64 *)iv = cpu_to_le64(sector);
 
-       sg.page = virt_to_page(iv);
-       sg.offset = offset_in_page(iv);
-       sg.length = cc->iv_size;
+       sg_set_buf(&sg, iv, cc->iv_size);
        crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private,
                              &sg, &sg, cc->iv_size);
 
index 4c11699..750c016 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/interrupt.h>
 #include <linux/in.h>
 #include <linux/bitops.h>
+#include <linux/scatterlist.h>
 #include <asm/io.h>
 #include <asm/system.h>
 
@@ -1590,11 +1591,9 @@ static void emmh32_setseed(emmh32_context *context, u8 *pkey, int keylen, struct
                aes_counter[12] = (u8)(counter >> 24);
                counter++;
                memcpy (plain, aes_counter, 16);
-               sg[0].page = virt_to_page(plain);
-               sg[0].offset = ((long) plain & ~PAGE_MASK);
-               sg[0].length = 16;
+               sg_set_buf(sg, plain, 16);
                crypto_cipher_encrypt(tfm, sg, sg, 16);
-               cipher = kmap(sg[0].page) + sg[0].offset;
+               cipher = kmap(sg->page) + sg->offset;
                for (j=0; (j<16) && (i< (sizeof(context->coeff)/sizeof(context->coeff[0]))); ) {
                        context->coeff[i++] = ntohl(*(u32 *)&cipher[j]);
                        j += 4;
index bbd9c23..5627ce1 100644 (file)
@@ -356,7 +356,7 @@ static void piix4_mem_quirk(struct pci_dev *dev, const char *name, unsigned int
 /*
  * PIIX4 ACPI: Two IO regions pointed to by longwords at
  *     0x40 (64 bytes of ACPI registers)
- *     0x90 (32 bytes of SMB registers)
+ *     0x90 (16 bytes of SMB registers)
  * and a few strange programmable PIIX4 device resources.
  */
 static void __devinit quirk_piix4_acpi(struct pci_dev *dev)
@@ -366,7 +366,7 @@ static void __devinit quirk_piix4_acpi(struct pci_dev *dev)
        pci_read_config_dword(dev, 0x40, &region);
        quirk_io_region(dev, region, 64, PCI_BRIDGE_RESOURCES, "PIIX4 ACPI");
        pci_read_config_dword(dev, 0x90, &region);
-       quirk_io_region(dev, region, 32, PCI_BRIDGE_RESOURCES+1, "PIIX4 SMB");
+       quirk_io_region(dev, region, 16, PCI_BRIDGE_RESOURCES+1, "PIIX4 SMB");
 
        /* Device resource A has enables for some of the other ones */
        pci_read_config_dword(dev, 0x5c, &res_a);
index fe8187d..e2a5657 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/dma-mapping.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -192,7 +193,6 @@ static void ahci_port_stop(struct ata_port *ap);
 static void ahci_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
 static void ahci_qc_prep(struct ata_queued_cmd *qc);
 static u8 ahci_check_status(struct ata_port *ap);
-static u8 ahci_check_err(struct ata_port *ap);
 static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
 static void ahci_remove_one (struct pci_dev *pdev);
 
@@ -221,7 +221,6 @@ static const struct ata_port_operations ahci_ops = {
 
        .check_status           = ahci_check_status,
        .check_altstatus        = ahci_check_status,
-       .check_err              = ahci_check_err,
        .dev_select             = ata_noop_dev_select,
 
        .tf_read                = ahci_tf_read,
@@ -458,13 +457,6 @@ static u8 ahci_check_status(struct ata_port *ap)
        return readl(mmio + PORT_TFDATA) & 0xFF;
 }
 
-static u8 ahci_check_err(struct ata_port *ap)
-{
-       void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr;
-
-       return (readl(mmio + PORT_TFDATA) >> 8) & 0xFF;
-}
-
 static void ahci_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
 {
        struct ahci_port_priv *pp = ap->private_data;
@@ -609,7 +601,7 @@ static void ahci_eng_timeout(struct ata_port *ap)
                 * not being called from the SCSI EH.
                 */
                qc->scsidone = scsi_finish_command;
-               ata_qc_complete(qc, ATA_ERR);
+               ata_qc_complete(qc, AC_ERR_OTHER);
        }
 
        spin_unlock_irqrestore(&host_set->lock, flags);
@@ -638,7 +630,7 @@ static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
        if (status & PORT_IRQ_FATAL) {
                ahci_intr_error(ap, status);
                if (qc)
-                       ata_qc_complete(qc, ATA_ERR);
+                       ata_qc_complete(qc, AC_ERR_OTHER);
        }
 
        return 1;
@@ -683,10 +675,10 @@ static irqreturn_t ahci_interrupt (int irq, void *dev_instance, struct pt_regs *
                        if (!ahci_host_intr(ap, qc))
                                if (ata_ratelimit()) {
                                        struct pci_dev *pdev =
-                                         to_pci_dev(ap->host_set->dev);
-                                       printk(KERN_WARNING
-                                         "ahci(%s): unhandled interrupt on port %u\n",
-                                         pci_name(pdev), i);
+                                               to_pci_dev(ap->host_set->dev);
+                                       dev_printk(KERN_WARNING, &pdev->dev,
+                                         "unhandled interrupt on port %u\n",
+                                         i);
                                }
 
                        VPRINTK("port %u\n", i);
@@ -694,10 +686,9 @@ static irqreturn_t ahci_interrupt (int irq, void *dev_instance, struct pt_regs *
                        VPRINTK("port %u (no irq)\n", i);
                        if (ata_ratelimit()) {
                                struct pci_dev *pdev =
-                                 to_pci_dev(ap->host_set->dev);
-                               printk(KERN_WARNING
-                                 "ahci(%s): interrupt on disabled port %u\n",
-                                 pci_name(pdev), i);
+                                       to_pci_dev(ap->host_set->dev);
+                               dev_printk(KERN_WARNING, &pdev->dev,
+                                       "interrupt on disabled port %u\n", i);
                        }
                }
 
@@ -769,8 +760,8 @@ static int ahci_host_init(struct ata_probe_ent *probe_ent)
 
        tmp = readl(mmio + HOST_CTL);
        if (tmp & HOST_RESET) {
-               printk(KERN_ERR DRV_NAME "(%s): controller reset failed (0x%x)\n",
-                       pci_name(pdev), tmp);
+               dev_printk(KERN_ERR, &pdev->dev,
+                          "controller reset failed (0x%x)\n", tmp);
                return -EIO;
        }
 
@@ -798,22 +789,22 @@ static int ahci_host_init(struct ata_probe_ent *probe_ent)
                if (rc) {
                        rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
                        if (rc) {
-                               printk(KERN_ERR DRV_NAME "(%s): 64-bit DMA enable failed\n",
-                                       pci_name(pdev));
+                               dev_printk(KERN_ERR, &pdev->dev,
+                                          "64-bit DMA enable failed\n");
                                return rc;
                        }
                }
        } else {
                rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
                if (rc) {
-                       printk(KERN_ERR DRV_NAME "(%s): 32-bit DMA enable failed\n",
-                               pci_name(pdev));
+                       dev_printk(KERN_ERR, &pdev->dev,
+                                  "32-bit DMA enable failed\n");
                        return rc;
                }
                rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
                if (rc) {
-                       printk(KERN_ERR DRV_NAME "(%s): 32-bit consistent DMA enable failed\n",
-                               pci_name(pdev));
+                       dev_printk(KERN_ERR, &pdev->dev,
+                                  "32-bit consistent DMA enable failed\n");
                        return rc;
                }
        }
@@ -916,10 +907,10 @@ static void ahci_print_info(struct ata_probe_ent *probe_ent)
        else
                scc_s = "unknown";
 
-       printk(KERN_INFO DRV_NAME "(%s) AHCI %02x%02x.%02x%02x "
+       dev_printk(KERN_INFO, &pdev->dev,
+               "AHCI %02x%02x.%02x%02x "
                "%u slots %u ports %s Gbps 0x%x impl %s mode\n"
                ,
-               pci_name(pdev),
 
                (vers >> 24) & 0xff,
                (vers >> 16) & 0xff,
@@ -932,11 +923,11 @@ static void ahci_print_info(struct ata_probe_ent *probe_ent)
                impl,
                scc_s);
 
-       printk(KERN_INFO DRV_NAME "(%s) flags: "
+       dev_printk(KERN_INFO, &pdev->dev,
+               "flags: "
                "%s%s%s%s%s%s"
                "%s%s%s%s%s%s%s\n"
                ,
-               pci_name(pdev),
 
                cap & (1 << 31) ? "64bit " : "",
                cap & (1 << 30) ? "ncq " : "",
@@ -969,7 +960,7 @@ static int ahci_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
        VPRINTK("ENTER\n");
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        rc = pci_enable_device(pdev);
        if (rc)
index 48e1c4d..1993764 100644 (file)
@@ -10,6 +10,8 @@
  *  Commonly used scsi driver functions.
  */
 
+#include <linux/scatterlist.h>
+
 #define BELT_AND_BRACES
 
 /*
@@ -22,9 +24,7 @@ static inline int copy_SCp_to_sg(struct scatterlist *sg, Scsi_Pointer *SCp, int
 
        BUG_ON(bufs + 1 > max);
 
-       sg->page   = virt_to_page(SCp->ptr);
-       sg->offset = offset_in_page(SCp->ptr);
-       sg->length = SCp->this_residual;
+       sg_set_buf(sg, SCp->ptr, SCp->this_residual);
 
        if (bufs)
                memcpy(sg + 1, SCp->buffer + 1,
index be02147..7f8aa1b 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -621,18 +622,19 @@ static int piix_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 {
        static int printed_version;
        struct ata_port_info *port_info[2];
-       unsigned int combined = 0, n_ports = 1;
+       unsigned int combined = 0;
        unsigned int pata_chan = 0, sata_chan = 0;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev,
+                          "version " DRV_VERSION "\n");
 
        /* no hotplugging support (FIXME) */
        if (!in_module_init)
                return -ENODEV;
 
        port_info[0] = &piix_port_info[ent->driver_data];
-       port_info[1] = NULL;
+       port_info[1] = &piix_port_info[ent->driver_data];
 
        if (port_info[0]->host_flags & PIIX_FLAG_AHCI) {
                u8 tmp;
@@ -670,12 +672,13 @@ static int piix_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
                port_info[sata_chan] = &piix_port_info[ent->driver_data];
                port_info[sata_chan]->host_flags |= ATA_FLAG_SLAVE_POSS;
                port_info[pata_chan] = &piix_port_info[ich5_pata];
-               n_ports++;
 
-               printk(KERN_WARNING DRV_NAME ": combined mode detected\n");
+               dev_printk(KERN_WARNING, &pdev->dev,
+                          "combined mode detected (p=%u, s=%u)\n",
+                          pata_chan, sata_chan);
        }
 
-       return ata_pci_init_one(pdev, port_info, n_ports);
+       return ata_pci_init_one(pdev, port_info, 2);
 }
 
 static int __init piix_init(void)
index b1b1c6f..8be7dc0 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/suspend.h>
 #include <linux/workqueue.h>
 #include <linux/jiffies.h>
+#include <linux/scatterlist.h>
 #include <scsi/scsi.h>
 #include "scsi.h"
 #include "scsi_priv.h"
@@ -371,7 +372,7 @@ static void ata_tf_read_pio(struct ata_port *ap, struct ata_taskfile *tf)
        struct ata_ioports *ioaddr = &ap->ioaddr;
 
        tf->command = ata_check_status(ap);
-       tf->feature = ata_chk_err(ap);
+       tf->feature = inb(ioaddr->error_addr);
        tf->nsect = inb(ioaddr->nsect_addr);
        tf->lbal = inb(ioaddr->lbal_addr);
        tf->lbam = inb(ioaddr->lbam_addr);
@@ -405,7 +406,7 @@ static void ata_tf_read_mmio(struct ata_port *ap, struct ata_taskfile *tf)
        struct ata_ioports *ioaddr = &ap->ioaddr;
 
        tf->command = ata_check_status(ap);
-       tf->feature = ata_chk_err(ap);
+       tf->feature = readb((void __iomem *)ioaddr->error_addr);
        tf->nsect = readb((void __iomem *)ioaddr->nsect_addr);
        tf->lbal = readb((void __iomem *)ioaddr->lbal_addr);
        tf->lbam = readb((void __iomem *)ioaddr->lbam_addr);
@@ -525,30 +526,6 @@ u8 ata_altstatus(struct ata_port *ap)
 }
 
 
-/**
- *     ata_chk_err - Read device error reg
- *     @ap: port where the device is
- *
- *     Reads ATA taskfile error register for
- *     currently-selected device and return its value.
- *
- *     Note: may NOT be used as the check_err() entry in
- *     ata_port_operations.
- *
- *     LOCKING:
- *     Inherited from caller.
- */
-u8 ata_chk_err(struct ata_port *ap)
-{
-       if (ap->ops->check_err)
-               return ap->ops->check_err(ap);
-
-       if (ap->flags & ATA_FLAG_MMIO) {
-               return readb((void __iomem *) ap->ioaddr.error_addr);
-       }
-       return inb(ap->ioaddr.error_addr);
-}
-
 /**
  *     ata_tf_to_fis - Convert ATA taskfile to SATA FIS structure
  *     @tf: Taskfile to convert
@@ -901,8 +878,8 @@ static u8 ata_dev_try_classify(struct ata_port *ap, unsigned int device)
 
        memset(&tf, 0, sizeof(tf));
 
-       err = ata_chk_err(ap);
        ap->ops->tf_read(ap, &tf);
+       err = tf.feature;
 
        dev->class = ATA_DEV_NONE;
 
@@ -1139,7 +1116,6 @@ static void ata_dev_identify(struct ata_port *ap, unsigned int device)
        unsigned int major_version;
        u16 tmp;
        unsigned long xfer_modes;
-       u8 status;
        unsigned int using_edd;
        DECLARE_COMPLETION(wait);
        struct ata_queued_cmd *qc;
@@ -1193,8 +1169,11 @@ retry:
        else
                wait_for_completion(&wait);
 
-       status = ata_chk_status(ap);
-       if (status & ATA_ERR) {
+       spin_lock_irqsave(&ap->host_set->lock, flags);
+       ap->ops->tf_read(ap, &qc->tf);
+       spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+       if (qc->tf.command & ATA_ERR) {
                /*
                 * arg!  EDD works for all test cases, but seems to return
                 * the ATA signature for some ATAPI devices.  Until the
@@ -1207,7 +1186,7 @@ retry:
                 * to have this problem.
                 */
                if ((using_edd) && (qc->tf.command == ATA_CMD_ID_ATA)) {
-                       u8 err = ata_chk_err(ap);
+                       u8 err = qc->tf.feature;
                        if (err & ATA_ABORTED) {
                                dev->class = ATA_DEV_ATAPI;
                                qc->cursg = 0;
@@ -2576,19 +2555,12 @@ void ata_qc_prep(struct ata_queued_cmd *qc)
 
 void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf, unsigned int buflen)
 {
-       struct scatterlist *sg;
-
        qc->flags |= ATA_QCFLAG_SINGLE;
 
-       memset(&qc->sgent, 0, sizeof(qc->sgent));
        qc->sg = &qc->sgent;
        qc->n_elem = 1;
        qc->buf_virt = buf;
-
-       sg = qc->sg;
-       sg->page = virt_to_page(buf);
-       sg->offset = (unsigned long) buf & ~PAGE_MASK;
-       sg->length = buflen;
+       sg_init_one(qc->sg, buf, buflen);
 }
 
 /**
@@ -2691,7 +2663,7 @@ static int ata_sg_setup(struct ata_queued_cmd *qc)
  *     None.  (grabs host lock)
  */
 
-void ata_poll_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat)
+void ata_poll_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask)
 {
        struct ata_port *ap = qc->ap;
        unsigned long flags;
@@ -2699,7 +2671,7 @@ void ata_poll_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat)
        spin_lock_irqsave(&ap->host_set->lock, flags);
        ap->flags &= ~ATA_FLAG_NOINTR;
        ata_irq_on(ap);
-       ata_qc_complete(qc, drv_stat);
+       ata_qc_complete(qc, err_mask);
        spin_unlock_irqrestore(&ap->host_set->lock, flags);
 }
 
@@ -2796,7 +2768,7 @@ static int ata_pio_complete (struct ata_port *ap)
 
        ap->hsm_task_state = HSM_ST_IDLE;
 
-       ata_poll_qc_complete(qc, drv_stat);
+       ata_poll_qc_complete(qc, 0);
 
        /* another command may start at this point */
 
@@ -3164,18 +3136,15 @@ static void ata_pio_block(struct ata_port *ap)
 static void ata_pio_error(struct ata_port *ap)
 {
        struct ata_queued_cmd *qc;
-       u8 drv_stat;
+
+       printk(KERN_WARNING "ata%u: PIO error\n", ap->id);
 
        qc = ata_qc_from_tag(ap, ap->active_tag);
        assert(qc != NULL);
 
-       drv_stat = ata_chk_status(ap);
-       printk(KERN_WARNING "ata%u: PIO error, drv_stat 0x%x\n",
-              ap->id, drv_stat);
-
        ap->hsm_task_state = HSM_ST_IDLE;
 
-       ata_poll_qc_complete(qc, drv_stat | ATA_ERR);
+       ata_poll_qc_complete(qc, AC_ERR_ATA_BUS);
 }
 
 static void ata_pio_task(void *_data)
@@ -3298,7 +3267,7 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
                       ap->id, qc->tf.command, drv_stat, host_stat);
 
                /* complete taskfile transaction */
-               ata_qc_complete(qc, drv_stat);
+               ata_qc_complete(qc, ac_err_mask(drv_stat));
                break;
        }
 
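Note: from here on, completion takes an err_mask built from AC_ERR_* flags rather than the raw status byte. The mapping helpers ac_err_mask()/__ac_err_mask() are not shown in this hunk; a self-contained sketch of the kind of mapping their uses imply (flag values and exact policy are assumptions, not the libata definitions):

/* Illustrative mapping from an ATA status byte to an error mask, modelled on
 * how ac_err_mask()/__ac_err_mask() are called in this patch. */
#include <stdio.h>

#define ATA_BUSY	0x80	/* standard ATA status bits */
#define ATA_DF		0x20
#define ATA_ERR		0x01

#define AC_ERR_DEV	(1 << 0)	/* placeholder: device reported error */
#define AC_ERR_OTHER	(1 << 1)	/* placeholder: unspecified error */

static unsigned int ac_err_mask_sketch(unsigned char status)
{
	if (status & (ATA_BUSY | ATA_DF | ATA_ERR))
		return AC_ERR_DEV;
	return 0;
}

/* "__" variant: caller already knows something failed, so never return 0,
 * matching the old "status | ATA_ERR" forcing seen in the timeout paths. */
static unsigned int __ac_err_mask_sketch(unsigned char status)
{
	unsigned int mask = ac_err_mask_sketch(status);

	return mask ? mask : AC_ERR_OTHER;
}

int main(void)
{
	printf("0x50 -> %u, 0x51 -> %u, 0x50 forced -> %u\n",
	       ac_err_mask_sketch(0x50), ac_err_mask_sketch(0x51),
	       __ac_err_mask_sketch(0x50));
	return 0;
}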
@@ -3403,7 +3372,7 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
        return qc;
 }
 
-int ata_qc_complete_noop(struct ata_queued_cmd *qc, u8 drv_stat)
+int ata_qc_complete_noop(struct ata_queued_cmd *qc, unsigned int err_mask)
 {
        return 0;
 }
@@ -3462,7 +3431,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
  *     spin_lock_irqsave(host_set lock)
  */
 
-void ata_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat)
+void ata_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask)
 {
        int rc;
 
@@ -3479,7 +3448,7 @@ void ata_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat)
        qc->flags &= ~ATA_QCFLAG_ACTIVE;
 
        /* call completion callback */
-       rc = qc->complete_fn(qc, drv_stat);
+       rc = qc->complete_fn(qc, err_mask);
 
        /* if callback indicates not to complete command (non-zero),
         * return immediately
@@ -3917,7 +3886,7 @@ inline unsigned int ata_host_intr (struct ata_port *ap,
                ap->ops->irq_clear(ap);
 
                /* complete taskfile transaction */
-               ata_qc_complete(qc, status);
+               ata_qc_complete(qc, ac_err_mask(status));
                break;
 
        default:
@@ -4012,7 +3981,7 @@ static void atapi_packet_task(void *_data)
        /* sleep-wait for BSY to clear */
        DPRINTK("busy wait\n");
        if (ata_busy_sleep(ap, ATA_TMOUT_CDB_QUICK, ATA_TMOUT_CDB))
-               goto err_out;
+               goto err_out_status;
 
        /* make sure DRQ is set */
        status = ata_chk_status(ap);
@@ -4049,8 +4018,10 @@ static void atapi_packet_task(void *_data)
 
        return;
 
+err_out_status:
+       status = ata_chk_status(ap);
 err_out:
-       ata_poll_qc_complete(qc, ATA_ERR);
+       ata_poll_qc_complete(qc, __ac_err_mask(status));
 }
 
 
@@ -4556,11 +4527,11 @@ ata_pci_init_native_mode(struct pci_dev *pdev, struct ata_port_info **port, int
        return probe_ent;
 }
 
-static struct ata_probe_ent *ata_pci_init_legacy_port(struct pci_dev *pdev, struct ata_port_info **port, int port_num)
+static struct ata_probe_ent *ata_pci_init_legacy_port(struct pci_dev *pdev, struct ata_port_info *port, int port_num)
 {
        struct ata_probe_ent *probe_ent;
 
-       probe_ent = ata_probe_ent_alloc(pci_dev_to_dev(pdev), port[0]);
+       probe_ent = ata_probe_ent_alloc(pci_dev_to_dev(pdev), port);
        if (!probe_ent)
                return NULL;
 
@@ -4707,9 +4678,9 @@ int ata_pci_init_one (struct pci_dev *pdev, struct ata_port_info **port_info,
 
        if (legacy_mode) {
                if (legacy_mode & (1 << 0))
-                       probe_ent = ata_pci_init_legacy_port(pdev, port, 0);
+                       probe_ent = ata_pci_init_legacy_port(pdev, port[0], 0);
                if (legacy_mode & (1 << 1))
-                       probe_ent2 = ata_pci_init_legacy_port(pdev, port, 1);
+                       probe_ent2 = ata_pci_init_legacy_port(pdev, port[1], 1);
        } else {
                if (n_ports == 2)
                        probe_ent = ata_pci_init_native_mode(pdev, port, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
@@ -4873,7 +4844,6 @@ EXPORT_SYMBOL_GPL(ata_tf_to_fis);
 EXPORT_SYMBOL_GPL(ata_tf_from_fis);
 EXPORT_SYMBOL_GPL(ata_check_status);
 EXPORT_SYMBOL_GPL(ata_altstatus);
-EXPORT_SYMBOL_GPL(ata_chk_err);
 EXPORT_SYMBOL_GPL(ata_exec_command);
 EXPORT_SYMBOL_GPL(ata_port_start);
 EXPORT_SYMBOL_GPL(ata_port_stop);
index 89a04b1..1e3792f 100644 (file)
@@ -560,7 +560,7 @@ void ata_gen_ata_desc_sense(struct ata_queued_cmd *qc)
         * Use ata_to_sense_error() to map status register bits
         * onto sense key, asc & ascq.
         */
-       if (unlikely(tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ))) {
+       if (tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) {
                ata_to_sense_error(qc->ap->id, tf->command, tf->feature,
                                   &sb[1], &sb[2], &sb[3]);
                sb[1] &= 0x0f;
@@ -635,7 +635,7 @@ void ata_gen_fixed_sense(struct ata_queued_cmd *qc)
         * Use ata_to_sense_error() to map status register bits
         * onto sense key, asc & ascq.
         */
-       if (unlikely(tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ))) {
+       if (tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) {
                ata_to_sense_error(qc->ap->id, tf->command, tf->feature,
                                   &sb[2], &sb[12], &sb[13]);
                sb[2] &= 0x0f;
@@ -644,7 +644,11 @@ void ata_gen_fixed_sense(struct ata_queued_cmd *qc)
        sb[0] = 0x70;
        sb[7] = 0x0a;
 
-       if (tf->flags & ATA_TFLAG_LBA && !(tf->flags & ATA_TFLAG_LBA48)) {
+       if (tf->flags & ATA_TFLAG_LBA48) {
+               /* TODO: find solution for LBA48 descriptors */
+       }
+
+       else if (tf->flags & ATA_TFLAG_LBA) {
                /* A small (28b) LBA will fit in the 32b info field */
                sb[0] |= 0x80;          /* set valid bit */
                sb[3] = tf->device & 0x0f;
@@ -652,6 +656,10 @@ void ata_gen_fixed_sense(struct ata_queued_cmd *qc)
                sb[5] = tf->lbam;
                sb[6] = tf->lbal;
        }
+
+       else {
+               /* TODO: C/H/S */
+       }
 }
 
 /**
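Note: in the fixed-sense path above, a 28-bit LBA is spread across the device register's low nibble and the three LBA bytes before being copied into sense bytes 3..6. A small standalone check of that packing arithmetic (layout as in the code above; other SCSI details omitted):

/* Reassemble a 28-bit LBA from the taskfile fields copied into the sense
 * buffer: device[3:0] = LBA 27:24, lbah/lbam/lbal = LBA 23:0. */
#include <stdint.h>
#include <stdio.h>

static uint32_t lba28_from_tf(uint8_t device, uint8_t lbah,
			      uint8_t lbam, uint8_t lbal)
{
	return ((uint32_t)(device & 0x0f) << 24) |
	       ((uint32_t)lbah << 16) |
	       ((uint32_t)lbam << 8) |
	       lbal;
}

int main(void)
{
	/* example: device 0xEA, bytes BC DE F0 -> LBA 0x0ABCDEF0 */
	printf("0x%08x\n", lba28_from_tf(0xEA, 0xBC, 0xDE, 0xF0));
	return 0;
}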
@@ -1199,10 +1207,12 @@ nothing_to_do:
        return 1;
 }
 
-static int ata_scsi_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat)
+static int ata_scsi_qc_complete(struct ata_queued_cmd *qc,
+                               unsigned int err_mask)
 {
        struct scsi_cmnd *cmd = qc->scsicmd;
-       int need_sense = drv_stat & (ATA_ERR | ATA_BUSY | ATA_DRQ);
+       u8 *cdb = cmd->cmnd;
+       int need_sense = (err_mask != 0);
 
        /* For ATA pass thru (SAT) commands, generate a sense block if
         * user mandated it or if there's an error.  Note that if we
@@ -1211,8 +1221,8 @@ static int ata_scsi_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat)
         * whether the command completed successfully or not. If there
         * was no error, SK, ASC and ASCQ will all be zero.
         */
-       if (((cmd->cmnd[0] == ATA_16) || (cmd->cmnd[0] == ATA_12)) &&
-           ((cmd->cmnd[2] & 0x20) || need_sense)) {
+       if (((cdb[0] == ATA_16) || (cdb[0] == ATA_12)) &&
+           ((cdb[2] & 0x20) || need_sense)) {
                ata_gen_ata_desc_sense(qc);
        } else {
                if (!need_sense) {
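Note: with err_mask, "need sense" becomes a simple nonzero test, and the ATA_12/ATA_16 pass-through path additionally honours the CK_COND-style bit checked as cdb[2] & 0x20 above. A standalone sketch of that decision; the opcode values follow the usual SAT definitions but are local placeholders, not taken from the kernel headers:

/* Sketch of the "generate sense data?" decision in ata_scsi_qc_complete(). */
#include <stdbool.h>
#include <stdio.h>

#define ATA_12	0xa1	/* assumed: ATA PASS-THROUGH (12) */
#define ATA_16	0x85	/* assumed: ATA PASS-THROUGH (16) */

static bool want_sense(const unsigned char *cdb, unsigned int err_mask)
{
	bool need_sense = (err_mask != 0);
	bool passthru = (cdb[0] == ATA_16) || (cdb[0] == ATA_12);

	/* pass-through also gets sense when the 0x20 bit of byte 2 is set */
	return need_sense || (passthru && (cdb[2] & 0x20));
}

int main(void)
{
	unsigned char cdb[16] = { ATA_16, 0, 0x20 };

	printf("%d %d\n", want_sense(cdb, 0), want_sense(cdb, 1));
	return 0;
}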
@@ -1995,21 +2005,13 @@ void atapi_request_sense(struct ata_port *ap, struct ata_device *dev,
        DPRINTK("EXIT\n");
 }
 
-static int atapi_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat)
+static int atapi_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask)
 {
        struct scsi_cmnd *cmd = qc->scsicmd;
 
-       VPRINTK("ENTER, drv_stat == 0x%x\n", drv_stat);
-
-       if (unlikely(drv_stat & (ATA_BUSY | ATA_DRQ)))
-               /* FIXME: not quite right; we don't want the
-                * translation of taskfile registers into
-                * a sense descriptors, since that's only
-                * correct for ATA, not ATAPI
-                */
-               ata_gen_ata_desc_sense(qc);
+       VPRINTK("ENTER, err_mask 0x%X\n", err_mask);
 
-       else if (unlikely(drv_stat & ATA_ERR)) {
+       if (unlikely(err_mask & AC_ERR_DEV)) {
                DPRINTK("request check condition\n");
 
                /* FIXME: command completion with check condition
@@ -2026,6 +2028,14 @@ static int atapi_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat)
                return 1;
        }
 
+       else if (unlikely(err_mask))
+               /* FIXME: not quite right; we don't want the
+                * translation of taskfile registers into
+                * sense descriptors, since that's only
+                * correct for ATA, not ATAPI
+                */
+               ata_gen_ata_desc_sense(qc);
+
        else {
                u8 *scsicmd = cmd->cmnd;
 
index 65c264b..10ecd9e 100644 (file)
@@ -39,7 +39,7 @@ struct ata_scsi_args {
 
 /* libata-core.c */
 extern int atapi_enabled;
-extern int ata_qc_complete_noop(struct ata_queued_cmd *qc, u8 drv_stat);
+extern int ata_qc_complete_noop(struct ata_queued_cmd *qc, unsigned int err_mask);
 extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
                                      struct ata_device *dev);
 extern void ata_rwcmd_protocol(struct ata_queued_cmd *qc);
index af99feb..665017e 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <asm/io.h>
@@ -451,7 +452,7 @@ static inline unsigned int adma_intr_pkt(struct ata_host_set *host_set)
                struct adma_port_priv *pp;
                struct ata_queued_cmd *qc;
                void __iomem *chan = ADMA_REGS(mmio_base, port_no);
-               u8 drv_stat = 0, status = readb(chan + ADMA_STATUS);
+               u8 status = readb(chan + ADMA_STATUS);
 
                if (status == 0)
                        continue;
@@ -464,11 +465,14 @@ static inline unsigned int adma_intr_pkt(struct ata_host_set *host_set)
                        continue;
                qc = ata_qc_from_tag(ap, ap->active_tag);
                if (qc && (!(qc->tf.ctl & ATA_NIEN))) {
+                       unsigned int err_mask = 0;
+
                        if ((status & (aPERR | aPSD | aUIRQ)))
-                               drv_stat = ATA_ERR;
+                               err_mask = AC_ERR_OTHER;
                        else if (pp->pkt[0] != cDONE)
-                               drv_stat = ATA_ERR;
-                       ata_qc_complete(qc, drv_stat);
+                               err_mask = AC_ERR_OTHER;
+
+                       ata_qc_complete(qc, err_mask);
                }
        }
        return handled;
@@ -498,7 +502,7 @@ static inline unsigned int adma_intr_mmio(struct ata_host_set *host_set)
                
                                /* complete taskfile transaction */
                                pp->state = adma_state_idle;
-                               ata_qc_complete(qc, status);
+                               ata_qc_complete(qc, ac_err_mask(status));
                                handled = 1;
                        }
                }
@@ -623,16 +627,14 @@ static int adma_set_dma_masks(struct pci_dev *pdev, void __iomem *mmio_base)
 
        rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
        if (rc) {
-               printk(KERN_ERR DRV_NAME
-                       "(%s): 32-bit DMA enable failed\n",
-                       pci_name(pdev));
+               dev_printk(KERN_ERR, &pdev->dev,
+                       "32-bit DMA enable failed\n");
                return rc;
        }
        rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
        if (rc) {
-               printk(KERN_ERR DRV_NAME
-                       "(%s): 32-bit consistent DMA enable failed\n",
-                       pci_name(pdev));
+               dev_printk(KERN_ERR, &pdev->dev,
+                       "32-bit consistent DMA enable failed\n");
                return rc;
        }
        return 0;
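Note: this and the following drivers switch from printk() with a hand-built DRV_NAME "(%s)" / pci_name() prefix to dev_printk(), which derives the prefix from the struct device. A minimal sketch of the converted pattern (the function name is a placeholder, not from any one driver in this patch):

/* Sketch only: the logging shape being converted to, assuming the
 * dev_printk(level, dev, fmt, ...) helper used throughout this patch. */
#include <linux/pci.h>
#include <linux/device.h>

static int example_set_dma_mask(struct pci_dev *pdev)
{
	int rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);

	if (rc)
		/* driver and device names are emitted by dev_printk() */
		dev_printk(KERN_ERR, &pdev->dev,
			   "32-bit DMA enable failed\n");
	return rc;
}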
@@ -648,7 +650,7 @@ static int adma_ata_init_one(struct pci_dev *pdev,
        int rc, port_no;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        rc = pci_enable_device(pdev);
        if (rc)
index 422e0b6..46dbdee 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/dma-mapping.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -258,7 +259,6 @@ struct mv_host_priv {
 static void mv_irq_clear(struct ata_port *ap);
 static u32 mv_scr_read(struct ata_port *ap, unsigned int sc_reg_in);
 static void mv_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val);
-static u8 mv_check_err(struct ata_port *ap);
 static void mv_phy_reset(struct ata_port *ap);
 static void mv_host_stop(struct ata_host_set *host_set);
 static int mv_port_start(struct ata_port *ap);
@@ -296,7 +296,6 @@ static const struct ata_port_operations mv_ops = {
        .tf_load                = ata_tf_load,
        .tf_read                = ata_tf_read,
        .check_status           = ata_check_status,
-       .check_err              = mv_check_err,
        .exec_command           = ata_exec_command,
        .dev_select             = ata_std_dev_select,
 
@@ -1067,6 +1066,7 @@ static void mv_host_intr(struct ata_host_set *host_set, u32 relevant,
        struct ata_queued_cmd *qc;
        u32 hc_irq_cause;
        int shift, port, port0, hard_port, handled;
+       unsigned int err_mask;
        u8 ata_status = 0;
 
        if (hc == 0) {
@@ -1102,15 +1102,15 @@ static void mv_host_intr(struct ata_host_set *host_set, u32 relevant,
                        handled++;
                }
 
+               err_mask = ac_err_mask(ata_status);
+
                shift = port << 1;              /* (port * 2) */
                if (port >= MV_PORTS_PER_HC) {
                        shift++;        /* skip bit 8 in the HC Main IRQ reg */
                }
                if ((PORT0_ERR << shift) & relevant) {
                        mv_err_intr(ap);
-                       /* OR in ATA_ERR to ensure libata knows we took one */
-                       ata_status = readb((void __iomem *)
-                                          ap->ioaddr.status_addr) | ATA_ERR;
+                       err_mask |= AC_ERR_OTHER;
                        handled++;
                }
                
@@ -1120,7 +1120,7 @@ static void mv_host_intr(struct ata_host_set *host_set, u32 relevant,
                                VPRINTK("port %u IRQ found for qc, "
                                        "ata_status 0x%x\n", port,ata_status);
                                /* mark qc status appropriately */
-                               ata_qc_complete(qc, ata_status);
+                               ata_qc_complete(qc, err_mask);
                        }
                }
        }
@@ -1184,22 +1184,6 @@ static irqreturn_t mv_interrupt(int irq, void *dev_instance,
        return IRQ_RETVAL(handled);
 }
 
-/**
- *      mv_check_err - Return the error shadow register to caller.
- *      @ap: ATA channel to manipulate
- *
- *      Marvell requires DMA to be stopped before accessing shadow
- *      registers.  So we do that, then return the needed register.
- *
- *      LOCKING:
- *      Inherited from caller.  FIXME: protect mv_stop_dma with lock?
- */
-static u8 mv_check_err(struct ata_port *ap)
-{
-       mv_stop_dma(ap);                /* can't read shadow regs if DMA on */
-       return readb((void __iomem *) ap->ioaddr.error_addr);
-}
-
 /**
  *      mv_phy_reset - Perform eDMA reset followed by COMRESET
  *      @ap: ATA channel to manipulate
@@ -1312,7 +1296,7 @@ static void mv_eng_timeout(struct ata_port *ap)
                 */
                spin_lock_irqsave(&ap->host_set->lock, flags);
                qc->scsidone = scsi_finish_command;
-               ata_qc_complete(qc, ATA_ERR);
+               ata_qc_complete(qc, AC_ERR_OTHER);
                spin_unlock_irqrestore(&ap->host_set->lock, flags);
        }
 }
@@ -1454,9 +1438,9 @@ static void mv_print_info(struct ata_probe_ent *probe_ent)
        else
                scc_s = "unknown";
 
-       printk(KERN_INFO DRV_NAME 
-              "(%s) %u slots %u ports %s mode IRQ via %s\n",
-              pci_name(pdev), (unsigned)MV_MAX_Q_DEPTH, probe_ent->n_ports, 
+       dev_printk(KERN_INFO, &pdev->dev,
+              "%u slots %u ports %s mode IRQ via %s\n",
+              (unsigned)MV_MAX_Q_DEPTH, probe_ent->n_ports, 
               scc_s, (MV_HP_FLAG_MSI & hpriv->hp_flags) ? "MSI" : "INTx");
 }
 
@@ -1477,9 +1461,8 @@ static int mv_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        void __iomem *mmio_base;
        int pci_dev_busy = 0, rc;
 
-       if (!printed_version++) {
-               printk(KERN_INFO DRV_NAME " version " DRV_VERSION "\n");
-       }
+       if (!printed_version++)
+               dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n");
 
        rc = pci_enable_device(pdev);
        if (rc) {
index 1a56d6c..d573888 100644 (file)
@@ -61,6 +61,7 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -383,7 +384,7 @@ static int nv_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
                        return -ENODEV;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        rc = pci_enable_device(pdev);
        if (rc)
index 63911f1..b41c977 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -399,7 +400,8 @@ static void pdc_eng_timeout(struct ata_port *ap)
        case ATA_PROT_DMA:
        case ATA_PROT_NODATA:
                printk(KERN_ERR "ata%u: command timeout\n", ap->id);
-               ata_qc_complete(qc, ata_wait_idle(ap) | ATA_ERR);
+               drv_stat = ata_wait_idle(ap);
+               ata_qc_complete(qc, __ac_err_mask(drv_stat));
                break;
 
        default:
@@ -408,7 +410,7 @@ static void pdc_eng_timeout(struct ata_port *ap)
                printk(KERN_ERR "ata%u: unknown timeout, cmd 0x%x stat 0x%x\n",
                       ap->id, qc->tf.command, drv_stat);
 
-               ata_qc_complete(qc, drv_stat);
+               ata_qc_complete(qc, ac_err_mask(drv_stat));
                break;
        }
 
@@ -420,24 +422,21 @@ out:
 static inline unsigned int pdc_host_intr( struct ata_port *ap,
                                           struct ata_queued_cmd *qc)
 {
-       u8 status;
-       unsigned int handled = 0, have_err = 0;
+       unsigned int handled = 0, err_mask = 0;
        u32 tmp;
        void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr + PDC_GLOBAL_CTL;
 
        tmp = readl(mmio);
        if (tmp & PDC_ERR_MASK) {
-               have_err = 1;
+               err_mask = AC_ERR_DEV;
                pdc_reset_port(ap);
        }
 
        switch (qc->tf.protocol) {
        case ATA_PROT_DMA:
        case ATA_PROT_NODATA:
-               status = ata_wait_idle(ap);
-               if (have_err)
-                       status |= ATA_ERR;
-               ata_qc_complete(qc, status);
+               err_mask |= ac_err_mask(ata_wait_idle(ap));
+               ata_qc_complete(qc, err_mask);
                handled = 1;
                break;
 
@@ -635,7 +634,7 @@ static int pdc_ata_init_one (struct pci_dev *pdev, const struct pci_device_id *e
        int rc;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        /*
         * If this driver happens to only be useful on Apple's K2, then
index 1aaf330..9938dae 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <asm/io.h>
@@ -400,11 +401,12 @@ static inline unsigned int qs_intr_pkt(struct ata_host_set *host_set)
                                qc = ata_qc_from_tag(ap, ap->active_tag);
                                if (qc && (!(qc->tf.ctl & ATA_NIEN))) {
                                        switch (sHST) {
-                                       case 0: /* sucessful CPB */
+                                       case 0: /* successful CPB */
                                        case 3: /* device error */
                                                pp->state = qs_state_idle;
                                                qs_enter_reg_mode(qc->ap);
-                                               ata_qc_complete(qc, sDST);
+                                               ata_qc_complete(qc,
+                                                       ac_err_mask(sDST));
                                                break;
                                        default:
                                                break;
@@ -441,7 +443,7 @@ static inline unsigned int qs_intr_mmio(struct ata_host_set *host_set)
 
                                /* complete taskfile transaction */
                                pp->state = qs_state_idle;
-                               ata_qc_complete(qc, status);
+                               ata_qc_complete(qc, ac_err_mask(status));
                                handled = 1;
                        }
                }
@@ -599,25 +601,22 @@ static int qs_set_dma_masks(struct pci_dev *pdev, void __iomem *mmio_base)
                if (rc) {
                        rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
                        if (rc) {
-                               printk(KERN_ERR DRV_NAME
-                                       "(%s): 64-bit DMA enable failed\n",
-                                       pci_name(pdev));
+                               dev_printk(KERN_ERR, &pdev->dev,
+                                          "64-bit DMA enable failed\n");
                                return rc;
                        }
                }
        } else {
                rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
                if (rc) {
-                       printk(KERN_ERR DRV_NAME
-                               "(%s): 32-bit DMA enable failed\n",
-                               pci_name(pdev));
+                       dev_printk(KERN_ERR, &pdev->dev,
+                               "32-bit DMA enable failed\n");
                        return rc;
                }
                rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
                if (rc) {
-                       printk(KERN_ERR DRV_NAME
-                               "(%s): 32-bit consistent DMA enable failed\n",
-                               pci_name(pdev));
+                       dev_printk(KERN_ERR, &pdev->dev,
+                               "32-bit consistent DMA enable failed\n");
                        return rc;
                }
        }
@@ -634,7 +633,7 @@ static int qs_ata_init_one(struct pci_dev *pdev,
        int rc, port_no;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        rc = pci_enable_device(pdev);
        if (rc)
index 3a05617..435f7e0 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -386,7 +387,7 @@ static int sil_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
        u8 cls;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        /*
         * If this driver happens to only be useful on Apple's K2, then
@@ -463,8 +464,8 @@ static int sil_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
                        writeb(cls, mmio_base + SIL_FIFO_W3);
                }
        } else
-               printk(KERN_WARNING DRV_NAME "(%s): cache line size not set.  Driver may not function\n",
-                       pci_name(pdev));
+               dev_printk(KERN_WARNING, &pdev->dev,
+                        "cache line size not set.  Driver may not function\n");
 
        if (ent->driver_data == sil_3114) {
                irq_mask = SIL_MASK_4PORT;
index 51855d3..c665480 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/dma-mapping.h>
+#include <linux/device.h>
 #include <scsi/scsi_host.h>
 #include "scsi.h"
 #include <linux/libata.h>
@@ -225,7 +226,6 @@ struct sil24_host_priv {
 };
 
 static u8 sil24_check_status(struct ata_port *ap);
-static u8 sil24_check_err(struct ata_port *ap);
 static u32 sil24_scr_read(struct ata_port *ap, unsigned sc_reg);
 static void sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val);
 static void sil24_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
@@ -280,7 +280,6 @@ static const struct ata_port_operations sil24_ops = {
 
        .check_status           = sil24_check_status,
        .check_altstatus        = sil24_check_status,
-       .check_err              = sil24_check_err,
        .dev_select             = ata_noop_dev_select,
 
        .tf_read                = sil24_tf_read,
@@ -363,12 +362,6 @@ static u8 sil24_check_status(struct ata_port *ap)
        return pp->tf.command;
 }
 
-static u8 sil24_check_err(struct ata_port *ap)
-{
-       struct sil24_port_priv *pp = ap->private_data;
-       return pp->tf.feature;
-}
-
 static int sil24_scr_map[] = {
        [SCR_CONTROL]   = 0,
        [SCR_STATUS]    = 1,
@@ -506,7 +499,7 @@ static void sil24_eng_timeout(struct ata_port *ap)
 
        qc = ata_qc_from_tag(ap, ap->active_tag);
        if (!qc) {
-               printk(KERN_ERR "ata%u: BUG: tiemout without command\n",
+               printk(KERN_ERR "ata%u: BUG: timeout without command\n",
                       ap->id);
                return;
        }
@@ -520,7 +513,7 @@ static void sil24_eng_timeout(struct ata_port *ap)
         */
        printk(KERN_ERR "ata%u: command timeout\n", ap->id);
        qc->scsidone = scsi_finish_command;
-       ata_qc_complete(qc, ATA_ERR);
+       ata_qc_complete(qc, AC_ERR_OTHER);
 
        sil24_reset_controller(ap);
 }
@@ -531,6 +524,7 @@ static void sil24_error_intr(struct ata_port *ap, u32 slot_stat)
        struct sil24_port_priv *pp = ap->private_data;
        void __iomem *port = (void __iomem *)ap->ioaddr.cmd_addr;
        u32 irq_stat, cmd_err, sstatus, serror;
+       unsigned int err_mask;
 
        irq_stat = readl(port + PORT_IRQ_STAT);
        writel(irq_stat, port + PORT_IRQ_STAT);         /* clear irq */
@@ -558,17 +552,18 @@ static void sil24_error_intr(struct ata_port *ap, u32 slot_stat)
                 * Device is reporting error, tf registers are valid.
                 */
                sil24_update_tf(ap);
+               err_mask = ac_err_mask(pp->tf.command);
        } else {
                /*
                 * Other errors.  libata currently doesn't have any
                 * mechanism to report these errors.  Just turn on
                 * ATA_ERR.
                 */
-               pp->tf.command = ATA_ERR;
+               err_mask = AC_ERR_OTHER;
        }
 
        if (qc)
-               ata_qc_complete(qc, pp->tf.command);
+               ata_qc_complete(qc, err_mask);
 
        sil24_reset_controller(ap);
 }
@@ -593,7 +588,7 @@ static inline void sil24_host_intr(struct ata_port *ap)
                sil24_update_tf(ap);
 
                if (qc)
-                       ata_qc_complete(qc, pp->tf.command);
+                       ata_qc_complete(qc, ac_err_mask(pp->tf.command));
        } else
                sil24_error_intr(ap, slot_stat);
 }
@@ -696,7 +691,7 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        int i, rc;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        rc = pci_enable_device(pdev);
        if (rc)
@@ -756,14 +751,14 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
         */
        rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
        if (rc) {
-               printk(KERN_ERR DRV_NAME "(%s): 32-bit DMA enable failed\n",
-                      pci_name(pdev));
+               dev_printk(KERN_ERR, &pdev->dev,
+                          "32-bit DMA enable failed\n");
                goto out_free;
        }
        rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
        if (rc) {
-               printk(KERN_ERR DRV_NAME "(%s): 32-bit consistent DMA enable failed\n",
-                      pci_name(pdev));
+               dev_printk(KERN_ERR, &pdev->dev,
+                          "32-bit consistent DMA enable failed\n");
                goto out_free;
        }
 
@@ -799,9 +794,8 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
                                        break;
                        }
                        if (tmp & PORT_CS_PORT_RST)
-                               printk(KERN_ERR DRV_NAME
-                                      "(%s): failed to clear port RST\n",
-                                      pci_name(pdev));
+                               dev_printk(KERN_ERR, &pdev->dev,
+                                          "failed to clear port RST\n");
                }
 
                /* Zero error counters. */
@@ -830,9 +824,8 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
                /* Reset itself */
                if (__sil24_reset_controller(port))
-                       printk(KERN_ERR DRV_NAME
-                              "(%s): failed to reset controller\n",
-                              pci_name(pdev));
+                       dev_printk(KERN_ERR, &pdev->dev,
+                                  "failed to reset controller\n");
        }
 
        /* Turn on interrupts */
index 057f7b9..42288be 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -237,6 +238,7 @@ static void sis_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
 
 static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 {
+       static int printed_version;
        struct ata_probe_ent *probe_ent = NULL;
        int rc;
        u32 genctl;
@@ -245,6 +247,9 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
        u8 pmr;
        u8 port2_start;
 
+       if (!printed_version++)
+               dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n");
+
        rc = pci_enable_device(pdev);
        if (rc)
                return rc;
@@ -288,16 +293,18 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
        pci_read_config_byte(pdev, SIS_PMR, &pmr);
        if (ent->device != 0x182) {
                if ((pmr & SIS_PMR_COMBINED) == 0) {
-                       printk(KERN_INFO "sata_sis: Detected SiS 180/181 chipset in SATA mode\n");
+                       dev_printk(KERN_INFO, &pdev->dev,
+                                  "Detected SiS 180/181 chipset in SATA mode\n");
                        port2_start = 64;
                }
                else {
-                       printk(KERN_INFO "sata_sis: Detected SiS 180/181 chipset in combined mode\n");
+                       dev_printk(KERN_INFO, &pdev->dev,
+                                  "Detected SiS 180/181 chipset in combined mode\n");
                        port2_start=0;
                }
        }
        else {
-               printk(KERN_INFO "sata_sis: Detected SiS 182 chipset\n");
+               dev_printk(KERN_INFO, &pdev->dev, "Detected SiS 182 chipset\n");
                port2_start = 0x20;
        }
 
index 46208f5..db615ff 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -362,7 +363,7 @@ static int k2_sata_init_one (struct pci_dev *pdev, const struct pci_device_id *e
        int i;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        /*
         * If this driver happens to only be useful on Apple's K2, then
index af08f4f..0ec21e0 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -718,7 +719,7 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap,
                        VPRINTK("ata%u: read hdma, 0x%x 0x%x\n", ap->id,
                                readl(mmio + 0x104), readl(mmio + PDC_HDMA_CTLSTAT));
                        /* get drive status; clear intr; complete txn */
-                       ata_qc_complete(qc, ata_wait_idle(ap));
+                       ata_qc_complete(qc, ac_err_mask(ata_wait_idle(ap)));
                        pdc20621_pop_hdma(qc);
                }
 
@@ -756,7 +757,7 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap,
                        VPRINTK("ata%u: write ata, 0x%x 0x%x\n", ap->id,
                                readl(mmio + 0x104), readl(mmio + PDC_HDMA_CTLSTAT));
                        /* get drive status; clear intr; complete txn */
-                       ata_qc_complete(qc, ata_wait_idle(ap));
+                       ata_qc_complete(qc, ac_err_mask(ata_wait_idle(ap)));
                        pdc20621_pop_hdma(qc);
                }
                handled = 1;
@@ -766,7 +767,7 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap,
 
                status = ata_busy_wait(ap, ATA_BUSY | ATA_DRQ, 1000);
                DPRINTK("BUS_NODATA (drv_stat 0x%X)\n", status);
-               ata_qc_complete(qc, status);
+               ata_qc_complete(qc, ac_err_mask(status));
                handled = 1;
 
        } else {
@@ -881,7 +882,7 @@ static void pdc_eng_timeout(struct ata_port *ap)
        case ATA_PROT_DMA:
        case ATA_PROT_NODATA:
                printk(KERN_ERR "ata%u: command timeout\n", ap->id);
-               ata_qc_complete(qc, ata_wait_idle(ap) | ATA_ERR);
+               ata_qc_complete(qc, __ac_err_mask(ata_wait_idle(ap)));
                break;
 
        default:
@@ -890,7 +891,7 @@ static void pdc_eng_timeout(struct ata_port *ap)
                printk(KERN_ERR "ata%u: unknown timeout, cmd 0x%x stat 0x%x\n",
                       ap->id, qc->tf.command, drv_stat);
 
-               ata_qc_complete(qc, drv_stat);
+               ata_qc_complete(qc, ac_err_mask(drv_stat));
                break;
        }
 
@@ -1385,7 +1386,7 @@ static int pdc_sata_init_one (struct pci_dev *pdev, const struct pci_device_id *
        int rc;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        /*
         * If this driver happens to only be useful on Apple's K2, then
index d68dc7d..a5e245c 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -178,12 +179,16 @@ static void uli_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
 
 static int uli_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 {
+       static int printed_version;
        struct ata_probe_ent *probe_ent;
        struct ata_port_info *ppi;
        int rc;
        unsigned int board_idx = (unsigned int) ent->driver_data;
        int pci_dev_busy = 0;
 
+       if (!printed_version++)
+               dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n");
+
        rc = pci_enable_device(pdev);
        if (rc)
                return rc;
index 80e291a..b3ecdbe 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -259,15 +260,15 @@ static void svia_configure(struct pci_dev *pdev)
        u8 tmp8;
 
        pci_read_config_byte(pdev, PCI_INTERRUPT_LINE, &tmp8);
-       printk(KERN_INFO DRV_NAME "(%s): routed to hard irq line %d\n",
-              pci_name(pdev),
+       dev_printk(KERN_INFO, &pdev->dev, "routed to hard irq line %d\n",
               (int) (tmp8 & 0xf0) == 0xf0 ? 0 : tmp8 & 0x0f);
 
        /* make sure SATA channels are enabled */
        pci_read_config_byte(pdev, SATA_CHAN_ENAB, &tmp8);
        if ((tmp8 & ALL_PORTS) != ALL_PORTS) {
-               printk(KERN_DEBUG DRV_NAME "(%s): enabling SATA channels (0x%x)\n",
-                      pci_name(pdev), (int) tmp8);
+               dev_printk(KERN_DEBUG, &pdev->dev,
+                          "enabling SATA channels (0x%x)\n",
+                          (int) tmp8);
                tmp8 |= ALL_PORTS;
                pci_write_config_byte(pdev, SATA_CHAN_ENAB, tmp8);
        }
@@ -275,8 +276,9 @@ static void svia_configure(struct pci_dev *pdev)
        /* make sure interrupts for each channel sent to us */
        pci_read_config_byte(pdev, SATA_INT_GATE, &tmp8);
        if ((tmp8 & ALL_PORTS) != ALL_PORTS) {
-               printk(KERN_DEBUG DRV_NAME "(%s): enabling SATA channel interrupts (0x%x)\n",
-                      pci_name(pdev), (int) tmp8);
+               dev_printk(KERN_DEBUG, &pdev->dev,
+                          "enabling SATA channel interrupts (0x%x)\n",
+                          (int) tmp8);
                tmp8 |= ALL_PORTS;
                pci_write_config_byte(pdev, SATA_INT_GATE, tmp8);
        }
@@ -284,8 +286,9 @@ static void svia_configure(struct pci_dev *pdev)
        /* make sure native mode is enabled */
        pci_read_config_byte(pdev, SATA_NATIVE_MODE, &tmp8);
        if ((tmp8 & NATIVE_MODE_ALL) != NATIVE_MODE_ALL) {
-               printk(KERN_DEBUG DRV_NAME "(%s): enabling SATA channel native mode (0x%x)\n",
-                      pci_name(pdev), (int) tmp8);
+               dev_printk(KERN_DEBUG, &pdev->dev,
+                          "enabling SATA channel native mode (0x%x)\n",
+                          (int) tmp8);
                tmp8 |= NATIVE_MODE_ALL;
                pci_write_config_byte(pdev, SATA_NATIVE_MODE, tmp8);
        }
@@ -303,7 +306,7 @@ static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
        u8 tmp8;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        rc = pci_enable_device(pdev);
        if (rc)
@@ -318,8 +321,9 @@ static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
        if (board_id == vt6420) {
                pci_read_config_byte(pdev, SATA_PATA_SHARING, &tmp8);
                if (tmp8 & SATA_2DEV) {
-                       printk(KERN_ERR DRV_NAME "(%s): SATA master/slave not supported (0x%x)\n",
-                       pci_name(pdev), (int) tmp8);
+                       dev_printk(KERN_ERR, &pdev->dev,
+                                  "SATA master/slave not supported (0x%x)\n",
+                                  (int) tmp8);
                        rc = -EIO;
                        goto err_out_regions;
                }
@@ -332,10 +336,11 @@ static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
        for (i = 0; i < ARRAY_SIZE(svia_bar_sizes); i++)
                if ((pci_resource_start(pdev, i) == 0) ||
                    (pci_resource_len(pdev, i) < bar_sizes[i])) {
-                       printk(KERN_ERR DRV_NAME "(%s): invalid PCI BAR %u (sz 0x%lx, val 0x%lx)\n",
-                              pci_name(pdev), i,
-                              pci_resource_start(pdev, i),
-                              pci_resource_len(pdev, i));
+                       dev_printk(KERN_ERR, &pdev->dev,
+                                  "invalid PCI BAR %u (sz 0x%lx, val 0x%lx)\n",
+                                  i,
+                                  pci_resource_start(pdev, i),
+                                  pci_resource_len(pdev, i));
                        rc = -ENODEV;
                        goto err_out_regions;
                }
@@ -353,8 +358,7 @@ static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
                probe_ent = vt6421_init_probe_ent(pdev);
 
        if (!probe_ent) {
-               printk(KERN_ERR DRV_NAME "(%s): out of memory\n",
-                      pci_name(pdev));
+               dev_printk(KERN_ERR, &pdev->dev, "out of memory\n");
                rc = -ENOMEM;
                goto err_out_regions;
        }
index 54273e0..bb84ba0 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/dma-mapping.h>
+#include <linux/device.h>
 #include "scsi.h"
 #include <scsi/scsi_host.h>
 #include <linux/libata.h>
@@ -295,7 +296,7 @@ static int __devinit vsc_sata_init_one (struct pci_dev *pdev, const struct pci_d
        int rc;
 
        if (!printed_version++)
-               printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+               dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n");
 
        rc = pci_enable_device(pdev);
        if (rc)
index 861e513..d86d5c2 100644 (file)
@@ -49,6 +49,7 @@ static int sg_version_num = 30533;    /* 2 digits for each component */
 #include <linux/seq_file.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/scatterlist.h>
 
 #include "scsi.h"
 #include <scsi/scsi_dbg.h>
@@ -1886,13 +1887,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
        int i;
 
        for (i=0; i < nr_pages; i++) {
-               if (dirtied && !PageReserved(sgl[i].page))
-                       SetPageDirty(sgl[i].page);
-               /* unlock_page(sgl[i].page); */
+               struct page *page = sgl[i].page;
+
+               /* XXX: just for debug. Remove when PageReserved is removed */
+               BUG_ON(PageReserved(page));
+               if (dirtied)
+                       SetPageDirty(page);
+               /* unlock_page(page); */
                /* FIXME: cache flush missing for rw==READ
                 * FIXME: call the correct reference counting function
                 */
-               page_cache_release(sgl[i].page);
+               page_cache_release(page);
        }
 
        return 0;
@@ -1992,9 +1997,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
                                if (!p)
                                        break;
                        }
-                       sclp->page = virt_to_page(p);
-                       sclp->offset = offset_in_page(p);
-                       sclp->length = ret_sz;
+                       sg_set_buf(sclp, p, ret_sz);
 
                        SCSI_LOG_TIMEOUT(5, printk("sg_build_build: k=%d, a=0x%p, len=%d\n",
                                          k, sg_scatg2virt(sclp), ret_sz));
index 5eb54d8..da97662 100644 (file)
@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
        int i;
 
        for (i=0; i < nr_pages; i++) {
-               if (dirtied && !PageReserved(sgl[i].page))
-                       SetPageDirty(sgl[i].page);
+               struct page *page = sgl[i].page;
+
+               /* XXX: just for debug. Remove when PageReserved is removed */
+               BUG_ON(PageReserved(page));
+               if (dirtied)
+                       SetPageDirty(page);
                /* FIXME: cache flush missing for rw==READ
                 * FIXME: call the correct reference counting function
                 */
-               page_cache_release(sgl[i].page);
+               page_cache_release(page);
        }
 
        return 0;
index 90a9625..2997f55 100644 (file)
@@ -9,7 +9,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 #include <linux/usb.h>
 
@@ -381,7 +381,6 @@ alloc_sglist (int nents, int max, int vary)
        sg = kmalloc (nents * sizeof *sg, SLAB_KERNEL);
        if (!sg)
                return NULL;
-       memset (sg, 0, nents * sizeof *sg);
 
        for (i = 0; i < nents; i++) {
                char            *buf;
@@ -394,9 +393,7 @@ alloc_sglist (int nents, int max, int vary)
                memset (buf, 0, size);
 
                /* kmalloc pages are always physically contiguous! */
-               sg [i].page = virt_to_page (buf);
-               sg [i].offset = offset_in_page (buf);
-               sg [i].length = size;
+               sg_init_one(&sg[i], buf, size);
 
                if (vary) {
                        size += vary;
index 0d57698..4975c9c 100644 (file)
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
                cachefs_uncache_page(vnode->cache, page);
 #endif
 
-               pageio = (struct cachefs_page *) page->private;
-               page->private = 0;
+               pageio = (struct cachefs_page *) page_private(page);
+               set_page_private(page, 0);
                ClearPagePrivate(page);
 
                if (pageio)
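Note: afs (and fs/buffer.c below) stop poking page->private directly and go through page_private()/set_page_private(). These are presumably thin accessors; a toy model of the idea, using a local struct rather than the kernel's struct page:

/* Toy model of page_private()/set_page_private(): accessors over a field
 * instead of open-coded page->private assignments. */
#include <stdio.h>

struct toy_page {
	unsigned long private;
	int private_flag;	/* stands in for PG_private */
};

#define page_private(p)		((p)->private)
#define set_page_private(p, v)	((p)->private = (v))

int main(void)
{
	struct toy_page page = { .private = 0xdead, .private_flag = 1 };
	unsigned long pageio = page_private(&page);	/* detach */

	set_page_private(&page, 0);
	page.private_flag = 0;				/* ClearPagePrivate */
	printf("detached 0x%lx, now 0x%lx\n", pageio, page_private(&page));
	return 0;
}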
index dd9baab..7201182 100644 (file)
@@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        current->mm->free_area_cache = current->mm->mmap_base;
        current->mm->cached_hole_size = 0;
 
-       set_mm_counter(current->mm, rss, 0);
        current->mm->mmap = NULL;
        compute_creds(bprm);
        current->flags &= ~PF_FORKNOEXEC;
index d4b1557..918ccc2 100644 (file)
@@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
        /* Do this so that we can load the interpreter, if need be.  We will
           change some of these later */
-       set_mm_counter(current->mm, rss, 0);
        current->mm->free_area_cache = current->mm->mmap_base;
        current->mm->cached_hole_size = 0;
        retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
index 134c9c0..dda87c4 100644 (file)
@@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
                                  &interp_params,
                                  &current->mm->start_stack,
                                  &current->mm->start_brk);
-#endif
-
-       /* do this so that we can load the interpreter, if need be
-        * - we will change some of these later
-        */
-       set_mm_counter(current->mm, rss, 0);
 
-#ifdef CONFIG_MMU
        retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack);
        if (retval < 0) {
                send_sig(SIGKILL, current, 0);
index 7974efa..9d66258 100644 (file)
@@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm,
                current->mm->start_brk = datapos + data_len + bss_len;
                current->mm->brk = (current->mm->start_brk + 3) & ~3;
                current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
-               set_mm_counter(current->mm, rss, 0);
        }
 
        if (flags & FLAT_FLAG_KTRACE)
index 227a268..00a91dc 100644 (file)
@@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        create_som_tables(bprm);
 
        current->mm->start_stack = bprm->p;
-       set_mm_counter(current->mm, rss, 0);
 
 #if 0
        printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
index b166798..2066e4c 100644 (file)
@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
        ClearPagePrivate(page);
-       page->private = 0;
+       set_page_private(page, 0);
        page_cache_release(page);
 }
 
index a719e15..8e71cdb 100644 (file)
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
                /* execve success */
                security_bprm_free(bprm);
                acct_update_integrals(current);
-               update_mem_hiwater(current);
                kfree(bprm);
                return retval;
        }
index 0d06097..3931e7f 100644 (file)
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
        up_read(&current->mm->mmap_sem);
 
        if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+               struct page *page = ZERO_PAGE(dio->curr_user_address);
                /*
                 * A memory fault, but the filesystem has some outstanding
                 * mapped blocks.  We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
                 */
                if (dio->page_errors == 0)
                        dio->page_errors = ret;
-               dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+               page_cache_get(page);
+               dio->pages[0] = page;
                dio->head = 0;
                dio->tail = 1;
                ret = 0;
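Note: the direct-io change takes a reference on ZERO_PAGE before publishing it in dio->pages[], because the completion path later drops one reference per slot regardless of where the page came from. A tiny sketch of that "get before publish, put per slot" discipline with a plain counter:

/* Minimal model of taking a reference before storing a shared object in a
 * table whose entries are unconditionally released later. */
#include <assert.h>
#include <stdio.h>

struct obj { int refs; };

static void get(struct obj *o) { o->refs++; }
static void put(struct obj *o) { assert(o->refs > 0); o->refs--; }

int main(void)
{
	static struct obj zero_page = { .refs = 1 };	/* long-lived singleton */
	struct obj *pages[1];

	get(&zero_page);		/* page_cache_get() analogue */
	pages[0] = &zero_page;		/* publish */

	put(pages[0]);			/* generic teardown puts every slot */
	printf("refs=%d\n", zero_page.refs);	/* back to 1, not 0 */
	return 0;
}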
index d2208f7..ba73797 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -309,40 +309,36 @@ void install_arg_page(struct vm_area_struct *vma,
        pud_t * pud;
        pmd_t * pmd;
        pte_t * pte;
+       spinlock_t *ptl;
 
        if (unlikely(anon_vma_prepare(vma)))
-               goto out_sig;
+               goto out;
 
        flush_dcache_page(page);
        pgd = pgd_offset(mm, address);
-
-       spin_lock(&mm->page_table_lock);
        pud = pud_alloc(mm, pgd, address);
        if (!pud)
                goto out;
        pmd = pmd_alloc(mm, pud, address);
        if (!pmd)
                goto out;
-       pte = pte_alloc_map(mm, pmd, address);
+       pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
        if (!pte)
                goto out;
        if (!pte_none(*pte)) {
-               pte_unmap(pte);
+               pte_unmap_unlock(pte, ptl);
                goto out;
        }
-       inc_mm_counter(mm, rss);
+       inc_mm_counter(mm, anon_rss);
        lru_cache_add_active(page);
        set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
                                        page, vma->vm_page_prot))));
        page_add_anon_rmap(page, vma, address);
-       pte_unmap(pte);
-       spin_unlock(&mm->page_table_lock);
+       pte_unmap_unlock(pte, ptl);
 
        /* no need for flush_tlb */
        return;
 out:
-       spin_unlock(&mm->page_table_lock);
-out_sig:
        __free_page(page);
        force_sig(SIGKILL, current);
 }
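Note: install_arg_page() now pairs pte_alloc_map_lock() with pte_unmap_unlock() instead of wrapping pte_alloc_map() in mm->page_table_lock. A hedged sketch of the locking shape a caller ends up with (error paths and the real PTE value are elided; this is not a drop-in kernel function):

/* Sketch of the pte_alloc_map_lock()/pte_unmap_unlock() pairing used above. */
static int set_one_pte(struct mm_struct *mm, pmd_t *pmd,
		       unsigned long address, pte_t pteval)
{
	spinlock_t *ptl;
	pte_t *pte = pte_alloc_map_lock(mm, pmd, address, &ptl);

	if (!pte)
		return -ENOMEM;
	if (pte_none(*pte))
		set_pte_at(mm, address, pte, pteval);
	pte_unmap_unlock(pte, ptl);	/* drops the per-table lock and kmap */
	return 0;
}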
@@ -1207,7 +1203,6 @@ int do_execve(char * filename,
                /* execve success */
                security_bprm_free(bprm);
                acct_update_integrals(current);
-               update_mem_hiwater(current);
                kfree(bprm);
                return retval;
        }
index 3a9b6d1..e026c80 100644 (file)
@@ -45,10 +45,58 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
 
 int sysctl_hugetlb_shm_group;
 
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+       int i;
+
+       for (i = 0; i < pagevec_count(pvec); ++i)
+               put_page(pvec->pages[i]);
+
+       pagevec_reinit(pvec);
+}
+
+/*
+ * huge_pages_needed tries to determine the number of new huge pages that
+ * will be required to fully populate this VMA.  This will be equal to
+ * the size of the VMA in huge pages minus the number of huge pages
+ * (covered by this VMA) that are found in the page cache.
+ *
+ * Result is in bytes to be compatible with is_hugepage_mem_enough()
+ */
+unsigned long
+huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
+{
+       int i;
+       struct pagevec pvec;
+       unsigned long start = vma->vm_start;
+       unsigned long end = vma->vm_end;
+       unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
+       pgoff_t next = vma->vm_pgoff;
+       pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+
+       pagevec_init(&pvec, 0);
+       while (next < endpg) {
+               if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+                       break;
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *page = pvec.pages[i];
+                       if (page->index > next)
+                               next = page->index;
+                       if (page->index >= endpg)
+                               break;
+                       next++;
+                       hugepages--;
+               }
+               huge_pagevec_release(&pvec);
+       }
+       return hugepages << HPAGE_SHIFT;
+}
+
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
+       unsigned long bytes;
        loff_t len, vma_len;
        int ret;
 
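Note: huge_pages_needed() returns bytes so the caller can feed the result straight to is_hugepage_mem_enough(). A quick standalone check of the arithmetic, assuming 2 MB huge pages (HPAGE_SHIFT = 21; the real value is architecture-dependent):

/* Assumes HPAGE_SHIFT = 21 purely for illustration. */
#include <stdio.h>

#define HPAGE_SHIFT	21UL

int main(void)
{
	unsigned long start = 0, end = 10UL << 20;	/* 10 MB VMA */
	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;	/* 5 */
	unsigned long cached = 2;	/* huge pages already in the page cache */

	/* same result the pagevec walk above converges on */
	printf("%lu bytes needed\n", (hugepages - cached) << HPAGE_SHIFT);
	return 0;
}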
@@ -67,6 +115,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
                return -EINVAL;
 
+       bytes = huge_pages_needed(mapping, vma);
+       if (!is_hugepage_mem_enough(bytes))
+               return -ENOMEM;
+
        vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
        down(&inode->i_sem);
@@ -79,10 +131,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
                goto out;
 
-       ret = hugetlb_prefault(mapping, vma);
-       if (ret)
-               goto out;
-
+       ret = 0;
+       hugetlb_prefault_arch_hook(vma->vm_mm);
        if (inode->i_size < len)
                inode->i_size = len;
 out:
@@ -92,7 +142,7 @@ out:
 }
 
 /*
- * Called under down_write(mmap_sem), page_table_lock is not held
+ * Called under down_write(mmap_sem).
  */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -171,16 +221,6 @@ static int hugetlbfs_commit_write(struct file *file,
        return -EINVAL;
 }
 
-static void huge_pagevec_release(struct pagevec *pvec)
-{
-       int i;
-
-       for (i = 0; i < pagevec_count(pvec); ++i)
-               put_page(pvec->pages[i]);
-
-       pagevec_reinit(pvec);
-}
-
 static void truncate_huge_page(struct page *page)
 {
        clear_page_dirty(page);
@@ -224,52 +264,35 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-       struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
-
-       hlist_del_init(&inode->i_hash);
-       list_del_init(&inode->i_list);
-       list_del_init(&inode->i_sb_list);
-       inode->i_state |= I_FREEING;
-       inodes_stat.nr_inodes--;
-       spin_unlock(&inode_lock);
-
        if (inode->i_data.nrpages)
                truncate_hugepages(&inode->i_data, 0);
-
-       security_inode_delete(inode);
-
-       if (sbinfo->free_inodes >= 0) {
-               spin_lock(&sbinfo->stat_lock);
-               sbinfo->free_inodes++;
-               spin_unlock(&sbinfo->stat_lock);
-       }
-
        clear_inode(inode);
-       destroy_inode(inode);
 }
 
 static void hugetlbfs_forget_inode(struct inode *inode)
 {
-       struct super_block *super_block = inode->i_sb;
-       struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block);
+       struct super_block *sb = inode->i_sb;
 
-       if (hlist_unhashed(&inode->i_hash))
-               goto out_truncate;
-
-       if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
-               list_del(&inode->i_list);
-               list_add(&inode->i_list, &inode_unused);
-       }
-       inodes_stat.nr_unused++;
-       if (!super_block || (super_block->s_flags & MS_ACTIVE)) {
+       if (!hlist_unhashed(&inode->i_hash)) {
+               if (!(inode->i_state & (I_DIRTY|I_LOCK)))
+                       list_move(&inode->i_list, &inode_unused);
+               inodes_stat.nr_unused++;
+               if (!sb || (sb->s_flags & MS_ACTIVE)) {
+                       spin_unlock(&inode_lock);
+                       return;
+               }
+               inode->i_state |= I_WILL_FREE;
                spin_unlock(&inode_lock);
-               return;
+               /*
+                * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
+                * in our backing_dev_info.
+                */
+               write_inode_now(inode, 1);
+               spin_lock(&inode_lock);
+               inode->i_state &= ~I_WILL_FREE;
+               inodes_stat.nr_unused--;
+               hlist_del_init(&inode->i_hash);
        }
-
-       /* write_inode_now() ? */
-       inodes_stat.nr_unused--;
-       hlist_del_init(&inode->i_hash);
-out_truncate:
        list_del_init(&inode->i_list);
        list_del_init(&inode->i_sb_list);
        inode->i_state |= I_FREEING;
@@ -277,13 +300,6 @@ out_truncate:
        spin_unlock(&inode_lock);
        if (inode->i_data.nrpages)
                truncate_hugepages(&inode->i_data, 0);
-
-       if (sbinfo->free_inodes >= 0) {
-               spin_lock(&sbinfo->stat_lock);
-               sbinfo->free_inodes++;
-               spin_unlock(&sbinfo->stat_lock);
-       }
-
        clear_inode(inode);
        destroy_inode(inode);
 }
@@ -291,7 +307,7 @@ out_truncate:
 static void hugetlbfs_drop_inode(struct inode *inode)
 {
        if (!inode->i_nlink)
-               hugetlbfs_delete_inode(inode);
+               generic_delete_inode(inode);
        else
                hugetlbfs_forget_inode(inode);
 }
@@ -308,7 +324,6 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 
        vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
                unsigned long h_vm_pgoff;
-               unsigned long v_length;
                unsigned long v_offset;
 
                h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
@@ -319,11 +334,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
                if (h_vm_pgoff >= h_pgoff)
                        v_offset = 0;
 
-               v_length = vma->vm_end - vma->vm_start;
-
-               zap_hugepage_range(vma,
-                               vma->vm_start + v_offset,
-                               v_length - v_offset);
+               unmap_hugepage_range(vma,
+                               vma->vm_start + v_offset, vma->vm_end);
        }
 }
 
@@ -379,17 +391,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
                                        gid_t gid, int mode, dev_t dev)
 {
        struct inode *inode;
-       struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
-
-       if (sbinfo->free_inodes >= 0) {
-               spin_lock(&sbinfo->stat_lock);
-               if (!sbinfo->free_inodes) {
-                       spin_unlock(&sbinfo->stat_lock);
-                       return NULL;
-               }
-               sbinfo->free_inodes--;
-               spin_unlock(&sbinfo->stat_lock);
-       }
 
        inode = new_inode(sb);
        if (inode) {
@@ -531,29 +532,51 @@ static void hugetlbfs_put_super(struct super_block *sb)
        }
 }
 
+static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+       if (sbinfo->free_inodes >= 0) {
+               spin_lock(&sbinfo->stat_lock);
+               if (unlikely(!sbinfo->free_inodes)) {
+                       spin_unlock(&sbinfo->stat_lock);
+                       return 0;
+               }
+               sbinfo->free_inodes--;
+               spin_unlock(&sbinfo->stat_lock);
+       }
+
+       return 1;
+}
+
+static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+       if (sbinfo->free_inodes >= 0) {
+               spin_lock(&sbinfo->stat_lock);
+               sbinfo->free_inodes++;
+               spin_unlock(&sbinfo->stat_lock);
+       }
+}
+
+
 static kmem_cache_t *hugetlbfs_inode_cachep;
 
 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 {
+       struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
        struct hugetlbfs_inode_info *p;
 
+       if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
+               return NULL;
        p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
-       if (!p)
+       if (unlikely(!p)) {
+               hugetlbfs_inc_free_inodes(sbinfo);
                return NULL;
+       }
        return &p->vfs_inode;
 }
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
-{
-       struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
-
-       if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-           SLAB_CTOR_CONSTRUCTOR)
-               inode_init_once(&ei->vfs_inode);
-}
-
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
+       hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
        mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
@@ -565,6 +588,16 @@ static struct address_space_operations hugetlbfs_aops = {
        .set_page_dirty = hugetlbfs_set_page_dirty,
 };
 
+
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+       struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+
+       if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+           SLAB_CTOR_CONSTRUCTOR)
+               inode_init_once(&ei->vfs_inode);
+}
+
 struct file_operations hugetlbfs_file_operations = {
        .mmap                   = hugetlbfs_file_mmap,
        .fsync                  = simple_sync_file,
@@ -592,6 +625,7 @@ static struct super_operations hugetlbfs_ops = {
        .alloc_inode    = hugetlbfs_alloc_inode,
        .destroy_inode  = hugetlbfs_destroy_inode,
        .statfs         = hugetlbfs_statfs,
+       .delete_inode   = hugetlbfs_delete_inode,
        .drop_inode     = hugetlbfs_drop_inode,
        .put_super      = hugetlbfs_put_super,
 };
index 26091a5..8a53981 100644 (file)
@@ -86,7 +86,7 @@ struct meta_anchor {
        atomic_t io_count;
        struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
                if (!a)
                        return -ENOMEM;
                memset(a, 0, sizeof(struct meta_anchor));
-               page->private = (unsigned long)a;
+               set_page_private(page, (unsigned long)a);
                SetPagePrivate(page);
                kmap(page);
        }
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
        a->mp[index] = NULL;
        if (--a->mp_count == 0) {
                kfree(a);
-               page->private = 0;
+               set_page_private(page, 0);
                ClearPagePrivate(page);
                kunmap(page);
        }
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-       return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+       return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
        if (mp) {
-               page->private = (unsigned long)mp;
+               set_page_private(page, (unsigned long)mp);
                SetPagePrivate(page);
                kmap(page);
        }
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-       page->private = 0;
+       set_page_private(page, 0);
        ClearPagePrivate(page);
        kunmap(page);
 }
index f2781ca..fc0f12b 100644 (file)
@@ -1274,14 +1274,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        }
 
        if ((fattr->valid & NFS_ATTR_FATTR) == 0) {
-               spin_unlock(&inode->i_lock);
                return 0;
        }
 
        /* Has the inode gone and changed behind our back? */
        if (nfsi->fileid != fattr->fileid
                        || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
-               spin_unlock(&inode->i_lock);
                return -EIO;
        }
 
index d84eeca..3e1239e 100644 (file)
@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
                jiffies_to_clock_t(it_real_value),
                start_time,
                vsize,
-               mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */
+               mm ? get_mm_rss(mm) : 0,
                rsslim,
                mm ? mm->start_code : 0,
                mm ? mm->end_code : 0,
index c7ef3e4..d2fa420 100644 (file)
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
        unsigned long data, text, lib;
+       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+
+       /*
+        * Note: to minimize their overhead, mm maintains hiwater_vm and
+        * hiwater_rss only when about to *lower* total_vm or rss.  Any
+        * collector of these hiwater stats must therefore get total_vm
+        * and rss too, which will usually be the higher.  Barriers? not
+        * worth the effort, such snapshots can always be inconsistent.
+        */
+       hiwater_vm = total_vm = mm->total_vm;
+       if (hiwater_vm < mm->hiwater_vm)
+               hiwater_vm = mm->hiwater_vm;
+       hiwater_rss = total_rss = get_mm_rss(mm);
+       if (hiwater_rss < mm->hiwater_rss)
+               hiwater_rss = mm->hiwater_rss;
 
        data = mm->total_vm - mm->shared_vm - mm->stack_vm;
        text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
        lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
        buffer += sprintf(buffer,
+               "VmPeak:\t%8lu kB\n"
                "VmSize:\t%8lu kB\n"
                "VmLck:\t%8lu kB\n"
+               "VmHWM:\t%8lu kB\n"
                "VmRSS:\t%8lu kB\n"
                "VmData:\t%8lu kB\n"
                "VmStk:\t%8lu kB\n"
                "VmExe:\t%8lu kB\n"
                "VmLib:\t%8lu kB\n"
                "VmPTE:\t%8lu kB\n",
-               (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+               hiwater_vm << (PAGE_SHIFT-10),
+               (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
                mm->locked_vm << (PAGE_SHIFT-10),
-               get_mm_counter(mm, rss) << (PAGE_SHIFT-10),
+               hiwater_rss << (PAGE_SHIFT-10),
+               total_rss << (PAGE_SHIFT-10),
                data << (PAGE_SHIFT-10),
                mm->stack_vm << (PAGE_SHIFT-10), text, lib,
                (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
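A minimal userspace sketch (assumed reader code, not from this patch) showing the new VmPeak and VmHWM lines as they appear next to VmSize and VmRSS in /proc/<pid>/status:

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/self/status", "r");
        char line[128];

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                /* VmPeak:, VmSize:, VmHWM:, VmRSS:, ... as printed above */
                if (!strncmp(line, "Vm", 2))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}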
@@ -44,13 +63,11 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
               int *data, int *resident)
 {
-       int rss = get_mm_counter(mm, rss);
-
-       *shared = rss - get_mm_counter(mm, anon_rss);
+       *shared = get_mm_counter(mm, file_rss);
        *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
                                                                >> PAGE_SHIFT;
        *data = mm->total_vm - mm->shared_vm;
-       *resident = rss;
+       *resident = *shared + get_mm_counter(mm, anon_rss);
        return mm->total_vm;
 }
 
@@ -186,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                struct mem_size_stats *mss)
 {
        pte_t *pte, ptent;
+       spinlock_t *ptl;
        unsigned long pfn;
        struct page *page;
 
-       pte = pte_offset_map(pmd, addr);
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                ptent = *pte;
-               if (pte_none(ptent) || !pte_present(ptent))
+               if (!pte_present(ptent))
                        continue;
 
                mss->resident += PAGE_SIZE;
@@ -213,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                mss->private_clean += PAGE_SIZE;
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
-       pte_unmap(pte - 1);
-       cond_resched_lock(&vma->vm_mm->page_table_lock);
+       pte_unmap_unlock(pte - 1, ptl);
+       cond_resched();
 }
 
 static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -268,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma,
 static int show_smap(struct seq_file *m, void *v)
 {
        struct vm_area_struct *vma = v;
-       struct mm_struct *mm = vma->vm_mm;
        struct mem_size_stats mss;
 
        memset(&mss, 0, sizeof mss);
-
-       if (mm) {
-               spin_lock(&mm->page_table_lock);
+       if (vma->vm_mm)
                smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
-               spin_unlock(&mm->page_table_lock);
-       }
-
        return show_map_internal(m, v, &mss);
 }
 
@@ -407,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
        for_each_node(i)
                md->node[i] =0;
 
-       spin_lock(&mm->page_table_lock);
        for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
                page = follow_page(mm, vaddr, 0);
                if (page) {
@@ -422,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
                                md->anon++;
                        md->node[page_to_nid(page)]++;
                }
+               cond_resched();
        }
-       spin_unlock(&mm->page_table_lock);
        return md;
 }
 
@@ -469,7 +480,7 @@ static int show_numa_map(struct seq_file *m, void *v)
                seq_printf(m, " interleave={");
                first = 1;
                for_each_node(n) {
-                       if (test_bit(n, pol->v.nodes)) {
+                       if (node_isset(n, pol->v.nodes)) {
                                if (!first)
                                        seq_putc(m,',');
                                else
index ba4767c..4cd46ab 100644 (file)
@@ -181,8 +181,9 @@ set_page_region(
        size_t          offset,
        size_t          length)
 {
-       page->private |= page_region_mask(offset, length);
-       if (page->private == ~0UL)
+       set_page_private(page,
+               page_private(page) | page_region_mask(offset, length));
+       if (page_private(page) == ~0UL)
                SetPageUptodate(page);
 }
 
@@ -194,7 +195,7 @@ test_page_region(
 {
        unsigned long   mask = page_region_mask(offset, length);
 
-       return (mask && (page->private & mask) == mask);
+       return (mask && (page_private(page) & mask) == mask);
 }
 
 /*
index 229c83f..681ff58 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef __BARRIER_H
 #define __BARRIER_H
 
+#include <asm/compiler.h>
+
 #define mb() \
 __asm__ __volatile__("mb": : :"memory")
 
index 8e058a6..fafdd4f 100644 (file)
@@ -262,5 +262,10 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
 #endif
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ALPHA_RWSEM_H */
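One plausible use for the new helper (an assumption; no caller is added in this series) is a cheap debugging assertion that a semaphore really is held, for example:

        /* sketch: a walker that requires mmap_sem held for read or write */
        BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

The check is advisory only: a non-zero count says the rwsem is held by someone, not necessarily by the current task.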
index 9bb325c..f49bfb7 100644 (file)
  */
 struct mmu_gather {
        struct mm_struct        *mm;
-       unsigned int            freed;
        unsigned int            fullmm;
-
-       unsigned int            flushes;
-       unsigned int            avoided_flushes;
 };
 
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -39,11 +35,9 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-       int cpu = smp_processor_id();
-       struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
+       struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
        tlb->mm = mm;
-       tlb->freed = 0;
        tlb->fullmm = full_mm_flush;
 
        return tlb;
@@ -52,24 +46,13 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-       struct mm_struct *mm = tlb->mm;
-       unsigned long freed = tlb->freed;
-       int rss = get_mm_counter(mm, rss);
-
-       if (rss < freed)
-               freed = rss;
-       add_mm_counter(mm, rss, -freed);
-
        if (tlb->fullmm)
-               flush_tlb_mm(mm);
+               flush_tlb_mm(tlb->mm);
 
        /* keep the page table cache within bounds */
        check_pgt_cache();
-}
 
-static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb)
-{
-       return tlb->fullmm;
+       put_cpu_var(mmu_gathers);
 }
 
 #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0)
index 1316352..08ddd85 100644 (file)
  */
 struct mmu_gather {
         struct mm_struct        *mm;
-        unsigned int            freed;
-       unsigned int            fullmm;
-
-        unsigned int            flushes;
-        unsigned int            avoided_flushes;
+        unsigned int            need_flush;
+        unsigned int            fullmm;
 };
 
-extern struct mmu_gather mmu_gathers[NR_CPUS];
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-        int cpu = smp_processor_id();
-        struct mmu_gather *tlb = &mmu_gathers[cpu];
+        struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
         tlb->mm = mm;
-        tlb->freed = 0;
-       tlb->fullmm = full_mm_flush;
+        tlb->need_flush = 0;
+        tlb->fullmm = full_mm_flush;
 
         return tlb;
 }
@@ -35,30 +31,13 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-        struct mm_struct *mm = tlb->mm;
-        unsigned long freed = tlb->freed;
-        int rss = get_mm_counter(mm, rss);
-
-        if (rss < freed)
-                freed = rss;
-        add_mm_counter(mm, rss, -freed);
-
-        if (freed) {
-                flush_tlb_mm(mm);
-                tlb->flushes++;
-        } else {
-                tlb->avoided_flushes++;
-        }
+        if (tlb->need_flush)
+                flush_tlb_mm(tlb->mm);
 
         /* keep the page table cache within bounds */
         check_pgt_cache();
-}
-
 
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-     return tlb->fullmm;
+        put_cpu_var(mmu_gathers);
 }
 
 #define tlb_remove_tlb_entry(tlb,ptep,address)  do { } while (0)
@@ -71,7 +50,13 @@ tlb_is_full_mm(struct mmu_gather *tlb)
         } while (0)
 #define tlb_end_vma(tlb,vma)                    do { } while (0)
 
-#define tlb_remove_page(tlb,page)       free_page_and_swap_cache(page)
+static inline void
+tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+        tlb->need_flush = 1;
+        free_page_and_swap_cache(page);
+}
+
 #define pte_free_tlb(tlb,ptep)          pte_free(ptep)
 #define pmd_free_tlb(tlb,pmdp)          pmd_free(pmdp)
 
index c20ec25..68c6fea 100644 (file)
 
 #define pud_t                          pgd_t
 
-#define pmd_alloc(mm, pud, address)                    \
-({     pmd_t *ret;                                     \
-       if (pgd_none(*pud))                             \
-               ret = __pmd_alloc(mm, pud, address);    \
-       else                                            \
-               ret = pmd_offset(pud, address);         \
-       ret;                                            \
-})
+#define pmd_alloc(mm, pud, address) \
+       ((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \
+               NULL: pmd_offset(pud, address))
 
 #define pud_alloc(mm, pgd, address)    (pgd)
 #define pud_offset(pgd, start)         (pgd)
index ff28c8b..7dca30a 100644 (file)
@@ -8,7 +8,7 @@
  *  - update the page tables
  *  - inform the TLB about the new one
  *
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock.
+ * We hold the mm semaphore for reading, and the pte lock.
  *
  * Note: the old pte is known to not be writable, so we don't need to
  * worry about dirty bits etc getting lost.
index 7d02983..cdd4145 100644 (file)
 #endif
 
 /* struct mmu_gather is an opaque type used by the mm code for passing around
- * any data needed by arch specific code for tlb_remove_page.  This structure
- * can be per-CPU or per-MM as the page table lock is held for the duration of
- * TLB shootdown.
+ * any data needed by arch specific code for tlb_remove_page.
  */
 struct mmu_gather {
        struct mm_struct        *mm;
        unsigned int            nr;     /* set to ~0U means fast mode */
        unsigned int            need_flush;/* Really unmapped some ptes? */
        unsigned int            fullmm; /* non-zero means full mm flush */
-       unsigned long           freed;
        struct page *           pages[FREE_PTE_NR];
 };
 
@@ -57,7 +54,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-       struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id());
+       struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
        tlb->mm = mm;
 
@@ -65,7 +62,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
        tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
 
        tlb->fullmm = full_mm_flush;
-       tlb->freed = 0;
 
        return tlb;
 }
@@ -85,28 +81,17 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 /* tlb_finish_mmu
  *     Called at the end of the shootdown operation to free up any resources
- *     that were required.  The page table lock is still held at this point.
+ *     that were required.
  */
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-       int freed = tlb->freed;
-       struct mm_struct *mm = tlb->mm;
-       int rss = get_mm_counter(mm, rss);
-
-       if (rss < freed)
-               freed = rss;
-       add_mm_counter(mm, rss, -freed);
        tlb_flush_mmu(tlb, start, end);
 
        /* keep the page table cache within bounds */
        check_pgt_cache();
-}
 
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-       return tlb->fullmm;
+       put_cpu_var(mmu_gathers);
 }
 
 /* tlb_remove_page
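For orientation, a sketch of the caller sequence around the gather after this change (the surrounding variables are assumed; unmap_vmas() matches the prototype updated later in this series). Because the gather now comes from get_cpu_var(), preemption stays disabled from tlb_gather_mmu() until the matching put_cpu_var() in tlb_finish_mmu():

        struct mmu_gather *tlb;
        unsigned long nr_accounted = 0;

        tlb = tlb_gather_mmu(mm, 0);            /* implies get_cpu_var() */
        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
        tlb_finish_mmu(tlb, start, end);        /* implies put_cpu_var() */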
index 348fe3a..620a906 100644 (file)
@@ -88,12 +88,6 @@ static inline int pfn_to_nid(unsigned long pfn)
        __pgdat->node_start_pfn + __pgdat->node_spanned_pages;          \
 })
 
-#define local_mapnr(kvaddr)                                            \
-({                                                                     \
-       unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;               \
-       (__pfn - node_start_pfn(pfn_to_nid(__pfn)));                    \
-})
-
 /* XXX: FIXME -- wli */
 #define kern_addr_valid(kaddr) (0)
 
index d101ac4..0e3ec80 100644 (file)
@@ -203,7 +203,8 @@ extern unsigned long pg0[];
 #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
 #define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 
-#define pmd_none(x)    (!pmd_val(x))
+/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
+#define pmd_none(x)    (!(unsigned long)pmd_val(x))
 #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
 #define        pmd_bad(x)      ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
index 7625a67..be4ab85 100644 (file)
@@ -284,5 +284,10 @@ LOCK_PREFIX        "xadd %0,(%2)"
        return tmp+delta;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _I386_RWSEM_H */
index e18b5ab..1327c91 100644 (file)
@@ -186,4 +186,9 @@ __downgrade_write (struct rw_semaphore *sem)
 #define rwsem_atomic_add(delta, sem)   atomic64_add(delta, (atomic64_t *)(&(sem)->count))
 #define rwsem_atomic_update(delta, sem)        atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* _ASM_IA64_RWSEM_H */
index 3a9a6d1..834370b 100644 (file)
@@ -60,7 +60,6 @@ struct mmu_gather {
        unsigned int            nr;             /* == ~0U => fast mode */
        unsigned char           fullmm;         /* non-zero means full mm flush */
        unsigned char           need_flush;     /* really unmapped some PTEs? */
-       unsigned long           freed;          /* number of pages freed */
        unsigned long           start_addr;
        unsigned long           end_addr;
        struct page             *pages[FREE_PTE_NR];
@@ -129,7 +128,7 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e
 static inline struct mmu_gather *
 tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 {
-       struct mmu_gather *tlb = &__get_cpu_var(mmu_gathers);
+       struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
        tlb->mm = mm;
        /*
@@ -147,25 +146,17 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
         */
        tlb->nr = (num_online_cpus() == 1) ? ~0U : 0;
        tlb->fullmm = full_mm_flush;
-       tlb->freed = 0;
        tlb->start_addr = ~0UL;
        return tlb;
 }
 
 /*
  * Called at the end of the shootdown operation to free up any resources that were
- * collected.  The page table lock is still held at this point.
+ * collected.
  */
 static inline void
 tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-       unsigned long freed = tlb->freed;
-       struct mm_struct *mm = tlb->mm;
-       unsigned long rss = get_mm_counter(mm, rss);
-
-       if (rss < freed)
-               freed = rss;
-       add_mm_counter(mm, rss, -freed);
        /*
         * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
         * tlb->end_addr.
@@ -174,12 +165,8 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
        /* keep the page table cache within bounds */
        check_pgt_cache();
-}
 
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-     return tlb->fullmm;
+       put_cpu_var(mmu_gathers);
 }
 
 /*
index d58878e..adc7970 100644 (file)
@@ -21,12 +21,6 @@ extern struct pglist_data *node_data[];
        __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1;      \
 })
 
-#define local_mapnr(kvaddr)                                            \
-({                                                                     \
-       unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;               \
-       (__pfn - node_start_pfn(pfn_to_nid(__pfn)));                    \
-})
-
 #define pfn_to_page(pfn)                                               \
 ({                                                                     \
        unsigned long __pfn = pfn;                                      \
index aa592d8..1bc3c83 100644 (file)
@@ -100,30 +100,34 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
 
 /* Simple function to work out if we have an existing address translation
  * for a user space vma. */
-static inline pte_t *__translation_exists(struct mm_struct *mm,
-                                         unsigned long addr)
+static inline int translation_exists(struct vm_area_struct *vma,
+                               unsigned long addr, unsigned long pfn)
 {
-       pgd_t *pgd = pgd_offset(mm, addr);
+       pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
        pmd_t *pmd;
-       pte_t *pte;
+       pte_t pte;
 
        if(pgd_none(*pgd))
-               return NULL;
+               return 0;
 
        pmd = pmd_offset(pgd, addr);
        if(pmd_none(*pmd) || pmd_bad(*pmd))
-               return NULL;
+               return 0;
 
-       pte = pte_offset_map(pmd, addr);
+       /* We cannot take the pte lock here: flush_cache_page is usually
+        * called with pte lock already held.  Whereas flush_dcache_page
+        * takes flush_dcache_mmap_lock, which is lower in the hierarchy:
+        * the vma itself is secure, but the pte might come or go racily.
+        */
+       pte = *pte_offset_map(pmd, addr);
+       /* But pte_unmap() does nothing on this architecture */
 
-       /* The PA flush mappings show up as pte_none, but they're
-        * valid none the less */
-       if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0))
-               return NULL;
-       return pte;
-}
-#define        translation_exists(vma, addr)   __translation_exists((vma)->vm_mm, addr)
+       /* Filter out coincidental file entries and swap entries */
+       if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
+               return 0;
 
+       return pte_pfn(pte) == pfn;
+}
 
 /* Private function to flush a page from the cache of a non-current
  * process.  cr25 contains the Page Directory of the current user
@@ -175,9 +179,8 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long
 {
        BUG_ON(!vma->vm_mm->context);
 
-       if(likely(translation_exists(vma, vmaddr)))
+       if (likely(translation_exists(vma, vmaddr, pfn)))
                __flush_cache_page(vma, vmaddr);
 
 }
 #endif
-
index 595d3dc..ae039f4 100644 (file)
@@ -27,12 +27,6 @@ extern struct node_map_data node_data[];
 })
 #define node_localnr(pfn, nid)         ((pfn) - node_start_pfn(nid))
 
-#define local_mapnr(kvaddr)                                            \
-({                                                                     \
-       unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;               \
-       (__pfn - node_start_pfn(pfn_to_nid(__pfn)));                    \
-})
-
 #define pfn_to_page(pfn)                                               \
 ({                                                                     \
        unsigned long __pfn = (pfn);                                    \
index 84af4ab..e97aa8d 100644 (file)
@@ -88,7 +88,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
        if (npages >= 512)  /* 2MB of space: arbitrary, should be tuned */
                flush_tlb_all();
        else {
-
+               preempt_disable();
                mtsp(vma->vm_mm->context,1);
                purge_tlb_start();
                if (split_tlb) {
@@ -102,6 +102,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
                                pdtlb(start);
                                start += PAGE_SIZE;
                        }
+               preempt_enable();
                }
                purge_tlb_end();
        }
index 3e738f4..3501ea7 100644 (file)
@@ -168,5 +168,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
        return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _PPC_RWSEM_XADD_H */
index ed473f4..80a708e 100644 (file)
@@ -67,9 +67,6 @@ static inline int pa_to_nid(unsigned long pa)
 #define node_start_pfn(nid)    (NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)      (NODE_DATA(nid)->node_end_pfn)
 
-#define local_mapnr(kvaddr) \
-       ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) 
-
 #ifdef CONFIG_DISCONTIGMEM
 
 /*
index c83679c..2eb1778 100644 (file)
@@ -478,10 +478,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)  (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
+#define pte_ERROR(e) \
+       printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
        printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
 #define pud_ERROR(e) \
-       printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e))
+       printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
        printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
index bd5c2f0..7a647fa 100644 (file)
@@ -163,5 +163,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
        return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _PPC_RWSEM_XADD_H */
index 8c0cebb..0422a08 100644 (file)
@@ -351,5 +351,10 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
        return new;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _S390_RWSEM_H */
index 1be4337..0262d3d 100644 (file)
@@ -166,5 +166,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
        return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_SH_RWSEM_H */
index 4568ee4..cef5e82 100644 (file)
@@ -56,6 +56,11 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
        atomic_add(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _SPARC64_RWSEM_H */
index 9baf57d..66138d9 100644 (file)
@@ -25,9 +25,8 @@ struct mmu_gather {
        struct mm_struct *mm;
        unsigned int pages_nr;
        unsigned int need_flush;
-       unsigned int tlb_frozen;
+       unsigned int fullmm;
        unsigned int tlb_nr;
-       unsigned long freed;
        unsigned long vaddrs[TLB_BATCH_NR];
        struct page *pages[FREE_PTE_NR];
 };
@@ -44,14 +43,13 @@ extern void flush_tlb_pending(void);
 
 static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-       struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
+       struct mmu_gather *mp = &get_cpu_var(mmu_gathers);
 
        BUG_ON(mp->tlb_nr);
 
        mp->mm = mm;
        mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U;
-       mp->tlb_frozen = full_mm_flush;
-       mp->freed = 0;
+       mp->fullmm = full_mm_flush;
 
        return mp;
 }
@@ -78,30 +76,19 @@ extern void smp_flush_tlb_mm(struct mm_struct *mm);
 
 static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end)
 {
-       unsigned long freed = mp->freed;
-       struct mm_struct *mm = mp->mm;
-       unsigned long rss = get_mm_counter(mm, rss);
-
-       if (rss < freed)
-               freed = rss;
-       add_mm_counter(mm, rss, -freed);
-
        tlb_flush_mmu(mp);
 
-       if (mp->tlb_frozen) {
-               if (CTX_VALID(mm->context))
-                       do_flush_tlb_mm(mm);
-               mp->tlb_frozen = 0;
+       if (mp->fullmm) {
+               if (CTX_VALID(mp->mm->context))
+                       do_flush_tlb_mm(mp->mm);
+               mp->fullmm = 0;
        } else
                flush_tlb_pending();
 
        /* keep the page table cache within bounds */
        check_pgt_cache();
-}
 
-static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp)
-{
-       return mp->tlb_frozen;
+       put_cpu_var(mmu_gathers);
 }
 
 static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page)
index 616d02b..ac64eb9 100644 (file)
@@ -138,7 +138,7 @@ extern unsigned long pg0[1024];
 
 #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE))
 
-#define pmd_none(x)    (!(pmd_val(x) & ~_PAGE_NEWPAGE))
+#define pmd_none(x)    (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE))
 #define        pmd_bad(x)      ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
 #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)  do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
index c002175..46077e9 100644 (file)
@@ -274,5 +274,10 @@ LOCK_PREFIX        "xaddl %0,(%2)"
        return tmp+delta;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _X8664_RWSEM_H */
index 88af42f..c937d6e 100644 (file)
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)                                     \
        ({                                                      \
-               BUG_ON(!PagePrivate(page));             \
-               ((struct buffer_head *)(page)->private);        \
+               BUG_ON(!PagePrivate(page));                     \
+               ((struct buffer_head *)page_private(page));     \
        })
 #define page_has_buffers(page) PagePrivate(page)
 
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
        page_cache_get(page);
        SetPagePrivate(page);
-       page->private = (unsigned long)head;
+       set_page_private(page, (unsigned long)head);
 }
 
 static inline void get_bh(struct buffer_head *bh)
index d664330..0cea162 100644 (file)
@@ -16,7 +16,6 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
-void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
@@ -87,7 +86,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define follow_huge_addr(mm, addr, write)      ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)         ({ BUG(); 0; })
-#define zap_hugepage_range(vma, start, len)    BUG()
 #define unmap_hugepage_range(vma, start, end)  BUG()
 #define is_hugepage_mem_enough(size)           0
 #define hugetlb_report_meminfo(buf)            0
index 00a8a57..0ba3af7 100644 (file)
@@ -172,6 +172,13 @@ enum hsm_task_states {
        HSM_ST_ERR,
 };
 
+enum ata_completion_errors {
+       AC_ERR_OTHER            = (1 << 0),
+       AC_ERR_DEV              = (1 << 1),
+       AC_ERR_ATA_BUS          = (1 << 2),
+       AC_ERR_HOST_BUS         = (1 << 3),
+};
+
 /* forward declarations */
 struct scsi_device;
 struct ata_port_operations;
@@ -179,7 +186,7 @@ struct ata_port;
 struct ata_queued_cmd;
 
 /* typedefs */
-typedef int (*ata_qc_cb_t) (struct ata_queued_cmd *qc, u8 drv_stat);
+typedef int (*ata_qc_cb_t) (struct ata_queued_cmd *qc, unsigned int err_mask);
 
 struct ata_ioports {
        unsigned long           cmd_addr;
@@ -347,7 +354,6 @@ struct ata_port_operations {
        void (*exec_command)(struct ata_port *ap, const struct ata_taskfile *tf);
        u8   (*check_status)(struct ata_port *ap);
        u8   (*check_altstatus)(struct ata_port *ap);
-       u8   (*check_err)(struct ata_port *ap);
        void (*dev_select)(struct ata_port *ap, unsigned int device);
 
        void (*phy_reset) (struct ata_port *ap);
@@ -434,7 +440,6 @@ extern void ata_noop_dev_select (struct ata_port *ap, unsigned int device);
 extern void ata_std_dev_select (struct ata_port *ap, unsigned int device);
 extern u8 ata_check_status(struct ata_port *ap);
 extern u8 ata_altstatus(struct ata_port *ap);
-extern u8 ata_chk_err(struct ata_port *ap);
 extern void ata_exec_command(struct ata_port *ap, const struct ata_taskfile *tf);
 extern int ata_port_start (struct ata_port *ap);
 extern void ata_port_stop (struct ata_port *ap);
@@ -455,7 +460,7 @@ extern void ata_bmdma_start (struct ata_queued_cmd *qc);
 extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
 extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
-extern void ata_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat);
+extern void ata_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask);
 extern void ata_eng_timeout(struct ata_port *ap);
 extern void ata_scsi_simulate(u16 *id, struct scsi_cmnd *cmd,
                              void (*done)(struct scsi_cmnd *));
@@ -718,4 +723,21 @@ static inline int ata_try_flush_cache(const struct ata_device *dev)
               ata_id_has_flush_ext(dev->id);
 }
 
+static inline unsigned int ac_err_mask(u8 status)
+{
+       if (status & ATA_BUSY)
+               return AC_ERR_ATA_BUS;
+       if (status & (ATA_ERR | ATA_DF))
+               return AC_ERR_DEV;
+       return 0;
+}
+
+static inline unsigned int __ac_err_mask(u8 status)
+{
+       unsigned int mask = ac_err_mask(status);
+       if (mask == 0)
+               return AC_ERR_OTHER;
+       return mask;
+}
+
 #endif /* __LINUX_LIBATA_H__ */
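A hypothetical driver completion path under the new error-mask scheme (the function name is a placeholder; only ata_check_status(), ac_err_mask() and the new ata_qc_complete() signature come from this header): the raw taskfile status is folded into an err_mask instead of being passed through as drv_stat:

static void example_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
{
        u8 status = ata_check_status(ap);

        /* BSY still set => AC_ERR_ATA_BUS; ERR or DF => AC_ERR_DEV; else 0 */
        ata_qc_complete(qc, ac_err_mask(status));
}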
diff --git a/include/linux/memory.h b/include/linux/memory.h
new file mode 100644 (file)
index 0000000..0def328
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * include/linux/memory.h - generic memory definition
+ *
+ * This is mainly for topological representation. We define the
+ * basic "struct memory_block" here, which can be embedded in per-arch
+ * definitions or NUMA information.
+ *
+ * Basic handling of the devices is done in drivers/base/memory.c
+ * and system devices are handled in drivers/base/sys.c.
+ *
+ * Memory blocks are exported via sysfs in the class/memory/devices/
+ * directory.
+ *
+ */
+#ifndef _LINUX_MEMORY_H_
+#define _LINUX_MEMORY_H_
+
+#include <linux/sysdev.h>
+#include <linux/node.h>
+#include <linux/compiler.h>
+
+#include <asm/semaphore.h>
+
+struct memory_block {
+       unsigned long phys_index;
+       unsigned long state;
+       /*
+        * This serializes all state change requests.  It isn't
+        * held during creation because the control files are
+        * created long after the critical areas during
+        * initialization.
+        */
+       struct semaphore state_sem;
+       int phys_device;                /* to which fru does this belong? */
+       void *hw;                       /* optional pointer to fw/hw data */
+       int (*phys_callback)(struct memory_block *);
+       struct sys_device sysdev;
+};
+
+/* These states are exposed to userspace as text strings in sysfs */
+#define        MEM_ONLINE              (1<<0) /* exposed to userspace */
+#define        MEM_GOING_OFFLINE       (1<<1) /* exposed to userspace */
+#define        MEM_OFFLINE             (1<<2) /* exposed to userspace */
+
+/*
+ * All of these states are currently kernel-internal for notifying
+ * kernel components and architectures.
+ *
+ * For MEM_MAPPING_INVALID, all notifier chains with priority >0
+ * are called before pfn_to_page() becomes invalid.  The priority=0
+ * entry is reserved for the function that actually makes
+ * pfn_to_page() stop working.  Any notifiers that want to be called
+ * after that should have priority <0.
+ */
+#define        MEM_MAPPING_INVALID     (1<<3)
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline int memory_dev_init(void)
+{
+       return 0;
+}
+static inline int register_memory_notifier(struct notifier_block *nb)
+{
+       return 0;
+}
+static inline void unregister_memory_notifier(struct notifier_block *nb)
+{
+}
+#else
+extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
+extern int register_new_memory(struct mem_section *);
+extern int unregister_memory_section(struct mem_section *);
+extern int memory_dev_init(void);
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+#define CONFIG_MEM_BLOCK_SIZE  (PAGES_PER_SECTION<<PAGE_SHIFT)
+
+extern int invalidate_phys_mapping(unsigned long, unsigned long);
+struct notifier_block;
+
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+extern struct sysdev_class memory_sysdev_class;
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#define hotplug_memory_notifier(fn, pri) {                     \
+       static struct notifier_block fn##_mem_nb =              \
+               { .notifier_call = fn, .priority = pri };       \
+       register_memory_notifier(&fn##_mem_nb);                 \
+}
+
+#endif /* _LINUX_MEMORY_H_ */
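A sketch of how a component might hook the new notifier chain (the callback name and its reactions are assumptions; the macro, the MEM_* states and the register/unregister prototypes come from this header, and <linux/notifier.h> is assumed to be included):

static int example_mem_event(struct notifier_block *nb,
                             unsigned long action, void *data)
{
        switch (action) {
        case MEM_ONLINE:
                /* newly added pages are usable: grow caches, rebalance, ... */
                break;
        case MEM_GOING_OFFLINE:
        case MEM_OFFLINE:
                /* stop referencing the affected range */
                break;
        }
        return NOTIFY_OK;
}

/* registration, typically from an __init function: */
hotplug_memory_notifier(example_mem_event, 0);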
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
new file mode 100644 (file)
index 0000000..01f03bc
--- /dev/null
@@ -0,0 +1,104 @@
+#ifndef __LINUX_MEMORY_HOTPLUG_H
+#define __LINUX_MEMORY_HOTPLUG_H
+
+#include <linux/mmzone.h>
+#include <linux/spinlock.h>
+#include <linux/mmzone.h>
+#include <linux/notifier.h>
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * pgdat resizing functions
+ */
+static inline
+void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
+{
+       spin_lock_irqsave(&pgdat->node_size_lock, *flags);
+}
+static inline
+void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
+{
+       spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
+}
+static inline
+void pgdat_resize_init(struct pglist_data *pgdat)
+{
+       spin_lock_init(&pgdat->node_size_lock);
+}
+/*
+ * Zone resizing functions
+ */
+static inline unsigned zone_span_seqbegin(struct zone *zone)
+{
+       return read_seqbegin(&zone->span_seqlock);
+}
+static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
+{
+       return read_seqretry(&zone->span_seqlock, iv);
+}
+static inline void zone_span_writelock(struct zone *zone)
+{
+       write_seqlock(&zone->span_seqlock);
+}
+static inline void zone_span_writeunlock(struct zone *zone)
+{
+       write_sequnlock(&zone->span_seqlock);
+}
+static inline void zone_seqlock_init(struct zone *zone)
+{
+       seqlock_init(&zone->span_seqlock);
+}
+extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
+extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
+extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+/* need some defines for these for archs that don't support it */
+extern void online_page(struct page *page);
+/* VM interface that may be used by firmware interface */
+extern int add_memory(u64 start, u64 size);
+extern int remove_memory(u64 start, u64 size);
+extern int online_pages(unsigned long, unsigned long);
+
+/* reasonably generic interface to expand the physical pages in a zone  */
+extern int __add_pages(struct zone *zone, unsigned long start_pfn,
+       unsigned long nr_pages);
+#else /* ! CONFIG_MEMORY_HOTPLUG */
+/*
+ * Stub functions for when hotplug is off
+ */
+static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
+static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
+static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
+
+static inline unsigned zone_span_seqbegin(struct zone *zone)
+{
+       return 0;
+}
+static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
+{
+       return 0;
+}
+static inline void zone_span_writelock(struct zone *zone) {}
+static inline void zone_span_writeunlock(struct zone *zone) {}
+static inline void zone_seqlock_init(struct zone *zone) {}
+
+static inline int mhp_notimplemented(const char *func)
+{
+       printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
+       dump_stack();
+       return -ENOSYS;
+}
+
+static inline int __add_pages(struct zone *zone, unsigned long start_pfn,
+       unsigned long nr_pages)
+{
+       return mhp_notimplemented(__FUNCTION__);
+}
+#endif /* ! CONFIG_MEMORY_HOTPLUG */
+static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
+       unsigned long nr_pages)
+{
+       printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__);
+       dump_stack();
+       return -ENOSYS;
+}
+#endif /* __LINUX_MEMORY_HOTPLUG_H */
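A read-side sketch for the zone span seqlock (assumed caller; zone_start_pfn and spanned_pages are the existing struct zone fields): readers snapshot the span and retry if a concurrent hot-add resized the zone under them:

        unsigned long start_pfn, nr_pages;
        unsigned seq;

        do {
                seq = zone_span_seqbegin(zone);
                start_pfn = zone->zone_start_pfn;
                nr_pages = zone->spanned_pages;
        } while (zone_span_seqretry(zone, seq));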
index 58385ee..7af8cb8 100644 (file)
 
 #include <linux/config.h>
 #include <linux/mmzone.h>
-#include <linux/bitmap.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
+#include <linux/nodemask.h>
 
 struct vm_area_struct;
 
@@ -47,8 +47,7 @@ struct vm_area_struct;
  * Locking policy for interleave:
  * In process context there is no locking because only the process accesses
  * its own state. All vma manipulation is somewhat protected by a down_read on
- * mmap_sem. For allocating in the interleave policy the page_table_lock
- * must be also aquired to protect il_next.
+ * mmap_sem.
  *
  * Freeing policy:
  * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
@@ -63,7 +62,7 @@ struct mempolicy {
        union {
                struct zonelist  *zonelist;     /* bind */
                short            preferred_node; /* preferred */
-               DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */
+               nodemask_t       nodes;         /* interleave */
                /* undefined for default */
        } v;
 };
index e164957..5c1fb0a 100644 (file)
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY    0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND  0x00040000      /* Cannot expand with mremap() */
-#define VM_RESERVED    0x00080000      /* Don't unmap it from swap_out */
+#define VM_RESERVED    0x00080000      /* Pages managed in a special way */
 #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
 #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
 #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
@@ -226,13 +226,18 @@ struct page {
                                         * to show when page is mapped
                                         * & limit reverse map searches.
                                         */
-       unsigned long private;          /* Mapping-private opaque data:
+       union {
+               unsigned long private;  /* Mapping-private opaque data:
                                         * usually used for buffer_heads
                                         * if PagePrivate set; used for
                                         * swp_entry_t if PageSwapCache
                                         * When page is free, this indicates
                                         * order in the buddy system.
                                         */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+               spinlock_t ptl;
+#endif
+       } u;
        struct address_space *mapping;  /* If low bit clear, points to
                                         * inode address_space, or NULL.
                                         * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#define page_private(page)             ((page)->u.private)
+#define set_page_private(page, v)      ((page)->u.private = (v))
+
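The accessors are what let page->private move into the union above, where it can later share space with the split page-table lock; callers elsewhere in this series are converted mechanically, e.g. (sketch):

        page->private = (unsigned long)head;            /* before */
        set_page_private(page, (unsigned long)head);    /* after  */
        head = (struct buffer_head *)page_private(page);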
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-       if (PageCompound(p))
-               p = (struct page *)p->private;
-       return atomic_read(&(p)->_count) + 1;
+       if (PageCompound(page))
+               page = (struct page *)page_private(page);
+       return atomic_read(&page->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
        if (unlikely(PageCompound(page)))
-               page = (struct page *)page->private;
+               page = (struct page *)page_private(page);
        atomic_inc(&page->_count);
 }
 
@@ -338,7 +346,7 @@ static inline void get_page(struct page *page)
 
 static inline void put_page(struct page *page)
 {
-       if (!PageReserved(page) && put_page_testzero(page))
+       if (put_page_testzero(page))
                __page_cache_release(page);
 }
 
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
        if (unlikely(PageSwapCache(page)))
-               return page->private;
+               return page_private(page);
        return page->index;
 }
 
@@ -682,7 +690,7 @@ struct zap_details {
 
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlb,
                struct vm_area_struct *start_vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long *nr_accounted,
                struct zap_details *);
@@ -704,10 +712,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
-extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
@@ -723,6 +727,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
                int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
@@ -759,38 +764,83 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
-/*
- * On a two-level or three-level page table, this ends up being trivial. Thus
- * the inlining and the symmetry break with pte_alloc_map() that does all
- * of this out-of-line.
- */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
+
 /*
  * The following ifdef needed to get the 4level-fixup.h header to work.
  * Remove it when 4level-fixup.h has been removed.
  */
-#ifdef CONFIG_MMU
-#ifndef __ARCH_HAS_4LEVEL_HACK 
+#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-       if (pgd_none(*pgd))
-               return __pud_alloc(mm, pgd, address);
-       return pud_offset(pgd, address);
+       return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
+               NULL: pud_offset(pgd, address);
 }
 
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-       if (pud_none(*pud))
-               return __pmd_alloc(mm, pud, address);
-       return pmd_offset(pud, address);
+       return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
+               NULL: pmd_offset(pud, address);
 }
-#endif
-#endif /* CONFIG_MMU */
+#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)    &((page)->u.ptl)
+#define pte_lock_init(_page)   do {                                    \
+       spin_lock_init(__pte_lockptr(_page));                           \
+} while (0)
+#define pte_lock_deinit(page)  ((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)   ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)    do {} while (0)
+#define pte_lock_deinit(page)  do {} while (0)
+#define pte_lockptr(mm, pmd)   ({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
+#define pte_offset_map_lock(mm, pmd, address, ptlp)    \
+({                                                     \
+       spinlock_t *__ptl = pte_lockptr(mm, pmd);       \
+       pte_t *__pte = pte_offset_map(pmd, address);    \
+       *(ptlp) = __ptl;                                \
+       spin_lock(__ptl);                               \
+       __pte;                                          \
+})
+
+#define pte_unmap_unlock(pte, ptl)     do {            \
+       spin_unlock(ptl);                               \
+       pte_unmap(pte);                                 \
+} while (0)
+
+#define pte_alloc_map(mm, pmd, address)                        \
+       ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+               NULL: pte_offset_map(pmd, address))
+
+#define pte_alloc_map_lock(mm, pmd, address, ptlp)     \
+       ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+               NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
+
+#define pte_alloc_kernel(pmd, address)                 \
+       ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
+               NULL: pte_offset_kernel(pmd, address))
 
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
        unsigned long * zones_size, unsigned long zone_start_pfn, 
        unsigned long *zholes_size);
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
+extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
@@ -834,6 +884,7 @@ extern int split_vma(struct mm_struct *,
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
        struct rb_node **, struct rb_node *);
+extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
        unsigned long addr, unsigned long len, pgoff_t pgoff);
 extern void exit_mmap(struct mm_struct *);
@@ -894,7 +945,8 @@ void handle_ra_miss(struct address_space *mapping,
 unsigned long max_sane_readahead(unsigned long nr);
 
 /* Do stack extension */
-extern int expand_stack(struct vm_area_struct * vma, unsigned long address);
+extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
+extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
@@ -917,40 +969,28 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
        return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
+struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+struct page *vmalloc_to_page(void *addr);
+unsigned long vmalloc_to_pfn(void *addr);
+int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+                       unsigned long pfn, unsigned long size, pgprot_t);
 
-extern struct page * vmalloc_to_page(void *addr);
-extern unsigned long vmalloc_to_pfn(void *addr);
-extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
-               int write);
-extern int check_user_page_readable(struct mm_struct *mm, unsigned long address);
-int remap_pfn_range(struct vm_area_struct *, unsigned long,
-               unsigned long, unsigned long, pgprot_t);
+struct page *follow_page(struct mm_struct *, unsigned long address,
+                       unsigned int foll_flags);
+#define FOLL_WRITE     0x01    /* check pte is writable */
+#define FOLL_TOUCH     0x02    /* mark page accessed */
+#define FOLL_GET       0x04    /* do get_page on page */
+#define FOLL_ANON      0x08    /* give ZERO_PAGE if no pgtable */
 
 #ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
+void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
 #else
-static inline void __vm_stat_account(struct mm_struct *mm,
+static inline void vm_stat_account(struct mm_struct *mm,
                        unsigned long flags, struct file *file, long pages)
 {
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void vm_stat_account(struct vm_area_struct *vma)
-{
-       __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-                                                       vma_pages(vma));
-}
-
-static inline void vm_stat_unaccount(struct vm_area_struct *vma)
-{
-       __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-                                                       -vma_pages(vma));
-}
-
-/* update per process rss and vm hiwater data */
-extern void update_mem_hiwater(struct task_struct *tsk);
-
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
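
For illustration only (not part of the diff): a walker that needs to look at a user pte under the correct lock -- the per-page ptl when the ptlock is split, or mm->page_table_lock otherwise -- would use the new helpers roughly as below. The function name and the assumption that the pmd and address are already known are hypothetical.

#include <linux/mm.h>

/* Minimal sketch: read one pte under whichever lock pte_lockptr() selects. */
static int pte_is_present(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte;
	int ret;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	/* map + lock */
	ret = pte_present(*pte);
	pte_unmap_unlock(pte, ptl);			/* unlock + unmap */
	return ret;
}
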
index 7519eb4..f5fa308 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/threads.h>
 #include <linux/numa.h>
 #include <linux/init.h>
+#include <linux/seqlock.h>
 #include <asm/atomic.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -137,6 +138,10 @@ struct zone {
         * free areas of different sizes
         */
        spinlock_t              lock;
+#ifdef CONFIG_MEMORY_HOTPLUG
+       /* see spanned/present_pages for more description */
+       seqlock_t               span_seqlock;
+#endif
        struct free_area        free_area[MAX_ORDER];
 
 
@@ -220,6 +225,16 @@ struct zone {
        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long           zone_start_pfn;
 
+       /*
+        * zone_start_pfn, spanned_pages and present_pages are all
+        * protected by span_seqlock.  It is a seqlock because it has
+        * to be read outside of zone->lock, and that read happens in
+        * the main allocator path; writes, however, are quite infrequent.
+        *
+        * The lock is declared along with zone->lock because it is
+        * frequently read in proximity to zone->lock.  It's good to
+        * give them a chance of being in the same cacheline.
+        */
        unsigned long           spanned_pages;  /* total size, including holes */
        unsigned long           present_pages;  /* amount of memory (excluding holes) */
 
@@ -273,6 +288,16 @@ typedef struct pglist_data {
        struct page *node_mem_map;
 #endif
        struct bootmem_data *bdata;
+#ifdef CONFIG_MEMORY_HOTPLUG
+       /*
+        * Must be held any time you expect node_start_pfn, node_present_pages
+        * Must be held any time you expect node_start_pfn, node_present_pages
+        * or node_spanned_pages to stay constant.  Holding this will also
+        * guarantee that any pfn_valid() stays that way.
+        *
+        * Nests above zone->lock and zone->span_seqlock.
+       spinlock_t node_size_lock;
+#endif
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
@@ -293,6 +318,8 @@ typedef struct pglist_data {
 #endif
 #define nid_page_nr(nid, pagenr)       pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
+#include <linux/memory_hotplug.h>
+
 extern struct pglist_data *pgdat_list;
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
@@ -509,6 +536,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
                return NULL;
        return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
+extern int __section_nr(struct mem_section* ms);
 
 /*
  * We use the lower bits of the mem_map pointer to store
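
For context (an assumption about intended usage, not something in the patch): readers of the new span_seqlock are expected to use the ordinary seqlock retry loop from <linux/seqlock.h>, along the lines of the hypothetical helper below.

#include <linux/mmzone.h>
#include <linux/seqlock.h>

#ifdef CONFIG_MEMORY_HOTPLUG
/* Sketch: snapshot spanned_pages consistently while hot-add may resize the zone. */
static unsigned long zone_spanned_pages_snapshot(struct zone *zone)
{
	unsigned long pages;
	unsigned seq;

	do {
		seq = read_seqbegin(&zone->span_seqlock);
		pages = zone->spanned_pages;
	} while (read_seqretry(&zone->span_seqlock, seq));

	return pages;
}
#endif
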
index e80fb7e..35b30e6 100644 (file)
@@ -95,8 +95,8 @@ int try_to_unmap(struct page *);
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
  */
-pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long);
-
+pte_t *page_check_address(struct page *, struct mm_struct *,
+                               unsigned long, spinlock_t **);
 
 /*
  * Used by swapoff to help locate where page is expected in vma.
index b52a2af..f30f805 100644 (file)
@@ -61,5 +61,10 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem));
 extern void FASTCALL(__up_write(struct rw_semaphore *sem));
 extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem));
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+       return (sem->activity != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_RWSEM_SPINLOCK_H */
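
One plausible use of the new rwsem_is_locked() (my assumption; the patch only adds the helper) is a cheap debug assertion that a semaphore really is held:

#include <linux/sched.h>
#include <linux/rwsem.h>

/* Sketch: complain loudly if a caller touches the VMA list without mmap_sem. */
static inline void assert_mmap_sem_held(struct mm_struct *mm)
{
	BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
}
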
index 7f717e9..66ff545 100644 (file)
@@ -1,14 +1,23 @@
 #ifndef _LINUX_SCATTERLIST_H
 #define _LINUX_SCATTERLIST_H
 
-static inline void sg_init_one(struct scatterlist *sg,
-                              u8 *buf, unsigned int buflen)
-{
-       memset(sg, 0, sizeof(*sg));
+#include <asm/scatterlist.h>
+#include <linux/mm.h>
+#include <linux/string.h>
 
+static inline void sg_set_buf(struct scatterlist *sg, void *buf,
+                             unsigned int buflen)
+{
        sg->page = virt_to_page(buf);
        sg->offset = offset_in_page(buf);
        sg->length = buflen;
 }
 
+static inline void sg_init_one(struct scatterlist *sg, void *buf,
+                              unsigned int buflen)
+{
+       memset(sg, 0, sizeof(*sg));
+       sg_set_buf(sg, buf, buflen);
+}
+
 #endif /* _LINUX_SCATTERLIST_H */
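
A hypothetical caller of the reworked helpers, showing that sg_set_buf() fills an already-cleared entry while sg_init_one() does both steps; the buffer is assumed to be kmalloc'd lowmem so that virt_to_page() is valid.

#include <linux/scatterlist.h>

/* Sketch: describe one kernel buffer with a single-entry scatterlist. */
static void map_one_buffer(struct scatterlist *sg, void *buf, unsigned int len)
{
	sg_init_one(sg, buf, len);	/* memset() + sg_set_buf() */
	/* sg can now be handed to a driver or crypto API that takes scatterlists */
}
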
index 27519df..1c30bc3 100644 (file)
@@ -249,6 +249,36 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * The mm counters are not protected by the mm's page_table_lock,
+ * so they must be incremented atomically.
+ */
+#ifdef ATOMIC64_INIT
+#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
+typedef atomic64_t mm_counter_t;
+#else /* !ATOMIC64_INIT */
+/*
+ * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
+ * that is, at 16TB if using 4kB page size.
+ */
+#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
+typedef atomic_t mm_counter_t;
+#endif /* !ATOMIC64_INIT */
+
+#else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+/*
+ * The mm counters are protected by the mm's page_table_lock,
+ * so they can be incremented directly.
+ */
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
@@ -256,6 +286,20 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 #define dec_mm_counter(mm, member) (mm)->_##member--
 typedef unsigned long mm_counter_t;
 
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
+#define get_mm_rss(mm)                                 \
+       (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
+#define update_hiwater_rss(mm) do {                    \
+       unsigned long _rss = get_mm_rss(mm);            \
+       if ((mm)->hiwater_rss < _rss)                   \
+               (mm)->hiwater_rss = _rss;               \
+} while (0)
+#define update_hiwater_vm(mm)  do {                    \
+       if ((mm)->hiwater_vm < (mm)->total_vm)          \
+               (mm)->hiwater_vm = (mm)->total_vm;      \
+} while (0)
+
 struct mm_struct {
        struct vm_area_struct * mmap;           /* list of VMAs */
        struct rb_root mm_rb;
@@ -279,15 +323,20 @@ struct mm_struct {
                                                 * by mmlist_lock
                                                 */
 
+       /* Special counters, in some configurations protected by the
+        * page_table_lock, in other configurations by being atomic.
+        */
+       mm_counter_t _file_rss;
+       mm_counter_t _anon_rss;
+
+       unsigned long hiwater_rss;      /* High-watermark of RSS usage */
+       unsigned long hiwater_vm;       /* High-water virtual memory usage */
+
+       unsigned long total_vm, locked_vm, shared_vm, exec_vm;
+       unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
        unsigned long start_code, end_code, start_data, end_data;
        unsigned long start_brk, brk, start_stack;
        unsigned long arg_start, arg_end, env_start, env_end;
-       unsigned long total_vm, locked_vm, shared_vm;
-       unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
-
-       /* Special counters protected by the page_table_lock */
-       mm_counter_t _rss;
-       mm_counter_t _anon_rss;
 
        unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
@@ -308,11 +357,7 @@ struct mm_struct {
        /* aio bits */
        rwlock_t                ioctx_list_lock;
        struct kioctx           *ioctx_list;
-
        struct kioctx           default_kioctx;
-
-       unsigned long hiwater_rss;      /* High-water RSS usage */
-       unsigned long hiwater_vm;       /* High-water virtual memory usage */
 };
 
 struct sighand_struct {
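
To make the counter rework concrete (illustration only, not part of the change): callers that used to read mm->_rss now go through get_mm_rss(), and the work formerly done by update_mem_hiwater() becomes the two macros above, roughly:

#include <linux/sched.h>

/* Sketch: refresh a task's high-water marks and return its RSS in pages. */
static unsigned long task_update_and_get_rss(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;

	if (!mm)
		return 0;
	update_hiwater_rss(mm);		/* file_rss + anon_rss high-water */
	update_hiwater_vm(mm);		/* total_vm high-water */
	return get_mm_rss(mm);		/* file_rss + anon_rss, in pages */
}
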
index 3701a06..1d5577b 100644 (file)
@@ -32,10 +32,14 @@ struct vm_struct {
  *     Highlevel APIs for driver use
  */
 extern void *vmalloc(unsigned long size);
+extern void *vmalloc_node(unsigned long size, int node);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
-extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot);
+extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
+                               pgprot_t prot);
+extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask,
+                               pgprot_t prot, int node);
 extern void vfree(void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
@@ -48,6 +52,8 @@ extern void vunmap(void *addr);
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
 extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
                                        unsigned long start, unsigned long end);
+extern struct vm_struct *get_vm_area_node(unsigned long size,
+                                       unsigned long flags, int node);
 extern struct vm_struct *remove_vm_area(void *addr);
 extern struct vm_struct *__remove_vm_area(void *addr);
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
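
A hedged example of the new NUMA-aware entry point (the surrounding helper is invented): allocate near the caller's node and fall back to an ordinary vmalloc(); the result is freed with vfree() either way.

#include <linux/vmalloc.h>
#include <linux/topology.h>

/* Sketch: node-local scratch allocation with a graceful fallback. */
static void *alloc_local_scratch(unsigned long size)
{
	void *p = vmalloc_node(size, numa_node_id());

	if (!p)
		p = vmalloc(size);
	return p;
}
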
index dca9048..b58c651 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -233,10 +233,11 @@ static int newseg (key_t key, int shmflg, size_t size)
        shp->id = shm_buildid(id,shp->shm_perm.seq);
        shp->shm_file = file;
        file->f_dentry->d_inode->i_ino = shp->id;
-       if (shmflg & SHM_HUGETLB)
-               set_file_hugepages(file);
-       else
+
+       /* Hugetlb ops would have already been assigned. */
+       if (!(shmflg & SHM_HUGETLB))
                file->f_op = &shm_file_operations;
+
        shm_tot += numpages;
        shm_unlock(shp);
        return shp->id;
index b756f52..2e3f4a4 100644 (file)
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk)
                if (delta == 0)
                        return;
                tsk->acct_stimexpd = tsk->stime;
-               tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
+               tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
                tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
        }
 }
index 3b25b18..79f52b8 100644 (file)
@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code)
                                preempt_count());
 
        acct_update_integrals(tsk);
-       update_mem_hiwater(tsk);
+       if (tsk->mm) {
+               update_hiwater_rss(tsk->mm);
+               update_hiwater_vm(tsk->mm);
+       }
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                del_timer_sync(&tsk->signal->real_timer);
index 280bd44..8a06961 100644 (file)
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 }
 
 #ifdef CONFIG_MMU
-static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
+static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-       struct vm_area_struct * mpnt, *tmp, **pprev;
+       struct vm_area_struct *mpnt, *tmp, **pprev;
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;
        struct mempolicy *pol;
 
        down_write(&oldmm->mmap_sem);
-       flush_cache_mm(current->mm);
+       flush_cache_mm(oldmm);
+       down_write(&mm->mmap_sem);
+
        mm->locked_vm = 0;
        mm->mmap = NULL;
        mm->mmap_cache = NULL;
        mm->free_area_cache = oldmm->mmap_base;
        mm->cached_hole_size = ~0UL;
        mm->map_count = 0;
-       set_mm_counter(mm, rss, 0);
-       set_mm_counter(mm, anon_rss, 0);
        cpus_clear(mm->cpu_vm_mask);
        mm->mm_rb = RB_ROOT;
        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
        pprev = &mm->mmap;
 
-       for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+       for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
                struct file *file;
 
                if (mpnt->vm_flags & VM_DONTCOPY) {
                        long pages = vma_pages(mpnt);
                        mm->total_vm -= pages;
-                       __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+                       vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
                                                                -pages);
                        continue;
                }
@@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
                }
 
                /*
-                * Link in the new vma and copy the page table entries:
-                * link in first so that swapoff can see swap entries.
-                * Note that, exceptionally, here the vma is inserted
-                * without holding mm->mmap_sem.
+                * Link in the new vma and copy the page table entries.
                 */
-               spin_lock(&mm->page_table_lock);
                *pprev = tmp;
                pprev = &tmp->vm_next;
 
@@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
                rb_parent = &tmp->vm_rb;
 
                mm->map_count++;
-               retval = copy_page_range(mm, current->mm, tmp);
-               spin_unlock(&mm->page_table_lock);
+               retval = copy_page_range(mm, oldmm, tmp);
 
                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);
@@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
                        goto out;
        }
        retval = 0;
-
 out:
-       flush_tlb_mm(current->mm);
+       up_write(&mm->mmap_sem);
+       flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
        return retval;
 fail_nomem_policy:
@@ -323,6 +318,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
        INIT_LIST_HEAD(&mm->mmlist);
        mm->core_waiters = 0;
        mm->nr_ptes = 0;
+       set_mm_counter(mm, file_rss, 0);
+       set_mm_counter(mm, anon_rss, 0);
        spin_lock_init(&mm->page_table_lock);
        rwlock_init(&mm->ioctx_list_lock);
        mm->ioctx_list = NULL;
@@ -499,7 +496,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
        if (retval)
                goto free_pt;
 
-       mm->hiwater_rss = get_mm_counter(mm,rss);
+       mm->hiwater_rss = get_mm_rss(mm);
        mm->hiwater_vm = mm->total_vm;
 
 good_mm:
index ca05fe6..3b4d5ad 100644 (file)
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
        /*
         * Do a quick atomic lookup first - this is the fastpath.
         */
-       spin_lock(&current->mm->page_table_lock);
-       page = follow_page(mm, uaddr, 0);
+       page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
        if (likely(page != NULL)) {
                key->shared.pgoff =
                        page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-               spin_unlock(&current->mm->page_table_lock);
+               put_page(page);
                return 0;
        }
-       spin_unlock(&current->mm->page_table_lock);
 
        /*
         * Do it the general way.
index 36c5d9c..2c95848 100644 (file)
@@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
        if (pages) {
                unsigned int count, i;
                pages->mapping = NULL;
-               pages->private = order;
+               set_page_private(pages, order);
                count = 1 << order;
                for (i = 0; i < count; i++)
                        SetPageReserved(pages + i);
@@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page)
 {
        unsigned int order, count, i;
 
-       order = page->private;
+       order = page_private(page);
        count = 1 << order;
        for (i = 0; i < count; i++)
                ClearPageReserved(page + i);
index 10bc5ec..016504c 100644 (file)
@@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone)
                        continue;
                page = pfn_to_page(pfn);
                /*
-                * This condition results from rvmalloc() sans vmalloc_32()
-                * and architectural memory reservations. This should be
-                * corrected eventually when the cases giving rise to this
-                * are better understood.
+                * PageReserved results from rvmalloc() sans vmalloc_32()
+                * and architectural memory reservations.
+                *
+                * rvmalloc should not cause this, because all implementations
+                * appear to always be using vmalloc_32 on architectures with
+                * highmem. This is a good thing, because we would like to save
+                * rvmalloc pages.
+                *
+                * It appears to be triggered by pages which do not point to
+                * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
+                * which sets PageReserved if the page does not point to valid
+                * RAM).
+                *
+                * XXX: must remove usage of PageReserved!
                 */
-               if (PageReserved(page)) {
-                       printk("highmem reserved page?!\n");
+               if (PageReserved(page))
                        continue;
-               }
                BUG_ON(PageNosave(page));
                if (PageNosaveFree(page))
                        continue;
@@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
                return 0;
 
        page = pfn_to_page(pfn);
-       BUG_ON(PageReserved(page) && PageNosave(page));
        if (PageNosave(page))
                return 0;
-       if (PageReserved(page) && pfn_is_nosave(pfn)) {
+       if (pfn_is_nosave(pfn)) {
                pr_debug("[nosave pfn 0x%lx]", pfn);
                return 0;
        }
index 1e5cafd..4f26c54 100644 (file)
@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                cpustat->idle = cputime64_add(cpustat->idle, tmp);
        /* Account for system time used */
        acct_update_integrals(p);
-       /* Update rss highwater mark */
-       update_mem_hiwater(p);
 }
 
 /*
index 3ba10fa..6a2e5f8 100644 (file)
@@ -752,6 +752,15 @@ static void second_overflow(void)
     else
        time_adj += (time_adj >> 2) + (time_adj >> 5);
 #endif
+#if HZ == 250
+    /* Compensate for (HZ==250) != (1 << SHIFT_HZ).
+     * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14)
+     */
+    if (time_adj < 0)
+       time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
+    else
+       time_adj += (time_adj >> 6) + (time_adj >> 7);
+#endif
 #if HZ == 1000
     /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
      * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
index 391ffc5..1a4473f 100644 (file)
@@ -111,3 +111,24 @@ config SPARSEMEM_STATIC
 config SPARSEMEM_EXTREME
        def_bool y
        depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+       bool "Allow for memory hot-add"
+       depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+
+comment "Memory hotplug is currently incompatible with Software Suspend"
+       depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+       int
+       default "4096" if ARM && !CPU_CACHE_VIPT
+       default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+       default "4"
index 4cd69e3..2fa6d2c 100644 (file)
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA)    += mempolicy.o
 obj-$(CONFIG_SPARSEMEM)        += sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
-
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
index a58699b..e8c5671 100644 (file)
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
                                if (j + 16 < BITS_PER_LONG)
                                        prefetchw(page + j + 16);
                                __ClearPageReserved(page + j);
+                               set_page_count(page + j, 0);
                        }
                        __free_pages(page, order);
                        i += BITS_PER_LONG;
index 1c31b2f..768687f 100644 (file)
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *
  *  ->mmap_sem
  *    ->i_mmap_lock
- *      ->page_table_lock      (various places, mainly in mmap.c)
+ *      ->page_table_lock or pte_lock  (various, mainly in memory.c)
  *        ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
  *
  *  ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->anon_vma.lock          (vma_adjust)
  *
  *  ->anon_vma.lock
- *    ->page_table_lock                (anon_vma_prepare and various)
+ *    ->page_table_lock or pte_lock    (anon_vma_prepare and various)
  *
- *  ->page_table_lock
+ *  ->page_table_lock or pte_lock
  *    ->swap_lock              (try_to_unmap_one)
  *    ->private_lock           (try_to_unmap_one)
  *    ->tree_lock              (try_to_unmap_one)
@@ -152,7 +152,7 @@ static int sync_page(void *word)
         * in the ->sync_page() methods make essential use of the
         * page_mapping(), merely passing the page down to the backing
         * device's unplug functions when it's non-NULL, which in turn
-        * ignore it for all cases but swap, where only page->private is
+        * ignore it for all cases but swap, where only page_private(page) is
         * of interest. When page_mapping() does go NULL, the entire
         * call stack gracefully ignores the page and returns.
         * -- wli
@@ -1520,7 +1520,7 @@ repeat:
                        page_cache_release(page);
                        return err;
                }
-       } else {
+       } else if (vma->vm_flags & VM_NONLINEAR) {
                /* No page was found just because we can't read it in now (being
                 * here implies nonblock != 0), but the page may exist, so set
                 * the PTE to fault it in later. */
@@ -1537,6 +1537,7 @@ repeat:
 
        return 0;
 }
+EXPORT_SYMBOL(filemap_populate);
 
 struct vm_operations_struct generic_file_vm_ops = {
        .nopage         = filemap_nopage,
@@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
        vma->vm_ops = &generic_file_vm_ops;
        return 0;
 }
-EXPORT_SYMBOL(filemap_populate);
 
 /*
  * This is for filesystems which do not implement ->writepage.
index 8c199f5..9cf687e 100644 (file)
@@ -174,6 +174,8 @@ __xip_unmap (struct address_space * mapping,
        unsigned long address;
        pte_t *pte;
        pte_t pteval;
+       spinlock_t *ptl;
+       struct page *page;
 
        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -181,19 +183,17 @@ __xip_unmap (struct address_space * mapping,
                address = vma->vm_start +
                        ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
                BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-               /*
-                * We need the page_table_lock to protect us from page faults,
-                * munmap, fork, etc...
-                */
-               pte = page_check_address(ZERO_PAGE(address), mm,
-                                        address);
-               if (!IS_ERR(pte)) {
+               page = ZERO_PAGE(address);
+               pte = page_check_address(page, mm, address, &ptl);
+               if (pte) {
                        /* Nuke the page table entry. */
                        flush_cache_page(vma, address, pte_pfn(*pte));
                        pteval = ptep_clear_flush(vma, address, pte);
+                       page_remove_rmap(page);
+                       dec_mm_counter(mm, file_rss);
                        BUG_ON(pte_dirty(pteval));
-                       pte_unmap(pte);
-                       spin_unlock(&mm->page_table_lock);
+                       pte_unmap_unlock(pte, ptl);
+                       page_cache_release(page);
                }
        }
        spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area,
 
        page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
        if (!IS_ERR(page)) {
-               return page;
+               goto out;
        }
        if (PTR_ERR(page) != -ENODATA)
                return NULL;
@@ -249,6 +249,8 @@ xip_file_nopage(struct vm_area_struct * area,
                page = ZERO_PAGE(address);
        }
 
+out:
+       page_cache_get(page);
        return page;
 }
 
index ab23a06..d862be3 100644 (file)
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, pte_t *ptep)
 {
        pte_t pte = *ptep;
+       struct page *page = NULL;
 
-       if (pte_none(pte))
-               return;
        if (pte_present(pte)) {
                unsigned long pfn = pte_pfn(pte);
-
                flush_cache_page(vma, addr, pfn);
                pte = ptep_clear_flush(vma, addr, ptep);
-               if (pfn_valid(pfn)) {
-                       struct page *page = pfn_to_page(pfn);
-                       if (!PageReserved(page)) {
-                               if (pte_dirty(pte))
-                                       set_page_dirty(page);
-                               page_remove_rmap(page);
-                               page_cache_release(page);
-                               dec_mm_counter(mm, rss);
-                       }
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, pte, addr);
+                       goto out;
                }
+               page = pfn_to_page(pfn);
+               if (pte_dirty(pte))
+                       set_page_dirty(page);
+               page_remove_rmap(page);
+               page_cache_release(page);
        } else {
                if (!pte_file(pte))
                        free_swap_and_cache(pte_to_swp_entry(pte));
                pte_clear(mm, addr, ptep);
        }
+out:
+       return !!page;
 }
 
 /*
@@ -64,21 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
        pud_t *pud;
        pgd_t *pgd;
        pte_t pte_val;
+       spinlock_t *ptl;
+
+       BUG_ON(vma->vm_flags & VM_RESERVED);
 
        pgd = pgd_offset(mm, addr);
-       spin_lock(&mm->page_table_lock);
-       
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
-               goto err_unlock;
-
+               goto out;
        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
-               goto err_unlock;
-
-       pte = pte_alloc_map(mm, pmd, addr);
+               goto out;
+       pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
-               goto err_unlock;
+               goto out;
 
        /*
         * This page may have been truncated. Tell the
@@ -88,29 +86,27 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
        inode = vma->vm_file->f_mapping->host;
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (!page->mapping || page->index >= size)
-               goto err_unlock;
+               goto unlock;
        err = -ENOMEM;
        if (page_mapcount(page) > INT_MAX/2)
-               goto err_unlock;
+               goto unlock;
 
-       zap_pte(mm, vma, addr, pte);
+       if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
+               inc_mm_counter(mm, file_rss);
 
-       inc_mm_counter(mm,rss);
        flush_icache_page(vma, page);
        set_pte_at(mm, addr, pte, mk_pte(page, prot));
        page_add_file_rmap(page);
        pte_val = *pte;
-       pte_unmap(pte);
        update_mmu_cache(vma, addr, pte_val);
-
        err = 0;
-err_unlock:
-       spin_unlock(&mm->page_table_lock);
+unlock:
+       pte_unmap_unlock(pte, ptl);
+out:
        return err;
 }
 EXPORT_SYMBOL(install_page);
 
-
 /*
  * Install a file pte to a given virtual memory address, release any
  * previously existing mapping.
@@ -124,37 +120,35 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
        pud_t *pud;
        pgd_t *pgd;
        pte_t pte_val;
+       spinlock_t *ptl;
+
+       BUG_ON(vma->vm_flags & VM_RESERVED);
 
        pgd = pgd_offset(mm, addr);
-       spin_lock(&mm->page_table_lock);
-       
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
-               goto err_unlock;
-
+               goto out;
        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
-               goto err_unlock;
-
-       pte = pte_alloc_map(mm, pmd, addr);
+               goto out;
+       pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
-               goto err_unlock;
+               goto out;
 
-       zap_pte(mm, vma, addr, pte);
+       if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
+               update_hiwater_rss(mm);
+               dec_mm_counter(mm, file_rss);
+       }
 
        set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
        pte_val = *pte;
-       pte_unmap(pte);
        update_mmu_cache(vma, addr, pte_val);
-       spin_unlock(&mm->page_table_lock);
-       return 0;
-
-err_unlock:
-       spin_unlock(&mm->page_table_lock);
+       pte_unmap_unlock(pte, ptl);
+       err = 0;
+out:
        return err;
 }
 
-
 /***
  * sys_remap_file_pages - remap arbitrary pages of a shared backing store
  *                        file within an existing vma.
index 61d3806..c9b4336 100644 (file)
@@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
        unsigned long addr;
 
        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+               src_pte = huge_pte_offset(src, addr);
+               if (!src_pte)
+                       continue;
                dst_pte = huge_pte_alloc(dst, addr);
                if (!dst_pte)
                        goto nomem;
+               spin_lock(&dst->page_table_lock);
                spin_lock(&src->page_table_lock);
-               src_pte = huge_pte_offset(src, addr);
-               if (src_pte && !pte_none(*src_pte)) {
+               if (!pte_none(*src_pte)) {
                        entry = *src_pte;
                        ptepage = pte_page(entry);
                        get_page(ptepage);
-                       add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+                       add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                }
                spin_unlock(&src->page_table_lock);
+               spin_unlock(&dst->page_table_lock);
        }
        return 0;
 
@@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
        BUG_ON(start & ~HPAGE_MASK);
        BUG_ON(end & ~HPAGE_MASK);
 
+       spin_lock(&mm->page_table_lock);
+
+       /* Update high watermark before we lower rss */
+       update_hiwater_rss(mm);
+
        for (address = start; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
-               if (! ptep)
-                       /* This can happen on truncate, or if an
-                        * mmap() is aborted due to an error before
-                        * the prefault */
+               if (!ptep)
                        continue;
 
                pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
                page = pte_page(pte);
                put_page(page);
-               add_mm_counter(mm, rss,  - (HPAGE_SIZE / PAGE_SIZE));
+               add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
        }
-       flush_tlb_range(vma, start, end);
-}
-
-void zap_hugepage_range(struct vm_area_struct *vma,
-                       unsigned long start, unsigned long length)
-{
-       struct mm_struct *mm = vma->vm_mm;
 
-       spin_lock(&mm->page_table_lock);
-       unmap_hugepage_range(vma, start, start + length);
        spin_unlock(&mm->page_table_lock);
+       flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+                       unsigned long idx)
 {
-       struct mm_struct *mm = current->mm;
-       unsigned long addr;
-       int ret = 0;
-
-       WARN_ON(!is_vm_hugetlb_page(vma));
-       BUG_ON(vma->vm_start & ~HPAGE_MASK);
-       BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-       hugetlb_prefault_arch_hook(mm);
-
-       spin_lock(&mm->page_table_lock);
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-               unsigned long idx;
-               pte_t *pte = huge_pte_alloc(mm, addr);
-               struct page *page;
-
-               if (!pte) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
+       struct page *page;
+       int err;
+       struct inode *inode = mapping->host;
+       unsigned long size;
+
+retry:
+       page = find_lock_page(mapping, idx);
+       if (page)
+               goto out;
+
+       /* Check to make sure the mapping hasn't been truncated */
+       size = i_size_read(inode) >> HPAGE_SHIFT;
+       if (idx >= size)
+               goto out;
+
+       if (hugetlb_get_quota(mapping))
+               goto out;
+       page = alloc_huge_page();
+       if (!page) {
+               hugetlb_put_quota(mapping);
+               goto out;
+       }
 
-               idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                       + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-               page = find_get_page(mapping, idx);
-               if (!page) {
-                       /* charge the fs quota first */
-                       if (hugetlb_get_quota(mapping)) {
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       page = alloc_huge_page();
-                       if (!page) {
-                               hugetlb_put_quota(mapping);
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-                       if (! ret) {
-                               unlock_page(page);
-                       } else {
-                               hugetlb_put_quota(mapping);
-                               free_huge_page(page);
-                               goto out;
-                       }
-               }
-               add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-               set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+       err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+       if (err) {
+               put_page(page);
+               hugetlb_put_quota(mapping);
+               if (err == -EEXIST)
+                       goto retry;
+               page = NULL;
        }
 out:
-       spin_unlock(&mm->page_table_lock);
-       return ret;
+       return page;
 }
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully.  Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access)
 {
        int ret = VM_FAULT_SIGBUS;
+       unsigned long idx;
+       unsigned long size;
        pte_t *pte;
+       struct page *page;
+       struct address_space *mapping;
+
+       pte = huge_pte_alloc(mm, address);
+       if (!pte)
+               goto out;
+
+       mapping = vma->vm_file->f_mapping;
+       idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+               + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+       /*
+        * Use page lock to guard against racing truncation
+        * before we get page_table_lock.
+        */
+       page = find_lock_huge_page(mapping, idx);
+       if (!page)
+               goto out;
 
        spin_lock(&mm->page_table_lock);
-       pte = huge_pte_offset(mm, address);
-       if (pte && !pte_none(*pte))
-               ret = VM_FAULT_MINOR;
+       size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+       if (idx >= size)
+               goto backout;
+
+       ret = VM_FAULT_MINOR;
+       if (!pte_none(*pte))
+               goto backout;
+
+       add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+       set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
        spin_unlock(&mm->page_table_lock);
+       unlock_page(page);
+out:
        return ret;
+
+backout:
+       spin_unlock(&mm->page_table_lock);
+       hugetlb_put_quota(mapping);
+       unlock_page(page);
+       put_page(page);
+       goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long vpfn, vaddr = *position;
        int remainder = *length;
 
-       BUG_ON(!is_vm_hugetlb_page(vma));
-
        vpfn = vaddr/PAGE_SIZE;
        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
+               pte_t *pte;
+               struct page *page;
 
-               if (pages) {
-                       pte_t *pte;
-                       struct page *page;
-
-                       /* Some archs (sparc64, sh*) have multiple
-                        * pte_ts to each hugepage.  We have to make
-                        * sure we get the first, for the page
-                        * indexing below to work. */
-                       pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-                       /* the hugetlb file might have been truncated */
-                       if (!pte || pte_none(*pte)) {
-                               remainder = 0;
-                               if (!i)
-                                       i = -EFAULT;
-                               break;
-                       }
+               /*
+                * Some archs (sparc64, sh*) have multiple pte_ts to
+                * each hugepage.  We have to make sure we get the
+                * first, for the page indexing below to work.
+                */
+               pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-                       page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+               if (!pte || pte_none(*pte)) {
+                       int ret;
 
-                       WARN_ON(!PageCompound(page));
+                       spin_unlock(&mm->page_table_lock);
+                       ret = hugetlb_fault(mm, vma, vaddr, 0);
+                       spin_lock(&mm->page_table_lock);
+                       if (ret == VM_FAULT_MINOR)
+                               continue;
+
+                       remainder = 0;
+                       if (!i)
+                               i = -EFAULT;
+                       break;
+               }
 
+               if (pages) {
+                       page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
                        get_page(page);
                        pages[i] = page;
                }
index 20e075d..17aaf3e 100644 (file)
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
                             unsigned long start, unsigned long end)
 {
        *prev = vma;
-       if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+       if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
                return -EINVAL;
 
        if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
index 1db40e9..0f60baf 100644 (file)
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
        struct page *page = pmd_page(*pmd);
        pmd_clear(pmd);
+       pte_lock_deinit(page);
        pte_free_tlb(tlb, page);
        dec_page_state(nr_page_table_pages);
        tlb->mm->nr_ptes--;
@@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb,
                free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
 
-       if (!tlb_is_full_mm(*tlb))
+       if (!(*tlb)->fullmm)
                flush_tlb_pgtables((*tlb)->mm, start, end);
 }
 
@@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
                struct vm_area_struct *next = vma->vm_next;
                unsigned long addr = vma->vm_start;
 
+               /*
+                * Hide vma from rmap and vmtruncate before freeing pgtables
+                */
+               anon_vma_unlink(vma);
+               unlink_file_vma(vma);
+
                if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next? next->vm_start: ceiling);
@@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
                                                        HPAGE_SIZE)) {
                                vma = next;
                                next = vma->vm_next;
+                               anon_vma_unlink(vma);
+                               unlink_file_vma(vma);
                        }
                        free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next? next->vm_start: ceiling);
@@ -280,72 +289,78 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
        }
 }
 
-pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
-                               unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
-       if (!pmd_present(*pmd)) {
-               struct page *new;
-
-               spin_unlock(&mm->page_table_lock);
-               new = pte_alloc_one(mm, address);
-               spin_lock(&mm->page_table_lock);
-               if (!new)
-                       return NULL;
-               /*
-                * Because we dropped the lock, we should re-check the
-                * entry, as somebody else could have populated it..
-                */
-               if (pmd_present(*pmd)) {
-                       pte_free(new);
-                       goto out;
-               }
+       struct page *new = pte_alloc_one(mm, address);
+       if (!new)
+               return -ENOMEM;
+
+       pte_lock_init(new);
+       spin_lock(&mm->page_table_lock);
+       if (pmd_present(*pmd)) {        /* Another has populated it */
+               pte_lock_deinit(new);
+               pte_free(new);
+       } else {
                mm->nr_ptes++;
                inc_page_state(nr_page_table_pages);
                pmd_populate(mm, pmd, new);
        }
-out:
-       return pte_offset_map(pmd, address);
+       spin_unlock(&mm->page_table_lock);
+       return 0;
 }
 
-pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 {
-       if (!pmd_present(*pmd)) {
-               pte_t *new;
+       pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+       if (!new)
+               return -ENOMEM;
 
-               spin_unlock(&mm->page_table_lock);
-               new = pte_alloc_one_kernel(mm, address);
-               spin_lock(&mm->page_table_lock);
-               if (!new)
-                       return NULL;
+       spin_lock(&init_mm.page_table_lock);
+       if (pmd_present(*pmd))          /* Another has populated it */
+               pte_free_kernel(new);
+       else
+               pmd_populate_kernel(&init_mm, pmd, new);
+       spin_unlock(&init_mm.page_table_lock);
+       return 0;
+}
 
-               /*
-                * Because we dropped the lock, we should re-check the
-                * entry, as somebody else could have populated it..
-                */
-               if (pmd_present(*pmd)) {
-                       pte_free_kernel(new);
-                       goto out;
-               }
-               pmd_populate_kernel(mm, pmd, new);
-       }
-out:
-       return pte_offset_kernel(pmd, address);
+static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+{
+       if (file_rss)
+               add_mm_counter(mm, file_rss, file_rss);
+       if (anon_rss)
+               add_mm_counter(mm, anon_rss, anon_rss);
+}
+
+/*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error).
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+       printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+                       "vm_flags = %lx, vaddr = %lx\n",
+               (long long)pte_val(pte),
+               (vma->vm_mm == current->mm ? current->comm : "???"),
+               vma->vm_flags, vaddr);
+       dump_stack();
 }
 
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
  */
 
 static inline void
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-               pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
-               unsigned long addr)
+               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+               unsigned long addr, int *rss)
 {
+       unsigned long vm_flags = vma->vm_flags;
        pte_t pte = *src_pte;
        struct page *page;
        unsigned long pfn;
@@ -357,29 +372,32 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        /* make sure dst_mm is on swapoff's mmlist. */
                        if (unlikely(list_empty(&dst_mm->mmlist))) {
                                spin_lock(&mmlist_lock);
-                               list_add(&dst_mm->mmlist, &src_mm->mmlist);
+                               if (list_empty(&dst_mm->mmlist))
+                                       list_add(&dst_mm->mmlist,
+                                                &src_mm->mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                }
-               set_pte_at(dst_mm, addr, dst_pte, pte);
-               return;
+               goto out_set_pte;
        }
 
-       pfn = pte_pfn(pte);
-       /* the pte points outside of valid memory, the
-        * mapping is assumed to be good, meaningful
-        * and not mapped via rmap - duplicate the
-        * mapping as is.
+       /* If the region is VM_RESERVED, the mapping is not
+        * mapped via rmap - duplicate the pte as is.
         */
-       page = NULL;
-       if (pfn_valid(pfn))
-               page = pfn_to_page(pfn);
+       if (vm_flags & VM_RESERVED)
+               goto out_set_pte;
 
-       if (!page || PageReserved(page)) {
-               set_pte_at(dst_mm, addr, dst_pte, pte);
-               return;
+       pfn = pte_pfn(pte);
+       /* If the pte points outside of valid memory but
+        * the region is not VM_RESERVED, we have a problem.
+        */
+       if (unlikely(!pfn_valid(pfn))) {
+               print_bad_pte(vma, pte, addr);
+               goto out_set_pte; /* try to do something sane */
        }
 
+       page = pfn_to_page(pfn);
+
        /*
         * If it's a COW mapping, write protect it both
         * in the parent and the child
@@ -397,11 +415,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte = pte_mkclean(pte);
        pte = pte_mkold(pte);
        get_page(page);
-       inc_mm_counter(dst_mm, rss);
-       if (PageAnon(page))
-               inc_mm_counter(dst_mm, anon_rss);
-       set_pte_at(dst_mm, addr, dst_pte, pte);
        page_dup_rmap(page);
+       rss[!!PageAnon(page)]++;
+
+out_set_pte:
+       set_pte_at(dst_mm, addr, dst_pte, pte);
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -409,38 +427,44 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                unsigned long addr, unsigned long end)
 {
        pte_t *src_pte, *dst_pte;
-       unsigned long vm_flags = vma->vm_flags;
-       int progress;
+       spinlock_t *src_ptl, *dst_ptl;
+       int progress = 0;
+       int rss[2];
 
 again:
-       dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+       rss[1] = rss[0] = 0;
+       dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte)
                return -ENOMEM;
        src_pte = pte_offset_map_nested(src_pmd, addr);
+       src_ptl = pte_lockptr(src_mm, src_pmd);
+       spin_lock(src_ptl);
 
-       progress = 0;
-       spin_lock(&src_mm->page_table_lock);
        do {
                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
-               if (progress >= 32 && (need_resched() ||
-                   need_lockbreak(&src_mm->page_table_lock) ||
-                   need_lockbreak(&dst_mm->page_table_lock)))
-                       break;
+               if (progress >= 32) {
+                       progress = 0;
+                       if (need_resched() ||
+                           need_lockbreak(src_ptl) ||
+                           need_lockbreak(dst_ptl))
+                               break;
+               }
                if (pte_none(*src_pte)) {
                        progress++;
                        continue;
                }
-               copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
+               copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
                progress += 8;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
-       spin_unlock(&src_mm->page_table_lock);
 
+       spin_unlock(src_ptl);
        pte_unmap_nested(src_pte - 1);
-       pte_unmap(dst_pte - 1);
-       cond_resched_lock(&dst_mm->page_table_lock);
+       add_mm_rss(dst_mm, rss[0], rss[1]);
+       pte_unmap_unlock(dst_pte - 1, dst_ptl);
+       cond_resched();
        if (addr != end)
                goto again;
        return 0;
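
In the rewritten copy_pte_range() above, the per-mm page_table_lock is replaced by the per-page-table lock obtained through pte_alloc_map_lock()/pte_lockptr(), and the rss counters are no longer bumped pte by pte: the loop counts file and anon pages in a local rss[2] array (rss[!!PageAnon(page)]++) and add_mm_rss() publishes the totals once per mapped page table. Below is a minimal userspace sketch of that batch-then-publish pattern; the mutex, the 512-entry "table" and entry_is_anon() are illustrative stand-ins, not kernel interfaces.

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

/* Shared counters, normally only touched with the lock held. */
static long file_rss, anon_rss;
static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;

/* One locked update per batch: the userspace analogue of add_mm_rss(). */
static void add_rss(const int rss[2])
{
	pthread_mutex_lock(&ptl);
	file_rss += rss[0];
	anon_rss += rss[1];
	pthread_mutex_unlock(&ptl);
}

static bool entry_is_anon(int i) { return i % 3 == 0; }	/* stand-in for PageAnon() */

int main(void)
{
	int rss[2] = { 0, 0 };		/* rss[0] file pages, rss[1] anon pages */

	/* Walk one "page table" worth of entries, counting locally ... */
	for (int i = 0; i < 512; i++)
		rss[entry_is_anon(i)]++;

	/* ... and publish the totals with a single locked update. */
	add_rss(rss);
	printf("file_rss=%ld anon_rss=%ld\n", file_rss, anon_rss);
	return 0;
}
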
@@ -525,24 +549,30 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        return 0;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
 {
+       struct mm_struct *mm = tlb->mm;
        pte_t *pte;
+       spinlock_t *ptl;
+       int file_rss = 0;
+       int anon_rss = 0;
 
-       pte = pte_offset_map(pmd, addr);
+       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        do {
                pte_t ptent = *pte;
                if (pte_none(ptent))
                        continue;
                if (pte_present(ptent)) {
                        struct page *page = NULL;
-                       unsigned long pfn = pte_pfn(ptent);
-                       if (pfn_valid(pfn)) {
-                               page = pfn_to_page(pfn);
-                               if (PageReserved(page))
-                                       page = NULL;
+                       if (!(vma->vm_flags & VM_RESERVED)) {
+                               unsigned long pfn = pte_pfn(ptent);
+                               if (unlikely(!pfn_valid(pfn)))
+                                       print_bad_pte(vma, ptent, addr);
+                               else
+                                       page = pfn_to_page(pfn);
                        }
                        if (unlikely(details) && page) {
                                /*
@@ -562,7 +592,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                                     page->index > details->last_index))
                                        continue;
                        }
-                       ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+                       ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        if (unlikely(!page))
@@ -570,15 +600,17 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                        if (unlikely(details) && details->nonlinear_vma
                            && linear_page_index(details->nonlinear_vma,
                                                addr) != page->index)
-                               set_pte_at(tlb->mm, addr, pte,
+                               set_pte_at(mm, addr, pte,
                                           pgoff_to_pte(page->index));
-                       if (pte_dirty(ptent))
-                               set_page_dirty(page);
                        if (PageAnon(page))
-                               dec_mm_counter(tlb->mm, anon_rss);
-                       else if (pte_young(ptent))
-                               mark_page_accessed(page);
-                       tlb->freed++;
+                               anon_rss--;
+                       else {
+                               if (pte_dirty(ptent))
+                                       set_page_dirty(page);
+                               if (pte_young(ptent))
+                                       mark_page_accessed(page);
+                               file_rss--;
+                       }
                        page_remove_rmap(page);
                        tlb_remove_page(tlb, page);
                        continue;
@@ -591,12 +623,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                        continue;
                if (!pte_file(ptent))
                        free_swap_and_cache(pte_to_swp_entry(ptent));
-               pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
+               pte_clear_full(mm, addr, pte, tlb->fullmm);
        } while (pte++, addr += PAGE_SIZE, addr != end);
-       pte_unmap(pte - 1);
+
+       add_mm_rss(mm, file_rss, anon_rss);
+       pte_unmap_unlock(pte - 1, ptl);
 }
 
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
 {
@@ -608,11 +643,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               zap_pte_range(tlb, pmd, addr, next, details);
+               zap_pte_range(tlb, vma, pmd, addr, next, details);
        } while (pmd++, addr = next, addr != end);
 }
 
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
 {
@@ -624,7 +660,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               zap_pmd_range(tlb, pud, addr, next, details);
+               zap_pmd_range(tlb, vma, pud, addr, next, details);
        } while (pud++, addr = next, addr != end);
 }
 
@@ -645,7 +681,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               zap_pud_range(tlb, pgd, addr, next, details);
+               zap_pud_range(tlb, vma, pgd, addr, next, details);
        } while (pgd++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
 }
@@ -660,7 +696,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlbp: address of the caller's struct mmu_gather
- * @mm: the controlling mm_struct
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
@@ -669,10 +704,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  *
  * Returns the end address of the unmapping (restart addr if interrupted).
  *
- * Unmap all pages in the vma list.  Called under page_table_lock.
+ * Unmap all pages in the vma list.
  *
- * We aim to not hold page_table_lock for too long (for scheduling latency
- * reasons).  So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
+ * We aim to not hold locks for too long (for scheduling latency reasons).
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
  * return the ending mmu_gather to the caller.
  *
  * Only addresses between `start' and `end' will be unmapped.
@@ -684,7 +719,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlbp,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long *nr_accounted,
                struct zap_details *details)
@@ -694,7 +729,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
        int tlb_start_valid = 0;
        unsigned long start = start_addr;
        spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-       int fullmm = tlb_is_full_mm(*tlbp);
+       int fullmm = (*tlbp)->fullmm;
 
        for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
                unsigned long end;
@@ -734,19 +769,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
                        tlb_finish_mmu(*tlbp, tlb_start, start);
 
                        if (need_resched() ||
-                               need_lockbreak(&mm->page_table_lock) ||
                                (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
                                if (i_mmap_lock) {
-                                       /* must reset count of rss freed */
-                                       *tlbp = tlb_gather_mmu(mm, fullmm);
+                                       *tlbp = NULL;
                                        goto out;
                                }
-                               spin_unlock(&mm->page_table_lock);
                                cond_resched();
-                               spin_lock(&mm->page_table_lock);
                        }
 
-                       *tlbp = tlb_gather_mmu(mm, fullmm);
+                       *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
                        tlb_start_valid = 0;
                        zap_bytes = ZAP_BLOCK_SIZE;
                }
@@ -770,123 +801,93 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
        unsigned long end = address + size;
        unsigned long nr_accounted = 0;
 
-       if (is_vm_hugetlb_page(vma)) {
-               zap_hugepage_range(vma, address, size);
-               return end;
-       }
-
        lru_add_drain();
-       spin_lock(&mm->page_table_lock);
        tlb = tlb_gather_mmu(mm, 0);
-       end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
-       tlb_finish_mmu(tlb, address, end);
-       spin_unlock(&mm->page_table_lock);
+       update_hiwater_rss(mm);
+       end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+       if (tlb)
+               tlb_finish_mmu(tlb, address, end);
        return end;
 }
 
 /*
  * Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
  */
-static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
-                       int read, int write, int accessed)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+                       unsigned int flags)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;
+       spinlock_t *ptl;
        unsigned long pfn;
        struct page *page;
 
-       page = follow_huge_addr(mm, address, write);
-       if (! IS_ERR(page))
-               return page;
+       page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+       if (!IS_ERR(page)) {
+               BUG_ON(flags & FOLL_GET);
+               goto out;
+       }
 
+       page = NULL;
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-               goto out;
+               goto no_page_table;
 
        pud = pud_offset(pgd, address);
        if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-               goto out;
+               goto no_page_table;
        
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+               goto no_page_table;
+
+       if (pmd_huge(*pmd)) {
+               BUG_ON(flags & FOLL_GET);
+               page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
                goto out;
-       if (pmd_huge(*pmd))
-               return follow_huge_pmd(mm, address, pmd, write);
+       }
 
-       ptep = pte_offset_map(pmd, address);
+       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!ptep)
                goto out;
 
        pte = *ptep;
-       pte_unmap(ptep);
-       if (pte_present(pte)) {
-               if (write && !pte_write(pte))
-                       goto out;
-               if (read && !pte_read(pte))
-                       goto out;
-               pfn = pte_pfn(pte);
-               if (pfn_valid(pfn)) {
-                       page = pfn_to_page(pfn);
-                       if (accessed) {
-                               if (write && !pte_dirty(pte) &&!PageDirty(page))
-                                       set_page_dirty(page);
-                               mark_page_accessed(page);
-                       }
-                       return page;
-               }
+       if (!pte_present(pte))
+               goto unlock;
+       if ((flags & FOLL_WRITE) && !pte_write(pte))
+               goto unlock;
+       pfn = pte_pfn(pte);
+       if (!pfn_valid(pfn))
+               goto unlock;
+
+       page = pfn_to_page(pfn);
+       if (flags & FOLL_GET)
+               get_page(page);
+       if (flags & FOLL_TOUCH) {
+               if ((flags & FOLL_WRITE) &&
+                   !pte_dirty(pte) && !PageDirty(page))
+                       set_page_dirty(page);
+               mark_page_accessed(page);
        }
-
+unlock:
+       pte_unmap_unlock(ptep, ptl);
 out:
-       return NULL;
-}
-
-inline struct page *
-follow_page(struct mm_struct *mm, unsigned long address, int write)
-{
-       return __follow_page(mm, address, 0, write, 1);
-}
-
-/*
- * check_user_page_readable() can be called from interrupt context by oprofile,
- * so we need to avoid taking any non-irq-safe locks
- */
-int check_user_page_readable(struct mm_struct *mm, unsigned long address)
-{
-       return __follow_page(mm, address, 1, 0, 0) != NULL;
-}
-EXPORT_SYMBOL(check_user_page_readable);
-
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
-                        unsigned long address)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-
-       /* Check if the vma is for an anonymous mapping. */
-       if (vma->vm_ops && vma->vm_ops->nopage)
-               return 0;
-
-       /* Check if page directory entry exists. */
-       pgd = pgd_offset(mm, address);
-       if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-               return 1;
-
-       pud = pud_offset(pgd, address);
-       if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-               return 1;
-
-       /* Check if page middle directory entry exists. */
-       pmd = pmd_offset(pud, address);
-       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-               return 1;
+       return page;
 
-       /* There is a pte slot for 'address' in 'mm'. */
-       return 0;
+no_page_table:
+       /*
+        * When core dumping an enormous anonymous area that nobody
+        * has touched so far, we don't want to allocate page tables.
+        */
+       if (flags & FOLL_ANON) {
+               page = ZERO_PAGE(address);
+               if (flags & FOLL_GET)
+                       get_page(page);
+               BUG_ON(flags & FOLL_WRITE);
+       }
+       return page;
 }
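
With this change follow_page() takes a single flags word in place of the old read/write/accessed arguments; the hunk shows it honouring FOLL_WRITE (require a writable pte), FOLL_GET (take a reference), FOLL_TOUCH (mark accessed, and dirty on write) and FOLL_ANON (hand back ZERO_PAGE when no page table exists). The sketch below shows how a get_user_pages()-style caller might compose those flags, mirroring the logic in the next hunk; the numeric bit values and the helper itself are illustrative assumptions, not copied from the kernel headers.

#include <stdio.h>

/* Bit values here are placeholders for illustration, not taken from mm.h. */
#define FOLL_WRITE  0x01	/* fail unless the pte is writable */
#define FOLL_TOUCH  0x02	/* mark the page accessed (dirty on write) */
#define FOLL_GET    0x04	/* take a reference on the returned page */
#define FOLL_ANON   0x08	/* untouched anon area: return ZERO_PAGE */

/* Compose follow_page() flags the way get_user_pages() does below. */
static unsigned int make_foll_flags(int write, int want_pages,
				    int vma_locked, int vma_has_nopage)
{
	unsigned int flags = FOLL_TOUCH;

	if (want_pages)
		flags |= FOLL_GET;
	if (!write && !vma_locked && !vma_has_nopage)
		flags |= FOLL_ANON;
	if (write)
		flags |= FOLL_WRITE;
	return flags;
}

int main(void)
{
	printf("flags for a read of an untouched anon vma: %#x\n",
	       make_foll_flags(0, 1, 0, 0));
	return 0;
}
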
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +895,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                struct page **pages, struct vm_area_struct **vmas)
 {
        int i;
-       unsigned int flags;
+       unsigned int vm_flags;
 
        /* 
         * Require read or write permissions.
         * If 'force' is set, we only require the "MAY" flags.
         */
-       flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-       flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+       vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+       vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
        i = 0;
 
        do {
-               struct vm_area_struct * vma;
+               struct vm_area_struct *vma;
+               unsigned int foll_flags;
 
                vma = find_extend_vma(mm, start);
                if (!vma && in_gate_area(tsk, start)) {
@@ -945,8 +947,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        continue;
                }
 
-               if (!vma || (vma->vm_flags & VM_IO)
-                               || !(flags & vma->vm_flags))
+               if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
+                               || !(vm_flags & vma->vm_flags))
                        return i ? : -EFAULT;
 
                if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +956,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                                &start, &len, i);
                        continue;
                }
-               spin_lock(&mm->page_table_lock);
+
+               foll_flags = FOLL_TOUCH;
+               if (pages)
+                       foll_flags |= FOLL_GET;
+               if (!write && !(vma->vm_flags & VM_LOCKED) &&
+                   (!vma->vm_ops || !vma->vm_ops->nopage))
+                       foll_flags |= FOLL_ANON;
+
                do {
-                       int write_access = write;
                        struct page *page;
 
-                       cond_resched_lock(&mm->page_table_lock);
-                       while (!(page = follow_page(mm, start, write_access))) {
-                               int ret;
-
-                               /*
-                                * Shortcut for anonymous pages. We don't want
-                                * to force the creation of pages tables for
-                                * insanely big anonymously mapped areas that
-                                * nobody touched so far. This is important
-                                * for doing a core dump for these mappings.
-                                */
-                               if (!write && untouched_anonymous_page(mm,vma,start)) {
-                                       page = ZERO_PAGE(start);
-                                       break;
-                               }
-                               spin_unlock(&mm->page_table_lock);
-                               ret = __handle_mm_fault(mm, vma, start, write_access);
+                       if (write)
+                               foll_flags |= FOLL_WRITE;
 
+                       cond_resched();
+                       while (!(page = follow_page(mm, start, foll_flags))) {
+                               int ret;
+                               ret = __handle_mm_fault(mm, vma, start,
+                                               foll_flags & FOLL_WRITE);
                                /*
                                 * The VM_FAULT_WRITE bit tells us that do_wp_page has
                                 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +982,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                 * subsequent page lookups as if they were reads.
                                 */
                                if (ret & VM_FAULT_WRITE)
-                                       write_access = 0;
+                                       foll_flags &= ~FOLL_WRITE;
                                
                                switch (ret & ~VM_FAULT_WRITE) {
                                case VM_FAULT_MINOR:
@@ -1000,13 +998,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                default:
                                        BUG();
                                }
-                               spin_lock(&mm->page_table_lock);
                        }
                        if (pages) {
                                pages[i] = page;
                                flush_dcache_page(page);
-                               if (!PageReserved(page))
-                                       page_cache_get(page);
                        }
                        if (vmas)
                                vmas[i] = vma;
@@ -1014,7 +1009,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        start += PAGE_SIZE;
                        len--;
                } while (len && start < vma->vm_end);
-               spin_unlock(&mm->page_table_lock);
        } while (len);
        return i;
 }
@@ -1024,16 +1018,21 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, unsigned long end, pgprot_t prot)
 {
        pte_t *pte;
+       spinlock_t *ptl;
 
-       pte = pte_alloc_map(mm, pmd, addr);
+       pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return -ENOMEM;
        do {
-               pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+               struct page *page = ZERO_PAGE(addr);
+               pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+               page_cache_get(page);
+               page_add_file_rmap(page);
+               inc_mm_counter(mm, file_rss);
                BUG_ON(!pte_none(*pte));
                set_pte_at(mm, addr, pte, zero_pte);
        } while (pte++, addr += PAGE_SIZE, addr != end);
-       pte_unmap(pte - 1);
+       pte_unmap_unlock(pte - 1, ptl);
        return 0;
 }
 
@@ -1083,14 +1082,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
        BUG_ON(addr >= end);
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
-       spin_lock(&mm->page_table_lock);
        do {
                next = pgd_addr_end(addr, end);
                err = zeromap_pud_range(mm, pgd, addr, next, prot);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
-       spin_unlock(&mm->page_table_lock);
        return err;
 }
 
@@ -1104,17 +1101,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long pfn, pgprot_t prot)
 {
        pte_t *pte;
+       spinlock_t *ptl;
 
-       pte = pte_alloc_map(mm, pmd, addr);
+       pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return -ENOMEM;
        do {
                BUG_ON(!pte_none(*pte));
-               if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
-                       set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+               set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
-       pte_unmap(pte - 1);
+       pte_unmap_unlock(pte - 1, ptl);
        return 0;
 }
 
@@ -1173,8 +1170,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
         * rest of the world about it:
         *   VM_IO tells people not to look at these pages
         *      (accesses can have side effects).
-        *   VM_RESERVED tells swapout not to try to touch
-        *      this region.
+        *   VM_RESERVED tells the core MM not to "manage" these pages
+        *      (e.g. refcount, mapcount, try to swap them out).
         */
        vma->vm_flags |= VM_IO | VM_RESERVED;
 
@@ -1182,7 +1179,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
        pfn -= addr >> PAGE_SHIFT;
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
-       spin_lock(&mm->page_table_lock);
        do {
                next = pgd_addr_end(addr, end);
                err = remap_pud_range(mm, pgd, addr, next,
@@ -1190,11 +1186,35 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
-       spin_unlock(&mm->page_table_lock);
        return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+/*
+ * handle_pte_fault chooses page fault handler according to an entry
+ * which was read non-atomically.  Before making any commitment, on
+ * those architectures or configurations (e.g. i386 with PAE) which
+ * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * must check under lock before unmapping the pte and proceeding
+ * (but do_wp_page is only called after already making such a check;
+ * and do_anonymous_page and do_no_page can safely check later on).
+ */
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
+                               pte_t *page_table, pte_t orig_pte)
+{
+       int same = 1;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+       if (sizeof(pte_t) > sizeof(unsigned long)) {
+               spinlock_t *ptl = pte_lockptr(mm, pmd);
+               spin_lock(ptl);
+               same = pte_same(*page_table, orig_pte);
+               spin_unlock(ptl);
+       }
+#endif
+       pte_unmap(page_table);
+       return same;
+}
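
pte_unmap_same() above guards against the case where a pte is wider than a machine word (e.g. i386 with PAE), so the unlocked read done by handle_pte_fault() may observe a mix of two different writes. Below is a small, hypothetical userspace sketch of that recheck-under-lock idea; the struct, the mutex and the helper names are illustrative only.

#include <pthread.h>
#include <stdint.h>
#include <string.h>

/* A "pte" wider than the word size of a 32-bit CPU. */
typedef struct { uint32_t lo, hi; } pte_t;

static pte_t table_entry;
static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;	/* stand-in for the pte lock */

/* Unlocked read: on real hardware the two halves may come from
 * different writes if another CPU updates the entry in between. */
static pte_t read_unlocked(void) { return table_entry; }

/* Recheck the snapshot under the lock before committing, as
 * pte_unmap_same() does when sizeof(pte_t) > sizeof(long). */
static int pte_same_locked(pte_t snapshot)
{
	int same;

	pthread_mutex_lock(&ptl);
	same = memcmp(&table_entry, &snapshot, sizeof(pte_t)) == 0;
	pthread_mutex_unlock(&ptl);
	return same;
}

int main(void)
{
	pte_t orig = read_unlocked();

	if (!pte_same_locked(orig))
		return 0;	/* raced with an update: bail out and let the fault retry */
	/* ... safe to act on 'orig' here ... */
	return 0;
}
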
+
 /*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
  * servicing faults for write access.  In the normal case, do always want
@@ -1208,29 +1228,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
        return pte;
 }
 
-/*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
-               pte_t *page_table)
-{
-       pte_t entry;
-
-       entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
-                             vma);
-       ptep_establish(vma, address, page_table, entry);
-       update_mmu_cache(vma, address, entry);
-       lazy_mmu_prot_update(entry);
-}
-
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
  *
- * Goto-purists beware: the only reason for goto's here is that it results
- * in better assembly code.. The "default" path will see no jumps at all.
- *
  * Note that this routine assumes that the protection checks have been
  * done by the caller (the low-level page fault routine in most cases).
  * Thus we can safely just mark it writable once we've done any necessary
@@ -1240,28 +1242,28 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page
  * change only once the write actually happens. This avoids a few races,
  * and potentially makes it more efficient.
  *
- * We hold the mm semaphore and the page_table_lock on entry and exit
- * with the page_table_lock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
-       unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               spinlock_t *ptl, pte_t orig_pte)
 {
        struct page *old_page, *new_page;
-       unsigned long pfn = pte_pfn(pte);
+       unsigned long pfn = pte_pfn(orig_pte);
        pte_t entry;
-       int ret;
+       int ret = VM_FAULT_MINOR;
+
+       BUG_ON(vma->vm_flags & VM_RESERVED);
 
        if (unlikely(!pfn_valid(pfn))) {
                /*
-                * This should really halt the system so it can be debugged or
-                * at least the kernel stops what it's doing before it corrupts
-                * data, but for the moment just pretend this is OOM.
+                * Page table corrupted: show pte and kill process.
                 */
-               pte_unmap(page_table);
-               printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
-                               address);
-               spin_unlock(&mm->page_table_lock);
-               return VM_FAULT_OOM;
+               print_bad_pte(vma, orig_pte, address);
+               ret = VM_FAULT_OOM;
+               goto unlock;
        }
        old_page = pfn_to_page(pfn);
 
@@ -1270,52 +1272,51 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
                unlock_page(old_page);
                if (reuse) {
                        flush_cache_page(vma, address, pfn);
-                       entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
-                                             vma);
+                       entry = pte_mkyoung(orig_pte);
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        ptep_set_access_flags(vma, address, page_table, entry, 1);
                        update_mmu_cache(vma, address, entry);
                        lazy_mmu_prot_update(entry);
-                       pte_unmap(page_table);
-                       spin_unlock(&mm->page_table_lock);
-                       return VM_FAULT_MINOR|VM_FAULT_WRITE;
+                       ret |= VM_FAULT_WRITE;
+                       goto unlock;
                }
        }
-       pte_unmap(page_table);
 
        /*
         * Ok, we need to copy. Oh, well..
         */
-       if (!PageReserved(old_page))
-               page_cache_get(old_page);
-       spin_unlock(&mm->page_table_lock);
+       page_cache_get(old_page);
+       pte_unmap_unlock(page_table, ptl);
 
        if (unlikely(anon_vma_prepare(vma)))
-               goto no_new_page;
+               goto oom;
        if (old_page == ZERO_PAGE(address)) {
                new_page = alloc_zeroed_user_highpage(vma, address);
                if (!new_page)
-                       goto no_new_page;
+                       goto oom;
        } else {
                new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
                if (!new_page)
-                       goto no_new_page;
+                       goto oom;
                copy_user_highpage(new_page, old_page, address);
        }
+
        /*
         * Re-check the pte - we dropped the lock
         */
-       ret = VM_FAULT_MINOR;
-       spin_lock(&mm->page_table_lock);
-       page_table = pte_offset_map(pmd, address);
-       if (likely(pte_same(*page_table, pte))) {
-               if (PageAnon(old_page))
-                       dec_mm_counter(mm, anon_rss);
-               if (PageReserved(old_page))
-                       inc_mm_counter(mm, rss);
-               else
-                       page_remove_rmap(old_page);
+       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (likely(pte_same(*page_table, orig_pte))) {
+               page_remove_rmap(old_page);
+               if (!PageAnon(old_page)) {
+                       inc_mm_counter(mm, anon_rss);
+                       dec_mm_counter(mm, file_rss);
+               }
                flush_cache_page(vma, address, pfn);
-               break_cow(vma, new_page, address, page_table);
+               entry = mk_pte(new_page, vma->vm_page_prot);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               ptep_establish(vma, address, page_table, entry);
+               update_mmu_cache(vma, address, entry);
+               lazy_mmu_prot_update(entry);
                lru_cache_add_active(new_page);
                page_add_anon_rmap(new_page, vma, address);
 
@@ -1323,13 +1324,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
                new_page = old_page;
                ret |= VM_FAULT_WRITE;
        }
-       pte_unmap(page_table);
        page_cache_release(new_page);
        page_cache_release(old_page);
-       spin_unlock(&mm->page_table_lock);
+unlock:
+       pte_unmap_unlock(page_table, ptl);
        return ret;
-
-no_new_page:
+oom:
        page_cache_release(old_page);
        return VM_FAULT_OOM;
 }
@@ -1399,13 +1399,6 @@ again:
 
        restart_addr = zap_page_range(vma, start_addr,
                                        end_addr - start_addr, details);
-
-       /*
-        * We cannot rely on the break test in unmap_vmas:
-        * on the one hand, we don't want to restart our loop
-        * just because that broke out for the page_table_lock;
-        * on the other hand, it does no test when vma is small.
-        */
        need_break = need_resched() ||
                        need_lockbreak(details->i_mmap_lock);
 
@@ -1654,38 +1647,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
 }
 
 /*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_swap_page(struct mm_struct * mm,
-       struct vm_area_struct * vma, unsigned long address,
-       pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               int write_access, pte_t orig_pte)
 {
+       spinlock_t *ptl;
        struct page *page;
-       swp_entry_t entry = pte_to_swp_entry(orig_pte);
+       swp_entry_t entry;
        pte_t pte;
        int ret = VM_FAULT_MINOR;
 
-       pte_unmap(page_table);
-       spin_unlock(&mm->page_table_lock);
+       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+               goto out;
+
+       entry = pte_to_swp_entry(orig_pte);
        page = lookup_swap_cache(entry);
        if (!page) {
                swapin_readahead(entry, address, vma);
                page = read_swap_cache_async(entry, vma, address);
                if (!page) {
                        /*
-                        * Back out if somebody else faulted in this pte while
-                        * we released the page table lock.
+                        * Back out if somebody else faulted in this pte
+                        * while we released the pte lock.
                         */
-                       spin_lock(&mm->page_table_lock);
-                       page_table = pte_offset_map(pmd, address);
+                       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
                        if (likely(pte_same(*page_table, orig_pte)))
                                ret = VM_FAULT_OOM;
-                       else
-                               ret = VM_FAULT_MINOR;
-                       pte_unmap(page_table);
-                       spin_unlock(&mm->page_table_lock);
-                       goto out;
+                       goto unlock;
                }
 
                /* Had to read the page from swap area: Major fault */
@@ -1698,15 +1690,11 @@ static int do_swap_page(struct mm_struct * mm,
        lock_page(page);
 
        /*
-        * Back out if somebody else faulted in this pte while we
-        * released the page table lock.
+        * Back out if somebody else already faulted in this pte.
         */
-       spin_lock(&mm->page_table_lock);
-       page_table = pte_offset_map(pmd, address);
-       if (unlikely(!pte_same(*page_table, orig_pte))) {
-               ret = VM_FAULT_MINOR;
+       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (unlikely(!pte_same(*page_table, orig_pte)))
                goto out_nomap;
-       }
 
        if (unlikely(!PageUptodate(page))) {
                ret = VM_FAULT_SIGBUS;
@@ -1715,7 +1703,7 @@ static int do_swap_page(struct mm_struct * mm,
 
        /* The page isn't present yet, go ahead with the fault. */
 
-       inc_mm_counter(mm, rss);
+       inc_mm_counter(mm, anon_rss);
        pte = mk_pte(page, vma->vm_page_prot);
        if (write_access && can_share_swap_page(page)) {
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1733,7 +1721,7 @@ static int do_swap_page(struct mm_struct * mm,
 
        if (write_access) {
                if (do_wp_page(mm, vma, address,
-                               page_table, pmd, pte) == VM_FAULT_OOM)
+                               page_table, pmd, ptl, pte) == VM_FAULT_OOM)
                        ret = VM_FAULT_OOM;
                goto out;
        }
@@ -1741,74 +1729,76 @@ static int do_swap_page(struct mm_struct * mm,
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, address, pte);
        lazy_mmu_prot_update(pte);
-       pte_unmap(page_table);
-       spin_unlock(&mm->page_table_lock);
+unlock:
+       pte_unmap_unlock(page_table, ptl);
 out:
        return ret;
 out_nomap:
-       pte_unmap(page_table);
-       spin_unlock(&mm->page_table_lock);
+       pte_unmap_unlock(page_table, ptl);
        unlock_page(page);
        page_cache_release(page);
-       goto out;
+       return ret;
 }
 
 /*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs. 
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int
-do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-               pte_t *page_table, pmd_t *pmd, int write_access,
-               unsigned long addr)
+static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               int write_access)
 {
+       struct page *page;
+       spinlock_t *ptl;
        pte_t entry;
-       struct page * page = ZERO_PAGE(addr);
-
-       /* Read-only mapping of ZERO_PAGE. */
-       entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 
-       /* ..except if it's a write access */
        if (write_access) {
                /* Allocate our own private page. */
                pte_unmap(page_table);
-               spin_unlock(&mm->page_table_lock);
 
                if (unlikely(anon_vma_prepare(vma)))
-                       goto no_mem;
-               page = alloc_zeroed_user_highpage(vma, addr);
+                       goto oom;
+               page = alloc_zeroed_user_highpage(vma, address);
                if (!page)
-                       goto no_mem;
+                       goto oom;
 
-               spin_lock(&mm->page_table_lock);
-               page_table = pte_offset_map(pmd, addr);
+               entry = mk_pte(page, vma->vm_page_prot);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
-               if (!pte_none(*page_table)) {
-                       pte_unmap(page_table);
-                       page_cache_release(page);
-                       spin_unlock(&mm->page_table_lock);
-                       goto out;
-               }
-               inc_mm_counter(mm, rss);
-               entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
-                                                        vma->vm_page_prot)),
-                                     vma);
+               page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+               if (!pte_none(*page_table))
+                       goto release;
+               inc_mm_counter(mm, anon_rss);
                lru_cache_add_active(page);
                SetPageReferenced(page);
-               page_add_anon_rmap(page, vma, addr);
+               page_add_anon_rmap(page, vma, address);
+       } else {
+               /* Map the ZERO_PAGE - vm_page_prot is readonly */
+               page = ZERO_PAGE(address);
+               page_cache_get(page);
+               entry = mk_pte(page, vma->vm_page_prot);
+
+               ptl = pte_lockptr(mm, pmd);
+               spin_lock(ptl);
+               if (!pte_none(*page_table))
+                       goto release;
+               inc_mm_counter(mm, file_rss);
+               page_add_file_rmap(page);
        }
 
-       set_pte_at(mm, addr, page_table, entry);
-       pte_unmap(page_table);
+       set_pte_at(mm, address, page_table, entry);
 
        /* No need to invalidate - it was non-present before */
-       update_mmu_cache(vma, addr, entry);
+       update_mmu_cache(vma, address, entry);
        lazy_mmu_prot_update(entry);
-       spin_unlock(&mm->page_table_lock);
-out:
+unlock:
+       pte_unmap_unlock(page_table, ptl);
        return VM_FAULT_MINOR;
-no_mem:
+release:
+       page_cache_release(page);
+       goto unlock;
+oom:
        return VM_FAULT_OOM;
 }
 
@@ -1821,25 +1811,23 @@ no_mem:
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
  *
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int
-do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-       unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               int write_access)
 {
-       struct page * new_page;
+       spinlock_t *ptl;
+       struct page *new_page;
        struct address_space *mapping = NULL;
        pte_t entry;
        unsigned int sequence = 0;
        int ret = VM_FAULT_MINOR;
        int anon = 0;
 
-       if (!vma->vm_ops || !vma->vm_ops->nopage)
-               return do_anonymous_page(mm, vma, page_table,
-                                       pmd, write_access, address);
        pte_unmap(page_table);
-       spin_unlock(&mm->page_table_lock);
 
        if (vma->vm_file) {
                mapping = vma->vm_file->f_mapping;
@@ -1847,7 +1835,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                smp_rmb(); /* serializes i_size against truncate_count */
        }
 retry:
-       cond_resched();
        new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
        /*
         * No smp_rmb is needed here as long as there's a full
@@ -1880,19 +1867,20 @@ retry:
                anon = 1;
        }
 
-       spin_lock(&mm->page_table_lock);
+       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        /*
         * For a file-backed vma, someone could have truncated or otherwise
         * invalidated this page.  If unmap_mapping_range got called,
         * retry getting the page.
         */
        if (mapping && unlikely(sequence != mapping->truncate_count)) {
-               sequence = mapping->truncate_count;
-               spin_unlock(&mm->page_table_lock);
+               pte_unmap_unlock(page_table, ptl);
                page_cache_release(new_page);
+               cond_resched();
+               sequence = mapping->truncate_count;
+               smp_rmb();
                goto retry;
        }
-       page_table = pte_offset_map(pmd, address);
 
        /*
         * This silly early PAGE_DIRTY setting removes a race
@@ -1906,68 +1894,67 @@ retry:
         */
        /* Only go through if we didn't race with anybody else... */
        if (pte_none(*page_table)) {
-               if (!PageReserved(new_page))
-                       inc_mm_counter(mm, rss);
-
                flush_icache_page(vma, new_page);
                entry = mk_pte(new_page, vma->vm_page_prot);
                if (write_access)
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                set_pte_at(mm, address, page_table, entry);
                if (anon) {
+                       inc_mm_counter(mm, anon_rss);
                        lru_cache_add_active(new_page);
                        page_add_anon_rmap(new_page, vma, address);
-               } else
+               } else if (!(vma->vm_flags & VM_RESERVED)) {
+                       inc_mm_counter(mm, file_rss);
                        page_add_file_rmap(new_page);
-               pte_unmap(page_table);
+               }
        } else {
                /* One of our sibling threads was faster, back out. */
-               pte_unmap(page_table);
                page_cache_release(new_page);
-               spin_unlock(&mm->page_table_lock);
-               goto out;
+               goto unlock;
        }
 
        /* no need to invalidate: a not-present page shouldn't be cached */
        update_mmu_cache(vma, address, entry);
        lazy_mmu_prot_update(entry);
-       spin_unlock(&mm->page_table_lock);
-out:
+unlock:
+       pte_unmap_unlock(page_table, ptl);
        return ret;
 oom:
        page_cache_release(new_page);
-       ret = VM_FAULT_OOM;
-       goto out;
+       return VM_FAULT_OOM;
 }
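
The retry path in do_no_page() above samples mapping->truncate_count before calling ->nopage() and compares it again after retaking the pte lock, throwing the new page away and retrying if a truncation ran in between. Below is a minimal userspace sketch of that sample/recheck/retry pattern; the names, the mutex and the memory-barrier stand-in are assumptions for illustration only.

#include <pthread.h>

static unsigned int truncate_count;		/* bumped by the "truncate" side */
static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;

/* Pretend to do the slow, sleeping work (read the page in, allocate, ...). */
static void slow_work(void) { }

static void fault_like_path(void)
{
	unsigned int sequence;

retry:
	sequence = truncate_count;	/* sample before the unlocked slow path */
	__sync_synchronize();		/* stand-in for smp_rmb() */

	slow_work();

	pthread_mutex_lock(&ptl);
	if (sequence != truncate_count) {
		/* a truncate ran while we slept: discard the work and retry */
		pthread_mutex_unlock(&ptl);
		goto retry;
	}
	/* ... install the result under the lock ... */
	pthread_mutex_unlock(&ptl);
}

int main(void)
{
	fault_like_path();
	return 0;
}
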
 
 /*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
  * nonlinear vmas.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
-       unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               int write_access, pte_t orig_pte)
 {
-       unsigned long pgoff;
+       pgoff_t pgoff;
        int err;
 
-       BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
-       /*
-        * Fall back to the linear mapping if the fs does not support
-        * ->populate:
-        */
-       if (!vma->vm_ops->populate ||
-                       (write_access && !(vma->vm_flags & VM_SHARED))) {
-               pte_clear(mm, address, pte);
-               return do_no_page(mm, vma, address, write_access, pte, pmd);
-       }
-
-       pgoff = pte_to_pgoff(*pte);
+       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+               return VM_FAULT_MINOR;
 
-       pte_unmap(pte);
-       spin_unlock(&mm->page_table_lock);
+       if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+               /*
+                * Page table corrupted: show pte and kill process.
+                */
+               print_bad_pte(vma, orig_pte, address);
+               return VM_FAULT_OOM;
+       }
+       /* We can then assume vma->vm_ops && vma->vm_ops->populate */
 
-       err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+       pgoff = pte_to_pgoff(orig_pte);
+       err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
+                                       vma->vm_page_prot, pgoff, 0);
        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        if (err)
@@ -1984,56 +1971,68 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
-       struct vm_area_struct * vma, unsigned long address,
-       int write_access, pte_t *pte, pmd_t *pmd)
+               struct vm_area_struct *vma, unsigned long address,
+               pte_t *pte, pmd_t *pmd, int write_access)
 {
        pte_t entry;
+       pte_t old_entry;
+       spinlock_t *ptl;
 
-       entry = *pte;
+       old_entry = entry = *pte;
        if (!pte_present(entry)) {
-               /*
-                * If it truly wasn't present, we know that kswapd
-                * and the PTE updates will not touch it later. So
-                * drop the lock.
-                */
-               if (pte_none(entry))
-                       return do_no_page(mm, vma, address, write_access, pte, pmd);
+               if (pte_none(entry)) {
+                       if (!vma->vm_ops || !vma->vm_ops->nopage)
+                               return do_anonymous_page(mm, vma, address,
+                                       pte, pmd, write_access);
+                       return do_no_page(mm, vma, address,
+                                       pte, pmd, write_access);
+               }
                if (pte_file(entry))
-                       return do_file_page(mm, vma, address, write_access, pte, pmd);
-               return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+                       return do_file_page(mm, vma, address,
+                                       pte, pmd, write_access, entry);
+               return do_swap_page(mm, vma, address,
+                                       pte, pmd, write_access, entry);
        }
 
+       ptl = pte_lockptr(mm, pmd);
+       spin_lock(ptl);
+       if (unlikely(!pte_same(*pte, entry)))
+               goto unlock;
        if (write_access) {
                if (!pte_write(entry))
-                       return do_wp_page(mm, vma, address, pte, pmd, entry);
+                       return do_wp_page(mm, vma, address,
+                                       pte, pmd, ptl, entry);
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
-       ptep_set_access_flags(vma, address, pte, entry, write_access);
-       update_mmu_cache(vma, address, entry);
-       lazy_mmu_prot_update(entry);
-       pte_unmap(pte);
-       spin_unlock(&mm->page_table_lock);
+       if (!pte_same(old_entry, entry)) {
+               ptep_set_access_flags(vma, address, pte, entry, write_access);
+               update_mmu_cache(vma, address, entry);
+               lazy_mmu_prot_update(entry);
+       } else {
+               /*
+                * This is needed only for protection faults but the arch code
+                * is not yet telling us if this is a protection fault or not.
+                * This still avoids useless tlb flushes for .text page faults
+                * with threads.
+                */
+               if (write_access)
+                       flush_tlb_page(vma, address);
+       }
+unlock:
+       pte_unmap_unlock(pte, ptl);
        return VM_FAULT_MINOR;
 }
 
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, int write_access)
 {
        pgd_t *pgd;
@@ -2048,100 +2047,66 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
        if (unlikely(is_vm_hugetlb_page(vma)))
                return hugetlb_fault(mm, vma, address, write_access);
 
-       /*
-        * We need the page table lock to synchronize with kswapd
-        * and the SMP-safe atomic PTE updates.
-        */
        pgd = pgd_offset(mm, address);
-       spin_lock(&mm->page_table_lock);
-
        pud = pud_alloc(mm, pgd, address);
        if (!pud)
-               goto oom;
-
+               return VM_FAULT_OOM;
        pmd = pmd_alloc(mm, pud, address);
        if (!pmd)
-               goto oom;
-
+               return VM_FAULT_OOM;
        pte = pte_alloc_map(mm, pmd, address);
        if (!pte)
-               goto oom;
-       
-       return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+               return VM_FAULT_OOM;
 
- oom:
-       spin_unlock(&mm->page_table_lock);
-       return VM_FAULT_OOM;
+       return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
  */
-pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-       pud_t *new;
-
-       spin_unlock(&mm->page_table_lock);
-       new = pud_alloc_one(mm, address);
-       spin_lock(&mm->page_table_lock);
+       pud_t *new = pud_alloc_one(mm, address);
        if (!new)
-               return NULL;
+               return -ENOMEM;
 
-       /*
-        * Because we dropped the lock, we should re-check the
-        * entry, as somebody else could have populated it..
-        */
-       if (pgd_present(*pgd)) {
+       spin_lock(&mm->page_table_lock);
+       if (pgd_present(*pgd))          /* Another has populated it */
                pud_free(new);
-               goto out;
-       }
-       pgd_populate(mm, pgd, new);
- out:
-       return pud_offset(pgd, address);
+       else
+               pgd_populate(mm, pgd, new);
+       spin_unlock(&mm->page_table_lock);
+       return 0;
 }
 #endif /* __PAGETABLE_PUD_FOLDED */
 
 #ifndef __PAGETABLE_PMD_FOLDED
 /*
  * Allocate page middle directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
  */
-pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-       pmd_t *new;
-
-       spin_unlock(&mm->page_table_lock);
-       new = pmd_alloc_one(mm, address);
-       spin_lock(&mm->page_table_lock);
+       pmd_t *new = pmd_alloc_one(mm, address);
        if (!new)
-               return NULL;
+               return -ENOMEM;
 
-       /*
-        * Because we dropped the lock, we should re-check the
-        * entry, as somebody else could have populated it..
-        */
+       spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
-       if (pud_present(*pud)) {
+       if (pud_present(*pud))          /* Another has populated it */
                pmd_free(new);
-               goto out;
-       }
-       pud_populate(mm, pud, new);
+       else
+               pud_populate(mm, pud, new);
 #else
-       if (pgd_present(*pud)) {
+       if (pgd_present(*pud))          /* Another has populated it */
                pmd_free(new);
-               goto out;
-       }
-       pgd_populate(mm, pud, new);
+       else
+               pgd_populate(mm, pud, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-
- out:
-       return pmd_offset(pud, address);
+       spin_unlock(&mm->page_table_lock);
+       return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
@@ -2206,22 +2171,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
 
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-/*
- * update_mem_hiwater
- *     - update per process rss and vm high water data
- */
-void update_mem_hiwater(struct task_struct *tsk)
-{
-       if (tsk->mm) {
-               unsigned long rss = get_mm_counter(tsk->mm, rss);
-
-               if (tsk->mm->hiwater_rss < rss)
-                       tsk->mm->hiwater_rss = rss;
-               if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
-                       tsk->mm->hiwater_vm = tsk->mm->total_vm;
-       }
-}
-
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
@@ -2233,7 +2182,7 @@ static int __init gate_vma_init(void)
        gate_vma.vm_start = FIXADDR_USER_START;
        gate_vma.vm_end = FIXADDR_USER_END;
        gate_vma.vm_page_prot = PAGE_READONLY;
-       gate_vma.vm_flags = 0;
+       gate_vma.vm_flags = VM_RESERVED;
        return 0;
 }
 __initcall(gate_vma_init);
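Note: update_mem_hiwater() disappears from mm/memory.c; the high-watermark
bookkeeping is split into per-counter helpers, update_hiwater_rss() and
update_hiwater_vm(), which the later hunks (unmap_region, remove_vma_list,
exit_mmap, move_vma) call just before rss or total_vm is about to drop.  The
include/linux/mm.h side is not shown here; based on the removed code above,
the helpers presumably amount to something like this sketch:

	#define update_hiwater_rss(mm)	do {				\
		unsigned long _rss = get_mm_counter(mm, rss);		\
		if ((mm)->hiwater_rss < _rss)				\
			(mm)->hiwater_rss = _rss;			\
	} while (0)

	#define update_hiwater_vm(mm)	do {				\
		if ((mm)->hiwater_vm < (mm)->total_vm)			\
			(mm)->hiwater_vm = (mm)->total_vm;		\
	} while (0)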
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
new file mode 100644

index 0000000..431a64f
--- /dev/null
@@ -0,0 +1,138 @@
+/*
+ *  linux/mm/memory_hotplug.c
+ *
+ *  Copyright (C)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/pagevec.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+
+extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+                         unsigned long size);
+static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       int nr_pages = PAGES_PER_SECTION;
+       int nid = pgdat->node_id;
+       int zone_type;
+
+       zone_type = zone - pgdat->node_zones;
+       memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+       zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+}
+
+extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+                                 int nr_pages);
+static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       int nr_pages = PAGES_PER_SECTION;
+       int ret;
+
+       ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+
+       if (ret < 0)
+               return ret;
+
+       __add_zone(zone, phys_start_pfn);
+       return register_new_memory(__pfn_to_section(phys_start_pfn));
+}
+
+/*
+ * Reasonably generic function for adding memory.  It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+                unsigned long nr_pages)
+{
+       unsigned long i;
+       int err = 0;
+
+       for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
+               err = __add_section(zone, phys_start_pfn + i);
+
+               if (err)
+                       break;
+       }
+
+       return err;
+}
+
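Note: __add_pages() works in whole sparsemem sections: each
PAGES_PER_SECTION-sized step allocates the section's mem_map
(sparse_add_one_section), initialises its struct pages into the chosen zone
(__add_zone), and registers a sysfs memory block for it.  A hedged sketch of
how an architecture might wire this up from its hot-add entry point; the
function name and the zone choice below are illustrative assumptions, not
part of this patch:

	int arch_add_memory_sketch(int nid, u64 start, u64 size)
	{
		struct pglist_data *pgdat = NODE_DATA(nid);
		struct zone *zone = pgdat->node_zones + ZONE_HIGHMEM;
		unsigned long start_pfn = start >> PAGE_SHIFT;
		unsigned long nr_pages = size >> PAGE_SHIFT;

		return __add_pages(zone, start_pfn, nr_pages);
	}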
+static void grow_zone_span(struct zone *zone,
+               unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long old_zone_end_pfn;
+
+       zone_span_writelock(zone);
+
+       old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+       if (start_pfn < zone->zone_start_pfn)
+               zone->zone_start_pfn = start_pfn;
+
+       if (end_pfn > old_zone_end_pfn)
+               zone->spanned_pages = end_pfn - zone->zone_start_pfn;
+
+       zone_span_writeunlock(zone);
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat,
+               unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long old_pgdat_end_pfn =
+               pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+       if (start_pfn < pgdat->node_start_pfn)
+               pgdat->node_start_pfn = start_pfn;
+
+       if (end_pfn > old_pgdat_end_pfn)
+               pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages;
+}
+
+int online_pages(unsigned long pfn, unsigned long nr_pages)
+{
+       unsigned long i;
+       unsigned long flags;
+       unsigned long onlined_pages = 0;
+       struct zone *zone;
+
+       /*
+        * This doesn't need a lock to do pfn_to_page().
+        * The section can't be removed here because of the
+        * memory_block->state_sem.
+        */
+       zone = page_zone(pfn_to_page(pfn));
+       pgdat_resize_lock(zone->zone_pgdat, &flags);
+       grow_zone_span(zone, pfn, pfn + nr_pages);
+       grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
+       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+       for (i = 0; i < nr_pages; i++) {
+               struct page *page = pfn_to_page(pfn + i);
+               online_page(page);
+               onlined_pages++;
+       }
+       zone->present_pages += onlined_pages;
+
+       setup_per_zone_pages_min();
+
+       return 0;
+}
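Note: online_pages() first widens the zone and node spans under the pgdat
resize lock, then hands every page in the range to the allocator via
online_page() and raises present_pages and the per-zone watermarks.  Per the
comment at the top of the function, the caller is expected to hold the owning
memory_block's state_sem (the sysfs "online" path in the new
drivers/base/memory.c), which is what keeps the section from vanishing while
its pfns are walked.  A hedged sketch of that expected calling pattern, one
section at a time (the function name is illustrative):

	static int online_one_section_sketch(unsigned long section_nr)
	{
		unsigned long start_pfn = section_nr << PFN_SECTION_SHIFT;

		/* caller holds the memory_block's state_sem */
		return online_pages(start_pfn, PAGES_PER_SECTION);
	}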
index 1d5c64d..2076b15 100644
@@ -2,6 +2,7 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
  *                is used.
+ *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
+ *                FIXME: memory is allocated starting with the first node
+ *                to the last. It would be better if bind would truly restrict
+ *                the allocation to memory nodes instead
+ *
  * preferred       Try a specific node first before normal fallback.
  *                As a special case node -1 here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *               in a NUMA aware kernel and still does by, ahem, default.
@@ -93,23 +100,10 @@ struct mempolicy default_policy = {
        .policy = MPOL_DEFAULT,
 };
 
-/* Check if all specified nodes are online */
-static int nodes_online(unsigned long *nodes)
-{
-       DECLARE_BITMAP(online2, MAX_NUMNODES);
-
-       bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
-       if (bitmap_empty(online2, MAX_NUMNODES))
-               set_bit(0, online2);
-       if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
-               return -EINVAL;
-       return 0;
-}
-
 /* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, unsigned long *nodes)
+static int mpol_check_policy(int mode, nodemask_t *nodes)
 {
-       int empty = bitmap_empty(nodes, MAX_NUMNODES);
+       int empty = nodes_empty(*nodes);
 
        switch (mode) {
        case MPOL_DEFAULT:
@@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes)
                        return -EINVAL;
                break;
        }
-       return nodes_online(nodes);
-}
-
-/* Copy a node mask from user space. */
-static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
-                    unsigned long maxnode, int mode)
-{
-       unsigned long k;
-       unsigned long nlongs;
-       unsigned long endmask;
-
-       --maxnode;
-       bitmap_zero(nodes, MAX_NUMNODES);
-       if (maxnode == 0 || !nmask)
-               return 0;
-
-       nlongs = BITS_TO_LONGS(maxnode);
-       if ((maxnode % BITS_PER_LONG) == 0)
-               endmask = ~0UL;
-       else
-               endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
-       /* When the user specified more nodes than supported just check
-          if the non supported part is all zero. */
-       if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-               if (nlongs > PAGE_SIZE/sizeof(long))
-                       return -EINVAL;
-               for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-                       unsigned long t;
-                       if (get_user(t,  nmask + k))
-                               return -EFAULT;
-                       if (k == nlongs - 1) {
-                               if (t & endmask)
-                                       return -EINVAL;
-                       } else if (t)
-                               return -EINVAL;
-               }
-               nlongs = BITS_TO_LONGS(MAX_NUMNODES);
-               endmask = ~0UL;
-       }
-
-       if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
-               return -EFAULT;
-       nodes[nlongs-1] &= endmask;
-       /* Update current mems_allowed */
-       cpuset_update_current_mems_allowed();
-       /* Ignore nodes not set in current->mems_allowed */
-       cpuset_restrict_to_mems_allowed(nodes);
-       return mpol_check_policy(mode, nodes);
+       return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
-
 /* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(unsigned long *nodes)
+static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
        struct zonelist *zl;
        int num, max, nd;
 
-       max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
+       max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
-       for (nd = find_first_bit(nodes, MAX_NUMNODES);
-            nd < MAX_NUMNODES;
-            nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
+       for_each_node_mask(nd, *nodes) {
                int k;
                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes)
                                policy_zone = k;
                }
        }
-       BUG_ON(num >= max);
        zl->zones[num] = NULL;
        return zl;
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
+static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
        struct mempolicy *policy;
 
-       PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
+       PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
-               bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
+               policy->v.nodes = *nodes;
                break;
        case MPOL_PREFERRED:
-               policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
+               policy->v.preferred_node = first_node(*nodes);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
@@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 }
 
 /* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
-               unsigned long addr, unsigned long end, unsigned long *nodes)
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
        pte_t *orig_pte;
        pte_t *pte;
+       spinlock_t *ptl;
 
-       spin_lock(&mm->page_table_lock);
-       orig_pte = pte = pte_offset_map(pmd, addr);
+       orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                unsigned long pfn;
                unsigned int nid;
@@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
                if (!pte_present(*pte))
                        continue;
                pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (!pfn_valid(pfn)) {
+                       print_bad_pte(vma, *pte, addr);
                        continue;
+               }
                nid = pfn_to_nid(pfn);
-               if (!test_bit(nid, nodes))
+               if (!node_isset(nid, *nodes))
                        break;
        } while (pte++, addr += PAGE_SIZE, addr != end);
-       pte_unmap(orig_pte);
-       spin_unlock(&mm->page_table_lock);
+       pte_unmap_unlock(orig_pte, ptl);
        return addr != end;
 }
 
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
-               unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+               unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
        pmd_t *pmd;
        unsigned long next;
@@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               if (check_pte_range(mm, pmd, addr, next, nodes))
+               if (check_pte_range(vma, pmd, addr, next, nodes))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
 }
 
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
-               unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+               unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
        pud_t *pud;
        unsigned long next;
@@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               if (check_pmd_range(mm, pud, addr, next, nodes))
+               if (check_pmd_range(vma, pud, addr, next, nodes))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
 }
 
-static inline int check_pgd_range(struct mm_struct *mm,
-               unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pgd_range(struct vm_area_struct *vma,
+               unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
        pgd_t *pgd;
        unsigned long next;
 
-       pgd = pgd_offset(mm, addr);
+       pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               if (check_pud_range(mm, pgd, addr, next, nodes))
+               if (check_pud_range(vma, pgd, addr, next, nodes))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
@@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm,
 /* Step 1: check the range */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-           unsigned long *nodes, unsigned long flags)
+           nodemask_t *nodes, unsigned long flags)
 {
        int err;
        struct vm_area_struct *first, *vma, *prev;
@@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
+       if (first->vm_flags & VM_RESERVED)
+               return ERR_PTR(-EACCES);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
@@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
-                       err = check_pgd_range(vma->vm_mm,
-                                          start, endvma, nodes);
+                       err = check_pgd_range(vma, start, endvma, nodes);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
@@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
        return err;
 }
 
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
-                         unsigned long mode,
-                         unsigned long __user *nmask, unsigned long maxnode,
-                         unsigned flags)
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+       if (!nodes)
+               return 0;
+
+       /* Update current mems_allowed */
+       cpuset_update_current_mems_allowed();
+       /* Ignore nodes not set in current->mems_allowed */
+       cpuset_restrict_to_mems_allowed(nodes->bits);
+       return mpol_check_policy(mode, nodes);
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+               unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
-       DECLARE_BITMAP(nodes, MAX_NUMNODES);
        int err;
 
        if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                return -EINVAL;
        if (end == start)
                return 0;
-
-       err = get_nodes(nodes, nmask, maxnode, mode);
-       if (err)
-               return err;
-
-       new = mpol_new(mode, nodes);
+       if (mpol_check_policy(mode, nmask))
+               return -EINVAL;
+       new = mpol_new(mode, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);
 
        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-                       mode,nodes[0]);
+                       mode,nodes_addr(nodes)[0]);
 
        down_write(&mm->mmap_sem);
-       vma = check_range(mm, start, end, nodes, flags);
+       vma = check_range(mm, start, end, nmask, flags);
        err = PTR_ERR(vma);
        if (!IS_ERR(vma))
                err = mbind_range(vma, start, end, new);
@@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 }
 
 /* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-                                  unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
-       int err;
        struct mempolicy *new;
-       DECLARE_BITMAP(nodes, MAX_NUMNODES);
 
-       if (mode < 0 || mode > MPOL_MAX)
+       if (contextualize_policy(mode, nodes))
                return -EINVAL;
-       err = get_nodes(nodes, nmask, maxnode, mode);
-       if (err)
-               return err;
        new = mpol_new(mode, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
-               current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+               current->il_next = first_node(new->v.nodes);
        return 0;
 }
 
 /* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
+static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
        int i;
 
-       bitmap_zero(nodes, MAX_NUMNODES);
+       nodes_clear(*nodes);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
-                       __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
+                       node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+                               *nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
-               bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
+               *nodes = p->v.nodes;
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
-                       bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
+                       *nodes = node_online_map;
                else
-                       __set_bit(p->v.preferred_node, nodes);
+                       node_set(p->v.preferred_node, *nodes);
                break;
        default:
                BUG();
@@ -504,37 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
        return err;
 }
 
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
-                             void *nodes, unsigned nbytes)
-{
-       unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-
-       if (copy > nbytes) {
-               if (copy > PAGE_SIZE)
-                       return -EINVAL;
-               if (clear_user((char __user *)mask + nbytes, copy - nbytes))
-                       return -EFAULT;
-               copy = nbytes;
-       }
-       return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
-}
-
 /* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
-                                 unsigned long __user *nmask,
-                                 unsigned long maxnode,
-                                 unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+                       unsigned long addr, unsigned long flags)
 {
-       int err, pval;
+       int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;
 
        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
-       if (nmask != NULL && maxnode < MAX_NUMNODES)
-               return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
@@ -557,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
-                       pval = err;
+                       *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
-                       pval = current->il_next;
+                       *policy = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
-               pval = pol->policy;
+               *policy = pol->policy;
 
        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }
 
-       if (policy && put_user(pval, policy))
-               return -EFAULT;
-
        err = 0;
-       if (nmask) {
-               DECLARE_BITMAP(nodes, MAX_NUMNODES);
-               get_zonemask(pol, nodes);
-               err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
-       }
+       if (nmask)
+               get_zonemask(pol, nmask);
 
  out:
        if (vma)
@@ -589,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
        return err;
 }
 
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+                    unsigned long maxnode)
+{
+       unsigned long k;
+       unsigned long nlongs;
+       unsigned long endmask;
+
+       --maxnode;
+       nodes_clear(*nodes);
+       if (maxnode == 0 || !nmask)
+               return 0;
+
+       nlongs = BITS_TO_LONGS(maxnode);
+       if ((maxnode % BITS_PER_LONG) == 0)
+               endmask = ~0UL;
+       else
+               endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+       /* When the user specified more nodes than supported just check
+          if the non supported part is all zero. */
+       if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+               if (nlongs > PAGE_SIZE/sizeof(long))
+                       return -EINVAL;
+               for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+                       unsigned long t;
+                       if (get_user(t, nmask + k))
+                               return -EFAULT;
+                       if (k == nlongs - 1) {
+                               if (t & endmask)
+                                       return -EINVAL;
+                       } else if (t)
+                               return -EINVAL;
+               }
+               nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+               endmask = ~0UL;
+       }
+
+       if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+               return -EFAULT;
+       nodes_addr(*nodes)[nlongs-1] &= endmask;
+       return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+                             nodemask_t *nodes)
+{
+       unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+       const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+       if (copy > nbytes) {
+               if (copy > PAGE_SIZE)
+                       return -EINVAL;
+               if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+                       return -EFAULT;
+               copy = nbytes;
+       }
+       return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+                       unsigned long mode,
+                       unsigned long __user *nmask, unsigned long maxnode,
+                       unsigned flags)
+{
+       nodemask_t nodes;
+       int err;
+
+       err = get_nodes(&nodes, nmask, maxnode);
+       if (err)
+               return err;
+       return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+               unsigned long maxnode)
+{
+       int err;
+       nodemask_t nodes;
+
+       if (mode < 0 || mode > MPOL_MAX)
+               return -EINVAL;
+       err = get_nodes(&nodes, nmask, maxnode);
+       if (err)
+               return err;
+       return do_set_mempolicy(mode, &nodes);
+}
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+                               unsigned long __user *nmask,
+                               unsigned long maxnode,
+                               unsigned long addr, unsigned long flags)
+{
+       int err, pval;
+       nodemask_t nodes;
+
+       if (nmask != NULL && maxnode < MAX_NUMNODES)
+               return -EINVAL;
+
+       err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+       if (err)
+               return err;
+
+       if (policy && put_user(pval, policy))
+               return -EFAULT;
+
+       if (nmask)
+               err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+       return err;
+}
+
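Note: the syscall entry points above are now thin wrappers: get_nodes() and
copy_nodes_to_user() translate the variable-sized user bitmap to and from a
fixed nodemask_t, and the real work happens in do_mbind(), do_set_mempolicy()
and do_get_mempolicy(), which other kernel code can call directly with a
nodemask_t (numa_policy_init() below no longer has to pass a kernel pointer
through the syscall, and the "Assumes fs == KERNEL_DS" caveat goes away).
From user space the contract is unchanged; a hedged illustration of the
variable-sized mask, assuming the libnuma <numaif.h> wrapper for the mbind
syscall is available:

	#include <stddef.h>
	#include <numaif.h>

	/* bind an existing mapping to node 0 only */
	long bind_to_node0(void *buf, size_t len)
	{
		unsigned long nodemask = 1UL << 0;

		/* maxnode tells the kernel how many bits to read */
		return mbind(buf, len, MPOL_BIND, &nodemask,
			     sizeof(nodemask) * 8, MPOL_MF_STRICT);
	}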
 #ifdef CONFIG_COMPAT
 
 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -649,15 +687,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
-       DECLARE_BITMAP(bm, MAX_NUMNODES);
+       nodemask_t bm;
 
        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
        if (nmask) {
-               err = compat_get_bitmap(bm, nmask, nr_bits);
+               err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
-               err |= copy_to_user(nm, bm, alloc_size);
+               err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
        }
 
        if (err)
@@ -676,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 
        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
-                       pol = vma->vm_ops->get_policy(vma, addr);
+                       pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                                vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
@@ -722,10 +760,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
        struct task_struct *me = current;
 
        nid = me->il_next;
-       BUG_ON(nid >= MAX_NUMNODES);
-       next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
+       next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)
-               next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+               next = first_node(policy->v.nodes);
        me->il_next = next;
        return nid;
 }
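Note: the interleave bookkeeping is converted from raw bitmap calls
(find_first_bit/find_next_bit over MAX_NUMNODES) to the nodemask_t helpers
first_node()/next_node()/nodes_weight(), which wrap exactly those bit
operations, so the round-robin behaviour is unchanged.  The wrap-around step
above, isolated as a sketch:

	/* advance a round-robin cursor within an interleave mask */
	static unsigned next_interleave_node_sketch(unsigned nid,
						    nodemask_t mask)
	{
		unsigned next = next_node(nid, mask);	/* next set node > nid */

		if (next >= MAX_NUMNODES)		/* wrapped past the end */
			next = first_node(mask);
		return next;
	}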
@@ -734,29 +771,27 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
 {
-       unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
+       unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned target = (unsigned)off % nnodes;
        int c;
        int nid = -1;
 
        c = 0;
        do {
-               nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
+               nid = next_node(nid, pol->v.nodes);
                c++;
        } while (c <= target);
-       BUG_ON(nid >= MAX_NUMNODES);
-       BUG_ON(!test_bit(nid, pol->v.nodes));
        return nid;
 }
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+                                       unsigned nid)
 {
        struct zonelist *zl;
        struct page *page;
 
-       BUG_ON(!node_online(nid));
        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
@@ -799,8 +834,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
                unsigned nid;
                if (vma) {
                        unsigned long off;
-                       BUG_ON(addr >= vma->vm_end);
-                       BUG_ON(addr < vma->vm_start);
                        off = vma->vm_pgoff;
                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
                        nid = offset_il_node(pol, vma, off);
@@ -878,7 +911,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
-               return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
+               return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
@@ -1117,7 +1150,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol? npol->policy : -1,
-               npol ? npol->v.nodes[0] : -1);
+               npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1164,14 +1197,12 @@ void __init numa_policy_init(void)
        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */
 
-       if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
-                                                       MAX_NUMNODES) < 0)
+       if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
                printk("numa_policy_init: interleaving failed\n");
 }
 
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
 void numa_default_policy(void)
 {
-       sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+       do_set_mempolicy(MPOL_DEFAULT, NULL);
 }
index fa11d91..5ecc2cf 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -181,26 +181,36 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 }
 
 /*
- * Remove one vm structure and free it.
+ * Unlink a file-based vm structure from its prio_tree, to hide
+ * vma from rmap and vmtruncate before freeing its page tables.
  */
-static void remove_vm_struct(struct vm_area_struct *vma)
+void unlink_file_vma(struct vm_area_struct *vma)
 {
        struct file *file = vma->vm_file;
 
-       might_sleep();
        if (file) {
                struct address_space *mapping = file->f_mapping;
                spin_lock(&mapping->i_mmap_lock);
                __remove_shared_vm_struct(vma, file, mapping);
                spin_unlock(&mapping->i_mmap_lock);
        }
+}
+
+/*
+ * Close a vm structure and free it, returning the next.
+ */
+static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+{
+       struct vm_area_struct *next = vma->vm_next;
+
+       might_sleep();
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
-       if (file)
-               fput(file);
-       anon_vma_unlink(vma);
+       if (vma->vm_file)
+               fput(vma->vm_file);
        mpol_free(vma_policy(vma));
        kmem_cache_free(vm_area_cachep, vma);
+       return next;
 }
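Note: the old remove_vm_struct() is split in two.  unlink_file_vma() only
detaches a file-backed vma from its address_space prio_tree, so rmap and
vmtruncate stop seeing the vma before its page tables are torn down;
remove_vma() later performs the ->close(), fput(), mempolicy free and
kmem_cache_free(), and returns the next vma so the unmap and exit paths can
consume the list with a simple loop, exactly as exit_mmap() does further
down:

	/* sketch: draining a detached vma list with remove_vma() */
	while (vma)
		vma = remove_vma(vma);	/* close, fput, mpol_free, free */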
 
 asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -832,7 +842,7 @@ none:
 }
 
 #ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
+void vm_stat_account(struct mm_struct *mm, unsigned long flags,
                                                struct file *file, long pages)
 {
        const unsigned long stack_flags
@@ -1070,6 +1080,17 @@ munmap_back:
                error = file->f_op->mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;
+               if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+                                               == (VM_WRITE | VM_RESERVED)) {
+                       printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+                               "PROT_WRITE mmap of VM_RESERVED memory, which "
+                               "is deprecated. Please report this to "
+                               "linux-kernel@vger.kernel.org\n",current->comm);
+                       if (vma->vm_ops && vma->vm_ops->close)
+                               vma->vm_ops->close(vma);
+                       error = -EACCES;
+                       goto unmap_and_free_vma;
+               }
        } else if (vm_flags & VM_SHARED) {
                error = shmem_zero_setup(vma);
                if (error)
@@ -1110,7 +1131,7 @@ munmap_back:
        }
 out:   
        mm->total_vm += len >> PAGE_SHIFT;
-       __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+       vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += len >> PAGE_SHIFT;
                make_pages_present(addr, addr + len);
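Note: the new check in the mmap path logs a deprecation warning and fails
with -EACCES when a MAP_PRIVATE, PROT_WRITE mapping ends up on a vma whose
->mmap handler marked it VM_RESERVED; the matching check for mprotect is in
the mm/mprotect.c hunk below.  A hedged user-space illustration of the
now-rejected combination; whether a given device actually sets VM_RESERVED
depends on its driver, so the file descriptor here is purely illustrative:

	#include <stddef.h>
	#include <sys/mman.h>

	/* if the driver's ->mmap marks the vma VM_RESERVED, this now
	 * returns MAP_FAILED with errno EACCES (after the warning) */
	void *map_private_writable(int fd, size_t len)
	{
		return mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE, fd, 0);
	}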
@@ -1475,15 +1496,19 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
        mm->total_vm += grow;
        if (vma->vm_flags & VM_LOCKED)
                mm->locked_vm += grow;
-       __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+       vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
        return 0;
 }
 
-#ifdef CONFIG_STACK_GROWSUP
+#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
 /*
- * vma is the first one with address > vma->vm_end.  Have to extend vma.
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end.  Have to extend vma.
  */
-int expand_stack(struct vm_area_struct * vma, unsigned long address)
+#ifdef CONFIG_STACK_GROWSUP
+static inline
+#endif
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
        int error;
 
@@ -1521,6 +1546,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address)
        anon_vma_unlock(vma);
        return error;
 }
+#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+       return expand_upwards(vma, address);
+}
 
 struct vm_area_struct *
 find_extend_vma(struct mm_struct *mm, unsigned long addr)
@@ -1603,36 +1635,24 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 }
 #endif
 
-/* Normal function to fix up a mapping
- * This function is the default for when an area has no specific
- * function.  This may be used as part of a more specific routine.
- *
- * By the time this function is called, the area struct has been
- * removed from the process mapping list.
- */
-static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
-{
-       size_t len = area->vm_end - area->vm_start;
-
-       area->vm_mm->total_vm -= len >> PAGE_SHIFT;
-       if (area->vm_flags & VM_LOCKED)
-               area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
-       vm_stat_unaccount(area);
-       remove_vm_struct(area);
-}
-
 /*
- * Update the VMA and inode share lists.
- *
- * Ok - we have the memory areas we should free on the 'free' list,
+ * Ok - we have the memory areas we should free on the vma list,
  * so release them, and do the vma updates.
+ *
+ * Called with the mm semaphore held.
  */
-static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 {
+       /* Update high watermark before we lower total_vm */
+       update_hiwater_vm(mm);
        do {
-               struct vm_area_struct *next = vma->vm_next;
-               unmap_vma(mm, vma);
-               vma = next;
+               long nrpages = vma_pages(vma);
+
+               mm->total_vm -= nrpages;
+               if (vma->vm_flags & VM_LOCKED)
+                       mm->locked_vm -= nrpages;
+               vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+               vma = remove_vma(vma);
        } while (vma);
        validate_mm(mm);
 }
@@ -1651,14 +1671,13 @@ static void unmap_region(struct mm_struct *mm,
        unsigned long nr_accounted = 0;
 
        lru_add_drain();
-       spin_lock(&mm->page_table_lock);
        tlb = tlb_gather_mmu(mm, 0);
-       unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
+       update_hiwater_rss(mm);
+       unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
        free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
                                 next? next->vm_start: 0);
        tlb_finish_mmu(tlb, start, end);
-       spin_unlock(&mm->page_table_lock);
 }
 
 /*
@@ -1799,7 +1818,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
        unmap_region(mm, vma, prev, start, end);
 
        /* Fix up all other VM information */
-       unmap_vma_list(mm, vma);
+       remove_vma_list(mm, vma);
 
        return 0;
 }
@@ -1933,34 +1952,21 @@ void exit_mmap(struct mm_struct *mm)
        unsigned long end;
 
        lru_add_drain();
-
-       spin_lock(&mm->page_table_lock);
-
        flush_cache_mm(mm);
        tlb = tlb_gather_mmu(mm, 1);
+       /* Don't update_hiwater_rss(mm) here, do_exit already did */
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
-       end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
+       end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
        tlb_finish_mmu(tlb, 0, end);
 
-       mm->mmap = mm->mmap_cache = NULL;
-       mm->mm_rb = RB_ROOT;
-       set_mm_counter(mm, rss, 0);
-       mm->total_vm = 0;
-       mm->locked_vm = 0;
-
-       spin_unlock(&mm->page_table_lock);
-
        /*
-        * Walk the list again, actually closing and freeing it
-        * without holding any MM locks.
+        * Walk the list again, actually closing and freeing it,
+        * with preemption enabled, without holding any MM locks.
         */
-       while (vma) {
-               struct vm_area_struct *next = vma->vm_next;
-               remove_vm_struct(vma);
-               vma = next;
-       }
+       while (vma)
+               vma = remove_vma(vma);
 
        BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
index 57577f6..17a2b52 100644
@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
                unsigned long addr, unsigned long end, pgprot_t newprot)
 {
        pte_t *pte;
+       spinlock_t *ptl;
 
-       pte = pte_offset_map(pmd, addr);
+       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        do {
                if (pte_present(*pte)) {
                        pte_t ptent;
@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        lazy_mmu_prot_update(ptent);
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
-       pte_unmap(pte - 1);
+       pte_unmap_unlock(pte - 1, ptl);
 }
 
 static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma,
        BUG_ON(addr >= end);
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
-       spin_lock(&mm->page_table_lock);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma,
                change_pud_range(mm, pgd, addr, next, newprot);
        } while (pgd++, addr = next, addr != end);
        flush_tlb_range(vma, start, end);
-       spin_unlock(&mm->page_table_lock);
 }
 
 static int
@@ -125,6 +124,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
         * a MAP_NORESERVE private mapping to writable will now reserve.
         */
        if (newflags & VM_WRITE) {
+               if (oldflags & VM_RESERVED) {
+                       BUG_ON(oldflags & VM_WRITE);
+                       printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+                               "PROT_WRITE mprotect of VM_RESERVED memory, "
+                               "which is deprecated. Please report this to "
+                               "linux-kernel@vger.kernel.org\n",current->comm);
+                       return -EACCES;
+               }
                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
                        charged = nrpages;
                        if (security_vm_enough_memory(charged))
@@ -168,8 +175,8 @@ success:
        vma->vm_flags = newflags;
        vma->vm_page_prot = newprot;
        change_protection(vma, start, end, newprot);
-       __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
-       __vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+       vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+       vm_stat_account(mm, newflags, vma->vm_file, nrpages);
        return 0;
 
 fail:
index f343fc7..b535438 100644
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte = NULL;
-
-       pgd = pgd_offset(mm, addr);
-       if (pgd_none_or_clear_bad(pgd))
-               goto end;
-
-       pud = pud_offset(pgd, addr);
-       if (pud_none_or_clear_bad(pud))
-               goto end;
-
-       pmd = pmd_offset(pud, addr);
-       if (pmd_none_or_clear_bad(pmd))
-               goto end;
-
-       pte = pte_offset_map_nested(pmd, addr);
-       if (pte_none(*pte)) {
-               pte_unmap_nested(pte);
-               pte = NULL;
-       }
-end:
-       return pte;
-}
-
-static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -68,35 +40,39 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
        if (pmd_none_or_clear_bad(pmd))
                return NULL;
 
-       return pte_offset_map(pmd, addr);
+       return pmd;
 }
 
-static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *pte = NULL;
 
        pgd = pgd_offset(mm, addr);
-
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
                return NULL;
+
        pmd = pmd_alloc(mm, pud, addr);
-       if (pmd)
-               pte = pte_alloc_map(mm, pmd, addr);
-       return pte;
+       if (!pmd)
+               return NULL;
+
+       if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+               return NULL;
+
+       return pmd;
 }
 
-static int
-move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
-               struct vm_area_struct *new_vma, unsigned long new_addr)
+static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+               unsigned long old_addr, unsigned long old_end,
+               struct vm_area_struct *new_vma, pmd_t *new_pmd,
+               unsigned long new_addr)
 {
        struct address_space *mapping = NULL;
        struct mm_struct *mm = vma->vm_mm;
-       int error = 0;
-       pte_t *src, *dst;
+       pte_t *old_pte, *new_pte, pte;
+       spinlock_t *old_ptl, *new_ptl;
 
        if (vma->vm_file) {
                /*
@@ -111,74 +87,69 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
                    new_vma->vm_truncate_count != vma->vm_truncate_count)
                        new_vma->vm_truncate_count = 0;
        }
-       spin_lock(&mm->page_table_lock);
 
-       src = get_one_pte_map_nested(mm, old_addr);
-       if (src) {
-               /*
-                * Look to see whether alloc_one_pte_map needs to perform a
-                * memory allocation.  If it does then we need to drop the
-                * atomic kmap
-                */
-               dst = get_one_pte_map(mm, new_addr);
-               if (unlikely(!dst)) {
-                       pte_unmap_nested(src);
-                       if (mapping)
-                               spin_unlock(&mapping->i_mmap_lock);
-                       dst = alloc_one_pte_map(mm, new_addr);
-                       if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
-                               spin_unlock(&mm->page_table_lock);
-                               spin_lock(&mapping->i_mmap_lock);
-                               spin_lock(&mm->page_table_lock);
-                       }
-                       src = get_one_pte_map_nested(mm, old_addr);
-               }
-               /*
-                * Since alloc_one_pte_map can drop and re-acquire
-                * page_table_lock, we should re-check the src entry...
-                */
-               if (src) {
-                       if (dst) {
-                               pte_t pte;
-                               pte = ptep_clear_flush(vma, old_addr, src);
-
-                               /* ZERO_PAGE can be dependant on virtual addr */
-                               pte = move_pte(pte, new_vma->vm_page_prot,
-                                                       old_addr, new_addr);
-                               set_pte_at(mm, new_addr, dst, pte);
-                       } else
-                               error = -ENOMEM;
-                       pte_unmap_nested(src);
-               }
-               if (dst)
-                       pte_unmap(dst);
+       /*
+        * We don't have to worry about the ordering of src and dst
+        * pte locks because exclusive mmap_sem prevents deadlock.
+        */
+       old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+       new_pte = pte_offset_map_nested(new_pmd, new_addr);
+       new_ptl = pte_lockptr(mm, new_pmd);
+       if (new_ptl != old_ptl)
+               spin_lock(new_ptl);
+
+       for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+                                  new_pte++, new_addr += PAGE_SIZE) {
+               if (pte_none(*old_pte))
+                       continue;
+               pte = ptep_clear_flush(vma, old_addr, old_pte);
+               /* ZERO_PAGE can be dependant on virtual addr */
+               pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+               set_pte_at(mm, new_addr, new_pte, pte);
        }
-       spin_unlock(&mm->page_table_lock);
+
+       if (new_ptl != old_ptl)
+               spin_unlock(new_ptl);
+       pte_unmap_nested(new_pte - 1);
+       pte_unmap_unlock(old_pte - 1, old_ptl);
        if (mapping)
                spin_unlock(&mapping->i_mmap_lock);
-       return error;
 }
 
+#define LATENCY_LIMIT  (64 * PAGE_SIZE)
+
 static unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len)
 {
-       unsigned long offset;
+       unsigned long extent, next, old_end;
+       pmd_t *old_pmd, *new_pmd;
 
-       flush_cache_range(vma, old_addr, old_addr + len);
+       old_end = old_addr + len;
+       flush_cache_range(vma, old_addr, old_end);
 
-       /*
-        * This is not the clever way to do this, but we're taking the
-        * easy way out on the assumption that most remappings will be
-        * only a few pages.. This also makes error recovery easier.
-        */
-       for (offset = 0; offset < len; offset += PAGE_SIZE) {
-               if (move_one_page(vma, old_addr + offset,
-                               new_vma, new_addr + offset) < 0)
-                       break;
+       for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
                cond_resched();
+               next = (old_addr + PMD_SIZE) & PMD_MASK;
+               if (next - 1 > old_end)
+                       next = old_end;
+               extent = next - old_addr;
+               old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+               if (!old_pmd)
+                       continue;
+               new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+               if (!new_pmd)
+                       break;
+               next = (new_addr + PMD_SIZE) & PMD_MASK;
+               if (extent > next - new_addr)
+                       extent = next - new_addr;
+               if (extent > LATENCY_LIMIT)
+                       extent = LATENCY_LIMIT;
+               move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+                               new_vma, new_pmd, new_addr);
        }
-       return offset;
+
+       return len + old_addr - old_end;        /* how much done */
 }
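Note: move_page_tables() now copies ptes a pmd at a time instead of one page
at a time: each iteration clamps the extent to the end of the source pmd,
then to the end of the destination pmd, and finally to LATENCY_LIMIT (64
pages) so the pte locks are never held for too long; move_ptes() then shifts
that whole run under a single pair of pte locks.  A worked sketch of the
clamping arithmetic, assuming 4 kB pages and a 2 MB PMD_SIZE purely for
illustration:

	/* old_addr = 0x1ff000, old_end = 0x400000, new_addr = 0x300000 */
	next   = (old_addr + PMD_SIZE) & PMD_MASK;  /* 0x200000              */
	extent = next - old_addr;                   /* 0x1000: one page left
						       in the source pmd     */
	next   = (new_addr + PMD_SIZE) & PMD_MASK;  /* 0x400000              */
	if (extent > next - new_addr)               /* 0x100000, no clamp    */
		extent = next - new_addr;
	if (extent > LATENCY_LIMIT)                 /* 0x40000, no clamp     */
		extent = LATENCY_LIMIT;
	/* this pass moves a single page; the next pass starts pmd-aligned
	   and can move up to LATENCY_LIMIT (64 pages) in one go */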
 
 static unsigned long move_vma(struct vm_area_struct *vma,
@@ -191,6 +162,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        unsigned long new_pgoff;
        unsigned long moved_len;
        unsigned long excess = 0;
+       unsigned long hiwater_vm;
        int split = 0;
 
        /*
@@ -229,17 +201,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        }
 
        /*
-        * if we failed to move page tables we still do total_vm increment
-        * since do_munmap() will decrement it by old_len == new_len
+        * If we failed to move page tables we still do total_vm increment
+        * since do_munmap() will decrement it by old_len == new_len.
+        *
+        * Since total_vm is about to be raised artificially high for a
+        * moment, we need to restore high watermark afterwards: if stats
+        * are taken meanwhile, total_vm and hiwater_vm appear too high.
+        * If this were a serious issue, we'd add a flag to do_munmap().
         */
+       hiwater_vm = mm->hiwater_vm;
        mm->total_vm += new_len >> PAGE_SHIFT;
-       __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+       vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
        if (do_munmap(mm, old_addr, old_len) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
                excess = 0;
        }
+       mm->hiwater_vm = hiwater_vm;
 
        /* Restore VM_ACCOUNT if one or two pieces of vma left */
        if (excess) {
@@ -269,6 +248,7 @@ unsigned long do_mremap(unsigned long addr,
        unsigned long old_len, unsigned long new_len,
        unsigned long flags, unsigned long new_addr)
 {
+       struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
@@ -309,7 +289,7 @@ unsigned long do_mremap(unsigned long addr,
                if ((addr <= new_addr) && (addr+old_len) > new_addr)
                        goto out;
 
-               ret = do_munmap(current->mm, new_addr, new_len);
+               ret = do_munmap(mm, new_addr, new_len);
                if (ret)
                        goto out;
        }
@@ -320,7 +300,7 @@ unsigned long do_mremap(unsigned long addr,
         * do_munmap does all the needed commit accounting
         */
        if (old_len >= new_len) {
-               ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
+               ret = do_munmap(mm, addr+new_len, old_len - new_len);
                if (ret && old_len != new_len)
                        goto out;
                ret = addr;
@@ -333,7 +313,7 @@ unsigned long do_mremap(unsigned long addr,
         * Ok, we need to grow..  or relocate.
         */
        ret = -EFAULT;
-       vma = find_vma(current->mm, addr);
+       vma = find_vma(mm, addr);
        if (!vma || vma->vm_start > addr)
                goto out;
        if (is_vm_hugetlb_page(vma)) {
@@ -349,14 +329,14 @@ unsigned long do_mremap(unsigned long addr,
        }
        if (vma->vm_flags & VM_LOCKED) {
                unsigned long locked, lock_limit;
-               locked = current->mm->locked_vm << PAGE_SHIFT;
+               locked = mm->locked_vm << PAGE_SHIFT;
                lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
                locked += new_len - old_len;
                ret = -EAGAIN;
                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                        goto out;
        }
-       if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) {
+       if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
                ret = -ENOMEM;
                goto out;
        }
@@ -383,11 +363,10 @@ unsigned long do_mremap(unsigned long addr,
                        vma_adjust(vma, vma->vm_start,
                                addr + new_len, vma->vm_pgoff, NULL);
 
-                       current->mm->total_vm += pages;
-                       __vm_stat_account(vma->vm_mm, vma->vm_flags,
-                                                       vma->vm_file, pages);
+                       mm->total_vm += pages;
+                       vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
                        if (vma->vm_flags & VM_LOCKED) {
-                               current->mm->locked_vm += pages;
+                               mm->locked_vm += pages;
                                make_pages_present(addr + old_len,
                                                   addr + new_len);
                        }
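
The hiwater_vm handling in move_vma() above boils down to a small save/restore pattern around the temporary total_vm bump. A minimal sketch, with the vm_stat_account() call and the OOM path elided (account_move() is an illustrative name, not a kernel function):

#include <linux/mm.h>

/*
 * Sketch: total_vm is bumped by new_len before do_munmap() subtracts
 * old_len (== new_len on this path), so it is briefly too high.  The
 * high watermark is snapshotted first and restored afterwards so that
 * hiwater_vm never records the artificial peak.
 */
static void account_move(struct mm_struct *mm, unsigned long new_len,
			 unsigned long old_addr, unsigned long old_len)
{
	unsigned long hiwater_vm = mm->hiwater_vm;	/* snapshot */

	mm->total_vm += new_len >> PAGE_SHIFT;		/* temporarily high */
	do_munmap(mm, old_addr, old_len);		/* drops old_len again */
	mm->hiwater_vm = hiwater_vm;			/* forget the false peak */
}
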
index d0f5a1b..0e040e9 100644 (file)
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
-/*
- * Called with mm->page_table_lock held to protect against other
- * threads/the swapper from ripping pte's out from under us.
- */
-
-static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end)
 {
        pte_t *pte;
+       spinlock_t *ptl;
+       int progress = 0;
 
-       pte = pte_offset_map(pmd, addr);
+again:
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                unsigned long pfn;
                struct page *page;
 
+               if (progress >= 64) {
+                       progress = 0;
+                       if (need_resched() || need_lockbreak(ptl))
+                               break;
+               }
+               progress++;
                if (!pte_present(*pte))
                        continue;
                if (!pte_maybe_dirty(*pte))
                        continue;
                pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, *pte, addr);
                        continue;
+               }
                page = pfn_to_page(pfn);
-               if (PageReserved(page))
-                       continue;
 
                if (ptep_clear_flush_dirty(vma, addr, pte) ||
                    page_test_and_clear_dirty(page))
                        set_page_dirty(page);
+               progress += 3;
        } while (pte++, addr += PAGE_SIZE, addr != end);
-       pte_unmap(pte - 1);
+       pte_unmap_unlock(pte - 1, ptl);
+       cond_resched();
+       if (addr != end)
+               goto again;
 }
 
-static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end)
 {
        pmd_t *pmd;
@@ -61,11 +69,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               sync_pte_range(vma, pmd, addr, next);
+               msync_pte_range(vma, pmd, addr, next);
        } while (pmd++, addr = next, addr != end);
 }
 
-static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end)
 {
        pud_t *pud;
@@ -76,58 +84,34 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               sync_pmd_range(vma, pud, addr, next);
+               msync_pmd_range(vma, pud, addr, next);
        } while (pud++, addr = next, addr != end);
 }
 
-static void sync_page_range(struct vm_area_struct *vma,
+static void msync_page_range(struct vm_area_struct *vma,
                                unsigned long addr, unsigned long end)
 {
-       struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        unsigned long next;
 
        /* For hugepages we can't go walking the page table normally,
         * but that's ok, hugetlbfs is memory based, so we don't need
-        * to do anything more on an msync() */
-       if (is_vm_hugetlb_page(vma))
+        * to do anything more on an msync().
+        * Can't do anything with VM_RESERVED regions either.
+        */
+       if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
                return;
 
        BUG_ON(addr >= end);
-       pgd = pgd_offset(mm, addr);
+       pgd = pgd_offset(vma->vm_mm, addr);
        flush_cache_range(vma, addr, end);
-       spin_lock(&mm->page_table_lock);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               sync_pud_range(vma, pgd, addr, next);
+               msync_pud_range(vma, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
-       spin_unlock(&mm->page_table_lock);
-}
-
-#ifdef CONFIG_PREEMPT
-static inline void filemap_sync(struct vm_area_struct *vma,
-                               unsigned long addr, unsigned long end)
-{
-       const size_t chunk = 64 * 1024; /* bytes */
-       unsigned long next;
-
-       do {
-               next = addr + chunk;
-               if (next > end || next < addr)
-                       next = end;
-               sync_page_range(vma, addr, next);
-               cond_resched();
-       } while (addr = next, addr != end);
-}
-#else
-static inline void filemap_sync(struct vm_area_struct *vma,
-                               unsigned long addr, unsigned long end)
-{
-       sync_page_range(vma, addr, end);
 }
-#endif
 
 /*
  * MS_SYNC syncs the entire file - including mappings.
@@ -150,7 +134,7 @@ static int msync_interval(struct vm_area_struct *vma,
                return -EBUSY;
 
        if (file && (vma->vm_flags & VM_SHARED)) {
-               filemap_sync(vma, addr, end);
+               msync_page_range(vma, addr, end);
 
                if (flags & MS_SYNC) {
                        struct address_space *mapping = file->f_mapping;
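
The msync_pte_range() rewrite above also shows the general shape of the new pte-walk-with-lock-break loops: take the pte lock with pte_offset_map_lock(), count progress, bail out of the loop when a reschedule or lock break is due, and re-enter at the address where the walk stopped. A stripped-down sketch of that pattern, with the per-pte work reduced to a hypothetical visit() callback:

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Sketch: walk ptes under the pte lock, but drop the lock and reschedule
 * roughly every 64 entries (or when the lock is contended), then resume
 * at the address where the walk stopped.  visit() is a placeholder.
 */
static void walk_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
			   unsigned long addr, unsigned long end,
			   void (*visit)(struct vm_area_struct *, pte_t *,
					 unsigned long))
{
	pte_t *pte;
	spinlock_t *ptl;
	int progress = 0;

again:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		if (progress >= 64) {
			progress = 0;
			if (need_resched() || need_lockbreak(ptl))
				break;		/* give the lock up */
		}
		progress++;
		if (!pte_present(*pte))
			continue;
		visit(vma, pte, addr);
		progress += 3;			/* real work costs more */
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	if (addr != end)			/* interrupted: pick up again */
		goto again;
}
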
index 0ef241a..d1e076a 100644 (file)
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
        realalloc -= kobjsize(vml);
        askedalloc -= sizeof(*vml);
        kfree(vml);
+
+       update_hiwater_vm(mm);
        mm->total_vm -= len >> PAGE_SHIFT;
 
 #ifdef DEBUG
@@ -1047,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+                       unsigned int foll_flags)
 {
        return NULL;
 }
@@ -1078,19 +1081,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 }
 
-void update_mem_hiwater(struct task_struct *tsk)
-{
-       unsigned long rss;
-
-       if (likely(tsk->mm)) {
-               rss = get_mm_counter(tsk->mm, rss);
-               if (tsk->mm->hiwater_rss < rss)
-                       tsk->mm->hiwater_rss = rss;
-               if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
-                       tsk->mm->hiwater_vm = tsk->mm->total_vm;
-       }
-}
-
 void unmap_mapping_range(struct address_space *mapping,
                         loff_t const holebegin, loff_t const holelen,
                         int even_cows)
index 94c864e..2dbdd98 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 
@@ -78,21 +79,44 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+       int ret = 0;
+       unsigned seq;
+       unsigned long pfn = page_to_pfn(page);
+
+       do {
+               seq = zone_span_seqbegin(zone);
+               if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+                       ret = 1;
+               else if (pfn < zone->zone_start_pfn)
+                       ret = 1;
+       } while (zone_span_seqretry(zone, seq));
+
+       return ret;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+       if (!pfn_valid(page_to_pfn(page)))
+               return 0;
+#endif
+       if (zone != page_zone(page))
+               return 0;
+
+       return 1;
+}
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
 static int bad_range(struct zone *zone, struct page *page)
 {
-       if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+       if (page_outside_zone_boundaries(zone, page))
                return 1;
-       if (page_to_pfn(page) < zone->zone_start_pfn)
-               return 1;
-#ifdef CONFIG_HOLES_IN_ZONE
-       if (!pfn_valid(page_to_pfn(page)))
-               return 1;
-#endif
-       if (zone != page_zone(page))
+       if (!page_is_consistent(zone, page))
                return 1;
+
        return 0;
 }
 
@@ -114,7 +138,8 @@ static void bad_page(const char *function, struct page *page)
                        1 << PG_reclaim |
                        1 << PG_slab    |
                        1 << PG_swapcache |
-                       1 << PG_writeback);
+                       1 << PG_writeback |
+                       1 << PG_reserved );
        set_page_count(page, 0);
        reset_page_mapcount(page);
        page->mapping = NULL;
@@ -153,7 +178,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
                struct page *p = page + i;
 
                SetPageCompound(p);
-               p->private = (unsigned long)page;
+               set_page_private(p, (unsigned long)page);
        }
 }
 
@@ -173,7 +198,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
                if (!PageCompound(p))
                        bad_page(__FUNCTION__, page);
-               if (p->private != (unsigned long)page)
+               if (page_private(p) != (unsigned long)page)
                        bad_page(__FUNCTION__, page);
                ClearPageCompound(p);
        }
@@ -186,18 +211,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-       return page->private;
+       return page_private(page);
 }
 
 static inline void set_page_order(struct page *page, int order) {
-       page->private = order;
+       set_page_private(page, order);
        __SetPagePrivate(page);
 }
 
 static inline void rmv_page_order(struct page *page)
 {
        __ClearPagePrivate(page);
-       page->private = 0;
+       set_page_private(page, 0);
 }
 
 /*
@@ -237,14 +262,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (a) the buddy is free &&
  * (b) the buddy is on the buddy system &&
  * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
 {
        if (PagePrivate(page)           &&
            (page_order(page) == order) &&
-           !PageReserved(page)         &&
             page_count(page) == 0)
                return 1;
        return 0;
@@ -264,7 +288,7 @@ static inline int page_is_buddy(struct page *page, int order)
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
  * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
  * free, the remainder of the region must be split into blocks.   
@@ -327,7 +351,8 @@ static inline void free_pages_check(const char *function, struct page *page)
                        1 << PG_reclaim |
                        1 << PG_slab    |
                        1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                bad_page(function, page);
        if (PageDirty(page))
                __ClearPageDirty(page);
@@ -455,13 +480,14 @@ static void prep_new_page(struct page *page, int order)
                        1 << PG_reclaim |
                        1 << PG_slab    |
                        1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                bad_page(__FUNCTION__, page);
 
        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                        1 << PG_referenced | 1 << PG_arch_1 |
                        1 << PG_checked | 1 << PG_mappedtodisk);
-       page->private = 0;
+       set_page_private(page, 0);
        set_page_refs(page, order);
        kernel_map_pages(page, 1 << order, 1);
 }
@@ -1016,7 +1042,7 @@ void __pagevec_free(struct pagevec *pvec)
 
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
-       if (!PageReserved(page) && put_page_testzero(page)) {
+       if (put_page_testzero(page)) {
                if (order == 0)
                        free_hot_page(page);
                else
@@ -1305,12 +1331,9 @@ void show_free_areas(void)
                } else
                        printk("\n");
 
-               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+               for_each_cpu(cpu) {
                        struct per_cpu_pageset *pageset;
 
-                       if (!cpu_possible(cpu))
-                               continue;
-
                        pageset = zone_pcp(zone, cpu);
 
                        for (temperature = 0; temperature < 2; temperature++)
@@ -1660,7 +1683,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn)
 {
        struct page *page;
@@ -1674,7 +1697,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                        continue;
                page = pfn_to_page(pfn);
                set_page_links(page, zone, nid, pfn);
-               set_page_count(page, 0);
+               set_page_count(page, 1);
                reset_page_mapcount(page);
                SetPageReserved(page);
                INIT_LIST_HEAD(&page->lru);
@@ -1721,29 +1744,29 @@ static int __devinit zone_batchsize(struct zone *zone)
 
        /*
         * The per-cpu-pages pools are set to around 1000th of the
-        * size of the zone.  But no more than 1/4 of a meg - there's
-        * no point in going beyond the size of L2 cache.
+        * size of the zone.  But no more than 1/2 of a meg.
         *
         * OK, so we don't know how big the cache is.  So guess.
         */
        batch = zone->present_pages / 1024;
-       if (batch * PAGE_SIZE > 256 * 1024)
-               batch = (256 * 1024) / PAGE_SIZE;
+       if (batch * PAGE_SIZE > 512 * 1024)
+               batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;             /* We effectively *= 4 below */
        if (batch < 1)
                batch = 1;
 
        /*
-        * Clamp the batch to a 2^n - 1 value. Having a power
-        * of 2 value was found to be more likely to have
-        * suboptimal cache aliasing properties in some cases.
+        * We will be trying to allocate bigger chunks of contiguous
+        * memory of the order of fls(batch).  This should result in
+        * better cache coloring.
         *
-        * For example if 2 tasks are alternately allocating
-        * batches of pages, one task can end up with a lot
-        * of pages of one half of the possible page colors
-        * and the other with pages of the other colors.
+        * A sanity check also to ensure that batch is still within limits.
         */
-       batch = (1 << fls(batch + batch/2)) - 1;
+       batch = (1 << fls(batch + batch/2));
+
+       if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
+               batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
+
        return batch;
 }
 
@@ -1755,7 +1778,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
        pcp = &p->pcp[0];               /* hot */
        pcp->count = 0;
-       pcp->low = 2 * batch;
+       pcp->low = 0;
        pcp->high = 6 * batch;
        pcp->batch = max(1UL, 1 * batch);
        INIT_LIST_HEAD(&pcp->list);
@@ -1764,7 +1787,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
        pcp->count = 0;
        pcp->low = 0;
        pcp->high = 2 * batch;
-       pcp->batch = max(1UL, 1 * batch);
+       pcp->batch = max(1UL, batch/2);
        INIT_LIST_HEAD(&pcp->list);
 }
 
@@ -1873,6 +1896,60 @@ void __init setup_per_cpu_pageset()
 
 #endif
 
+static __devinit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+       int i;
+       struct pglist_data *pgdat = zone->zone_pgdat;
+
+       /*
+        * The per-page waitqueue mechanism uses hashed waitqueues
+        * per zone.
+        */
+       zone->wait_table_size = wait_table_size(zone_size_pages);
+       zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
+       zone->wait_table = (wait_queue_head_t *)
+               alloc_bootmem_node(pgdat, zone->wait_table_size
+                                       * sizeof(wait_queue_head_t));
+
+       for(i = 0; i < zone->wait_table_size; ++i)
+               init_waitqueue_head(zone->wait_table + i);
+}
+
+static __devinit void zone_pcp_init(struct zone *zone)
+{
+       int cpu;
+       unsigned long batch = zone_batchsize(zone);
+
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+               /* Early boot. Slab allocator not functional yet */
+               zone->pageset[cpu] = &boot_pageset[cpu];
+               setup_pageset(&boot_pageset[cpu],0);
+#else
+               setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+       }
+       printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+               zone->name, zone->present_pages, batch);
+}
+
+static __devinit void init_currently_empty_zone(struct zone *zone,
+               unsigned long zone_start_pfn, unsigned long size)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+
+       zone_wait_table_init(zone, size);
+       pgdat->nr_zones = zone_idx(zone) + 1;
+
+       zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+       zone->zone_start_pfn = zone_start_pfn;
+
+       memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+       zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -1882,10 +1959,11 @@ void __init setup_per_cpu_pageset()
 static void __init free_area_init_core(struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long *zholes_size)
 {
-       unsigned long i, j;
-       int cpu, nid = pgdat->node_id;
+       unsigned long j;
+       int nid = pgdat->node_id;
        unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
+       pgdat_resize_init(pgdat);
        pgdat->nr_zones = 0;
        init_waitqueue_head(&pgdat->kswapd_wait);
        pgdat->kswapd_max_order = 0;
@@ -1893,7 +1971,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize;
-               unsigned long batch;
 
                realsize = size = zones_size[j];
                if (zholes_size)
@@ -1908,24 +1985,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                zone->name = zone_names[j];
                spin_lock_init(&zone->lock);
                spin_lock_init(&zone->lru_lock);
+               zone_seqlock_init(zone);
                zone->zone_pgdat = pgdat;
                zone->free_pages = 0;
 
                zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-               batch = zone_batchsize(zone);
-
-               for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-                       /* Early boot. Slab allocator not functional yet */
-                       zone->pageset[cpu] = &boot_pageset[cpu];
-                       setup_pageset(&boot_pageset[cpu],0);
-#else
-                       setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-               }
-               printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-                               zone_names[j], realsize, batch);
+               zone_pcp_init(zone);
                INIT_LIST_HEAD(&zone->active_list);
                INIT_LIST_HEAD(&zone->inactive_list);
                zone->nr_scan_active = 0;
@@ -1936,32 +2002,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                if (!size)
                        continue;
 
-               /*
-                * The per-page waitqueue mechanism uses hashed waitqueues
-                * per zone.
-                */
-               zone->wait_table_size = wait_table_size(size);
-               zone->wait_table_bits =
-                       wait_table_bits(zone->wait_table_size);
-               zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node(pgdat, zone->wait_table_size
-                                               * sizeof(wait_queue_head_t));
-
-               for(i = 0; i < zone->wait_table_size; ++i)
-                       init_waitqueue_head(zone->wait_table + i);
-
-               pgdat->nr_zones = j+1;
-
-               zone->zone_mem_map = pfn_to_page(zone_start_pfn);
-               zone->zone_start_pfn = zone_start_pfn;
-
-               memmap_init(size, nid, j, zone_start_pfn);
-
                zonetable_add(zone, nid, j, zone_start_pfn, size);
-
+               init_currently_empty_zone(zone, zone_start_pfn, size);
                zone_start_pfn += size;
-
-               zone_init_free_lists(pgdat, zone, zone->spanned_pages);
        }
 }
 
@@ -2361,7 +2404,7 @@ static void setup_per_zone_lowmem_reserve(void)
  *     that the pages_{min,low,high} values for each zone are set correctly 
  *     with respect to min_free_kbytes.
  */
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
 {
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long lowmem_pages = 0;
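
The reworked zone_batchsize() above is easiest to check with concrete numbers. A small userspace-style worked example of the same arithmetic, assuming 4KB pages and a zone of 262144 present pages (~1GB); the values and the local fls() helper are illustrative only:

#include <stdio.h>

/* fls(): position of the most significant set bit, as in the kernel. */
static int fls(unsigned long x)
{
	int r = 0;
	while (x) { r++; x >>= 1; }
	return r;
}

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long present_pages = 262144;	/* ~1GB zone, example value */
	unsigned long batch;

	batch = present_pages / 1024;		/* 256 pages = 1MB */
	if (batch * page_size > 512 * 1024)	/* cap at 1/2 meg ... */
		batch = (512 * 1024) / page_size; /* ... so 128 pages */
	batch /= 4;				/* 32: pcp lists scale it back up */
	if (batch < 1)
		batch = 1;

	/* round up to a power of two for cache colouring: 1 << fls(48) = 64 */
	batch = 1UL << fls(batch + batch / 2);

	printf("batch = %lu pages\n", batch);	/* prints 64 */
	return 0;
}
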
index 330e00d..bb2b0d5 100644 (file)
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
                unlock_page(page);
                goto out;
        }
-       bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+       bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+                               end_swap_bio_write);
        if (bio == NULL) {
                set_page_dirty(page);
                unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
 
        BUG_ON(!PageLocked(page));
        ClearPageUptodate(page);
-       bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+       bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+                               end_swap_bio_read);
        if (bio == NULL) {
                unlock_page(page);
                ret = -ENOMEM;
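
Several hunks in this series, including the two above, convert raw page->private accesses to page_private()/set_page_private(). Conceptually these are thin wrappers (the exact definition lives in <linux/mm.h> and may go through a union member so the same word can also hold a split pte lock); roughly:

/* Rough shape of the accessors the hunks rely on: same storage as the
 * old page->private, but one grep-able name for all of its overloaded
 * uses (buddy order, swap entry, compound head, shmem swap count). */
#define page_private(page)		((page)->private)
#define set_page_private(page, v)	((page)->private = (v))
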
index 450f524..914d04b 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,7 @@
  *   page->flags PG_locked (lock_page)
  *     mapping->i_mmap_lock
  *       anon_vma->lock
- *         mm->page_table_lock
+ *         mm->page_table_lock or pte_lock
  *           zone->lru_lock (in mark_page_accessed)
  *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
@@ -244,37 +244,44 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 /*
  * Check that @page is mapped at @address into @mm.
  *
- * On success returns with mapped pte and locked mm->page_table_lock.
+ * On success returns with pte mapped and locked.
  */
 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-                         unsigned long address)
+                         unsigned long address, spinlock_t **ptlp)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
+       spinlock_t *ptl;
 
-       /*
-        * We need the page_table_lock to protect us from page faults,
-        * munmap, fork, etc...
-        */
-       spin_lock(&mm->page_table_lock);
        pgd = pgd_offset(mm, address);
-       if (likely(pgd_present(*pgd))) {
-               pud = pud_offset(pgd, address);
-               if (likely(pud_present(*pud))) {
-                       pmd = pmd_offset(pud, address);
-                       if (likely(pmd_present(*pmd))) {
-                               pte = pte_offset_map(pmd, address);
-                               if (likely(pte_present(*pte) &&
-                                          page_to_pfn(page) == pte_pfn(*pte)))
-                                       return pte;
-                               pte_unmap(pte);
-                       }
-               }
+       if (!pgd_present(*pgd))
+               return NULL;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return NULL;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return NULL;
+
+       pte = pte_offset_map(pmd, address);
+       /* Make a quick check before getting the lock */
+       if (!pte_present(*pte)) {
+               pte_unmap(pte);
+               return NULL;
+       }
+
+       ptl = pte_lockptr(mm, pmd);
+       spin_lock(ptl);
+       if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
+               *ptlp = ptl;
+               return pte;
        }
-       spin_unlock(&mm->page_table_lock);
-       return ERR_PTR(-ENOENT);
+       pte_unmap_unlock(pte, ptl);
+       return NULL;
 }
 
 /*
@@ -287,24 +294,28 @@ static int page_referenced_one(struct page *page,
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
+       spinlock_t *ptl;
        int referenced = 0;
 
        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;
 
-       pte = page_check_address(page, mm, address);
-       if (!IS_ERR(pte)) {
-               if (ptep_clear_flush_young(vma, address, pte))
-                       referenced++;
+       pte = page_check_address(page, mm, address, &ptl);
+       if (!pte)
+               goto out;
 
-               if (mm != current->mm && !ignore_token && has_swap_token(mm))
-                       referenced++;
+       if (ptep_clear_flush_young(vma, address, pte))
+               referenced++;
 
-               (*mapcount)--;
-               pte_unmap(pte);
-               spin_unlock(&mm->page_table_lock);
-       }
+       /* Pretend the page is referenced if the task has the
+          swap token and is in the middle of a page fault. */
+       if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
+                       rwsem_is_locked(&mm->mmap_sem))
+               referenced++;
+
+       (*mapcount)--;
+       pte_unmap_unlock(pte, ptl);
 out:
        return referenced;
 }
@@ -434,15 +445,11 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
  * @vma:       the vm area in which the mapping is added
  * @address:   the user virtual address mapped
  *
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
  */
 void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
 {
-       BUG_ON(PageReserved(page));
-
-       inc_mm_counter(vma->vm_mm, anon_rss);
-
        if (atomic_inc_and_test(&page->_mapcount)) {
                struct anon_vma *anon_vma = vma->anon_vma;
 
@@ -461,13 +468,12 @@ void page_add_anon_rmap(struct page *page,
  * page_add_file_rmap - add pte mapping to a file page
  * @page: the page to add the mapping to
  *
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
  */
 void page_add_file_rmap(struct page *page)
 {
        BUG_ON(PageAnon(page));
-       if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
-               return;
+       BUG_ON(!pfn_valid(page_to_pfn(page)));
 
        if (atomic_inc_and_test(&page->_mapcount))
                inc_page_state(nr_mapped);
@@ -477,12 +483,10 @@ void page_add_file_rmap(struct page *page)
  * page_remove_rmap - take down pte mapping from a page
  * @page: page to remove mapping from
  *
- * Caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
  */
 void page_remove_rmap(struct page *page)
 {
-       BUG_ON(PageReserved(page));
-
        if (atomic_add_negative(-1, &page->_mapcount)) {
                BUG_ON(page_mapcount(page) < 0);
                /*
@@ -510,14 +514,15 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
        unsigned long address;
        pte_t *pte;
        pte_t pteval;
+       spinlock_t *ptl;
        int ret = SWAP_AGAIN;
 
        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;
 
-       pte = page_check_address(page, mm, address);
-       if (IS_ERR(pte))
+       pte = page_check_address(page, mm, address, &ptl);
+       if (!pte)
                goto out;
 
        /*
@@ -541,8 +546,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
        if (pte_dirty(pteval))
                set_page_dirty(page);
 
+       /* Update high watermark before we lower rss */
+       update_hiwater_rss(mm);
+
        if (PageAnon(page)) {
-               swp_entry_t entry = { .val = page->private };
+               swp_entry_t entry = { .val = page_private(page) };
                /*
                 * Store the swap location in the pte.
                 * See handle_pte_fault() ...
@@ -551,21 +559,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
                swap_duplicate(entry);
                if (list_empty(&mm->mmlist)) {
                        spin_lock(&mmlist_lock);
-                       list_add(&mm->mmlist, &init_mm.mmlist);
+                       if (list_empty(&mm->mmlist))
+                               list_add(&mm->mmlist, &init_mm.mmlist);
                        spin_unlock(&mmlist_lock);
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
                dec_mm_counter(mm, anon_rss);
-       }
+       } else
+               dec_mm_counter(mm, file_rss);
 
-       dec_mm_counter(mm, rss);
        page_remove_rmap(page);
        page_cache_release(page);
 
 out_unmap:
-       pte_unmap(pte);
-       spin_unlock(&mm->page_table_lock);
+       pte_unmap_unlock(pte, ptl);
 out:
        return ret;
 }
@@ -599,19 +607,14 @@ static void try_to_unmap_cluster(unsigned long cursor,
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *pte, *original_pte;
+       pte_t *pte;
        pte_t pteval;
+       spinlock_t *ptl;
        struct page *page;
        unsigned long address;
        unsigned long end;
        unsigned long pfn;
 
-       /*
-        * We need the page_table_lock to protect us from page faults,
-        * munmap, fork, etc...
-        */
-       spin_lock(&mm->page_table_lock);
-
        address = (vma->vm_start + cursor) & CLUSTER_MASK;
        end = address + CLUSTER_SIZE;
        if (address < vma->vm_start)
@@ -621,30 +624,33 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
-               goto out_unlock;
+               return;
 
        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
-               goto out_unlock;
+               return;
 
        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
-               goto out_unlock;
+               return;
+
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
-       for (original_pte = pte = pte_offset_map(pmd, address);
-                       address < end; pte++, address += PAGE_SIZE) {
+       /* Update high watermark before we lower rss */
+       update_hiwater_rss(mm);
 
+       for (; address < end; pte++, address += PAGE_SIZE) {
                if (!pte_present(*pte))
                        continue;
 
                pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, *pte, address);
                        continue;
+               }
 
                page = pfn_to_page(pfn);
                BUG_ON(PageAnon(page));
-               if (PageReserved(page))
-                       continue;
 
                if (ptep_clear_flush_young(vma, address, pte))
                        continue;
@@ -663,13 +669,10 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
                page_remove_rmap(page);
                page_cache_release(page);
-               dec_mm_counter(mm, rss);
+               dec_mm_counter(mm, file_rss);
                (*mapcount)--;
        }
-
-       pte_unmap(original_pte);
-out_unlock:
-       spin_unlock(&mm->page_table_lock);
+       pte_unmap_unlock(pte - 1, ptl);
 }
 
 static int try_to_unmap_anon(struct page *page)
@@ -806,7 +809,6 @@ int try_to_unmap(struct page *page)
 {
        int ret;
 
-       BUG_ON(PageReserved(page));
        BUG_ON(!PageLocked(page));
 
        if (PageAnon(page))
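
With the new signature, page_check_address() returns a mapped-and-locked pte (or NULL) and hands the lock back through *ptlp, so callers pair it with pte_unmap_unlock() instead of dropping page_table_lock. A minimal caller sketch, following the shape of page_referenced_one() above (the work done on the pte is a placeholder):

#include <linux/mm.h>
#include <linux/rmap.h>

/* Sketch: typical caller of the relocked page_check_address(). */
static int touch_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 0;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		return 0;		/* not (or no longer) mapped here */

	/* ... operate on *pte under the pte lock ... */
	if (ptep_clear_flush_young(vma, address, pte))
		ret = 1;

	pte_unmap_unlock(pte, ptl);	/* unmaps the pte and drops ptl */
	return ret;
}
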
index 55e04a0..dc25565 100644 (file)
@@ -71,9 +71,6 @@
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
-/* Keep swapped page count in private field of indirect struct page */
-#define nr_swapped             private
-
 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 enum sgp_type {
        SGP_QUICK,      /* don't try more than file page cache lookup */
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
 
        entry->val = value;
        info->swapped += incdec;
-       if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
-               kmap_atomic_to_page(entry)->nr_swapped += incdec;
+       if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
+               struct page *page = kmap_atomic_to_page(entry);
+               set_page_private(page, page_private(page) + incdec);
+       }
 }
 
 /*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 
                spin_unlock(&info->lock);
                page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
-               if (page) {
-                       page->nr_swapped = 0;
-               }
+               if (page)
+                       set_page_private(page, 0);
                spin_lock(&info->lock);
 
                if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
                        diroff = 0;
                }
                subdir = dir[diroff];
-               if (subdir && subdir->nr_swapped) {
+               if (subdir && page_private(subdir)) {
                        size = limit - idx;
                        if (size > ENTRIES_PER_PAGE)
                                size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
                        nr_swaps_freed += freed;
                        if (offset)
                                spin_lock(&info->lock);
-                       subdir->nr_swapped -= freed;
+                       set_page_private(subdir, page_private(subdir) - freed);
                        if (offset)
                                spin_unlock(&info->lock);
-                       BUG_ON(subdir->nr_swapped > offset);
+                       BUG_ON(page_private(subdir) > offset);
                }
                if (offset)
                        offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
                        dir = shmem_dir_map(subdir);
                }
                subdir = *dir;
-               if (subdir && subdir->nr_swapped) {
+               if (subdir && page_private(subdir)) {
                        ptr = shmem_swp_map(subdir);
                        size = limit - idx;
                        if (size > ENTRIES_PER_PAGE)
@@ -1201,7 +1199,7 @@ static int shmem_populate(struct vm_area_struct *vma,
                                page_cache_release(page);
                                return err;
                        }
-               } else {
+               } else if (vma->vm_flags & VM_NONLINEAR) {
                        /* No page was found just because we can't read it in
                         * now (being here implies nonblock != 0), but the page
                         * may exist, so set the PTE to fault it in later. */
@@ -1506,8 +1504,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
                         */
                        if (!offset)
                                mark_page_accessed(page);
-               } else
+               } else {
                        page = ZERO_PAGE(0);
+                       page_cache_get(page);
+               }
 
                /*
                 * Ok, we have the page, and it's up-to-date, so
index d30423f..22bfb0b 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2419,6 +2419,7 @@ retry:
                        next = slab_bufctl(slabp)[slabp->free];
 #if DEBUG
                        slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+                       WARN_ON(numa_node_id() != slabp->nodeid);
 #endif
                        slabp->free = next;
                }
@@ -2633,8 +2634,10 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
                check_spinlock_acquired_node(cachep, node);
                check_slabp(cachep, slabp);
 
-
 #if DEBUG
+               /* Verify that the slab belongs to the intended node */
+               WARN_ON(slabp->nodeid != node);
+
                if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
                        printk(KERN_ERR "slab: double free detected in cache "
                                        "'%s', objp %p\n", cachep->name, objp);
index 347249a..72079b5 100644 (file)
@@ -5,8 +5,10 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/bootmem.h>
+#include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/vmalloc.h>
 #include <asm/dma.h>
 
 /*
@@ -72,6 +74,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 }
 #endif
 
+/*
+ * Although written for the SPARSEMEM_EXTREME case, this happens
+ * to also work for the flat array case because
+ * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
+ */
+int __section_nr(struct mem_section* ms)
+{
+       unsigned long root_nr;
+       struct mem_section* root;
+
+       for (root_nr = 0;
+            root_nr < NR_MEM_SECTIONS;
+            root_nr += SECTIONS_PER_ROOT) {
+               root = __nr_to_section(root_nr);
+
+               if (!root)
+                       continue;
+
+               if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
+                    break;
+       }
+
+       return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
+}
+
 /* Record a memory area against a node. */
 void memory_present(int nid, unsigned long start, unsigned long end)
 {
@@ -162,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
        return NULL;
 }
 
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+       struct page *page, *ret;
+       unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+       page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+       if (page)
+               goto got_map_page;
+
+       ret = vmalloc(memmap_size);
+       if (ret)
+               goto got_map_ptr;
+
+       return NULL;
+got_map_page:
+       ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+       memset(ret, 0, memmap_size);
+
+       return ret;
+}
+
+static int vaddr_in_vmalloc_area(void *addr)
+{
+       if (addr >= (void *)VMALLOC_START &&
+           addr < (void *)VMALLOC_END)
+               return 1;
+       return 0;
+}
+
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+{
+       if (vaddr_in_vmalloc_area(memmap))
+               vfree(memmap);
+       else
+               free_pages((unsigned long)memmap,
+                          get_order(sizeof(struct page) * nr_pages));
+}
+
 /*
  * Allocate the accumulated non-linear sections, allocate a mem_map
  * for each and record the physical to section mapping.
@@ -187,14 +253,37 @@ void sparse_init(void)
  * set.  If this is <=0, then that means that the passed-in
  * map was not consumed and must be freed.
  */
-int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+                          int nr_pages)
 {
-       struct mem_section *ms = __pfn_to_section(start_pfn);
+       unsigned long section_nr = pfn_to_section_nr(start_pfn);
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       struct mem_section *ms;
+       struct page *memmap;
+       unsigned long flags;
+       int ret;
 
-       if (ms->section_mem_map & SECTION_MARKED_PRESENT)
-               return -EEXIST;
+       /*
+        * no locking for this, because it does its own
+        * plus, it does a kmalloc
+        */
+       sparse_index_init(section_nr, pgdat->node_id);
+       memmap = __kmalloc_section_memmap(nr_pages);
+
+       pgdat_resize_lock(pgdat, &flags);
 
+       ms = __pfn_to_section(start_pfn);
+       if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
+               ret = -EEXIST;
+               goto out;
+       }
        ms->section_mem_map |= SECTION_MARKED_PRESENT;
 
-       return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+       ret = sparse_init_one_section(ms, section_nr, memmap);
+
+       if (ret <= 0)
+               __kfree_section_memmap(memmap, nr_pages);
+out:
+       pgdat_resize_unlock(pgdat, &flags);
+       return ret;
 }
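
__kmalloc_section_memmap() above tries the page allocator for a physically contiguous memmap first and falls back to vmalloc(), and __kfree_section_memmap() has to recognise which path was taken when freeing. The same try-contiguous-then-vmalloc idiom, reduced to a generic sketch (alloc_big_buffer()/free_big_buffer() are illustrative names):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/pgtable.h>

/* Sketch: prefer physically contiguous pages, fall back to vmalloc(). */
static void *alloc_big_buffer(unsigned long size)
{
	struct page *page = alloc_pages(GFP_KERNEL, get_order(size));
	void *buf;

	if (page)
		buf = (void *)pfn_to_kaddr(page_to_pfn(page));
	else
		buf = vmalloc(size);
	if (buf)
		memset(buf, 0, size);
	return buf;
}

static void free_big_buffer(void *buf, unsigned long size)
{
	/* vmalloc memory lives in its own address window, so the pointer
	 * itself tells us which allocator to return the buffer to. */
	if (buf >= (void *)VMALLOC_START && buf < (void *)VMALLOC_END)
		vfree(buf);
	else
		free_pages((unsigned long)buf, get_order(size));
}
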
index 7771d28..b895128 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,7 +39,7 @@ int page_cluster;
 void put_page(struct page *page)
 {
        if (unlikely(PageCompound(page))) {
-               page = (struct page *)page->private;
+               page = (struct page *)page_private(page);
                if (put_page_testzero(page)) {
                        void (*dtor)(struct page *page);
 
@@ -48,7 +48,7 @@ void put_page(struct page *page)
                }
                return;
        }
-       if (!PageReserved(page) && put_page_testzero(page))
+       if (put_page_testzero(page))
                __page_cache_release(page);
 }
 EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
                struct page *page = pages[i];
                struct zone *pagezone;
 
-               if (PageReserved(page) || !put_page_testzero(page))
+               if (!put_page_testzero(page))
                        continue;
 
                pagezone = page_zone(page);
index 132164f..dfd9a46 100644 (file)
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
                        page_cache_get(page);
                        SetPageLocked(page);
                        SetPageSwapCache(page);
-                       page->private = entry.val;
+                       set_page_private(page, entry.val);
                        total_swapcache_pages++;
                        pagecache_acct(1);
                }
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
        BUG_ON(PageWriteback(page));
        BUG_ON(PagePrivate(page));
 
-       radix_tree_delete(&swapper_space.page_tree, page->private);
-       page->private = 0;
+       radix_tree_delete(&swapper_space.page_tree, page_private(page));
+       set_page_private(page, 0);
        ClearPageSwapCache(page);
        total_swapcache_pages--;
        pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
 {
        swp_entry_t entry;
 
-       entry.val = page->private;
+       entry.val = page_private(page);
 
        write_lock_irq(&swapper_space.tree_lock);
        __delete_from_swap_cache(page);
@@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page)
 
 /* 
  * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page. Can not do a lock_page,
- * as we are holding the page_table_lock spinlock.
+ * this page if it is the last user of the page.
  */
 void free_page_and_swap_cache(struct page *page)
 {
index 1dcaeda..8970c0b 100644 (file)
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
        swp_entry_t entry;
 
        down_read(&swap_unplug_sem);
-       entry.val = page->private;
+       entry.val = page_private(page);
        if (PageSwapCache(page)) {
                struct block_device *bdev = swap_info[swp_type(entry)].bdev;
                struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
                /*
                 * If the page is removed from swapcache from under us (with a
                 * racy try_to_unuse/swapoff) we need an additional reference
-                * count to avoid reading garbage from page->private above. If
-                * the WARN_ON triggers during a swapoff it maybe the race
+                * count to avoid reading garbage from page_private(page) above.
+                * If the WARN_ON triggers during a swapoff it may be the race
                 * condition and it's harmless. However if it triggers without
                 * swapoff it signals a problem.
                 */
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
        struct swap_info_struct *p;
        swp_entry_t entry;
 
-       entry.val = page->private;
+       entry.val = page_private(page);
        p = swap_info_get(entry);
        if (p) {
                /* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
        if (page_count(page) != 2) /* 2: us + cache */
                return 0;
 
-       entry.val = page->private;
+       entry.val = page_private(page);
        p = swap_info_get(entry);
        if (!p)
                return 0;
@@ -398,17 +398,14 @@ void free_swap_and_cache(swp_entry_t entry)
 }
 
 /*
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited).  We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
- *
- * vma->vm_mm->page_table_lock is held.
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
  */
 static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
-       inc_mm_counter(vma->vm_mm, rss);
+       inc_mm_counter(vma->vm_mm, anon_rss);
        get_page(page);
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -425,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                swp_entry_t entry, struct page *page)
 {
-       pte_t *pte;
        pte_t swp_pte = swp_entry_to_pte(entry);
+       pte_t *pte;
+       spinlock_t *ptl;
+       int found = 0;
 
-       pte = pte_offset_map(pmd, addr);
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                /*
                 * swapoff spends a _lot_ of time in this loop!
                 * Test inline before going to call unuse_pte.
                 */
                if (unlikely(pte_same(*pte, swp_pte))) {
-                       unuse_pte(vma, pte, addr, entry, page);
-                       pte_unmap(pte);
-                       return 1;
+                       unuse_pte(vma, pte++, addr, entry, page);
+                       found = 1;
+                       break;
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
-       pte_unmap(pte - 1);
-       return 0;
+       pte_unmap_unlock(pte - 1, ptl);
+       return found;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -523,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm,
                down_read(&mm->mmap_sem);
                lock_page(page);
        }
-       spin_lock(&mm->page_table_lock);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (vma->anon_vma && unuse_vma(vma, entry, page))
                        break;
        }
-       spin_unlock(&mm->page_table_lock);
        up_read(&mm->mmap_sem);
        /*
         * Currently unuse_mm cannot fail, but leave error handling
@@ -1045,7 +1042,7 @@ int page_queue_congested(struct page *page)
        BUG_ON(!PageLocked(page));      /* It pins the swap_info_struct */
 
        if (PageSwapCache(page)) {
-               swp_entry_t entry = { .val = page->private };
+               swp_entry_t entry = { .val = page_private(page) };
                struct swap_info_struct *sis;
 
                sis = get_swap_info_struct(swp_type(entry));
index 11461f7..eff3c18 100644 (file)
@@ -19,7 +19,7 @@ static unsigned long swap_token_check;
 struct mm_struct * swap_token_mm = &init_mm;
 
 #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT     0
+#define SWAP_TOKEN_TIMEOUT     (300 * HZ)
 /*
  * Currently disabled; Needs further code to work at HZ * 300.
  */
index 1150229..54a90e8 100644 (file)
@@ -5,6 +5,7 @@
  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ *  Numa awareness, Christoph Lameter, SGI, June 2005
  */
 
 #include <linux/mm.h>
@@ -88,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
 {
        pte_t *pte;
 
-       pte = pte_alloc_kernel(&init_mm, pmd, addr);
+       pte = pte_alloc_kernel(pmd, addr);
        if (!pte)
                return -ENOMEM;
        do {
@@ -146,20 +147,18 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 
        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
-       spin_lock(&init_mm.page_table_lock);
        do {
                next = pgd_addr_end(addr, end);
                err = vmap_pud_range(pgd, addr, next, prot, pages);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
-       spin_unlock(&init_mm.page_table_lock);
        flush_cache_vmap((unsigned long) area->addr, end);
        return err;
 }
 
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
-                               unsigned long start, unsigned long end)
+struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
+                               unsigned long start, unsigned long end, int node)
 {
        struct vm_struct **p, *tmp, *area;
        unsigned long align = 1;
@@ -178,7 +177,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
        addr = ALIGN(start, align);
        size = PAGE_ALIGN(size);
 
-       area = kmalloc(sizeof(*area), GFP_KERNEL);
+       area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
        if (unlikely(!area))
                return NULL;
 
@@ -231,6 +230,12 @@ out:
        return NULL;
 }
 
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+                               unsigned long start, unsigned long end)
+{
+       return __get_vm_area_node(size, flags, start, end, -1);
+}
+
 /**
 *     get_vm_area  -  reserve a contiguous kernel virtual area
  *
@@ -246,6 +251,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
        return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
 }
 
+struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
+{
+       return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
+}
+
 /* Caller must hold vmlist_lock */
 struct vm_struct *__remove_vm_area(void *addr)
 {
@@ -342,7 +352,6 @@ void vfree(void *addr)
        BUG_ON(in_interrupt());
        __vunmap(addr, 1);
 }
-
 EXPORT_SYMBOL(vfree);
 
 /**
@@ -360,7 +369,6 @@ void vunmap(void *addr)
        BUG_ON(in_interrupt());
        __vunmap(addr, 0);
 }
-
 EXPORT_SYMBOL(vunmap);
 
 /**
@@ -392,10 +400,10 @@ void *vmap(struct page **pages, unsigned int count,
 
        return area->addr;
 }
-
 EXPORT_SYMBOL(vmap);
 
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+                               pgprot_t prot, int node)
 {
        struct page **pages;
        unsigned int nr_pages, array_size, i;
@@ -406,9 +414,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
        area->nr_pages = nr_pages;
        /* Please note that the recursion is strictly bounded. */
        if (array_size > PAGE_SIZE)
-               pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL);
+               pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
        else
-               pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
+               pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
        area->pages = pages;
        if (!area->pages) {
                remove_vm_area(area->addr);
@@ -418,7 +426,10 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
        memset(area->pages, 0, array_size);
 
        for (i = 0; i < area->nr_pages; i++) {
-               area->pages[i] = alloc_page(gfp_mask);
+               if (node < 0)
+                       area->pages[i] = alloc_page(gfp_mask);
+               else
+                       area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
                if (unlikely(!area->pages[i])) {
                        /* Successfully allocated i pages, free them in __vunmap() */
                        area->nr_pages = i;
@@ -435,18 +446,25 @@ fail:
        return NULL;
 }
 
+void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+{
+       return __vmalloc_area_node(area, gfp_mask, prot, -1);
+}
+
 /**
- *     __vmalloc  -  allocate virtually contiguous memory
+ *     __vmalloc_node  -  allocate virtually contiguous memory
  *
  *     @size:          allocation size
  *     @gfp_mask:      flags for the page level allocator
  *     @prot:          protection mask for the allocated pages
+ *     @node:          node to use for allocation or -1
  *
  *     Allocate enough pages to cover @size from the page level
  *     allocator with @gfp_mask flags.  Map them into contiguous
  *     kernel virtual space, using a pagetable protection of @prot.
  */
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+                       int node)
 {
        struct vm_struct *area;
 
@@ -454,13 +472,18 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
        if (!size || (size >> PAGE_SHIFT) > num_physpages)
                return NULL;
 
-       area = get_vm_area(size, VM_ALLOC);
+       area = get_vm_area_node(size, VM_ALLOC, node);
        if (!area)
                return NULL;
 
-       return __vmalloc_area(area, gfp_mask, prot);
+       return __vmalloc_area_node(area, gfp_mask, prot, node);
 }
+EXPORT_SYMBOL(__vmalloc_node);
 
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+{
+       return __vmalloc_node(size, gfp_mask, prot, -1);
+}
 EXPORT_SYMBOL(__vmalloc);
 
 /**
@@ -478,9 +501,26 @@ void *vmalloc(unsigned long size)
 {
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
-
 EXPORT_SYMBOL(vmalloc);
 
+/**
+ *     vmalloc_node  -  allocate memory on a specific node
+ *
+ *     @size:          allocation size
+ *	@node:		NUMA node
+ *
+ *     Allocate enough pages to cover @size from the page level
+ *     allocator and map them into contiguous kernel virtual space.
+ *
+ *	For tight control over the page level allocator and protection flags
+ *     use __vmalloc() instead.
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+       return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+}
+EXPORT_SYMBOL(vmalloc_node);
+
 #ifndef PAGE_KERNEL_EXEC
 # define PAGE_KERNEL_EXEC PAGE_KERNEL
 #endif
@@ -515,7 +555,6 @@ void *vmalloc_32(unsigned long size)
 {
        return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
 }
-
 EXPORT_SYMBOL(vmalloc_32);
 
 long vread(char *buf, char *addr, unsigned long count)
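
Editor's note on the mm/vmalloc.c hunks above: they add a NUMA-aware allocation path (__get_vm_area_node(), get_vm_area_node(), __vmalloc_area_node(), __vmalloc_node() and vmalloc_node()), with a node argument of -1 meaning "no node preference". The sketch below is a hypothetical caller, not part of the patch; alloc_scratch() is an assumption used only to illustrate how the new interface relates to plain vmalloc().

	#include <linux/vmalloc.h>

	/*
	 * Hypothetical caller (illustration only): use the NUMA-aware
	 * interface when a node is known, otherwise fall back to the
	 * node-agnostic vmalloc().
	 */
	static void *alloc_scratch(unsigned long size, int node)
	{
		if (node >= 0)
			return vmalloc_node(size, node);	/* pages come from @node */
		return vmalloc(size);				/* no node preference */
	}
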
index 843c87d..135bf8c 100644 (file)
@@ -417,7 +417,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
                 */
-               if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
+               if (PageAnon(page) && !PageSwapCache(page)) {
+                       if (!sc->may_swap)
+                               goto keep_locked;
                        if (!add_to_swap(page))
                                goto activate_locked;
                }
@@ -519,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 
 #ifdef CONFIG_SWAP
                if (PageSwapCache(page)) {
-                       swp_entry_t swap = { .val = page->private };
+                       swp_entry_t swap = { .val = page_private(page) };
                        __delete_from_swap_cache(page);
                        write_unlock_irq(&mapping->tree_lock);
                        swap_free(swap);
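
Editor's note on the mm/vmscan.c hunk above: the open-coded read of page->private is replaced by the page_private() accessor introduced elsewhere in this merge. A rough sketch of what those accessors presumably look like in <linux/mm.h> (an assumption, shown only to make the substitution clear):

	/* Presumed shape of the accessors; not part of this diff. */
	#define page_private(page)		((page)->private)
	#define set_page_private(page, v)	((page)->private = (v))

The indirection means callers no longer touch the field directly, so its use can later be changed in one place.
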
index a970b47..41edc14 100644 (file)
@@ -75,7 +75,7 @@
 #ifdef CONFIG_IPV6_PRIVACY
 #include <linux/random.h>
 #include <linux/crypto.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #endif
 
 #include <asm/uaccess.h>
@@ -1217,12 +1217,8 @@ static int __ipv6_regen_rndid(struct inet6_dev *idev)
        struct net_device *dev;
        struct scatterlist sg[2];
 
-       sg[0].page = virt_to_page(idev->entropy);
-       sg[0].offset = offset_in_page(idev->entropy);
-       sg[0].length = 8;
-       sg[1].page = virt_to_page(idev->work_eui64);
-       sg[1].offset = offset_in_page(idev->work_eui64);
-       sg[1].length = 8;
+       sg_set_buf(&sg[0], idev->entropy, 8);
+       sg_set_buf(&sg[1], idev->work_eui64, 8);
 
        dev = idev->dev;
 
index 3f3d543..97c981f 100644 (file)
@@ -37,7 +37,7 @@
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/crypto.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
@@ -75,9 +75,7 @@ krb5_encrypt(
                memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm));
 
        memcpy(out, in, length);
-       sg[0].page = virt_to_page(out);
-       sg[0].offset = offset_in_page(out);
-       sg[0].length = length;
+       sg_set_buf(sg, out, length);
 
        ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv);
 
@@ -117,9 +115,7 @@ krb5_decrypt(
                memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm));
 
        memcpy(out, in, length);
-       sg[0].page = virt_to_page(out);
-       sg[0].offset = offset_in_page(out);
-       sg[0].length = length;
+       sg_set_buf(sg, out, length);
 
        ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv);
 
@@ -132,13 +128,6 @@ out:
 
 EXPORT_SYMBOL(krb5_decrypt);
 
-static void
-buf_to_sg(struct scatterlist *sg, char *ptr, int len) {
-       sg->page = virt_to_page(ptr);
-       sg->offset = offset_in_page(ptr);
-       sg->length = len;
-}
-
 static int
 process_xdr_buf(struct xdr_buf *buf, int offset, int len,
                int (*actor)(struct scatterlist *, void *), void *data)
@@ -152,7 +141,7 @@ process_xdr_buf(struct xdr_buf *buf, int offset, int len,
                thislen = buf->head[0].iov_len - offset;
                if (thislen > len)
                        thislen = len;
-               buf_to_sg(sg, buf->head[0].iov_base + offset, thislen);
+               sg_set_buf(sg, buf->head[0].iov_base + offset, thislen);
                ret = actor(sg, data);
                if (ret)
                        goto out;
@@ -195,7 +184,7 @@ process_xdr_buf(struct xdr_buf *buf, int offset, int len,
                thislen = buf->tail[0].iov_len - offset;
                if (thislen > len)
                        thislen = len;
-               buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen);
+               sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen);
                ret = actor(sg, data);
                len -= thislen;
        }
@@ -241,7 +230,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
                goto out;
 
        crypto_digest_init(tfm);
-       buf_to_sg(sg, header, hdrlen);
+       sg_set_buf(sg, header, hdrlen);
        crypto_digest_update(tfm, sg, 1);
        process_xdr_buf(body, body_offset, body->len - body_offset,
                        checksummer, tfm);
index 67abeba..e97b2d1 100644 (file)
@@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns
                return NOPAGE_OOM;
        runtime = substream->runtime;
        page = virt_to_page(runtime->status);
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
        if (type)
                *type = VM_FAULT_MINOR;
        return page;
@@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un
                return NOPAGE_OOM;
        runtime = substream->runtime;
        page = virt_to_page(runtime->control);
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
        if (type)
                *type = VM_FAULT_MINOR;
        return page;
@@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign
                vaddr = runtime->dma_area + offset;
                page = virt_to_page(vaddr);
        }
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
        if (type)
                *type = VM_FAULT_MINOR;
        return page;