Merge branch 'upstream-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/linvil...

[pandora-kernel.git] / mm / mempolicy.c
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 3171f88..67af4ce 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -95,6 +95,9 @@
  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)         /* Invert check for nodemask */
  #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)          /* Gather statistics */
  
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
  static kmem_cache_t *policy_cache;
  static kmem_cache_t *sn_cache;
  
@@ -129,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
         }
         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
  }
+
  /* Generate a custom zonelist for the BIND policy. */
  static struct zonelist *bind_zonelist(nodemask_t *nodes)
  {
         struct zonelist *zl;
-       int num, max, nd;
+       int num, max, nd, k;
  
         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-       zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+       zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
         if (!zl)
                 return NULL;
         num = 0;
-       for_each_node_mask(nd, *nodes)
-               zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+       /* First put in the highest zones from all nodes, then all the next 
+          lower zones etc. Avoid empty zones because the memory allocator
+          doesn't like them. If you implement node hot removal you
+          have to fix that. */
+       for (k = policy_zone; k >= 0; k--) { 
+               for_each_node_mask(nd, *nodes) { 
+                       struct zone *z = &NODE_DATA(nd)->node_zones[k];
+                       if (z->present_pages > 0) 
+                               zl->zones[num++] = z;
+               }
+       }
         zl->zones[num] = NULL;
         return zl;
  }
@@ -185,8 +198,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
  }
  
  static void gather_stats(struct page *, void *);
-static void migrate_page_add(struct vm_area_struct *vma,
-       struct page *page, struct list_head *pagelist, unsigned long flags);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags);
  
  /* Scan through pages checking if pages follow certain conditions. */
  static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -208,6 +221,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                 page = vm_normal_page(vma, addr, *pte);
                 if (!page)
                         continue;
+               /*
+                * The check for PageReserved here is important to avoid
+                * handling zero pages and other pages that may have been
+                * marked special by the system.
+                *
+                * If the PageReserved would not be checked here then f.e.
+                * the location of the zero page could have an influence
+                * on MPOL_MF_STRICT, zero pages would be counted for
+                * the per node stats, and there would be useless attempts
+                * to put zero pages on the migration list.
+                */
                 if (PageReserved(page))
                         continue;
                 nid = page_to_nid(page);
@@ -216,11 +240,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
  
                 if (flags & MPOL_MF_STATS)
                         gather_stats(page, private);
-               else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-                       spin_unlock(ptl);
-                       migrate_page_add(vma, page, private, flags);
-                       spin_lock(ptl);
-               }
+               else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+                       migrate_page_add(page, private, flags);
                 else
                         break;
         } while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +330,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         int err;
         struct vm_area_struct *first, *vma, *prev;
  
+       /* Clear the LRU lists so pages can be isolated */
+       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+               lru_add_drain_all();
+
         first = find_vma(mm, start);
         if (!first)
                 return ERR_PTR(-EFAULT);
@@ -519,72 +544,117 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
   * page migration
   */
  
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
-                       struct address_space *mapping)
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags)
  {
-       struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
-       int rc = 1;
+       /*
+        * Avoid migrating a page that is shared with others.
+        */
+       if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+               if (isolate_lru_page(page))
+                       list_add_tail(&page->lru, pagelist);
+       }
+}
  
-       spin_lock(&mapping->i_mmap_lock);
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
-               if (mm != vma->vm_mm) {
-                       rc = 0;
-                       goto out;
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest >= 0
+ * Return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+                       struct vm_area_struct *vma, int dest)
+{
+       LIST_HEAD(newlist);
+       LIST_HEAD(moved);
+       LIST_HEAD(failed);
+       int err = 0;
+       unsigned long offset = 0;
+       int nr_pages;
+       struct page *page;
+       struct list_head *p;
+
+redo:
+       nr_pages = 0;
+       list_for_each(p, pagelist) {
+               if (vma) {
+                       /*
+                        * The address passed to alloc_page_vma is used to
+                        * generate the proper interleave behavior. We fake
+                        * the address here by an increasing offset in order
+                        * to get the proper distribution of pages.
+                        *
+                        * No decision has been made as to which page
+                        * a certain old page is moved to so we cannot
+                        * specify the correct address.
+                        */
+                       page = alloc_page_vma(GFP_HIGHUSER, vma,
+                                       offset + vma->vm_start);
+                       offset += PAGE_SIZE;
                 }
-       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
-               if (mm != vma->vm_mm) {
-                       rc = 0;
+               else
+                       page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+               if (!page) {
+                       err = -ENOMEM;
                         goto out;
                 }
+               list_add_tail(&page->lru, &newlist);
+               nr_pages++;
+               if (nr_pages > MIGRATE_CHUNK_SIZE)
+                       break;
+       }
+       err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+       putback_lru_pages(&moved);      /* Call release pages instead ?? */
+
+       if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+               goto redo;
  out:
-       spin_unlock(&mapping->i_mmap_lock);
-       return rc;
+       /* Return leftover allocated pages */
+       while (!list_empty(&newlist)) {
+               page = list_entry(newlist.next, struct page, lru);
+               list_del(&page->lru);
+               __free_page(page);
+       }
+       list_splice(&failed, pagelist);
+       if (err < 0)
+               return err;
+
+       /* Calculate number of leftover pages */
+       nr_pages = 0;
+       list_for_each(p, pagelist)
+               nr_pages++;
+       return nr_pages;
  }
  
  /*
- * Add a page to be migrated to the pagelist
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
   */
-static void migrate_page_add(struct vm_area_struct *vma,
-       struct page *page, struct list_head *pagelist, unsigned long flags)
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
  {
-       /*
-        * Avoid migrating a page that is shared by others and not writable.
-        */
-       if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
-           mapping_writably_mapped(page->mapping) ||
-           single_mm_mapping(vma->vm_mm, page->mapping)) {
-               int rc = isolate_lru_page(page);
-
-               if (rc == 1)
-                       list_add(&page->lru, pagelist);
-               /*
-                * If the isolate attempt was not successful then we just
-                * encountered an unswappable page. Something must be wrong.
-                */
-               WARN_ON(rc == 0);
-       }
-}
+       nodemask_t nmask;
+       LIST_HEAD(pagelist);
+       int err = 0;
  
-static int swap_pages(struct list_head *pagelist)
-{
-       LIST_HEAD(moved);
-       LIST_HEAD(failed);
-       int n;
+       nodes_clear(nmask);
+       node_set(source, nmask);
  
-       n = migrate_pages(pagelist, NULL, &moved, &failed);
-       putback_lru_pages(&failed);
-       putback_lru_pages(&moved);
+       check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+                       flags | MPOL_MF_DISCONTIG_OK, &pagelist);
  
-       return n;
+       if (!list_empty(&pagelist)) {
+               err = migrate_pages_to(&pagelist, NULL, dest);
+               if (!list_empty(&pagelist))
+                       putback_lru_pages(&pagelist);
+       }
+       return err;
  }
  
  /*
- * For now migrate_pages simply swaps out the pages from nodes that are in
- * the source set but not in the target set. In the future, we would
- * want a function that moves pages between the two nodesets in such
- * a way as to preserve the physical layout as much as possible.
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
   *
   * Returns the number of page that could not be moved.
   */
@@ -592,22 +662,76 @@ int do_migrate_pages(struct mm_struct *mm,
         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
  {
         LIST_HEAD(pagelist);
-       int count = 0;
-       nodemask_t nodes;
+       int busy = 0;
+       int err = 0;
+       nodemask_t tmp;
  
-       nodes_andnot(nodes, *from_nodes, *to_nodes);
+       down_read(&mm->mmap_sem);
  
-       down_read(&mm->mmap_sem);
-       check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
-                       flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+/*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same.  If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory from that same node.
+ *
+ * A single scan of tmp is sufficient.  As we go, we remember the
+ * most recent <s, d> pair that moved (s != d).  If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scanning tmp, we at least have the
+ * most recent <s, d> pair that moved.  If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
  
-       if (!list_empty(&pagelist)) {
-               count = swap_pages(&pagelist);
-               putback_lru_pages(&pagelist);
+       tmp = *from_nodes;
+       while (!nodes_empty(tmp)) {
+               int s,d;
+               int source = -1;
+               int dest = 0;
+
+               for_each_node_mask(s, tmp) {
+                       d = node_remap(s, *from_nodes, *to_nodes);
+                       if (s == d)
+                               continue;
+
+                       source = s;     /* Node moved. Memorize */
+                       dest = d;
+
+                       /* dest not in remaining from nodes? */
+                       if (!node_isset(dest, tmp))
+                               break;
+               }
+               if (source == -1)
+                       break;
+
+               node_clear(source, tmp);
+               err = migrate_to_node(mm, source, dest, flags);
+               if (err > 0)
+                       busy += err;
+               if (err < 0)
+                       break;
         }
  
         up_read(&mm->mmap_sem);
-       return count;
+       if (err < 0)
+               return err;
+       return busy;
  }
  
  long do_mbind(unsigned long start, unsigned long len,
@@ -667,8 +791,9 @@ long do_mbind(unsigned long start, unsigned long len,
                 int nr_failed = 0;
  
                 err = mbind_range(vma, start, end, new);
+
                 if (!list_empty(&pagelist))
-                       nr_failed = swap_pages(&pagelist);
+                       nr_failed = migrate_pages_to(&pagelist, vma, -1);
  
                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                         err = -EIO;
@@ -697,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
         nodes_clear(*nodes);
         if (maxnode == 0 || !nmask)
                 return 0;
+       if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+               return -EINVAL;
  
         nlongs = BITS_TO_LONGS(maxnode);
         if ((maxnode % BITS_PER_LONG) == 0)
@@ -1000,6 +1127,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
         return nid;
  }
  
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+       switch (policy->policy) {
+       case MPOL_INTERLEAVE:
+               return interleave_nodes(policy);
+
+       case MPOL_BIND:
+               /*
+                * Follow bind policy behavior and start allocation at the
+                * first node.
+                */
+               return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+       case MPOL_PREFERRED:
+               if (policy->v.preferred_node >= 0)
+                       return policy->v.preferred_node;
+               /* Fall through */
+
+       default:
+               return numa_node_id();
+       }
+}
+
  /* Do static interleaving for a VMA with known offset. */
  static unsigned offset_il_node(struct mempolicy *pol,
                 struct vm_area_struct *vma, unsigned long off)
@@ -1031,6 +1185,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
                 return interleave_nodes(pol);
  }
  
+#ifdef CONFIG_HUGETLBFS
  /* Return a zonelist suitable for a huge page allocation. */
  struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
  {
@@ -1044,6 +1199,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
         }
         return zonelist_policy(GFP_HIGHUSER, pol);
  }
+#endif
  
  /* Allocate a page in interleaved policy.
     Own path because it needs to do special accounting. */