mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 #define PDprintk(fmt...)
 105
 106 /* Highest zone. An specific allocation for a zone below that is not
 107    policied. */
 108 enum zone_type policy_zone = 0;
 109
 110 struct mempolicy default_policy = {
 111         .refcnt = ATOMIC_INIT(1), /* never free it */
 112         .policy = MPOL_DEFAULT,
 113 };
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135
 136 /* Generate a custom zonelist for the BIND policy. */
 137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138 {
 139         struct zonelist *zl;
 140         int num, max, nd;
 141         enum zone_type k;
 142
 143         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 144         max++;                  /* space for zlcache_ptr (see mmzone.h) */
 145         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 146         if (!zl)
 147                 return NULL;
 148         zl->zlcache_ptr = NULL;
 149         num = 0;
 150         /* First put in the highest zones from all nodes, then all the next
 151            lower zones etc. Avoid empty zones because the memory allocator
 152            doesn't like them. If you implement node hot removal you
 153            have to fix that. */
 154         k = policy_zone;
 155         while (1) {
 156                 for_each_node_mask(nd, *nodes) {
 157                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 158                         if (z->present_pages > 0)
 159                                 zl->zones[num++] = z;
 160                 }
 161                 if (k == 0)
 162                         break;
 163                 k--;
 164         }
 165         zl->zones[num] = NULL;
 166         return zl;
 167 }
 168
 169 /* Create a new policy */
 170 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 171 {
 172         struct mempolicy *policy;
 173
 174         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 175         if (mode == MPOL_DEFAULT)
 176                 return NULL;
 177         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 178         if (!policy)
 179                 return ERR_PTR(-ENOMEM);
 180         atomic_set(&policy->refcnt, 1);
 181         switch (mode) {
 182         case MPOL_INTERLEAVE:
 183                 policy->v.nodes = *nodes;
 184                 if (nodes_weight(*nodes) == 0) {
 185                         kmem_cache_free(policy_cache, policy);
 186                         return ERR_PTR(-EINVAL);
 187                 }
 188                 break;
 189         case MPOL_PREFERRED:
 190                 policy->v.preferred_node = first_node(*nodes);
 191                 if (policy->v.preferred_node >= MAX_NUMNODES)
 192                         policy->v.preferred_node = -1;
 193                 break;
 194         case MPOL_BIND:
 195                 policy->v.zonelist = bind_zonelist(nodes);
 196                 if (policy->v.zonelist == NULL) {
 197                         kmem_cache_free(policy_cache, policy);
 198                         return ERR_PTR(-ENOMEM);
 199                 }
 200                 break;
 201         }
 202         policy->policy = mode;
 203         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 204         return policy;
 205 }
 206
 207 static void gather_stats(struct page *, void *, int pte_dirty);
 208 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 209                                 unsigned long flags);
 210
 211 /* Scan through pages checking if pages follow certain conditions. */
 212 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 213                 unsigned long addr, unsigned long end,
 214                 const nodemask_t *nodes, unsigned long flags,
 215                 void *private)
 216 {
 217         pte_t *orig_pte;
 218         pte_t *pte;
 219         spinlock_t *ptl;
 220
 221         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 222         do {
 223                 struct page *page;
 224                 int nid;
 225
 226                 if (!pte_present(*pte))
 227                         continue;
 228                 page = vm_normal_page(vma, addr, *pte);
 229                 if (!page)
 230                         continue;
 231                 /*
 232                  * The check for PageReserved here is important to avoid
 233                  * handling zero pages and other pages that may have been
 234                  * marked special by the system.
 235                  *
 236                  * If the PageReserved would not be checked here then f.e.
 237                  * the location of the zero page could have an influence
 238                  * on MPOL_MF_STRICT, zero pages would be counted for
 239                  * the per node stats, and there would be useless attempts
 240                  * to put zero pages on the migration list.
 241                  */
 242                 if (PageReserved(page))
 243                         continue;
 244                 nid = page_to_nid(page);
 245                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 246                         continue;
 247
 248                 if (flags & MPOL_MF_STATS)
 249                         gather_stats(page, private, pte_dirty(*pte));
 250                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 251                         migrate_page_add(page, private, flags);
 252                 else
 253                         break;
 254         } while (pte++, addr += PAGE_SIZE, addr != end);
 255         pte_unmap_unlock(orig_pte, ptl);
 256         return addr != end;
 257 }
 258
 259 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 260                 unsigned long addr, unsigned long end,
 261                 const nodemask_t *nodes, unsigned long flags,
 262                 void *private)
 263 {
 264         pmd_t *pmd;
 265         unsigned long next;
 266
 267         pmd = pmd_offset(pud, addr);
 268         do {
 269                 next = pmd_addr_end(addr, end);
 270                 if (pmd_none_or_clear_bad(pmd))
 271                         continue;
 272                 if (check_pte_range(vma, pmd, addr, next, nodes,
 273                                     flags, private))
 274                         return -EIO;
 275         } while (pmd++, addr = next, addr != end);
 276         return 0;
 277 }
 278
 279 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 280                 unsigned long addr, unsigned long end,
 281                 const nodemask_t *nodes, unsigned long flags,
 282                 void *private)
 283 {
 284         pud_t *pud;
 285         unsigned long next;
 286
 287         pud = pud_offset(pgd, addr);
 288         do {
 289                 next = pud_addr_end(addr, end);
 290                 if (pud_none_or_clear_bad(pud))
 291                         continue;
 292                 if (check_pmd_range(vma, pud, addr, next, nodes,
 293                                     flags, private))
 294                         return -EIO;
 295         } while (pud++, addr = next, addr != end);
 296         return 0;
 297 }
 298
 299 static inline int check_pgd_range(struct vm_area_struct *vma,
 300                 unsigned long addr, unsigned long end,
 301                 const nodemask_t *nodes, unsigned long flags,
 302                 void *private)
 303 {
 304         pgd_t *pgd;
 305         unsigned long next;
 306
 307         pgd = pgd_offset(vma->vm_mm, addr);
 308         do {
 309                 next = pgd_addr_end(addr, end);
 310                 if (pgd_none_or_clear_bad(pgd))
 311                         continue;
 312                 if (check_pud_range(vma, pgd, addr, next, nodes,
 313                                     flags, private))
 314                         return -EIO;
 315         } while (pgd++, addr = next, addr != end);
 316         return 0;
 317 }
 318
 319 /* Check if a vma is migratable */
 320 static inline int vma_migratable(struct vm_area_struct *vma)
 321 {
 322         if (vma->vm_flags & (
 323                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 324                 return 0;
 325         return 1;
 326 }
 327
 328 /*
 329  * Check if all pages in a range are on a set of nodes.
 330  * If pagelist != NULL then isolate pages from the LRU and
 331  * put them on the pagelist.
 332  */
 333 static struct vm_area_struct *
 334 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 335                 const nodemask_t *nodes, unsigned long flags, void *private)
 336 {
 337         int err;
 338         struct vm_area_struct *first, *vma, *prev;
 339
 340         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 341
 342                 err = migrate_prep();
 343                 if (err)
 344                         return ERR_PTR(err);
 345         }
 346
 347         first = find_vma(mm, start);
 348         if (!first)
 349                 return ERR_PTR(-EFAULT);
 350         prev = NULL;
 351         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 352                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 353                         if (!vma->vm_next && vma->vm_end < end)
 354                                 return ERR_PTR(-EFAULT);
 355                         if (prev && prev->vm_end < vma->vm_start)
 356                                 return ERR_PTR(-EFAULT);
 357                 }
 358                 if (!is_vm_hugetlb_page(vma) &&
 359                     ((flags & MPOL_MF_STRICT) ||
 360                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 361                                 vma_migratable(vma)))) {
 362                         unsigned long endvma = vma->vm_end;
 363
 364                         if (endvma > end)
 365                                 endvma = end;
 366                         if (vma->vm_start > start)
 367                                 start = vma->vm_start;
 368                         err = check_pgd_range(vma, start, endvma, nodes,
 369                                                 flags, private);
 370                         if (err) {
 371                                 first = ERR_PTR(err);
 372                                 break;
 373                         }
 374                 }
 375                 prev = vma;
 376         }
 377         return first;
 378 }
 379
 380 /* Apply policy to a single VMA */
 381 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 382 {
 383         int err = 0;
 384         struct mempolicy *old = vma->vm_policy;
 385
 386         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 387                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 388                  vma->vm_ops, vma->vm_file,
 389                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 390
 391         if (vma->vm_ops && vma->vm_ops->set_policy)
 392                 err = vma->vm_ops->set_policy(vma, new);
 393         if (!err) {
 394                 mpol_get(new);
 395                 vma->vm_policy = new;
 396                 mpol_free(old);
 397         }
 398         return err;
 399 }
 400
 401 /* Step 2: apply policy to a range and do splits. */
 402 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 403                        unsigned long end, struct mempolicy *new)
 404 {
 405         struct vm_area_struct *next;
 406         int err;
 407
 408         err = 0;
 409         for (; vma && vma->vm_start < end; vma = next) {
 410                 next = vma->vm_next;
 411                 if (vma->vm_start < start)
 412                         err = split_vma(vma->vm_mm, vma, start, 1);
 413                 if (!err && vma->vm_end > end)
 414                         err = split_vma(vma->vm_mm, vma, end, 0);
 415                 if (!err)
 416                         err = policy_vma(vma, new);
 417                 if (err)
 418                         break;
 419         }
 420         return err;
 421 }
 422
 423 static int contextualize_policy(int mode, nodemask_t *nodes)
 424 {
 425         if (!nodes)
 426                 return 0;
 427
 428         cpuset_update_task_memory_state();
 429         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 430                 return -EINVAL;
 431         return mpol_check_policy(mode, nodes);
 432 }
 433
 434
 435 /*
 436  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 437  * mempolicy.  Allows more rapid checking of this (combined perhaps
 438  * with other PF_* flag bits) on memory allocation hot code paths.
 439  *
 440  * If called from outside this file, the task 'p' should -only- be
 441  * a newly forked child not yet visible on the task list, because
 442  * manipulating the task flags of a visible task is not safe.
 443  *
 444  * The above limitation is why this routine has the funny name
 445  * mpol_fix_fork_child_flag().
 446  *
 447  * It is also safe to call this with a task pointer of current,
 448  * which the static wrapper mpol_set_task_struct_flag() does,
 449  * for use within this file.
 450  */
 451
 452 void mpol_fix_fork_child_flag(struct task_struct *p)
 453 {
 454         if (p->mempolicy)
 455                 p->flags |= PF_MEMPOLICY;
 456         else
 457                 p->flags &= ~PF_MEMPOLICY;
 458 }
 459
 460 static void mpol_set_task_struct_flag(void)
 461 {
 462         mpol_fix_fork_child_flag(current);
 463 }
 464
 465 /* Set the process memory policy */
 466 long do_set_mempolicy(int mode, nodemask_t *nodes)
 467 {
 468         struct mempolicy *new;
 469
 470         if (contextualize_policy(mode, nodes))
 471                 return -EINVAL;
 472         new = mpol_new(mode, nodes);
 473         if (IS_ERR(new))
 474                 return PTR_ERR(new);
 475         mpol_free(current->mempolicy);
 476         current->mempolicy = new;
 477         mpol_set_task_struct_flag();
 478         if (new && new->policy == MPOL_INTERLEAVE)
 479                 current->il_next = first_node(new->v.nodes);
 480         return 0;
 481 }
 482
 483 /* Fill a zone bitmap for a policy */
 484 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 485 {
 486         int i;
 487
 488         nodes_clear(*nodes);
 489         switch (p->policy) {
 490         case MPOL_BIND:
 491                 for (i = 0; p->v.zonelist->zones[i]; i++)
 492                         node_set(zone_to_nid(p->v.zonelist->zones[i]),
 493                                 *nodes);
 494                 break;
 495         case MPOL_DEFAULT:
 496                 break;
 497         case MPOL_INTERLEAVE:
 498                 *nodes = p->v.nodes;
 499                 break;
 500         case MPOL_PREFERRED:
 501                 /* or use current node instead of online map? */
 502                 if (p->v.preferred_node < 0)
 503                         *nodes = node_online_map;
 504                 else
 505                         node_set(p->v.preferred_node, *nodes);
 506                 break;
 507         default:
 508                 BUG();
 509         }
 510 }
 511
 512 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 513 {
 514         struct page *p;
 515         int err;
 516
 517         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 518         if (err >= 0) {
 519                 err = page_to_nid(p);
 520                 put_page(p);
 521         }
 522         return err;
 523 }
 524
 525 /* Retrieve NUMA policy */
 526 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 527                         unsigned long addr, unsigned long flags)
 528 {
 529         int err;
 530         struct mm_struct *mm = current->mm;
 531         struct vm_area_struct *vma = NULL;
 532         struct mempolicy *pol = current->mempolicy;
 533
 534         cpuset_update_task_memory_state();
 535         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 536                 return -EINVAL;
 537         if (flags & MPOL_F_ADDR) {
 538                 down_read(&mm->mmap_sem);
 539                 vma = find_vma_intersection(mm, addr, addr+1);
 540                 if (!vma) {
 541                         up_read(&mm->mmap_sem);
 542                         return -EFAULT;
 543                 }
 544                 if (vma->vm_ops && vma->vm_ops->get_policy)
 545                         pol = vma->vm_ops->get_policy(vma, addr);
 546                 else
 547                         pol = vma->vm_policy;
 548         } else if (addr)
 549                 return -EINVAL;
 550
 551         if (!pol)
 552                 pol = &default_policy;
 553
 554         if (flags & MPOL_F_NODE) {
 555                 if (flags & MPOL_F_ADDR) {
 556                         err = lookup_node(mm, addr);
 557                         if (err < 0)
 558                                 goto out;
 559                         *policy = err;
 560                 } else if (pol == current->mempolicy &&
 561                                 pol->policy == MPOL_INTERLEAVE) {
 562                         *policy = current->il_next;
 563                 } else {
 564                         err = -EINVAL;
 565                         goto out;
 566                 }
 567         } else
 568                 *policy = pol->policy;
 569
 570         if (vma) {
 571                 up_read(&current->mm->mmap_sem);
 572                 vma = NULL;
 573         }
 574
 575         err = 0;
 576         if (nmask)
 577                 get_zonemask(pol, nmask);
 578
 579  out:
 580         if (vma)
 581                 up_read(&current->mm->mmap_sem);
 582         return err;
 583 }
 584
 585 #ifdef CONFIG_MIGRATION
 586 /*
 587  * page migration
 588  */
 589 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 590                                 unsigned long flags)
 591 {
 592         /*
 593          * Avoid migrating a page that is shared with others.
 594          */
 595         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 596                 isolate_lru_page(page, pagelist);
 597 }
 598
 599 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 600 {
 601         return alloc_pages_node(node, GFP_HIGHUSER, 0);
 602 }
 603
 604 /*
 605  * Migrate pages from one node to a target node.
 606  * Returns error or the number of pages not migrated.
 607  */
 608 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 609 {
 610         nodemask_t nmask;
 611         LIST_HEAD(pagelist);
 612         int err = 0;
 613
 614         nodes_clear(nmask);
 615         node_set(source, nmask);
 616
 617         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 618                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 619
 620         if (!list_empty(&pagelist))
 621                 err = migrate_pages(&pagelist, new_node_page, dest);
 622
 623         return err;
 624 }
 625
 626 /*
 627  * Move pages between the two nodesets so as to preserve the physical
 628  * layout as much as possible.
 629  *
 630  * Returns the number of page that could not be moved.
 631  */
 632 int do_migrate_pages(struct mm_struct *mm,
 633         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 634 {
 635         LIST_HEAD(pagelist);
 636         int busy = 0;
 637         int err = 0;
 638         nodemask_t tmp;
 639
 640         down_read(&mm->mmap_sem);
 641
 642         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 643         if (err)
 644                 goto out;
 645
 646 /*
 647  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 648  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 649  * bit in 'tmp', and return that <source, dest> pair for migration.
 650  * The pair of nodemasks 'to' and 'from' define the map.
 651  *
 652  * If no pair of bits is found that way, fallback to picking some
 653  * pair of 'source' and 'dest' bits that are not the same.  If the
 654  * 'source' and 'dest' bits are the same, this represents a node
 655  * that will be migrating to itself, so no pages need move.
 656  *
 657  * If no bits are left in 'tmp', or if all remaining bits left
 658  * in 'tmp' correspond to the same bit in 'to', return false
 659  * (nothing left to migrate).
 660  *
 661  * This lets us pick a pair of nodes to migrate between, such that
 662  * if possible the dest node is not already occupied by some other
 663  * source node, minimizing the risk of overloading the memory on a
 664  * node that would happen if we migrated incoming memory to a node
 665  * before migrating outgoing memory source that same node.
 666  *
 667  * A single scan of tmp is sufficient.  As we go, we remember the
 668  * most recent <s, d> pair that moved (s != d).  If we find a pair
 669  * that not only moved, but what's better, moved to an empty slot
 670  * (d is not set in tmp), then we break out then, with that pair.
 671  * Otherwise when we finish scannng from_tmp, we at least have the
 672  * most recent <s, d> pair that moved.  If we get all the way through
 673  * the scan of tmp without finding any node that moved, much less
 674  * moved to an empty node, then there is nothing left worth migrating.
 675  */
 676
 677         tmp = *from_nodes;
 678         while (!nodes_empty(tmp)) {
 679                 int s,d;
 680                 int source = -1;
 681                 int dest = 0;
 682
 683                 for_each_node_mask(s, tmp) {
 684                         d = node_remap(s, *from_nodes, *to_nodes);
 685                         if (s == d)
 686                                 continue;
 687
 688                         source = s;     /* Node moved. Memorize */
 689                         dest = d;
 690
 691                         /* dest not in remaining from nodes? */
 692                         if (!node_isset(dest, tmp))
 693                                 break;
 694                 }
 695                 if (source == -1)
 696                         break;
 697
 698                 node_clear(source, tmp);
 699                 err = migrate_to_node(mm, source, dest, flags);
 700                 if (err > 0)
 701                         busy += err;
 702                 if (err < 0)
 703                         break;
 704         }
 705 out:
 706         up_read(&mm->mmap_sem);
 707         if (err < 0)
 708                 return err;
 709         return busy;
 710
 711 }
 712
 713 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 714 {
 715         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 716
 717         return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 718 }
 719 #else
 720
 721 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 722                                 unsigned long flags)
 723 {
 724 }
 725
 726 int do_migrate_pages(struct mm_struct *mm,
 727         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 728 {
 729         return -ENOSYS;
 730 }
 731
 732 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 733 {
 734         return NULL;
 735 }
 736 #endif
 737
 738 long do_mbind(unsigned long start, unsigned long len,
 739                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 740 {
 741         struct vm_area_struct *vma;
 742         struct mm_struct *mm = current->mm;
 743         struct mempolicy *new;
 744         unsigned long end;
 745         int err;
 746         LIST_HEAD(pagelist);
 747
 748         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 749                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 750             || mode > MPOL_MAX)
 751                 return -EINVAL;
 752         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 753                 return -EPERM;
 754
 755         if (start & ~PAGE_MASK)
 756                 return -EINVAL;
 757
 758         if (mode == MPOL_DEFAULT)
 759                 flags &= ~MPOL_MF_STRICT;
 760
 761         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 762         end = start + len;
 763
 764         if (end < start)
 765                 return -EINVAL;
 766         if (end == start)
 767                 return 0;
 768
 769         if (mpol_check_policy(mode, nmask))
 770                 return -EINVAL;
 771
 772         new = mpol_new(mode, nmask);
 773         if (IS_ERR(new))
 774                 return PTR_ERR(new);
 775
 776         /*
 777          * If we are using the default policy then operation
 778          * on discontinuous address spaces is okay after all
 779          */
 780         if (!new)
 781                 flags |= MPOL_MF_DISCONTIG_OK;
 782
 783         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 784                         mode,nodes_addr(nodes)[0]);
 785
 786         down_write(&mm->mmap_sem);
 787         vma = check_range(mm, start, end, nmask,
 788                           flags | MPOL_MF_INVERT, &pagelist);
 789
 790         err = PTR_ERR(vma);
 791         if (!IS_ERR(vma)) {
 792                 int nr_failed = 0;
 793
 794                 err = mbind_range(vma, start, end, new);
 795
 796                 if (!list_empty(&pagelist))
 797                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 798                                                 (unsigned long)vma);
 799
 800                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 801                         err = -EIO;
 802         }
 803
 804         up_write(&mm->mmap_sem);
 805         mpol_free(new);
 806         return err;
 807 }
 808
 809 /*
 810  * User space interface with variable sized bitmaps for nodelists.
 811  */
 812
 813 /* Copy a node mask from user space. */
 814 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 815                      unsigned long maxnode)
 816 {
 817         unsigned long k;
 818         unsigned long nlongs;
 819         unsigned long endmask;
 820
 821         --maxnode;
 822         nodes_clear(*nodes);
 823         if (maxnode == 0 || !nmask)
 824                 return 0;
 825         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 826                 return -EINVAL;
 827
 828         nlongs = BITS_TO_LONGS(maxnode);
 829         if ((maxnode % BITS_PER_LONG) == 0)
 830                 endmask = ~0UL;
 831         else
 832                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 833
 834         /* When the user specified more nodes than supported just check
 835            if the non supported part is all zero. */
 836         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 837                 if (nlongs > PAGE_SIZE/sizeof(long))
 838                         return -EINVAL;
 839                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 840                         unsigned long t;
 841                         if (get_user(t, nmask + k))
 842                                 return -EFAULT;
 843                         if (k == nlongs - 1) {
 844                                 if (t & endmask)
 845                                         return -EINVAL;
 846                         } else if (t)
 847                                 return -EINVAL;
 848                 }
 849                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 850                 endmask = ~0UL;
 851         }
 852
 853         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 854                 return -EFAULT;
 855         nodes_addr(*nodes)[nlongs-1] &= endmask;
 856         return 0;
 857 }
 858
 859 /* Copy a kernel node mask to user space */
 860 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 861                               nodemask_t *nodes)
 862 {
 863         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 864         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 865
 866         if (copy > nbytes) {
 867                 if (copy > PAGE_SIZE)
 868                         return -EINVAL;
 869                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 870                         return -EFAULT;
 871                 copy = nbytes;
 872         }
 873         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 874 }
 875
 876 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 877                         unsigned long mode,
 878                         unsigned long __user *nmask, unsigned long maxnode,
 879                         unsigned flags)
 880 {
 881         nodemask_t nodes;
 882         int err;
 883
 884         err = get_nodes(&nodes, nmask, maxnode);
 885         if (err)
 886                 return err;
 887 #ifdef CONFIG_CPUSETS
 888         /* Restrict the nodes to the allowed nodes in the cpuset */
 889         nodes_and(nodes, nodes, current->mems_allowed);
 890 #endif
 891         return do_mbind(start, len, mode, &nodes, flags);
 892 }
 893
 894 /* Set the process memory policy */
 895 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 896                 unsigned long maxnode)
 897 {
 898         int err;
 899         nodemask_t nodes;
 900
 901         if (mode < 0 || mode > MPOL_MAX)
 902                 return -EINVAL;
 903         err = get_nodes(&nodes, nmask, maxnode);
 904         if (err)
 905                 return err;
 906         return do_set_mempolicy(mode, &nodes);
 907 }
 908
 909 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 910                 const unsigned long __user *old_nodes,
 911                 const unsigned long __user *new_nodes)
 912 {
 913         struct mm_struct *mm;
 914         struct task_struct *task;
 915         nodemask_t old;
 916         nodemask_t new;
 917         nodemask_t task_nodes;
 918         int err;
 919
 920         err = get_nodes(&old, old_nodes, maxnode);
 921         if (err)
 922                 return err;
 923
 924         err = get_nodes(&new, new_nodes, maxnode);
 925         if (err)
 926                 return err;
 927
 928         /* Find the mm_struct */
 929         read_lock(&tasklist_lock);
 930         task = pid ? find_task_by_pid(pid) : current;
 931         if (!task) {
 932                 read_unlock(&tasklist_lock);
 933                 return -ESRCH;
 934         }
 935         mm = get_task_mm(task);
 936         read_unlock(&tasklist_lock);
 937
 938         if (!mm)
 939                 return -EINVAL;
 940
 941         /*
 942          * Check if this process has the right to modify the specified
 943          * process. The right exists if the process has administrative
 944          * capabilities, superuser privileges or the same
 945          * userid as the target process.
 946          */
 947         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 948             (current->uid != task->suid) && (current->uid != task->uid) &&
 949             !capable(CAP_SYS_NICE)) {
 950                 err = -EPERM;
 951                 goto out;
 952         }
 953
 954         task_nodes = cpuset_mems_allowed(task);
 955         /* Is the user allowed to access the target nodes? */
 956         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 957                 err = -EPERM;
 958                 goto out;
 959         }
 960
 961         err = security_task_movememory(task);
 962         if (err)
 963                 goto out;
 964
 965         err = do_migrate_pages(mm, &old, &new,
 966                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 967 out:
 968         mmput(mm);
 969         return err;
 970 }
 971
 972
 973 /* Retrieve NUMA policy */
 974 asmlinkage long sys_get_mempolicy(int __user *policy,
 975                                 unsigned long __user *nmask,
 976                                 unsigned long maxnode,
 977                                 unsigned long addr, unsigned long flags)
 978 {
 979         int err, pval;
 980         nodemask_t nodes;
 981
 982         if (nmask != NULL && maxnode < MAX_NUMNODES)
 983                 return -EINVAL;
 984
 985         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 986
 987         if (err)
 988                 return err;
 989
 990         if (policy && put_user(pval, policy))
 991                 return -EFAULT;
 992
 993         if (nmask)
 994                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 995
 996         return err;
 997 }
 998
 999 #ifdef CONFIG_COMPAT
1000
1001 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1002                                      compat_ulong_t __user *nmask,
1003                                      compat_ulong_t maxnode,
1004                                      compat_ulong_t addr, compat_ulong_t flags)
1005 {
1006         long err;
1007         unsigned long __user *nm = NULL;
1008         unsigned long nr_bits, alloc_size;
1009         DECLARE_BITMAP(bm, MAX_NUMNODES);
1010
1011         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1012         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1013
1014         if (nmask)
1015                 nm = compat_alloc_user_space(alloc_size);
1016
1017         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1018
1019         if (!err && nmask) {
1020                 err = copy_from_user(bm, nm, alloc_size);
1021                 /* ensure entire bitmap is zeroed */
1022                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1023                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1024         }
1025
1026         return err;
1027 }
1028
1029 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1030                                      compat_ulong_t maxnode)
1031 {
1032         long err = 0;
1033         unsigned long __user *nm = NULL;
1034         unsigned long nr_bits, alloc_size;
1035         DECLARE_BITMAP(bm, MAX_NUMNODES);
1036
1037         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1038         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1039
1040         if (nmask) {
1041                 err = compat_get_bitmap(bm, nmask, nr_bits);
1042                 nm = compat_alloc_user_space(alloc_size);
1043                 err |= copy_to_user(nm, bm, alloc_size);
1044         }
1045
1046         if (err)
1047                 return -EFAULT;
1048
1049         return sys_set_mempolicy(mode, nm, nr_bits+1);
1050 }
1051
1052 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1053                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1054                              compat_ulong_t maxnode, compat_ulong_t flags)
1055 {
1056         long err = 0;
1057         unsigned long __user *nm = NULL;
1058         unsigned long nr_bits, alloc_size;
1059         nodemask_t bm;
1060
1061         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1062         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1063
1064         if (nmask) {
1065                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1066                 nm = compat_alloc_user_space(alloc_size);
1067                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1068         }
1069
1070         if (err)
1071                 return -EFAULT;
1072
1073         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1074 }
1075
1076 #endif
1077
1078 /* Return effective policy for a VMA */
1079 static struct mempolicy * get_vma_policy(struct task_struct *task,
1080                 struct vm_area_struct *vma, unsigned long addr)
1081 {
1082         struct mempolicy *pol = task->mempolicy;
1083
1084         if (vma) {
1085                 if (vma->vm_ops && vma->vm_ops->get_policy)
1086                         pol = vma->vm_ops->get_policy(vma, addr);
1087                 else if (vma->vm_policy &&
1088                                 vma->vm_policy->policy != MPOL_DEFAULT)
1089                         pol = vma->vm_policy;
1090         }
1091         if (!pol)
1092                 pol = &default_policy;
1093         return pol;
1094 }
1095
1096 /* Return a zonelist representing a mempolicy */
1097 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1098 {
1099         int nd;
1100
1101         switch (policy->policy) {
1102         case MPOL_PREFERRED:
1103                 nd = policy->v.preferred_node;
1104                 if (nd < 0)
1105                         nd = numa_node_id();
1106                 break;
1107         case MPOL_BIND:
1108                 /* Lower zones don't get a policy applied */
1109                 /* Careful: current->mems_allowed might have moved */
1110                 if (gfp_zone(gfp) >= policy_zone)
1111                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1112                                 return policy->v.zonelist;
1113                 /*FALL THROUGH*/
1114         case MPOL_INTERLEAVE: /* should not happen */
1115         case MPOL_DEFAULT:
1116                 nd = numa_node_id();
1117                 break;
1118         default:
1119                 nd = 0;
1120                 BUG();
1121         }
1122         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1123 }
1124
1125 /* Do dynamic interleaving for a process */
1126 static unsigned interleave_nodes(struct mempolicy *policy)
1127 {
1128         unsigned nid, next;
1129         struct task_struct *me = current;
1130
1131         nid = me->il_next;
1132         next = next_node(nid, policy->v.nodes);
1133         if (next >= MAX_NUMNODES)
1134                 next = first_node(policy->v.nodes);
1135         me->il_next = next;
1136         return nid;
1137 }
1138
1139 /*
1140  * Depending on the memory policy provide a node from which to allocate the
1141  * next slab entry.
1142  */
1143 unsigned slab_node(struct mempolicy *policy)
1144 {
1145         int pol = policy ? policy->policy : MPOL_DEFAULT;
1146
1147         switch (pol) {
1148         case MPOL_INTERLEAVE:
1149                 return interleave_nodes(policy);
1150
1151         case MPOL_BIND:
1152                 /*
1153                  * Follow bind policy behavior and start allocation at the
1154                  * first node.
1155                  */
1156                 return zone_to_nid(policy->v.zonelist->zones[0]);
1157
1158         case MPOL_PREFERRED:
1159                 if (policy->v.preferred_node >= 0)
1160                         return policy->v.preferred_node;
1161                 /* Fall through */
1162
1163         default:
1164                 return numa_node_id();
1165         }
1166 }
1167
1168 /* Do static interleaving for a VMA with known offset. */
1169 static unsigned offset_il_node(struct mempolicy *pol,
1170                 struct vm_area_struct *vma, unsigned long off)
1171 {
1172         unsigned nnodes = nodes_weight(pol->v.nodes);
1173         unsigned target = (unsigned)off % nnodes;
1174         int c;
1175         int nid = -1;
1176
1177         c = 0;
1178         do {
1179                 nid = next_node(nid, pol->v.nodes);
1180                 c++;
1181         } while (c <= target);
1182         return nid;
1183 }
1184
1185 /* Determine a node number for interleave */
1186 static inline unsigned interleave_nid(struct mempolicy *pol,
1187                  struct vm_area_struct *vma, unsigned long addr, int shift)
1188 {
1189         if (vma) {
1190                 unsigned long off;
1191
1192                 /*
1193                  * for small pages, there is no difference between
1194                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1195                  * for huge pages, since vm_pgoff is in units of small
1196                  * pages, we need to shift off the always 0 bits to get
1197                  * a useful offset.
1198                  */
1199                 BUG_ON(shift < PAGE_SHIFT);
1200                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1201                 off += (addr - vma->vm_start) >> shift;
1202                 return offset_il_node(pol, vma, off);
1203         } else
1204                 return interleave_nodes(pol);
1205 }
1206
1207 #ifdef CONFIG_HUGETLBFS
1208 /* Return a zonelist suitable for a huge page allocation. */
1209 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1210 {
1211         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1212
1213         if (pol->policy == MPOL_INTERLEAVE) {
1214                 unsigned nid;
1215
1216                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1217                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1218         }
1219         return zonelist_policy(GFP_HIGHUSER, pol);
1220 }
1221 #endif
1222
1223 /* Allocate a page in interleaved policy.
1224    Own path because it needs to do special accounting. */
1225 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1226                                         unsigned nid)
1227 {
1228         struct zonelist *zl;
1229         struct page *page;
1230
1231         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1232         page = __alloc_pages(gfp, order, zl);
1233         if (page && page_zone(page) == zl->zones[0])
1234                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1235         return page;
1236 }
1237
1238 /**
1239  *      alloc_page_vma  - Allocate a page for a VMA.
1240  *
1241  *      @gfp:
1242  *      %GFP_USER    user allocation.
1243  *      %GFP_KERNEL  kernel allocations,
1244  *      %GFP_HIGHMEM highmem/user allocations,
1245  *      %GFP_FS      allocation should not call back into a file system.
1246  *      %GFP_ATOMIC  don't sleep.
1247  *
1248  *      @vma:  Pointer to VMA or NULL if not available.
1249  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1250  *
1251  *      This function allocates a page from the kernel page pool and applies
1252  *      a NUMA policy associated with the VMA or the current process.
1253  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1254  *      mm_struct of the VMA to prevent it from going away. Should be used for
1255  *      all allocations for pages that will be mapped into
1256  *      user space. Returns NULL when no page can be allocated.
1257  *
1258  *      Should be called with the mm_sem of the vma hold.
1259  */
1260 struct page *
1261 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1262 {
1263         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1264
1265         cpuset_update_task_memory_state();
1266
1267         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1268                 unsigned nid;
1269
1270                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1271                 return alloc_page_interleave(gfp, 0, nid);
1272         }
1273         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1274 }
1275
1276 /**
1277  *      alloc_pages_current - Allocate pages.
1278  *
1279  *      @gfp:
1280  *              %GFP_USER   user allocation,
1281  *              %GFP_KERNEL kernel allocation,
1282  *              %GFP_HIGHMEM highmem allocation,
1283  *              %GFP_FS     don't call back into a file system.
1284  *              %GFP_ATOMIC don't sleep.
1285  *      @order: Power of two of allocation size in pages. 0 is a single page.
1286  *
1287  *      Allocate a page from the kernel page pool.  When not in
1288  *      interrupt context and apply the current process NUMA policy.
1289  *      Returns NULL when no page can be allocated.
1290  *
1291  *      Don't call cpuset_update_task_memory_state() unless
1292  *      1) it's ok to take cpuset_sem (can WAIT), and
1293  *      2) allocating for current task (not interrupt).
1294  */
1295 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1296 {
1297         struct mempolicy *pol = current->mempolicy;
1298
1299         if ((gfp & __GFP_WAIT) && !in_interrupt())
1300                 cpuset_update_task_memory_state();
1301         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1302                 pol = &default_policy;
1303         if (pol->policy == MPOL_INTERLEAVE)
1304                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1305         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1306 }
1307 EXPORT_SYMBOL(alloc_pages_current);
1308
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
1316 void *cpuset_being_rebound;
1317
1318 /* Slow path of a mempolicy copy */
1319 struct mempolicy *__mpol_copy(struct mempolicy *old)
1320 {
1321         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1322
1323         if (!new)
1324                 return ERR_PTR(-ENOMEM);
1325         if (current_cpuset_is_being_rebound()) {
1326                 nodemask_t mems = cpuset_mems_allowed(current);
1327                 mpol_rebind_policy(old, &mems);
1328         }
1329         *new = *old;
1330         atomic_set(&new->refcnt, 1);
1331         if (new->policy == MPOL_BIND) {
1332                 int sz = ksize(old->v.zonelist);
1333                 new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1334                 if (!new->v.zonelist) {
1335                         kmem_cache_free(policy_cache, new);
1336                         return ERR_PTR(-ENOMEM);
1337                 }
1338         }
1339         return new;
1340 }
1341
1342 /* Slow path of a mempolicy comparison */
1343 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1344 {
1345         if (!a || !b)
1346                 return 0;
1347         if (a->policy != b->policy)
1348                 return 0;
1349         switch (a->policy) {
1350         case MPOL_DEFAULT:
1351                 return 1;
1352         case MPOL_INTERLEAVE:
1353                 return nodes_equal(a->v.nodes, b->v.nodes);
1354         case MPOL_PREFERRED:
1355                 return a->v.preferred_node == b->v.preferred_node;
1356         case MPOL_BIND: {
1357                 int i;
1358                 for (i = 0; a->v.zonelist->zones[i]; i++)
1359                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1360                                 return 0;
1361                 return b->v.zonelist->zones[i] == NULL;
1362         }
1363         default:
1364                 BUG();
1365                 return 0;
1366         }
1367 }
1368
1369 /* Slow path of a mpol destructor. */
1370 void __mpol_free(struct mempolicy *p)
1371 {
1372         if (!atomic_dec_and_test(&p->refcnt))
1373                 return;
1374         if (p->policy == MPOL_BIND)
1375                 kfree(p->v.zonelist);
1376         p->policy = MPOL_DEFAULT;
1377         kmem_cache_free(policy_cache, p);
1378 }
1379
1380 /*
1381  * Shared memory backing store policy support.
1382  *
1383  * Remember policies even when nobody has shared memory mapped.
1384  * The policies are kept in Red-Black tree linked from the inode.
1385  * They are protected by the sp->lock spinlock, which should be held
1386  * for any accesses to the tree.
1387  */
1388
1389 /* lookup first element intersecting start-end */
1390 /* Caller holds sp->lock */
1391 static struct sp_node *
1392 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1393 {
1394         struct rb_node *n = sp->root.rb_node;
1395
1396         while (n) {
1397                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1398
1399                 if (start >= p->end)
1400                         n = n->rb_right;
1401                 else if (end <= p->start)
1402                         n = n->rb_left;
1403                 else
1404                         break;
1405         }
1406         if (!n)
1407                 return NULL;
1408         for (;;) {
1409                 struct sp_node *w = NULL;
1410                 struct rb_node *prev = rb_prev(n);
1411                 if (!prev)
1412                         break;
1413                 w = rb_entry(prev, struct sp_node, nd);
1414                 if (w->end <= start)
1415                         break;
1416                 n = prev;
1417         }
1418         return rb_entry(n, struct sp_node, nd);
1419 }
1420
1421 /* Insert a new shared policy into the list. */
1422 /* Caller holds sp->lock */
1423 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1424 {
1425         struct rb_node **p = &sp->root.rb_node;
1426         struct rb_node *parent = NULL;
1427         struct sp_node *nd;
1428
1429         while (*p) {
1430                 parent = *p;
1431                 nd = rb_entry(parent, struct sp_node, nd);
1432                 if (new->start < nd->start)
1433                         p = &(*p)->rb_left;
1434                 else if (new->end > nd->end)
1435                         p = &(*p)->rb_right;
1436                 else
1437                         BUG();
1438         }
1439         rb_link_node(&new->nd, parent, p);
1440         rb_insert_color(&new->nd, &sp->root);
1441         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1442                  new->policy ? new->policy->policy : 0);
1443 }
1444
1445 /* Find shared policy intersecting idx */
1446 struct mempolicy *
1447 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1448 {
1449         struct mempolicy *pol = NULL;
1450         struct sp_node *sn;
1451
1452         if (!sp->root.rb_node)
1453                 return NULL;
1454         spin_lock(&sp->lock);
1455         sn = sp_lookup(sp, idx, idx+1);
1456         if (sn) {
1457                 mpol_get(sn->policy);
1458                 pol = sn->policy;
1459         }
1460         spin_unlock(&sp->lock);
1461         return pol;
1462 }
1463
1464 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1465 {
1466         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1467         rb_erase(&n->nd, &sp->root);
1468         mpol_free(n->policy);
1469         kmem_cache_free(sn_cache, n);
1470 }
1471
1472 struct sp_node *
1473 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1474 {
1475         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1476
1477         if (!n)
1478                 return NULL;
1479         n->start = start;
1480         n->end = end;
1481         mpol_get(pol);
1482         n->policy = pol;
1483         return n;
1484 }
1485
1486 /* Replace a policy range. */
1487 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1488                                  unsigned long end, struct sp_node *new)
1489 {
1490         struct sp_node *n, *new2 = NULL;
1491
1492 restart:
1493         spin_lock(&sp->lock);
1494         n = sp_lookup(sp, start, end);
1495         /* Take care of old policies in the same range. */
1496         while (n && n->start < end) {
1497                 struct rb_node *next = rb_next(&n->nd);
1498                 if (n->start >= start) {
1499                         if (n->end <= end)
1500                                 sp_delete(sp, n);
1501                         else
1502                                 n->start = end;
1503                 } else {
1504                         /* Old policy spanning whole new range. */
1505                         if (n->end > end) {
1506                                 if (!new2) {
1507                                         spin_unlock(&sp->lock);
1508                                         new2 = sp_alloc(end, n->end, n->policy);
1509                                         if (!new2)
1510                                                 return -ENOMEM;
1511                                         goto restart;
1512                                 }
1513                                 n->end = start;
1514                                 sp_insert(sp, new2);
1515                                 new2 = NULL;
1516                                 break;
1517                         } else
1518                                 n->end = start;
1519                 }
1520                 if (!next)
1521                         break;
1522                 n = rb_entry(next, struct sp_node, nd);
1523         }
1524         if (new)
1525                 sp_insert(sp, new);
1526         spin_unlock(&sp->lock);
1527         if (new2) {
1528                 mpol_free(new2->policy);
1529                 kmem_cache_free(sn_cache, new2);
1530         }
1531         return 0;
1532 }
1533
1534 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1535                                 nodemask_t *policy_nodes)
1536 {
1537         info->root = RB_ROOT;
1538         spin_lock_init(&info->lock);
1539
1540         if (policy != MPOL_DEFAULT) {
1541                 struct mempolicy *newpol;
1542
1543                 /* Falls back to MPOL_DEFAULT on any error */
1544                 newpol = mpol_new(policy, policy_nodes);
1545                 if (!IS_ERR(newpol)) {
1546                         /* Create pseudo-vma that contains just the policy */
1547                         struct vm_area_struct pvma;
1548
1549                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1550                         /* Policy covers entire file */
1551                         pvma.vm_end = TASK_SIZE;
1552                         mpol_set_shared_policy(info, &pvma, newpol);
1553                         mpol_free(newpol);
1554                 }
1555         }
1556 }
1557
1558 int mpol_set_shared_policy(struct shared_policy *info,
1559                         struct vm_area_struct *vma, struct mempolicy *npol)
1560 {
1561         int err;
1562         struct sp_node *new = NULL;
1563         unsigned long sz = vma_pages(vma);
1564
1565         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1566                  vma->vm_pgoff,
1567                  sz, npol? npol->policy : -1,
1568                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1569
1570         if (npol) {
1571                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1572                 if (!new)
1573                         return -ENOMEM;
1574         }
1575         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1576         if (err && new)
1577                 kmem_cache_free(sn_cache, new);
1578         return err;
1579 }
1580
1581 /* Free a backing policy store on inode delete. */
1582 void mpol_free_shared_policy(struct shared_policy *p)
1583 {
1584         struct sp_node *n;
1585         struct rb_node *next;
1586
1587         if (!p->root.rb_node)
1588                 return;
1589         spin_lock(&p->lock);
1590         next = rb_first(&p->root);
1591         while (next) {
1592                 n = rb_entry(next, struct sp_node, nd);
1593                 next = rb_next(&n->nd);
1594                 rb_erase(&n->nd, &p->root);
1595                 mpol_free(n->policy);
1596                 kmem_cache_free(sn_cache, n);
1597         }
1598         spin_unlock(&p->lock);
1599 }
1600
1601 /* assumes fs == KERNEL_DS */
1602 void __init numa_policy_init(void)
1603 {
1604         policy_cache = kmem_cache_create("numa_policy",
1605                                          sizeof(struct mempolicy),
1606                                          0, SLAB_PANIC, NULL, NULL);
1607
1608         sn_cache = kmem_cache_create("shared_policy_node",
1609                                      sizeof(struct sp_node),
1610                                      0, SLAB_PANIC, NULL, NULL);
1611
1612         /* Set interleaving policy for system init. This way not all
1613            the data structures allocated at system boot end up in node zero. */
1614
1615         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1616                 printk("numa_policy_init: interleaving failed\n");
1617 }
1618
1619 /* Reset policy of current process to default */
1620 void numa_default_policy(void)
1621 {
1622         do_set_mempolicy(MPOL_DEFAULT, NULL);
1623 }
1624
1625 /* Migrate a policy to a different set of nodes */
1626 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1627 {
1628         nodemask_t *mpolmask;
1629         nodemask_t tmp;
1630
1631         if (!pol)
1632                 return;
1633         mpolmask = &pol->cpuset_mems_allowed;
1634         if (nodes_equal(*mpolmask, *newmask))
1635                 return;
1636
1637         switch (pol->policy) {
1638         case MPOL_DEFAULT:
1639                 break;
1640         case MPOL_INTERLEAVE:
1641                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1642                 pol->v.nodes = tmp;
1643                 *mpolmask = *newmask;
1644                 current->il_next = node_remap(current->il_next,
1645                                                 *mpolmask, *newmask);
1646                 break;
1647         case MPOL_PREFERRED:
1648                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1649                                                 *mpolmask, *newmask);
1650                 *mpolmask = *newmask;
1651                 break;
1652         case MPOL_BIND: {
1653                 nodemask_t nodes;
1654                 struct zone **z;
1655                 struct zonelist *zonelist;
1656
1657                 nodes_clear(nodes);
1658                 for (z = pol->v.zonelist->zones; *z; z++)
1659                         node_set(zone_to_nid(*z), nodes);
1660                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1661                 nodes = tmp;
1662
1663                 zonelist = bind_zonelist(&nodes);
1664
1665                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1666                  * If that old zonelist has no remaining mems_allowed nodes,
1667                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1668                  */
1669
1670                 if (zonelist) {
1671                         /* Good - got mem - substitute new zonelist */
1672                         kfree(pol->v.zonelist);
1673                         pol->v.zonelist = zonelist;
1674                 }
1675                 *mpolmask = *newmask;
1676                 break;
1677         }
1678         default:
1679                 BUG();
1680                 break;
1681         }
1682 }
1683
1684 /*
1685  * Wrapper for mpol_rebind_policy() that just requires task
1686  * pointer, and updates task mempolicy.
1687  */
1688
1689 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1690 {
1691         mpol_rebind_policy(tsk->mempolicy, new);
1692 }
1693
1694 /*
1695  * Rebind each vma in mm to new nodemask.
1696  *
1697  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1698  */
1699
1700 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1701 {
1702         struct vm_area_struct *vma;
1703
1704         down_write(&mm->mmap_sem);
1705         for (vma = mm->mmap; vma; vma = vma->vm_next)
1706                 mpol_rebind_policy(vma->vm_policy, new);
1707         up_write(&mm->mmap_sem);
1708 }
1709
1710 /*
1711  * Display pages allocated per node and memory policy via /proc.
1712  */
1713
/* Printable policy names; mpol_to_str() indexes this table by policy mode. */
static const char * const policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave",
};
1716
1717 /*
1718  * Convert a mempolicy into a string.
1719  * Returns the number of characters in buffer (if positive)
1720  * or an error (negative)
1721  */
1722 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1723 {
1724         char *p = buffer;
1725         int l;
1726         nodemask_t nodes;
1727         int mode = pol ? pol->policy : MPOL_DEFAULT;
1728
1729         switch (mode) {
1730         case MPOL_DEFAULT:
1731                 nodes_clear(nodes);
1732                 break;
1733
1734         case MPOL_PREFERRED:
1735                 nodes_clear(nodes);
1736                 node_set(pol->v.preferred_node, nodes);
1737                 break;
1738
1739         case MPOL_BIND:
1740                 get_zonemask(pol, &nodes);
1741                 break;
1742
1743         case MPOL_INTERLEAVE:
1744                 nodes = pol->v.nodes;
1745                 break;
1746
1747         default:
1748                 BUG();
1749                 return -EFAULT;
1750         }
1751
1752         l = strlen(policy_types[mode]);
1753         if (buffer + maxlen < p + l + 1)
1754                 return -ENOSPC;
1755
1756         strcpy(p, policy_types[mode]);
1757         p += l;
1758
1759         if (!nodes_empty(nodes)) {
1760                 if (buffer + maxlen < p + 2)
1761                         return -ENOSPC;
1762                 *p++ = '=';
1763                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1764         }
1765         return p - buffer;
1766 }
1767
1768 struct numa_maps {
1769         unsigned long pages;
1770         unsigned long anon;
1771         unsigned long active;
1772         unsigned long writeback;
1773         unsigned long mapcount_max;
1774         unsigned long dirty;
1775         unsigned long swapcache;
1776         unsigned long node[MAX_NUMNODES];
1777 };
1778
1779 static void gather_stats(struct page *page, void *private, int pte_dirty)
1780 {
1781         struct numa_maps *md = private;
1782         int count = page_mapcount(page);
1783
1784         md->pages++;
1785         if (pte_dirty || PageDirty(page))
1786                 md->dirty++;
1787
1788         if (PageSwapCache(page))
1789                 md->swapcache++;
1790
1791         if (PageActive(page))
1792                 md->active++;
1793
1794         if (PageWriteback(page))
1795                 md->writeback++;
1796
1797         if (PageAnon(page))
1798                 md->anon++;
1799
1800         if (count > md->mapcount_max)
1801                 md->mapcount_max = count;
1802
1803         md->node[page_to_nid(page)]++;
1804 }
1805
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather page statistics for the huge pages mapped in [start, end) of @vma.
 *
 * The range is walked in HPAGE_SIZE steps.  Addresses with no huge pte, an
 * empty pte, or no page behind the pte are skipped; every other page is
 * passed to gather_stats() together with the dirty state of its pte.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;
        struct page *page;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                pte_t pte;

                if (!ptep)
                        continue;

                /*
                 * Take one snapshot of the pte and use it for every test
                 * below, so the page and its dirty state always come from
                 * the same pte value.
                 */
                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                gather_stats(page, md, pte_dirty(pte));
        }
}
#else
/* Without huge page support there is nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
1839
1840 int show_numa_map(struct seq_file *m, void *v)
1841 {
1842         struct proc_maps_private *priv = m->private;
1843         struct vm_area_struct *vma = v;
1844         struct numa_maps *md;
1845         struct file *file = vma->vm_file;
1846         struct mm_struct *mm = vma->vm_mm;
1847         int n;
1848         char buffer[50];
1849
1850         if (!mm)
1851                 return 0;
1852
1853         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1854         if (!md)
1855                 return 0;
1856
1857         mpol_to_str(buffer, sizeof(buffer),
1858                             get_vma_policy(priv->task, vma, vma->vm_start));
1859
1860         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1861
1862         if (file) {
1863                 seq_printf(m, " file=");
1864                 seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1865         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1866                 seq_printf(m, " heap");
1867         } else if (vma->vm_start <= mm->start_stack &&
1868                         vma->vm_end >= mm->start_stack) {
1869                 seq_printf(m, " stack");
1870         }
1871
1872         if (is_vm_hugetlb_page(vma)) {
1873                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1874                 seq_printf(m, " huge");
1875         } else {
1876                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1877                                 &node_online_map, MPOL_MF_STATS, md);
1878         }
1879
1880         if (!md->pages)
1881                 goto out;
1882
1883         if (md->anon)
1884                 seq_printf(m," anon=%lu",md->anon);
1885
1886         if (md->dirty)
1887                 seq_printf(m," dirty=%lu",md->dirty);
1888
1889         if (md->pages != md->anon && md->pages != md->dirty)
1890                 seq_printf(m, " mapped=%lu", md->pages);
1891
1892         if (md->mapcount_max > 1)
1893                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1894
1895         if (md->swapcache)
1896                 seq_printf(m," swapcache=%lu", md->swapcache);
1897
1898         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1899                 seq_printf(m," active=%lu", md->active);
1900
1901         if (md->writeback)
1902                 seq_printf(m," writeback=%lu", md->writeback);
1903
1904         for_each_online_node(n)
1905                 if (md->node[n])
1906                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1907 out:
1908         seq_putc(m, '\n');
1909         kfree(md);
1910
1911         if (m->count < m->size)
1912                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1913         return 0;
1914 }
1915