mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */

/*
 * Slab caches.  policy_cache backs the struct mempolicy objects allocated
 * in mpol_new().  sn_cache is not used in this part of the file.
 */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/*
 * Debug print hook.  It expands to nothing, so its arguments are dropped
 * by the preprocessor and are never compiled or evaluated.
 */
#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

/* Stand-in policy: do_get_mempolicy() reports this one when no policy is set. */
struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135
 136 /* Generate a custom zonelist for the BIND policy. */
 137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138 {
 139         struct zonelist *zl;
 140         int num, max, nd, k;
 141
 142         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 143         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 144         if (!zl)
 145                 return NULL;
 146         num = 0;
 147         /* First put in the highest zones from all nodes, then all the next
 148            lower zones etc. Avoid empty zones because the memory allocator
 149            doesn't like them. If you implement node hot removal you
 150            have to fix that. */
 151         for (k = policy_zone; k >= 0; k--) {
 152                 for_each_node_mask(nd, *nodes) {
 153                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 154                         if (z->present_pages > 0)
 155                                 zl->zones[num++] = z;
 156                 }
 157         }
 158         zl->zones[num] = NULL;
 159         return zl;
 160 }
 161
 162 /* Create a new policy */
 163 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 164 {
 165         struct mempolicy *policy;
 166
 167         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 168         if (mode == MPOL_DEFAULT)
 169                 return NULL;
 170         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 171         if (!policy)
 172                 return ERR_PTR(-ENOMEM);
 173         atomic_set(&policy->refcnt, 1);
 174         switch (mode) {
 175         case MPOL_INTERLEAVE:
 176                 policy->v.nodes = *nodes;
 177                 if (nodes_weight(*nodes) == 0) {
 178                         kmem_cache_free(policy_cache, policy);
 179                         return ERR_PTR(-EINVAL);
 180                 }
 181                 break;
 182         case MPOL_PREFERRED:
 183                 policy->v.preferred_node = first_node(*nodes);
 184                 if (policy->v.preferred_node >= MAX_NUMNODES)
 185                         policy->v.preferred_node = -1;
 186                 break;
 187         case MPOL_BIND:
 188                 policy->v.zonelist = bind_zonelist(nodes);
 189                 if (policy->v.zonelist == NULL) {
 190                         kmem_cache_free(policy_cache, policy);
 191                         return ERR_PTR(-ENOMEM);
 192                 }
 193                 break;
 194         }
 195         policy->policy = mode;
 196         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 197         return policy;
 198 }
 199
 200 static void gather_stats(struct page *, void *, int pte_dirty);
 201 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 202                                 unsigned long flags);
 203
 204 /* Scan through pages checking if pages follow certain conditions. */
 205 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 206                 unsigned long addr, unsigned long end,
 207                 const nodemask_t *nodes, unsigned long flags,
 208                 void *private)
 209 {
 210         pte_t *orig_pte;
 211         pte_t *pte;
 212         spinlock_t *ptl;
 213
 214         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 215         do {
 216                 struct page *page;
 217                 unsigned int nid;
 218
 219                 if (!pte_present(*pte))
 220                         continue;
 221                 page = vm_normal_page(vma, addr, *pte);
 222                 if (!page)
 223                         continue;
 224                 /*
 225                  * The check for PageReserved here is important to avoid
 226                  * handling zero pages and other pages that may have been
 227                  * marked special by the system.
 228                  *
 229                  * If the PageReserved would not be checked here then f.e.
 230                  * the location of the zero page could have an influence
 231                  * on MPOL_MF_STRICT, zero pages would be counted for
 232                  * the per node stats, and there would be useless attempts
 233                  * to put zero pages on the migration list.
 234                  */
 235                 if (PageReserved(page))
 236                         continue;
 237                 nid = page_to_nid(page);
 238                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 239                         continue;
 240
 241                 if (flags & MPOL_MF_STATS)
 242                         gather_stats(page, private, pte_dirty(*pte));
 243                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 244                         migrate_page_add(page, private, flags);
 245                 else
 246                         break;
 247         } while (pte++, addr += PAGE_SIZE, addr != end);
 248         pte_unmap_unlock(orig_pte, ptl);
 249         return addr != end;
 250 }
 251
 252 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 253                 unsigned long addr, unsigned long end,
 254                 const nodemask_t *nodes, unsigned long flags,
 255                 void *private)
 256 {
 257         pmd_t *pmd;
 258         unsigned long next;
 259
 260         pmd = pmd_offset(pud, addr);
 261         do {
 262                 next = pmd_addr_end(addr, end);
 263                 if (pmd_none_or_clear_bad(pmd))
 264                         continue;
 265                 if (check_pte_range(vma, pmd, addr, next, nodes,
 266                                     flags, private))
 267                         return -EIO;
 268         } while (pmd++, addr = next, addr != end);
 269         return 0;
 270 }
 271
 272 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 273                 unsigned long addr, unsigned long end,
 274                 const nodemask_t *nodes, unsigned long flags,
 275                 void *private)
 276 {
 277         pud_t *pud;
 278         unsigned long next;
 279
 280         pud = pud_offset(pgd, addr);
 281         do {
 282                 next = pud_addr_end(addr, end);
 283                 if (pud_none_or_clear_bad(pud))
 284                         continue;
 285                 if (check_pmd_range(vma, pud, addr, next, nodes,
 286                                     flags, private))
 287                         return -EIO;
 288         } while (pud++, addr = next, addr != end);
 289         return 0;
 290 }
 291
 292 static inline int check_pgd_range(struct vm_area_struct *vma,
 293                 unsigned long addr, unsigned long end,
 294                 const nodemask_t *nodes, unsigned long flags,
 295                 void *private)
 296 {
 297         pgd_t *pgd;
 298         unsigned long next;
 299
 300         pgd = pgd_offset(vma->vm_mm, addr);
 301         do {
 302                 next = pgd_addr_end(addr, end);
 303                 if (pgd_none_or_clear_bad(pgd))
 304                         continue;
 305                 if (check_pud_range(vma, pgd, addr, next, nodes,
 306                                     flags, private))
 307                         return -EIO;
 308         } while (pgd++, addr = next, addr != end);
 309         return 0;
 310 }
 311
 312 /* Check if a vma is migratable */
 313 static inline int vma_migratable(struct vm_area_struct *vma)
 314 {
 315         if (vma->vm_flags & (
 316                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 317                 return 0;
 318         return 1;
 319 }
 320
 321 /*
 322  * Check if all pages in a range are on a set of nodes.
 323  * If pagelist != NULL then isolate pages from the LRU and
 324  * put them on the pagelist.
 325  */
 326 static struct vm_area_struct *
 327 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 328                 const nodemask_t *nodes, unsigned long flags, void *private)
 329 {
 330         int err;
 331         struct vm_area_struct *first, *vma, *prev;
 332
 333         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 334
 335                 err = migrate_prep();
 336                 if (err)
 337                         return ERR_PTR(err);
 338         }
 339
 340         first = find_vma(mm, start);
 341         if (!first)
 342                 return ERR_PTR(-EFAULT);
 343         prev = NULL;
 344         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 345                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 346                         if (!vma->vm_next && vma->vm_end < end)
 347                                 return ERR_PTR(-EFAULT);
 348                         if (prev && prev->vm_end < vma->vm_start)
 349                                 return ERR_PTR(-EFAULT);
 350                 }
 351                 if (!is_vm_hugetlb_page(vma) &&
 352                     ((flags & MPOL_MF_STRICT) ||
 353                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 354                                 vma_migratable(vma)))) {
 355                         unsigned long endvma = vma->vm_end;
 356
 357                         if (endvma > end)
 358                                 endvma = end;
 359                         if (vma->vm_start > start)
 360                                 start = vma->vm_start;
 361                         err = check_pgd_range(vma, start, endvma, nodes,
 362                                                 flags, private);
 363                         if (err) {
 364                                 first = ERR_PTR(err);
 365                                 break;
 366                         }
 367                 }
 368                 prev = vma;
 369         }
 370         return first;
 371 }
 372
 373 /* Apply policy to a single VMA */
 374 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 375 {
 376         int err = 0;
 377         struct mempolicy *old = vma->vm_policy;
 378
 379         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 380                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 381                  vma->vm_ops, vma->vm_file,
 382                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 383
 384         if (vma->vm_ops && vma->vm_ops->set_policy)
 385                 err = vma->vm_ops->set_policy(vma, new);
 386         if (!err) {
 387                 mpol_get(new);
 388                 vma->vm_policy = new;
 389                 mpol_free(old);
 390         }
 391         return err;
 392 }
 393
 394 /* Step 2: apply policy to a range and do splits. */
 395 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 396                        unsigned long end, struct mempolicy *new)
 397 {
 398         struct vm_area_struct *next;
 399         int err;
 400
 401         err = 0;
 402         for (; vma && vma->vm_start < end; vma = next) {
 403                 next = vma->vm_next;
 404                 if (vma->vm_start < start)
 405                         err = split_vma(vma->vm_mm, vma, start, 1);
 406                 if (!err && vma->vm_end > end)
 407                         err = split_vma(vma->vm_mm, vma, end, 0);
 408                 if (!err)
 409                         err = policy_vma(vma, new);
 410                 if (err)
 411                         break;
 412         }
 413         return err;
 414 }
 415
 416 static int contextualize_policy(int mode, nodemask_t *nodes)
 417 {
 418         if (!nodes)
 419                 return 0;
 420
 421         cpuset_update_task_memory_state();
 422         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 423                 return -EINVAL;
 424         return mpol_check_policy(mode, nodes);
 425 }
 426
 427
 428 /*
 429  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 430  * mempolicy.  Allows more rapid checking of this (combined perhaps
 431  * with other PF_* flag bits) on memory allocation hot code paths.
 432  *
 433  * If called from outside this file, the task 'p' should -only- be
 434  * a newly forked child not yet visible on the task list, because
 435  * manipulating the task flags of a visible task is not safe.
 436  *
 437  * The above limitation is why this routine has the funny name
 438  * mpol_fix_fork_child_flag().
 439  *
 440  * It is also safe to call this with a task pointer of current,
 441  * which the static wrapper mpol_set_task_struct_flag() does,
 442  * for use within this file.
 443  */
 444
 445 void mpol_fix_fork_child_flag(struct task_struct *p)
 446 {
 447         if (p->mempolicy)
 448                 p->flags |= PF_MEMPOLICY;
 449         else
 450                 p->flags &= ~PF_MEMPOLICY;
 451 }
 452
/*
 * Bring the PF_MEMPOLICY flag of the current task in line with its
 * mempolicy pointer, by applying mpol_fix_fork_child_flag() to current.
 */
static void mpol_set_task_struct_flag(void)
{
        mpol_fix_fork_child_flag(current);
}
 457
 458 /* Set the process memory policy */
 459 long do_set_mempolicy(int mode, nodemask_t *nodes)
 460 {
 461         struct mempolicy *new;
 462
 463         if (contextualize_policy(mode, nodes))
 464                 return -EINVAL;
 465         new = mpol_new(mode, nodes);
 466         if (IS_ERR(new))
 467                 return PTR_ERR(new);
 468         mpol_free(current->mempolicy);
 469         current->mempolicy = new;
 470         mpol_set_task_struct_flag();
 471         if (new && new->policy == MPOL_INTERLEAVE)
 472                 current->il_next = first_node(new->v.nodes);
 473         return 0;
 474 }
 475
 476 /* Fill a zone bitmap for a policy */
 477 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 478 {
 479         int i;
 480
 481         nodes_clear(*nodes);
 482         switch (p->policy) {
 483         case MPOL_BIND:
 484                 for (i = 0; p->v.zonelist->zones[i]; i++)
 485                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 486                                 *nodes);
 487                 break;
 488         case MPOL_DEFAULT:
 489                 break;
 490         case MPOL_INTERLEAVE:
 491                 *nodes = p->v.nodes;
 492                 break;
 493         case MPOL_PREFERRED:
 494                 /* or use current node instead of online map? */
 495                 if (p->v.preferred_node < 0)
 496                         *nodes = node_online_map;
 497                 else
 498                         node_set(p->v.preferred_node, *nodes);
 499                 break;
 500         default:
 501                 BUG();
 502         }
 503 }
 504
 505 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 506 {
 507         struct page *p;
 508         int err;
 509
 510         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 511         if (err >= 0) {
 512                 err = page_to_nid(p);
 513                 put_page(p);
 514         }
 515         return err;
 516 }
 517
 518 /* Retrieve NUMA policy */
 519 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 520                         unsigned long addr, unsigned long flags)
 521 {
 522         int err;
 523         struct mm_struct *mm = current->mm;
 524         struct vm_area_struct *vma = NULL;
 525         struct mempolicy *pol = current->mempolicy;
 526
 527         cpuset_update_task_memory_state();
 528         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 529                 return -EINVAL;
 530         if (flags & MPOL_F_ADDR) {
 531                 down_read(&mm->mmap_sem);
 532                 vma = find_vma_intersection(mm, addr, addr+1);
 533                 if (!vma) {
 534                         up_read(&mm->mmap_sem);
 535                         return -EFAULT;
 536                 }
 537                 if (vma->vm_ops && vma->vm_ops->get_policy)
 538                         pol = vma->vm_ops->get_policy(vma, addr);
 539                 else
 540                         pol = vma->vm_policy;
 541         } else if (addr)
 542                 return -EINVAL;
 543
 544         if (!pol)
 545                 pol = &default_policy;
 546
 547         if (flags & MPOL_F_NODE) {
 548                 if (flags & MPOL_F_ADDR) {
 549                         err = lookup_node(mm, addr);
 550                         if (err < 0)
 551                                 goto out;
 552                         *policy = err;
 553                 } else if (pol == current->mempolicy &&
 554                                 pol->policy == MPOL_INTERLEAVE) {
 555                         *policy = current->il_next;
 556                 } else {
 557                         err = -EINVAL;
 558                         goto out;
 559                 }
 560         } else
 561                 *policy = pol->policy;
 562
 563         if (vma) {
 564                 up_read(&current->mm->mmap_sem);
 565                 vma = NULL;
 566         }
 567
 568         err = 0;
 569         if (nmask)
 570                 get_zonemask(pol, nmask);
 571
 572  out:
 573         if (vma)
 574                 up_read(&current->mm->mmap_sem);
 575         return err;
 576 }
 577
 578 #ifdef CONFIG_MIGRATION
 579 /*
 580  * page migration
 581  */
 582 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 583                                 unsigned long flags)
 584 {
 585         /*
 586          * Avoid migrating a page that is shared with others.
 587          */
 588         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 589                 isolate_lru_page(page, pagelist);
 590 }
 591
/*
 * Page allocation callback handed to migrate_pages() by migrate_to_node():
 * allocates one (order 0) GFP_HIGHUSER page on @node.  The @page and @x
 * arguments are not used.
 */
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
        return alloc_pages_node(node, GFP_HIGHUSER, 0);
}
 596
 597 /*
 598  * Migrate pages from one node to a target node.
 599  * Returns error or the number of pages not migrated.
 600  */
 601 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 602 {
 603         nodemask_t nmask;
 604         LIST_HEAD(pagelist);
 605         int err = 0;
 606
 607         nodes_clear(nmask);
 608         node_set(source, nmask);
 609
 610         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 611                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 612
 613         if (!list_empty(&pagelist))
 614                 err = migrate_pages(&pagelist, new_node_page, dest);
 615
 616         return err;
 617 }
 618
 619 /*
 620  * Move pages between the two nodesets so as to preserve the physical
 621  * layout as much as possible.
 622  *
 623  * Returns the number of page that could not be moved.
 624  */
 625 int do_migrate_pages(struct mm_struct *mm,
 626         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 627 {
 628         LIST_HEAD(pagelist);
 629         int busy = 0;
 630         int err = 0;
 631         nodemask_t tmp;
 632
 633         down_read(&mm->mmap_sem);
 634
 635         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 636         if (err)
 637                 goto out;
 638
 639 /*
 640  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 641  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 642  * bit in 'tmp', and return that <source, dest> pair for migration.
 643  * The pair of nodemasks 'to' and 'from' define the map.
 644  *
 645  * If no pair of bits is found that way, fallback to picking some
 646  * pair of 'source' and 'dest' bits that are not the same.  If the
 647  * 'source' and 'dest' bits are the same, this represents a node
 648  * that will be migrating to itself, so no pages need move.
 649  *
 650  * If no bits are left in 'tmp', or if all remaining bits left
 651  * in 'tmp' correspond to the same bit in 'to', return false
 652  * (nothing left to migrate).
 653  *
 654  * This lets us pick a pair of nodes to migrate between, such that
 655  * if possible the dest node is not already occupied by some other
 656  * source node, minimizing the risk of overloading the memory on a
 657  * node that would happen if we migrated incoming memory to a node
 658  * before migrating outgoing memory source that same node.
 659  *
 660  * A single scan of tmp is sufficient.  As we go, we remember the
 661  * most recent <s, d> pair that moved (s != d).  If we find a pair
 662  * that not only moved, but what's better, moved to an empty slot
 663  * (d is not set in tmp), then we break out then, with that pair.
 664  * Otherwise when we finish scannng from_tmp, we at least have the
 665  * most recent <s, d> pair that moved.  If we get all the way through
 666  * the scan of tmp without finding any node that moved, much less
 667  * moved to an empty node, then there is nothing left worth migrating.
 668  */
 669
 670         tmp = *from_nodes;
 671         while (!nodes_empty(tmp)) {
 672                 int s,d;
 673                 int source = -1;
 674                 int dest = 0;
 675
 676                 for_each_node_mask(s, tmp) {
 677                         d = node_remap(s, *from_nodes, *to_nodes);
 678                         if (s == d)
 679                                 continue;
 680
 681                         source = s;     /* Node moved. Memorize */
 682                         dest = d;
 683
 684                         /* dest not in remaining from nodes? */
 685                         if (!node_isset(dest, tmp))
 686                                 break;
 687                 }
 688                 if (source == -1)
 689                         break;
 690
 691                 node_clear(source, tmp);
 692                 err = migrate_to_node(mm, source, dest, flags);
 693                 if (err > 0)
 694                         busy += err;
 695                 if (err < 0)
 696                         break;
 697         }
 698 out:
 699         up_read(&mm->mmap_sem);
 700         if (err < 0)
 701                 return err;
 702         return busy;
 703
 704 }
 705
 706 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 707 {
 708         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 709
 710         return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 711 }
 712 #else
 713
/*
 * Variant used when CONFIG_MIGRATION is not set: pages are never queued
 * for migration, so this does nothing.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}
 718
/*
 * Variant used when CONFIG_MIGRATION is not set: page migration is not
 * available, so every request fails with -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        return -ENOSYS;
}
 724
 725 static struct page *new_vma_page(struct page *page, unsigned long private)
 726 {
 727         return NULL;
 728 }
 729 #endif
 730
 731 long do_mbind(unsigned long start, unsigned long len,
 732                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 733 {
 734         struct vm_area_struct *vma;
 735         struct mm_struct *mm = current->mm;
 736         struct mempolicy *new;
 737         unsigned long end;
 738         int err;
 739         LIST_HEAD(pagelist);
 740
 741         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 742                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 743             || mode > MPOL_MAX)
 744                 return -EINVAL;
 745         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 746                 return -EPERM;
 747
 748         if (start & ~PAGE_MASK)
 749                 return -EINVAL;
 750
 751         if (mode == MPOL_DEFAULT)
 752                 flags &= ~MPOL_MF_STRICT;
 753
 754         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 755         end = start + len;
 756
 757         if (end < start)
 758                 return -EINVAL;
 759         if (end == start)
 760                 return 0;
 761
 762         if (mpol_check_policy(mode, nmask))
 763                 return -EINVAL;
 764
 765         new = mpol_new(mode, nmask);
 766         if (IS_ERR(new))
 767                 return PTR_ERR(new);
 768
 769         /*
 770          * If we are using the default policy then operation
 771          * on discontinuous address spaces is okay after all
 772          */
 773         if (!new)
 774                 flags |= MPOL_MF_DISCONTIG_OK;
 775
 776         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 777                         mode,nodes_addr(nodes)[0]);
 778
 779         down_write(&mm->mmap_sem);
 780         vma = check_range(mm, start, end, nmask,
 781                           flags | MPOL_MF_INVERT, &pagelist);
 782
 783         err = PTR_ERR(vma);
 784         if (!IS_ERR(vma)) {
 785                 int nr_failed = 0;
 786
 787                 err = mbind_range(vma, start, end, new);
 788
 789                 if (!list_empty(&pagelist))
 790                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 791                                                 (unsigned long)vma);
 792
 793                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 794                         err = -EIO;
 795         }
 796
 797         up_write(&mm->mmap_sem);
 798         mpol_free(new);
 799         return err;
 800 }
 801
 802 /*
 803  * User space interface with variable sized bitmaps for nodelists.
 804  */
 805
 806 /* Copy a node mask from user space. */
 807 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 808                      unsigned long maxnode)
 809 {
 810         unsigned long k;
 811         unsigned long nlongs;
 812         unsigned long endmask;
 813
 814         --maxnode;
 815         nodes_clear(*nodes);
 816         if (maxnode == 0 || !nmask)
 817                 return 0;
 818         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 819                 return -EINVAL;
 820
 821         nlongs = BITS_TO_LONGS(maxnode);
 822         if ((maxnode % BITS_PER_LONG) == 0)
 823                 endmask = ~0UL;
 824         else
 825                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 826
 827         /* When the user specified more nodes than supported just check
 828            if the non supported part is all zero. */
 829         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 830                 if (nlongs > PAGE_SIZE/sizeof(long))
 831                         return -EINVAL;
 832                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 833                         unsigned long t;
 834                         if (get_user(t, nmask + k))
 835                                 return -EFAULT;
 836                         if (k == nlongs - 1) {
 837                                 if (t & endmask)
 838                                         return -EINVAL;
 839                         } else if (t)
 840                                 return -EINVAL;
 841                 }
 842                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 843                 endmask = ~0UL;
 844         }
 845
 846         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 847                 return -EFAULT;
 848         nodes_addr(*nodes)[nlongs-1] &= endmask;
 849         return 0;
 850 }
 851
 852 /* Copy a kernel node mask to user space */
 853 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 854                               nodemask_t *nodes)
 855 {
 856         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 857         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 858
 859         if (copy > nbytes) {
 860                 if (copy > PAGE_SIZE)
 861                         return -EINVAL;
 862                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 863                         return -EFAULT;
 864                 copy = nbytes;
 865         }
 866         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 867 }
 868
 869 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 870                         unsigned long mode,
 871                         unsigned long __user *nmask, unsigned long maxnode,
 872                         unsigned flags)
 873 {
 874         nodemask_t nodes;
 875         int err;
 876
 877         err = get_nodes(&nodes, nmask, maxnode);
 878         if (err)
 879                 return err;
 880         return do_mbind(start, len, mode, &nodes, flags);
 881 }
 882
 883 /* Set the process memory policy */
 884 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 885                 unsigned long maxnode)
 886 {
 887         int err;
 888         nodemask_t nodes;
 889
 890         if (mode < 0 || mode > MPOL_MAX)
 891                 return -EINVAL;
 892         err = get_nodes(&nodes, nmask, maxnode);
 893         if (err)
 894                 return err;
 895         return do_set_mempolicy(mode, &nodes);
 896 }
 897
 898 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 899                 const unsigned long __user *old_nodes,
 900                 const unsigned long __user *new_nodes)
 901 {
 902         struct mm_struct *mm;
 903         struct task_struct *task;
 904         nodemask_t old;
 905         nodemask_t new;
 906         nodemask_t task_nodes;
 907         int err;
 908
 909         err = get_nodes(&old, old_nodes, maxnode);
 910         if (err)
 911                 return err;
 912
 913         err = get_nodes(&new, new_nodes, maxnode);
 914         if (err)
 915                 return err;
 916
 917         /* Find the mm_struct */
 918         read_lock(&tasklist_lock);
 919         task = pid ? find_task_by_pid(pid) : current;
 920         if (!task) {
 921                 read_unlock(&tasklist_lock);
 922                 return -ESRCH;
 923         }
 924         mm = get_task_mm(task);
 925         read_unlock(&tasklist_lock);
 926
 927         if (!mm)
 928                 return -EINVAL;
 929
 930         /*
 931          * Check if this process has the right to modify the specified
 932          * process. The right exists if the process has administrative
 933          * capabilities, superuser privileges or the same
 934          * userid as the target process.
 935          */
 936         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 937             (current->uid != task->suid) && (current->uid != task->uid) &&
 938             !capable(CAP_SYS_NICE)) {
 939                 err = -EPERM;
 940                 goto out;
 941         }
 942
 943         task_nodes = cpuset_mems_allowed(task);
 944         /* Is the user allowed to access the target nodes? */
 945         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 946                 err = -EPERM;
 947                 goto out;
 948         }
 949
 950         err = security_task_movememory(task);
 951         if (err)
 952                 goto out;
 953
 954         err = do_migrate_pages(mm, &old, &new,
 955                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 956 out:
 957         mmput(mm);
 958         return err;
 959 }
 960
 961
 962 /* Retrieve NUMA policy */
 963 asmlinkage long sys_get_mempolicy(int __user *policy,
 964                                 unsigned long __user *nmask,
 965                                 unsigned long maxnode,
 966                                 unsigned long addr, unsigned long flags)
 967 {
 968         int err, pval;
 969         nodemask_t nodes;
 970
 971         if (nmask != NULL && maxnode < MAX_NUMNODES)
 972                 return -EINVAL;
 973
 974         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 975
 976         if (err)
 977                 return err;
 978
 979         if (policy && put_user(pval, policy))
 980                 return -EFAULT;
 981
 982         if (nmask)
 983                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 984
 985         return err;
 986 }
 987
 988 #ifdef CONFIG_COMPAT
 989
 990 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 991                                      compat_ulong_t __user *nmask,
 992                                      compat_ulong_t maxnode,
 993                                      compat_ulong_t addr, compat_ulong_t flags)
 994 {
 995         long err;
 996         unsigned long __user *nm = NULL;
 997         unsigned long nr_bits, alloc_size;
 998         DECLARE_BITMAP(bm, MAX_NUMNODES);
 999
1000         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1001         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1002
1003         if (nmask)
1004                 nm = compat_alloc_user_space(alloc_size);
1005
1006         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1007
1008         if (!err && nmask) {
1009                 err = copy_from_user(bm, nm, alloc_size);
1010                 /* ensure entire bitmap is zeroed */
1011                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1012                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1013         }
1014
1015         return err;
1016 }
1017
1018 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1019                                      compat_ulong_t maxnode)
1020 {
1021         long err = 0;
1022         unsigned long __user *nm = NULL;
1023         unsigned long nr_bits, alloc_size;
1024         DECLARE_BITMAP(bm, MAX_NUMNODES);
1025
1026         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1027         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1028
1029         if (nmask) {
1030                 err = compat_get_bitmap(bm, nmask, nr_bits);
1031                 nm = compat_alloc_user_space(alloc_size);
1032                 err |= copy_to_user(nm, bm, alloc_size);
1033         }
1034
1035         if (err)
1036                 return -EFAULT;
1037
1038         return sys_set_mempolicy(mode, nm, nr_bits+1);
1039 }
1040
1041 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1042                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1043                              compat_ulong_t maxnode, compat_ulong_t flags)
1044 {
1045         long err = 0;
1046         unsigned long __user *nm = NULL;
1047         unsigned long nr_bits, alloc_size;
1048         nodemask_t bm;
1049
1050         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1051         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1052
1053         if (nmask) {
1054                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1055                 nm = compat_alloc_user_space(alloc_size);
1056                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1057         }
1058
1059         if (err)
1060                 return -EFAULT;
1061
1062         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1063 }
1064
1065 #endif
1066
1067 /* Return effective policy for a VMA */
1068 static struct mempolicy * get_vma_policy(struct task_struct *task,
1069                 struct vm_area_struct *vma, unsigned long addr)
1070 {
1071         struct mempolicy *pol = task->mempolicy;
1072
1073         if (vma) {
1074                 if (vma->vm_ops && vma->vm_ops->get_policy)
1075                         pol = vma->vm_ops->get_policy(vma, addr);
1076                 else if (vma->vm_policy &&
1077                                 vma->vm_policy->policy != MPOL_DEFAULT)
1078                         pol = vma->vm_policy;
1079         }
1080         if (!pol)
1081                 pol = &default_policy;
1082         return pol;
1083 }
1084
1085 /* Return a zonelist representing a mempolicy */
1086 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1087 {
1088         int nd;
1089
1090         switch (policy->policy) {
1091         case MPOL_PREFERRED:
1092                 nd = policy->v.preferred_node;
1093                 if (nd < 0)
1094                         nd = numa_node_id();
1095                 break;
1096         case MPOL_BIND:
1097                 /* Lower zones don't get a policy applied */
1098                 /* Careful: current->mems_allowed might have moved */
1099                 if (gfp_zone(gfp) >= policy_zone)
1100                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1101                                 return policy->v.zonelist;
1102                 /*FALL THROUGH*/
1103         case MPOL_INTERLEAVE: /* should not happen */
1104         case MPOL_DEFAULT:
1105                 nd = numa_node_id();
1106                 break;
1107         default:
1108                 nd = 0;
1109                 BUG();
1110         }
1111         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1112 }
1113
1114 /* Do dynamic interleaving for a process */
1115 static unsigned interleave_nodes(struct mempolicy *policy)
1116 {
1117         unsigned nid, next;
1118         struct task_struct *me = current;
1119
1120         nid = me->il_next;
1121         next = next_node(nid, policy->v.nodes);
1122         if (next >= MAX_NUMNODES)
1123                 next = first_node(policy->v.nodes);
1124         me->il_next = next;
1125         return nid;
1126 }
1127
1128 /*
1129  * Depending on the memory policy provide a node from which to allocate the
1130  * next slab entry.
1131  */
1132 unsigned slab_node(struct mempolicy *policy)
1133 {
1134         switch (policy->policy) {
1135         case MPOL_INTERLEAVE:
1136                 return interleave_nodes(policy);
1137
1138         case MPOL_BIND:
1139                 /*
1140                  * Follow bind policy behavior and start allocation at the
1141                  * first node.
1142                  */
1143                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1144
1145         case MPOL_PREFERRED:
1146                 if (policy->v.preferred_node >= 0)
1147                         return policy->v.preferred_node;
1148                 /* Fall through */
1149
1150         default:
1151                 return numa_node_id();
1152         }
1153 }
1154
1155 /* Do static interleaving for a VMA with known offset. */
1156 static unsigned offset_il_node(struct mempolicy *pol,
1157                 struct vm_area_struct *vma, unsigned long off)
1158 {
1159         unsigned nnodes = nodes_weight(pol->v.nodes);
1160         unsigned target = (unsigned)off % nnodes;
1161         int c;
1162         int nid = -1;
1163
1164         c = 0;
1165         do {
1166                 nid = next_node(nid, pol->v.nodes);
1167                 c++;
1168         } while (c <= target);
1169         return nid;
1170 }
1171
1172 /* Determine a node number for interleave */
1173 static inline unsigned interleave_nid(struct mempolicy *pol,
1174                  struct vm_area_struct *vma, unsigned long addr, int shift)
1175 {
1176         if (vma) {
1177                 unsigned long off;
1178
1179                 off = vma->vm_pgoff;
1180                 off += (addr - vma->vm_start) >> shift;
1181                 return offset_il_node(pol, vma, off);
1182         } else
1183                 return interleave_nodes(pol);
1184 }
1185
1186 #ifdef CONFIG_HUGETLBFS
1187 /* Return a zonelist suitable for a huge page allocation. */
1188 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1189 {
1190         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1191
1192         if (pol->policy == MPOL_INTERLEAVE) {
1193                 unsigned nid;
1194
1195                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1196                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1197         }
1198         return zonelist_policy(GFP_HIGHUSER, pol);
1199 }
1200 #endif
1201
1202 /* Allocate a page in interleaved policy.
1203    Own path because it needs to do special accounting. */
1204 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1205                                         unsigned nid)
1206 {
1207         struct zonelist *zl;
1208         struct page *page;
1209
1210         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1211         page = __alloc_pages(gfp, order, zl);
1212         if (page && page_zone(page) == zl->zones[0]) {
1213                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1214                 put_cpu();
1215         }
1216         return page;
1217 }
1218
1219 /**
1220  *      alloc_page_vma  - Allocate a page for a VMA.
1221  *
1222  *      @gfp:
1223  *      %GFP_USER    user allocation.
1224  *      %GFP_KERNEL  kernel allocations,
1225  *      %GFP_HIGHMEM highmem/user allocations,
1226  *      %GFP_FS      allocation should not call back into a file system.
1227  *      %GFP_ATOMIC  don't sleep.
1228  *
1229  *      @vma:  Pointer to VMA or NULL if not available.
1230  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1231  *
1232  *      This function allocates a page from the kernel page pool and applies
1233  *      a NUMA policy associated with the VMA or the current process.
1234  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1235  *      mm_struct of the VMA to prevent it from going away. Should be used for
1236  *      all allocations for pages that will be mapped into
1237  *      user space. Returns NULL when no page can be allocated.
1238  *
1239  *      Should be called with the mm_sem of the vma hold.
1240  */
1241 struct page *
1242 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1243 {
1244         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1245
1246         cpuset_update_task_memory_state();
1247
1248         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1249                 unsigned nid;
1250
1251                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1252                 return alloc_page_interleave(gfp, 0, nid);
1253         }
1254         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1255 }
1256
1257 /**
1258  *      alloc_pages_current - Allocate pages.
1259  *
1260  *      @gfp:
1261  *              %GFP_USER   user allocation,
1262  *              %GFP_KERNEL kernel allocation,
1263  *              %GFP_HIGHMEM highmem allocation,
1264  *              %GFP_FS     don't call back into a file system.
1265  *              %GFP_ATOMIC don't sleep.
1266  *      @order: Power of two of allocation size in pages. 0 is a single page.
1267  *
1268  *      Allocate a page from the kernel page pool.  When not in
1269  *      interrupt context and apply the current process NUMA policy.
1270  *      Returns NULL when no page can be allocated.
1271  *
1272  *      Don't call cpuset_update_task_memory_state() unless
1273  *      1) it's ok to take cpuset_sem (can WAIT), and
1274  *      2) allocating for current task (not interrupt).
1275  */
1276 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1277 {
1278         struct mempolicy *pol = current->mempolicy;
1279
1280         if ((gfp & __GFP_WAIT) && !in_interrupt())
1281                 cpuset_update_task_memory_state();
1282         if (!pol || in_interrupt())
1283                 pol = &default_policy;
1284         if (pol->policy == MPOL_INTERLEAVE)
1285                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1286         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1287 }
1288 EXPORT_SYMBOL(alloc_pages_current);
1289
1290 /*
1291  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1292  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1293  * with the mems_allowed returned by cpuset_mems_allowed().  This
1294  * keeps mempolicies cpuset-relative after their cpuset moves.  See
1295  * further kernel/cpuset.c update_nodemask().
1296  */
1297 void *cpuset_being_rebound;
1298
1299 /* Slow path of a mempolicy copy */
1300 struct mempolicy *__mpol_copy(struct mempolicy *old)
1301 {
1302         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1303
1304         if (!new)
1305                 return ERR_PTR(-ENOMEM);
1306         if (current_cpuset_is_being_rebound()) {
1307                 nodemask_t mems = cpuset_mems_allowed(current);
1308                 mpol_rebind_policy(old, &mems);
1309         }
1310         *new = *old;
1311         atomic_set(&new->refcnt, 1);
1312         if (new->policy == MPOL_BIND) {
1313                 int sz = ksize(old->v.zonelist);
1314                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1315                 if (!new->v.zonelist) {
1316                         kmem_cache_free(policy_cache, new);
1317                         return ERR_PTR(-ENOMEM);
1318                 }
1319                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1320         }
1321         return new;
1322 }
1323
1324 /* Slow path of a mempolicy comparison */
1325 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1326 {
1327         if (!a || !b)
1328                 return 0;
1329         if (a->policy != b->policy)
1330                 return 0;
1331         switch (a->policy) {
1332         case MPOL_DEFAULT:
1333                 return 1;
1334         case MPOL_INTERLEAVE:
1335                 return nodes_equal(a->v.nodes, b->v.nodes);
1336         case MPOL_PREFERRED:
1337                 return a->v.preferred_node == b->v.preferred_node;
1338         case MPOL_BIND: {
1339                 int i;
1340                 for (i = 0; a->v.zonelist->zones[i]; i++)
1341                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1342                                 return 0;
1343                 return b->v.zonelist->zones[i] == NULL;
1344         }
1345         default:
1346                 BUG();
1347                 return 0;
1348         }
1349 }
1350
1351 /* Slow path of a mpol destructor. */
1352 void __mpol_free(struct mempolicy *p)
1353 {
1354         if (!atomic_dec_and_test(&p->refcnt))
1355                 return;
1356         if (p->policy == MPOL_BIND)
1357                 kfree(p->v.zonelist);
1358         p->policy = MPOL_DEFAULT;
1359         kmem_cache_free(policy_cache, p);
1360 }
1361
1362 /*
1363  * Shared memory backing store policy support.
1364  *
1365  * Remember policies even when nobody has shared memory mapped.
1366  * The policies are kept in a Red-Black tree linked from the inode.
1367  * They are protected by the sp->lock spinlock, which should be held
1368  * for any accesses to the tree.
1369  */
1370
1371 /* lookup first element intersecting start-end */
1372 /* Caller holds sp->lock */
1373 static struct sp_node *
1374 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1375 {
1376         struct rb_node *n = sp->root.rb_node;
1377
1378         while (n) {
1379                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1380
1381                 if (start >= p->end)
1382                         n = n->rb_right;
1383                 else if (end <= p->start)
1384                         n = n->rb_left;
1385                 else
1386                         break;
1387         }
1388         if (!n)
1389                 return NULL;
1390         for (;;) {
1391                 struct sp_node *w = NULL;
1392                 struct rb_node *prev = rb_prev(n);
1393                 if (!prev)
1394                         break;
1395                 w = rb_entry(prev, struct sp_node, nd);
1396                 if (w->end <= start)
1397                         break;
1398                 n = prev;
1399         }
1400         return rb_entry(n, struct sp_node, nd);
1401 }
1402
1403 /* Insert a new shared policy into the list. */
1404 /* Caller holds sp->lock */
1405 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1406 {
1407         struct rb_node **p = &sp->root.rb_node;
1408         struct rb_node *parent = NULL;
1409         struct sp_node *nd;
1410
1411         while (*p) {
1412                 parent = *p;
1413                 nd = rb_entry(parent, struct sp_node, nd);
1414                 if (new->start < nd->start)
1415                         p = &(*p)->rb_left;
1416                 else if (new->end > nd->end)
1417                         p = &(*p)->rb_right;
1418                 else
1419                         BUG();
1420         }
1421         rb_link_node(&new->nd, parent, p);
1422         rb_insert_color(&new->nd, &sp->root);
1423         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1424                  new->policy ? new->policy->policy : 0);
1425 }
1426
1427 /* Find shared policy intersecting idx */
1428 struct mempolicy *
1429 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1430 {
1431         struct mempolicy *pol = NULL;
1432         struct sp_node *sn;
1433
1434         if (!sp->root.rb_node)
1435                 return NULL;
1436         spin_lock(&sp->lock);
1437         sn = sp_lookup(sp, idx, idx+1);
1438         if (sn) {
1439                 mpol_get(sn->policy);
1440                 pol = sn->policy;
1441         }
1442         spin_unlock(&sp->lock);
1443         return pol;
1444 }
1445
1446 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1447 {
1448         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1449         rb_erase(&n->nd, &sp->root);
1450         mpol_free(n->policy);
1451         kmem_cache_free(sn_cache, n);
1452 }
1453
1454 struct sp_node *
1455 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1456 {
1457         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1458
1459         if (!n)
1460                 return NULL;
1461         n->start = start;
1462         n->end = end;
1463         mpol_get(pol);
1464         n->policy = pol;
1465         return n;
1466 }
1467
1468 /* Replace a policy range. */
1469 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1470                                  unsigned long end, struct sp_node *new)
1471 {
1472         struct sp_node *n, *new2 = NULL;
1473
1474 restart:
1475         spin_lock(&sp->lock);
1476         n = sp_lookup(sp, start, end);
1477         /* Take care of old policies in the same range. */
1478         while (n && n->start < end) {
1479                 struct rb_node *next = rb_next(&n->nd);
1480                 if (n->start >= start) {
1481                         if (n->end <= end)
1482                                 sp_delete(sp, n);
1483                         else
1484                                 n->start = end;
1485                 } else {
1486                         /* Old policy spanning whole new range. */
1487                         if (n->end > end) {
1488                                 if (!new2) {
1489                                         spin_unlock(&sp->lock);
1490                                         new2 = sp_alloc(end, n->end, n->policy);
1491                                         if (!new2)
1492                                                 return -ENOMEM;
1493                                         goto restart;
1494                                 }
1495                                 n->end = start;
1496                                 sp_insert(sp, new2);
1497                                 new2 = NULL;
1498                                 break;
1499                         } else
1500                                 n->end = start;
1501                 }
1502                 if (!next)
1503                         break;
1504                 n = rb_entry(next, struct sp_node, nd);
1505         }
1506         if (new)
1507                 sp_insert(sp, new);
1508         spin_unlock(&sp->lock);
1509         if (new2) {
1510                 mpol_free(new2->policy);
1511                 kmem_cache_free(sn_cache, new2);
1512         }
1513         return 0;
1514 }
1515
1516 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1517                                 nodemask_t *policy_nodes)
1518 {
1519         info->root = RB_ROOT;
1520         spin_lock_init(&info->lock);
1521
1522         if (policy != MPOL_DEFAULT) {
1523                 struct mempolicy *newpol;
1524
1525                 /* Falls back to MPOL_DEFAULT on any error */
1526                 newpol = mpol_new(policy, policy_nodes);
1527                 if (!IS_ERR(newpol)) {
1528                         /* Create pseudo-vma that contains just the policy */
1529                         struct vm_area_struct pvma;
1530
1531                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1532                         /* Policy covers entire file */
1533                         pvma.vm_end = TASK_SIZE;
1534                         mpol_set_shared_policy(info, &pvma, newpol);
1535                         mpol_free(newpol);
1536                 }
1537         }
1538 }
1539
1540 int mpol_set_shared_policy(struct shared_policy *info,
1541                         struct vm_area_struct *vma, struct mempolicy *npol)
1542 {
1543         int err;
1544         struct sp_node *new = NULL;
1545         unsigned long sz = vma_pages(vma);
1546
1547         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1548                  vma->vm_pgoff,
1549                  sz, npol? npol->policy : -1,
1550                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1551
1552         if (npol) {
1553                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1554                 if (!new)
1555                         return -ENOMEM;
1556         }
1557         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1558         if (err && new)
1559                 kmem_cache_free(sn_cache, new);
1560         return err;
1561 }
1562
1563 /* Free a backing policy store on inode delete. */
1564 void mpol_free_shared_policy(struct shared_policy *p)
1565 {
1566         struct sp_node *n;
1567         struct rb_node *next;
1568
1569         if (!p->root.rb_node)
1570                 return;
1571         spin_lock(&p->lock);
1572         next = rb_first(&p->root);
1573         while (next) {
1574                 n = rb_entry(next, struct sp_node, nd);
1575                 next = rb_next(&n->nd);
1576                 rb_erase(&n->nd, &p->root);
1577                 mpol_free(n->policy);
1578                 kmem_cache_free(sn_cache, n);
1579         }
1580         spin_unlock(&p->lock);
1581 }
1582
1583 /* assumes fs == KERNEL_DS */
1584 void __init numa_policy_init(void)
1585 {
1586         policy_cache = kmem_cache_create("numa_policy",
1587                                          sizeof(struct mempolicy),
1588                                          0, SLAB_PANIC, NULL, NULL);
1589
1590         sn_cache = kmem_cache_create("shared_policy_node",
1591                                      sizeof(struct sp_node),
1592                                      0, SLAB_PANIC, NULL, NULL);
1593
1594         /* Set interleaving policy for system init. This way not all
1595            the data structures allocated at system boot end up in node zero. */
1596
1597         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1598                 printk("numa_policy_init: interleaving failed\n");
1599 }
1600
1601 /* Reset policy of current process to default */
1602 void numa_default_policy(void)
1603 {
1604         do_set_mempolicy(MPOL_DEFAULT, NULL);
1605 }
1606
1607 /* Migrate a policy to a different set of nodes */
1608 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1609 {
1610         nodemask_t *mpolmask;
1611         nodemask_t tmp;
1612
1613         if (!pol)
1614                 return;
1615         mpolmask = &pol->cpuset_mems_allowed;
1616         if (nodes_equal(*mpolmask, *newmask))
1617                 return;
1618
1619         switch (pol->policy) {
1620         case MPOL_DEFAULT:
1621                 break;
1622         case MPOL_INTERLEAVE:
1623                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1624                 pol->v.nodes = tmp;
1625                 *mpolmask = *newmask;
1626                 current->il_next = node_remap(current->il_next,
1627                                                 *mpolmask, *newmask);
1628                 break;
1629         case MPOL_PREFERRED:
1630                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1631                                                 *mpolmask, *newmask);
1632                 *mpolmask = *newmask;
1633                 break;
1634         case MPOL_BIND: {
1635                 nodemask_t nodes;
1636                 struct zone **z;
1637                 struct zonelist *zonelist;
1638
1639                 nodes_clear(nodes);
1640                 for (z = pol->v.zonelist->zones; *z; z++)
1641                         node_set((*z)->zone_pgdat->node_id, nodes);
1642                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1643                 nodes = tmp;
1644
1645                 zonelist = bind_zonelist(&nodes);
1646
1647                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1648                  * If that old zonelist has no remaining mems_allowed nodes,
1649                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1650                  */
1651
1652                 if (zonelist) {
1653                         /* Good - got mem - substitute new zonelist */
1654                         kfree(pol->v.zonelist);
1655                         pol->v.zonelist = zonelist;
1656                 }
1657                 *mpolmask = *newmask;
1658                 break;
1659         }
1660         default:
1661                 BUG();
1662                 break;
1663         }
1664 }
1665
1666 /*
1667  * Wrapper for mpol_rebind_policy() that just requires task
1668  * pointer, and updates task mempolicy.
1669  */
1670
1671 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1672 {
1673         mpol_rebind_policy(tsk->mempolicy, new);
1674 }
1675
1676 /*
1677  * Rebind each vma in mm to new nodemask.
1678  *
1679  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1680  */
1681
1682 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1683 {
1684         struct vm_area_struct *vma;
1685
1686         down_write(&mm->mmap_sem);
1687         for (vma = mm->mmap; vma; vma = vma->vm_next)
1688                 mpol_rebind_policy(vma->vm_policy, new);
1689         up_write(&mm->mmap_sem);
1690 }
1691
1692 /*
1693  * Display pages allocated per node and memory policy via /proc.
1694  */
1695
/* Printable policy names; mpol_to_str() indexes this table by policy mode. */
static const char *policy_types[] = {
        "default",      /* index 0 */
        "prefer",       /* index 1 */
        "bind",         /* index 2 */
        "interleave",   /* index 3 */
};
1698
1699 /*
1700  * Convert a mempolicy into a string.
1701  * Returns the number of characters in buffer (if positive)
1702  * or an error (negative)
1703  */
1704 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1705 {
1706         char *p = buffer;
1707         int l;
1708         nodemask_t nodes;
1709         int mode = pol ? pol->policy : MPOL_DEFAULT;
1710
1711         switch (mode) {
1712         case MPOL_DEFAULT:
1713                 nodes_clear(nodes);
1714                 break;
1715
1716         case MPOL_PREFERRED:
1717                 nodes_clear(nodes);
1718                 node_set(pol->v.preferred_node, nodes);
1719                 break;
1720
1721         case MPOL_BIND:
1722                 get_zonemask(pol, &nodes);
1723                 break;
1724
1725         case MPOL_INTERLEAVE:
1726                 nodes = pol->v.nodes;
1727                 break;
1728
1729         default:
1730                 BUG();
1731                 return -EFAULT;
1732         }
1733
1734         l = strlen(policy_types[mode]);
1735         if (buffer + maxlen < p + l + 1)
1736                 return -ENOSPC;
1737
1738         strcpy(p, policy_types[mode]);
1739         p += l;
1740
1741         if (!nodes_empty(nodes)) {
1742                 if (buffer + maxlen < p + 2)
1743                         return -ENOSPC;
1744                 *p++ = '=';
1745                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1746         }
1747         return p - buffer;
1748 }
1749
/*
 * Page statistics for one vma.  Filled in one page at a time by
 * gather_stats() and printed by show_numa_map().
 */
struct numa_maps {
        unsigned long pages;            /* pages passed to gather_stats() */
        unsigned long anon;             /* pages for which PageAnon() is true */
        unsigned long active;           /* pages for which PageActive() is true */
        unsigned long writeback;        /* pages for which PageWriteback() is true */
        unsigned long mapcount_max;     /* largest page_mapcount() seen */
        unsigned long dirty;            /* pages with a dirty pte or PageDirty() */
        unsigned long swapcache;        /* pages for which PageSwapCache() is true */
        unsigned long node[MAX_NUMNODES];       /* page count per page_to_nid() */
};
1760
1761 static void gather_stats(struct page *page, void *private, int pte_dirty)
1762 {
1763         struct numa_maps *md = private;
1764         int count = page_mapcount(page);
1765
1766         md->pages++;
1767         if (pte_dirty || PageDirty(page))
1768                 md->dirty++;
1769
1770         if (PageSwapCache(page))
1771                 md->swapcache++;
1772
1773         if (PageActive(page))
1774                 md->active++;
1775
1776         if (PageWriteback(page))
1777                 md->writeback++;
1778
1779         if (PageAnon(page))
1780                 md->anon++;
1781
1782         if (count > md->mapcount_max)
1783                 md->mapcount_max = count;
1784
1785         md->node[page_to_nid(page)]++;
1786 }
1787
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Step through [start, end) of a vma in HPAGE_SIZE increments and hand
 * every huge page that has a populated pte to gather_stats().
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                struct page *page;
                pte_t entry;

                /* No pte for this address at all. */
                if (ptep == NULL)
                        continue;

                /* Work on a snapshot; empty entries map nothing. */
                entry = *ptep;
                page = pte_none(entry) ? NULL : pte_page(entry);

                if (page != NULL)
                        gather_stats(page, md, pte_dirty(*ptep));
        }
}
#else
/* Without hugetlb support there is nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
1821
1822 int show_numa_map(struct seq_file *m, void *v)
1823 {
1824         struct proc_maps_private *priv = m->private;
1825         struct vm_area_struct *vma = v;
1826         struct numa_maps *md;
1827         struct file *file = vma->vm_file;
1828         struct mm_struct *mm = vma->vm_mm;
1829         int n;
1830         char buffer[50];
1831
1832         if (!mm)
1833                 return 0;
1834
1835         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1836         if (!md)
1837                 return 0;
1838
1839         mpol_to_str(buffer, sizeof(buffer),
1840                             get_vma_policy(priv->task, vma, vma->vm_start));
1841
1842         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1843
1844         if (file) {
1845                 seq_printf(m, " file=");
1846                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1847         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1848                 seq_printf(m, " heap");
1849         } else if (vma->vm_start <= mm->start_stack &&
1850                         vma->vm_end >= mm->start_stack) {
1851                 seq_printf(m, " stack");
1852         }
1853
1854         if (is_vm_hugetlb_page(vma)) {
1855                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1856                 seq_printf(m, " huge");
1857         } else {
1858                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1859                                 &node_online_map, MPOL_MF_STATS, md);
1860         }
1861
1862         if (!md->pages)
1863                 goto out;
1864
1865         if (md->anon)
1866                 seq_printf(m," anon=%lu",md->anon);
1867
1868         if (md->dirty)
1869                 seq_printf(m," dirty=%lu",md->dirty);
1870
1871         if (md->pages != md->anon && md->pages != md->dirty)
1872                 seq_printf(m, " mapped=%lu", md->pages);
1873
1874         if (md->mapcount_max > 1)
1875                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1876
1877         if (md->swapcache)
1878                 seq_printf(m," swapcache=%lu", md->swapcache);
1879
1880         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1881                 seq_printf(m," active=%lu", md->active);
1882
1883         if (md->writeback)
1884                 seq_printf(m," writeback=%lu", md->writeback);
1885
1886         for_each_online_node(n)
1887                 if (md->node[n])
1888                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1889 out:
1890         seq_putc(m, '\n');
1891         kfree(md);
1892
1893         if (m->count < m->size)
1894                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1895         return 0;
1896 }
1897