mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 #define PDprintk(fmt...)
 105
 106 /* Highest zone. An specific allocation for a zone below that is not
 107    policied. */
 108 enum zone_type policy_zone = ZONE_DMA;
 109
 110 struct mempolicy default_policy = {
 111         .refcnt = ATOMIC_INIT(1), /* never free it */
 112         .policy = MPOL_DEFAULT,
 113 };
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135
 136 /* Generate a custom zonelist for the BIND policy. */
 137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138 {
 139         struct zonelist *zl;
 140         int num, max, nd;
 141         enum zone_type k;
 142
 143         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 144         max++;                  /* space for zlcache_ptr (see mmzone.h) */
 145         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 146         if (!zl)
 147                 return NULL;
 148         zl->zlcache_ptr = NULL;
 149         num = 0;
 150         /* First put in the highest zones from all nodes, then all the next
 151            lower zones etc. Avoid empty zones because the memory allocator
 152            doesn't like them. If you implement node hot removal you
 153            have to fix that. */
 154         k = policy_zone;
 155         while (1) {
 156                 for_each_node_mask(nd, *nodes) {
 157                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 158                         if (z->present_pages > 0)
 159                                 zl->zones[num++] = z;
 160                 }
 161                 if (k == 0)
 162                         break;
 163                 k--;
 164         }
 165         zl->zones[num] = NULL;
 166         return zl;
 167 }
 168
 169 /* Create a new policy */
 170 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 171 {
 172         struct mempolicy *policy;
 173
 174         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 175         if (mode == MPOL_DEFAULT)
 176                 return NULL;
 177         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 178         if (!policy)
 179                 return ERR_PTR(-ENOMEM);
 180         atomic_set(&policy->refcnt, 1);
 181         switch (mode) {
 182         case MPOL_INTERLEAVE:
 183                 policy->v.nodes = *nodes;
 184                 if (nodes_weight(*nodes) == 0) {
 185                         kmem_cache_free(policy_cache, policy);
 186                         return ERR_PTR(-EINVAL);
 187                 }
 188                 break;
 189         case MPOL_PREFERRED:
 190                 policy->v.preferred_node = first_node(*nodes);
 191                 if (policy->v.preferred_node >= MAX_NUMNODES)
 192                         policy->v.preferred_node = -1;
 193                 break;
 194         case MPOL_BIND:
 195                 policy->v.zonelist = bind_zonelist(nodes);
 196                 if (policy->v.zonelist == NULL) {
 197                         kmem_cache_free(policy_cache, policy);
 198                         return ERR_PTR(-ENOMEM);
 199                 }
 200                 break;
 201         }
 202         policy->policy = mode;
 203         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 204         return policy;
 205 }
 206
 207 static void gather_stats(struct page *, void *, int pte_dirty);
 208 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 209                                 unsigned long flags);
 210
 211 /* Scan through pages checking if pages follow certain conditions. */
 212 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 213                 unsigned long addr, unsigned long end,
 214                 const nodemask_t *nodes, unsigned long flags,
 215                 void *private)
 216 {
 217         pte_t *orig_pte;
 218         pte_t *pte;
 219         spinlock_t *ptl;
 220
 221         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 222         do {
 223                 struct page *page;
 224                 int nid;
 225
 226                 if (!pte_present(*pte))
 227                         continue;
 228                 page = vm_normal_page(vma, addr, *pte);
 229                 if (!page)
 230                         continue;
 231                 /*
 232                  * The check for PageReserved here is important to avoid
 233                  * handling zero pages and other pages that may have been
 234                  * marked special by the system.
 235                  *
 236                  * If the PageReserved would not be checked here then f.e.
 237                  * the location of the zero page could have an influence
 238                  * on MPOL_MF_STRICT, zero pages would be counted for
 239                  * the per node stats, and there would be useless attempts
 240                  * to put zero pages on the migration list.
 241                  */
 242                 if (PageReserved(page))
 243                         continue;
 244                 nid = page_to_nid(page);
 245                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 246                         continue;
 247
 248                 if (flags & MPOL_MF_STATS)
 249                         gather_stats(page, private, pte_dirty(*pte));
 250                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 251                         migrate_page_add(page, private, flags);
 252                 else
 253                         break;
 254         } while (pte++, addr += PAGE_SIZE, addr != end);
 255         pte_unmap_unlock(orig_pte, ptl);
 256         return addr != end;
 257 }
 258
 259 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 260                 unsigned long addr, unsigned long end,
 261                 const nodemask_t *nodes, unsigned long flags,
 262                 void *private)
 263 {
 264         pmd_t *pmd;
 265         unsigned long next;
 266
 267         pmd = pmd_offset(pud, addr);
 268         do {
 269                 next = pmd_addr_end(addr, end);
 270                 if (pmd_none_or_clear_bad(pmd))
 271                         continue;
 272                 if (check_pte_range(vma, pmd, addr, next, nodes,
 273                                     flags, private))
 274                         return -EIO;
 275         } while (pmd++, addr = next, addr != end);
 276         return 0;
 277 }
 278
 279 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 280                 unsigned long addr, unsigned long end,
 281                 const nodemask_t *nodes, unsigned long flags,
 282                 void *private)
 283 {
 284         pud_t *pud;
 285         unsigned long next;
 286
 287         pud = pud_offset(pgd, addr);
 288         do {
 289                 next = pud_addr_end(addr, end);
 290                 if (pud_none_or_clear_bad(pud))
 291                         continue;
 292                 if (check_pmd_range(vma, pud, addr, next, nodes,
 293                                     flags, private))
 294                         return -EIO;
 295         } while (pud++, addr = next, addr != end);
 296         return 0;
 297 }
 298
 299 static inline int check_pgd_range(struct vm_area_struct *vma,
 300                 unsigned long addr, unsigned long end,
 301                 const nodemask_t *nodes, unsigned long flags,
 302                 void *private)
 303 {
 304         pgd_t *pgd;
 305         unsigned long next;
 306
 307         pgd = pgd_offset(vma->vm_mm, addr);
 308         do {
 309                 next = pgd_addr_end(addr, end);
 310                 if (pgd_none_or_clear_bad(pgd))
 311                         continue;
 312                 if (check_pud_range(vma, pgd, addr, next, nodes,
 313                                     flags, private))
 314                         return -EIO;
 315         } while (pgd++, addr = next, addr != end);
 316         return 0;
 317 }
 318
 319 /* Check if a vma is migratable */
 320 static inline int vma_migratable(struct vm_area_struct *vma)
 321 {
 322         if (vma->vm_flags & (
 323                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 324                 return 0;
 325         return 1;
 326 }
 327
 328 /*
 329  * Check if all pages in a range are on a set of nodes.
 330  * If pagelist != NULL then isolate pages from the LRU and
 331  * put them on the pagelist.
 332  */
 333 static struct vm_area_struct *
 334 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 335                 const nodemask_t *nodes, unsigned long flags, void *private)
 336 {
 337         int err;
 338         struct vm_area_struct *first, *vma, *prev;
 339
 340         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 341
 342                 err = migrate_prep();
 343                 if (err)
 344                         return ERR_PTR(err);
 345         }
 346
 347         first = find_vma(mm, start);
 348         if (!first)
 349                 return ERR_PTR(-EFAULT);
 350         prev = NULL;
 351         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 352                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 353                         if (!vma->vm_next && vma->vm_end < end)
 354                                 return ERR_PTR(-EFAULT);
 355                         if (prev && prev->vm_end < vma->vm_start)
 356                                 return ERR_PTR(-EFAULT);
 357                 }
 358                 if (!is_vm_hugetlb_page(vma) &&
 359                     ((flags & MPOL_MF_STRICT) ||
 360                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 361                                 vma_migratable(vma)))) {
 362                         unsigned long endvma = vma->vm_end;
 363
 364                         if (endvma > end)
 365                                 endvma = end;
 366                         if (vma->vm_start > start)
 367                                 start = vma->vm_start;
 368                         err = check_pgd_range(vma, start, endvma, nodes,
 369                                                 flags, private);
 370                         if (err) {
 371                                 first = ERR_PTR(err);
 372                                 break;
 373                         }
 374                 }
 375                 prev = vma;
 376         }
 377         return first;
 378 }
 379
 380 /* Apply policy to a single VMA */
 381 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 382 {
 383         int err = 0;
 384         struct mempolicy *old = vma->vm_policy;
 385
 386         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 387                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 388                  vma->vm_ops, vma->vm_file,
 389                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 390
 391         if (vma->vm_ops && vma->vm_ops->set_policy)
 392                 err = vma->vm_ops->set_policy(vma, new);
 393         if (!err) {
 394                 mpol_get(new);
 395                 vma->vm_policy = new;
 396                 mpol_free(old);
 397         }
 398         return err;
 399 }
 400
 401 /* Step 2: apply policy to a range and do splits. */
 402 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 403                        unsigned long end, struct mempolicy *new)
 404 {
 405         struct vm_area_struct *next;
 406         int err;
 407
 408         err = 0;
 409         for (; vma && vma->vm_start < end; vma = next) {
 410                 next = vma->vm_next;
 411                 if (vma->vm_start < start)
 412                         err = split_vma(vma->vm_mm, vma, start, 1);
 413                 if (!err && vma->vm_end > end)
 414                         err = split_vma(vma->vm_mm, vma, end, 0);
 415                 if (!err)
 416                         err = policy_vma(vma, new);
 417                 if (err)
 418                         break;
 419         }
 420         return err;
 421 }
 422
 423 static int contextualize_policy(int mode, nodemask_t *nodes)
 424 {
 425         if (!nodes)
 426                 return 0;
 427
 428         cpuset_update_task_memory_state();
 429         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 430                 return -EINVAL;
 431         return mpol_check_policy(mode, nodes);
 432 }
 433
 434
 435 /*
 436  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 437  * mempolicy.  Allows more rapid checking of this (combined perhaps
 438  * with other PF_* flag bits) on memory allocation hot code paths.
 439  *
 440  * If called from outside this file, the task 'p' should -only- be
 441  * a newly forked child not yet visible on the task list, because
 442  * manipulating the task flags of a visible task is not safe.
 443  *
 444  * The above limitation is why this routine has the funny name
 445  * mpol_fix_fork_child_flag().
 446  *
 447  * It is also safe to call this with a task pointer of current,
 448  * which the static wrapper mpol_set_task_struct_flag() does,
 449  * for use within this file.
 450  */
 451
 452 void mpol_fix_fork_child_flag(struct task_struct *p)
 453 {
 454         if (p->mempolicy)
 455                 p->flags |= PF_MEMPOLICY;
 456         else
 457                 p->flags &= ~PF_MEMPOLICY;
 458 }
 459
 460 static void mpol_set_task_struct_flag(void)
 461 {
 462         mpol_fix_fork_child_flag(current);
 463 }
 464
 465 /* Set the process memory policy */
 466 long do_set_mempolicy(int mode, nodemask_t *nodes)
 467 {
 468         struct mempolicy *new;
 469
 470         if (contextualize_policy(mode, nodes))
 471                 return -EINVAL;
 472         new = mpol_new(mode, nodes);
 473         if (IS_ERR(new))
 474                 return PTR_ERR(new);
 475         mpol_free(current->mempolicy);
 476         current->mempolicy = new;
 477         mpol_set_task_struct_flag();
 478         if (new && new->policy == MPOL_INTERLEAVE)
 479                 current->il_next = first_node(new->v.nodes);
 480         return 0;
 481 }
 482
 483 /* Fill a zone bitmap for a policy */
 484 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 485 {
 486         int i;
 487
 488         nodes_clear(*nodes);
 489         switch (p->policy) {
 490         case MPOL_BIND:
 491                 for (i = 0; p->v.zonelist->zones[i]; i++)
 492                         node_set(zone_to_nid(p->v.zonelist->zones[i]),
 493                                 *nodes);
 494                 break;
 495         case MPOL_DEFAULT:
 496                 break;
 497         case MPOL_INTERLEAVE:
 498                 *nodes = p->v.nodes;
 499                 break;
 500         case MPOL_PREFERRED:
 501                 /* or use current node instead of online map? */
 502                 if (p->v.preferred_node < 0)
 503                         *nodes = node_online_map;
 504                 else
 505                         node_set(p->v.preferred_node, *nodes);
 506                 break;
 507         default:
 508                 BUG();
 509         }
 510 }
 511
 512 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 513 {
 514         struct page *p;
 515         int err;
 516
 517         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 518         if (err >= 0) {
 519                 err = page_to_nid(p);
 520                 put_page(p);
 521         }
 522         return err;
 523 }
 524
 525 /* Retrieve NUMA policy */
 526 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 527                         unsigned long addr, unsigned long flags)
 528 {
 529         int err;
 530         struct mm_struct *mm = current->mm;
 531         struct vm_area_struct *vma = NULL;
 532         struct mempolicy *pol = current->mempolicy;
 533
 534         cpuset_update_task_memory_state();
 535         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 536                 return -EINVAL;
 537         if (flags & MPOL_F_ADDR) {
 538                 down_read(&mm->mmap_sem);
 539                 vma = find_vma_intersection(mm, addr, addr+1);
 540                 if (!vma) {
 541                         up_read(&mm->mmap_sem);
 542                         return -EFAULT;
 543                 }
 544                 if (vma->vm_ops && vma->vm_ops->get_policy)
 545                         pol = vma->vm_ops->get_policy(vma, addr);
 546                 else
 547                         pol = vma->vm_policy;
 548         } else if (addr)
 549                 return -EINVAL;
 550
 551         if (!pol)
 552                 pol = &default_policy;
 553
 554         if (flags & MPOL_F_NODE) {
 555                 if (flags & MPOL_F_ADDR) {
 556                         err = lookup_node(mm, addr);
 557                         if (err < 0)
 558                                 goto out;
 559                         *policy = err;
 560                 } else if (pol == current->mempolicy &&
 561                                 pol->policy == MPOL_INTERLEAVE) {
 562                         *policy = current->il_next;
 563                 } else {
 564                         err = -EINVAL;
 565                         goto out;
 566                 }
 567         } else
 568                 *policy = pol->policy;
 569
 570         if (vma) {
 571                 up_read(&current->mm->mmap_sem);
 572                 vma = NULL;
 573         }
 574
 575         err = 0;
 576         if (nmask)
 577                 get_zonemask(pol, nmask);
 578
 579  out:
 580         if (vma)
 581                 up_read(&current->mm->mmap_sem);
 582         return err;
 583 }
 584
 585 #ifdef CONFIG_MIGRATION
 586 /*
 587  * page migration
 588  */
 589 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 590                                 unsigned long flags)
 591 {
 592         /*
 593          * Avoid migrating a page that is shared with others.
 594          */
 595         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 596                 isolate_lru_page(page, pagelist);
 597 }
 598
 599 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 600 {
 601         return alloc_pages_node(node, GFP_HIGHUSER, 0);
 602 }
 603
 604 /*
 605  * Migrate pages from one node to a target node.
 606  * Returns error or the number of pages not migrated.
 607  */
 608 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 609 {
 610         nodemask_t nmask;
 611         LIST_HEAD(pagelist);
 612         int err = 0;
 613
 614         nodes_clear(nmask);
 615         node_set(source, nmask);
 616
 617         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 618                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 619
 620         if (!list_empty(&pagelist))
 621                 err = migrate_pages(&pagelist, new_node_page, dest);
 622
 623         return err;
 624 }
 625
 626 /*
 627  * Move pages between the two nodesets so as to preserve the physical
 628  * layout as much as possible.
 629  *
 630  * Returns the number of page that could not be moved.
 631  */
 632 int do_migrate_pages(struct mm_struct *mm,
 633         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 634 {
 635         LIST_HEAD(pagelist);
 636         int busy = 0;
 637         int err = 0;
 638         nodemask_t tmp;
 639
 640         down_read(&mm->mmap_sem);
 641
 642         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 643         if (err)
 644                 goto out;
 645
 646 /*
 647  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 648  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 649  * bit in 'tmp', and return that <source, dest> pair for migration.
 650  * The pair of nodemasks 'to' and 'from' define the map.
 651  *
 652  * If no pair of bits is found that way, fallback to picking some
 653  * pair of 'source' and 'dest' bits that are not the same.  If the
 654  * 'source' and 'dest' bits are the same, this represents a node
 655  * that will be migrating to itself, so no pages need move.
 656  *
 657  * If no bits are left in 'tmp', or if all remaining bits left
 658  * in 'tmp' correspond to the same bit in 'to', return false
 659  * (nothing left to migrate).
 660  *
 661  * This lets us pick a pair of nodes to migrate between, such that
 662  * if possible the dest node is not already occupied by some other
 663  * source node, minimizing the risk of overloading the memory on a
 664  * node that would happen if we migrated incoming memory to a node
 665  * before migrating outgoing memory source that same node.
 666  *
 667  * A single scan of tmp is sufficient.  As we go, we remember the
 668  * most recent <s, d> pair that moved (s != d).  If we find a pair
 669  * that not only moved, but what's better, moved to an empty slot
 670  * (d is not set in tmp), then we break out then, with that pair.
 671  * Otherwise when we finish scannng from_tmp, we at least have the
 672  * most recent <s, d> pair that moved.  If we get all the way through
 673  * the scan of tmp without finding any node that moved, much less
 674  * moved to an empty node, then there is nothing left worth migrating.
 675  */
 676
 677         tmp = *from_nodes;
 678         while (!nodes_empty(tmp)) {
 679                 int s,d;
 680                 int source = -1;
 681                 int dest = 0;
 682
 683                 for_each_node_mask(s, tmp) {
 684                         d = node_remap(s, *from_nodes, *to_nodes);
 685                         if (s == d)
 686                                 continue;
 687
 688                         source = s;     /* Node moved. Memorize */
 689                         dest = d;
 690
 691                         /* dest not in remaining from nodes? */
 692                         if (!node_isset(dest, tmp))
 693                                 break;
 694                 }
 695                 if (source == -1)
 696                         break;
 697
 698                 node_clear(source, tmp);
 699                 err = migrate_to_node(mm, source, dest, flags);
 700                 if (err > 0)
 701                         busy += err;
 702                 if (err < 0)
 703                         break;
 704         }
 705 out:
 706         up_read(&mm->mmap_sem);
 707         if (err < 0)
 708                 return err;
 709         return busy;
 710
 711 }
 712
 713 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 714 {
 715         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 716
 717         return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 718 }
 719 #else
 720
 721 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 722                                 unsigned long flags)
 723 {
 724 }
 725
 726 int do_migrate_pages(struct mm_struct *mm,
 727         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 728 {
 729         return -ENOSYS;
 730 }
 731
 732 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 733 {
 734         return NULL;
 735 }
 736 #endif
 737
 738 long do_mbind(unsigned long start, unsigned long len,
 739                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 740 {
 741         struct vm_area_struct *vma;
 742         struct mm_struct *mm = current->mm;
 743         struct mempolicy *new;
 744         unsigned long end;
 745         int err;
 746         LIST_HEAD(pagelist);
 747
 748         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 749                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 750             || mode > MPOL_MAX)
 751                 return -EINVAL;
 752         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 753                 return -EPERM;
 754
 755         if (start & ~PAGE_MASK)
 756                 return -EINVAL;
 757
 758         if (mode == MPOL_DEFAULT)
 759                 flags &= ~MPOL_MF_STRICT;
 760
 761         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 762         end = start + len;
 763
 764         if (end < start)
 765                 return -EINVAL;
 766         if (end == start)
 767                 return 0;
 768
 769         if (mpol_check_policy(mode, nmask))
 770                 return -EINVAL;
 771
 772         new = mpol_new(mode, nmask);
 773         if (IS_ERR(new))
 774                 return PTR_ERR(new);
 775
 776         /*
 777          * If we are using the default policy then operation
 778          * on discontinuous address spaces is okay after all
 779          */
 780         if (!new)
 781                 flags |= MPOL_MF_DISCONTIG_OK;
 782
 783         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 784                         mode,nodes_addr(nodes)[0]);
 785
 786         down_write(&mm->mmap_sem);
 787         vma = check_range(mm, start, end, nmask,
 788                           flags | MPOL_MF_INVERT, &pagelist);
 789
 790         err = PTR_ERR(vma);
 791         if (!IS_ERR(vma)) {
 792                 int nr_failed = 0;
 793
 794                 err = mbind_range(vma, start, end, new);
 795
 796                 if (!list_empty(&pagelist))
 797                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 798                                                 (unsigned long)vma);
 799
 800                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 801                         err = -EIO;
 802         }
 803
 804         up_write(&mm->mmap_sem);
 805         mpol_free(new);
 806         return err;
 807 }
 808
 809 /*
 810  * User space interface with variable sized bitmaps for nodelists.
 811  */
 812
 813 /* Copy a node mask from user space. */
 814 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 815                      unsigned long maxnode)
 816 {
 817         unsigned long k;
 818         unsigned long nlongs;
 819         unsigned long endmask;
 820
 821         --maxnode;
 822         nodes_clear(*nodes);
 823         if (maxnode == 0 || !nmask)
 824                 return 0;
 825         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 826                 return -EINVAL;
 827
 828         nlongs = BITS_TO_LONGS(maxnode);
 829         if ((maxnode % BITS_PER_LONG) == 0)
 830                 endmask = ~0UL;
 831         else
 832                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 833
 834         /* When the user specified more nodes than supported just check
 835            if the non supported part is all zero. */
 836         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 837                 if (nlongs > PAGE_SIZE/sizeof(long))
 838                         return -EINVAL;
 839                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 840                         unsigned long t;
 841                         if (get_user(t, nmask + k))
 842                                 return -EFAULT;
 843                         if (k == nlongs - 1) {
 844                                 if (t & endmask)
 845                                         return -EINVAL;
 846                         } else if (t)
 847                                 return -EINVAL;
 848                 }
 849                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 850                 endmask = ~0UL;
 851         }
 852
 853         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 854                 return -EFAULT;
 855         nodes_addr(*nodes)[nlongs-1] &= endmask;
 856         return 0;
 857 }
 858
 859 /* Copy a kernel node mask to user space */
 860 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 861                               nodemask_t *nodes)
 862 {
 863         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 864         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 865
 866         if (copy > nbytes) {
 867                 if (copy > PAGE_SIZE)
 868                         return -EINVAL;
 869                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 870                         return -EFAULT;
 871                 copy = nbytes;
 872         }
 873         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 874 }
 875
 876 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 877                         unsigned long mode,
 878                         unsigned long __user *nmask, unsigned long maxnode,
 879                         unsigned flags)
 880 {
 881         nodemask_t nodes;
 882         int err;
 883
 884         err = get_nodes(&nodes, nmask, maxnode);
 885         if (err)
 886                 return err;
 887         return do_mbind(start, len, mode, &nodes, flags);
 888 }
 889
 890 /* Set the process memory policy */
 891 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 892                 unsigned long maxnode)
 893 {
 894         int err;
 895         nodemask_t nodes;
 896
 897         if (mode < 0 || mode > MPOL_MAX)
 898                 return -EINVAL;
 899         err = get_nodes(&nodes, nmask, maxnode);
 900         if (err)
 901                 return err;
 902         return do_set_mempolicy(mode, &nodes);
 903 }
 904
 905 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 906                 const unsigned long __user *old_nodes,
 907                 const unsigned long __user *new_nodes)
 908 {
 909         struct mm_struct *mm;
 910         struct task_struct *task;
 911         nodemask_t old;
 912         nodemask_t new;
 913         nodemask_t task_nodes;
 914         int err;
 915
 916         err = get_nodes(&old, old_nodes, maxnode);
 917         if (err)
 918                 return err;
 919
 920         err = get_nodes(&new, new_nodes, maxnode);
 921         if (err)
 922                 return err;
 923
 924         /* Find the mm_struct */
 925         read_lock(&tasklist_lock);
 926         task = pid ? find_task_by_pid(pid) : current;
 927         if (!task) {
 928                 read_unlock(&tasklist_lock);
 929                 return -ESRCH;
 930         }
 931         mm = get_task_mm(task);
 932         read_unlock(&tasklist_lock);
 933
 934         if (!mm)
 935                 return -EINVAL;
 936
 937         /*
 938          * Check if this process has the right to modify the specified
 939          * process. The right exists if the process has administrative
 940          * capabilities, superuser privileges or the same
 941          * userid as the target process.
 942          */
 943         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 944             (current->uid != task->suid) && (current->uid != task->uid) &&
 945             !capable(CAP_SYS_NICE)) {
 946                 err = -EPERM;
 947                 goto out;
 948         }
 949
 950         task_nodes = cpuset_mems_allowed(task);
 951         /* Is the user allowed to access the target nodes? */
 952         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 953                 err = -EPERM;
 954                 goto out;
 955         }
 956
 957         err = security_task_movememory(task);
 958         if (err)
 959                 goto out;
 960
 961         err = do_migrate_pages(mm, &old, &new,
 962                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 963 out:
 964         mmput(mm);
 965         return err;
 966 }
 967
 968
 969 /* Retrieve NUMA policy */
 970 asmlinkage long sys_get_mempolicy(int __user *policy,
 971                                 unsigned long __user *nmask,
 972                                 unsigned long maxnode,
 973                                 unsigned long addr, unsigned long flags)
 974 {
 975         int err, pval;
 976         nodemask_t nodes;
 977
 978         if (nmask != NULL && maxnode < MAX_NUMNODES)
 979                 return -EINVAL;
 980
 981         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 982
 983         if (err)
 984                 return err;
 985
 986         if (policy && put_user(pval, policy))
 987                 return -EFAULT;
 988
 989         if (nmask)
 990                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 991
 992         return err;
 993 }
 994
 995 #ifdef CONFIG_COMPAT
 996
 997 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 998                                      compat_ulong_t __user *nmask,
 999                                      compat_ulong_t maxnode,
1000                                      compat_ulong_t addr, compat_ulong_t flags)
1001 {
1002         long err;
1003         unsigned long __user *nm = NULL;
1004         unsigned long nr_bits, alloc_size;
1005         DECLARE_BITMAP(bm, MAX_NUMNODES);
1006
1007         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1008         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1009
1010         if (nmask)
1011                 nm = compat_alloc_user_space(alloc_size);
1012
1013         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1014
1015         if (!err && nmask) {
1016                 err = copy_from_user(bm, nm, alloc_size);
1017                 /* ensure entire bitmap is zeroed */
1018                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1019                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1020         }
1021
1022         return err;
1023 }
1024
1025 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1026                                      compat_ulong_t maxnode)
1027 {
1028         long err = 0;
1029         unsigned long __user *nm = NULL;
1030         unsigned long nr_bits, alloc_size;
1031         DECLARE_BITMAP(bm, MAX_NUMNODES);
1032
1033         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1034         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1035
1036         if (nmask) {
1037                 err = compat_get_bitmap(bm, nmask, nr_bits);
1038                 nm = compat_alloc_user_space(alloc_size);
1039                 err |= copy_to_user(nm, bm, alloc_size);
1040         }
1041
1042         if (err)
1043                 return -EFAULT;
1044
1045         return sys_set_mempolicy(mode, nm, nr_bits+1);
1046 }
1047
1048 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1049                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1050                              compat_ulong_t maxnode, compat_ulong_t flags)
1051 {
1052         long err = 0;
1053         unsigned long __user *nm = NULL;
1054         unsigned long nr_bits, alloc_size;
1055         nodemask_t bm;
1056
1057         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1058         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1059
1060         if (nmask) {
1061                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1062                 nm = compat_alloc_user_space(alloc_size);
1063                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1064         }
1065
1066         if (err)
1067                 return -EFAULT;
1068
1069         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1070 }
1071
1072 #endif
1073
1074 /* Return effective policy for a VMA */
1075 static struct mempolicy * get_vma_policy(struct task_struct *task,
1076                 struct vm_area_struct *vma, unsigned long addr)
1077 {
1078         struct mempolicy *pol = task->mempolicy;
1079
1080         if (vma) {
1081                 if (vma->vm_ops && vma->vm_ops->get_policy)
1082                         pol = vma->vm_ops->get_policy(vma, addr);
1083                 else if (vma->vm_policy &&
1084                                 vma->vm_policy->policy != MPOL_DEFAULT)
1085                         pol = vma->vm_policy;
1086         }
1087         if (!pol)
1088                 pol = &default_policy;
1089         return pol;
1090 }
1091
1092 /* Return a zonelist representing a mempolicy */
1093 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1094 {
1095         int nd;
1096
1097         switch (policy->policy) {
1098         case MPOL_PREFERRED:
1099                 nd = policy->v.preferred_node;
1100                 if (nd < 0)
1101                         nd = numa_node_id();
1102                 break;
1103         case MPOL_BIND:
1104                 /* Lower zones don't get a policy applied */
1105                 /* Careful: current->mems_allowed might have moved */
1106                 if (gfp_zone(gfp) >= policy_zone)
1107                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1108                                 return policy->v.zonelist;
1109                 /*FALL THROUGH*/
1110         case MPOL_INTERLEAVE: /* should not happen */
1111         case MPOL_DEFAULT:
1112                 nd = numa_node_id();
1113                 break;
1114         default:
1115                 nd = 0;
1116                 BUG();
1117         }
1118         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1119 }
1120
1121 /* Do dynamic interleaving for a process */
1122 static unsigned interleave_nodes(struct mempolicy *policy)
1123 {
1124         unsigned nid, next;
1125         struct task_struct *me = current;
1126
1127         nid = me->il_next;
1128         next = next_node(nid, policy->v.nodes);
1129         if (next >= MAX_NUMNODES)
1130                 next = first_node(policy->v.nodes);
1131         me->il_next = next;
1132         return nid;
1133 }
1134
1135 /*
1136  * Depending on the memory policy provide a node from which to allocate the
1137  * next slab entry.
1138  */
1139 unsigned slab_node(struct mempolicy *policy)
1140 {
1141         int pol = policy ? policy->policy : MPOL_DEFAULT;
1142
1143         switch (pol) {
1144         case MPOL_INTERLEAVE:
1145                 return interleave_nodes(policy);
1146
1147         case MPOL_BIND:
1148                 /*
1149                  * Follow bind policy behavior and start allocation at the
1150                  * first node.
1151                  */
1152                 return zone_to_nid(policy->v.zonelist->zones[0]);
1153
1154         case MPOL_PREFERRED:
1155                 if (policy->v.preferred_node >= 0)
1156                         return policy->v.preferred_node;
1157                 /* Fall through */
1158
1159         default:
1160                 return numa_node_id();
1161         }
1162 }
1163
1164 /* Do static interleaving for a VMA with known offset. */
1165 static unsigned offset_il_node(struct mempolicy *pol,
1166                 struct vm_area_struct *vma, unsigned long off)
1167 {
1168         unsigned nnodes = nodes_weight(pol->v.nodes);
1169         unsigned target = (unsigned)off % nnodes;
1170         int c;
1171         int nid = -1;
1172
1173         c = 0;
1174         do {
1175                 nid = next_node(nid, pol->v.nodes);
1176                 c++;
1177         } while (c <= target);
1178         return nid;
1179 }
1180
1181 /* Determine a node number for interleave */
1182 static inline unsigned interleave_nid(struct mempolicy *pol,
1183                  struct vm_area_struct *vma, unsigned long addr, int shift)
1184 {
1185         if (vma) {
1186                 unsigned long off;
1187
1188                 /*
1189                  * for small pages, there is no difference between
1190                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1191                  * for huge pages, since vm_pgoff is in units of small
1192                  * pages, we need to shift off the always 0 bits to get
1193                  * a useful offset.
1194                  */
1195                 BUG_ON(shift < PAGE_SHIFT);
1196                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1197                 off += (addr - vma->vm_start) >> shift;
1198                 return offset_il_node(pol, vma, off);
1199         } else
1200                 return interleave_nodes(pol);
1201 }
1202
1203 #ifdef CONFIG_HUGETLBFS
1204 /* Return a zonelist suitable for a huge page allocation. */
1205 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1206 {
1207         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1208
1209         if (pol->policy == MPOL_INTERLEAVE) {
1210                 unsigned nid;
1211
1212                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1213                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1214         }
1215         return zonelist_policy(GFP_HIGHUSER, pol);
1216 }
1217 #endif
1218
1219 /* Allocate a page in interleaved policy.
1220    Own path because it needs to do special accounting. */
1221 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1222                                         unsigned nid)
1223 {
1224         struct zonelist *zl;
1225         struct page *page;
1226
1227         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1228         page = __alloc_pages(gfp, order, zl);
1229         if (page && page_zone(page) == zl->zones[0])
1230                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1231         return page;
1232 }
1233
1234 /**
1235  *      alloc_page_vma  - Allocate a page for a VMA.
1236  *
1237  *      @gfp:
1238  *      %GFP_USER    user allocation.
1239  *      %GFP_KERNEL  kernel allocations,
1240  *      %GFP_HIGHMEM highmem/user allocations,
1241  *      %GFP_FS      allocation should not call back into a file system.
1242  *      %GFP_ATOMIC  don't sleep.
1243  *
1244  *      @vma:  Pointer to VMA or NULL if not available.
1245  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1246  *
1247  *      This function allocates a page from the kernel page pool and applies
1248  *      a NUMA policy associated with the VMA or the current process.
1249  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1250  *      mm_struct of the VMA to prevent it from going away. Should be used for
1251  *      all allocations for pages that will be mapped into
1252  *      user space. Returns NULL when no page can be allocated.
1253  *
1254  *      Should be called with the mm_sem of the vma hold.
1255  */
1256 struct page *
1257 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1258 {
1259         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1260
1261         cpuset_update_task_memory_state();
1262
1263         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1264                 unsigned nid;
1265
1266                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1267                 return alloc_page_interleave(gfp, 0, nid);
1268         }
1269         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1270 }
1271
1272 /**
1273  *      alloc_pages_current - Allocate pages.
1274  *
1275  *      @gfp:
1276  *              %GFP_USER   user allocation,
1277  *              %GFP_KERNEL kernel allocation,
1278  *              %GFP_HIGHMEM highmem allocation,
1279  *              %GFP_FS     don't call back into a file system.
1280  *              %GFP_ATOMIC don't sleep.
1281  *      @order: Power of two of allocation size in pages. 0 is a single page.
1282  *
1283  *      Allocate a page from the kernel page pool.  When not in
1284  *      interrupt context and apply the current process NUMA policy.
1285  *      Returns NULL when no page can be allocated.
1286  *
1287  *      Don't call cpuset_update_task_memory_state() unless
1288  *      1) it's ok to take cpuset_sem (can WAIT), and
1289  *      2) allocating for current task (not interrupt).
1290  */
1291 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1292 {
1293         struct mempolicy *pol = current->mempolicy;
1294
1295         if ((gfp & __GFP_WAIT) && !in_interrupt())
1296                 cpuset_update_task_memory_state();
1297         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1298                 pol = &default_policy;
1299         if (pol->policy == MPOL_INTERLEAVE)
1300                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1301         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1302 }
1303 EXPORT_SYMBOL(alloc_pages_current);
1304
1305 /*
1306  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1307  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1308  * with the mems_allowed returned by cpuset_mems_allowed().  This keeps
1309  * mempolicies cpuset-relative after their cpuset moves.  See further
1310  * kernel/cpuset.c update_nodemask().
1311  */
1312 void *cpuset_being_rebound;
1313
1314 /* Slow path of a mempolicy copy */
1315 struct mempolicy *__mpol_copy(struct mempolicy *old)
1316 {
1317         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1318
1319         if (!new)
1320                 return ERR_PTR(-ENOMEM);
1321         if (current_cpuset_is_being_rebound()) {
1322                 nodemask_t mems = cpuset_mems_allowed(current);
1323                 mpol_rebind_policy(old, &mems);
1324         }
1325         *new = *old;
1326         atomic_set(&new->refcnt, 1);
1327         if (new->policy == MPOL_BIND) {
1328                 int sz = ksize(old->v.zonelist);
1329                 new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1330                 if (!new->v.zonelist) {
1331                         kmem_cache_free(policy_cache, new);
1332                         return ERR_PTR(-ENOMEM);
1333                 }
1334         }
1335         return new;
1336 }
1337
1338 /* Slow path of a mempolicy comparison */
1339 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1340 {
1341         if (!a || !b)
1342                 return 0;
1343         if (a->policy != b->policy)
1344                 return 0;
1345         switch (a->policy) {
1346         case MPOL_DEFAULT:
1347                 return 1;
1348         case MPOL_INTERLEAVE:
1349                 return nodes_equal(a->v.nodes, b->v.nodes);
1350         case MPOL_PREFERRED:
1351                 return a->v.preferred_node == b->v.preferred_node;
1352         case MPOL_BIND: {
1353                 int i;
1354                 for (i = 0; a->v.zonelist->zones[i]; i++)
1355                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1356                                 return 0;
1357                 return b->v.zonelist->zones[i] == NULL;
1358         }
1359         default:
1360                 BUG();
1361                 return 0;
1362         }
1363 }
1364
1365 /* Slow path of a mpol destructor. */
1366 void __mpol_free(struct mempolicy *p)
1367 {
1368         if (!atomic_dec_and_test(&p->refcnt))
1369                 return;
1370         if (p->policy == MPOL_BIND)
1371                 kfree(p->v.zonelist);
1372         p->policy = MPOL_DEFAULT;
1373         kmem_cache_free(policy_cache, p);
1374 }
1375
1376 /*
1377  * Shared memory backing store policy support.
1378  *
1379  * Remember policies even when nobody has shared memory mapped.
1380  * The policies are kept in Red-Black tree linked from the inode.
1381  * They are protected by the sp->lock spinlock, which should be held
1382  * for any accesses to the tree.
1383  */
1384
1385 /* lookup first element intersecting start-end */
1386 /* Caller holds sp->lock */
1387 static struct sp_node *
1388 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1389 {
1390         struct rb_node *n = sp->root.rb_node;
1391
1392         while (n) {
1393                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1394
1395                 if (start >= p->end)
1396                         n = n->rb_right;
1397                 else if (end <= p->start)
1398                         n = n->rb_left;
1399                 else
1400                         break;
1401         }
1402         if (!n)
1403                 return NULL;
1404         for (;;) {
1405                 struct sp_node *w = NULL;
1406                 struct rb_node *prev = rb_prev(n);
1407                 if (!prev)
1408                         break;
1409                 w = rb_entry(prev, struct sp_node, nd);
1410                 if (w->end <= start)
1411                         break;
1412                 n = prev;
1413         }
1414         return rb_entry(n, struct sp_node, nd);
1415 }
1416
1417 /* Insert a new shared policy into the list. */
1418 /* Caller holds sp->lock */
1419 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1420 {
1421         struct rb_node **p = &sp->root.rb_node;
1422         struct rb_node *parent = NULL;
1423         struct sp_node *nd;
1424
1425         while (*p) {
1426                 parent = *p;
1427                 nd = rb_entry(parent, struct sp_node, nd);
1428                 if (new->start < nd->start)
1429                         p = &(*p)->rb_left;
1430                 else if (new->end > nd->end)
1431                         p = &(*p)->rb_right;
1432                 else
1433                         BUG();
1434         }
1435         rb_link_node(&new->nd, parent, p);
1436         rb_insert_color(&new->nd, &sp->root);
1437         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1438                  new->policy ? new->policy->policy : 0);
1439 }
1440
1441 /* Find shared policy intersecting idx */
1442 struct mempolicy *
1443 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1444 {
1445         struct mempolicy *pol = NULL;
1446         struct sp_node *sn;
1447
1448         if (!sp->root.rb_node)
1449                 return NULL;
1450         spin_lock(&sp->lock);
1451         sn = sp_lookup(sp, idx, idx+1);
1452         if (sn) {
1453                 mpol_get(sn->policy);
1454                 pol = sn->policy;
1455         }
1456         spin_unlock(&sp->lock);
1457         return pol;
1458 }
1459
1460 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1461 {
1462         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1463         rb_erase(&n->nd, &sp->root);
1464         mpol_free(n->policy);
1465         kmem_cache_free(sn_cache, n);
1466 }
1467
1468 struct sp_node *
1469 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1470 {
1471         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1472
1473         if (!n)
1474                 return NULL;
1475         n->start = start;
1476         n->end = end;
1477         mpol_get(pol);
1478         n->policy = pol;
1479         return n;
1480 }
1481
1482 /* Replace a policy range. */
1483 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1484                                  unsigned long end, struct sp_node *new)
1485 {
1486         struct sp_node *n, *new2 = NULL;
1487
1488 restart:
1489         spin_lock(&sp->lock);
1490         n = sp_lookup(sp, start, end);
1491         /* Take care of old policies in the same range. */
1492         while (n && n->start < end) {
1493                 struct rb_node *next = rb_next(&n->nd);
1494                 if (n->start >= start) {
1495                         if (n->end <= end)
1496                                 sp_delete(sp, n);
1497                         else
1498                                 n->start = end;
1499                 } else {
1500                         /* Old policy spanning whole new range. */
1501                         if (n->end > end) {
1502                                 if (!new2) {
1503                                         spin_unlock(&sp->lock);
1504                                         new2 = sp_alloc(end, n->end, n->policy);
1505                                         if (!new2)
1506                                                 return -ENOMEM;
1507                                         goto restart;
1508                                 }
1509                                 n->end = start;
1510                                 sp_insert(sp, new2);
1511                                 new2 = NULL;
1512                                 break;
1513                         } else
1514                                 n->end = start;
1515                 }
1516                 if (!next)
1517                         break;
1518                 n = rb_entry(next, struct sp_node, nd);
1519         }
1520         if (new)
1521                 sp_insert(sp, new);
1522         spin_unlock(&sp->lock);
1523         if (new2) {
1524                 mpol_free(new2->policy);
1525                 kmem_cache_free(sn_cache, new2);
1526         }
1527         return 0;
1528 }
1529
1530 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1531                                 nodemask_t *policy_nodes)
1532 {
1533         info->root = RB_ROOT;
1534         spin_lock_init(&info->lock);
1535
1536         if (policy != MPOL_DEFAULT) {
1537                 struct mempolicy *newpol;
1538
1539                 /* Falls back to MPOL_DEFAULT on any error */
1540                 newpol = mpol_new(policy, policy_nodes);
1541                 if (!IS_ERR(newpol)) {
1542                         /* Create pseudo-vma that contains just the policy */
1543                         struct vm_area_struct pvma;
1544
1545                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1546                         /* Policy covers entire file */
1547                         pvma.vm_end = TASK_SIZE;
1548                         mpol_set_shared_policy(info, &pvma, newpol);
1549                         mpol_free(newpol);
1550                 }
1551         }
1552 }
1553
1554 int mpol_set_shared_policy(struct shared_policy *info,
1555                         struct vm_area_struct *vma, struct mempolicy *npol)
1556 {
1557         int err;
1558         struct sp_node *new = NULL;
1559         unsigned long sz = vma_pages(vma);
1560
1561         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1562                  vma->vm_pgoff,
1563                  sz, npol? npol->policy : -1,
1564                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1565
1566         if (npol) {
1567                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1568                 if (!new)
1569                         return -ENOMEM;
1570         }
1571         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1572         if (err && new)
1573                 kmem_cache_free(sn_cache, new);
1574         return err;
1575 }
1576
1577 /* Free a backing policy store on inode delete. */
1578 void mpol_free_shared_policy(struct shared_policy *p)
1579 {
1580         struct sp_node *n;
1581         struct rb_node *next;
1582
1583         if (!p->root.rb_node)
1584                 return;
1585         spin_lock(&p->lock);
1586         next = rb_first(&p->root);
1587         while (next) {
1588                 n = rb_entry(next, struct sp_node, nd);
1589                 next = rb_next(&n->nd);
1590                 rb_erase(&n->nd, &p->root);
1591                 mpol_free(n->policy);
1592                 kmem_cache_free(sn_cache, n);
1593         }
1594         spin_unlock(&p->lock);
1595 }
1596
1597 /* assumes fs == KERNEL_DS */
1598 void __init numa_policy_init(void)
1599 {
1600         policy_cache = kmem_cache_create("numa_policy",
1601                                          sizeof(struct mempolicy),
1602                                          0, SLAB_PANIC, NULL, NULL);
1603
1604         sn_cache = kmem_cache_create("shared_policy_node",
1605                                      sizeof(struct sp_node),
1606                                      0, SLAB_PANIC, NULL, NULL);
1607
1608         /* Set interleaving policy for system init. This way not all
1609            the data structures allocated at system boot end up in node zero. */
1610
1611         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1612                 printk("numa_policy_init: interleaving failed\n");
1613 }
1614
 1615 /* Reset the policy of the current process to MPOL_DEFAULT; the result of do_set_mempolicy() is ignored. */
 1616 void numa_default_policy(void)
 1617 {
 1618         do_set_mempolicy(MPOL_DEFAULT, NULL);
 1619 }
1620
1621 /* Migrate a policy to a different set of nodes */
1622 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1623 {
1624         nodemask_t *mpolmask;
1625         nodemask_t tmp;
1626
1627         if (!pol)
1628                 return;
1629         mpolmask = &pol->cpuset_mems_allowed;
1630         if (nodes_equal(*mpolmask, *newmask))
1631                 return;
1632
1633         switch (pol->policy) {
1634         case MPOL_DEFAULT:
1635                 break;
1636         case MPOL_INTERLEAVE:
1637                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1638                 pol->v.nodes = tmp;
1639                 *mpolmask = *newmask;
1640                 current->il_next = node_remap(current->il_next,
1641                                                 *mpolmask, *newmask);
1642                 break;
1643         case MPOL_PREFERRED:
1644                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1645                                                 *mpolmask, *newmask);
1646                 *mpolmask = *newmask;
1647                 break;
1648         case MPOL_BIND: {
1649                 nodemask_t nodes;
1650                 struct zone **z;
1651                 struct zonelist *zonelist;
1652
1653                 nodes_clear(nodes);
1654                 for (z = pol->v.zonelist->zones; *z; z++)
1655                         node_set(zone_to_nid(*z), nodes);
1656                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1657                 nodes = tmp;
1658
1659                 zonelist = bind_zonelist(&nodes);
1660
1661                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1662                  * If that old zonelist has no remaining mems_allowed nodes,
1663                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1664                  */
1665
1666                 if (zonelist) {
1667                         /* Good - got mem - substitute new zonelist */
1668                         kfree(pol->v.zonelist);
1669                         pol->v.zonelist = zonelist;
1670                 }
1671                 *mpolmask = *newmask;
1672                 break;
1673         }
1674         default:
1675                 BUG();
1676                 break;
1677         }
1678 }
1679
1680 /*
1681  * Wrapper for mpol_rebind_policy() that just requires task
1682  * pointer, and updates task mempolicy.
1683  */
1684
1685 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1686 {
1687         mpol_rebind_policy(tsk->mempolicy, new);
1688 }
1689
1690 /*
1691  * Rebind each vma in mm to new nodemask.
1692  *
1693  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1694  */
1695
1696 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1697 {
1698         struct vm_area_struct *vma;
1699
1700         down_write(&mm->mmap_sem);
1701         for (vma = mm->mmap; vma; vma = vma->vm_next)
1702                 mpol_rebind_policy(vma->vm_policy, new);
1703         up_write(&mm->mmap_sem);
1704 }
1705
/*
 * Display pages allocated per node and memory policy via /proc.
 */

/*
 * Printable policy names.  mpol_to_str() indexes this table directly with
 * the policy mode, so the order of the entries has to follow the numeric
 * values of the MPOL_* mode constants.
 */
static const char * const policy_types[] = {
	"default",
	"prefer",
	"bind",
	"interleave",
};
1712
1713 /*
1714  * Convert a mempolicy into a string.
1715  * Returns the number of characters in buffer (if positive)
1716  * or an error (negative)
1717  */
1718 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1719 {
1720         char *p = buffer;
1721         int l;
1722         nodemask_t nodes;
1723         int mode = pol ? pol->policy : MPOL_DEFAULT;
1724
1725         switch (mode) {
1726         case MPOL_DEFAULT:
1727                 nodes_clear(nodes);
1728                 break;
1729
1730         case MPOL_PREFERRED:
1731                 nodes_clear(nodes);
1732                 node_set(pol->v.preferred_node, nodes);
1733                 break;
1734
1735         case MPOL_BIND:
1736                 get_zonemask(pol, &nodes);
1737                 break;
1738
1739         case MPOL_INTERLEAVE:
1740                 nodes = pol->v.nodes;
1741                 break;
1742
1743         default:
1744                 BUG();
1745                 return -EFAULT;
1746         }
1747
1748         l = strlen(policy_types[mode]);
1749         if (buffer + maxlen < p + l + 1)
1750                 return -ENOSPC;
1751
1752         strcpy(p, policy_types[mode]);
1753         p += l;
1754
1755         if (!nodes_empty(nodes)) {
1756                 if (buffer + maxlen < p + 2)
1757                         return -ENOSPC;
1758                 *p++ = '=';
1759                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1760         }
1761         return p - buffer;
1762 }
1763
1764 struct numa_maps {
1765         unsigned long pages;
1766         unsigned long anon;
1767         unsigned long active;
1768         unsigned long writeback;
1769         unsigned long mapcount_max;
1770         unsigned long dirty;
1771         unsigned long swapcache;
1772         unsigned long node[MAX_NUMNODES];
1773 };
1774
1775 static void gather_stats(struct page *page, void *private, int pte_dirty)
1776 {
1777         struct numa_maps *md = private;
1778         int count = page_mapcount(page);
1779
1780         md->pages++;
1781         if (pte_dirty || PageDirty(page))
1782                 md->dirty++;
1783
1784         if (PageSwapCache(page))
1785                 md->swapcache++;
1786
1787         if (PageActive(page))
1788                 md->active++;
1789
1790         if (PageWriteback(page))
1791                 md->writeback++;
1792
1793         if (PageAnon(page))
1794                 md->anon++;
1795
1796         if (count > md->mapcount_max)
1797                 md->mapcount_max = count;
1798
1799         md->node[page_to_nid(page)]++;
1800 }
1801
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Walk [start, end) of a hugetlb vma in HPAGE_SIZE steps and hand the page
 * behind every populated huge pte to gather_stats(), accumulating into @md.
 * Addresses with no huge pte, an empty pte, or no page are skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long hva;

	for (hva = start; hva < end; hva += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, hva & HPAGE_MASK);
		struct page *hpage;
		pte_t entry;

		if (ptep == NULL)
			continue;

		/* Work on a copy of the pte for the none/page lookups. */
		entry = *ptep;
		hpage = pte_none(entry) ? NULL : pte_page(entry);

		if (hpage != NULL)
			gather_stats(hpage, md, pte_dirty(*ptep));
	}
}
#else
/* Built without CONFIG_HUGETLB_PAGE: nothing to account, empty stub. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
1835
1836 int show_numa_map(struct seq_file *m, void *v)
1837 {
1838         struct proc_maps_private *priv = m->private;
1839         struct vm_area_struct *vma = v;
1840         struct numa_maps *md;
1841         struct file *file = vma->vm_file;
1842         struct mm_struct *mm = vma->vm_mm;
1843         int n;
1844         char buffer[50];
1845
1846         if (!mm)
1847                 return 0;
1848
1849         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1850         if (!md)
1851                 return 0;
1852
1853         mpol_to_str(buffer, sizeof(buffer),
1854                             get_vma_policy(priv->task, vma, vma->vm_start));
1855
1856         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1857
1858         if (file) {
1859                 seq_printf(m, " file=");
1860                 seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1861         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1862                 seq_printf(m, " heap");
1863         } else if (vma->vm_start <= mm->start_stack &&
1864                         vma->vm_end >= mm->start_stack) {
1865                 seq_printf(m, " stack");
1866         }
1867
1868         if (is_vm_hugetlb_page(vma)) {
1869                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1870                 seq_printf(m, " huge");
1871         } else {
1872                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1873                                 &node_online_map, MPOL_MF_STATS, md);
1874         }
1875
1876         if (!md->pages)
1877                 goto out;
1878
1879         if (md->anon)
1880                 seq_printf(m," anon=%lu",md->anon);
1881
1882         if (md->dirty)
1883                 seq_printf(m," dirty=%lu",md->dirty);
1884
1885         if (md->pages != md->anon && md->pages != md->dirty)
1886                 seq_printf(m, " mapped=%lu", md->pages);
1887
1888         if (md->mapcount_max > 1)
1889                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1890
1891         if (md->swapcache)
1892                 seq_printf(m," swapcache=%lu", md->swapcache);
1893
1894         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1895                 seq_printf(m," active=%lu", md->active);
1896
1897         if (md->writeback)
1898                 seq_printf(m," writeback=%lu", md->writeback);
1899
1900         for_each_online_node(n)
1901                 if (md->node[n])
1902                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1903 out:
1904         seq_putc(m, '\n');
1905         kfree(md);
1906
1907         if (m->count < m->size)
1908                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1909         return 0;
1910 }
1911