mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel does not always handle that gracefully.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
  /*
   * Internal flags: used only inside this file, built by shifting
   * MPOL_MF_INTERNAL.
   */

  /* Skip checks for continuous vmas */
  #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)

  /* Invert check for nodemask */
  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)

  /* Gather statistics */
  #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)
  97
  98 static kmem_cache_t *policy_cache;
  99 static kmem_cache_t *sn_cache;
 100
 101 #define PDprintk(fmt...)
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 int policy_zone = ZONE_DMA;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 /* Do sanity checking on a policy */
 113 static int mpol_check_policy(int mode, nodemask_t *nodes)
 114 {
 115         int empty = nodes_empty(*nodes);
 116
 117         switch (mode) {
 118         case MPOL_DEFAULT:
 119                 if (!empty)
 120                         return -EINVAL;
 121                 break;
 122         case MPOL_BIND:
 123         case MPOL_INTERLEAVE:
 124                 /* Preferred will only use the first bit, but allow
 125                    more for now. */
 126                 if (empty)
 127                         return -EINVAL;
 128                 break;
 129         }
 130         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 131 }
 132 /* Generate a custom zonelist for the BIND policy. */
 133 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 134 {
 135         struct zonelist *zl;
 136         int num, max, nd;
 137
 138         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 139         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 140         if (!zl)
 141                 return NULL;
 142         num = 0;
 143         for_each_node_mask(nd, *nodes)
 144                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         return policy;
 184 }
 185
 /* Forward declarations of helpers called from check_pte_range(). */
 static void gather_stats(struct page *page, void *private);
 static void migrate_page_add(struct vm_area_struct *vma, struct page *page,
                 struct list_head *pagelist, unsigned long flags);
 189
 190 /* Scan through pages checking if pages follow certain conditions. */
 191 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 192                 unsigned long addr, unsigned long end,
 193                 const nodemask_t *nodes, unsigned long flags,
 194                 void *private)
 195 {
 196         pte_t *orig_pte;
 197         pte_t *pte;
 198         spinlock_t *ptl;
 199
 200         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 201         do {
 202                 struct page *page;
 203                 unsigned int nid;
 204
 205                 if (!pte_present(*pte))
 206                         continue;
 207                 page = vm_normal_page(vma, addr, *pte);
 208                 if (!page)
 209                         continue;
 210                 nid = page_to_nid(page);
 211                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 212                         continue;
 213
 214                 if (flags & MPOL_MF_STATS)
 215                         gather_stats(page, private);
 216                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 217                         spin_unlock(ptl);
 218                         migrate_page_add(vma, page, private, flags);
 219                         spin_lock(ptl);
 220                 }
 221                 else
 222                         break;
 223         } while (pte++, addr += PAGE_SIZE, addr != end);
 224         pte_unmap_unlock(orig_pte, ptl);
 225         return addr != end;
 226 }
 227
 228 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 229                 unsigned long addr, unsigned long end,
 230                 const nodemask_t *nodes, unsigned long flags,
 231                 void *private)
 232 {
 233         pmd_t *pmd;
 234         unsigned long next;
 235
 236         pmd = pmd_offset(pud, addr);
 237         do {
 238                 next = pmd_addr_end(addr, end);
 239                 if (pmd_none_or_clear_bad(pmd))
 240                         continue;
 241                 if (check_pte_range(vma, pmd, addr, next, nodes,
 242                                     flags, private))
 243                         return -EIO;
 244         } while (pmd++, addr = next, addr != end);
 245         return 0;
 246 }
 247
 248 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 249                 unsigned long addr, unsigned long end,
 250                 const nodemask_t *nodes, unsigned long flags,
 251                 void *private)
 252 {
 253         pud_t *pud;
 254         unsigned long next;
 255
 256         pud = pud_offset(pgd, addr);
 257         do {
 258                 next = pud_addr_end(addr, end);
 259                 if (pud_none_or_clear_bad(pud))
 260                         continue;
 261                 if (check_pmd_range(vma, pud, addr, next, nodes,
 262                                     flags, private))
 263                         return -EIO;
 264         } while (pud++, addr = next, addr != end);
 265         return 0;
 266 }
 267
 268 static inline int check_pgd_range(struct vm_area_struct *vma,
 269                 unsigned long addr, unsigned long end,
 270                 const nodemask_t *nodes, unsigned long flags,
 271                 void *private)
 272 {
 273         pgd_t *pgd;
 274         unsigned long next;
 275
 276         pgd = pgd_offset(vma->vm_mm, addr);
 277         do {
 278                 next = pgd_addr_end(addr, end);
 279                 if (pgd_none_or_clear_bad(pgd))
 280                         continue;
 281                 if (check_pud_range(vma, pgd, addr, next, nodes,
 282                                     flags, private))
 283                         return -EIO;
 284         } while (pgd++, addr = next, addr != end);
 285         return 0;
 286 }
 287
 288 /* Check if a vma is migratable */
 289 static inline int vma_migratable(struct vm_area_struct *vma)
 290 {
 291         if (vma->vm_flags & (
 292                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
 293                 return 0;
 294         return 1;
 295 }
 296
 297 /*
 298  * Check if all pages in a range are on a set of nodes.
 299  * If pagelist != NULL then isolate pages from the LRU and
 300  * put them on the pagelist.
 301  */
 302 static struct vm_area_struct *
 303 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 304                 const nodemask_t *nodes, unsigned long flags, void *private)
 305 {
 306         int err;
 307         struct vm_area_struct *first, *vma, *prev;
 308
 309         first = find_vma(mm, start);
 310         if (!first)
 311                 return ERR_PTR(-EFAULT);
 312         prev = NULL;
 313         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 314                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 315                         if (!vma->vm_next && vma->vm_end < end)
 316                                 return ERR_PTR(-EFAULT);
 317                         if (prev && prev->vm_end < vma->vm_start)
 318                                 return ERR_PTR(-EFAULT);
 319                 }
 320                 if (!is_vm_hugetlb_page(vma) &&
 321                     ((flags & MPOL_MF_STRICT) ||
 322                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 323                                 vma_migratable(vma)))) {
 324                         unsigned long endvma = vma->vm_end;
 325
 326                         if (endvma > end)
 327                                 endvma = end;
 328                         if (vma->vm_start > start)
 329                                 start = vma->vm_start;
 330                         err = check_pgd_range(vma, start, endvma, nodes,
 331                                                 flags, private);
 332                         if (err) {
 333                                 first = ERR_PTR(err);
 334                                 break;
 335                         }
 336                 }
 337                 prev = vma;
 338         }
 339         return first;
 340 }
 341
 342 /* Apply policy to a single VMA */
 343 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 344 {
 345         int err = 0;
 346         struct mempolicy *old = vma->vm_policy;
 347
 348         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 349                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 350                  vma->vm_ops, vma->vm_file,
 351                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 352
 353         if (vma->vm_ops && vma->vm_ops->set_policy)
 354                 err = vma->vm_ops->set_policy(vma, new);
 355         if (!err) {
 356                 mpol_get(new);
 357                 vma->vm_policy = new;
 358                 mpol_free(old);
 359         }
 360         return err;
 361 }
 362
 363 /* Step 2: apply policy to a range and do splits. */
 364 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 365                        unsigned long end, struct mempolicy *new)
 366 {
 367         struct vm_area_struct *next;
 368         int err;
 369
 370         err = 0;
 371         for (; vma && vma->vm_start < end; vma = next) {
 372                 next = vma->vm_next;
 373                 if (vma->vm_start < start)
 374                         err = split_vma(vma->vm_mm, vma, start, 1);
 375                 if (!err && vma->vm_end > end)
 376                         err = split_vma(vma->vm_mm, vma, end, 0);
 377                 if (!err)
 378                         err = policy_vma(vma, new);
 379                 if (err)
 380                         break;
 381         }
 382         return err;
 383 }
 384
 385 static int contextualize_policy(int mode, nodemask_t *nodes)
 386 {
 387         if (!nodes)
 388                 return 0;
 389
 390         cpuset_update_task_memory_state();
 391         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 392                 return -EINVAL;
 393         return mpol_check_policy(mode, nodes);
 394 }
 395
 396 /* Set the process memory policy */
 397 long do_set_mempolicy(int mode, nodemask_t *nodes)
 398 {
 399         struct mempolicy *new;
 400
 401         if (contextualize_policy(mode, nodes))
 402                 return -EINVAL;
 403         new = mpol_new(mode, nodes);
 404         if (IS_ERR(new))
 405                 return PTR_ERR(new);
 406         mpol_free(current->mempolicy);
 407         current->mempolicy = new;
 408         if (new && new->policy == MPOL_INTERLEAVE)
 409                 current->il_next = first_node(new->v.nodes);
 410         return 0;
 411 }
 412
 413 /* Fill a zone bitmap for a policy */
 414 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 415 {
 416         int i;
 417
 418         nodes_clear(*nodes);
 419         switch (p->policy) {
 420         case MPOL_BIND:
 421                 for (i = 0; p->v.zonelist->zones[i]; i++)
 422                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 423                                 *nodes);
 424                 break;
 425         case MPOL_DEFAULT:
 426                 break;
 427         case MPOL_INTERLEAVE:
 428                 *nodes = p->v.nodes;
 429                 break;
 430         case MPOL_PREFERRED:
 431                 /* or use current node instead of online map? */
 432                 if (p->v.preferred_node < 0)
 433                         *nodes = node_online_map;
 434                 else
 435                         node_set(p->v.preferred_node, *nodes);
 436                 break;
 437         default:
 438                 BUG();
 439         }
 440 }
 441
 442 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 443 {
 444         struct page *p;
 445         int err;
 446
 447         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 448         if (err >= 0) {
 449                 err = page_to_nid(p);
 450                 put_page(p);
 451         }
 452         return err;
 453 }
 454
 455 /* Retrieve NUMA policy */
 456 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 457                         unsigned long addr, unsigned long flags)
 458 {
 459         int err;
 460         struct mm_struct *mm = current->mm;
 461         struct vm_area_struct *vma = NULL;
 462         struct mempolicy *pol = current->mempolicy;
 463
 464         cpuset_update_task_memory_state();
 465         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 466                 return -EINVAL;
 467         if (flags & MPOL_F_ADDR) {
 468                 down_read(&mm->mmap_sem);
 469                 vma = find_vma_intersection(mm, addr, addr+1);
 470                 if (!vma) {
 471                         up_read(&mm->mmap_sem);
 472                         return -EFAULT;
 473                 }
 474                 if (vma->vm_ops && vma->vm_ops->get_policy)
 475                         pol = vma->vm_ops->get_policy(vma, addr);
 476                 else
 477                         pol = vma->vm_policy;
 478         } else if (addr)
 479                 return -EINVAL;
 480
 481         if (!pol)
 482                 pol = &default_policy;
 483
 484         if (flags & MPOL_F_NODE) {
 485                 if (flags & MPOL_F_ADDR) {
 486                         err = lookup_node(mm, addr);
 487                         if (err < 0)
 488                                 goto out;
 489                         *policy = err;
 490                 } else if (pol == current->mempolicy &&
 491                                 pol->policy == MPOL_INTERLEAVE) {
 492                         *policy = current->il_next;
 493                 } else {
 494                         err = -EINVAL;
 495                         goto out;
 496                 }
 497         } else
 498                 *policy = pol->policy;
 499
 500         if (vma) {
 501                 up_read(&current->mm->mmap_sem);
 502                 vma = NULL;
 503         }
 504
 505         err = 0;
 506         if (nmask)
 507                 get_zonemask(pol, nmask);
 508
 509  out:
 510         if (vma)
 511                 up_read(&current->mm->mmap_sem);
 512         return err;
 513 }
 514
 515 /*
 516  * page migration
 517  */
 518
 519 /* Check if we are the only process mapping the page in question */
 520 static inline int single_mm_mapping(struct mm_struct *mm,
 521                         struct address_space *mapping)
 522 {
 523         struct vm_area_struct *vma;
 524         struct prio_tree_iter iter;
 525         int rc = 1;
 526
 527         spin_lock(&mapping->i_mmap_lock);
 528         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
 529                 if (mm != vma->vm_mm) {
 530                         rc = 0;
 531                         goto out;
 532                 }
 533         list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 534                 if (mm != vma->vm_mm) {
 535                         rc = 0;
 536                         goto out;
 537                 }
 538 out:
 539         spin_unlock(&mapping->i_mmap_lock);
 540         return rc;
 541 }
 542
 543 /*
 544  * Add a page to be migrated to the pagelist
 545  */
 546 static void migrate_page_add(struct vm_area_struct *vma,
 547         struct page *page, struct list_head *pagelist, unsigned long flags)
 548 {
 549         /*
 550          * Avoid migrating a page that is shared by others and not writable.
 551          */
 552         if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 553             mapping_writably_mapped(page->mapping) ||
 554             single_mm_mapping(vma->vm_mm, page->mapping)) {
 555                 int rc = isolate_lru_page(page);
 556
 557                 if (rc == 1)
 558                         list_add(&page->lru, pagelist);
 559                 /*
 560                  * If the isolate attempt was not successful then we just
 561                  * encountered an unswappable page. Something must be wrong.
 562                  */
 563                 WARN_ON(rc == 0);
 564         }
 565 }
 566
 567 static int swap_pages(struct list_head *pagelist)
 568 {
 569         LIST_HEAD(moved);
 570         LIST_HEAD(failed);
 571         int n;
 572
 573         n = migrate_pages(pagelist, NULL, &moved, &failed);
 574         putback_lru_pages(&failed);
 575         putback_lru_pages(&moved);
 576
 577         return n;
 578 }
 579
 580 /*
 581  * For now migrate_pages simply swaps out the pages from nodes that are in
 582  * the source set but not in the target set. In the future, we would
 583  * want a function that moves pages between the two nodesets in such
 584  * a way as to preserve the physical layout as much as possible.
 585  *
 586  * Returns the number of page that could not be moved.
 587  */
 588 int do_migrate_pages(struct mm_struct *mm,
 589         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 590 {
 591         LIST_HEAD(pagelist);
 592         int count = 0;
 593         nodemask_t nodes;
 594
 595         nodes_andnot(nodes, *from_nodes, *to_nodes);
 596
 597         down_read(&mm->mmap_sem);
 598         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 599                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 600
 601         if (!list_empty(&pagelist)) {
 602                 count = swap_pages(&pagelist);
 603                 putback_lru_pages(&pagelist);
 604         }
 605
 606         up_read(&mm->mmap_sem);
 607         return count;
 608 }
 609
 610 long do_mbind(unsigned long start, unsigned long len,
 611                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 612 {
 613         struct vm_area_struct *vma;
 614         struct mm_struct *mm = current->mm;
 615         struct mempolicy *new;
 616         unsigned long end;
 617         int err;
 618         LIST_HEAD(pagelist);
 619
 620         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 621                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 622             || mode > MPOL_MAX)
 623                 return -EINVAL;
 624         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 625                 return -EPERM;
 626
 627         if (start & ~PAGE_MASK)
 628                 return -EINVAL;
 629
 630         if (mode == MPOL_DEFAULT)
 631                 flags &= ~MPOL_MF_STRICT;
 632
 633         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 634         end = start + len;
 635
 636         if (end < start)
 637                 return -EINVAL;
 638         if (end == start)
 639                 return 0;
 640
 641         if (mpol_check_policy(mode, nmask))
 642                 return -EINVAL;
 643
 644         new = mpol_new(mode, nmask);
 645         if (IS_ERR(new))
 646                 return PTR_ERR(new);
 647
 648         /*
 649          * If we are using the default policy then operation
 650          * on discontinuous address spaces is okay after all
 651          */
 652         if (!new)
 653                 flags |= MPOL_MF_DISCONTIG_OK;
 654
 655         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 656                         mode,nodes_addr(nodes)[0]);
 657
 658         down_write(&mm->mmap_sem);
 659         vma = check_range(mm, start, end, nmask,
 660                           flags | MPOL_MF_INVERT, &pagelist);
 661
 662         err = PTR_ERR(vma);
 663         if (!IS_ERR(vma)) {
 664                 int nr_failed = 0;
 665
 666                 err = mbind_range(vma, start, end, new);
 667                 if (!list_empty(&pagelist))
 668                         nr_failed = swap_pages(&pagelist);
 669
 670                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 671                         err = -EIO;
 672         }
 673         if (!list_empty(&pagelist))
 674                 putback_lru_pages(&pagelist);
 675
 676         up_write(&mm->mmap_sem);
 677         mpol_free(new);
 678         return err;
 679 }
 680
 681 /*
 682  * User space interface with variable sized bitmaps for nodelists.
 683  */
 684
 685 /* Copy a node mask from user space. */
 686 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 687                      unsigned long maxnode)
 688 {
 689         unsigned long k;
 690         unsigned long nlongs;
 691         unsigned long endmask;
 692
 693         --maxnode;
 694         nodes_clear(*nodes);
 695         if (maxnode == 0 || !nmask)
 696                 return 0;
 697
 698         nlongs = BITS_TO_LONGS(maxnode);
 699         if ((maxnode % BITS_PER_LONG) == 0)
 700                 endmask = ~0UL;
 701         else
 702                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 703
 704         /* When the user specified more nodes than supported just check
 705            if the non supported part is all zero. */
 706         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 707                 if (nlongs > PAGE_SIZE/sizeof(long))
 708                         return -EINVAL;
 709                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 710                         unsigned long t;
 711                         if (get_user(t, nmask + k))
 712                                 return -EFAULT;
 713                         if (k == nlongs - 1) {
 714                                 if (t & endmask)
 715                                         return -EINVAL;
 716                         } else if (t)
 717                                 return -EINVAL;
 718                 }
 719                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 720                 endmask = ~0UL;
 721         }
 722
 723         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 724                 return -EFAULT;
 725         nodes_addr(*nodes)[nlongs-1] &= endmask;
 726         return 0;
 727 }
 728
 729 /* Copy a kernel node mask to user space */
 730 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 731                               nodemask_t *nodes)
 732 {
 733         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 734         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 735
 736         if (copy > nbytes) {
 737                 if (copy > PAGE_SIZE)
 738                         return -EINVAL;
 739                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 740                         return -EFAULT;
 741                 copy = nbytes;
 742         }
 743         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 744 }
 745
 746 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 747                         unsigned long mode,
 748                         unsigned long __user *nmask, unsigned long maxnode,
 749                         unsigned flags)
 750 {
 751         nodemask_t nodes;
 752         int err;
 753
 754         err = get_nodes(&nodes, nmask, maxnode);
 755         if (err)
 756                 return err;
 757         return do_mbind(start, len, mode, &nodes, flags);
 758 }
 759
 760 /* Set the process memory policy */
 761 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 762                 unsigned long maxnode)
 763 {
 764         int err;
 765         nodemask_t nodes;
 766
 767         if (mode < 0 || mode > MPOL_MAX)
 768                 return -EINVAL;
 769         err = get_nodes(&nodes, nmask, maxnode);
 770         if (err)
 771                 return err;
 772         return do_set_mempolicy(mode, &nodes);
 773 }
 774
 775 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 776                 const unsigned long __user *old_nodes,
 777                 const unsigned long __user *new_nodes)
 778 {
 779         struct mm_struct *mm;
 780         struct task_struct *task;
 781         nodemask_t old;
 782         nodemask_t new;
 783         nodemask_t task_nodes;
 784         int err;
 785
 786         err = get_nodes(&old, old_nodes, maxnode);
 787         if (err)
 788                 return err;
 789
 790         err = get_nodes(&new, new_nodes, maxnode);
 791         if (err)
 792                 return err;
 793
 794         /* Find the mm_struct */
 795         read_lock(&tasklist_lock);
 796         task = pid ? find_task_by_pid(pid) : current;
 797         if (!task) {
 798                 read_unlock(&tasklist_lock);
 799                 return -ESRCH;
 800         }
 801         mm = get_task_mm(task);
 802         read_unlock(&tasklist_lock);
 803
 804         if (!mm)
 805                 return -EINVAL;
 806
 807         /*
 808          * Check if this process has the right to modify the specified
 809          * process. The right exists if the process has administrative
 810          * capabilities, superuser priviledges or the same
 811          * userid as the target process.
 812          */
 813         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 814             (current->uid != task->suid) && (current->uid != task->uid) &&
 815             !capable(CAP_SYS_ADMIN)) {
 816                 err = -EPERM;
 817                 goto out;
 818         }
 819
 820         task_nodes = cpuset_mems_allowed(task);
 821         /* Is the user allowed to access the target nodes? */
 822         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 823                 err = -EPERM;
 824                 goto out;
 825         }
 826
 827         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 828 out:
 829         mmput(mm);
 830         return err;
 831 }
 832
 833
 834 /*
      * Retrieve NUMA policy.
      *
      * @policy:  optional user pointer that receives the policy mode
      * @nmask:   optional user buffer that receives the node mask
      * @maxnode: size of @nmask as supplied by the caller; when @nmask is
      *           non-NULL a value below MAX_NUMNODES is rejected
      * @addr, @flags: handed unchanged to do_get_mempolicy()
      *
      * Returns -EINVAL for a too-small @maxnode, the error reported by
      * do_get_mempolicy(), -EFAULT when the mode cannot be stored through
      * @policy, and otherwise the result of copy_nodes_to_user() (0 when
      * @nmask is NULL).
      */
 835 asmlinkage long sys_get_mempolicy(int __user *policy,
 836                                 unsigned long __user *nmask,
 837                                 unsigned long maxnode,
 838                                 unsigned long addr, unsigned long flags)
 839 {
 840         int err, pval;
 841         nodemask_t nodes;
 842
             /* Only validate maxnode when the caller actually wants the mask. */
 843         if (nmask != NULL && maxnode < MAX_NUMNODES)
 844                 return -EINVAL;
 845
 846         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 847
 848         if (err)
 849                 return err;
 850
             /* Both output pointers are optional; skip whichever is NULL. */
 851         if (policy && put_user(pval, policy))
 852                 return -EFAULT;
 853
 854         if (nmask)
 855                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 856
 857         return err;
 858 }
 859
 860 #ifdef CONFIG_COMPAT
 861
     /*
      * Compat (CONFIG_COMPAT) wrapper around sys_get_mempolicy().
      *
      * The native syscall writes an array of unsigned long, so the mask is
      * first collected in a scratch area obtained from
      * compat_alloc_user_space(), read back into a kernel bitmap, and then
      * stored to the caller's compat_ulong_t buffer with compat_put_bitmap().
      *
      * @policy:  optional user pointer receiving the policy mode
      * @nmask:   optional compat user buffer receiving the node mask
      * @maxnode: number of bits the caller provided, plus one
      * @addr, @flags: forwarded to sys_get_mempolicy()
      *
      * Returns the error of sys_get_mempolicy(), or -EFAULT when any of the
      * user copies performed here fails, or 0 on success.
      */
 862 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 863                                      compat_ulong_t __user *nmask,
 864                                      compat_ulong_t maxnode,
 865                                      compat_ulong_t addr, compat_ulong_t flags)
 866 {
 867         long err;
 868         unsigned long __user *nm = NULL;
 869         unsigned long nr_bits, alloc_size;
 870         DECLARE_BITMAP(bm, MAX_NUMNODES);
 871
             /* Clamp to the kernel's mask width; size the scratch area in bytes. */
 872         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 873         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 874
 875         if (nmask)
 876                 nm = compat_alloc_user_space(alloc_size);
 877
 878         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 879
 880         if (!err && nmask) {
 881                 err = copy_from_user(bm, nm, alloc_size);
 882                 /* ensure entire bitmap is zeroed */
 883                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 884                 err |= compat_put_bitmap(nmask, bm, nr_bits);
                     /*
                      * copy_from_user() and clear_user() report a count of
                      * bytes left over, which was being OR-ed with a negative
                      * errno.  Collapse any failure to -EFAULT so that user
                      * space never sees a byte count or a mangled errno.
                      */
                     if (err)
                             err = -EFAULT;
 885         }
 886
 887         return err;
 888 }
 889
     /*
      * Compat (CONFIG_COMPAT) wrapper around sys_set_mempolicy().
      *
      * Reads the caller's compat_ulong_t node mask into a kernel bitmap,
      * stages it in a scratch area from compat_alloc_user_space() in the
      * unsigned long layout, and passes that area to the native syscall.
      *
      * Returns -EFAULT if either copy fails, otherwise whatever
      * sys_set_mempolicy() returns.  With a NULL @nmask the native call is
      * made with a NULL mask pointer.
      */
 890 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 891                                      compat_ulong_t maxnode)
 892 {
 893         long err = 0;
 894         unsigned long __user *nm = NULL;
 895         unsigned long nr_bits, alloc_size;
 896         DECLARE_BITMAP(bm, MAX_NUMNODES);
 897
             /* Clamp to the kernel's mask width; size the scratch area in bytes. */
 898         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 899         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 900
 901         if (nmask) {
 902                 err = compat_get_bitmap(bm, nmask, nr_bits);
 903                 nm = compat_alloc_user_space(alloc_size);
 904                 err |= copy_to_user(nm, bm, alloc_size);
 905         }
 906
             /* err may mix an errno with a byte count; only zero/non-zero matters. */
 907         if (err)
 908                 return -EFAULT;
 909
 910         return sys_set_mempolicy(mode, nm, nr_bits+1);
 911 }
 912
     /*
      * Compat (CONFIG_COMPAT) wrapper around sys_mbind().
      *
      * Same conversion scheme as compat_sys_set_mempolicy(): the compat node
      * mask is read into a kernel nodemask, re-written to a scratch area
      * from compat_alloc_user_space(), and that area is handed to sys_mbind().
      *
      * Returns -EFAULT if either copy fails, otherwise whatever sys_mbind()
      * returns.
      *
      * NOTE(review): bm is not cleared before compat_get_bitmap() and
      * alloc_size bytes of it are copied out; presumably the helper fills
      * every word that is copied - confirm.
      */
 913 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 914                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 915                              compat_ulong_t maxnode, compat_ulong_t flags)
 916 {
 917         long err = 0;
 918         unsigned long __user *nm = NULL;
 919         unsigned long nr_bits, alloc_size;
 920         nodemask_t bm;
 921
             /* Clamp to the kernel's mask width; size the scratch area in bytes. */
 922         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 923         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 924
 925         if (nmask) {
 926                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 927                 nm = compat_alloc_user_space(alloc_size);
 928                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 929         }
 930
             /* err may mix an errno with a byte count; only zero/non-zero matters. */
 931         if (err)
 932                 return -EFAULT;
 933
 934         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 935 }
 936
 937 #endif
 938
 939 /*
      * Return effective policy for a VMA.
      *
      * Starts from the task's own policy and lets the VMA override it:
      * a vm_ops->get_policy() hook wins when present, otherwise a
      * vma->vm_policy whose mode is not MPOL_DEFAULT is used.  If the
      * result is NULL (including a NULL return from the hook) the global
      * default_policy is returned, so callers never see NULL.
      */
 940 static struct mempolicy * get_vma_policy(struct task_struct *task,
 941                 struct vm_area_struct *vma, unsigned long addr)
 942 {
 943         struct mempolicy *pol = task->mempolicy;
 944
 945         if (vma) {
 946                 if (vma->vm_ops && vma->vm_ops->get_policy)
 947                         pol = vma->vm_ops->get_policy(vma, addr);
 948                 else if (vma->vm_policy &&
 949                                 vma->vm_policy->policy != MPOL_DEFAULT)
 950                         pol = vma->vm_policy;
 951         }
 952         if (!pol)
 953                 pol = &default_policy;
 954         return pol;
 955 }
 956
 957 /*
      * Return a zonelist representing a mempolicy.
      *
      * MPOL_PREFERRED uses the zonelist of the preferred node, or of the
      * local node when the stored node is negative.  MPOL_BIND returns the
      * policy's private zonelist when the requested zone is at or above
      * policy_zone and the cpuset check passes; otherwise it behaves like
      * MPOL_DEFAULT and uses the local node.  Any other mode is a BUG().
      */
 958 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 959 {
 960         int nd;
 961
 962         switch (policy->policy) {
 963         case MPOL_PREFERRED:
 964                 nd = policy->v.preferred_node;
                     /* A negative preferred node stands for "the local node". */
 965                 if (nd < 0)
 966                         nd = numa_node_id();
 967                 break;
 968         case MPOL_BIND:
 969                 /* Lower zones don't get a policy applied */
 970                 /* Careful: current->mems_allowed might have moved */
 971                 if (gfp_zone(gfp) >= policy_zone)
 972                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 973                                 return policy->v.zonelist;
 974                 /*FALL THROUGH*/
 975         case MPOL_INTERLEAVE: /* should not happen */
 976         case MPOL_DEFAULT:
 977                 nd = numa_node_id();
 978                 break;
 979         default:
 980                 nd = 0;
 981                 BUG();
 982         }
             /* Pick the per-node zonelist that matches the zone type of gfp. */
 983         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 984 }
 985
 986 /*
      * Do dynamic interleaving for a process.
      *
      * Returns the node currently stored in current->il_next and advances
      * il_next to the following node of policy->v.nodes, wrapping around to
      * the first node of the mask once next_node() runs past the end.
      */
 987 static unsigned interleave_nodes(struct mempolicy *policy)
 988 {
 989         unsigned nid, next;
 990         struct task_struct *me = current;
 991
 992         nid = me->il_next;
 993         next = next_node(nid, policy->v.nodes);
             /* No further node in the mask: start over from the first one. */
 994         if (next >= MAX_NUMNODES)
 995                 next = first_node(policy->v.nodes);
 996         me->il_next = next;
 997         return nid;
 998 }
 999
1000 /*
      * Do static interleaving for a VMA with known offset.
      *
      * Maps @off onto the nodes of pol->v.nodes: the offset modulo the
      * number of nodes in the mask selects the index, and the loop walks
      * the mask with next_node() until that many nodes have been visited.
      * @vma is not used here.
      *
      * NOTE(review): @off is truncated to unsigned before the modulo, and an
      * empty node mask would make nnodes zero (division by zero) - callers
      * presumably guarantee a non-empty mask; confirm.
      */
1001 static unsigned offset_il_node(struct mempolicy *pol,
1002                 struct vm_area_struct *vma, unsigned long off)
1003 {
1004         unsigned nnodes = nodes_weight(pol->v.nodes);
1005         unsigned target = (unsigned)off % nnodes;
1006         int c;
1007         int nid = -1;
1008
             /* Starting from -1 makes the first next_node() yield the first node. */
1009         c = 0;
1010         do {
1011                 nid = next_node(nid, pol->v.nodes);
1012                 c++;
1013         } while (c <= target);
1014         return nid;
1015 }
1016
1017 /*
      * Determine a node number for interleave.
      *
      * With a VMA the node is derived from a stable offset: vma->vm_pgoff
      * plus the distance of @addr from the start of the VMA in units of
      * (1 << @shift) bytes.  Without a VMA the per-task rotating counter of
      * interleave_nodes() is used instead.
      */
1018 static inline unsigned interleave_nid(struct mempolicy *pol,
1019                  struct vm_area_struct *vma, unsigned long addr, int shift)
1020 {
1021         if (vma) {
1022                 unsigned long off;
1023
1024                 off = vma->vm_pgoff;
1025                 off += (addr - vma->vm_start) >> shift;
1026                 return offset_il_node(pol, vma, off);
1027         } else
1028                 return interleave_nodes(pol);
1029 }
1030
1031 /*
      * Return a zonelist suitable for a huge page allocation.
      *
      * Looks up the effective policy for @vma/@addr.  For MPOL_INTERLEAVE
      * the node is chosen with HPAGE_SHIFT granularity and that node's
      * GFP_HIGHUSER zonelist is returned; every other mode is resolved by
      * zonelist_policy().
      */
1032 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1033 {
1034         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1035
1036         if (pol->policy == MPOL_INTERLEAVE) {
1037                 unsigned nid;
1038
1039                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1040                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1041         }
1042         return zonelist_policy(GFP_HIGHUSER, pol);
1043 }
1044
1045 /* Allocate a page in interleaved policy.
1046    Own path because it needs to do special accounting.

        Allocates from the zonelist of node @nid for the zone type of @gfp.
        When the page came from the first zone of that zonelist, the
        interleave_hit counter of that zone is incremented for the CPU
        returned by get_cpu().  Returns the page, or NULL on failure. */
1047 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1048                                         unsigned nid)
1049 {
1050         struct zonelist *zl;
1051         struct page *page;
1052
1053         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1054         page = __alloc_pages(gfp, order, zl);
             /* Count a hit only if the allocation did not fall back to another zone. */
1055         if (page && page_zone(page) == zl->zones[0]) {
1056                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1057                 put_cpu();
1058         }
1059         return page;
1060 }
1061
1062 /**
1063  *      alloc_page_vma  - Allocate a page for a VMA.
1064  *
1065  *      @gfp:
1066  *      %GFP_USER    user allocation.
1067  *      %GFP_KERNEL  kernel allocations,
1068  *      %GFP_HIGHMEM highmem/user allocations,
1069  *      %GFP_FS      allocation should not call back into a file system.
1070  *      %GFP_ATOMIC  don't sleep.
1071  *
1072  *      @vma:  Pointer to VMA or NULL if not available.
1073  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1074  *
1075  *      This function allocates a page from the kernel page pool and applies
1076  *      a NUMA policy associated with the VMA or the current process.
1077  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1078  *      mm_struct of the VMA to prevent it from going away. Should be used for
1079  *      all allocations for pages that will be mapped into
1080  *      user space. Returns NULL when no page can be allocated.
1081  *
1082  *      Should be called with the mmap_sem of the vma held.
1083  */
1084 struct page *
1085 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1086 {
1087         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1088
1089         cpuset_update_task_memory_state();
1090
             /* Interleave has its own allocation path because of its hit accounting. */
1091         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1092                 unsigned nid;
1093
1094                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1095                 return alloc_page_interleave(gfp, 0, nid);
1096         }
1097         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1098 }
1099
1100 /**
1101  *      alloc_pages_current - Allocate pages.
1102  *
1103  *      @gfp:
1104  *              %GFP_USER   user allocation,
1105  *              %GFP_KERNEL kernel allocation,
1106  *              %GFP_HIGHMEM highmem allocation,
1107  *              %GFP_FS     don't call back into a file system.
1108  *              %GFP_ATOMIC don't sleep.
1109  *      @order: Power of two of allocation size in pages. 0 is a single page.
1110  *
1111  *      Allocate a page from the kernel page pool.  When not in
1112  *      interrupt context, apply the current process NUMA policy.
1113  *      Returns NULL when no page can be allocated.
1114  *
1115  *      Don't call cpuset_update_task_memory_state() unless
1116  *      1) it's ok to take cpuset_sem (can WAIT), and
1117  *      2) allocating for current task (not interrupt).
1118  */
1119 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1120 {
1121         struct mempolicy *pol = current->mempolicy;
1122
1123         if ((gfp & __GFP_WAIT) && !in_interrupt())
1124                 cpuset_update_task_memory_state();
             /* No task policy, or interrupt context: use the global default. */
1125         if (!pol || in_interrupt())
1126                 pol = &default_policy;
1127         if (pol->policy == MPOL_INTERLEAVE)
1128                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1129         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1130 }
1131 EXPORT_SYMBOL(alloc_pages_current);
1132
1133 /*
      * Slow path of a mempolicy copy.
      *
      * Allocates a new mempolicy from policy_cache, copies *old into it and
      * resets the reference count to 1.  For MPOL_BIND the zonelist is
      * duplicated as well so the copy does not share it with @old.
      *
      * Returns the new policy, or ERR_PTR(-ENOMEM) when either allocation
      * fails (nothing is leaked in that case).
      */
1134 struct mempolicy *__mpol_copy(struct mempolicy *old)
1135 {
1136         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1137
1138         if (!new)
1139                 return ERR_PTR(-ENOMEM);
1140         *new = *old;
1141         atomic_set(&new->refcnt, 1);
1142         if (new->policy == MPOL_BIND) {
                     /* Size the copy after the existing allocation. */
1143                 int sz = ksize(old->v.zonelist);
1144                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1145                 if (!new->v.zonelist) {
1146                         kmem_cache_free(policy_cache, new);
1147                         return ERR_PTR(-ENOMEM);
1148                 }
1149                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1150         }
1151         return new;
1152 }
1153
1154 /*
      * Slow path of a mempolicy comparison.
      *
      * Returns 1 when @a and @b describe the same policy and 0 otherwise.
      * A NULL argument never compares equal.  The mode must match; beyond
      * that MPOL_INTERLEAVE compares node masks, MPOL_PREFERRED compares the
      * preferred node and MPOL_BIND compares the NULL-terminated zone
      * arrays element by element.  An unknown mode is a BUG().
      */
1155 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1156 {
1157         if (!a || !b)
1158                 return 0;
1159         if (a->policy != b->policy)
1160                 return 0;
1161         switch (a->policy) {
1162         case MPOL_DEFAULT:
1163                 return 1;
1164         case MPOL_INTERLEAVE:
1165                 return nodes_equal(a->v.nodes, b->v.nodes);
1166         case MPOL_PREFERRED:
1167                 return a->v.preferred_node == b->v.preferred_node;
1168         case MPOL_BIND: {
1169                 int i;
1170                 for (i = 0; a->v.zonelist->zones[i]; i++)
1171                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1172                                 return 0;
                     /* a ended here; equal only if b ends at the same index. */
1173                 return b->v.zonelist->zones[i] == NULL;
1174         }
1175         default:
1176                 BUG();
1177                 return 0;
1178         }
1179 }
1180
1181 /*
      * Slow path of a mpol destructor.
      *
      * Drops one reference on @p.  When that was the last one, the zonelist
      * of an MPOL_BIND policy is freed and the policy object is returned to
      * policy_cache.
      */
1182 void __mpol_free(struct mempolicy *p)
1183 {
1184         if (!atomic_dec_and_test(&p->refcnt))
1185                 return;
1186         if (p->policy == MPOL_BIND)
1187                 kfree(p->v.zonelist);
             /* Reset the mode before the object goes back to the cache. */
1188         p->policy = MPOL_DEFAULT;
1189         kmem_cache_free(policy_cache, p);
1190 }
1191
1192 /*
1193  * Shared memory backing store policy support.
1194  *
1195  * Remember policies even when nobody has shared memory mapped.
1196  * The policies are kept in a Red-Black tree linked from the inode.
1197  * They are protected by the sp->lock spinlock, which should be held
1198  * for any accesses to the tree.
1199  */
1200
1201 /* lookup first element intersecting start-end */
1202 /* Caller holds sp->lock */
     /*
      * Descends the tree until a node overlapping [start, end) is found,
      * then steps backwards with rb_prev() while the predecessor still
      * reaches past @start, so that the lowest overlapping node is returned.
      * Returns NULL when nothing overlaps the range.
      */
1203 static struct sp_node *
1204 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1205 {
1206         struct rb_node *n = sp->root.rb_node;
1207
1208         while (n) {
1209                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1210
1211                 if (start >= p->end)
1212                         n = n->rb_right;
1213                 else if (end <= p->start)
1214                         n = n->rb_left;
1215                 else
1216                         break;
1217         }
1218         if (!n)
1219                 return NULL;
             /* Walk left to the first node that still intersects the range. */
1220         for (;;) {
1221                 struct sp_node *w = NULL;
1222                 struct rb_node *prev = rb_prev(n);
1223                 if (!prev)
1224                         break;
1225                 w = rb_entry(prev, struct sp_node, nd);
1226                 if (w->end <= start)
1227                         break;
1228                 n = prev;
1229         }
1230         return rb_entry(n, struct sp_node, nd);
1231 }
1232
1233 /* Insert a new shared policy into the list. */
1234 /* Caller holds sp->lock */
     /*
      * Finds the leaf position for @new by comparing range boundaries and
      * links it into the red-black tree.  A node that neither starts before
      * nor ends after an existing node (i.e. lies within it) triggers BUG().
      */
1235 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1236 {
1237         struct rb_node **p = &sp->root.rb_node;
1238         struct rb_node *parent = NULL;
1239         struct sp_node *nd;
1240
1241         while (*p) {
1242                 parent = *p;
1243                 nd = rb_entry(parent, struct sp_node, nd);
1244                 if (new->start < nd->start)
1245                         p = &(*p)->rb_left;
1246                 else if (new->end > nd->end)
1247                         p = &(*p)->rb_right;
1248                 else
1249                         BUG();
1250         }
1251         rb_link_node(&new->nd, parent, p);
1252         rb_insert_color(&new->nd, &sp->root);
1253         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1254                  new->policy ? new->policy->policy : 0);
1255 }
1256
1257 /*
      * Find shared policy intersecting idx.
      *
      * Returns the policy of the tree node covering @idx with an extra
      * reference taken via mpol_get() while sp->lock is held, or NULL when
      * the tree is empty or no node covers @idx.  The empty-tree check is
      * done before the lock is taken.
      */
1258 struct mempolicy *
1259 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1260 {
1261         struct mempolicy *pol = NULL;
1262         struct sp_node *sn;
1263
1264         if (!sp->root.rb_node)
1265                 return NULL;
1266         spin_lock(&sp->lock);
1267         sn = sp_lookup(sp, idx, idx+1);
1268         if (sn) {
1269                 mpol_get(sn->policy);
1270                 pol = sn->policy;
1271         }
1272         spin_unlock(&sp->lock);
1273         return pol;
1274 }
1275
     /*
      * Remove node @n from the shared policy tree of @sp, drop the policy
      * reference it holds and return the node to sn_cache.
      * The in-file callers invoke this with sp->lock held.
      */
1276 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1277 {
             /* Both bounds are unsigned long: print them with %lx, as sp_insert() does. */
1278         PDprintk("deleting %lx-%lx\n", n->start, n->end);
1279         rb_erase(&n->nd, &sp->root);
1280         mpol_free(n->policy);
1281         kmem_cache_free(sn_cache, n);
1282 }
1283
     /*
      * Allocate a shared policy node covering [start, end) that refers to
      * @pol.  A reference on @pol is taken with mpol_get() on behalf of the
      * node.  Returns NULL when the allocation from sn_cache fails; the
      * node is not linked into any tree here.
      */
1284 struct sp_node *
1285 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1286 {
1287         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1288
1289         if (!n)
1290                 return NULL;
1291         n->start = start;
1292         n->end = end;
1293         mpol_get(pol);
1294         n->policy = pol;
1295         return n;
1296 }
1297
1298 /*
      * Replace a policy range.
      *
      * Removes or trims every existing node overlapping [start, end) and
      * then inserts @new (when non-NULL) for that range.  An old node that
      * extends beyond both ends of the range has to be split in two, which
      * needs a second node (new2); it is allocated with the lock dropped
      * and the whole scan is then restarted.
      *
      * Returns 0 on success or -ENOMEM when the split node cannot be
      * allocated; in the error case @new has not been inserted and remains
      * owned by the caller.
      *
      * NOTE(review): n->end and n->policy are read for sp_alloc() after
      * sp->lock has been released, so n is no longer protected by the lock
      * at that point - verify that nothing can free or change it meanwhile.
      */
1299 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1300                                  unsigned long end, struct sp_node *new)
1301 {
1302         struct sp_node *n, *new2 = NULL;
1303
1304 restart:
1305         spin_lock(&sp->lock);
1306         n = sp_lookup(sp, start, end);
1307         /* Take care of old policies in the same range. */
1308         while (n && n->start < end) {
                     /* Fetch the successor first: n may be deleted below. */
1309                 struct rb_node *next = rb_next(&n->nd);
1310                 if (n->start >= start) {
                             /* Old node starts inside the range: drop it or trim its head. */
1311                         if (n->end <= end)
1312                                 sp_delete(sp, n);
1313                         else
1314                                 n->start = end;
1315                 } else {
1316                         /* Old policy spanning whole new range. */
1317                         if (n->end > end) {
1318                                 if (!new2) {
1319                                         spin_unlock(&sp->lock);
1320                                         new2 = sp_alloc(end, n->end, n->policy);
1321                                         if (!new2)
1322                                                 return -ENOMEM;
1323                                         goto restart;
1324                                 }
                                     /* Split: n keeps the head, new2 takes the tail. */
1325                                 n->end = start;
1326                                 sp_insert(sp, new2);
1327                                 new2 = NULL;
1328                                 break;
1329                         } else
1330                                 n->end = start;
1331                 }
1332                 if (!next)
1333                         break;
1334                 n = rb_entry(next, struct sp_node, nd);
1335         }
1336         if (new)
1337                 sp_insert(sp, new);
1338         spin_unlock(&sp->lock);
             /* new2 was allocated but not needed after the restart: release it. */
1339         if (new2) {
1340                 mpol_free(new2->policy);
1341                 kmem_cache_free(sn_cache, new2);
1342         }
1343         return 0;
1344 }
1345
     /*
      * Install @npol as the shared policy for the page-offset range covered
      * by @vma, i.e. [vm_pgoff, vm_pgoff + vma_pages(vma)).  A NULL @npol
      * only removes whatever policies currently overlap that range.
      *
      * Returns 0 on success or -ENOMEM when a tree node cannot be allocated.
      */
1346 int mpol_set_shared_policy(struct shared_policy *info,
1347                         struct vm_area_struct *vma, struct mempolicy *npol)
1348 {
1349         int err;
1350         struct sp_node *new = NULL;
1351         unsigned long sz = vma_pages(vma);
1352
1353         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1354                  vma->vm_pgoff,
1355                  sz, npol? npol->policy : -1,
1356                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1357
1358         if (npol) {
1359                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1360                 if (!new)
1361                         return -ENOMEM;
1362         }
1363         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1364         if (err && new) {
                     /*
                      * The node was never inserted.  sp_alloc() took a
                      * reference on the policy for it, so drop that reference
                      * too instead of leaking it with the node.
                      */
                     mpol_free(new->policy);
1365                 kmem_cache_free(sn_cache, new);
             }
1366         return err;
1367 }
1368
1369 /*
      * Free a backing policy store on inode delete.
      *
      * Walks the tree in order under p->lock, erasing every node, dropping
      * its policy reference and returning it to sn_cache.  An empty tree is
      * detected before the lock is taken.
      */
1370 void mpol_free_shared_policy(struct shared_policy *p)
1371 {
1372         struct sp_node *n;
1373         struct rb_node *next;
1374
1375         if (!p->root.rb_node)
1376                 return;
1377         spin_lock(&p->lock);
1378         next = rb_first(&p->root);
1379         while (next) {
1380                 n = rb_entry(next, struct sp_node, nd);
                     /* Advance before erasing; n is freed at the end of the pass. */
1381                 next = rb_next(&n->nd);
1382                 rb_erase(&n->nd, &p->root);
1383                 mpol_free(n->policy);
1384                 kmem_cache_free(sn_cache, n);
1385         }
1386         spin_unlock(&p->lock);
1387 }
1388
1389 /*
      * Boot-time setup of the mempolicy code: creates the two slab caches
      * (policy_cache for struct mempolicy, sn_cache for struct sp_node; both
      * with SLAB_PANIC) and switches the current task to interleaving over
      * all online nodes.
      *
      * assumes fs == KERNEL_DS
      */
1390 void __init numa_policy_init(void)
1391 {
1392         policy_cache = kmem_cache_create("numa_policy",
1393                                          sizeof(struct mempolicy),
1394                                          0, SLAB_PANIC, NULL, NULL);
1395
1396         sn_cache = kmem_cache_create("shared_policy_node",
1397                                      sizeof(struct sp_node),
1398                                      0, SLAB_PANIC, NULL, NULL);
1399
1400         /* Set interleaving policy for system init. This way not all
1401            the data structures allocated at system boot end up in node zero. */
1402
             /* Failure is only reported; boot continues with the previous policy. */
1403         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1404                 printk("numa_policy_init: interleaving failed\n");
1405 }
1406
1407 /*
      * Reset policy of current process to default.
      * The return value of do_set_mempolicy() is not checked.
      */
1408 void numa_default_policy(void)
1409 {
1410         do_set_mempolicy(MPOL_DEFAULT, NULL);
1411 }
1412
1413 /*
      * Migrate a policy to a different set of nodes.
      *
      * Rewrites the nodes referenced by @pol according to the @old -> @new
      * mapping.  A NULL @pol and MPOL_DEFAULT need no work.  For
      * MPOL_INTERLEAVE the node mask and current->il_next are remapped, for
      * MPOL_PREFERRED the preferred node, and for MPOL_BIND a fresh zonelist
      * is built from the remapped node set.  An unknown mode is a BUG().
      */
1414 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1415                                                         const nodemask_t *new)
1416 {
1417         nodemask_t tmp;
1418
1419         if (!pol)
1420                 return;
1421
1422         switch (pol->policy) {
1423         case MPOL_DEFAULT:
1424                 break;
1425         case MPOL_INTERLEAVE:
1426                 nodes_remap(tmp, pol->v.nodes, *old, *new);
1427                 pol->v.nodes = tmp;
1428                 current->il_next = node_remap(current->il_next, *old, *new);
1429                 break;
1430         case MPOL_PREFERRED:
1431                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1432                                                                 *old, *new);
1433                 break;
1434         case MPOL_BIND: {
1435                 nodemask_t nodes;
1436                 struct zone **z;
1437                 struct zonelist *zonelist;
1438
                     /* Recover the node set from the zones of the current zonelist. */
1439                 nodes_clear(nodes);
1440                 for (z = pol->v.zonelist->zones; *z; z++)
1441                         node_set((*z)->zone_pgdat->node_id, nodes);
1442                 nodes_remap(tmp, nodes, *old, *new);
1443                 nodes = tmp;
1444
1445                 zonelist = bind_zonelist(&nodes);
1446
1447                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1448                  * If that old zonelist has no remaining mems_allowed nodes,
1449                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1450                  */
1451
1452                 if (zonelist) {
1453                         /* Good - got mem - substitute new zonelist */
1454                         kfree(pol->v.zonelist);
1455                         pol->v.zonelist = zonelist;
1456                 }
1457                 break;
1458         }
1459         default:
1460                 BUG();
1461                 break;
1462         }
1463 }
1464
1465 /*
1466  * Someone moved this task to different nodes.  Fixup mempolicies.
1467  *
1468  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1469  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
      *
      * Only the task policy (current->mempolicy) is rebound here; it is
      * passed to rebind_policy() together with the @old and @new node sets.
1470  */
1471 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1472 {
1473         rebind_policy(current->mempolicy, old, new);
1474 }
1475
1476 /*
1477  * Display pages allocated per node and memory policy via /proc.
1478  */
1479
     /*
      * Display names of the policy modes.  mpol_to_str() indexes this table
      * directly with the policy mode, so the order presumably has to match
      * the numeric MPOL_* values - confirm against their definition.
      */
1480 static const char *policy_types[] = { "default", "prefer", "bind",
1481                                       "interleave" };
1482
1483 /*
1484  * Convert a mempolicy into a string.
1485  * Returns the number of characters in buffer (if positive)
1486  * or an error (negative)
      *
      * @buffer: destination for the text
      * @maxlen: size of @buffer in bytes
      * @pol:    policy to describe; NULL is treated as MPOL_DEFAULT
      *
      * The output is the mode name from policy_types[], followed by
      * "=<nodelist>" when the policy refers to at least one node.  A
      * preferred policy whose node is negative (meaning "local node") has
      * no explicit node and is printed as the bare mode name.
      * Returns -ENOSPC when the text does not fit into @buffer.
1487  */
1488 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1489 {
1490         char *p = buffer;
1491         int l;
1492         nodemask_t nodes;
1493         int mode = pol ? pol->policy : MPOL_DEFAULT;
1494
1495         switch (mode) {
1496         case MPOL_DEFAULT:
1497                 nodes_clear(nodes);
1498                 break;
1499
1500         case MPOL_PREFERRED:
1501                 nodes_clear(nodes);
                     /*
                      * A negative preferred node stands for "local node" (see
                      * zonelist_policy()).  It must not be used as a bit
                      * number: node_set(-1, ...) would write outside the mask.
                      */
                     if (pol->v.preferred_node >= 0)
1502                         node_set(pol->v.preferred_node, nodes);
1503                 break;
1504
1505         case MPOL_BIND:
1506                 get_zonemask(pol, &nodes);
1507                 break;
1508
1509         case MPOL_INTERLEAVE:
1510                 nodes = pol->v.nodes;
1511                 break;
1512
1513         default:
1514                 BUG();
1515                 return -EFAULT;
1516         }
1517
             /* Room for the mode name plus its terminating NUL? */
1518         l = strlen(policy_types[mode]);
1519         if (buffer + maxlen < p + l + 1)
1520                 return -ENOSPC;
1521
1522         strcpy(p, policy_types[mode]);
1523         p += l;
1524
1525         if (!nodes_empty(nodes)) {
                     /* Need at least '=' and a terminating NUL for the node list. */
1526                 if (buffer + maxlen < p + 2)
1527                         return -ENOSPC;
1528                 *p++ = '=';
1529                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1530         }
1531         return p - buffer;
1532 }
1533
     /*
      * Per-VMA page statistics accumulated by gather_stats():
      *   pages        - number of pages visited
      *   anon         - pages for which PageAnon() is true
      *   mapped       - pages with a non-zero page_mapcount()
      *   mapcount_max - largest page_mapcount() seen
      *   node[]       - page count per node, indexed by page_to_nid()
      */
1534 struct numa_maps {
1535         unsigned long pages;
1536         unsigned long anon;
1537         unsigned long mapped;
1538         unsigned long mapcount_max;
1539         unsigned long node[MAX_NUMNODES];
1540 };
1541
     /*
      * Account one page in the struct numa_maps passed as @private:
      * updates the mapped/max-mapcount/total/anon counters and the counter
      * of the node the page resides on, then calls cond_resched().
      */
1542 static void gather_stats(struct page *page, void *private)
1543 {
1544         struct numa_maps *md = private;
1545         int count = page_mapcount(page);
1546
1547         if (count)
1548                 md->mapped++;
1549
1550         if (count > md->mapcount_max)
1551                 md->mapcount_max = count;
1552
1553         md->pages++;
1554
1555         if (PageAnon(page))
1556                 md->anon++;
1557
1558         md->node[page_to_nid(page)]++;
1559         cond_resched();
1560 }
1561
     /*
      * seq_file show routine that prints one line of NUMA statistics for
      * the VMA passed in @v; the task is taken from m->private.
      *
      * The line contains the VMA start address, the effective policy as
      * text, the page counters gathered over the VMA, and one "N<node>="
      * entry per online node that holds pages.  Nothing is printed for a
      * VMA without pages.  Always returns 0: a VMA without vm_mm and a
      * failed allocation of the statistics buffer are skipped silently.
      *
      * NOTE(review): the return value of mpol_to_str() is not checked; on
      * an error return buffer would be printed without having been filled.
      */
1562 int show_numa_map(struct seq_file *m, void *v)
1563 {
1564         struct task_struct *task = m->private;
1565         struct vm_area_struct *vma = v;
1566         struct numa_maps *md;
1567         int n;
1568         char buffer[50];
1569
1570         if (!vma->vm_mm)
1571                 return 0;
1572
             /* Zeroed so that all counters start at 0. */
1573         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1574         if (!md)
1575                 return 0;
1576
1577         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1578                     &node_online_map, MPOL_MF_STATS, md);
1579
1580         if (md->pages) {
1581                 mpol_to_str(buffer, sizeof(buffer),
1582                             get_vma_policy(task, vma, vma->vm_start));
1583
1584                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1585                            vma->vm_start, buffer, md->pages,
1586                            md->mapped, md->mapcount_max);
1587
1588                 if (md->anon)
1589                         seq_printf(m," anon=%lu",md->anon);
1590
1591                 for_each_online_node(n)
1592                         if (md->node[n])
1593                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1594
1595                 seq_putc(m, '\n');
1596         }
1597         kfree(md);
1598
             /* Record the position only if the output fit into the seq buffer. */
1599         if (m->count < m->size)
1600                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1601         return 0;
1602 }
1603