arch/powerpc/mm/numa.c

   1 /*
   2  * pSeries NUMA support
   3  *
   4  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * as published by the Free Software Foundation; either version
   9  * 2 of the License, or (at your option) any later version.
  10  */
  11 #include <linux/threads.h>
  12 #include <linux/bootmem.h>
  13 #include <linux/init.h>
  14 #include <linux/mm.h>
  15 #include <linux/mmzone.h>
  16 #include <linux/module.h>
  17 #include <linux/nodemask.h>
  18 #include <linux/cpu.h>
  19 #include <linux/notifier.h>
  20 #include <linux/lmb.h>
  21 #include <asm/sparsemem.h>
  22 #include <asm/prom.h>
  23 #include <asm/system.h>
  24 #include <asm/smp.h>
  25
  26 static int numa_enabled = 1;        /* cleared by early_numa() when "off" appears in numa= */
  27
  28 static char *cmdline __initdata;    /* text after "fake=" in numa=; advanced by fake_numa_create_new_node() */
  29
  30 static int numa_debug;
  31 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
  32
  33 int numa_cpu_lookup_table[NR_CPUS];        /* cpu -> node id, written by map_cpu_to_node() */
  34 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];        /* node id -> mask of cpus mapped to it */
  35 struct pglist_data *node_data[MAX_NUMNODES];        /* per-node data; presumably what NODE_DATA(nid) resolves to -- confirm in the arch mmzone header */
  36
  37 EXPORT_SYMBOL(numa_cpu_lookup_table);
  38 EXPORT_SYMBOL(numa_cpumask_lookup_table);
  39 EXPORT_SYMBOL(node_data);
  40
  41 static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];        /* attached to NODE_DATA(nid)->bdata in do_init_bootmem() */
  42 static int min_common_depth;        /* associativity depth from find_min_common_depth(); -1 means no NUMA info */
  43 static int n_mem_addr_cells, n_mem_size_cells;        /* cell counts for memory properties, filled by get_n_mem_cells() */
  44
  45 static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
  46                                                 unsigned int *nid)
  47 {
  48         unsigned long long mem;
  49         char *p = cmdline;
  50         static unsigned int fake_nid;
  51         static unsigned long long curr_boundary;
  52
  53         /*
  54          * Modify node id, iff we started creating NUMA nodes
  55          * We want to continue from where we left of the last time
  56          */
  57         if (fake_nid)
  58                 *nid = fake_nid;
  59         /*
  60          * In case there are no more arguments to parse, the
  61          * node_id should be the same as the last fake node id
  62          * (we've handled this above).
  63          */
  64         if (!p)
  65                 return 0;
  66
  67         mem = memparse(p, &p);
  68         if (!mem)
  69                 return 0;
  70
  71         if (mem < curr_boundary)
  72                 return 0;
  73
  74         curr_boundary = mem;
  75
  76         if ((end_pfn << PAGE_SHIFT) > mem) {
  77                 /*
  78                  * Skip commas and spaces
  79                  */
  80                 while (*p == ',' || *p == ' ' || *p == '\t')
  81                         p++;
  82
  83                 cmdline = p;
  84                 fake_nid++;
  85                 *nid = fake_nid;
  86                 dbg("created new fake_node with id %d\n", fake_nid);
  87                 return 1;
  88         }
  89         return 0;
  90 }
  91
  92 static void __cpuinit map_cpu_to_node(int cpu, int node)
  93 {
  94         numa_cpu_lookup_table[cpu] = node;
  95
  96         dbg("adding cpu %d to node %d\n", cpu, node);
  97
  98         if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
  99                 cpu_set(cpu, numa_cpumask_lookup_table[node]);
 100 }
 101
 102 #ifdef CONFIG_HOTPLUG_CPU
 103 static void unmap_cpu_from_node(unsigned long cpu)
 104 {
 105         int node = numa_cpu_lookup_table[cpu];
 106
 107         dbg("removing cpu %lu from node %d\n", cpu, node);
 108
 109         if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
 110                 cpu_clear(cpu, numa_cpumask_lookup_table[node]);
 111         } else {
 112                 printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
 113                        cpu, node);
 114         }
 115 }
 116 #endif /* CONFIG_HOTPLUG_CPU */
 117
 118 static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
 119 {
 120         unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
 121         struct device_node *cpu_node = NULL;
 122         const unsigned int *interrupt_server, *reg;
 123         int len;
 124
 125         while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
 126                 /* Try interrupt server first */
 127                 interrupt_server = of_get_property(cpu_node,
 128                                         "ibm,ppc-interrupt-server#s", &len);
 129
 130                 len = len / sizeof(u32);
 131
 132                 if (interrupt_server && (len > 0)) {
 133                         while (len--) {
 134                                 if (interrupt_server[len] == hw_cpuid)
 135                                         return cpu_node;
 136                         }
 137                 } else {
 138                         reg = of_get_property(cpu_node, "reg", &len);
 139                         if (reg && (len > 0) && (reg[0] == hw_cpuid))
 140                                 return cpu_node;
 141                 }
 142         }
 143
 144         return NULL;
 145 }
 146
 147 /* must hold reference to node during call */
 148 static const int *of_get_associativity(struct device_node *dev)
 149 {
 150         return of_get_property(dev, "ibm,associativity", NULL);
 151 }
 152
 153 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 154  * info is found.
 155  */
 156 static int of_node_to_nid_single(struct device_node *device)
 157 {
 158         int nid = -1;
 159         const unsigned int *tmp;
 160
 161         if (min_common_depth == -1)
 162                 goto out;
 163
 164         tmp = of_get_associativity(device);
 165         if (!tmp)
 166                 goto out;
 167
 168         if (tmp[0] >= min_common_depth)
 169                 nid = tmp[min_common_depth];
 170
 171         /* POWER4 LPAR uses 0xffff as invalid node */
 172         if (nid == 0xffff || nid >= MAX_NUMNODES)
 173                 nid = -1;
 174 out:
 175         return nid;
 176 }
 177
 178 /* Walk the device tree upwards, looking for an associativity id */
 179 int of_node_to_nid(struct device_node *device)
 180 {
 181         struct device_node *tmp;
 182         int nid = -1;
 183
 184         of_node_get(device);
 185         while (device) {
 186                 nid = of_node_to_nid_single(device);
 187                 if (nid != -1)
 188                         break;
 189
 190                 tmp = device;
 191                 device = of_get_parent(tmp);
 192                 of_node_put(tmp);
 193         }
 194         of_node_put(device);
 195
 196         return nid;
 197 }
 198 EXPORT_SYMBOL_GPL(of_node_to_nid);
 199
 200 /*
 201  * In theory, the "ibm,associativity" property may contain multiple
 202  * associativity lists because a resource may be multiply connected
 203  * into the machine.  This resource then has different associativity
 204  * characteristics relative to its multiple connections.  We ignore
 205  * this for now.  We also assume that all cpu and memory sets have
 206  * their distances represented at a common level.  This won't be
 207  * true for hierarchical NUMA.
 208  *
 209  * In any case the ibm,associativity-reference-points should give
 210  * the correct depth for a normal NUMA system.
 211  *
 212  * - Dave Hansen <haveblue@us.ibm.com>
 213  */
 214 static int __init find_min_common_depth(void)
 215 {
 216         int depth;
 217         const unsigned int *ref_points;
 218         struct device_node *rtas_root;
 219         unsigned int len;
 220
 221         rtas_root = of_find_node_by_path("/rtas");
 222
 223         if (!rtas_root)
 224                 return -1;
 225
 226         /*
 227          * this property is 2 32-bit integers, each representing a level of
 228          * depth in the associativity nodes.  The first is for an SMP
 229          * configuration (should be all 0's) and the second is for a normal
 230          * NUMA configuration.
 231          */
 232         ref_points = of_get_property(rtas_root,
 233                         "ibm,associativity-reference-points", &len);
 234
 235         if ((len >= 1) && ref_points) {
 236                 depth = ref_points[1];
 237         } else {
 238                 dbg("NUMA: ibm,associativity-reference-points not found.\n");
 239                 depth = -1;
 240         }
 241         of_node_put(rtas_root);
 242
 243         return depth;
 244 }
 245
 246 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
 247 {
 248         struct device_node *memory = NULL;
 249
 250         memory = of_find_node_by_type(memory, "memory");
 251         if (!memory)
 252                 panic("numa.c: No memory nodes found!");
 253
 254         *n_addr_cells = of_n_addr_cells(memory);
 255         *n_size_cells = of_n_size_cells(memory);
 256         of_node_put(memory);
 257 }
 258
 259 static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
 260 {
 261         unsigned long result = 0;
 262
 263         while (n--) {
 264                 result = (result << 32) | **buf;
 265                 (*buf)++;
 266         }
 267         return result;
 268 }
 269
 270 /*
 271  * Figure out to which domain a cpu belongs and stick it there.
 272  * Return the id of the domain used.
 273  */
 274 static int __cpuinit numa_setup_cpu(unsigned long lcpu)
 275 {
 276         int nid = 0;
 277         struct device_node *cpu = find_cpu_node(lcpu);
 278
 279         if (!cpu) {
 280                 WARN_ON(1);
 281                 goto out;
 282         }
 283
 284         nid = of_node_to_nid_single(cpu);
 285
 286         if (nid < 0 || !node_online(nid))
 287                 nid = any_online_node(NODE_MASK_ALL);
 288 out:
 289         map_cpu_to_node(lcpu, nid);
 290
 291         of_node_put(cpu);
 292
 293         return nid;
 294 }
 295
 296 static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
 297                              unsigned long action,
 298                              void *hcpu)
 299 {
 300         unsigned long lcpu = (unsigned long)hcpu;
 301         int ret = NOTIFY_DONE;
 302
 303         switch (action) {
 304         case CPU_UP_PREPARE:
 305         case CPU_UP_PREPARE_FROZEN:
 306                 numa_setup_cpu(lcpu);
 307                 ret = NOTIFY_OK;
 308                 break;
 309 #ifdef CONFIG_HOTPLUG_CPU
 310         case CPU_DEAD:
 311         case CPU_DEAD_FROZEN:
 312         case CPU_UP_CANCELED:
 313         case CPU_UP_CANCELED_FROZEN:
 314                 unmap_cpu_from_node(lcpu);
 315                 break;
 316                 ret = NOTIFY_OK;
 317 #endif
 318         }
 319         return ret;
 320 }
 321
 322 /*
 323  * Check and possibly modify a memory region to enforce the memory limit.
 324  *
 325  * Returns the size the region should have to enforce the memory limit.
 326  * This will either be the original value of size, a truncated value,
 327  * or zero. If the returned value of size is 0 the region should be
 328  * discarded as it lies wholy above the memory limit.
 329  */
 330 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
 331                                                       unsigned long size)
 332 {
 333         /*
 334          * We use lmb_end_of_DRAM() in here instead of memory_limit because
 335          * we've already adjusted it for the limit and it takes care of
 336          * having memory holes below the limit.
 337          */
 338
 339         if (! memory_limit)
 340                 return size;
 341
 342         if (start + size <= lmb_end_of_DRAM())
 343                 return size;
 344
 345         if (start >= lmb_end_of_DRAM())
 346                 return 0;
 347
 348         return lmb_end_of_DRAM() - start;
 349 }
 350
 351 /*
 352  * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 353  * node.  This assumes n_mem_{addr,size}_cells have been set.
 354  */
 355 static void __init parse_drconf_memory(struct device_node *memory)
 356 {
 357         const unsigned int *lm, *dm, *aa;
 358         unsigned int ls, ld, la;
 359         unsigned int n, aam, aalen;
 360         unsigned long lmb_size, size, start;
 361         int nid, default_nid = 0;
 362         unsigned int ai, flags;
 363
 364         lm = of_get_property(memory, "ibm,lmb-size", &ls);
 365         dm = of_get_property(memory, "ibm,dynamic-memory", &ld);
 366         aa = of_get_property(memory, "ibm,associativity-lookup-arrays", &la);
 367         if (!lm || !dm || !aa ||
 368             ls < sizeof(unsigned int) || ld < sizeof(unsigned int) ||
 369             la < 2 * sizeof(unsigned int))
 370                 return;
 371
 372         lmb_size = read_n_cells(n_mem_size_cells, &lm);
 373         n = *dm++;              /* number of LMBs */
 374         aam = *aa++;            /* number of associativity lists */
 375         aalen = *aa++;          /* length of each associativity list */
 376         if (ld < (n * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int) ||
 377             la < (aam * aalen + 2) * sizeof(unsigned int))
 378                 return;
 379
 380         for (; n != 0; --n) {
 381                 start = read_n_cells(n_mem_addr_cells, &dm);
 382                 ai = dm[2];
 383                 flags = dm[3];
 384                 dm += 4;
 385                 /* 0x80 == reserved, 0x8 = assigned to us */
 386                 if ((flags & 0x80) || !(flags & 0x8))
 387                         continue;
 388                 nid = default_nid;
 389                 /* flags & 0x40 means associativity index is invalid */
 390                 if (min_common_depth > 0 && min_common_depth <= aalen &&
 391                     (flags & 0x40) == 0 && ai < aam) {
 392                         /* this is like of_node_to_nid_single */
 393                         nid = aa[ai * aalen + min_common_depth - 1];
 394                         if (nid == 0xffff || nid >= MAX_NUMNODES)
 395                                 nid = default_nid;
 396                 }
 397
 398                 fake_numa_create_new_node(((start + lmb_size) >> PAGE_SHIFT),
 399                                                 &nid);
 400                 node_set_online(nid);
 401
 402                 size = numa_enforce_memory_limit(start, lmb_size);
 403                 if (!size)
 404                         continue;
 405
 406                 add_active_range(nid, start >> PAGE_SHIFT,
 407                                  (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
 408         }
 409 }
 410
 411 static int __init parse_numa_properties(void)
 412 {
 413         struct device_node *cpu = NULL;
 414         struct device_node *memory = NULL;
 415         int default_nid = 0;
 416         unsigned long i;
 417
 418         if (numa_enabled == 0) {
 419                 printk(KERN_WARNING "NUMA disabled by user\n");
 420                 return -1;
 421         }
 422
 423         min_common_depth = find_min_common_depth();
 424
 425         if (min_common_depth < 0)
 426                 return min_common_depth;
 427
 428         dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
 429
 430         /*
 431          * Even though we connect cpus to numa domains later in SMP
 432          * init, we need to know the node ids now. This is because
 433          * each node to be onlined must have NODE_DATA etc backing it.
 434          */
 435         for_each_present_cpu(i) {
 436                 int nid;
 437
 438                 cpu = find_cpu_node(i);
 439                 BUG_ON(!cpu);
 440                 nid = of_node_to_nid_single(cpu);
 441                 of_node_put(cpu);
 442
 443                 /*
 444                  * Don't fall back to default_nid yet -- we will plug
 445                  * cpus into nodes once the memory scan has discovered
 446                  * the topology.
 447                  */
 448                 if (nid < 0)
 449                         continue;
 450                 node_set_online(nid);
 451         }
 452
 453         get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
 454         memory = NULL;
 455         while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 456                 unsigned long start;
 457                 unsigned long size;
 458                 int nid;
 459                 int ranges;
 460                 const unsigned int *memcell_buf;
 461                 unsigned int len;
 462
 463                 memcell_buf = of_get_property(memory,
 464                         "linux,usable-memory", &len);
 465                 if (!memcell_buf || len <= 0)
 466                         memcell_buf = of_get_property(memory, "reg", &len);
 467                 if (!memcell_buf || len <= 0)
 468                         continue;
 469
 470                 /* ranges in cell */
 471                 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
 472 new_range:
 473                 /* these are order-sensitive, and modify the buffer pointer */
 474                 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
 475                 size = read_n_cells(n_mem_size_cells, &memcell_buf);
 476
 477                 /*
 478                  * Assumption: either all memory nodes or none will
 479                  * have associativity properties.  If none, then
 480                  * everything goes to default_nid.
 481                  */
 482                 nid = of_node_to_nid_single(memory);
 483                 if (nid < 0)
 484                         nid = default_nid;
 485
 486                 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
 487                 node_set_online(nid);
 488
 489                 if (!(size = numa_enforce_memory_limit(start, size))) {
 490                         if (--ranges)
 491                                 goto new_range;
 492                         else
 493                                 continue;
 494                 }
 495
 496                 add_active_range(nid, start >> PAGE_SHIFT,
 497                                 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
 498
 499                 if (--ranges)
 500                         goto new_range;
 501         }
 502
 503         /*
 504          * Now do the same thing for each LMB listed in the ibm,dynamic-memory
 505          * property in the ibm,dynamic-reconfiguration-memory node.
 506          */
 507         memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 508         if (memory)
 509                 parse_drconf_memory(memory);
 510
 511         return 0;
 512 }
 513
 514 static void __init setup_nonnuma(void)
 515 {
 516         unsigned long top_of_ram = lmb_end_of_DRAM();
 517         unsigned long total_ram = lmb_phys_mem_size();
 518         unsigned long start_pfn, end_pfn;
 519         unsigned int i, nid = 0;
 520
 521         printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 522                top_of_ram, total_ram);
 523         printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 524                (top_of_ram - total_ram) >> 20);
 525
 526         for (i = 0; i < lmb.memory.cnt; ++i) {
 527                 start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
 528                 end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
 529
 530                 fake_numa_create_new_node(end_pfn, &nid);
 531                 add_active_range(nid, start_pfn, end_pfn);
 532                 node_set_online(nid);
 533         }
 534 }
 535
 536 void __init dump_numa_cpu_topology(void)
 537 {
 538         unsigned int node;
 539         unsigned int cpu, count;
 540
 541         if (min_common_depth == -1 || !numa_enabled)
 542                 return;
 543
 544         for_each_online_node(node) {
 545                 printk(KERN_DEBUG "Node %d CPUs:", node);
 546
 547                 count = 0;
 548                 /*
 549                  * If we used a CPU iterator here we would miss printing
 550                  * the holes in the cpumap.
 551                  */
 552                 for (cpu = 0; cpu < NR_CPUS; cpu++) {
 553                         if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
 554                                 if (count == 0)
 555                                         printk(" %u", cpu);
 556                                 ++count;
 557                         } else {
 558                                 if (count > 1)
 559                                         printk("-%u", cpu - 1);
 560                                 count = 0;
 561                         }
 562                 }
 563
 564                 if (count > 1)
 565                         printk("-%u", NR_CPUS - 1);
 566                 printk("\n");
 567         }
 568 }
 569
 570 static void __init dump_numa_memory_topology(void)
 571 {
 572         unsigned int node;
 573         unsigned int count;
 574
 575         if (min_common_depth == -1 || !numa_enabled)
 576                 return;
 577
 578         for_each_online_node(node) {
 579                 unsigned long i;
 580
 581                 printk(KERN_DEBUG "Node %d Memory:", node);
 582
 583                 count = 0;
 584
 585                 for (i = 0; i < lmb_end_of_DRAM();
 586                      i += (1 << SECTION_SIZE_BITS)) {
 587                         if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
 588                                 if (count == 0)
 589                                         printk(" 0x%lx", i);
 590                                 ++count;
 591                         } else {
 592                                 if (count > 0)
 593                                         printk("-0x%lx", i);
 594                                 count = 0;
 595                         }
 596                 }
 597
 598                 if (count > 0)
 599                         printk("-0x%lx", i);
 600                 printk("\n");
 601         }
 602 }
 603
 604 /*
 605  * Allocate some memory, satisfying the lmb or bootmem allocator where
 606  * required. nid is the preferred node and end is the physical address of
 607  * the highest address in the node.
 608  *
 609  * Returns the physical address of the memory.
 610  */
 611 static void __init *careful_allocation(int nid, unsigned long size,
 612                                        unsigned long align,
 613                                        unsigned long end_pfn)
 614 {
 615         int new_nid;
 616         unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
 617
 618         /* retry over all memory */
 619         if (!ret)
 620                 ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
 621
 622         if (!ret)
 623                 panic("numa.c: cannot allocate %lu bytes on node %d",
 624                       size, nid);
 625
 626         /*
 627          * If the memory came from a previously allocated node, we must
 628          * retry with the bootmem allocator.
 629          */
 630         new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
 631         if (new_nid < nid) {
 632                 ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
 633                                 size, align, 0);
 634
 635                 if (!ret)
 636                         panic("numa.c: cannot allocate %lu bytes on node %d",
 637                               size, new_nid);
 638
 639                 ret = __pa(ret);
 640
 641                 dbg("alloc_bootmem %lx %lx\n", ret, size);
 642         }
 643
 644         return (void *)ret;
 645 }
 646
 647 static struct notifier_block __cpuinitdata ppc64_numa_nb = {
 648         .notifier_call = cpu_numa_callback,
 649         .priority = 1 /* Must run before sched domains notifier. */
 650 };
 651
 652 void __init do_init_bootmem(void)
 653 {
             /*
              * Boot-time NUMA memory setup.  Discovers the node layout
              * (device tree, or the non-NUMA fallback), maps the boot cpu
              * to its node, then for every online node allocates its
              * pglist_data and bootmem bitmap, hands the node's active
              * ranges to the bootmem allocator and re-reserves the lmb
              * reserved regions that overlap the node.
              */
 654         int nid;
 655         unsigned int i;
 656
 657         min_low_pfn = 0;
 658         max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
 659         max_pfn = max_low_pfn;
 660
             /* Non-zero means no usable NUMA info: treat memory as flat. */
 661         if (parse_numa_properties())
 662                 setup_nonnuma();
 663         else
 664                 dump_numa_memory_topology();
 665
             /*
              * Register for later cpu events and run the callback by hand
              * for the boot cpu so it is mapped to a node right away.
              */
 666         register_cpu_notifier(&ppc64_numa_nb);
 667         cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
 668                           (void *)(unsigned long)boot_cpuid);
 669
 670         for_each_online_node(nid) {
 671                 unsigned long start_pfn, end_pfn;
 672                 unsigned long bootmem_paddr;
 673                 unsigned long bootmap_pages;
 674
 675                 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 676
 677                 /* Allocate the node structure node local if possible */
 678                 NODE_DATA(nid) = careful_allocation(nid,
 679                                         sizeof(struct pglist_data),
 680                                         SMP_CACHE_BYTES, end_pfn);
                     /* careful_allocation() returns a physical address. */
 681                 NODE_DATA(nid) = __va(NODE_DATA(nid));
 682                 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 683
 684                 dbg("node %d\n", nid);
 685                 dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
 686
 687                 NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
 688                 NODE_DATA(nid)->node_start_pfn = start_pfn;
 689                 NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
 690
                     /* Node has no pages: nothing to give to bootmem. */
 691                 if (NODE_DATA(nid)->node_spanned_pages == 0)
 692                         continue;
 693
 694                 dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
 695                 dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
 696
                     /* Allocate and clear the bootmem bitmap for this node. */
 697                 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
 698                 bootmem_paddr = (unsigned long)careful_allocation(nid,
 699                                         bootmap_pages << PAGE_SHIFT,
 700                                         PAGE_SIZE, end_pfn);
 701                 memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
 702
 703                 dbg("bootmap_paddr = %lx\n", bootmem_paddr);
 704
 705                 init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
 706                                   start_pfn, end_pfn);
 707
 708                 free_bootmem_with_active_regions(nid, end_pfn);
 709
 710                 /* Mark reserved regions on this node */
 711                 for (i = 0; i < lmb.reserved.cnt; i++) {
 712                         unsigned long physbase = lmb.reserved.region[i].base;
 713                         unsigned long size = lmb.reserved.region[i].size;
 714                         unsigned long start_paddr = start_pfn << PAGE_SHIFT;
 715                         unsigned long end_paddr = end_pfn << PAGE_SHIFT;
 716
                             /* Skip regions with neither end on this node. */
 717                         if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
 718                             early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
 719                                 continue;
 720
 721                         if (physbase < end_paddr &&
 722                             (physbase+size) > start_paddr) {
 723                                 /* overlaps */
                                     /* Clip the front to the node start... */
 724                                 if (physbase < start_paddr) {
 725                                         size -= start_paddr - physbase;
 726                                         physbase = start_paddr;
 727                                 }
 728
                                     /* ...and the tail to the node end. */
 729                                 if (size > end_paddr - physbase)
 730                                         size = end_paddr - physbase;
 731
 732                                 dbg("reserve_bootmem %lx %lx\n", physbase,
 733                                     size);
 734                                 reserve_bootmem_node(NODE_DATA(nid), physbase,
 735                                                      size, BOOTMEM_DEFAULT);
 736                         }
 737                 }
 738
 739                 sparse_memory_present_with_active_regions(nid);
 740         }
 741 }
 742
 743 void __init paging_init(void)
 744 {
 745         unsigned long max_zone_pfns[MAX_NR_ZONES];
 746         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 747         max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
 748         free_area_init_nodes(max_zone_pfns);
 749 }
 750
 751 static int __init early_numa(char *p)
 752 {
 753         if (!p)
 754                 return 0;
 755
 756         if (strstr(p, "off"))
 757                 numa_enabled = 0;
 758
 759         if (strstr(p, "debug"))
 760                 numa_debug = 1;
 761
 762         p = strstr(p, "fake=");
 763         if (p)
 764                 cmdline = p + strlen("fake=");
 765
 766         return 0;
 767 }
 768 early_param("numa", early_numa);
 769
 770 #ifdef CONFIG_MEMORY_HOTPLUG
 771 /*
 772  * Find the node associated with a hot added memory section.  Section
 773  * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
 774  * sections are fully contained within a single LMB.
 775  */
 776 int hot_add_scn_to_nid(unsigned long scn_addr)
 777 {
 778         struct device_node *memory = NULL;
 779         nodemask_t nodes;
 780         int default_nid = any_online_node(NODE_MASK_ALL);
 781         int nid;
 782
 783         if (!numa_enabled || (min_common_depth < 0))
 784                 return default_nid;
 785
 786         while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 787                 unsigned long start, size;
 788                 int ranges;
 789                 const unsigned int *memcell_buf;
 790                 unsigned int len;
 791
 792                 memcell_buf = of_get_property(memory, "reg", &len);
 793                 if (!memcell_buf || len <= 0)
 794                         continue;
 795
 796                 /* ranges in cell */
 797                 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
 798 ha_new_range:
 799                 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
 800                 size = read_n_cells(n_mem_size_cells, &memcell_buf);
 801                 nid = of_node_to_nid_single(memory);
 802
 803                 /* Domains not present at boot default to 0 */
 804                 if (nid < 0 || !node_online(nid))
 805                         nid = default_nid;
 806
 807                 if ((scn_addr >= start) && (scn_addr < (start + size))) {
 808                         of_node_put(memory);
 809                         goto got_nid;
 810                 }
 811
 812                 if (--ranges)           /* process all ranges in cell */
 813                         goto ha_new_range;
 814         }
 815         BUG();  /* section address should be found above */
 816         return 0;
 817
 818         /* Temporary code to ensure that returned node is not empty */
 819 got_nid:
 820         nodes_setall(nodes);
 821         while (NODE_DATA(nid)->node_spanned_pages == 0) {
 822                 node_clear(nid, nodes);
 823                 nid = any_online_node(nodes);
 824         }
 825         return nid;
 826 }
 827 #endif /* CONFIG_MEMORY_HOTPLUG */