/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
28 struct memnode memnode;
30 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE
33 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
36 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
38 int numa_off __initdata;
42 * Given a shift value, try to populate memnodemap[]
45 * 0 if memnodmap[] too small (of shift too small)
46 * -1 if node overlap or lost ram (shift too big)
49 populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
53 unsigned long addr, end;
57 memset(memnodemap, 0xff, sizeof(memnodemap));
58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start;
63 if ((end >> shift) >= NODEMAPSIZE)
66 if (memnodemap[addr >> shift] != 0xff)
68 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift);
76 int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
80 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
83 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
86 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
88 "Your memory is not aligned you need to rebuild your kernel "
89 "with a bigger NODEMAPSIZE shift=%d\n",
#ifdef CONFIG_SPARSEMEM
/* Early (pre-mem_map) pfn -> node lookup via the memnode hash. */
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif
103 /* Initialize bootmem allocator for a node */
104 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
106 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
107 unsigned long nodedata_phys;
108 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
110 start = round_up(start, ZONE_ALIGN);
112 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
114 start_pfn = start >> PAGE_SHIFT;
115 end_pfn = end >> PAGE_SHIFT;
117 nodedata_phys = find_e820_area(start, end, pgdat_size);
118 if (nodedata_phys == -1L)
119 panic("Cannot find memory pgdat in node %d\n", nodeid);
121 Dprintk("nodedata_phys %lx\n", nodedata_phys);
123 node_data[nodeid] = phys_to_virt(nodedata_phys);
124 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
125 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
126 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
127 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
129 /* Find a place for the bootmem map */
130 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
131 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
132 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
133 if (bootmap_start == -1L)
134 panic("Not enough continuous space for bootmap on node %d", nodeid);
135 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
137 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
138 bootmap_start >> PAGE_SHIFT,
141 e820_bootmem_free(NODE_DATA(nodeid), start, end);
143 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
144 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
145 #ifdef CONFIG_ACPI_NUMA
146 srat_reserve_add_area(nodeid);
148 node_set_online(nodeid);
151 /* Initialize final allocator for a zone */
152 void __init setup_node_zones(int nodeid)
154 unsigned long start_pfn, end_pfn, memmapsize, limit;
155 unsigned long zones[MAX_NR_ZONES];
156 unsigned long holes[MAX_NR_ZONES];
158 start_pfn = node_start_pfn(nodeid);
159 end_pfn = node_end_pfn(nodeid);
161 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
162 nodeid, start_pfn, end_pfn);
164 /* Try to allocate mem_map at end to not fill up precious <4GB
166 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
167 limit = end_pfn << PAGE_SHIFT;
168 NODE_DATA(nodeid)->node_mem_map =
169 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
170 memmapsize, SMP_CACHE_BYTES,
171 round_down(limit - memmapsize, PAGE_SIZE),
174 size_zones(zones, holes, start_pfn, end_pfn);
175 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
179 void __init numa_init_array(void)
182 /* There are unfortunately some poorly designed mainboards around
183 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
184 mapping. To avoid this fill in the mapping for all possible
185 CPUs, as the number of CPUs is not known yet.
186 We round robin the existing nodes. */
187 rr = first_node(node_online_map);
188 for (i = 0; i < NR_CPUS; i++) {
189 if (cpu_to_node[i] != NUMA_NO_NODE)
191 numa_set_node(i, rr);
192 rr = next_node(rr, node_online_map);
193 if (rr == MAX_NUMNODES)
194 rr = first_node(node_online_map);
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "numa=fake=<n>". */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake nodes.
 * Returns 0 on success, -1 if no usable hash shift could be found.
 */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct bootnode nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: round the node size
	   down to a power of two. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node picks up any remainder. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
244 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
248 #ifdef CONFIG_NUMA_EMU
249 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
253 #ifdef CONFIG_ACPI_NUMA
254 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
255 end_pfn << PAGE_SHIFT))
259 #ifdef CONFIG_K8_NUMA
260 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
263 printk(KERN_INFO "%s\n",
264 numa_off ? "NUMA turned off" : "No NUMA configuration found");
266 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
267 start_pfn << PAGE_SHIFT,
268 end_pfn << PAGE_SHIFT);
269 /* setup dummy node covering all memory */
272 nodes_clear(node_online_map);
274 for (i = 0; i < NR_CPUS; i++)
276 node_to_cpumask[0] = cpumask_of_cpu(0);
277 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
280 __cpuinit void numa_add_cpu(int cpu)
282 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
285 void __cpuinit numa_set_node(int cpu, int node)
287 cpu_pda(cpu)->nodenumber = node;
288 cpu_to_node[cpu] = node;
291 unsigned long __init numa_free_all_bootmem(void)
294 unsigned long pages = 0;
295 for_each_online_node(i) {
296 pages += free_all_bootmem_node(NODE_DATA(i));
#ifdef CONFIG_SPARSEMEM
/* Tell sparsemem which pfn ranges exist, then build its section maps. */
static void __init arch_sparse_init(void)
{
	int i;

	for_each_online_node(i)
		memory_present(i, node_start_pfn(i), node_end_pfn(i));

	sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif
315 void __init paging_init(void)
321 for_each_online_node(i) {
327 __init int numa_setup(char *opt)
329 if (!strncmp(opt,"off",3))
331 #ifdef CONFIG_NUMA_EMU
332 if(!strncmp(opt, "fake=", 5)) {
333 numa_fake = simple_strtoul(opt+5,NULL,0); ;
334 if (numa_fake >= MAX_NUMNODES)
335 numa_fake = MAX_NUMNODES;
338 #ifdef CONFIG_ACPI_NUMA
339 if (!strncmp(opt,"noacpi",6))
341 if (!strncmp(opt,"hotadd=", 7))
342 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
348 * Setup early cpu_to_node.
350 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
351 * and apicid_to_node[] tables have valid entries for a CPU.
352 * This means we skip cpu_to_node[] initialisation for NUMA
353 * emulation and faking node case (when running a kernel compiled
354 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
355 * is already initialized in a round robin manner at numa_init_array,
356 * prior to this call, and this initialization is good enough
357 * for the fake NUMA cases.
359 void __init init_cpu_to_node(void)
362 for (i = 0; i < NR_CPUS; i++) {
363 u8 apicid = x86_cpu_to_apicid[i];
364 if (apicid == BAD_APICID)
366 if (apicid_to_node[apicid] == NUMA_NO_NODE)
368 numa_set_node(i,apicid_to_node[apicid]);
/* Exported so modules can query CPU/memory topology. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could be all tuned by pre caching more state.
 * Should do that.
 */

/* Nonzero iff @pfn lies inside the span of the node the hash maps it to. */
int pfn_valid(unsigned long pfn)
{
	unsigned nid;

	if (pfn >= num_physpages)
		return 0;
	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;	/* 0xff = unmapped slot in memnodemap[] */
	return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif