2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
16 #include <asm/proto.h>
/* Per-node pglist_data pointers and bootmem descriptors, one slot per possible node. */
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/* Lookup table filled by populate_memnodemap(): indexed by (address >> shift), each entry holds a node index. */
29 u8 memnodemap[NODEMAPSIZE];
/* CPU -> node table; every entry starts out as NUMA_NO_NODE. */
31 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
/* APIC id -> node table; every entry starts out as NUMA_NO_NODE. */
34 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
/* One CPU mask per possible node. */
37 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
/* Tested by numa_initmem_init() before it runs the ACPI and K8 node scans. */
39 int numa_off __initdata;
43 * Given a shift value, try to populate memnodemap[]
46 * 0 if memnodemap[] too small (or shift too small)
47 * -1 if node overlap or lost ram (shift too big)
/*
 * Resets memnodemap[] and then, for each of the numnodes entries in nodes[],
 * stores the node index at memnodemap[addr >> shift], advancing addr in
 * steps of (1UL << shift) starting from the node's start address.
 */
50 populate_memnodemap(const struct node *nodes, int numnodes, int shift)
54 unsigned long addr, end;
/* 0xff marks a slot that has not been assigned a node index yet. */
58 memset(memnodemap, 0xff, sizeof(memnodemap));
59 for (i = 0; i < numnodes; i++) {
60 addr = nodes[i].start;
/* The node's end address does not fit within NODEMAPSIZE slots at this shift. */
64 if ((end >> shift) >= NODEMAPSIZE)
/* The slot already holds a node index written earlier in this pass. */
67 if (memnodemap[addr >> shift] != 0xff)
69 memnodemap[addr >> shift] = i;
70 addr += (1UL << shift);
/*
 * Chooses the shift used to index memnodemap[] by calling
 * populate_memnodemap() on the numnodes entries of nodes[] with candidate
 * shift values.
 */
77 int __init compute_hash_shift(struct node *nodes, int numnodes)
/* Keep looping while populate_memnodemap() accepts the next larger shift (returns >= 0). */
81 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
84 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
/* Populate the map for real with the chosen shift; any result other than 1 takes the failure branch. */
87 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
89 "Your memory is not aligned you need to rebuild your kernel "
90 "with a bigger NODEMAPSIZE shift=%d\n",
97 #ifdef CONFIG_SPARSEMEM
/*
 * Returns the node id for a page frame number: the pfn is shifted left by
 * PAGE_SHIFT to form an address, which is then passed to phys_to_nid().
 */
98 int early_pfn_to_nid(unsigned long pfn)
100 return phys_to_nid(pfn << PAGE_SHIFT);
104 /* Initialize bootmem allocator for a node */
105 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
107 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
108 unsigned long nodedata_phys;
/* Size of pg_data_t rounded up to a whole number of pages. */
109 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
/* Round the node's start address up to ZONE_ALIGN before deriving page frame numbers. */
111 start = round_up(start, ZONE_ALIGN);
113 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
115 start_pfn = start >> PAGE_SHIFT;
116 end_pfn = end >> PAGE_SHIFT;
/* Look between start and end for pgdat_size bytes to hold this node's pg_data_t; -1L means nothing was found. */
118 nodedata_phys = find_e820_area(start, end, pgdat_size);
119 if (nodedata_phys == -1L)
120 panic("Cannot find memory pgdat in node %d\n", nodeid);
122 Dprintk("nodedata_phys %lx\n", nodedata_phys);
/* Zero the node's pg_data_t, attach its bootmem descriptor and record the pfn range it spans. */
124 node_data[nodeid] = phys_to_virt(nodedata_phys);
125 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
126 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
127 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
128 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
130 /* Find a place for the bootmem map */
131 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
/* Start the search at the first page boundary after the pg_data_t area. */
132 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
133 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
134 if (bootmap_start == -1L)
135 panic("Not enough continuous space for bootmap on node %d", nodeid);
136 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
138 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
139 bootmap_start >> PAGE_SHIFT,
/* NOTE(review): e820_bootmem_free() presumably hands the usable e820 ranges in start..end to the bootmem allocator -- confirm. */
142 e820_bootmem_free(NODE_DATA(nodeid), start, end);
/* Reserve the areas that hold the pg_data_t and the bootmem map themselves. */
144 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
145 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
/* Mark the node online. */
146 node_set_online(nodeid);
149 /* Initialize final allocator for a zone */
150 void __init setup_node_zones(int nodeid)
152 unsigned long start_pfn, end_pfn;
/* Per-zone arrays filled by size_zones() and then passed on to free_area_init_node(). */
153 unsigned long zones[MAX_NR_ZONES];
154 unsigned long holes[MAX_NR_ZONES];
/* Page frame range of this node. */
156 start_pfn = node_start_pfn(nodeid);
157 end_pfn = node_end_pfn(nodeid);
159 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
160 nodeid, start_pfn, end_pfn);
162 size_zones(zones, holes, start_pfn, end_pfn);
163 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
/*
 * Walks all NR_CPUS entries of cpu_to_node[] and assigns online nodes to
 * CPUs in round-robin order via numa_set_node().
 */
167 void __init numa_init_array(void)
170 /* There are unfortunately some poorly designed mainboards around
171 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
172 mapping. To avoid this fill in the mapping for all possible
173 CPUs, as the number of CPUs is not known yet.
174 We round robin the existing nodes. */
175 rr = first_node(node_online_map);
176 for (i = 0; i < NR_CPUS; i++) {
/* Entries that are no longer NUMA_NO_NODE are tested here; the statement guarded by this test is not visible in this view. */
177 if (cpu_to_node[i] != NUMA_NO_NODE)
179 numa_set_node(i, rr);
/* Advance to the next online node, wrapping to the first one when next_node() yields MAX_NUMNODES. */
180 rr = next_node(rr, node_online_map);
181 if (rr == MAX_NUMNODES)
182 rr = first_node(node_online_map);
187 #ifdef CONFIG_NUMA_EMU
/* Number of emulated nodes; assigned from the "fake=" option in numa_setup(). */
188 int numa_fake __initdata = 0;
/*
 * Splits the range start_pfn..end_pfn into numa_fake emulated nodes,
 * computes the memnodemap hash shift for them and sets up the bootmem
 * allocator for each online node.
 */
191 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
194 struct node nodes[MAX_NUMNODES];
/* Bytes per emulated node. Divides by numa_fake, so it must be non-zero; the visible caller tests numa_fake first. */
195 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
197 /* Kludge needed for the hash function */
/* NOTE(review): hweight64(sz) > 1 presumably means sz has more than one bit set, i.e. is not a power of two -- confirm. */
198 if (hweight64(sz) > 1) {
200 while ((x << 1) < sz)
203 printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
207 memset(&nodes,0,sizeof(nodes));
208 for (i = 0; i < numa_fake; i++) {
209 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
/* The last node takes everything that remains up to end_pfn. */
210 if (i == numa_fake-1)
211 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
212 nodes[i].end = nodes[i].start + sz;
213 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
215 nodes[i].start, nodes[i].end,
216 (nodes[i].end - nodes[i].start) >> 20);
219 memnode_shift = compute_hash_shift(nodes, numa_fake);
/* A negative shift takes the failure branch below. */
220 if (memnode_shift < 0) {
222 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
/* Set up the bootmem allocator for every online node using its emulated range. */
225 for_each_online_node(i)
226 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
/*
 * Tests the configured NUMA discovery methods in order (emulation, ACPI,
 * K8); the statements guarded by each test are not visible in this view.
 * The remaining visible code logs a message and sets up node 0 to cover
 * start_pfn..end_pfn.
 */
232 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
236 #ifdef CONFIG_NUMA_EMU
/* Emulation is attempted only when numa_fake is non-zero. */
237 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
241 #ifdef CONFIG_ACPI_NUMA
/* Both hardware scans are bypassed when numa_off is set. */
242 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
243 end_pfn << PAGE_SHIFT))
247 #ifdef CONFIG_K8_NUMA
248 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
/* Log whether NUMA was turned off or simply not found. */
251 printk(KERN_INFO "%s\n",
252 numa_off ? "NUMA turned off" : "No NUMA configuration found");
254 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
255 start_pfn << PAGE_SHIFT,
256 end_pfn << PAGE_SHIFT);
257 /* setup dummy node covering all memory */
260 nodes_clear(node_online_map);
262 for (i = 0; i < NR_CPUS; i++)
264 node_to_cpumask[0] = cpumask_of_cpu(0);
/* Node 0 gets the whole range; setup_node_bootmem() also marks it online. */
265 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
/*
 * Sets the bit for cpu in the node_to_cpumask[] entry of the node that
 * cpu_to_node(cpu) yields.
 * NOTE(review): cpu_to_node is used in call form here, while this file also
 * defines an array of that name; the call-form definition is not visible
 * in this view.
 */
268 __cpuinit void numa_add_cpu(int cpu)
270 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
/*
 * Records node as the node of cpu in two places: the nodenumber field
 * reached through cpu_pda(cpu) and the cpu_to_node[] table.
 */
273 void __cpuinit numa_set_node(int cpu, int node)
275 cpu_pda(cpu)->nodenumber = node;
276 cpu_to_node[cpu] = node;
/*
 * Calls free_all_bootmem_node() for every online node and accumulates the
 * values it returns in pages.
 */
279 unsigned long __init numa_free_all_bootmem(void)
282 unsigned long pages = 0;
283 for_each_online_node(i) {
284 pages += free_all_bootmem_node(NODE_DATA(i));
289 #ifdef CONFIG_SPARSEMEM
/*
 * Calls memory_present() for every online node with that node's start and
 * end page frame numbers.
 */
290 static void __init arch_sparse_init(void)
294 for_each_online_node(i)
295 memory_present(i, node_start_pfn(i), node_end_pfn(i));
/* No-op variant of arch_sparse_init(). */
300 #define arch_sparse_init() do {} while (0)
/* Iterates over every online node; the loop body is not visible in this view. */
303 void __init paging_init(void)
309 for_each_online_node(i) {
/*
 * Parses the option string opt. The visible tests recognise the prefixes
 * "off", "fake=" (under CONFIG_NUMA_EMU) and "noacpi" (under
 * CONFIG_ACPI_NUMA). Each comparison uses strncmp() with the literal's
 * length, so any option that merely starts with the keyword matches.
 */
315 __init int numa_setup(char *opt)
317 if (!strncmp(opt,"off",3))
319 #ifdef CONFIG_NUMA_EMU
320 if(!strncmp(opt, "fake=", 5)) {
/* Base 0 is passed to simple_strtoul(); the end pointer is not requested, so trailing characters are not examined here. */
321 numa_fake = simple_strtoul(opt+5,NULL,0); ;
/* Clamp the requested node count to MAX_NUMNODES. */
322 if (numa_fake >= MAX_NUMNODES)
323 numa_fake = MAX_NUMNODES;
326 #ifdef CONFIG_ACPI_NUMA
327 if (!strncmp(opt,"noacpi",6))
334 * Setup early cpu_to_node.
336 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
337 * and apicid_to_node[] tables have valid entries for a CPU.
338 * This means we skip cpu_to_node[] initialisation for NUMA
339 * emulation and faking node case (when running a kernel compiled
340 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
341 * is already initialized in a round robin manner at numa_init_array,
342 * prior to this call, and this initialization is good enough
343 * for the fake NUMA cases.
/*
 * For each of the NR_CPUS slots, looks up the CPU's APIC id and copies the
 * matching apicid_to_node[] entry into cpu_to_node[].
 */
345 void __init init_cpu_to_node(void)
348 for (i = 0; i < NR_CPUS; i++) {
349 u8 apicid = x86_cpu_to_apicid[i];
/* The statements guarded by the two tests below are not visible in this view. */
350 if (apicid == BAD_APICID)
352 if (apicid_to_node[apicid] == NUMA_NO_NODE)
/* NOTE(review): writes cpu_to_node[] directly, so the nodenumber field that numa_set_node() also updates is not touched by this line. */
354 cpu_to_node[i] = apicid_to_node[apicid];
/* Symbols exported from this file. The definition of memnode_shift is not visible in this view. */
358 EXPORT_SYMBOL(cpu_to_node);
359 EXPORT_SYMBOL(node_to_cpumask);
360 EXPORT_SYMBOL(memnode_shift);
361 EXPORT_SYMBOL(memnodemap);
362 EXPORT_SYMBOL(node_data);
364 #ifdef CONFIG_DISCONTIGMEM
366 * Functions to convert PFNs from/to per node page addresses.
367 * These are out of line because they are quite big.
368 * They could be all tuned by pre caching more state.
372 /* Requires pfn_valid(pfn) to be true */
/*
 * Returns the struct page for pfn: finds the owning node with
 * phys_to_nid(), then indexes that node's node_mem_map by the pfn's offset
 * from the node's first pfn.
 */
373 struct page *pfn_to_page(unsigned long pfn)
375 int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
376 return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
378 EXPORT_SYMBOL(pfn_to_page);
/*
 * Returns the page frame number of page: its index within its zone's
 * zone_mem_map plus that zone's zone_start_pfn.
 */
380 unsigned long page_to_pfn(struct page *page)
382 return (long)(((page) - page_zone(page)->zone_mem_map) +
383 page_zone(page)->zone_start_pfn);
385 EXPORT_SYMBOL(page_to_pfn);
/*
 * Checks whether pfn falls inside the pfn range of the node that
 * pfn_to_nid() maps it to.
 */
387 int pfn_valid(unsigned long pfn)
/* pfns at or beyond num_physpages are tested first; the guarded statement is not visible in this view. */
390 if (pfn >= num_physpages)
392 nid = pfn_to_nid(pfn);
/* Non-zero only when node_start_pfn(nid) <= pfn < node_end_pfn(nid). */
395 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
397 EXPORT_SYMBOL(pfn_valid);