/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
29 u8 memnodemap[NODEMAPSIZE];
31 unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
32 unsigned char apicid_to_node[256] __cpuinitdata = {
33 [0 ... NR_CPUS-1] = NUMA_NO_NODE
35 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
37 int numa_off __initdata;
39 int __init compute_hash_shift(struct node *nodes, int numnodes)
43 unsigned long addr,maxend=0;
45 for (i = 0; i < numnodes; i++)
46 if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
47 maxend = nodes[i].end;
49 while ((1UL << shift) < (maxend / NODEMAPSIZE))
52 printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
54 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
55 for (i = 0; i < numnodes; i++) {
56 if (nodes[i].start == nodes[i].end)
58 for (addr = nodes[i].start;
60 addr += (1UL << shift)) {
61 if (memnodemap[addr >> shift] != 0xff) {
63 "Your memory is not aligned you need to rebuild your kernel "
64 "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
68 memnodemap[addr >> shift] = i;
#ifdef CONFIG_SPARSEMEM
/* Boot-time pfn -> node lookup, via the physical-address node hash. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long paddr = pfn << PAGE_SHIFT;

	return phys_to_nid(paddr);
}
#endif
81 /* Initialize bootmem allocator for a node */
82 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
84 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
85 unsigned long nodedata_phys;
86 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
88 start = round_up(start, ZONE_ALIGN);
90 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
92 start_pfn = start >> PAGE_SHIFT;
93 end_pfn = end >> PAGE_SHIFT;
95 memory_present(nodeid, start_pfn, end_pfn);
96 nodedata_phys = find_e820_area(start, end, pgdat_size);
97 if (nodedata_phys == -1L)
98 panic("Cannot find memory pgdat in node %d\n", nodeid);
100 Dprintk("nodedata_phys %lx\n", nodedata_phys);
102 node_data[nodeid] = phys_to_virt(nodedata_phys);
103 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
104 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
105 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
106 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
108 /* Find a place for the bootmem map */
109 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
110 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
111 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
112 if (bootmap_start == -1L)
113 panic("Not enough continuous space for bootmap on node %d", nodeid);
114 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
116 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
117 bootmap_start >> PAGE_SHIFT,
120 e820_bootmem_free(NODE_DATA(nodeid), start, end);
122 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
123 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
124 node_set_online(nodeid);
127 /* Initialize final allocator for a zone */
128 void __init setup_node_zones(int nodeid)
130 unsigned long start_pfn, end_pfn;
131 unsigned long zones[MAX_NR_ZONES];
132 unsigned long holes[MAX_NR_ZONES];
133 unsigned long dma_end_pfn;
135 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
136 memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
138 start_pfn = node_start_pfn(nodeid);
139 end_pfn = node_end_pfn(nodeid);
141 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
143 /* All nodes > 0 have a zero length zone DMA */
144 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
145 if (start_pfn < dma_end_pfn) {
146 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
147 holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
148 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
149 holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
152 zones[ZONE_NORMAL] = end_pfn - start_pfn;
153 holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
156 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
160 void __init numa_init_array(void)
163 /* There are unfortunately some poorly designed mainboards around
164 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
165 mapping. To avoid this fill in the mapping for all possible
166 CPUs, as the number of CPUs is not known yet.
167 We round robin the existing nodes. */
169 for (i = 0; i < NR_CPUS; i++) {
170 if (cpu_to_node[i] != NUMA_NO_NODE)
172 rr = next_node(rr, node_online_map);
173 if (rr == MAX_NUMNODES)
174 rr = first_node(node_online_map);
179 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "numa=fake=N"; 0 = disabled. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake equally
 * sized fake nodes. Returns 0 on success, -1 when no usable hash
 * shift could be computed for the layout.
 */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: round the node size down
	   to a power of two so chunks line up with node boundaries. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		if (i == numa_fake-1)
			/* Last node absorbs whatever is left over. */
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		if (i != numa_fake-1)
			nodes[i].end--;	/* NOTE(review): reconstructed line — verify against original */
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
229 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
233 #ifdef CONFIG_NUMA_EMU
234 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
238 #ifdef CONFIG_ACPI_NUMA
239 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
240 end_pfn << PAGE_SHIFT))
244 #ifdef CONFIG_K8_NUMA
245 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
248 printk(KERN_INFO "%s\n",
249 numa_off ? "NUMA turned off" : "No NUMA configuration found");
251 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
252 start_pfn << PAGE_SHIFT,
253 end_pfn << PAGE_SHIFT);
254 /* setup dummy node covering all memory */
257 nodes_clear(node_online_map);
259 for (i = 0; i < NR_CPUS; i++)
261 node_to_cpumask[0] = cpumask_of_cpu(0);
262 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
265 __cpuinit void numa_add_cpu(int cpu)
267 /* BP is initialized elsewhere */
269 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
272 unsigned long __init numa_free_all_bootmem(void)
275 unsigned long pages = 0;
276 for_each_online_node(i) {
277 pages += free_all_bootmem_node(NODE_DATA(i));
282 void __init paging_init(void)
285 for_each_online_node(i) {
291 __init int numa_setup(char *opt)
293 if (!strncmp(opt,"off",3))
295 #ifdef CONFIG_NUMA_EMU
296 if(!strncmp(opt, "fake=", 5)) {
297 numa_fake = simple_strtoul(opt+5,NULL,0); ;
298 if (numa_fake >= MAX_NUMNODES)
299 numa_fake = MAX_NUMNODES;
302 #ifdef CONFIG_ACPI_NUMA
303 if (!strncmp(opt,"noacpi",6))
/* Exports for NUMA-aware modules. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);