/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
16 #include <asm/proto.h>
/* Per-node pg_data_t pointers, one slot per possible NUMA node;
 * populated by setup_node_bootmem() below. */
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
/* Boot-time (bootmem) allocator state, one per possible node;
 * hooked into NODE_DATA(nodeid)->bdata in setup_node_bootmem(). */
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/* (phys addr >> memnode_shift) -> node id lookup table; 0xff means
 * "no node". Built by compute_hash_shift(). */
29 u8 memnodemap[NODEMAPSIZE];
/* CPU -> node mapping; every CPU starts out unassigned and is filled
 * in round-robin by numa_init_array() if firmware did not set it. */
31 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
/* Local APIC id -> node mapping, unassigned by default.
 * NOTE(review): the writer of this table is not visible in this extract. */
34 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
/* Node -> mask of CPUs resident on that node; updated by numa_add_cpu(). */
37 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
/* Non-zero disables NUMA setup; presumably set from the "numa=off" boot
 * option handled in numa_setup() — the assignment line is missing here. */
39 int numa_off __initdata;
/*
 * compute_hash_shift - pick a shift so every node's physical range maps
 * to distinct memnodemap[] slots, then fill the map with owning node ids.
 *
 * NOTE(review): this extract is missing interior lines (declarations of
 * i/shift, loop bodies, the return paths); do not edit from this view
 * alone — consult the complete file.
 */
41 int __init compute_hash_shift(struct node *nodes, int numnodes)
45 unsigned long addr,maxend=0;
/* Find the highest end address among non-empty nodes. */
47 for (i = 0; i < numnodes; i++)
48 if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
49 maxend = nodes[i].end;
/* Grow the shift until NODEMAPSIZE slots of (1UL << shift) bytes each
 * cover [0, maxend). */
51 while ((1UL << shift) < (maxend / NODEMAPSIZE))
54 printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
/* 0xff marks every slot as "no node" before population. */
56 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
57 for (i = 0; i < numnodes; i++) {
/* Skip empty nodes. */
58 if (nodes[i].start == nodes[i].end)
/* Claim one map slot per (1UL << shift)-byte chunk of this node. */
60 for (addr = nodes[i].start;
62 addr += (1UL << shift)) {
/* A slot already owned by another node means node ranges are not
 * aligned to this granularity — the map cannot represent them. */
63 if (memnodemap[addr >> shift] != 0xff) {
65 "Your memory is not aligned you need to rebuild your kernel "
66 "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
70 memnodemap[addr >> shift] = i;
76 #ifdef CONFIG_SPARSEMEM
/* Early pfn -> node lookup (before sparsemem structures exist),
 * resolved through the phys-addr hash built by compute_hash_shift().
 * NOTE(review): the closing lines of this function are missing here. */
77 int early_pfn_to_nid(unsigned long pfn)
79 return phys_to_nid(pfn << PAGE_SHIFT);
83 /* Initialize bootmem allocator for a node */
/*
 * Carves the node's pg_data_t and its bootmem bitmap out of the node's
 * own memory (via the e820 map), initializes the bootmem allocator for
 * [start, end) and marks the node online.
 * NOTE(review): several lines (braces, init_bootmem_node() trailing
 * arguments) are missing from this extract.
 */
84 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
86 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
87 unsigned long nodedata_phys;
/* pg_data_t lives in node-local memory; reserve a page-aligned chunk. */
88 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
90 start = round_up(start, ZONE_ALIGN);
92 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
94 start_pfn = start >> PAGE_SHIFT;
95 end_pfn = end >> PAGE_SHIFT;
/* Tell sparsemem which pfns this node owns. */
97 memory_present(nodeid, start_pfn, end_pfn);
/* Find free physical memory inside the node for the pg_data_t itself;
 * find_e820_area() returns -1L on failure. */
98 nodedata_phys = find_e820_area(start, end, pgdat_size);
99 if (nodedata_phys == -1L)
100 panic("Cannot find memory pgdat in node %d\n", nodeid);
102 Dprintk("nodedata_phys %lx\n", nodedata_phys);
104 node_data[nodeid] = phys_to_virt(nodedata_phys);
105 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
106 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
107 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
108 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
110 /* Find a place for the bootmem map */
111 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
/* Search starts just past the pg_data_t so the two areas don't overlap. */
112 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
113 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
114 if (bootmap_start == -1L)
115 panic("Not enough continuous space for bootmap on node %d", nodeid);
116 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
118 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
119 bootmap_start >> PAGE_SHIFT,
/* Free everything the e820 map says is usable in [start, end)... */
122 e820_bootmem_free(NODE_DATA(nodeid), start, end);
/* ...then re-reserve the pg_data_t and the bootmap we just placed there. */
124 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
125 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
126 node_set_online(nodeid);
129 /* Initialize final allocator for a zone */
/*
 * Computes per-zone sizes and holes for the node's pfn range and hands
 * them to free_area_init_node() to build the page allocator structures.
 * NOTE(review): the trailing arguments of free_area_init_node() and the
 * function's closing lines are missing from this extract.
 */
130 void __init setup_node_zones(int nodeid)
132 unsigned long start_pfn, end_pfn;
133 unsigned long zones[MAX_NR_ZONES];
134 unsigned long holes[MAX_NR_ZONES];
136 start_pfn = node_start_pfn(nodeid);
137 end_pfn = node_end_pfn(nodeid);
139 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
140 nodeid, start_pfn, end_pfn);
141 size_zones(zones, holes, start_pfn, end_pfn);
/*
 * Assign a node, round-robin over the online nodes, to every CPU that
 * firmware left without one, so cpu_to_node[] is always valid.
 * NOTE(review): the declarations of i/rr and the assignment of rr into
 * cpu_to_node[i] are in lines missing from this extract.
 */
147 void __init numa_init_array(void)
150 /* There are unfortunately some poorly designed mainboards around
151 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
152 mapping. To avoid this fill in the mapping for all possible
153 CPUs, as the number of CPUs is not known yet.
154 We round robin the existing nodes. */
155 rr = first_node(node_online_map);
156 for (i = 0; i < NR_CPUS; i++) {
/* Leave CPUs that already have a node untouched. */
157 if (cpu_to_node[i] != NUMA_NO_NODE)
/* Advance round-robin; wrap when we run off the end of the node map. */
160 rr = next_node(rr, node_online_map);
161 if (rr == MAX_NUMNODES)
162 rr = first_node(node_online_map);
167 #ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "numa=fake=N"; 0 = emulation off. */
168 int numa_fake __initdata = 0;
/*
 * Split [start_pfn, end_pfn) into numa_fake equally sized fake nodes for
 * NUMA emulation on non-NUMA hardware.
 * NOTE(review): many interior lines (the power-of-two rounding of sz,
 * memnode_shift assignment lines, return statements) are missing from
 * this extract.
 */
171 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
174 struct node nodes[MAX_NUMNODES];
/* Naive equal split of the whole range, in bytes. */
175 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
177 /* Kludge needed for the hash function */
/* More than one bit set => sz is not a power of two; the (missing)
 * lines below apparently round it down so compute_hash_shift() works. */
178 if (hweight64(sz) > 1) {
180 while ((x << 1) < sz)
183 printk("Numa emulation unbalanced. Complain to maintainer\n");
187 memset(&nodes,0,sizeof(nodes));
188 for (i = 0; i < numa_fake; i++) {
189 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
/* The last fake node absorbs the remainder of the range. */
190 if (i == numa_fake-1)
191 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
192 nodes[i].end = nodes[i].start + sz;
193 if (i != numa_fake-1)
195 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
197 nodes[i].start, nodes[i].end,
198 (nodes[i].end - nodes[i].start) >> 20);
/* Build the phys->node hash over the fake layout; < 0 means no usable
 * shift was found and emulation is abandoned. */
201 memnode_shift = compute_hash_shift(nodes, numa_fake);
202 if (memnode_shift < 0) {
204 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
207 for_each_online_node(i)
208 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
/*
 * Top-level NUMA discovery: try emulation, then ACPI SRAT, then the K8
 * northbridge registers; if all fail (or numa_off), fall back to a
 * single dummy node covering all memory.
 * NOTE(review): the early-return lines after each successful probe, the
 * #endif lines, and several fallback assignments are missing from this
 * extract.
 */
214 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
218 #ifdef CONFIG_NUMA_EMU
219 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
223 #ifdef CONFIG_ACPI_NUMA
224 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
225 end_pfn << PAGE_SHIFT))
229 #ifdef CONFIG_K8_NUMA
230 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
/* All probes failed or NUMA was disabled — report which. */
233 printk(KERN_INFO "%s\n",
234 numa_off ? "NUMA turned off" : "No NUMA configuration found");
236 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
237 start_pfn << PAGE_SHIFT,
238 end_pfn << PAGE_SHIFT);
239 /* setup dummy node covering all memory */
242 nodes_clear(node_online_map);
/* Point every possible CPU at the single dummy node 0.
 * (The loop body line is missing from this extract.) */
244 for (i = 0; i < NR_CPUS; i++)
246 node_to_cpumask[0] = cpumask_of_cpu(0);
247 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
/* Record a newly plugged CPU in its node's CPU mask. */
250 __cpuinit void numa_add_cpu(int cpu)
252 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
/*
 * Release every online node's bootmem into the page allocator and return
 * the total page count.
 * NOTE(review): the loop tail and return statement are missing here.
 */
255 unsigned long __init numa_free_all_bootmem(void)
258 unsigned long pages = 0;
259 for_each_online_node(i) {
260 pages += free_all_bootmem_node(NODE_DATA(i));
/* Set up the final page allocator zones for each online node.
 * NOTE(review): the loop body is missing from this extract — presumably
 * it calls setup_node_zones(i); confirm against the full file. */
265 void __init paging_init(void)
268 for_each_online_node(i) {
/*
 * Parse the "numa=" early boot parameter: "off" disables NUMA,
 * "fake=N" (with CONFIG_NUMA_EMU) enables N-node emulation, and
 * "noacpi" (with CONFIG_ACPI_NUMA) skips SRAT parsing.
 * NOTE(review): the action lines for "off"/"noacpi", the #endif lines
 * and the return statement are missing from this extract.
 */
274 __init int numa_setup(char *opt)
276 if (!strncmp(opt,"off",3))
278 #ifdef CONFIG_NUMA_EMU
279 if(!strncmp(opt, "fake=", 5)) {
280 numa_fake = simple_strtoul(opt+5,NULL,0); ;
/* Clamp the requested fake-node count to what the kernel supports. */
281 if (numa_fake >= MAX_NUMNODES)
282 numa_fake = MAX_NUMNODES;
285 #ifdef CONFIG_ACPI_NUMA
286 if (!strncmp(opt,"noacpi",6))
/* NUMA topology tables exported for use by modules. */
292 EXPORT_SYMBOL(cpu_to_node);
293 EXPORT_SYMBOL(node_to_cpumask);
294 EXPORT_SYMBOL(memnode_shift);
295 EXPORT_SYMBOL(memnodemap);
296 EXPORT_SYMBOL(node_data);