2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
16 #include <asm/proto.h>
/* Per-node bookkeeping for the x86-64 NUMA layer. */
/* One pg_data_t pointer per node; filled in by setup_node_bootmem(). */
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
/* Static bootmem descriptors, one per node, used before the page allocator is up. */
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/* Physical-address -> node lookup table, indexed by (addr >> memnode_shift). */
29 u8 memnodemap[NODEMAPSIZE];
/* CPU -> node mapping; every CPU starts out unassigned (NUMA_NO_NODE). */
31 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
/* Local APIC id -> node mapping, filled in by the platform NUMA parsers. */
34 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
/* Reverse mapping: the set of CPUs that belong to each node. */
37 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
/* Non-zero when NUMA was disabled on the command line (see numa_setup()). */
39 int numa_off __initdata;
43 * Given a shift value, try to populate memnodemap[]
46 * 0 if memnodemap[] too small (or shift too small)
47 * -1 if node overlap or lost ram (shift too big)
49 static int __init populate_memnodemap(
50 const struct node *nodes, int numnodes, int shift)
54 unsigned long addr, end;
/* 0xff marks "no node"; start with the whole map unpopulated. */
56 memset(memnodemap, 0xff, sizeof(memnodemap));
57 for (i = 0; i < numnodes; i++) {
58 addr = nodes[i].start;
/* Map index would overflow: this shift is too small for the address range. */
62 if ((end >> shift) >= NODEMAPSIZE)
/* Slot already claimed by another node: ranges collide at this shift. */
65 if (memnodemap[addr >> shift] != 0xff)
67 memnodemap[addr >> shift] = i;
/*
 * Pick a memnode_shift value such that populate_memnodemap() can map all
 * nodes without collisions, and report it.  Grows the shift while larger
 * shifts still populate successfully — NOTE(review): the full return
 * contract of populate_memnodemap() is elided here; confirm against the
 * complete source.
 */
75 int __init compute_hash_shift(struct node *nodes, int numnodes)
79 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
82 printk(KERN_DEBUG "Using %d for the hash shift.\n",
/* Final sanity check: the chosen shift must map every node exactly. */
85 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
87 "Your memory is not aligned you need to rebuild your kernel "
88 "with a bigger NODEMAPSIZE shift=%d\n",
95 #ifdef CONFIG_SPARSEMEM
/* Early pfn -> node lookup: convert to a physical address and consult the
   memnodemap hash (phys_to_nid). Used before sparsemem is fully set up. */
96 int early_pfn_to_nid(unsigned long pfn)
98 return phys_to_nid(pfn << PAGE_SHIFT);
102 /* Initialize bootmem allocator for a node */
/*
 * Carve the node's pg_data_t and its bootmem bitmap out of the node's own
 * memory range (via the e820 map), hand the remaining usable memory to the
 * bootmem allocator, and mark the node online.
 */
103 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
105 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
106 unsigned long nodedata_phys;
107 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
/* Node memory must begin on a zone-aligned boundary. */
109 start = round_up(start, ZONE_ALIGN);
111 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
113 start_pfn = start >> PAGE_SHIFT;
114 end_pfn = end >> PAGE_SHIFT;
/* Register this span with the sparse memory model. */
116 memory_present(nodeid, start_pfn, end_pfn);
/* Allocate the node's pg_data_t out of the node's own memory range. */
117 nodedata_phys = find_e820_area(start, end, pgdat_size);
118 if (nodedata_phys == -1L)
119 panic("Cannot find memory pgdat in node %d\n", nodeid);
121 Dprintk("nodedata_phys %lx\n", nodedata_phys);
123 node_data[nodeid] = phys_to_virt(nodedata_phys);
124 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
125 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
126 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
127 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
129 /* Find a place for the bootmem map */
130 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
/* Prefer to put the bootmap directly after the pg_data_t. */
131 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
132 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
133 if (bootmap_start == -1L)
134 panic("Not enough continuous space for bootmap on node %d", nodeid);
135 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
137 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
138 bootmap_start >> PAGE_SHIFT,
/* Free every usable (e820) region in the range into the bootmem allocator. */
141 e820_bootmem_free(NODE_DATA(nodeid), start, end);
/* Then re-reserve the pieces we allocated by hand above. */
143 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
144 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
145 node_set_online(nodeid);
148 /* Initialize final allocator for a zone */
/*
 * Compute the per-zone sizes and holes for this node's pfn span, then build
 * the node's free-area lists via free_area_init_node().
 */
149 void __init setup_node_zones(int nodeid)
151 unsigned long start_pfn, end_pfn;
152 unsigned long zones[MAX_NR_ZONES];
153 unsigned long holes[MAX_NR_ZONES];
155 start_pfn = node_start_pfn(nodeid);
156 end_pfn = node_end_pfn(nodeid);
158 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
159 nodeid, start_pfn, end_pfn);
/* Split the span into zone sizes/holes, then initialize the zone lists. */
161 size_zones(zones, holes, start_pfn, end_pfn);
162 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
/* Give every CPU that has no node yet a home, round-robin over online nodes. */
166 void __init numa_init_array(void)
169 /* There are unfortunately some poorly designed mainboards around
170 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
171 mapping. To avoid this fill in the mapping for all possible
172 CPUs, as the number of CPUs is not known yet.
173 We round robin the existing nodes. */
174 rr = first_node(node_online_map);
175 for (i = 0; i < NR_CPUS; i++) {
/* Skip CPUs whose node is already known. */
176 if (cpu_to_node[i] != NUMA_NO_NODE)
178 numa_set_node(i, rr);
/* Advance to the next online node, wrapping at the end of the mask. */
179 rr = next_node(rr, node_online_map);
180 if (rr == MAX_NUMNODES)
181 rr = first_node(node_online_map);
186 #ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "numa=fake=N"; 0 means disabled. */
187 int numa_fake __initdata = 0;
/*
 * Split [start_pfn, end_pfn) into numa_fake equal-sized fake nodes for NUMA
 * emulation, then set up the node hash and per-node bootmem.
 */
190 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
193 struct node nodes[MAX_NUMNODES];
194 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
196 /* Kludge needed for the hash function */
/* Round the node size down to a power of two so the memnodemap hash can
   represent the boundaries — NOTE(review): rounding lines are elided here. */
197 if (hweight64(sz) > 1) {
199 while ((x << 1) < sz)
202 printk("Numa emulation unbalanced. Complain to maintainer\n");
206 memset(&nodes,0,sizeof(nodes));
207 for (i = 0; i < numa_fake; i++) {
208 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
/* The last fake node absorbs whatever memory is left over. */
209 if (i == numa_fake-1)
210 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
211 nodes[i].end = nodes[i].start + sz;
212 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
214 nodes[i].start, nodes[i].end,
215 (nodes[i].end - nodes[i].start) >> 20);
/* Build the phys-addr -> node hash over the fake layout. */
218 memnode_shift = compute_hash_shift(nodes, numa_fake);
219 if (memnode_shift < 0) {
221 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
224 for_each_online_node(i)
225 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
/*
 * Top-level NUMA memory init: try each discovery method in turn (emulation,
 * ACPI SRAT, K8 northbridge); if all fail or NUMA is off, fall back to a
 * single dummy node covering all memory.
 */
231 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
235 #ifdef CONFIG_NUMA_EMU
236 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
240 #ifdef CONFIG_ACPI_NUMA
241 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
242 end_pfn << PAGE_SHIFT))
246 #ifdef CONFIG_K8_NUMA
247 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
250 printk(KERN_INFO "%s\n",
251 numa_off ? "NUMA turned off" : "No NUMA configuration found");
253 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
254 start_pfn << PAGE_SHIFT,
255 end_pfn << PAGE_SHIFT);
256 /* setup dummy node covering all memory */
259 nodes_clear(node_online_map);
/* Every CPU maps onto the single dummy node 0. */
261 for (i = 0; i < NR_CPUS; i++)
263 node_to_cpumask[0] = cpumask_of_cpu(0);
264 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
/* Record this CPU in its node's cpumask; cpu_to_node must be valid first. */
267 __cpuinit void numa_add_cpu(int cpu)
269 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
/* Set the CPU -> node mapping in both the per-CPU PDA and cpu_to_node[]. */
272 void __cpuinit numa_set_node(int cpu, int node)
274 cpu_pda[cpu].nodenumber = node;
275 cpu_to_node[cpu] = node;
/* Release every online node's bootmem into the page allocator.
   Returns the total number of pages freed across all nodes. */
278 unsigned long __init numa_free_all_bootmem(void)
281 unsigned long pages = 0;
282 for_each_online_node(i) {
283 pages += free_all_bootmem_node(NODE_DATA(i));
/* Finish paging setup: iterate over all online nodes — NOTE(review): the
   loop body is elided in this listing; presumably sets up each node's zones. */
288 void __init paging_init(void)
291 for_each_online_node(i) {
/* Parse the "numa=" kernel command-line option ("off", "fake=N", "noacpi"). */
297 __init int numa_setup(char *opt)
299 if (!strncmp(opt,"off",3))
301 #ifdef CONFIG_NUMA_EMU
302 if(!strncmp(opt, "fake=", 5)) {
/* NOTE(review): stray empty statement ';' after the call — harmless, but
   should be removed. */
303 numa_fake = simple_strtoul(opt+5,NULL,0); ;
/* Clamp the request to the maximum number of nodes the kernel supports. */
304 if (numa_fake >= MAX_NUMNODES)
305 numa_fake = MAX_NUMNODES;
308 #ifdef CONFIG_ACPI_NUMA
309 if (!strncmp(opt,"noacpi",6))
/* Exported for modules that need CPU/physical-address -> node translation. */
315 EXPORT_SYMBOL(cpu_to_node);
316 EXPORT_SYMBOL(node_to_cpumask);
317 EXPORT_SYMBOL(memnode_shift);
318 EXPORT_SYMBOL(memnodemap);
319 EXPORT_SYMBOL(node_data);