2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
16 #include <asm/proto.h>
/*
 * Global NUMA bookkeeping state.
 * NOTE(review): this listing has elided lines (embedded numbering skips),
 * so the closing "};" of the two designated-initializer arrays below is
 * not visible here.
 */
/* Per-node pg_data_t pointers; NODE_DATA(n) resolves through this array. */
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
/* Per-node bootmem descriptors handed to init_bootmem_node() below. */
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/* Physical-address -> node-id hash table, indexed by (addr >> memnode_shift). */
29 u8 memnodemap[NODEMAPSIZE];
/* CPU -> node map; all entries start as NUMA_NO_NODE until numa_set_node(). */
31 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
/* Local-APIC-id -> node map, filled in by firmware table parsing elsewhere. */
34 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
/* Reverse map: the set of CPUs attached to each node (see numa_add_cpu()). */
37 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
/* Set from the "numa=off" boot option in numa_setup(). */
39 int numa_off __initdata;
43 * Given a shift value, try to populate memnodemap[]
46 * 0 if memnodemap[] too small (or shift too small)
47 * -1 if node overlap or lost ram (shift too big)
/*
 * Fill memnodemap[] so that each (1 << shift)-sized chunk of the physical
 * address space maps to the index of the node that covers it.  Used by
 * compute_hash_shift() to probe candidate shift values.
 * NOTE(review): interior lines are elided in this listing — the loop's
 * "end" handling, the shift-too-big check, and the return paths are not
 * visible, so the exact return convention cannot be confirmed from here.
 */
49 static int __init populate_memnodemap(
50 const struct node *nodes, int numnodes, int shift)
54 unsigned long addr, end;
/* Start from "no node" (0xff) everywhere, then stamp in each node's range. */
56 memset(memnodemap, 0xff, sizeof(memnodemap));
57 for (i = 0; i < numnodes; i++) {
58 addr = nodes[i].start;
/* Table too small for this shift: the node's end would index past it. */
62 if ((end >> shift) >= NODEMAPSIZE)
/* Slot already claimed by another node => ranges overlap at this shift. */
65 if (memnodemap[addr >> shift] != 0xff)
67 memnodemap[addr >> shift] = i;
/*
 * Find the smallest hash shift at which every node's memory range maps
 * cleanly into memnodemap[]: keep growing the shift while the probe still
 * reports a problem, then populate the table once more at the chosen
 * shift.  Result is stored by the caller in memnode_shift.
 * NOTE(review): the loop body, the final return, and the error return
 * value are elided in this listing.
 */
75 int __init compute_hash_shift(struct node *nodes, int numnodes)
/* Probe increasing shifts until populate_memnodemap() stops complaining. */
79 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
82 printk(KERN_DEBUG "Using %d for the hash shift.\n",
/* Final fill at the selected shift; failure here means memory layout
 * cannot be represented with the compile-time NODEMAPSIZE. */
85 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
87 "Your memory is not aligned you need to rebuild your kernel "
88 "with a bigger NODEMAPSIZE shift=%d\n",
95 #ifdef CONFIG_SPARSEMEM
/*
 * Early (boot-time) pfn -> node lookup for sparsemem: convert the pfn to
 * a physical address and resolve it through the memnodemap hash.
 */
96 int early_pfn_to_nid(unsigned long pfn)
98 return phys_to_nid(pfn << PAGE_SHIFT);
102 /* Initialize bootmem allocator for a node */
/*
 * Carve the node's pg_data_t and its bootmem bitmap out of the node's own
 * memory (via find_e820_area), initialize the per-node bootmem allocator
 * over [start, end), free the node's usable e820 ranges into it, and
 * reserve the two metadata areas so they are never handed out.  Marks the
 * node online on success.  Panics if either allocation cannot be placed.
 * NOTE(review): some interior lines are elided in this listing (e.g. the
 * trailing arguments of init_bootmem_node()).
 */
103 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
105 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
106 unsigned long nodedata_phys;
/* pg_data_t is rounded to whole pages so it can be page-reserved below. */
107 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
109 start = round_up(start, ZONE_ALIGN);
111 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
113 start_pfn = start >> PAGE_SHIFT;
114 end_pfn = end >> PAGE_SHIFT;
/* Place the node's own pg_data_t inside the node's memory. */
116 nodedata_phys = find_e820_area(start, end, pgdat_size);
117 if (nodedata_phys == -1L)
118 panic("Cannot find memory pgdat in node %d\n", nodeid);
120 Dprintk("nodedata_phys %lx\n", nodedata_phys);
122 node_data[nodeid] = phys_to_virt(nodedata_phys);
123 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
124 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
125 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
126 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
128 /* Find a place for the bootmem map */
129 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
/* Search starts just past the pg_data_t so the two areas don't collide. */
130 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
131 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
132 if (bootmap_start == -1L)
133 panic("Not enough continuous space for bootmap on node %d", nodeid);
134 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
136 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
137 bootmap_start >> PAGE_SHIFT,
/* Hand all usable e820 memory in this range to the node's bootmem pool. */
140 e820_bootmem_free(NODE_DATA(nodeid), start, end);
/* Keep the allocator from reusing the pg_data_t and the bitmap itself. */
142 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
143 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
144 node_set_online(nodeid);
147 /* Initialize final allocator for a zone */
/*
 * Compute the per-zone sizes and holes for one node's pfn range and hand
 * them to free_area_init_node() to build the node's zone structures.
 * NOTE(review): the trailing arguments of free_area_init_node() are
 * elided in this listing.
 */
148 void __init setup_node_zones(int nodeid)
150 unsigned long start_pfn, end_pfn;
151 unsigned long zones[MAX_NR_ZONES];
152 unsigned long holes[MAX_NR_ZONES];
154 start_pfn = node_start_pfn(nodeid);
155 end_pfn = node_end_pfn(nodeid);
157 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
158 nodeid, start_pfn, end_pfn);
/* size_zones() splits [start_pfn, end_pfn) into DMA/Normal/... spans. */
160 size_zones(zones, holes, start_pfn, end_pfn);
161 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
/*
 * Give every CPU that still has no node assignment (NUMA_NO_NODE) a node,
 * round-robining over the currently online nodes.  Needed because some
 * boards attach all memory to one CPU, which would otherwise leave CPUs
 * without a node before the real topology is known.
 */
165 void __init numa_init_array(void)
168 /* There are unfortunately some poorly designed mainboards around
169 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
170 mapping. To avoid this fill in the mapping for all possible
171 CPUs, as the number of CPUs is not known yet.
172 We round robin the existing nodes. */
173 rr = first_node(node_online_map);
174 for (i = 0; i < NR_CPUS; i++) {
/* CPUs that already got a node (e.g. from SRAT parsing) are skipped. */
175 if (cpu_to_node[i] != NUMA_NO_NODE)
177 numa_set_node(i, rr);
/* Wrap the round-robin cursor when it runs off the online-node set. */
178 rr = next_node(rr, node_online_map);
179 if (rr == MAX_NUMNODES)
180 rr = first_node(node_online_map);
185 #ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via the "numa=fake=N" boot option. */
186 int numa_fake __initdata = 0;
/*
 * NUMA emulation for testing: split [start_pfn, end_pfn) into numa_fake
 * equal-sized fake nodes, build the memnode hash over them, and set up
 * bootmem for each.  The per-node size is rounded to a power of two so
 * the hash function can represent the boundaries; the last node absorbs
 * the remainder.
 * NOTE(review): several interior lines (the rounding assignments, the
 * error-path return, cpu assignment) are elided in this listing.
 */
189 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
192 struct node nodes[MAX_NUMNODES];
193 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
195 /* Kludge needed for the hash function */
/* sz not a power of two => round it down to the largest power of two. */
196 if (hweight64(sz) > 1) {
198 while ((x << 1) < sz)
201 printk("Numa emulation unbalanced. Complain to maintainer\n");
205 memset(&nodes,0,sizeof(nodes));
206 for (i = 0; i < numa_fake; i++) {
207 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
/* Last fake node takes whatever is left up to end_pfn. */
208 if (i == numa_fake-1)
209 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
210 nodes[i].end = nodes[i].start + sz;
211 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
213 nodes[i].start, nodes[i].end,
214 (nodes[i].end - nodes[i].start) >> 20);
217 memnode_shift = compute_hash_shift(nodes, numa_fake);
218 if (memnode_shift < 0) {
220 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
223 for_each_online_node(i)
224 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
/*
 * Top-level NUMA memory init: try each discovery method in order —
 * emulation (numa=fake), ACPI SRAT, then AMD K8 northbridge scan — and
 * if all fail (or numa=off was given), fall back to one dummy node
 * covering all of memory with every CPU mapped to it.
 * NOTE(review): the early-return lines after each successful probe and
 * the dummy-node memnode setup lines are elided in this listing.
 */
230 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
234 #ifdef CONFIG_NUMA_EMU
235 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
239 #ifdef CONFIG_ACPI_NUMA
240 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
241 end_pfn << PAGE_SHIFT))
245 #ifdef CONFIG_K8_NUMA
246 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
/* All probes failed (or NUMA disabled): fall through to the dummy node. */
249 printk(KERN_INFO "%s\n",
250 numa_off ? "NUMA turned off" : "No NUMA configuration found");
252 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
253 start_pfn << PAGE_SHIFT,
254 end_pfn << PAGE_SHIFT);
255 /* setup dummy node covering all memory */
/* Discard any partial node state a failed probe may have left behind. */
258 nodes_clear(node_online_map);
/* Every CPU belongs to the single dummy node 0. */
260 for (i = 0; i < NR_CPUS; i++)
262 node_to_cpumask[0] = cpumask_of_cpu(0);
263 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
/*
 * Record a CPU in its node's cpumask (the reverse of cpu_to_node).
 * NOTE(review): set_bit() on &cpumask_t relies on cpumask_t's layout —
 * later kernels use cpu_set() here; confirm against the cpumask API of
 * this tree.
 */
266 __cpuinit void numa_add_cpu(int cpu)
268 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
/*
 * Bind a CPU to a node: update both the per-CPU PDA copy (fast path for
 * the CPU itself) and the global cpu_to_node[] table.
 */
271 void __cpuinit numa_set_node(int cpu, int node)
273 cpu_pda[cpu].nodenumber = node;
274 cpu_to_node[cpu] = node;
/*
 * Release every online node's bootmem into the buddy allocator and
 * return the total number of pages freed.
 * NOTE(review): the final return statement is elided in this listing.
 */
277 unsigned long __init numa_free_all_bootmem(void)
280 unsigned long pages = 0;
281 for_each_online_node(i) {
282 pages += free_all_bootmem_node(NODE_DATA(i));
287 #ifdef CONFIG_SPARSEMEM
/*
 * Register each online node's pfn range with sparsemem before
 * paging_init() builds the zones; a no-op when sparsemem is disabled.
 */
288 static void __init arch_sparse_init(void)
292 for_each_online_node(i)
293 memory_present(i, node_start_pfn(i), node_end_pfn(i));
/* !CONFIG_SPARSEMEM: nothing to do. */
298 #define arch_sparse_init() do {} while (0)
/*
 * Final paging setup: after sparsemem registration, build the zone
 * structures for every online node.
 * NOTE(review): almost the entire body is elided in this listing — only
 * the per-node loop header is visible; presumably it calls
 * setup_node_zones() per node (confirm against the full file).
 */
301 void __init paging_init(void)
307 for_each_online_node(i) {
/*
 * Parse "numa=" boot options: "off" disables NUMA, "fake=N" enables
 * emulation with N nodes (clamped to MAX_NUMNODES), and "noacpi"
 * disables ACPI SRAT scanning.
 * NOTE(review): the option side-effect lines (numa_off = 1, acpi_numa
 * disable, return) are elided in this listing; also note the stray
 * second ';' after simple_strtoul() — harmless but sloppy.
 */
313 __init int numa_setup(char *opt)
315 if (!strncmp(opt,"off",3))
317 #ifdef CONFIG_NUMA_EMU
318 if(!strncmp(opt, "fake=", 5)) {
319 numa_fake = simple_strtoul(opt+5,NULL,0); ;
/* Clamp so the fake-node arrays in numa_emulation() cannot overflow. */
320 if (numa_fake >= MAX_NUMNODES)
321 numa_fake = MAX_NUMNODES;
324 #ifdef CONFIG_ACPI_NUMA
325 if (!strncmp(opt,"noacpi",6))
/* Export the NUMA topology tables and hash parameters to modules. */
331 EXPORT_SYMBOL(cpu_to_node);
332 EXPORT_SYMBOL(node_to_cpumask);
333 EXPORT_SYMBOL(memnode_shift);
334 EXPORT_SYMBOL(memnodemap);
335 EXPORT_SYMBOL(node_data);