fd9f25d7a6c4c2d6d3d48c4057a00e2c725ddff0
[pandora-kernel.git] / arch / x86_64 / mm / numa.c
1 /* 
2  * Generic VM initialization for x86-64 NUMA setups.
3  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4  */ 
5 #include <linux/kernel.h>
6 #include <linux/mm.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
14
15 #include <asm/e820.h>
16 #include <asm/proto.h>
17 #include <asm/dma.h>
18 #include <asm/numa.h>
19 #include <asm/acpi.h>
20
/* Debug tracing expands to nothing unless Dprintk was already defined. */
#ifndef Dprintk
#define Dprintk(x...)
#endif
24
25 struct pglist_data *node_data[MAX_NUMNODES];
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28 int memnode_shift;
29 u8  memnodemap[NODEMAPSIZE];
30
31 unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
32 cpumask_t     node_to_cpumask[MAX_NUMNODES];
33
34 int numa_off __initdata;
35
36 int __init compute_hash_shift(struct node *nodes, int numnodes)
37 {
38         int i; 
39         int shift = 24;
40         u64 addr;
41         
42         /* When in doubt use brute force. */
43         while (shift < 48) { 
44                 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); 
45                 for (i = 0; i < numnodes; i++) {
46                         if (nodes[i].start == nodes[i].end) 
47                                 continue;
48                         for (addr = nodes[i].start; 
49                              addr < nodes[i].end; 
50                              addr += (1UL << shift)) {
51                                 if (memnodemap[addr >> shift] != 0xff && 
52                                     memnodemap[addr >> shift] != i) { 
53                                         printk(KERN_INFO 
54                                             "node %d shift %d addr %Lx conflict %d\n", 
55                                                i, shift, addr, memnodemap[addr>>shift]);
56                                         goto next; 
57                                 } 
58                                 memnodemap[addr >> shift] = i; 
59                         } 
60                 } 
61                 return shift; 
62         next:
63                 shift++; 
64         } 
65         memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); 
66         return -1; 
67 }
68
69 /* Initialize bootmem allocator for a node */
70 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
71
72         unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
73         unsigned long nodedata_phys;
74         const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
75
76         start = round_up(start, ZONE_ALIGN); 
77
78         printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
79
80         start_pfn = start >> PAGE_SHIFT;
81         end_pfn = end >> PAGE_SHIFT;
82
83         nodedata_phys = find_e820_area(start, end, pgdat_size); 
84         if (nodedata_phys == -1L) 
85                 panic("Cannot find memory pgdat in node %d\n", nodeid);
86
87         Dprintk("nodedata_phys %lx\n", nodedata_phys); 
88
89         node_data[nodeid] = phys_to_virt(nodedata_phys);
90         memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
91         NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
92         NODE_DATA(nodeid)->node_start_pfn = start_pfn;
93         NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
94
95         /* Find a place for the bootmem map */
96         bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 
97         bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
98         bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
99         if (bootmap_start == -1L) 
100                 panic("Not enough continuous space for bootmap on node %d", nodeid); 
101         Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 
102         
103         bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
104                                          bootmap_start >> PAGE_SHIFT, 
105                                          start_pfn, end_pfn); 
106
107         e820_bootmem_free(NODE_DATA(nodeid), start, end);
108
109         reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
110         reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
111         node_set_online(nodeid);
112
113
114 /* Initialize final allocator for a zone */
115 void __init setup_node_zones(int nodeid)
116
117         unsigned long start_pfn, end_pfn; 
118         unsigned long zones[MAX_NR_ZONES];
119         unsigned long dma_end_pfn;
120
121         memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 
122
123         start_pfn = node_start_pfn(nodeid);
124         end_pfn = node_end_pfn(nodeid);
125
126         Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
127         
128         /* All nodes > 0 have a zero length zone DMA */ 
129         dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 
130         if (start_pfn < dma_end_pfn) { 
131                 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
132                 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 
133         } else { 
134                 zones[ZONE_NORMAL] = end_pfn - start_pfn; 
135         } 
136     
137         free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
138                             start_pfn, NULL); 
139
140
141 void __init numa_init_array(void)
142 {
143         int rr, i;
144         /* There are unfortunately some poorly designed mainboards around
145            that only connect memory to a single CPU. This breaks the 1:1 cpu->node
146            mapping. To avoid this fill in the mapping for all possible
147            CPUs, as the number of CPUs is not known yet. 
148            We round robin the existing nodes. */
149         rr = 0;
150         for (i = 0; i < NR_CPUS; i++) {
151                 if (cpu_to_node[i] != NUMA_NO_NODE)
152                         continue;
153                 rr = next_node(rr, node_online_map);
154                 if (rr == MAX_NUMNODES)
155                         rr = first_node(node_online_map);
156                 cpu_to_node[i] = rr;
157                 rr++; 
158         }
159
160         set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
161 }
162
163 #ifdef CONFIG_NUMA_EMU
164 int numa_fake __initdata = 0;
165
166 /* Numa emulation */
167 static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
168 {
169         int i;
170         struct node nodes[MAX_NUMNODES];
171         unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
172
173         /* Kludge needed for the hash function */
174         if (hweight64(sz) > 1) {
175                 unsigned long x = 1;
176                 while ((x << 1) < sz)
177                         x <<= 1;
178                 if (x < sz/2)
179                         printk("Numa emulation unbalanced. Complain to maintainer\n");
180                 sz = x;
181         }
182
183         memset(&nodes,0,sizeof(nodes));
184         for (i = 0; i < numa_fake; i++) {
185                 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
186                 if (i == numa_fake-1)
187                         sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
188                 nodes[i].end = nodes[i].start + sz;
189                 if (i != numa_fake-1)
190                         nodes[i].end--;
191                 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
192                        i,
193                        nodes[i].start, nodes[i].end,
194                        (nodes[i].end - nodes[i].start) >> 20);
195                 node_set_online(i);
196         }
197         memnode_shift = compute_hash_shift(nodes, numa_fake);
198         if (memnode_shift < 0) {
199                 memnode_shift = 0;
200                 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
201                 return -1;
202         }
203         for_each_online_node(i)
204                 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
205         numa_init_array();
206         return 0;
207 }
208 #endif
209
210 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
211
212         int i;
213
214 #ifdef CONFIG_NUMA_EMU
215         if (numa_fake && !numa_emulation(start_pfn, end_pfn))
216                 return;
217 #endif
218
219 #ifdef CONFIG_ACPI_NUMA
220         if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
221                                           end_pfn << PAGE_SHIFT))
222                 return;
223 #endif
224
225 #ifdef CONFIG_K8_NUMA
226         if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
227                 return;
228 #endif
229         printk(KERN_INFO "%s\n",
230                numa_off ? "NUMA turned off" : "No NUMA configuration found");
231
232         printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
233                start_pfn << PAGE_SHIFT,
234                end_pfn << PAGE_SHIFT); 
235                 /* setup dummy node covering all memory */ 
236         memnode_shift = 63; 
237         memnodemap[0] = 0;
238         nodes_clear(node_online_map);
239         node_set_online(0);
240         for (i = 0; i < NR_CPUS; i++)
241                 cpu_to_node[i] = 0;
242         node_to_cpumask[0] = cpumask_of_cpu(0);
243         setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
244 }
245
246 __init void numa_add_cpu(int cpu)
247 {
248         /* BP is initialized elsewhere */
249         if (cpu) 
250                 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
251
252
253 unsigned long __init numa_free_all_bootmem(void) 
254
255         int i;
256         unsigned long pages = 0;
257         for_each_online_node(i) {
258                 pages += free_all_bootmem_node(NODE_DATA(i));
259         }
260         return pages;
261
262
263 void __init paging_init(void)
264
265         int i;
266         for_each_online_node(i) {
267                 setup_node_zones(i); 
268         }
269
270
271 /* [numa=off] */
272 __init int numa_setup(char *opt) 
273
274         if (!strncmp(opt,"off",3))
275                 numa_off = 1;
276 #ifdef CONFIG_NUMA_EMU
277         if(!strncmp(opt, "fake=", 5)) {
278                 numa_fake = simple_strtoul(opt+5,NULL,0); ;
279                 if (numa_fake >= MAX_NUMNODES)
280                         numa_fake = MAX_NUMNODES;
281         }
282 #endif
283 #ifdef CONFIG_ACPI_NUMA
284         if (!strncmp(opt,"noacpi",6))
285                 acpi_numa = -1;
286 #endif
287         return 1;
288
289
/* NUMA lookup tables exported via EXPORT_SYMBOL. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);