xen: include xen/xen.h for definition of xen_initial_domain()
[pandora-kernel.git] arch/x86/xen/setup.c
/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>

#include "xen-ops.h"
#include "vdso.h"

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
phys_addr_t xen_extra_mem_start, xen_extra_mem_size;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO         (10)
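
/*
 * Rough illustration of the cost (assuming 4 KB pages and a 64-byte
 * struct page): describing 10x of extra memory takes about
 * 10 * 64 / 4096 ~= 16% of the base allocation in page structures,
 * leaving the bulk of base memory usable for everything else.
 */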

static __init void xen_add_extra_mem(unsigned long pages)
{
        u64 size = (u64)pages * PAGE_SIZE;
        u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;

        if (!pages)
                return;

        e820_add_region(extra_start, size, E820_RAM);
        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

        reserve_early(extra_start, extra_start + size, "XEN EXTRA");

        xen_extra_mem_size += size;

        xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
}

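/*
 * Release the populated RAM pages in [start_addr, end_addr) back to
 * the hypervisor one extent at a time and mark them invalid in the
 * p2m.  Returns the number of pages actually given back.
 */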
static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
                                              phys_addr_t end_addr)
{
        struct xen_memory_reservation reservation = {
                .address_bits = 0,
                .extent_order = 0,
                .domid        = DOMID_SELF
        };
        unsigned long start, end;
        unsigned long len = 0;
        unsigned long pfn;
        int ret;

        start = PFN_UP(start_addr);
        end = PFN_DOWN(end_addr);

        if (end <= start)
                return 0;

        printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
               start, end);
        for (pfn = start; pfn < end; pfn++) {
                unsigned long mfn = pfn_to_mfn(pfn);

                /* Make sure pfn exists to start with */
                if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
                        continue;

                set_xen_guest_handle(reservation.extent_start, &mfn);
                reservation.nr_extents = 1;

                ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
                                           &reservation);
                WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
                     start, end, ret);
                if (ret == 1) {
                        set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
                        len++;
                }
        }
        printk(KERN_CONT "%ld pages freed\n", len);

        return len;
}

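/*
 * Give back every page below max_pfn that falls into a hole of the
 * passed-in E820 map, i.e. memory the domain was populated with but
 * which no usable region covers.
 */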
static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
                                                     const struct e820map *e820)
{
        phys_addr_t max_addr = PFN_PHYS(max_pfn);
        phys_addr_t last_end = 0;
        unsigned long released = 0;
        int i;

        for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
                phys_addr_t end = e820->map[i].addr;
                end = min(max_addr, end);

                released += xen_release_chunk(last_end, end);
                last_end = e820->map[i].addr + e820->map[i].size;
        }

        if (last_end < max_addr)
                released += xen_release_chunk(last_end, max_addr);

        printk(KERN_INFO "released %ld pages of unused memory\n", released);
        return released;
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
        static struct e820entry map[E820MAX] __initdata;

        unsigned long max_pfn = xen_start_info->nr_pages;
        unsigned long long mem_end;
        int rc;
        struct xen_memory_map memmap;
        unsigned long extra_pages = 0;
        unsigned long extra_limit;
        int i;
        int op;

        max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
        mem_end = PFN_PHYS(max_pfn);

        memmap.nr_entries = E820MAX;
        set_xen_guest_handle(memmap.buffer, map);

        op = xen_initial_domain() ?
                XENMEM_machine_memory_map :
                XENMEM_memory_map;
        rc = HYPERVISOR_memory_op(op, &memmap);
        if (rc == -ENOSYS) {
                memmap.nr_entries = 1;
                map[0].addr = 0ULL;
                map[0].size = mem_end;
                /* 8MB slack (to balance backend allocations). */
                map[0].size += 8ULL << 20;
                map[0].type = E820_RAM;
                rc = 0;
        }
        BUG_ON(rc);

        e820.nr_map = 0;
        xen_extra_mem_start = mem_end;
        for (i = 0; i < memmap.nr_entries; i++) {
                unsigned long long end = map[i].addr + map[i].size;

                if (map[i].type == E820_RAM) {
                        if (map[i].addr < mem_end && end > mem_end) {
                                /* Truncate region to max_mem. */
                                u64 delta = end - mem_end;

                                map[i].size -= delta;
                                extra_pages += PFN_DOWN(delta);

                                end = mem_end;
                        }
                }

                if (end > xen_extra_mem_start)
                        xen_extra_mem_start = end;

                /* If region is non-RAM or below mem_end, add what remains */
                if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
                    map[i].size > 0)
                        e820_add_region(map[i].addr, map[i].size, map[i].type);
        }

        /*
         * Even though this is normal, usable memory under Xen, reserve
         * ISA memory anyway because too many things think they can poke
         * about in there.
         */
        e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
                        E820_RESERVED);

        /*
         * Reserve Xen bits:
         *  - mfn_list
         *  - xen_start_info
         * See comment above "struct start_info" in <xen/interface/xen.h>
         */
        reserve_early(__pa(xen_start_info->mfn_list),
                      __pa(xen_start_info->pt_base),
                      "XEN START INFO");

        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

        extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);

        /*
         * Clamp the amount of extra memory to EXTRA_MEM_RATIO times
         * the base size.  On non-highmem systems, the base size is
         * the full initial memory allocation; on highmem it is
         * limited to the max size of lowmem, so that lowmem doesn't
         * get completely filled.
         *
         * In principle there could be a problem on lowmem systems if
         * the initial memory is also very large with respect to
         * lowmem, but we won't try to deal with that here.
         */
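        /*
         * For example (assuming 4 KB pages and MAXMEM well above the
         * base allocation): a domain booted with 512 MB has
         * max_pfn = 131072, so extra_limit is at most 10 * 131072 =
         * 1310720 pfns and extra_pages is clamped to roughly 4.5 GB.
         */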
        extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
                          max_pfn + extra_pages);

        if (extra_limit >= max_pfn)
                extra_pages = extra_limit - max_pfn;
        else
                extra_pages = 0;

        if (!xen_initial_domain())
                xen_add_extra_mem(extra_pages);

        return "Xen";
}

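/*
 * Idle routine for Xen guests: block in the hypervisor via safe_halt()
 * unless a reschedule is already pending.  TS_POLLING is cleared
 * around the halt so that wakeups arrive as IPIs rather than relying
 * on the polling flag.
 */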
static void xen_idle(void)
{
        local_irq_disable();

        if (need_resched())
                local_irq_enable();
        else {
                current_thread_info()->status &= ~TS_POLLING;
                smp_mb__after_clear_bit();
                safe_halt();
                current_thread_info()->status |= TS_POLLING;
        }
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
        u32 *mask;
        mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
        mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

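/*
 * Register an entry point with the hypervisor for the given callback
 * type; CALLBACKF_mask_events asks Xen to mask event delivery while
 * the callback is running.
 */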
static __cpuinit int register_callback(unsigned type, const void *func)
{
        struct callback_register callback = {
                .type = type,
                .address = XEN_CALLBACK(__KERNEL_CS, func),
                .flags = CALLBACKF_mask_events,
        };

        return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

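/*
 * Tell the hypervisor where the SYSENTER entry stub lives.  If the
 * registration fails, clear the corresponding CPU feature bit so
 * userspace falls back to the int $0x80 system call path.
 */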
void __cpuinit xen_enable_sysenter(void)
{
        int ret;
        unsigned sysenter_feature;

#ifdef CONFIG_X86_32
        sysenter_feature = X86_FEATURE_SEP;
#else
        sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

        if (!boot_cpu_has(sysenter_feature))
                return;

        ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
        if (ret != 0)
                setup_clear_cpu_cap(sysenter_feature);
}

void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
        int ret;

        ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
        if (ret != 0) {
                printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
                /* Pretty fatal; 64-bit userspace has no other
                   mechanism for syscalls. */
        }

        if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
                ret = register_callback(CALLBACKTYPE_syscall32,
                                        xen_syscall32_target);
                if (ret != 0)
                        setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
        }
#endif /* CONFIG_X86_64 */
}

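/*
 * Xen-specific boot-time setup: enable the VM assists we rely on,
 * register the event and failsafe callbacks, hook up the fast system
 * call entry points, raise IOPL, disable ACPI in unprivileged domains,
 * copy the guest command line and install xen_idle as the idle loop.
 */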
void __init xen_arch_setup(void)
{
        struct physdev_set_iopl set_iopl;
        int rc;

        xen_panic_handler_init();

        HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
        HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

        if (!xen_feature(XENFEAT_auto_translated_physmap))
                HYPERVISOR_vm_assist(VMASST_CMD_enable,
                                     VMASST_TYPE_pae_extended_cr3);

        if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
            register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
                BUG();

        xen_enable_sysenter();
        xen_enable_syscall();

        set_iopl.iopl = 1;
        rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
        if (rc != 0)
                printk(KERN_INFO "physdev_op failed %d\n", rc);

#ifdef CONFIG_ACPI
        if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
                printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
                disable_acpi();
        }
#endif

        memcpy(boot_command_line, xen_start_info->cmd_line,
               MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
               COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

        pm_idle = xen_idle;

        paravirt_disable_iospace();

        fiddle_vdso();
}