/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/sections.h>
const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init() allocates all the fixmap page tables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages << (PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n", total);
	printk(KERN_INFO "%lu reserved pages\n", reserved);
	printk(KERN_INFO "%lu pages shared\n", shared);
	printk(KERN_INFO "%lu pages swap cached\n", cached);
}
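
/*
 * spp_getpage() returns one zeroed page for an early page-table level:
 * from the page allocator once bootmem has been torn down, from bootmem
 * before that.
 */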
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n",
		      after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}
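
/*
 * Install a single kernel PTE mapping vaddr -> phys, allocating any missing
 * intermediate page-table levels with spp_getpage(). Used for the fixmap.
 */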
static __init void set_pte_phys(unsigned long vaddr,
			unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}
unsigned long __initdata table_start, table_end;
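
/*
 * alloc_low_page() hands out pages for the early direct-mapping page tables.
 * Before bootmem is up it takes the next pfn from the window reserved by
 * find_early_table_space() and temporarily maps it with early_ioremap();
 * later it simply asks the page allocator.
 */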
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys = pfn * PAGE_SIZE;
	return adr;
}
static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}
/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd, *last_pmd;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto next;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;
		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd + i, __pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
		__flush_tlb();
		return (void *)vaddr;
	next:
		;
	}
	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
	return NULL;
}
/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
	__flush_tlb();
}
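
/* Fill one PMD page with 2MB mappings covering [address, end). */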
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long entry;
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem)
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			break;
		}

		if (pmd_val(*pmd))
			continue;

		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	spin_lock(&init_mm.page_table_lock);
	phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
}
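
/*
 * Fill one PUD page: for each slot that the e820 map says is populated,
 * allocate (or update) the PMD below it and map it with 2MB pages.
 */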
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			phys_pmd_update(pud, addr, end);
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(pmd);
	}
	__flush_tlb();
}
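
/*
 * Reserve a physically contiguous area in the e820 map large enough for the
 * PUD and PMD pages of the direct mapping up to 'end'; alloc_low_page()
 * carves pages out of it via table_start/table_end.
 */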
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 0.5KB per GB. */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}
/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from
   the physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
	__flush_tlb_all();
}
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;

		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen
			   currently. */
			printk(KERN_ERR
	"clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}
/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, (start + size - 1));

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Memory hotadd without sparsemem. The mem_maps have been allocated in
 * advance, so just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
	int err = -EIO;
	unsigned long pfn;
	unsigned long total = 0, mem = 0;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (pfn_valid(pfn)) {
			online_page(pfn_to_page(pfn));
			err = 0;
			mem++;
		}
		total++;
	}
	if (!err) {
		z->spanned_pages += total;
		z->present_pages += mem;
		z->zone_pgdat->node_spanned_pages += total;
		z->zone_pgdat->node_present_pages += mem;
	}
	return err;
}
#endif
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;
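
/*
 * mem_init(): release all bootmem pages to the buddy allocator, account
 * for reserved pages, and register the /proc/kcore memory areas.
 */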
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END - VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}
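
/*
 * Free and poison a range of init pages; pages above __START_KERNEL_map are
 * also unmapped from the kernel-text mapping via change_page_attr_addr().
 */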
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr;

	if (begin >= end)
		return;

	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);

		ClearPageReserved(page);
		init_page_count(page);
		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
		if (addr >= __START_KERNEL_map)
			change_page_attr_addr(addr, 1, __pgprot(0));
		__free_page(page);
		totalram_pages++;
	}
	if (addr > __START_KERNEL_map)
		global_flush_tlb();
}
void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			__pa_symbol(&__init_begin),
			__pa_symbol(&__init_end));
}
#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size;

#ifdef CONFIG_HOTPLUG_CPU
	/* It must still be possible to apply SMP alternatives. */
	if (num_possible_cpus() > 1)
		start = PFN_ALIGN(__va(__pa_symbol(&_etext)));
#endif
	size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start;
	change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       size >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", __pa(start), __pa(end));
}
#endif
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= end_pfn) {
		/* This can happen with kdump kernels when accessing firmware
		   tables. */
		if (pfn < end_pfn_map)
			return;
		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys + len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}
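
/*
 * kern_addr_valid(): walk the kernel page tables and report whether 'addr'
 * is canonical and backed by a valid pfn (handles 2MB large pages too).
 */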
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
	{
		.ctl_name	= 99,
		.procname	= "exception-trace",
		.data		= &exception_trace,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{}
};

static ctl_table debug_root_table2[] = {
	{
		.ctl_name	= CTL_DEBUG,
		.procname	= "debug",
		.mode		= 0555,
		.child		= debug_table2
	},
	{}
};

static __init int x8664_sysctl_init(void)
{
	register_sysctl_table(debug_root_table2);
	return 0;
}
__initcall(x8664_sysctl_init);
#endif
/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}