/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);
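
/*
 * Usage note: a command line such as "vmalloc=512M" resizes the vmalloc
 * area by moving VMALLOC_START relative to VMALLOC_END; memparse() accepts
 * the usual K/M/G suffixes.
 */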

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}
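
/*
 * On 64-bit a region or segment table (CRST) holds 2048 eight-byte entries,
 * i.e. 16 KB, so CRST tables are allocated and freed as order-ALLOC_ORDER
 * page groups rather than as single pages.
 */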

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}
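
/*
 * crst_table_upgrade() grows a process address space one translation level
 * at a time: 2 GB (segment table) -> 4 TB (region-third table) -> 8 PB
 * (region-second table), looping until asce_limit covers the requested
 * limit; crst_table_downgrade() below walks the same ladder in reverse.
 */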

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
				_ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif /* CONFIG_64BIT */
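
/*
 * The gmap code below implements guest address spaces for KVM: each struct
 * gmap owns its own hierarchy of region/segment tables that maps guest
 * addresses onto ranges of its parent mm, and gmap->asce is the address
 * space control element handed to the hardware while running in SIE mode.
 */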

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);
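
/*
 * Rough life cycle of a guest address space as this API suggests (a sketch,
 * not taken from a specific caller; host_addr, guest_addr and size are
 * placeholders):
 *
 *	struct gmap *g = gmap_alloc(current->mm);
 *	gmap_map_segment(g, host_addr, guest_addr, size);
 *	gmap_enable(g);
 *	... run the guest, resolve faults with gmap_fault() ...
 *	gmap_disable(g);
 *	gmap_free(g);
 */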

static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	down_read(&gmap->mm->mmap_sem);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	up_read(&gmap->mm->mmap_sem);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);
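
/*
 * Table walk note: each level is indexed with 11 bits of the guest address
 * (mask 0x7ff): >> 53 selects the region-first entry, >> 42 the
 * region-second, >> 31 the region-third and >> 20 the segment entry, so one
 * segment entry covers a 1 MB (PMD_SIZE) chunk of guest memory.
 */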

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(gmap_fault);
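
/*
 * gmap_fault() translates a guest address into the corresponding address in
 * the parent mm, resolving the segment entry on demand. Because the return
 * type is unsigned long, -EFAULT (unmapped guest range) and -ENOMEM (failed
 * allocation) come back as large values that can be tested with
 * IS_ERR_VALUE().
 */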

void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}
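
/*
 * gmap_unmap_notifier() runs when a host page table that is still referenced
 * by gmap segment entries goes away: every rmap entry pointing at the table
 * is rewritten back to the invalid format
 * (_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | vmaddr) so a later gmap_fault()
 * can re-establish the link.
 */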

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
				       unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}
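
/*
 * atomic_xor_bits() toggles the given bits in page->_mapcount and returns
 * the new value. The fragment allocator below uses the low FRAG_MASK bits
 * of _mapcount as an "in use" bitmap for the 1K/2K page table fragments of
 * a 4K page, and the next nibble (bit << 4) to mark fragments whose free is
 * still pending an RCU grace period.
 */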

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}
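
/*
 * Layout note: a non-pgste process packs several 1K (31-bit) or 2K (64-bit)
 * page tables into one 4K page, tracked on mm->context.pgtable_list. A
 * pgste page table instead consumes a full 4K page: the first half holds
 * the PTEs, the second half the page status table entries (PGSTEs) that
 * KVM needs.
 */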

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}
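
/*
 * The RCU free path smuggles the fragment bookkeeping through the pointer
 * passed to tlb_remove_table(): the page-offset bits encode either FRAG_MASK
 * (a pgste page table) or the fragment bit shifted into the "pending free"
 * nibble, and __tlb_remove_table() below decodes them again after the grace
 * period.
 */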

void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long) _table & PAGE_MASK);
	unsigned type = (unsigned long) _table & ~PAGE_MASK;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * switch on pgstes for the userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do sie */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* Let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
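
/*
 * s390_enable_sie() is what a hypervisor component calls before it can run
 * guests with the SIE instruction; presumably KVM invokes it while setting
 * up a VM, since only page tables allocated with alloc_pgste carry the
 * PGSTEs that SIE needs. The single-threaded check exists because the whole
 * mm is swapped out from under the task.
 */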

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"	/* load real address; sets cc */
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */