X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=blobdiff_plain;f=mm%2Fhuge_memory.c;h=1b0412135d11828704fea807a496197a444cd9ef;hp=36b3d988b4ef6ac8c263ee0732c1d08513afb04f;hb=4cecd7e369e1b252db1d64451462221b362eed1c;hpb=1dfb059b9438633b0546c5431538a47f6ed99028 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36b3d988b4ef..1b0412135d11 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -65,6 +65,17 @@ static void khugepaged_slab_free(void); static struct hlist_head *mm_slots_hash __read_mostly; static struct kmem_cache *mm_slot_cache __read_mostly; +#ifdef CONFIG_FB +extern const struct file_operations fb_fops; + +#define is_fb_vma(vma) \ + (vma->vm_file && vma->vm_file->f_op == &fb_fops) +#else +#define is_fb_vma(vma) 0 +#endif + +static void split_fb_pmd(struct vm_area_struct *vma, pmd_t *pmd); + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -538,7 +549,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + if (totalram_pages < (200 << (20 - PAGE_SHIFT))) transparent_hugepage_flags = 0; start_khugepaged(); @@ -642,6 +653,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, set_pmd_at(mm, haddr, pmd, entry); prepare_pmd_huge_pte(pgtable, mm); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm->nr_ptes++; spin_unlock(&mm->page_table_lock); } @@ -681,7 +693,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), vma, haddr, numa_node_id(), 0); @@ -760,6 +772,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); prepare_pmd_huge_pte(pgtable, dst_mm); + dst_mm->nr_ptes++; ret = 0; out_unlock: @@ -788,6 +801,28 @@ pgtable_t get_pmd_huge_pte(struct mm_struct *mm) return pgtable; } +void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) +{ + pmd_t entry; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; + + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(&mm->page_table_lock); +} + static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -858,7 +893,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } kfree(pages); - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); @@ -902,7 +936,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -920,6 +954,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); + if (ret & VM_FAULT_OOM) + split_huge_page(page); put_page(page); goto out; } @@ -927,6 +963,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); + split_huge_page(page); put_page(page); ret |= VM_FAULT_OOM; goto out; @@ -949,7 +986,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmdp_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; @@ -960,6 +997,18 @@ out: return ret; } +/* + * FOLL_FORCE can write to even unwritable pmd's, but only + * after we've gone through a COW cycle and they are dirty. + */ +static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, + unsigned int flags) +{ + return pmd_write(pmd) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && + page && PageAnon(page)); +} + struct page *follow_trans_huge_pmd(struct mm_struct *mm, unsigned long addr, pmd_t *pmd, @@ -969,11 +1018,12 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, assert_spin_locked(&mm->page_table_lock); - if (flags & FOLL_WRITE && !pmd_write(*pmd)) - goto out; - page = pmd_page(*pmd); VM_BUG_ON(!PageHead(page)); + + if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, page, flags)) + return NULL; + if (flags & FOLL_TOUCH) { pmd_t _pmd; /* @@ -992,7 +1042,6 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, if (flags & FOLL_GET) get_page_foll(page); -out: return page; } @@ -1003,6 +1052,11 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, spin_lock(&tlb->mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { + if (is_fb_vma(vma)) { + split_fb_pmd(vma, pmd); + return 0; + } + if (unlikely(pmd_trans_splitting(*pmd))) { spin_unlock(&tlb->mm->page_table_lock); wait_split_huge_page(vma->anon_vma, @@ -1017,6 +1071,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); + tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); tlb_remove_page(tlb, page); pte_free(tlb->mm, pgtable); @@ -1356,7 +1411,6 @@ static int __split_huge_page_map(struct page *page, pte_unmap(pte); } - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ /* * Up to this point the pmd is present and huge and @@ -1469,6 +1523,157 @@ out: return ret; } +/* callers must hold mmap_sem (madvise() does) */ +static int collapse_fb_pmd(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, struct vm_area_struct *vma) +{ + unsigned long _addr; + struct page *page; + pgtable_t pgtable; + pte_t *pte, *_pte; + pmd_t _pmd; + u32 pa; + + pte = pte_offset_map(pmd, addr); + page = pte_page(*pte); + pa = __pfn_to_phys(page_to_pfn(page)); + _pmd = pmdp_clear_flush_notify(vma, addr, pmd); + + if ((addr | pa) & ~HPAGE_PMD_MASK) { + printk(KERN_ERR "collapse_fb: bad alignment: %08lx->%08x\n", + addr, pa); + pte_unmap(pte); + return -EINVAL; + } + + for (_pte = pte, _addr = addr; _pte < pte + HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (!pte_none(pteval)) { + src_page = pte_page(pteval); + + pte_clear(vma->vm_mm, _addr, _pte); + if (pte_present(pteval)) + page_remove_rmap(src_page); + } + + _addr += PAGE_SIZE; + } + + pte_unmap(pte); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, addr, pmd, _pmd); + update_mmu_cache(vma, addr, pmd); + prepare_pmd_huge_pte(pgtable, mm); + spin_unlock(&mm->page_table_lock); + + return 0; +} + +static int try_collapse_fb(struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int ret = 0; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) + return -EINVAL; + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return -EINVAL; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return -EINVAL; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return -EINVAL; + if (pmd_trans_huge(*pmd)) + continue; + + ret = collapse_fb_pmd(mm, pmd, addr, vma); + if (ret) + break; + } + + return ret; +} + +/* undo collapse_fb_pmd(), restore pages so that mm subsys can release them + * page_table_lock() should be held */ +static void split_fb_pmd(struct vm_area_struct *vma, pmd_t *pmd) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr, haddr, pfn; + struct page *page; + pgtable_t pgtable; + pmd_t _pmd; + int i; + + page = pmd_page(*pmd); + pgtable = get_pmd_huge_pte(mm); + pfn = page_to_pfn(page); + addr = pfn << PAGE_SHIFT; + + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = addr; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page + i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + atomic_set(&page[i]._mapcount, 0); // hack? + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + set_pmd_at(mm, addr, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); +} + +#ifndef __arm__ +#error arm only.. +#endif +static u32 pmd_to_va(struct mm_struct *mm, pmd_t *pmd) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd0; + u32 ret; + + pgd = pgd_offset(mm, 0); + pud = pud_offset(pgd, 0); + pmd0 = pmd_offset(pud, 0); + + ret = (pmd - pmd0) << SECTION_SHIFT; + return ret; +} + #define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ VM_HUGETLB|VM_SHARED|VM_MAYSHARE) @@ -1477,6 +1682,9 @@ int hugepage_madvise(struct vm_area_struct *vma, { switch (advice) { case MADV_HUGEPAGE: + if (is_fb_vma(vma)) + return try_collapse_fb(vma); + /* * Be somewhat over-protective like KSM for now! */ @@ -1489,7 +1697,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (unlikely(khugepaged_enter_vma_merge(vma))) + if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) return -ENOMEM; break; case MADV_NOHUGEPAGE: @@ -1621,7 +1829,8 @@ int __khugepaged_enter(struct mm_struct *mm) return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { unsigned long hstart, hend; if (!vma->anon_vma) @@ -1630,18 +1839,18 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) * page fault if needed. */ return 0; - if (vma->vm_ops) + if (vma->vm_ops || (vm_flags & VM_NO_THP)) /* khugepaged not yet working on file or special mappings */ return 0; /* * If is_pfn_mapping() is true is_learn_pfn_mapping() must be * true too, verify it here. */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(is_linear_pfn_mapping(vma)); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma); + return khugepaged_enter(vma, vm_flags); return 0; } @@ -1812,6 +2021,24 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + return false; + + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + /* + * If is_pfn_mapping() is true is_learn_pfn_mapping() must be + * true too, verify it here. + */ + VM_BUG_ON(is_linear_pfn_mapping(vma)); + return !(vma->vm_flags & VM_NO_THP); +} + static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, @@ -1878,25 +2105,14 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; vma = find_vma(mm, address); + if (!vma) + goto out; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) - goto out; - - if (!vma->anon_vma || vma->vm_ops) - goto out; - if (is_vma_temporary_stack(vma)) + if (!hugepage_vma_check(vma)) goto out; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); - pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) goto out; @@ -1933,7 +2149,12 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); - set_pmd_at(mm, address, pmd, _pmd); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); goto out; @@ -1967,9 +2188,8 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, _pmd); + update_mmu_cache(vma, address, pmd); prepare_pmd_huge_pte(pgtable, mm); - mm->nr_ptes--; spin_unlock(&mm->page_table_lock); #ifndef CONFIG_NUMA @@ -2064,7 +2284,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_test_exit(mm)) { /* free mm_slot */ @@ -2094,7 +2314,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; @@ -2122,25 +2342,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - - if ((!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) { - skip: + if (!hugepage_vma_check(vma)) { +skip: progress++; continue; } - if (!vma->anon_vma || vma->vm_ops) - goto skip; - if (is_vma_temporary_stack(vma)) - goto skip; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() - * must be true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || - vma->vm_flags & VM_NO_THP); - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart >= hend) @@ -2357,6 +2563,7 @@ static int khugepaged(void *none) void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { + struct vm_area_struct *vma; struct page *page; spin_lock(&mm->page_table_lock); @@ -2364,6 +2571,12 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) spin_unlock(&mm->page_table_lock); return; } + vma = find_vma(mm, pmd_to_va(mm, pmd)); + if (vma && is_fb_vma(vma)) { + split_fb_pmd(vma, pmd); + spin_unlock(&mm->page_table_lock); + return; + } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page);