X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fmemory.c;h=6325103db6f35683ec8781e9bdf60d15be8e5528;hb=dbbb3062dd53e7950a18bc05cca296fb2803b10b;hp=829d437354022959eeca37082f695e2ee73025ed;hpb=c292fe4aae5aa5c089633bc40342d27c8275306a;p=pandora-kernel.git diff --git a/mm/memory.c b/mm/memory.c index 829d43735402..6325103db6f3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -205,10 +205,14 @@ static int tlb_next_batch(struct mmu_gather *tlb) return 1; } + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return 0; + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return 0; + tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; @@ -235,6 +239,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; + tlb->batch_count = 0; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; @@ -865,20 +870,20 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (likely(!non_swap_entry(entry))) + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } rss[MM_SWAPENTS]++; - else if (is_write_migration_entry(entry) && + } else if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent @@ -1173,8 +1178,10 @@ again: if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (force_flush) { + addr += PAGE_SIZE; break; + } continue; } /* @@ -1228,16 +1235,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { - if (next-addr != HPAGE_PMD_SIZE) { + if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd)) - continue; + goto next; /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) - continue; + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_sem in read + * mode. + */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + goto next; next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -1350,8 +1365,11 @@ unsigned long unmap_vmas(struct mmu_gather *tlb, * Since no pte has actually been setup, it is * safe to do nothing in this case. */ - if (vma->vm_file) - unmap_hugepage_range(vma, start, end, NULL); + if (vma->vm_file) { + mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); + __unmap_hugepage_range_final(vma, start, end, NULL); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + } start = end; } else @@ -1409,6 +1427,24 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); +static inline bool can_follow_write_pte(pte_t pte, struct page *page, + unsigned int flags) +{ + if (pte_write(pte)) + return true; + + /* + * Make sure that we are really following CoWed page. We do not really + * have to care about exclusiveness of the page because we only want + * to ensure that once COWed page hasn't disappeared in the meantime + * or it hasn't been merged to a KSM page. + */ + if ((flags & FOLL_FORCE) && (flags & FOLL_COW)) + return page && PageAnon(page) && !PageKsm(page); + + return false; +} + /** * follow_page - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address @@ -1491,10 +1527,13 @@ split_fallthrough: pte = *ptep; if (!pte_present(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; page = vm_normal_page(vma, address, pte); + if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, flags)) { + pte_unmap_unlock(ptep, ptl); + return NULL; + } + if (unlikely(!page)) { if ((flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(pte))) @@ -1537,7 +1576,7 @@ split_fallthrough: unlock_page(page); } } -unlock: + pte_unmap_unlock(ptep, ptl); out: return page; @@ -1566,12 +1605,6 @@ no_page_table: return page; } -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return stack_guard_page_start(vma, addr) || - stack_guard_page_end(vma, addr+PAGE_SIZE); -} - /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -1722,11 +1755,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int ret; unsigned int fault_flags = 0; - /* For mlock, just skip the stack guard page. */ - if (foll_flags & FOLL_MLOCK) { - if (stack_guard_page(vma, start)) - goto next_page; - } if (foll_flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (nonblocking) @@ -1749,7 +1777,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, else return -EFAULT; } - if (ret & VM_FAULT_SIGBUS) + if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return i ? i : -EFAULT; BUG(); } @@ -1771,17 +1799,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * The VM_FAULT_WRITE bit tells us that * do_wp_page has broken COW when necessary, * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). + * pte_write. We cannot simply drop FOLL_WRITE + * here because the COWed page might be gone by + * the time we do the subsequent page lookups. */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; + foll_flags |= FOLL_COW; cond_resched(); } @@ -1836,19 +1860,24 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) { struct vm_area_struct *vma; + vm_flags_t vm_flags; int ret; vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + if (!(vm_flags & vma->vm_flags)) + return -EFAULT; + ret = handle_mm_fault(mm, vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return -EHWPOISON; - if (ret & VM_FAULT_SIGBUS) + if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; BUG(); } @@ -2293,6 +2322,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +/** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of area + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. + */ +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long vm_len, pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start + len < start) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + len += start & ~PAGE_MASK; + pfn = start >> PAGE_SHIFT; + pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vma->vm_pgoff > pages) + return -EINVAL; + pfn += vma->vm_pgoff; + pages -= vma->vm_pgoff; + + /* Can we fit all of the mapping? */ + vm_len = vma->vm_end - vma->vm_start; + if (vm_len >> PAGE_SHIFT > pages) + return -EINVAL; + + /* Ok, let it rip */ + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) @@ -2591,17 +2667,24 @@ reuse: if (!dirty_page) return ret; - /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * __do_fault is protected similarly. - */ if (!page_mkwrite) { - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + struct address_space *mapping; + int dirtied; + + lock_page(dirty_page); + dirtied = set_page_dirty(dirty_page); + VM_BUG_ON(PageAnon(dirty_page)); + mapping = dirty_page->mapping; + unlock_page(dirty_page); + + if (dirtied && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } put_page(dirty_page); if (page_mkwrite) { @@ -3027,40 +3110,6 @@ out_release: return ret; } -/* - * This is like a special single-page "expand_{down|up}wards()", - * except we must first make sure that 'address{-|+}PAGE_SIZE' - * doesn't hit another vma. - */ -static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) -{ - address &= PAGE_MASK; - if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { - struct vm_area_struct *prev = vma->vm_prev; - - /* - * Is there a mapping abutting this one below? - * - * That's only ok if it's the same stack mapping - * that has gotten split.. - */ - if (prev && prev->vm_end == address) - return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; - - expand_downwards(vma, address - PAGE_SIZE); - } - if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { - struct vm_area_struct *next = vma->vm_next; - - /* As VM_GROWSDOWN but s/below/above/ */ - if (next && next->vm_start == address + PAGE_SIZE) - return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; - - expand_upwards(vma, address + PAGE_SIZE); - } - return 0; -} - /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3076,8 +3125,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap(page_table); - /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, address) < 0) + /* File mapping without ->vm_ops ? */ + if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* Use the zero-page for reads */ @@ -3335,6 +3384,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); + /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ + if (!vma->vm_ops->fault) + return VM_FAULT_SIGBUS; return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } @@ -3393,11 +3445,9 @@ int handle_pte_fault(struct mm_struct *mm, entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { - if (vma->vm_ops) { - if (likely(vma->vm_ops->fault)) - return do_linear_fault(mm, vma, address, + if (vma->vm_ops) + return do_linear_fault(mm, vma, address, pte, pmd, flags, entry); - } return do_anonymous_page(mm, vma, address, pte, pmd, flags); } @@ -3458,6 +3508,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); +retry: pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) @@ -3471,13 +3522,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd, flags); } else { pmd_t orig_pmd = *pmd; + int ret; + barrier(); if (pmd_trans_huge(orig_pmd)) { if (flags & FAULT_FLAG_WRITE && !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) - return do_huge_pmd_wp_page(mm, vma, address, - pmd, orig_pmd); + !pmd_trans_splitting(orig_pmd)) { + ret = do_huge_pmd_wp_page(mm, vma, address, pmd, + orig_pmd); + /* + * If COW results in an oom, the huge pmd will + * have been split, so retry the fault on the + * pte for a smaller charge. + */ + if (unlikely(ret & VM_FAULT_OOM)) + goto retry; + return ret; + } return 0; } } @@ -3489,8 +3551,18 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd + * didn't become pmd_trans_huge under us and then back to pmd_none, as + * a result of MADV_DONTNEED running immediately after a huge pmd fault + * in a different thread of this mm, in turn leading to a misleading + * pmd_trans_huge() retval. All we have to ensure is that it is a + * regular pmd that we can walk with pte_offset_map() and we can do that + * through an atomic read in C, which is what pmd_trans_unstable() + * provides. + */ + if (unlikely(pmd_trans_unstable(pmd))) return 0; /* * A regular pmd is established and it can't morph into a huge pmd @@ -3742,7 +3814,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, if (follow_phys(vma, addr, write, &prot, &phys_addr)) return -EINVAL; - maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (write) memcpy_toio(maddr + offset, buf, len); else