X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fhugetlb.c;h=a489d9338ac4b6cd935c7ef36b5ca98a5c966ff7;hb=131802b8292d35e8a407469c485565b199ed79cf;hp=3a5aae2d6ee7387e9274b97454cd293bd719a159;hpb=8ae94088319ea3047cdf68adc9fafdcf862e2790;p=pandora-kernel.git diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3a5aae2d6ee7..a489d9338ac4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1078,6 +1079,7 @@ static void return_unused_surplus_pages(struct hstate *h, while (nr_pages--) { if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } @@ -1416,6 +1418,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. */ spin_unlock(&hugetlb_lock); + + /* yield cpu to avoid soft lockup */ + cond_resched(); + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) @@ -1447,6 +1453,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, while (min_count < persistent_huge_pages(h)) { if (!free_pool_huge_page(h, nodes_allowed, 0)) break; + cond_resched_lock(&hugetlb_lock); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) @@ -1887,11 +1894,7 @@ module_exit(hugetlb_exit); static int __init hugetlb_init(void) { - /* Some platform decide whether they support huge pages at boot - * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when - * there is no such support - */ - if (HPAGE_SHIFT == 0) + if (!hugepages_supported()) return 0; if (!size_to_hstate(default_hstate_size)) { @@ -2008,6 +2011,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2073,6 +2079,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2098,6 +2107,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -2114,6 +2125,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2270,6 +2283,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) + return 1; + else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) + return 1; + else + return 0; +} int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -2297,7 +2335,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); - if (!huge_pte_none(huge_ptep_get(src_pte))) { + entry = huge_ptep_get(src_pte); + if (huge_pte_none(entry)) { /* skip none entry */ + ; + } else if (unlikely(is_hugetlb_entry_migration(entry) || + is_hugetlb_entry_hwpoisoned(entry))) { + swp_entry_t swp_entry = pte_to_swp_entry(entry); + + if (is_write_migration_entry(swp_entry) && cow) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&swp_entry); + entry = swp_entry_to_pte(swp_entry); + set_huge_pte_at(src, addr, src_pte, entry); + } + set_huge_pte_at(dst, addr, dst_pte, entry); + } else { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_ptep_get(src_pte); @@ -2315,32 +2370,6 @@ nomem: return -ENOMEM; } -static int is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; - else - return 0; -} - -static int is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) - return 1; - else - return 0; -} - void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -2400,9 +2429,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, continue; /* - * HWPoisoned hugepage is already unmapped and dropped reference + * Migrating hugepage or HWPoisoned hugepage is already + * unmapped and its refcount is dropped */ - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + if (unlikely(!pte_present(pte))) continue; page = pte_page(pte); @@ -2482,6 +2512,14 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, if (iter_vma == vma) continue; + /* + * Shared VMAs have their own reserves and do not affect + * MAP_PRIVATE accounting but it is possible that a shared + * VMA is using the same page so check and skip such VMAs. + */ + if (iter_vma->vm_flags & VM_MAYSHARE) + continue; + /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these @@ -2780,6 +2818,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + int need_wait_lock = 0; ptep = huge_pte_offset(mm, address); if (ptep) { @@ -2790,12 +2829,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(h - hstates); + } else { + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate @@ -2810,6 +2849,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; + /* + * entry could be a migration/hwpoison entry at this point, so this + * check prevents the kernel from going below assuming that we have + * a active hugepage in pagecache. This goto expects the 2nd page fault, + * and is_hugetlb_entry_(migration|hwpoisoned) check will properly + * handle it. + */ + if (!pte_present(entry)) + goto out_mutex; + /* * If we are going to COW the mapping later, we examine the pending * reservations for this page now. This will ensure that any @@ -2829,29 +2878,30 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + spin_lock(&mm->page_table_lock); + /* Check for a racing update before calling hugetlb_cow */ + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_page_table_lock; + /* * hugetlb_cow() requires page locks of pte_page(entry) and * pagecache_page, so here we need take the former one * when page != pagecache_page or !pagecache_page. - * Note that locking order is always pagecache_page -> page, - * so no worry about deadlock. */ page = pte_page(entry); - get_page(page); if (page != pagecache_page) - lock_page(page); - - spin_lock(&mm->page_table_lock); - /* Check for a racing update before calling hugetlb_cow */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_page_table_lock; + if (!trylock_page(page)) { + need_wait_lock = 1; + goto out_page_table_lock; + } + get_page(page); if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); - goto out_page_table_lock; + goto out_put_page; } entry = pte_mkdirty(entry); } @@ -2859,7 +2909,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (huge_ptep_set_access_flags(vma, address, ptep, entry, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); - +out_put_page: + if (page != pagecache_page) + unlock_page(page); + put_page(page); out_page_table_lock: spin_unlock(&mm->page_table_lock); @@ -2867,13 +2920,18 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } - if (page != pagecache_page) - unlock_page(page); - put_page(page); - out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); + /* + * Generally it's safe to hold refcount during waiting page lock. But + * here we just wait to defer the next page fault to avoid busy loop and + * the page is not used after unlocked before returning from the current + * page fault. So we are safe from accessing freed page, even if we wait + * here without taking refcount. + */ + if (need_wait_lock) + wait_on_page_locked(page); return ret; } @@ -2999,7 +3057,22 @@ void hugetlb_change_protection(struct vm_area_struct *vma, continue; if (huge_pmd_unshare(mm, &address, ptep)) continue; - if (!huge_pte_none(huge_ptep_get(ptep))) { + pte = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + if (unlikely(is_hugetlb_entry_migration(pte))) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (is_write_migration_entry(entry)) { + pte_t newpte; + + make_migration_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + set_huge_pte_at(mm, address, ptep, newpte); + } + continue; + } + if (!huge_pte_none(pte)) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); @@ -3025,6 +3098,14 @@ int hugetlb_reserve_pages(struct inode *inode, struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); + /* This should never happen */ + if (from > to) { +#ifdef CONFIG_DEBUG_VM + WARN(1, "%s called with a negative range\n", __func__); +#endif + return -EINVAL; + } + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page