X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=blobdiff_plain;f=mm%2Fhugetlb.c;h=031f036a95043aa4111d5e43e802d86efb93465a;hp=73f17c0293c0a0e57a62f65f11c969b9319532f5;hb=147ee65b012b3ab82e004f18d8f8a9980400358f;hpb=72784134ce53b8e40deb8c9981ec52122d9f6208 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 73f17c0293c0..031f036a9504 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; */ static DEFINE_SPINLOCK(hugetlb_lock); +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +{ + bool free = (spool->count == 0) && (spool->used_hpages == 0); + + spin_unlock(&spool->lock); + + /* If no pages are used, and no other handles to the subpool + * remain, free the subpool the subpool remain */ + if (free) + kfree(spool); +} + +struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) +{ + struct hugepage_subpool *spool; + + spool = kmalloc(sizeof(*spool), GFP_KERNEL); + if (!spool) + return NULL; + + spin_lock_init(&spool->lock); + spool->count = 1; + spool->max_hpages = nr_blocks; + spool->used_hpages = 0; + + return spool; +} + +void hugepage_put_subpool(struct hugepage_subpool *spool) +{ + spin_lock(&spool->lock); + BUG_ON(!spool->count); + spool->count--; + unlock_or_release_subpool(spool); +} + +static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, + long delta) +{ + int ret = 0; + + if (!spool) + return 0; + + spin_lock(&spool->lock); + if ((spool->used_hpages + delta) <= spool->max_hpages) { + spool->used_hpages += delta; + } else { + ret = -ENOMEM; + } + spin_unlock(&spool->lock); + + return ret; +} + +static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, + long delta) +{ + if (!spool) + return; + + spin_lock(&spool->lock); + spool->used_hpages -= delta; + /* If hugetlbfs_put_super couldn't free spool due to + * an outstanding quota reference, free it now. */ + unlock_or_release_subpool(spool); +} + +static inline struct hugepage_subpool *subpool_inode(struct inode *inode) +{ + return HUGETLBFS_SB(inode->i_sb)->spool; +} + +static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) +{ + return subpool_inode(vma->vm_file->f_dentry->d_inode); +} + /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. @@ -460,8 +538,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + unsigned int cpuset_mems_cookie; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol, &nodemask); /* @@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } } } -err: + mpol_cond_put(mpol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; + +err: + mpol_cond_put(mpol); + return NULL; } static void update_and_free_page(struct hstate *h, struct page *page) @@ -533,9 +618,9 @@ static void free_huge_page(struct page *page) */ struct hstate *h = page_hstate(page); int nid = page_to_nid(page); - struct address_space *mapping; + struct hugepage_subpool *spool = + (struct hugepage_subpool *)page_private(page); - mapping = (struct address_space *) page_private(page); set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); @@ -548,11 +633,11 @@ static void free_huge_page(struct page *page) h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); - if (mapping) - hugetlb_put_quota(mapping, 1); + hugepage_subpool_put_pages(spool, 1); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) @@ -595,6 +680,40 @@ int PageHuge(struct page *page) } EXPORT_SYMBOL_GPL(PageHuge); +/* + * PageHeadHuge() only returns true for hugetlbfs head page, but not for + * normal or transparent huge pages. + */ +int PageHeadHuge(struct page *page_head) +{ + compound_page_dtor *dtor; + + if (!PageHead(page_head)) + return 0; + + dtor = get_compound_page_dtor(page_head); + + return dtor == free_huge_page; +} +EXPORT_SYMBOL_GPL(PageHeadHuge); + +pgoff_t __basepage_index(struct page *page) +{ + struct page *page_head = compound_head(page); + pgoff_t index = page_index(page_head); + unsigned long compound_idx; + + if (!PageHuge(page_head)) + return page_index(page); + + if (compound_order(page_head) >= MAX_ORDER) + compound_idx = page_to_pfn(page) - page_to_pfn(page_head); + else + compound_idx = page - page_head; + + return (index << compound_order(page_head)) + compound_idx; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -901,7 +1020,6 @@ retry: h->resv_huge_pages += delta; ret = 0; - spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) @@ -915,6 +1033,7 @@ retry: VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } + spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ free: @@ -960,17 +1079,19 @@ static void return_unused_surplus_pages(struct hstate *h, while (nr_pages--) { if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } /* * Determine if the huge page at addr within the vma has an associated * reservation. Where it does not we will need to logically increase - * reservation and actually increase quota before an allocation can occur. - * Where any new reservation would be required the reservation change is - * prepared, but not committed. Once the page has been quota'd allocated - * an instantiated the change should be committed via vma_commit_reservation. - * No action is required on failure. + * reservation and actually increase subpool usage before an allocation + * can occur. Where any new reservation would be required the + * reservation change is prepared, but not committed. Once the page + * has been allocated from the subpool and instantiated the change should + * be committed via vma_commit_reservation. No action is required on + * failure. */ static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) @@ -1019,24 +1140,24 @@ static void vma_commit_reservation(struct hstate *h, static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { + struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; long chg; /* - * Processes that did not create the mapping will have no reserves and - * will not have accounted against quota. Check that the quota can be - * made before satisfying the allocation - * MAP_NORESERVE mappings may also need pages and quota allocated - * if no reserve mapping overlaps. + * Processes that did not create the mapping will have no + * reserves and will not have accounted against subpool + * limit. Check that the subpool limit can be made before + * satisfying the allocation MAP_NORESERVE mappings may also + * need pages and subpool limit allocated allocated if no reserve + * mapping overlaps. */ chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(-VM_FAULT_OOM); if (chg) - if (hugetlb_get_quota(inode->i_mapping, chg)) + if (hugepage_subpool_get_pages(spool, chg)) return ERR_PTR(-VM_FAULT_SIGBUS); spin_lock(&hugetlb_lock); @@ -1046,12 +1167,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) { page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { - hugetlb_put_quota(inode->i_mapping, chg); + hugepage_subpool_put_pages(spool, chg); return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_private(page, (unsigned long) mapping); + set_page_private(page, (unsigned long)spool); vma_commit_reservation(h, vma, addr); @@ -1297,6 +1418,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. */ spin_unlock(&hugetlb_lock); + + /* yield cpu to avoid soft lockup */ + cond_resched(); + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) @@ -1328,6 +1453,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, while (min_count < persistent_huge_pages(h)) { if (!free_pool_huge_page(h, nodes_allowed, 0)) break; + cond_resched_lock(&hugetlb_lock); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) @@ -1592,9 +1718,9 @@ static void __init hugetlb_sysfs_init(void) /* * node_hstate/s - associate per node hstate attributes, via their kobjects, - * with node sysdevs in node_devices[] using a parallel array. The array - * index of a node sysdev or _hstate == node id. - * This is here to avoid any static dependency of the node sysdev driver, in + * with node devices in node_devices[] using a parallel array. The array + * index of a node device or _hstate == node id. + * This is here to avoid any static dependency of the node device driver, in * the base kernel, on the hugetlb module. */ struct node_hstate { @@ -1604,7 +1730,7 @@ struct node_hstate { struct node_hstate node_hstates[MAX_NUMNODES]; /* - * A subset of global hstate attributes for node sysdevs + * A subset of global hstate attributes for node devices */ static struct attribute *per_node_hstate_attrs[] = { &nr_hugepages_attr.attr, @@ -1618,7 +1744,7 @@ static struct attribute_group per_node_hstate_attr_group = { }; /* - * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. * Returns node id via non-NULL nidp. */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) @@ -1641,13 +1767,13 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) } /* - * Unregister hstate attributes from a single node sysdev. + * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ void hugetlb_unregister_node(struct node *node) { struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + struct node_hstate *nhs = &node_hstates[node->dev.id]; if (!nhs->hugepages_kobj) return; /* no hstate attributes */ @@ -1663,7 +1789,7 @@ void hugetlb_unregister_node(struct node *node) } /* - * hugetlb module exit: unregister hstate attributes from node sysdevs + * hugetlb module exit: unregister hstate attributes from node devices * that have them. */ static void hugetlb_unregister_all_nodes(void) @@ -1671,7 +1797,7 @@ static void hugetlb_unregister_all_nodes(void) int nid; /* - * disable node sysdev registrations. + * disable node device registrations. */ register_hugetlbfs_with_node(NULL, NULL); @@ -1683,20 +1809,20 @@ static void hugetlb_unregister_all_nodes(void) } /* - * Register hstate attributes for a single node sysdev. + * Register hstate attributes for a single node device. * No-op if attributes already registered. */ void hugetlb_register_node(struct node *node) { struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; if (nhs->hugepages_kobj) return; /* already allocated */ nhs->hugepages_kobj = kobject_create_and_add("hugepages", - &node->sysdev.kobj); + &node->dev.kobj); if (!nhs->hugepages_kobj) return; @@ -1707,7 +1833,7 @@ void hugetlb_register_node(struct node *node) if (err) { printk(KERN_ERR "Hugetlb: Unable to add hstate %s" " for node %d\n", - h->name, node->sysdev.id); + h->name, node->dev.id); hugetlb_unregister_node(node); break; } @@ -1716,8 +1842,8 @@ void hugetlb_register_node(struct node *node) /* * hugetlb init time: register hstate attributes for all registered node - * sysdevs of nodes that have memory. All on-line nodes should have - * registered their associated sysdev by this time. + * devices of nodes that have memory. All on-line nodes should have + * registered their associated device by this time. */ static void hugetlb_register_all_nodes(void) { @@ -1725,12 +1851,12 @@ static void hugetlb_register_all_nodes(void) for_each_node_state(nid, N_HIGH_MEMORY) { struct node *node = &node_devices[nid]; - if (node->sysdev.id == nid) + if (node->dev.id == nid) hugetlb_register_node(node); } /* - * Let the node sysdev driver know we're here so it can + * Let the node device driver know we're here so it can * [un]register hstate attributes on node hotplug. */ register_hugetlbfs_with_node(hugetlb_register_node, @@ -1768,11 +1894,7 @@ module_exit(hugetlb_exit); static int __init hugetlb_init(void) { - /* Some platform decide whether they support huge pages at boot - * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when - * there is no such support - */ - if (HPAGE_SHIFT == 0) + if (!hugepages_supported()) return 0; if (!size_to_hstate(default_hstate_size)) { @@ -1889,6 +2011,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) @@ -1954,6 +2079,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) @@ -1979,6 +2107,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -1995,6 +2125,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2007,8 +2139,12 @@ int hugetlb_report_node_meminfo(int nid, char *buf) /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { - struct hstate *h = &default_hstate; - return h->nr_huge_pages * pages_per_huge_page(h); + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) @@ -2068,10 +2204,20 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) kref_get(&reservations->refs); } +static void resv_map_put(struct vm_area_struct *vma) +{ + struct resv_map *reservations = vma_resv_map(vma); + + if (!reservations) + return; + kref_put(&reservations->refs, resv_map_release); +} + static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *reservations = vma_resv_map(vma); + struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve; unsigned long start; unsigned long end; @@ -2083,11 +2229,11 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) reserve = (end - start) - region_count(&reservations->regions, start, end); - kref_put(&reservations->refs, resv_map_release); + resv_map_put(vma); if (reserve) { hugetlb_acct_memory(h, -reserve); - hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + hugepage_subpool_put_pages(spool, reserve); } } } @@ -2137,6 +2283,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) + return 1; + else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) + return 1; + else + return 0; +} int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -2164,7 +2335,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); - if (!huge_pte_none(huge_ptep_get(src_pte))) { + entry = huge_ptep_get(src_pte); + if (huge_pte_none(entry)) { /* skip none entry */ + ; + } else if (unlikely(is_hugetlb_entry_migration(entry) || + is_hugetlb_entry_hwpoisoned(entry))) { + swp_entry_t swp_entry = pte_to_swp_entry(entry); + + if (is_write_migration_entry(swp_entry) && cow) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&swp_entry); + entry = swp_entry_to_pte(swp_entry); + set_huge_pte_at(src, addr, src_pte, entry); + } + set_huge_pte_at(dst, addr, dst_pte, entry); + } else { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_ptep_get(src_pte); @@ -2182,32 +2370,6 @@ nomem: return -ENOMEM; } -static int is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; - else - return 0; -} - -static int is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) - return 1; - else - return 0; -} - void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -2267,9 +2429,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, continue; /* - * HWPoisoned hugepage is already unmapped and dropped reference + * Migrating hugepage or HWPoisoned hugepage is already + * unmapped and its refcount is dropped */ - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + if (unlikely(!pte_present(pte))) continue; page = pte_page(pte); @@ -2287,6 +2450,25 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } } +void __unmap_hugepage_range_final(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page) +{ + __unmap_hugepage_range(vma, start, end, ref_page); + + /* + * Clear this flag so that x86's huge_pmd_share page_table_shareable + * test will fail on a vma being torn down, and not grab a page table + * on its way out. We're lucky that the flag has such an appropriate + * name, and can in fact be safely cleared here. We could clear it + * before the __unmap_hugepage_range above, but all that's necessary + * is to clear it before releasing the i_mmap_mutex. This works + * because in the context this is called, the VMA is about to be + * destroyed and the i_mmap_mutex is held. + */ + vma->vm_flags &= ~VM_MAYSHARE; +} + void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -2315,9 +2497,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) - + (vma->vm_pgoff >> PAGE_SHIFT); - mapping = (struct address_space *)page_private(page); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; /* * Take the mapping lock for the duration of the table walk. As @@ -2330,6 +2512,14 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, if (iter_vma == vma) continue; + /* + * Shared VMAs have their own reserves and do not affect + * MAP_PRIVATE accounting but it is possible that a shared + * VMA is using the same page so check and skip such VMAs. + */ + if (iter_vma->vm_flags & VM_MAYSHARE) + continue; + /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these @@ -2405,7 +2595,6 @@ retry_avoidcopy: if (outside_reserve) { BUG_ON(huge_pte_none(pte)); if (unmap_ref_private(mm, vma, old_page, address)) { - BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); spin_lock(&mm->page_table_lock); goto retry_avoidcopy; @@ -2629,22 +2818,23 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + int need_wait_lock = 0; ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(h - hstates); + } else { + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate @@ -2659,6 +2849,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; + /* + * entry could be a migration/hwpoison entry at this point, so this + * check prevents the kernel from going below assuming that we have + * a active hugepage in pagecache. This goto expects the 2nd page fault, + * and is_hugetlb_entry_(migration|hwpoisoned) check will properly + * handle it. + */ + if (!pte_present(entry)) + goto out_mutex; + /* * If we are going to COW the mapping later, we examine the pending * reservations for this page now. This will ensure that any @@ -2678,28 +2878,30 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + spin_lock(&mm->page_table_lock); + /* Check for a racing update before calling hugetlb_cow */ + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_page_table_lock; + /* * hugetlb_cow() requires page locks of pte_page(entry) and * pagecache_page, so here we need take the former one * when page != pagecache_page or !pagecache_page. - * Note that locking order is always pagecache_page -> page, - * so no worry about deadlock. */ page = pte_page(entry); if (page != pagecache_page) - lock_page(page); - - spin_lock(&mm->page_table_lock); - /* Check for a racing update before calling hugetlb_cow */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_page_table_lock; + if (!trylock_page(page)) { + need_wait_lock = 1; + goto out_page_table_lock; + } + get_page(page); if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); - goto out_page_table_lock; + goto out_put_page; } entry = pte_mkdirty(entry); } @@ -2707,7 +2909,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (huge_ptep_set_access_flags(vma, address, ptep, entry, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); - +out_put_page: + if (page != pagecache_page) + unlock_page(page); + put_page(page); out_page_table_lock: spin_unlock(&mm->page_table_lock); @@ -2715,12 +2920,18 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } - if (page != pagecache_page) - unlock_page(page); - out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); + /* + * Generally it's safe to hold refcount during waiting page lock. But + * here we just wait to defer the next page fault to avoid busy loop and + * the page is not used after unlocked before returning from the current + * page fault. So we are safe from accessing freed page, even if we wait + * here without taking refcount. + */ + if (need_wait_lock) + wait_on_page_locked(page); return ret; } @@ -2770,7 +2981,17 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (absent || + /* + * We need call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration,) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing to them.) In order to do this, we use + * here is_swap_pte instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries. + */ + if (absent || is_swap_pte(huge_ptep_get(pte)) || ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; @@ -2836,16 +3057,36 @@ void hugetlb_change_protection(struct vm_area_struct *vma, continue; if (huge_pmd_unshare(mm, &address, ptep)) continue; - if (!huge_pte_none(huge_ptep_get(ptep))) { + pte = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + if (unlikely(is_hugetlb_entry_migration(pte))) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (is_write_migration_entry(entry)) { + pte_t newpte; + + make_migration_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + set_huge_pte_at(mm, address, ptep, newpte); + } + continue; + } + if (!huge_pte_none(pte)) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); } } spin_unlock(&mm->page_table_lock); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); - + /* + * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare + * may have cleared our pud entry and done put_page on the page table: + * once we release i_mmap_mutex, another task can do the final put_page + * and that page table be reused and filled with junk. + */ flush_tlb_range(vma, start, end); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } int hugetlb_reserve_pages(struct inode *inode, @@ -2855,11 +3096,12 @@ int hugetlb_reserve_pages(struct inode *inode, { long ret, chg; struct hstate *h = hstate_inode(inode); + struct hugepage_subpool *spool = subpool_inode(inode); /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page - * and filesystem quota without using reserves + * without using reserves */ if (vm_flags & VM_NORESERVE) return 0; @@ -2883,21 +3125,25 @@ int hugetlb_reserve_pages(struct inode *inode, set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } - if (chg < 0) - return chg; + if (chg < 0) { + ret = chg; + goto out_err; + } - /* There must be enough filesystem quota for the mapping */ - if (hugetlb_get_quota(inode->i_mapping, chg)) - return -ENOSPC; + /* There must be enough pages in the subpool for the mapping */ + if (hugepage_subpool_get_pages(spool, chg)) { + ret = -ENOSPC; + goto out_err; + } /* * Check enough hugepages are available for the reservation. - * Hand back the quota if there are not + * Hand the pages back to the subpool if there are not */ ret = hugetlb_acct_memory(h, chg); if (ret < 0) { - hugetlb_put_quota(inode->i_mapping, chg); - return ret; + hugepage_subpool_put_pages(spool, chg); + goto out_err; } /* @@ -2914,18 +3160,23 @@ int hugetlb_reserve_pages(struct inode *inode, if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); return 0; +out_err: + if (vma) + resv_map_put(vma); + return ret; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); long chg = region_truncate(&inode->i_mapping->private_list, offset); + struct hugepage_subpool *spool = subpool_inode(inode); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); - hugetlb_put_quota(inode->i_mapping, (chg - freed)); + hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); }