__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
-
__SetPageTail(p);
+ set_page_count(p, 0);
p->first_page = page;
}
}
combined_idx = buddy_idx & page_idx;
higher_page = page + (combined_idx - page_idx);
buddy_idx = __find_buddy_index(combined_idx, order + 1);
- higher_buddy = page + (buddy_idx - combined_idx);
+ higher_buddy = higher_page + (buddy_idx - combined_idx);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
+ /*
+ * Walking all memory to count page types is very expensive and should
+ * be inhibited in non-blockable contexts.
+ */
+ if (!(gfp_mask & __GFP_WAIT))
+ filter |= SHOW_MEM_FILTER_PAGE_COUNT;
+
/*
* This documents exceptions given to allocations in certain
* contexts that are allowed to allocate outside current's set
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
struct page *page;
- if (!order || compaction_deferred(preferred_zone))
+ if (!order)
return NULL;
+ if (compaction_deferred(preferred_zone)) {
+ *deferred_compaction = true;
+ return NULL;
+ }
+
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration);
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
- defer_compaction(preferred_zone);
+
+ /*
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+ if (sync_migration)
+ defer_compaction(preferred_zone);
cond_resched();
}
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
return NULL;
}
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- const gfp_t wait = gfp_mask & __GFP_WAIT;
+ const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
- if (!wait) {
+ if (atomic) {
/*
- * Not worth trying to allocate harder for
- * __GFP_NOMEMALLOC even if it can't schedule.
+ * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+ * if it can't schedule.
*/
- if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
/*
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+ * comment for __cpuset_node_allowed_softwall().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
+ bool deferred_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
+ /*
+ * If compaction is deferred for high-order allocations, it is because
+ * sync compaction recently failed. In this is the case and the caller
+ * has requested the system not be heavily disrupted, fail the
+ * allocation now instead of entering direct reclaim
+ */
+ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
}
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
- struct page *page;
+ struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ unsigned int cpuset_mems_cookie;
gfp_mask &= gfp_allowed_mask;
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
- if (!preferred_zone) {
- put_mems_allowed();
- return NULL;
- }
+ if (!preferred_zone)
+ goto out;
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
- put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+ /*
+ * When updating a task's mems_allowed, it is possible to race with
+ * parallel threads in such a way that an allocation can fail while
+ * the mask is being updated. If a page allocation is about to fail,
+ * check if the cpuset changed during allocation and if so, retry.
+ */
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
bool skip_free_areas_node(unsigned int flags, int nid)
{
bool ret = false;
+ unsigned int cpuset_mems_cookie;
if (!(flags & SHOW_MEM_FILTER_NODES))
goto out;
- get_mems_allowed();
- ret = !node_isset(nid, cpuset_current_mems_allowed);
- put_mems_allowed();
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+ } while (!put_mems_allowed(cpuset_mems_cookie));
out:
return ret;
}
unsigned long block_migratetype;
int reserve;
- /* Get the start pfn, end pfn and the number of blocks to reserve */
+ /*
+ * Get the start pfn, end pfn and the number of blocks to reserve
+ * We have to be careful to be aligned to pageblock_nr_pages to
+ * make sure that we always check pfn_valid for the first page in
+ * the block.
+ */
start_pfn = zone->zone_start_pfn;
end_pfn = start_pfn + zone->spanned_pages;
+ start_pfn = roundup(start_pfn, pageblock_nr_pages);
reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
pageblock_order;
if (page_to_nid(page) != zone_to_nid(zone))
continue;
- /* Blocks with reserved pages will never free, skip them. */
- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
- if (pageblock_is_reserved(pfn, block_end_pfn))
- continue;
-
block_migratetype = get_pageblock_migratetype(page);
- /* If this block is reserved, account for it */
- if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
- reserve--;
- continue;
- }
+ /* Only test what is necessary when the reserves are not met */
+ if (reserve > 0) {
+ /*
+ * Blocks with reserved pages will never free, skip
+ * them.
+ */
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
+ continue;
- /* Suitable for reserving if this block is movable */
- if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
- set_pageblock_migratetype(page, MIGRATE_RESERVE);
- move_freepages_block(zone, page, MIGRATE_RESERVE);
- reserve--;
- continue;
+ /* If this block is reserved, account for it */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ reserve--;
+ continue;
+ }
+
+ /* Suitable for reserving if this block is movable */
+ if (block_migratetype == MIGRATE_MOVABLE) {
+ set_pageblock_migratetype(page,
+ MIGRATE_RESERVE);
+ move_freepages_block(zone, page,
+ MIGRATE_RESERVE);
+ reserve--;
+ continue;
+ }
}
/*
* round what is now in bits to nearest long in bits, then return it in
* bytes.
*/
-static unsigned long __init usemap_size(unsigned long zonesize)
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
unsigned long usemapsize;
+ zonesize += zone_start_pfn & (pageblock_nr_pages-1);
usemapsize = roundup(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
}
static void __init setup_usemap(struct pglist_data *pgdat,
- struct zone *zone, unsigned long zonesize)
+ struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long zonesize)
{
- unsigned long usemapsize = usemap_size(zonesize);
+ unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
usemapsize);
}
#else
-static inline void setup_usemap(struct pglist_data *pgdat,
- struct zone *zone, unsigned long zonesize) {}
+static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
+ unsigned long zone_start_pfn, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-/* Return a sensible default order for the pageblock size. */
-static inline int pageblock_default_order(void)
-{
- if (HPAGE_SHIFT > PAGE_SHIFT)
- return HUGETLB_PAGE_ORDER;
-
- return MAX_ORDER-1;
-}
-
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(unsigned int order)
+void __init set_pageblock_order(void)
{
+ unsigned int order;
+
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
return;
+ if (HPAGE_SHIFT > PAGE_SHIFT)
+ order = HUGETLB_PAGE_ORDER;
+ else
+ order = MAX_ORDER - 1;
+
/*
* Assume the largest contiguous order of interest is a huge page.
- * This value may be variable depending on boot parameters on IA64
+ * This value may be variable depending on boot parameters on IA64 and
+ * powerpc.
*/
pageblock_order = order;
}
/*
* When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * and pageblock_default_order() are unused as pageblock_order is set
- * at compile-time. See include/linux/pageblock-flags.h for the values of
- * pageblock_order based on the kernel config
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
*/
-static inline int pageblock_default_order(unsigned int order)
+void __init set_pageblock_order(void)
{
- return MAX_ORDER-1;
}
-#define set_pageblock_order(x) do {} while (0)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
if (!size)
continue;
- set_pageblock_order(pageblock_default_order());
- setup_usemap(pgdat, zone, size);
+ set_pageblock_order();
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
pfn &= (PAGES_PER_SECTION-1);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
- pfn = pfn - zone->zone_start_pfn;
+ pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}
bool is_pageblock_removable_nolock(struct page *page)
{
struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+
+ /*
+ * We have to be careful here because we are iterating over memory
+ * sections which are not zone aware so we might end up outside of
+ * the zone but still within the section.
+ */
+ if (!zone || zone->zone_start_pfn > pfn ||
+ zone->zone_start_pfn + zone->spanned_pages <= pfn)
+ return false;
+
return __count_immobile_pages(zone, page, 0);
}
zone->free_area[order].nr_free--;
__mod_zone_page_state(zone, NR_FREE_PAGES,
- (1UL << order));
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages -= 1 << order;
+#endif
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);