X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=38cd47d1c1bc92bd3b6bd97097a09988e68eca43;hp=2b8ba3aebf6e2c6b46b0d12dfea058ee3ab022fe;hb=refs%2Fheads%2Fpandora-3.2-.57-wip;hpb=b4949b84567f3ae1227d076fc95bbd8efea06506 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b8ba3aebf6e..38cd47d1c1bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include #include @@ -96,6 +98,14 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; +/* + * When calculating the number of globally allowed dirty pages, there + * is a certain number of per-zone reserves that should not be + * considered dirtyable memory. This is the sum of those reserves + * over all existing zones that contribute dirtyable memory. + */ +unsigned long dirty_balance_reserve __read_mostly; + int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -127,6 +137,13 @@ void pm_restrict_gfp_mask(void) saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~GFP_IOFS; } + +bool pm_suspended_storage(void) +{ + if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + return false; + return true; +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -403,6 +420,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) clear_highpage(page + i); } +#ifdef CONFIG_DEBUG_PAGEALLOC +unsigned int _debug_guardpage_minorder; + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); + +static inline void set_page_guard_flag(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} + +static inline void clear_page_guard_flag(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline void set_page_guard_flag(struct page *page) { } +static inline void clear_page_guard_flag(struct page *page) { } +#endif + static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); @@ -460,6 +508,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (page_zone_id(page) != page_zone_id(buddy)) return 0; + if (page_is_guard(buddy) && page_order(buddy) == order) { + VM_BUG_ON(page_count(buddy) != 0); + return 1; + } + if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON(page_count(buddy) != 0); return 1; @@ -483,10 +536,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the - * other. That is, if we allocate a small block, and both were - * free, the remainder of the region must be split into blocks. + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. * If a block is freed, and its buddy is also free, then this - * triggers coalescing into a block of larger size. + * triggers coalescing into a block of larger size. * * -- wli */ @@ -516,11 +569,19 @@ static inline void __free_one_page(struct page *page, buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; - - /* Our buddy is free, merge with it and move up one order. */ - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); + /* + * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) { + clear_page_guard_flag(buddy); + set_page_private(page, 0); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + } else { + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + } combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; @@ -541,7 +602,7 @@ static inline void __free_one_page(struct page *page, combined_idx = buddy_idx & page_idx; higher_page = page + (combined_idx - page_idx); buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = page + (buddy_idx - combined_idx); + higher_buddy = higher_page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -654,7 +715,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) int i; int bad = 0; - trace_mm_page_free_direct(page, order); + trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); if (PageAnon(page)) @@ -720,6 +781,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) } } +#ifdef CONFIG_CMA +/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */ +void __init init_cma_reserved_pageblock(struct page *page) +{ + unsigned i = pageblock_nr_pages; + struct page *p = page; + + do { + __ClearPageReserved(p); + set_page_count(p, 0); + } while (++p, --i); + + set_page_refcounted(page); + set_pageblock_migratetype(page, MIGRATE_CMA); + __free_pages(page, pageblock_order); + totalram_pages += pageblock_nr_pages; +} +#endif /* * The order of subdivision here is critical for the IO subsystem. @@ -746,6 +825,23 @@ static inline void expand(struct zone *zone, struct page *page, high--; size >>= 1; VM_BUG_ON(bad_range(zone, &page[size])); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (high < debug_guardpage_minorder()) { + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + INIT_LIST_HEAD(&page[size].lru); + set_page_guard_flag(&page[size]); + set_page_private(&page[size], high); + /* Guard pages are not available for any usage */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + continue; + } +#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -828,11 +924,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ -static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ +static int fallbacks[MIGRATE_TYPES][4] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, +#ifdef CONFIG_CMA + [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ +#else + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, +#endif + [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ }; /* @@ -927,12 +1029,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order; --current_order) { - for (i = 0; i < MIGRATE_TYPES - 1; i++) { + for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; /* MIGRATE_RESERVE handled later if necessary */ if (migratetype == MIGRATE_RESERVE) - continue; + break; area = &(zone->free_area[current_order]); if (list_empty(&area->free_list[migratetype])) @@ -947,11 +1049,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more * aggressive about taking ownership of free pages + * + * On the other hand, never change migration + * type of MIGRATE_CMA pageblocks nor move CMA + * pages on different free lists. We don't + * want unmovable pages to be allocated from + * MIGRATE_CMA areas. */ - if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled) { - unsigned long pages; + if (!is_migrate_cma(migratetype) && + (unlikely(current_order >= pageblock_order / 2) || + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled)) { + int pages; pages = move_freepages_block(zone, page, start_migratetype); @@ -969,11 +1078,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) rmv_page_order(page); /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order) + if (current_order >= pageblock_order && + !is_migrate_cma(migratetype)) change_pageblock_range(page, current_order, start_migratetype); - expand(zone, page, order, current_order, area, migratetype); + expand(zone, page, order, current_order, area, + is_migrate_cma(migratetype) + ? migratetype : start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype); @@ -1015,17 +1127,17 @@ retry_reserve: return page; } -/* +/* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. * Returns the number of new pages which were placed at *list. */ -static int rmqueue_bulk(struct zone *zone, unsigned int order, +static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int i; - + int mt = migratetype, i; + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); @@ -1045,7 +1157,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - set_page_private(page, migratetype); + if (IS_ENABLED(CONFIG_CMA)) { + mt = get_pageblock_migratetype(page); + if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) + mt = migratetype; + } + set_page_private(page, mt); list = &page->lru; } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); @@ -1210,6 +1327,19 @@ out: local_irq_restore(flags); } +/* + * Free a list of 0-order pages + */ +void free_hot_cold_page_list(struct list_head *list, int cold) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + trace_mm_page_free_batched(page, cold); + free_hot_cold_page(page, cold); + } +} + /* * split_page takes a non-compound higher-order page, and splits it into * n (1<= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; - for (; page < endpage; page += pageblock_nr_pages) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + for (; page < endpage; page += pageblock_nr_pages) { + int mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); + } } return 1 << order; @@ -1408,7 +1542,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) static int __init fail_page_alloc_debugfs(void) { - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, @@ -1457,7 +1591,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; int o; - free_pages -= (1 << order) + 1; + free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) @@ -1756,9 +1890,17 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) return; + /* + * Walking all memory to count page types is very expensive and should + * be inhibited in non-blockable contexts. + */ + if (!(gfp_mask & __GFP_WAIT)) + filter |= SHOW_MEM_FILTER_PAGE_COUNT; + /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set @@ -1795,12 +1937,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long did_some_progress, unsigned long pages_reclaimed) { /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) return 0; + /* Always retry if specifically requested */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + /* + * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim + * making forward progress without invoking OOM. Suspend also disables + * storage devices so kswapd will not help. Bail if we are suspending. + */ + if (!did_some_progress && pm_suspended_storage()) + return 0; + /* * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER * means __GFP_NOFAIL, but that may not be true in other @@ -1819,13 +1974,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) return 1; - /* - * Don't let big-order allocations loop unless the caller - * explicitly requests that. - */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - return 0; } @@ -1886,14 +2034,20 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { struct page *page; - if (!order || compaction_deferred(preferred_zone)) + if (!order) return NULL; + if (compaction_deferred(preferred_zone)) { + *deferred_compaction = true; + return NULL; + } + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); @@ -1921,7 +2075,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); - defer_compaction(preferred_zone); + + /* + * As async compaction considers a subset of pageblocks, only + * defer if the failure was a sync compaction failure. + */ + if (sync_migration) + defer_compaction(preferred_zone); cond_resched(); } @@ -1933,23 +2093,21 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { return NULL; } #endif /* CONFIG_COMPACTION */ -/* The really slow allocator path where we enter direct reclaim */ -static inline struct page * -__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) +/* Perform direct synchronous page reclaim */ +static int +__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, + nodemask_t *nodemask) { - struct page *page = NULL; struct reclaim_state reclaim_state; - bool drained = false; + int progress; cond_resched(); @@ -1960,7 +2118,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; - *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -1968,6 +2126,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); + return progress; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page = NULL; + bool drained = false; + + *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + nodemask); if (unlikely(!(*did_some_progress))) return NULL; @@ -2084,6 +2257,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; bool sync_migration = false; + bool deferred_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2164,12 +2338,22 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; sync_migration = true; + /* + * If compaction is deferred for high-order allocations, it is because + * sync compaction recently failed. In this is the case and the caller + * has requested the system not be heavily disrupted, fail the + * allocation now instead of entering direct reclaim + */ + if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2218,7 +2402,8 @@ rebalance: /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + if (should_alloc_retry(gfp_mask, order, did_some_progress, + pages_reclaimed)) { /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; @@ -2232,8 +2417,9 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; } @@ -2257,8 +2443,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; - struct page *page; + struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); + unsigned int cpuset_mems_cookie; gfp_mask &= gfp_allowed_mask; @@ -2277,15 +2464,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); - if (!preferred_zone) { - put_mems_allowed(); - return NULL; - } + if (!preferred_zone) + goto out; /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2295,9 +2482,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); + +out: + /* + * When updating a task's mems_allowed, it is possible to race with + * parallel threads in such a way that an allocation can fail while + * the mask is being updated. If a page allocation is about to fail, + * check if the cpuset changed during allocation and if so, retry. + */ + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2328,16 +2525,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -void __pagevec_free(struct pagevec *pvec) -{ - int i = pagevec_count(pvec); - - while (--i >= 0) { - trace_mm_pagevec_free(pvec->pages[i], pvec->cold); - free_hot_cold_page(pvec->pages[i], pvec->cold); - } -} - void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { @@ -2521,13 +2708,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) bool skip_free_areas_node(unsigned int flags, int nid) { bool ret = false; + unsigned int cpuset_mems_cookie; if (!(flags & SHOW_MEM_FILTER_NODES)) goto out; - get_mems_allowed(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - put_mems_allowed(); + do { + cpuset_mems_cookie = get_mems_allowed(); + ret = !node_isset(nid, cpuset_current_mems_allowed); + } while (!put_mems_allowed(cpuset_mems_cookie)); out: return ret; } @@ -3407,25 +3596,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) if (page_to_nid(page) != zone_to_nid(zone)) continue; - /* Blocks with reserved pages will never free, skip them. */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - block_migratetype = get_pageblock_migratetype(page); - /* If this block is reserved, account for it */ - if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } + /* Only test what is necessary when the reserves are not met */ + if (reserve > 0) { + /* + * Blocks with reserved pages will never free, skip + * them. + */ + block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); + if (pageblock_is_reserved(pfn, block_end_pfn)) + continue; - /* Suitable for reserving if this block is movable */ - if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, MIGRATE_RESERVE); - move_freepages_block(zone, page, MIGRATE_RESERVE); - reserve--; - continue; + /* If this block is reserved, account for it */ + if (block_migratetype == MIGRATE_RESERVE) { + reserve--; + continue; + } + + /* Suitable for reserving if this block is movable */ + if (block_migratetype == MIGRATE_MOVABLE) { + set_pageblock_migratetype(page, + MIGRATE_RESERVE); + move_freepages_block(zone, page, + MIGRATE_RESERVE); + reserve--; + continue; + } } /* @@ -4207,10 +4404,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, * round what is now in bits to nearest long in bits, then return it in * bytes. */ -static unsigned long __init usemap_size(unsigned long zonesize) +static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) { unsigned long usemapsize; + zonesize += zone_start_pfn & (pageblock_nr_pages-1); usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; @@ -4220,40 +4418,41 @@ static unsigned long __init usemap_size(unsigned long zonesize) } static void __init setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) + struct zone *zone, + unsigned long zone_start_pfn, + unsigned long zonesize) { - unsigned long usemapsize = usemap_size(zonesize); + unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, usemapsize); } #else -static inline void setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) {} +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, + unsigned long zone_start_pfn, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -/* Return a sensible default order for the pageblock size. */ -static inline int pageblock_default_order(void) -{ - if (HPAGE_SHIFT > PAGE_SHIFT) - return HUGETLB_PAGE_ORDER; - - return MAX_ORDER-1; -} - /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(unsigned int order) +void __init set_pageblock_order(void) { + unsigned int order; + /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; + if (HPAGE_SHIFT > PAGE_SHIFT) + order = HUGETLB_PAGE_ORDER; + else + order = MAX_ORDER - 1; + /* * Assume the largest contiguous order of interest is a huge page. - * This value may be variable depending on boot parameters on IA64 + * This value may be variable depending on boot parameters on IA64 and + * powerpc. */ pageblock_order = order; } @@ -4261,15 +4460,13 @@ static inline void __init set_pageblock_order(unsigned int order) /* * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() - * and pageblock_default_order() are unused as pageblock_order is set - * at compile-time. See include/linux/pageblock-flags.h for the values of - * pageblock_order based on the kernel config + * is unused as pageblock_order is set at compile-time. See + * include/linux/pageblock-flags.h for the values of pageblock_order based on + * the kernel config */ -static inline int pageblock_default_order(unsigned int order) +void __init set_pageblock_order(void) { - return MAX_ORDER-1; } -#define set_pageblock_order(x) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ @@ -4292,7 +4489,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; pgdat_page_cgroup_init(pgdat); - + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; @@ -4357,8 +4554,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, if (!size) continue; - set_pageblock_order(pageblock_default_order()); - setup_usemap(pgdat, zone, size); + set_pageblock_order(); + setup_usemap(pgdat, zone, zone_start_pfn, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); @@ -5082,8 +5279,19 @@ static void calculate_totalreserve_pages(void) if (max > zone->present_pages) max = zone->present_pages; reserve_pages += max; + /* + * Lowmem reserves are not available to + * GFP_HIGHUSER page cache allocations and + * kswapd tries to balance zones to their high + * watermark. As a result, neither should be + * regarded as dirtyable memory, to prevent a + * situation where reclaim has to clean pages + * in order to balance the zones. + */ + zone->dirty_balance_reserve = max; } } + dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } @@ -5126,14 +5334,7 @@ static void setup_per_zone_lowmem_reserve(void) calculate_totalreserve_pages(); } -/** - * setup_per_zone_wmarks - called when min_free_kbytes changes - * or when memory is hot-{added|removed} - * - * Ensures that the watermark[min,low,high] values for each zone are set - * correctly with respect to min_free_kbytes. - */ -void setup_per_zone_wmarks(void) +static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -5180,6 +5381,11 @@ void setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + + zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); + zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); + zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -5188,6 +5394,20 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } +/** + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} + * + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. + */ +void setup_per_zone_wmarks(void) +{ + mutex_lock(&zonelists_mutex); + __setup_per_zone_wmarks(); + mutex_unlock(&zonelists_mutex); +} + /* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance @@ -5489,7 +5709,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else - pfn = pfn - zone->zone_start_pfn; + pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } @@ -5561,14 +5781,16 @@ static int __count_immobile_pages(struct zone *zone, struct page *page, int count) { unsigned long pfn, iter, found; + int mt; + /* * For avoiding noise data, lru_add_drain_all() should be called * If ZONE_MOVABLE, the zone never contains immobile pages */ if (zone_idx(zone) == ZONE_MOVABLE) return true; - - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + mt = get_pageblock_migratetype(page); + if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) return true; pfn = page_to_pfn(page); @@ -5608,6 +5830,17 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) bool is_pageblock_removable_nolock(struct page *page) { struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + */ + if (!zone || zone->zone_start_pfn > pfn || + zone->zone_start_pfn + zone->spanned_pages <= pfn) + return false; + return __count_immobile_pages(zone, page, 0); } @@ -5667,7 +5900,7 @@ out: return ret; } -void unset_migratetype_isolate(struct page *page) +void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags; @@ -5675,12 +5908,264 @@ void unset_migratetype_isolate(struct page *page) spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, migratetype); + move_freepages_block(zone, page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } +#ifdef CONFIG_CMA + +static unsigned long pfn_max_align_down(unsigned long pfn) +{ + return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) - 1); +} + +static unsigned long pfn_max_align_up(unsigned long pfn) +{ + return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages)); +} + +static struct page * +__alloc_contig_migrate_alloc(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} + +/* [start, end) must belong to a single zone. */ +static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +{ + /* This function is based on compact_zone() from compaction.c. */ + + unsigned long pfn = start; + unsigned int tries = 0; + int ret = 0; + + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + + migrate_prep_local(); + + while (pfn < end || !list_empty(&cc.migratepages)) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (list_empty(&cc.migratepages)) { + cc.nr_migratepages = 0; + pfn = isolate_migratepages_range(cc.zone, &cc, + pfn, end); + if (!pfn) { + ret = -EINTR; + break; + } + tries = 0; + } else if (++tries == 5) { + ret = ret < 0 ? ret : -EBUSY; + break; + } + + ret = migrate_pages(&cc.migratepages, + __alloc_contig_migrate_alloc, + 0, false, MIGRATE_SYNC); + } + + putback_lru_pages(&cc.migratepages); + return ret > 0 ? 0 : ret; +} + +/* + * Update zone's cma pages counter used for watermark level calculation. + */ +static inline void __update_cma_watermarks(struct zone *zone, int count) +{ + unsigned long flags; + spin_lock_irqsave(&zone->lock, flags); + zone->min_cma_pages += count; + spin_unlock_irqrestore(&zone->lock, flags); + setup_per_zone_wmarks(); +} + +/* + * Trigger memory pressure bump to reclaim some pages in order to be able to + * allocate 'count' pages in single page units. Does similar work as + *__alloc_pages_slowpath() function. + */ +static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zonelist *zonelist = node_zonelist(0, gfp_mask); + int did_some_progress = 0; + int order = 1; + + /* + * Increase level of watermarks to force kswapd do his job + * to stabilise at new watermark level. + */ + __update_cma_watermarks(zone, count); + + /* Obey watermarks as if the page was being allocated */ + while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); + + did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + NULL); + if (!did_some_progress) { + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order, NULL); + } + } + + /* Restore original watermark levels. */ + __update_cma_watermarks(zone, -count); + + return count; +} + +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @migratetype: migratetype of the underlaying pageblocks (either + * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks + * in range must have the same migratetype and it must + * be either of the two. + * + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES + * aligned, however it's the caller's responsibility to guarantee that + * we are the only thread that changes migrate type of pageblocks the + * pages fall in. + * + * The PFN range must belong to a single zone. + * + * Returns zero on success or negative error code. On success all + * pages which PFN is in [start, end) are allocated for the caller and + * need to be freed with free_contig_range(). + */ +int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype) +{ + struct zone *zone = page_zone(pfn_to_page(start)); + unsigned long outer_start, outer_end; + int ret = 0, order; + + /* + * What we do here is we mark all pageblocks in range as + * MIGRATE_ISOLATE. Because pageblock and max order pages may + * have different sizes, and due to the way page allocator + * work, we align the range to biggest of the two pages so + * that page allocator won't try to merge buddies from + * different pageblocks and change MIGRATE_ISOLATE to some + * other migration type. + * + * Once the pageblocks are marked as MIGRATE_ISOLATE, we + * migrate the pages from an unaligned range (ie. pages that + * we are interested in). This will put all the pages in + * range back to page allocator as MIGRATE_ISOLATE. + * + * When this is done, we take the pages in range from page + * allocator removing them from the buddy system. This way + * page allocator will never consider using them. + * + * This lets us mark the pageblocks back as + * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the + * aligned range but not in the unaligned, original range are + * put back to page allocator so that buddy can use them. + */ + + ret = start_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end), migratetype); + if (ret) + goto done; + + ret = __alloc_contig_migrate_range(start, end); + if (ret) + goto done; + + /* + * Pages from [start, end) are within a MAX_ORDER_NR_PAGES + * aligned blocks that are marked as MIGRATE_ISOLATE. What's + * more, all pages in [start, end) are free in page allocator. + * What we are going to do is to allocate all pages from + * [start, end) (that is remove them from page allocator). + * + * The only problem is that pages at the beginning and at the + * end of interesting range may be not aligned with pages that + * page allocator holds, ie. they can be part of higher order + * pages. Because of this, we reserve the bigger range and + * once this is done free the pages we are not interested in. + * + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. + */ + + lru_add_drain_all(); + drain_all_pages(); + + order = 0; + outer_start = start; + while (!PageBuddy(pfn_to_page(outer_start))) { + if (++order >= MAX_ORDER) { + ret = -EBUSY; + goto done; + } + outer_start &= ~0UL << order; + } + + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end)) { + pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", + outer_start, end); + ret = -EBUSY; + goto done; + } + + /* + * Reclaim enough pages to make sure that contiguous allocation + * will not starve the system. + */ + __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); + + /* Grab isolated pages from freelists. */ + outer_end = isolate_freepages_range(outer_start, end); + if (!outer_end) { + ret = -EBUSY; + goto done; + } + + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + +done: + undo_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end), migratetype); + return ret; +} + +void free_contig_range(unsigned long pfn, unsigned nr_pages) +{ + for (; nr_pages--; ++pfn) + __free_page(pfn_to_page(pfn)); +} +#endif + #ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be isolated before calling this. @@ -5720,6 +6205,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) zone->free_area[order].nr_free--; __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages -= 1 << order; +#endif for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order);