*/
void register_shrinker(struct shrinker *shrinker)
{
- shrinker->nr = 0;
+ atomic_long_set(&shrinker->nr_in_batch, 0);
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
- unsigned long total_scan;
- unsigned long max_pass;
+ long total_scan;
+ long max_pass;
int shrink_ret = 0;
long nr;
long new_nr;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
+ max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ if (max_pass <= 0)
+ continue;
+
/*
* copy the current shrinker scan count into a local variable
* and zero it so that other concurrent shrinker invocations
* don't also do this scanning work.
*/
- do {
- nr = shrinker->nr;
- } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+ nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
total_scan = nr;
- max_pass = do_shrinker_shrink(shrinker, shrink, 0);
delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
* manner that handles concurrent updates. If we exhausted the
* scan, there is no need to do an update.
*/
- do {
- nr = shrinker->nr;
- new_nr = total_scan + nr;
- if (total_scan <= 0)
- break;
- } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+ if (total_scan > 0)
+ new_nr = atomic_long_add_return(total_scan,
+ &shrinker->nr_in_batch);
+ else
+ new_nr = atomic_long_read(&shrinker->nr_in_batch);
trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
return PAGE_ACTIVATE;
}
- /*
- * Wait on writeback if requested to. This happens when
- * direct reclaiming a large contiguous area and the
- * first attempt to free a range of pages fails.
- */
- if (PageWriteback(page) &&
- (sc->reclaim_mode & RECLAIM_MODE_SYNC))
- wait_on_page_writeback(page);
-
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
lru = LRU_UNEVICTABLE;
add_page_to_unevictable_list(page);
/*
- * When racing with an mlock clearing (page is
- * unlocked), make sure that if the other thread does
- * not observe our setting of PG_lru and fails
- * isolation, we see PG_mlocked cleared below and move
+ * When racing with an mlock or AS_UNEVICTABLE clearing
+ * (page is unlocked) make sure that if the other thread
+ * does not observe our setting of PG_lru and fails
+ * isolation/check_move_unevictable_pages,
+ * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
* the page back to the evictable list.
*
- * The other side is TestClearPageMlocked().
+ * The other side is TestClearPageMlocked() or shmem_lock().
*/
smp_mb();
}
return PAGEREF_RECLAIM;
if (referenced_ptes) {
- if (PageAnon(page))
+ if (PageSwapBacked(page))
return PAGEREF_ACTIVATE;
/*
* All mapped pages start out with page table
*/
SetPageReferenced(page);
- if (referenced_page)
+ if (referenced_page || referenced_ptes > 1)
+ return PAGEREF_ACTIVATE;
+
+ /*
+ * Activate file-backed executable pages after first usage.
+ */
+ if (vm_flags & VM_EXEC)
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
*/
static unsigned long shrink_page_list(struct list_head *page_list,
struct zone *zone,
- struct scan_control *sc)
+ struct scan_control *sc,
+ int priority,
+ unsigned long *ret_nr_dirty,
+ unsigned long *ret_nr_writeback)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
unsigned long nr_dirty = 0;
unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
+ unsigned long nr_writeback = 0;
cond_resched();
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
if (PageWriteback(page)) {
+ nr_writeback++;
/*
- * Synchronous reclaim is performed in two passes,
- * first an asynchronous pass over the list to
- * start parallel writeback, and a second synchronous
- * pass to wait for the IO to complete. Wait here
- * for any page for which writeback has already
- * started.
+ * Synchronous reclaim cannot queue pages for
+ * writeback due to the possibility of stack overflow
+ * but if it encounters a page under writeback, wait
+ * for the IO to complete.
*/
if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
may_enter_fs)
if (PageDirty(page)) {
nr_dirty++;
+ /*
+ * Only kswapd can writeback filesystem pages to
+ * avoid risk of stack overflow but do not writeback
+ * unless under significant pressure.
+ */
+ if (page_is_file_cache(page) &&
+ (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+ /*
+ * Immediately reclaim when written back.
+ * Similar in principal to deactivate_page()
+ * except we already have the page isolated
+ * and know it's dirty
+ */
+ inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+ SetPageReclaim(page);
+
+ goto keep_locked;
+ }
+
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
+ *ret_nr_dirty += nr_dirty;
+ *ret_nr_writeback += nr_writeback;
return nr_reclaimed;
}
*
* returns 0 on success, -ve errno on failure.
*/
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
{
+ bool all_lru_mode;
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
+ all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+ (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
/*
* When checking the active state, we need to be sure we are
* dealing with comparible boolean values. Take the logical not
* of each.
*/
- if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+ if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
return ret;
- if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+ if (!all_lru_mode && !!page_is_file_cache(page) != file)
return ret;
/*
ret = -EBUSY;
+ /*
+ * To minimise LRU disruption, the caller can indicate that it only
+ * wants to isolate pages it will be able to operate on without
+ * blocking - clean pages for the most part.
+ *
+ * ISOLATE_CLEAN means that only clean pages should be isolated. This
+ * is used by reclaim when it is cannot write to backing storage
+ *
+ * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
+ * that it is possible to migrate without blocking
+ */
+ if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+ /* All the caller can do on PageWriteback is block */
+ if (PageWriteback(page))
+ return ret;
+
+ if (PageDirty(page)) {
+ struct address_space *mapping;
+
+ /* ISOLATE_CLEAN means only clean pages */
+ if (mode & ISOLATE_CLEAN)
+ return ret;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migratepage callback are possible to migrate
+ * without blocking
+ */
+ mapping = page_mapping(page);
+ if (mapping && !mapping->a_ops->migratepage)
+ return ret;
+ }
+ }
+
+ if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+ return ret;
+
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
- unsigned long *scanned, int order, int mode, int file)
+ unsigned long *scanned, int order, isolate_mode_t mode,
+ int file)
{
unsigned long nr_taken = 0;
unsigned long nr_lumpy_taken = 0;
* anon page which don't already have a swap slot is
* pointless.
*/
- if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+ if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
!PageSwapCache(cursor_page))
break;
static unsigned long isolate_pages_global(unsigned long nr,
struct list_head *dst,
unsigned long *scanned, int order,
- int mode, struct zone *z,
- int active, int file)
+ isolate_mode_t mode,
+ struct zone *z, int active, int file)
{
int lru = LRU_BASE;
if (active)
}
/*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
*
* If we are direct reclaiming for contiguous pages and we do not reclaim
* everything in the list, try again and wait for writeback IO to complete.
if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
return false;
- /* If we have relaimed everything on the isolated list, no stall */
+ /* If we have reclaimed everything on the isolated list, no stall */
if (nr_freed == nr_taken)
return false;
unsigned long nr_taken;
unsigned long nr_anon;
unsigned long nr_file;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_writeback = 0;
+ isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
set_reclaim_mode(priority, sc, false);
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+ reclaim_mode |= ISOLATE_ACTIVE;
+
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, 0, file);
+ nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+ &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone,
__count_zone_vm_events(PGSCAN_DIRECT, zone,
nr_scanned);
} else {
- nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, sc->mem_cgroup,
- 0, file);
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+ &nr_scanned, sc->order, reclaim_mode, zone,
+ sc->mem_cgroup, 0, file);
/*
* mem_cgroup_isolate_pages() keeps track of
* scanned pages on its own.
spin_unlock_irq(&zone->lru_lock);
- nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+ &nr_dirty, &nr_writeback);
/* Check if we should syncronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
set_reclaim_mode(priority, sc, true);
- nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+ priority, &nr_dirty, &nr_writeback);
}
local_irq_disable();
putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+ /*
+ * If reclaim is isolating dirty pages under writeback, it implies
+ * that the long-lived page allocation rate is exceeding the page
+ * laundering rate. Either the global limits are not being effective
+ * at throttling processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing device. The
+ * only option is to throttle from reclaim context which is not ideal
+ * as there is no guarantee the dirtying process is throttled in the
+ * same way balance_dirty_pages() manages.
+ *
+ * This scales the number of dirty pages that must be under writeback
+ * before throttling depending on priority. It is a simple backoff
+ * function that has the most effect in the range DEF_PRIORITY to
+ * DEF_PRIORITY-2 which is the priority reclaim is considered to be
+ * in trouble and reclaim is considered to be in trouble.
+ *
+ * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
+ * DEF_PRIORITY-1 50% must be PageWriteback
+ * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
+ * ...
+ * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+ * isolated page is PageWriteback
+ */
+ if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+ wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
struct page *page;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
unsigned long nr_rotated = 0;
+ isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(nr_pages, &l_hold,
&pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
+ reclaim_mode, zone,
1, file);
zone->pages_scanned += pgscanned;
} else {
nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
&pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
+ reclaim_mode, zone,
sc->mem_cgroup, 1, file);
/*
* mem_cgroup_isolate_pages() keeps track of
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
- low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+ low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
return low;
}
#else
if (scanning_global_lru(sc))
low = inactive_file_is_low_global(zone);
else
- low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+ low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
return low;
}
enum lru_list l;
int noswap = 0;
bool force_scan = false;
- unsigned long nr_force_scan[2];
- /* kswapd does zone balancing and needs to scan this zone */
- if (scanning_global_lru(sc) && current_is_kswapd())
+ /*
+ * If the zone or memcg is small, nr[l] can be 0. This
+ * results in no scanning on this priority and a potential
+ * priority drop. Global direct reclaim can go to the next
+ * zone and tends to have no problems. Global kswapd is for
+ * zone balancing and it needs to scan a minimum amount. When
+ * reclaiming for a memcg, a priority drop can cause high
+ * latencies, so it's better to scan a minimum amount there as
+ * well.
+ */
+ if (scanning_global_lru(sc) && current_is_kswapd() &&
+ zone->all_unreclaimable)
force_scan = true;
- /* memcg may have small limit and need to avoid priority drop */
if (!scanning_global_lru(sc))
force_scan = true;
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
- nr_force_scan[0] = 0;
- nr_force_scan[1] = SWAP_CLUSTER_MAX;
goto out;
}
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
- nr_force_scan[0] = SWAP_CLUSTER_MAX;
- nr_force_scan[1] = 0;
goto out;
}
}
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
*/
- ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
+ ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
ap /= reclaim_stat->recent_rotated[0] + 1;
- fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
+ fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
spin_unlock_irq(&zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
- if (force_scan) {
- unsigned long scan = SWAP_CLUSTER_MAX;
- nr_force_scan[0] = div64_u64(scan * ap, denominator);
- nr_force_scan[1] = div64_u64(scan * fp, denominator);
- }
out:
for_each_evictable_lru(l) {
int file = is_file_lru(l);
unsigned long scan;
scan = zone_nr_lru_pages(zone, sc, l);
- if (priority || noswap) {
+ if (priority || noswap || !vmscan_swappiness(sc)) {
scan >>= priority;
+ if (!scan && force_scan)
+ scan = SWAP_CLUSTER_MAX;
scan = div64_u64(scan * fraction[file], denominator);
}
-
- /*
- * If zone is small or memcg is small, nr[l] can be 0.
- * This results no-scan on this priority and priority drop down.
- * For global direct reclaim, it can visit next zone and tend
- * not to have problems. For global kswapd, it's for zone
- * balancing and it need to scan a small amounts. When using
- * memcg, priority drop can cause big latency. So, it's better
- * to scan small amount. See may_noscan above.
- */
- if (!scan && force_scan)
- scan = nr_force_scan[file];
nr[l] = scan;
}
}
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ if (nr_swap_pages > 0)
+ inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
enum lru_list l;
unsigned long nr_reclaimed, nr_scanned;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ struct blk_plug plug;
restart:
nr_reclaimed = 0;
nr_scanned = sc->nr_scanned;
get_scan_count(zone, sc, nr, priority);
+ blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
+ blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
/*
throttle_vm_writeout(sc->gfp_mask);
}
+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+ unsigned long balance_gap, watermark;
+ bool watermark_ok;
+
+ /* Do not consider compaction for orders reclaim is meant to satisfy */
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+
+ /*
+ * Compaction takes time to run and there are potentially other
+ * callers using the pages just freed. Continue reclaiming until
+ * there is a buffer of free pages available to give compaction
+ * a reasonable chance of completing and allocating the page
+ */
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+ /*
+ * If compaction is deferred, reclaim up to a point where
+ * compaction will have a chance of success when re-enabled
+ */
+ if (compaction_deferred(zone))
+ return watermark_ok;
+
+ /* If compaction is not ready to start, keep reclaiming */
+ if (!compaction_suitable(zone, sc->order))
+ return false;
+
+ return watermark_ok;
+}
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
*/
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ bool aborted_reclaim = false;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ if (COMPACTION_BUILD) {
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticable problem, like transparent huge page
+ * allocations.
+ */
+ if (compaction_ready(zone, sc)) {
+ aborted_reclaim = true;
+ continue;
+ }
+ }
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
shrink_zone(priority, zone, sc);
}
+
+ return aborted_reclaim;
}
static bool zone_reclaimable(struct zone *zone)
struct zoneref *z;
struct zone *zone;
unsigned long writeback_threshold;
+ bool aborted_reclaim;
- get_mems_allowed();
delayacct_freepages_start();
if (scanning_global_lru(sc))
sc->nr_scanned = 0;
if (!priority)
disable_swap_token(sc->mem_cgroup);
- shrink_zones(priority, zonelist, sc);
+ aborted_reclaim = shrink_zones(priority, zonelist, sc);
+
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {
- wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+ WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
out:
delayacct_freepages_end();
- put_mems_allowed();
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
if (oom_killer_disabled)
return 0;
+ /* Aborted reclaim to try compaction? don't OOM, then */
+ if (aborted_reclaim)
+ return 1;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
/* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
+ if (i <= *classzone_idx)
+ balanced += zone->present_pages;
}
}
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- schedule();
+
+ if (!kthread_should_stop())
+ schedule();
+
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
static int kswapd(void *p)
{
unsigned long order, new_order;
+ unsigned balanced_order;
int classzone_idx, new_classzone_idx;
+ int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
set_freezable();
order = new_order = 0;
+ balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+ balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
int ret;
* new request of a similar or harder type will succeed soon
* so consider going to sleep on the basis we reclaimed at
*/
- if (classzone_idx >= new_classzone_idx && order == new_order) {
+ if (balanced_classzone_idx >= new_classzone_idx &&
+ balanced_order == new_order) {
new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, order, classzone_idx);
+ kswapd_try_to_sleep(pgdat, balanced_order,
+ balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
+ new_order = order;
+ new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- order = balance_pgdat(pgdat, order, &classzone_idx);
+ balanced_classzone_idx = classzone_idx;
+ balanced_order = balance_pgdat(pgdat, order,
+ &balanced_classzone_idx);
}
}
+
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
+ current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return 0;
}
}
/*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold lock_memory_hotplug().
*/
void kswapd_stop(int nid)
{
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
- if (kswapd)
+ if (kswapd) {
kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
+ }
}
static int __init kswapd_init(void)
return 1;
}
+#ifdef CONFIG_SHMEM
/**
- * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
- * @page: page to check evictability and move to appropriate lru list
- * @zone: zone page is in
+ * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
+ * @pages: array of pages to check
+ * @nr_pages: number of pages to check
*
- * Checks a page for evictability and moves the page to the appropriate
- * zone lru list.
+ * Checks pages for evictability and moves them to the appropriate lru list.
*
- * Restrictions: zone->lru_lock must be held, page must be on LRU and must
- * have PageUnevictable set.
+ * This function is only used for SysV IPC SHM_UNLOCK.
*/
-static void check_move_unevictable_page(struct page *page, struct zone *zone)
+void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
- VM_BUG_ON(PageActive(page));
+ struct zone *zone = NULL;
+ int pgscanned = 0;
+ int pgrescued = 0;
+ int i;
-retry:
- ClearPageUnevictable(page);
- if (page_evictable(page, NULL)) {
- enum lru_list l = page_lru_base_type(page);
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pages[i];
+ struct zone *pagezone;
- __dec_zone_state(zone, NR_UNEVICTABLE);
- list_move(&page->lru, &zone->lru[l].list);
- mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
- __inc_zone_state(zone, NR_INACTIVE_ANON + l);
- __count_vm_event(UNEVICTABLE_PGRESCUED);
- } else {
- /*
- * rotate unevictable list
- */
- SetPageUnevictable(page);
- list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
- mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
- if (page_evictable(page, NULL))
- goto retry;
- }
-}
+ pgscanned++;
+ pagezone = page_zone(page);
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ zone = pagezone;
+ spin_lock_irq(&zone->lru_lock);
+ }
-/**
- * scan_mapping_unevictable_pages - scan an address space for evictable pages
- * @mapping: struct address_space to scan for evictable pages
- *
- * Scan all pages in mapping. Check unevictable pages for
- * evictability and move them to the appropriate zone lru list.
- */
-void scan_mapping_unevictable_pages(struct address_space *mapping)
-{
- pgoff_t next = 0;
- pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- struct zone *zone;
- struct pagevec pvec;
+ if (!PageLRU(page) || !PageUnevictable(page))
+ continue;
- if (mapping->nrpages == 0)
- return;
+ if (page_evictable(page, NULL)) {
+ enum lru_list lru = page_lru_base_type(page);
- pagevec_init(&pvec, 0);
- while (next < end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
- int i;
- int pg_scanned = 0;
-
- zone = NULL;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
- pgoff_t page_index = page->index;
- struct zone *pagezone = page_zone(page);
-
- pg_scanned++;
- if (page_index > next)
- next = page_index;
- next++;
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
-
- if (PageLRU(page) && PageUnevictable(page))
- check_move_unevictable_page(page, zone);
+ VM_BUG_ON(PageActive(page));
+ ClearPageUnevictable(page);
+ __dec_zone_state(zone, NR_UNEVICTABLE);
+ list_move(&page->lru, &zone->lru[lru].list);
+ mem_cgroup_move_lists(page, LRU_UNEVICTABLE, lru);
+ __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+ pgrescued++;
}
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
-
- count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
}
-}
-
-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable. Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them. Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
-{
- struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
- unsigned long scan;
- unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
- while (nr_to_scan > 0) {
- unsigned long batch_size = min(nr_to_scan,
- SCAN_UNEVICTABLE_BATCH_SIZE);
-
- spin_lock_irq(&zone->lru_lock);
- for (scan = 0; scan < batch_size; scan++) {
- struct page *page = lru_to_page(l_unevictable);
-
- if (!trylock_page(page))
- continue;
-
- prefetchw_prev_lru_page(page, l_unevictable, flags);
-
- if (likely(PageLRU(page) && PageUnevictable(page)))
- check_move_unevictable_page(page, zone);
-
- unlock_page(page);
- }
+ if (zone) {
+ __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+ __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
spin_unlock_irq(&zone->lru_lock);
-
- nr_to_scan -= batch_size;
}
}
+#endif /* CONFIG_SHMEM */
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer: scan all zones' unevictable LRU lists to check for
- * pages that have become evictable. Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system. As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
+static void warn_scan_unevictable_pages(void)
{
- struct zone *zone;
-
- for_each_zone(zone) {
- scan_zone_unevictable_pages(zone);
- }
+ printk_once(KERN_WARNING
+ "The scan_unevictable_pages sysctl/node-interface has been "
+ "disabled for lack of a legitimate use case. If you have "
+ "one, please send an email to linux-mm@kvack.org.\n");
}
/*
void __user *buffer,
size_t *length, loff_t *ppos)
{
+ warn_scan_unevictable_pages();
proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
- if (write && *(unsigned long *)table->data)
- scan_all_zones_unevictable_pages();
-
scan_unevictable_pages = 0;
return 0;
}
struct sysdev_attribute *attr,
char *buf)
{
+ warn_scan_unevictable_pages();
return sprintf(buf, "0\n"); /* always zero; should fit... */
}
struct sysdev_attribute *attr,
const char *buf, size_t count)
{
- struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
- struct zone *zone;
- unsigned long res;
- unsigned long req = strict_strtoul(buf, 10, &res);
-
- if (!req)
- return 1; /* zero is no-op */
-
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
- continue;
- scan_zone_unevictable_pages(zone);
- }
+ warn_scan_unevictable_pages();
return 1;
}