#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
+#include <linux/kthread.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
- unsigned long nr_mapped; /* From page_state */
-
/* This context's GFP mask */
gfp_t gfp_mask;
int swap_cluster_max;
int swappiness;
+
+ int all_unreclaimable;
};
/*
break;
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
- mod_page_state(slabs_scanned, this_scan);
+ count_vm_events(SLABS_SCANNED, this_scan);
total_scan -= this_scan;
cond_resched();
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
-
+ inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
int remove_mapping(struct address_space *mapping, struct page *page)
{
- if (!mapping)
- return 0; /* truncate got there first */
+ BUG_ON(!PageLocked(page));
+ BUG_ON(mapping != page_mapping(page));
write_lock_irq(&mapping->tree_lock);
-
/*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
+ * The non racy check for a busy page.
+ *
+ * Must be careful with the order of the tests. When someone has
+ * a ref to the page, it may be possible that they dirty it then
+ * drop the reference. So if PageDirty is tested before page_count
+ * here, then the following race may occur:
+ *
+ * get_user_pages(&page);
+ * [user mapping goes away]
+ * write_to(page);
+ * !PageDirty(page) [good]
+ * SetPageDirty(page);
+ * put_page(page);
+ * !page_count(page) [good, discard it]
+ *
+ * [oops, our write_to data is lost]
+ *
+ * Reversing the order of the tests ensures such a situation cannot
+ * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+ * load is not satisfied before that of page->_count.
+ *
+ * Note that if SetPageDirty is always performed via set_page_dirty,
+ * and thus under tree_lock, then this ordering is not required.
*/
if (unlikely(page_count(page) != 2))
goto cannot_free;
if (TestSetPageLocked(page))
goto keep;
- BUG_ON(PageActive(page));
+ VM_BUG_ON(PageActive(page));
sc->nr_scanned++;
goto free_it;
}
- if (!remove_mapping(mapping, page))
+ if (!mapping || !remove_mapping(mapping, page))
goto keep_locked;
free_it:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
- BUG_ON(PageLRU(page));
+ VM_BUG_ON(PageLRU(page));
}
list_splice(&ret_pages, page_list);
if (pagevec_count(&freed_pvec))
__pagevec_release_nonlru(&freed_pvec);
- mod_page_state(pgactivate, pgactivate);
+ count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- BUG_ON(!PageLRU(page));
+ VM_BUG_ON(!PageLRU(page));
list_del(&page->lru);
target = src;
nr_reclaimed += nr_freed;
local_irq_disable();
if (current_is_kswapd()) {
- __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- __mod_page_state(kswapd_steal, nr_freed);
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+ __count_vm_events(KSWAPD_STEAL, nr_freed);
} else
- __mod_page_state_zone(zone, pgscan_direct, nr_scan);
- __mod_page_state_zone(zone, pgsteal, nr_freed);
+ __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+ __count_vm_events(PGACTIVATE, nr_freed);
if (nr_taken == 0)
goto done;
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
- BUG_ON(PageLRU(page));
+ VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
list_del(&page->lru);
if (PageActive(page))
return nr_reclaimed;
}
+static inline int zone_is_near_oom(struct zone *zone)
+{
+ return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+}
+
/*
* This moves pages from the active list to the inactive list.
*
long distress;
long swap_tendency;
+ if (zone_is_near_oom(zone))
+ goto force_reclaim_mapped;
+
/*
* `distress' is a measure of how much trouble we're having
* reclaiming pages. 0 -> no problems. 100 -> great trouble.
* how much memory
* is mapped.
*/
- mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
+ mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES)) * 100) /
+ vm_total_pages;
/*
* Now decide how much we really want to unmap some pages. The
* memory onto the inactive list.
*/
if (swap_tendency >= 100)
+force_reclaim_mapped:
reclaim_mapped = 1;
}
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
- BUG_ON(PageLRU(page));
+ VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
- BUG_ON(!PageActive(page));
+ VM_BUG_ON(!PageActive(page));
ClearPageActive(page);
list_move(&page->lru, &zone->inactive_list);
while (!list_empty(&l_active)) {
page = lru_to_page(&l_active);
prefetchw_prev_lru_page(page, &l_active, flags);
- BUG_ON(PageLRU(page));
+ VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
- BUG_ON(!PageActive(page));
+ VM_BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
}
}
zone->nr_active += pgmoved;
- spin_unlock(&zone->lru_lock);
- __mod_page_state_zone(zone, pgrefill, pgscanned);
- __mod_page_state(pgdeactivate, pgdeactivate);
- local_irq_enable();
+ __count_zone_vm_events(PGREFILL, zone, pgscanned);
+ __count_vm_events(PGDEACTIVATE, pgdeactivate);
+ spin_unlock_irq(&zone->lru_lock);
pagevec_release(&pvec);
}
unsigned long nr_reclaimed = 0;
int i;
+ sc->all_unreclaimable = 1;
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ sc->all_unreclaimable = 0;
+
nr_reclaimed += shrink_zone(priority, zone, sc);
}
return nr_reclaimed;
.swappiness = vm_swappiness,
};
- inc_page_state(allocstall);
+ count_vm_event(ALLOCSTALL);
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
}
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
if (!priority)
disable_swap_token();
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
+ /* top priority shrink_caches still had more to do? don't OOM, then */
+ if (!sc.all_unreclaimable)
+ ret = 1;
out:
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];
total_scanned = 0;
nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
- sc.nr_mapped = read_page_state(nr_mapped);
-
- inc_page_state(pageoutrun);
+ count_vm_event(PAGEOUTRUN);
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (zone->all_unreclaimable)
continue;
if (nr_slab == 0 && zone->pages_scanned >=
- (zone->nr_active + zone->nr_inactive) * 4)
+ (zone->nr_active + zone->nr_inactive) * 6)
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
};
cpumask_t cpumask;
- daemonize("kswapd%d", pgdat->node_id);
cpumask = node_to_cpumask(pgdat->node_id);
if (!cpus_empty(cpumask))
set_cpus_allowed(tsk, cpumask);
for_each_zone(zone)
lru_pages += zone->nr_active + zone->nr_inactive;
- nr_slab = read_page_state(nr_slab);
+ nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
reclaim_state.reclaimed_slab = 0;
for (prio = DEF_PRIORITY; prio >= 0; prio--) {
unsigned long nr_to_scan = nr_pages - ret;
- sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
-
ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
if (ret >= nr_pages)
goto out;
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
pg_data_t *pgdat;
}
#endif /* CONFIG_HOTPLUG_CPU */
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+ */
+int kswapd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kswapd)
+ return 0;
+
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+ BUG_ON(system_state == SYSTEM_BOOTING);
+ printk("Failed to start kswapd on node %d\n",nid);
+ ret = -1;
+ }
+ return ret;
+}
+
static int __init kswapd_init(void)
{
- pg_data_t *pgdat;
+ int nid;
swap_setup();
- for_each_online_pgdat(pgdat) {
- pid_t pid;
-
- pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
- BUG_ON(pid < 0);
- read_lock(&tasklist_lock);
- pgdat->kswapd = find_task_by_pid(pid);
- read_unlock(&tasklist_lock);
- }
+ for_each_online_node(nid)
+ kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
}
*
* If non-zero call zone_reclaim when the number of free pages falls below
* the watermarks.
- *
- * In the future we may add flags to the mode. However, the page allocator
- * should only have to check that zone_reclaim_mode != 0 before calling
- * zone_reclaim().
*/
int zone_reclaim_mode __read_mostly;
#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
-#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
-
-/*
- * Mininum time between zone reclaim scans
- */
-int zone_reclaim_interval __read_mostly = 30*HZ;
/*
* Priority for ZONE_RECLAIM. This determines the fraction of pages
*/
#define ZONE_RECLAIM_PRIORITY 4
+/*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
/*
* Try to free up some pages from this zone through reclaim.
*/
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
- .nr_mapped = read_page_state(nr_mapped),
.swap_cluster_max = max_t(unsigned long, nr_pages,
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
};
+ unsigned long slab_reclaimable;
disable_swap_token();
cond_resched();
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- /*
- * Free memory by calling shrink zone with increasing priorities
- * until we have enough memory freed.
- */
- priority = ZONE_RECLAIM_PRIORITY;
- do {
- nr_reclaimed += shrink_zone(priority, zone, &sc);
- priority--;
- } while (priority >= 0 && nr_reclaimed < nr_pages);
+ if (zone_page_state(zone, NR_FILE_PAGES) -
+ zone_page_state(zone, NR_FILE_MAPPED) >
+ zone->min_unmapped_pages) {
+ /*
+ * Free memory by calling shrink zone with increasing
+ * priorities until we have enough memory freed.
+ */
+ priority = ZONE_RECLAIM_PRIORITY;
+ do {
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
+ priority--;
+ } while (priority >= 0 && nr_reclaimed < nr_pages);
+ }
- if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+ slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (slab_reclaimable > zone->min_slab_pages) {
/*
* shrink_slab() does not currently allow us to determine how
- * many pages were freed in this zone. So we just shake the slab
- * a bit and then go off node for this particular allocation
- * despite possibly having freed enough memory to allocate in
- * this zone. If we freed local memory then the next
- * allocations will be local again.
+ * many pages were freed in this zone. So we take the current
+ * number of slab pages and shake the slab until it is reduced
+ * by the same nr_pages that we used for reclaiming unmapped
+ * pages.
*
- * shrink_slab will free memory on all zones and may take
- * a long time.
+ * Note that shrink_slab will free memory on all zones and may
+ * take a long time.
*/
- shrink_slab(sc.nr_scanned, gfp_mask, order);
- }
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+ slab_reclaimable - nr_pages)
+ ;
- p->reclaim_state = NULL;
- current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-
- if (nr_reclaimed == 0) {
/*
- * We were unable to reclaim enough pages to stay on node. We
- * now allow off node accesses for a certain time period before
- * trying again to reclaim pages from the local zone.
+ * Update nr_reclaimed by the number of slab pages we
+ * reclaimed from this zone.
*/
- zone->last_unsuccessful_zone_reclaim = jiffies;
+ nr_reclaimed += slab_reclaimable -
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE);
}
+ p->reclaim_state = NULL;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
return nr_reclaimed >= nr_pages;
}
int node_id;
/*
- * Do not reclaim if there was a recent unsuccessful attempt at zone
- * reclaim. In that case we let allocations go off node for the
- * zone_reclaim_interval. Otherwise we would scan for each off-node
- * page allocation.
+ * Zone reclaim reclaims unmapped file backed pages and
+ * slab pages if we are over the defined limits.
+ *
+ * A small portion of unmapped file backed pages is needed for
+ * file I/O otherwise pages read by file I/O will be immediately
+ * thrown out if the zone is overallocated. So we do not reclaim
+ * if less than a specified percentage of the zone is used by
+ * unmapped file backed pages.
*/
- if (time_before(jiffies,
- zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
- return 0;
+ if (zone_page_state(zone, NR_FILE_PAGES) -
+ zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+ && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+ <= zone->min_slab_pages)
+ return 0;
/*
* Avoid concurrent zone reclaims, do not reclaim in a zone that does
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
- node_id = zone->zone_pgdat->node_id;
+ node_id = zone_to_nid(zone);
mask = node_to_cpumask(node_id);
if (!cpus_empty(mask) && node_id != numa_node_id())
return 0;