From: Christoph Lameter Date: Wed, 9 May 2007 09:35:14 +0000 (-0700) Subject: Move remote node draining out of slab allocators X-Git-Tag: v2.6.22-rc1~173 X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4037d452202e34214e8a939fa5621b2b3bbb45b7;p=pandora-kernel.git Move remote node draining out of slab allocators Currently the slab allocators contain callbacks into the page allocator to perform the draining of pagesets on remote nodes. This requires SLUB to have a whole subsystem in order to be compatible with SLAB. Moving node draining out of the slab allocators avoids a section of code in SLUB. Move the node draining so that it is done when the vm statistics are updated. At that point we are already touching all the cachelines with the pagesets of a processor. Add an expire counter there. If we have to update per zone or global vm statistics then assume that the pageset will require subsequent draining. The expire counter will be decremented on each vm stats update pass until it reaches zero. Then we will drain one batch from the pageset. The draining will cause vm counter updates which will then cause another expiration until the pcp is empty. So we will drain a batch every 3 seconds. Note that remote node draining is a somewhat esoteric feature that is required on large NUMA systems because otherwise significant portions of system memory can become trapped in pcp queues. The number of pcps is determined by the number of processors and nodes in a system. A system with 4 processors and 2 nodes has 8 pcps which is okay. But a system with 1024 processors and 512 nodes has 512k pcps with a high potential for a large amount of memory being caught in them. 
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 97a36c3d96e2..0d2ef0b082a6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page)); #define free_page(addr) free_pages((addr),0) void page_alloc_init(void); -#ifdef CONFIG_NUMA -void drain_node_pages(int node); -#else -static inline void drain_node_pages(int node) { }; -#endif +void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); #endif /* __LINUX_GFP_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2f1544e83042..d09b1345a3a1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,6 +83,9 @@ struct per_cpu_pages { struct per_cpu_pageset { struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ +#ifdef CONFIG_NUMA + s8 expire; +#endif #ifdef CONFIG_SMP s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d53cbf8acb8e..f9b5d6d5f4d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {} #ifdef CONFIG_NUMA /* - * Called from the slab reaper to drain pagesets on a particular node that - * belongs to the currently executing processor. + * Called from the vmstat counter updater to drain pagesets of this + * currently executing processor on remote nodes after they have + * expired. + * * Note that this function must be called with the thread pinned to * a single processor. 
*/ -void drain_node_pages(int nodeid) +void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - int i; - enum zone_type z; unsigned long flags; + int to_drain; - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = NODE_DATA(nodeid)->node_zones + z; - struct per_cpu_pageset *pset; - - if (!populated_zone(zone)) - continue; - - pset = zone_pcp(zone, smp_processor_id()); - for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { - struct per_cpu_pages *pcp; - - pcp = &pset->pcp[i]; - if (pcp->count) { - int to_drain; - - local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; - else - to_drain = pcp->count; - free_pages_bulk(zone, to_drain, &pcp->list, 0); - pcp->count -= to_drain; - local_irq_restore(flags); - } - } - } + local_irq_save(flags); + if (pcp->count >= pcp->batch) + to_drain = pcp->batch; + else + to_drain = pcp->count; + free_pages_bulk(zone, to_drain, &pcp->list, 0); + pcp->count -= to_drain; + local_irq_restore(flags); } #endif diff --git a/mm/slab.c b/mm/slab.c index e50908b2bfac..944b20581f8c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -928,12 +928,6 @@ static void next_reap_node(void) { int node = __get_cpu_var(reap_node); - /* - * Also drain per cpu pages on remote zones - */ - if (node != numa_node_id()) - drain_node_pages(node); - node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); Reading git-diff-tree failed