X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=arch%2Fx86_64%2Fmm%2Fnuma.c;h=41b8fb069924d9a6b26582688f9421aa44e80d8d;hb=f630fe2817601314b2eb7ca5ddc23c7834646731;hp=b2fac14baac0fc151838b32ad54f6eaf741b8047;hpb=edc4ff7c08e9885c40e60c4fb39fa42cc91a0602;p=pandora-kernel.git diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index b2fac14baac0..41b8fb069924 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; +unsigned long __initdata nodemap_addr; +unsigned long __initdata nodemap_size; /* @@ -52,34 +54,88 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) int res = -1; unsigned long addr, end; - if (shift >= 64) - return -1; - memset(memnodemap, 0xff, sizeof(memnodemap)); + memset(memnodemap, 0xff, memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; if (addr >= end) continue; - if ((end >> shift) >= NODEMAPSIZE) + if ((end >> shift) >= memnodemapsize) return 0; do { if (memnodemap[addr >> shift] != 0xff) return -1; memnodemap[addr >> shift] = i; - addr += (1UL << shift); + addr += (1UL << shift); } while (addr < end); res = 1; } return res; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +static int __init allocate_cachealigned_memnodemap(void) +{ + unsigned long pad, pad_addr; + + memnodemap = memnode.embedded_map; + if (memnodemapsize <= 48) + return 0; + + pad = L1_CACHE_BYTES - 1; + pad_addr = 0x8000; + nodemap_size = pad + memnodemapsize; + nodemap_addr = find_e820_area(pad_addr, end_pfn<= end) + continue; + bitfield |= start; + nodes_used++; + if (end > memtop) + memtop = end; + } + if (nodes_used <= 1) + i = 63; + else + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + memnodemapsize = (memtop >> i)+1; + return i; +} - while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) - shift++; +int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +{ + int shift; + shift = extract_lsb_from_nodes(nodes, numnodes); + if (allocate_cachealigned_memnodemap()) + return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); @@ -161,7 +217,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en bootmap_start >> PAGE_SHIFT, start_pfn, end_pfn); - e820_bootmem_free(NODE_DATA(nodeid), start, end); + free_bootmem_with_active_regions(nodeid, end); reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<= (MAX_DMA32_PFN << PAGE_SHIFT))) + return 1; + return 0; +} + +static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - int i; + int i, big; struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz = ((end_pfn - start_pfn)< 1) { - unsigned long x = 1; - while ((x << 1) < sz) - x <<= 1; - if (x < sz/2) - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); - sz = x; - } + old_sz = sz; + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + */ + sz &= FAKE_NODE_MIN_HASH_MASK; + + /* + * We ensure that each node is at least 64MB big. Smaller than this + * size can cause VM hiccups. + */ + if (sz == 0) { + printk(KERN_INFO "Not enough memory for %d nodes. Reducing " + "the number of nodes\n", numa_fake); + numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Number of fake nodes will be = %d\n", + numa_fake); + sz = FAKE_NODE_MIN_SIZE; + } + /* + * Find out how many nodes can get an extra NODE_MIN_SIZE granule. + * This logic ensures the extra memory gets distributed among as many + * nodes as possible (as compared to one single node getting all that + * extra memory. + */ + big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " + "%d\n", + (sz >> 20), (hole_size >> 20), big); memset(&nodes,0,sizeof(nodes)); + end = start; for (i = 0; i < numa_fake; i++) { - nodes[i].start = (start_pfn<= max_addr) { + numa_fake = i - 1; + break; + } + start = nodes[i].start = end; + /* + * Final node can have all the remaining memory. + */ if (i == numa_fake-1) - sz = (end_pfn<= max_addr) + break; + } + /* + * Look at the next node to make sure there is some real memory + * to map. Bad things happen when the only memory present + * in a zone on a fake node is IO hole. + */ + while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { + if (zone_cross_over(start, end + sz)) { + end = (MAX_DMA32_PFN << PAGE_SHIFT); + break; + } + if (end >= max_addr) + break; + end += FAKE_NODE_MIN_SIZE; + } + if (end > max_addr) + end = max_addr; + nodes[i].end = end; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, @@ -259,8 +391,11 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); return -1; } - for_each_online_node(i) + for_each_online_node(i) { + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } numa_init_array(); return 0; } @@ -293,12 +428,14 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = 63; + memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); node_set_online(0); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); node_to_cpumask[0] = cpumask_of_cpu(0); + e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } @@ -323,34 +460,29 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -#ifdef CONFIG_SPARSEMEM -static void __init arch_sparse_init(void) -{ - int i; - - for_each_online_node(i) - memory_present(i, node_start_pfn(i), node_end_pfn(i)); - - sparse_init(); -} -#else -#define arch_sparse_init() do {} while (0) -#endif - void __init paging_init(void) { int i; + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_NORMAL] = end_pfn; - arch_sparse_init(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); for_each_online_node(i) { setup_node_zones(i); } + + free_area_init_nodes(max_zone_pfns); } -/* [numa=off] */ -__init int numa_setup(char *opt) +static __init int numa_setup(char *opt) { + if (!opt) + return -EINVAL; if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU @@ -366,9 +498,11 @@ __init int numa_setup(char *opt) if (!strncmp(opt,"hotadd=", 7)) hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif - return 1; + return 0; } +early_param("numa", numa_setup); + /* * Setup early cpu_to_node. *