x86_64: fix setup_node_bootmem to support big mem excluding with memmap
authorYinghai Lu <yhlu.kernel@gmail.com>
Tue, 18 Mar 2008 19:52:37 +0000 (12:52 -0700)
committerIngo Molnar <mingo@elte.hu>
Sat, 26 Apr 2008 20:51:08 +0000 (22:51 +0200)
typical case: four sockets system, every node has 4g ram, and we are using:

memmap=10g$4g

to mask out memory on node1 and node2

when numa is enabled, early_node_mem is used to get node_data and node_bootmap.

if it can not get memory from the same node with find_e820_area(), it will
use alloc_bootmem to get buff from previous nodes.

so check it and print out some info about it.

need to move early_res_to_bootmem into every setup_node_bootmem.
and it takes range that node has. otherwise alloc_bootmem could return addr
that reserved early.

depends on "mm: make reserve_bootmem can crossed the nodes".

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/kernel/e820_64.c
arch/x86/kernel/setup_64.c
arch/x86/mm/numa_64.c
include/asm-x86/e820_64.h

index 79f0d52..645ee5e 100644 (file)
@@ -106,14 +106,19 @@ void __init free_early(unsigned long start, unsigned long end)
        early_res[j - 1].end = 0;
 }
 
-void __init early_res_to_bootmem(void)
+void __init early_res_to_bootmem(unsigned long start, unsigned long end)
 {
        int i;
+       unsigned long final_start, final_end;
        for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
                struct early_res *r = &early_res[i];
-               printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
-                       r->start, r->end - 1, r->name);
-               reserve_bootmem_generic(r->start, r->end - r->start);
+               final_start = max(start, r->start);
+               final_end = min(end, r->end);
+               if (final_start >= final_end)
+                       continue;
+               printk(KERN_INFO "  early res: %d [%lx-%lx] %s\n", i,
+                       final_start, final_end - 1, r->name);
+               reserve_bootmem_generic(final_start, final_end - final_start);
        }
 }
 
index b04e2c0..60e64c8 100644 (file)
@@ -190,6 +190,7 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
        bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
        e820_register_active_regions(0, start_pfn, end_pfn);
        free_bootmem_with_active_regions(0, end_pfn);
+       early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
        reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
 }
 #endif
@@ -421,8 +422,6 @@ void __init setup_arch(char **cmdline_p)
        contig_initmem_init(0, end_pfn);
 #endif
 
-       early_res_to_bootmem();
-
        dma32_reserve_bootmem();
 
 #ifdef CONFIG_ACPI_SLEEP
index 9a68922..c5066d5 100644 (file)
@@ -196,6 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
        unsigned long bootmap_start, nodedata_phys;
        void *bootmap;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+       int nid;
 
        start = round_up(start, ZONE_ALIGN);
 
@@ -218,9 +219,19 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
 
-       /* Find a place for the bootmem map */
+       /*
+        * Find a place for the bootmem map
+        * nodedata_phys could be on other nodes by alloc_bootmem,
+        * so need to sure bootmap_start not to be small, otherwise
+        * early_node_mem will get that with find_e820_area instead
+        * of alloc_bootmem, that could clash with reserved range
+        */
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
-       bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+       nid = phys_to_nid(nodedata_phys);
+       if (nid == nodeid)
+               bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+       else
+               bootmap_start = round_up(start, PAGE_SIZE);
        /*
         * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like
         * to use that to align to PAGE_SIZE
@@ -245,10 +256,29 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 
        free_bootmem_with_active_regions(nodeid, end);
 
-       reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size,
-                       BOOTMEM_DEFAULT);
-       reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
-                       bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+       /*
+        * convert early reserve to bootmem reserve earlier
+        * otherwise early_node_mem could use early reserved mem
+        * on previous node
+        */
+       early_res_to_bootmem(start, end);
+
+       /*
+        * in some case early_node_mem could use alloc_bootmem
+        * to get range on other node, don't reserve that again
+        */
+       if (nid != nodeid)
+               printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);
+       else
+               reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
+                                       pgdat_size, BOOTMEM_DEFAULT);
+       nid = phys_to_nid(bootmap_start);
+       if (nid != nodeid)
+               printk(KERN_INFO "    bootmap(%d) on node %d\n", nodeid, nid);
+       else
+               reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
+                                bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
 #ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
 #endif
index b5e02e3..71c4d68 100644 (file)
@@ -49,7 +49,7 @@ extern void update_e820(void);
 
 extern void reserve_early(unsigned long start, unsigned long end, char *name);
 extern void free_early(unsigned long start, unsigned long end);
-extern void early_res_to_bootmem(void);
+extern void early_res_to_bootmem(unsigned long start, unsigned long end);
 
 #endif/*!__ASSEMBLY__*/