Merge branch 'x86/urgent' into x86-mm
author Tejun Heo <tj@kernel.org>
Mon, 2 May 2011 12:16:37 +0000 (14:16 +0200)
committer Tejun Heo <tj@kernel.org>
Mon, 2 May 2011 12:16:47 +0000 (14:16 +0200)
Merge reason: Pick up the following two fix commits.

  2be19102b7: x86, NUMA: Fix empty memblk detection in numa_cleanup_meminfo()
  765af22da8: x86-32, NUMA: Fix ACPI NUMA init broken by recent x86-64 change

The scheduled NUMA init 32/64-bit unification changes depend on these.

Signed-off-by: Tejun Heo <tj@kernel.org>
16 files changed:
arch/x86/Kconfig
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/mmzone_64.h
arch/x86/include/asm/percpu.h
arch/x86/include/asm/topology.h
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/numaq_32.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/process.c
arch/x86/kernel/smpboot.c
arch/x86/mm/ioremap.c
arch/x86/mm/numa_32.c
arch/x86/mm/numa_64.c
arch/x86/mm/srat_32.c
drivers/acpi/processor_throttling.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a..8db4fbf 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1223,6 +1223,10 @@ config HAVE_ARCH_BOOTMEM
        def_bool y
        depends on X86_32 && NUMA
 
+config HAVE_ARCH_ALLOC_REMAP
+       def_bool y
+       depends on X86_32 && NUMA
+
 config ARCH_HAVE_MEMORY_PRESENT
        def_bool y
        depends on X86_32 && DISCONTIGMEM
@@ -1231,13 +1235,9 @@ config NEED_NODE_MEMMAP_SIZE
        def_bool y
        depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
 
-config HAVE_ARCH_ALLOC_REMAP
-       def_bool y
-       depends on X86_32 && NUMA
-
 config ARCH_FLATMEM_ENABLE
        def_bool y
-       depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
+       depends on X86_32 && !NUMA
 
 config ARCH_DISCONTIGMEM_ENABLE
        def_bool y
@@ -1247,20 +1247,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
        def_bool y
        depends on NUMA && X86_32
 
-config ARCH_PROC_KCORE_TEXT
-       def_bool y
-       depends on X86_64 && PROC_KCORE
-
-config ARCH_SPARSEMEM_DEFAULT
-       def_bool y
-       depends on X86_64
-
 config ARCH_SPARSEMEM_ENABLE
        def_bool y
        depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
        select SPARSEMEM_STATIC if X86_32
        select SPARSEMEM_VMEMMAP_ENABLE if X86_64
 
+config ARCH_SPARSEMEM_DEFAULT
+       def_bool y
+       depends on X86_64
+
 config ARCH_SELECT_MEMORY_MODEL
        def_bool y
        depends on ARCH_SPARSEMEM_ENABLE
@@ -1269,6 +1265,10 @@ config ARCH_MEMORY_PROBE
        def_bool X86_64
        depends on MEMORY_HOTPLUG
 
+config ARCH_PROC_KCORE_TEXT
+       def_bool y
+       depends on X86_64 && PROC_KCORE
+
 config ILLEGAL_POINTER_VALUE
        hex
        default 0 if X86_32
@@ -1703,10 +1703,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
        def_bool y
        depends on MEMORY_HOTPLUG
 
-config HAVE_ARCH_EARLY_PFN_TO_NID
-       def_bool X86_64
-       depends on NUMA
-
 config USE_PERCPU_NUMA_NODE_ID
        def_bool y
        depends on NUMA
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e08..50c0d30 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -207,8 +207,7 @@ extern const char * const x86_power_flags[32];
 #define test_cpu_cap(c, bit)                                           \
         test_bit(bit, (unsigned long *)((c)->x86_capability))
 
-#define cpu_has(c, bit)                                                        \
-       (__builtin_constant_p(bit) &&                                   \
+#define REQUIRED_MASK_BIT_SET(bit)                                     \
         ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) ||     \
           (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) ||     \
           (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) ||     \
@@ -218,10 +217,16 @@ extern const char * const x86_power_flags[32];
           (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) ||     \
           (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ||     \
           (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) ||     \
-          (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )      \
-         ? 1 :                                                         \
+          (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+
+#define cpu_has(c, bit)                                                        \
+       (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
         test_cpu_cap(c, bit))
 
+#define this_cpu_has(bit)                                              \
+       (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
+        x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+
 #define boot_cpu_has(bit)      cpu_has(&boot_cpu_data, bit)
 
 #define set_cpu_cap(c, bit)    set_bit(bit, (unsigned long *)((c)->x86_capability))
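
The cpufeature.h change factors the REQUIRED_MASK test out of cpu_has() into
REQUIRED_MASK_BIT_SET() so that the new this_cpu_has() can share it.  Below is
a minimal userspace sketch of the pattern, not the kernel macros: the one-word
mask and helper names are invented for illustration.  When the bit is a
compile-time constant that the build configuration already guarantees, the
whole expression folds to 1 and no runtime bit test is emitted
(__builtin_constant_p is a GCC/Clang extension).

    /* Toy model of the cpu_has()/this_cpu_has() constant folding. */
    #include <stdio.h>

    #define REQUIRED_MASK0  (1u << 0)   /* pretend feature bit 0 is build-required */

    #define REQUIRED_MASK_BIT_SET(bit) \
            (((bit) >> 5) == 0 && ((1u << ((bit) & 31)) & REQUIRED_MASK0))

    static int runtime_test_bit(unsigned int bit, const unsigned int *caps)
    {
            return (caps[bit >> 5] >> (bit & 31)) & 1;  /* the slow path */
    }

    #define cpu_has(caps, bit) \
            (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) \
             ? 1 : runtime_test_bit((bit), (caps)))

    int main(void)
    {
            unsigned int caps[10] = { 0 };

            printf("%d\n", cpu_has(caps, 0));   /* folds to constant 1 */
            printf("%d\n", cpu_has(caps, 1));   /* real bit test, prints 0 */
            return 0;
    }
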
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index 288b96f..b3f88d7 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -4,36 +4,13 @@
 #ifndef _ASM_X86_MMZONE_64_H
 #define _ASM_X86_MMZONE_64_H
 
-
 #ifdef CONFIG_NUMA
 
 #include <linux/mmdebug.h>
-
 #include <asm/smp.h>
 
-/* Simple perfect hash to map physical addresses to node numbers */
-struct memnode {
-       int shift;
-       unsigned int mapsize;
-       s16 *map;
-       s16 embedded_map[64 - 8];
-} ____cacheline_aligned; /* total size = 128 bytes */
-extern struct memnode memnode;
-#define memnode_shift memnode.shift
-#define memnodemap memnode.map
-#define memnodemapsize memnode.mapsize
-
 extern struct pglist_data *node_data[];
 
-static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
-{
-       unsigned nid;
-       VIRTUAL_BUG_ON(!memnodemap);
-       nid = memnodemap[addr >> memnode_shift];
-       VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
-       return nid;
-}
-
 #define NODE_DATA(nid)         (node_data[nid])
 
 #define node_start_pfn(nid)    (NODE_DATA(nid)->node_start_pfn)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d475b43..76042d9 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -542,6 +542,33 @@ do {                                                                       \
        old__;                                                          \
 })
 
+static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
+                        const unsigned long __percpu *addr)
+{
+       unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+
+       return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
+}
+
+static inline int x86_this_cpu_variable_test_bit(int nr,
+                        const unsigned long __percpu *addr)
+{
+       int oldbit;
+
+       asm volatile("bt "__percpu_arg(2)",%1\n\t"
+                       "sbb %0,%0"
+                       : "=r" (oldbit)
+                       : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+       return oldbit;
+}
+
+#define x86_this_cpu_test_bit(nr, addr)                        \
+       (__builtin_constant_p((nr))                     \
+        ? x86_this_cpu_constant_test_bit((nr), (addr)) \
+        : x86_this_cpu_variable_test_bit((nr), (addr)))
+
+
 #include <asm-generic/percpu.h>
 
 /* We can use this directly for local CPU (faster). */
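
The x86_this_cpu_test_bit() macro added above picks its implementation at
compile time: for a constant bit number it reads the per-cpu word with
percpu_read() and masks it, while for a variable bit number it uses "bt" to
copy the selected bit into the carry flag and "sbb %0,%0" to turn that flag
into 0 or ~0UL without a branch.  As a rough sketch (ignoring the per-cpu
segment addressing, which is what the real asm is for), both paths compute
the plain C below:

    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    /* zero/non-zero result equivalent to either path above */
    static inline int model_test_bit(unsigned int nr, const unsigned long *addr)
    {
            return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
    }
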
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 910a708..8dba769 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -95,7 +95,6 @@ extern void setup_node_to_cpumask_map(void);
 #ifdef CONFIG_X86_32
 extern unsigned long node_start_pfn[];
 extern unsigned long node_end_pfn[];
-extern unsigned long node_remap_size[];
 #define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
 
 # define SD_CACHE_NICE_TRIES   1
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fabf01e..2bc503b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void)
 {
        struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
-       if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
+       if (this_cpu_has(X86_FEATURE_ARAT)) {
                lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
                /* Make LAPIC timer preferrable over percpu HPET */
                lapic_clockevent.rating = 150;
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee..0aced70 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -93,10 +93,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
                                                node_end_pfn[node]);
 
        memory_present(node, node_start_pfn[node], node_end_pfn[node]);
-
-       node_remap_size[node] = node_memmap_size_bytes(node,
-                                       node_start_pfn[node],
-                                       node_end_pfn[node]);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6f8c5e9..6b0f4cd 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -355,7 +355,6 @@ static void notify_thresholds(__u64 msr_val)
 static void intel_thermal_interrupt(void)
 {
        __u64 msr_val;
-       struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
 
        rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 
@@ -367,19 +366,19 @@ static void intel_thermal_interrupt(void)
                                CORE_LEVEL) != 0)
                mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
 
-       if (cpu_has(c, X86_FEATURE_PLN))
+       if (this_cpu_has(X86_FEATURE_PLN))
                if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
                                        POWER_LIMIT_EVENT,
                                        CORE_LEVEL) != 0)
                        mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
 
-       if (cpu_has(c, X86_FEATURE_PTS)) {
+       if (this_cpu_has(X86_FEATURE_PTS)) {
                rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
                if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
                                        THERMAL_THROTTLING_EVENT,
                                        PACKAGE_LEVEL) != 0)
                        mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
-               if (cpu_has(c, X86_FEATURE_PLN))
+               if (this_cpu_has(X86_FEATURE_PLN))
                        if (therm_throt_process(msr_val &
                                        PACKAGE_THERM_STATUS_POWER_LIMIT,
                                        POWER_LIMIT_EVENT,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5a532ce..f1b2718 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -715,7 +715,7 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
        }
 }
 
-static int
+static int __init
 check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
 {
        int ret = 0;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d46cbe4..88a90a9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
        if (!need_resched()) {
-               if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+               if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
                        clflush((void *)&current_thread_info()->flags);
 
                __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -465,7 +465,7 @@ static void mwait_idle(void)
        if (!need_resched()) {
                trace_power_start(POWER_CSTATE, 1, smp_processor_id());
                trace_cpu_idle(1, smp_processor_id());
-               if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+               if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
                        clflush((void *)&current_thread_info()->flags);
 
                __monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c2871d3..a3c430b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1332,9 +1332,9 @@ static inline void mwait_play_dead(void)
        void *mwait_ptr;
        struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
 
-       if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
+       if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))
                return;
-       if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
+       if (!this_cpu_has(X86_FEATURE_CLFLSH))
                return;
        if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
                return;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0369843..be1ef57 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -90,13 +90,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
        if (is_ISA_range(phys_addr, last_addr))
                return (__force void __iomem *)phys_to_virt(phys_addr);
 
-       /*
-        * Check if the request spans more than any BAR in the iomem resource
-        * tree.
-        */
-       WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
-                 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
-
        /*
         * Don't allow anybody to remap normal RAM that we're using..
         */
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
        ret_addr = (void __iomem *) (vaddr + offset);
        mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
 
+       /*
+        * Check if the request spans more than any BAR in the iomem resource
+        * tree.
+        */
+       WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
+                 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+
        return ret_addr;
 err_free_area:
        free_vm_area(area);
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906..c757c0a 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -104,13 +104,9 @@ extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
 
-unsigned long node_remap_size[MAX_NUMNODES];
 static void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
-static unsigned long kva_start_pfn;
-static unsigned long kva_pages;
-
 int __cpuinit numa_cpu_node(int cpu)
 {
        return apic->x86_32_numa_cpu_node(cpu);
@@ -129,7 +125,6 @@ int __init get_memcfg_numa_flat(void)
        node_end_pfn[0] = max_pfn;
        memblock_x86_register_active_regions(0, 0, max_pfn);
        memory_present(0, 0, max_pfn);
-       node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
         /* Indicate there is one node available. */
        nodes_clear(node_online_map);
@@ -164,9 +159,8 @@ static void __init allocate_pgdat(int nid)
 {
        char buf[16];
 
-       if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
-               NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
-       else {
+       NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE));
+       if (!NODE_DATA(nid)) {
                unsigned long pgdat_phys;
                pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
                                 max_pfn_mapped<<PAGE_SHIFT,
@@ -182,25 +176,38 @@ static void __init allocate_pgdat(int nid)
 }
 
 /*
- * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
- * virtual address space (KVA) is reserved and portions of nodes are mapped
- * using it. This is to allow node-local memory to be allocated for
- * structures that would normally require ZONE_NORMAL. The memory is
- * allocated with alloc_remap() and callers should be prepared to allocate
- * from the bootmem allocator instead.
+ * Remap memory allocator
  */
 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 static void *node_remap_end_vaddr[MAX_NUMNODES];
 static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
 
+/**
+ * alloc_remap - Allocate remapped memory
+ * @nid: NUMA node to allocate memory from
+ * @size: The size of allocation
+ *
+ * Allocate @size bytes from the remap area of NUMA node @nid.  The
+ * size of the remap area is predetermined by init_alloc_remap() and
+ * only the callers considered there should call this function.  For
+ * more info, please read the comment on top of init_alloc_remap().
+ *
+ * The caller must be ready to handle allocation failure from this
+ * function and fall back to regular memory allocator in such cases.
+ *
+ * CONTEXT:
+ * Single CPU early boot context.
+ *
+ * RETURNS:
+ * Pointer to the allocated memory on success, %NULL on failure.
+ */
 void *alloc_remap(int nid, unsigned long size)
 {
        void *allocation = node_remap_alloc_vaddr[nid];
 
        size = ALIGN(size, L1_CACHE_BYTES);
 
-       if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+       if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
                return NULL;
 
        node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +216,6 @@ void *alloc_remap(int nid, unsigned long size)
        return allocation;
 }
 
-static void __init remap_numa_kva(void)
-{
-       void *vaddr;
-       unsigned long pfn;
-       int node;
-
-       for_each_online_node(node) {
-               printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
-               for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
-                       vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
-                       printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
-                               (unsigned long)vaddr,
-                               node_remap_start_pfn[node] + pfn);
-                       set_pmd_pfn((ulong) vaddr, 
-                               node_remap_start_pfn[node] + pfn, 
-                               PAGE_KERNEL_LARGE);
-               }
-       }
-}
-
 #ifdef CONFIG_HIBERNATION
 /**
  * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +227,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
        int node;
 
        for_each_online_node(node) {
-               unsigned long start_va, start_pfn, size, pfn;
+               unsigned long start_va, start_pfn, nr_pages, pfn;
 
                start_va = (unsigned long)node_remap_start_vaddr[node];
                start_pfn = node_remap_start_pfn[node];
-               size = node_remap_size[node];
+               nr_pages = (node_remap_end_vaddr[node] -
+                           node_remap_start_vaddr[node]) >> PAGE_SHIFT;
 
                printk(KERN_DEBUG "%s: node %d\n", __func__, node);
 
-               for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
+               for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
                        unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
                        pgd_t *pgd = pgd_base + pgd_index(vaddr);
                        pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +252,102 @@ void resume_map_numa_kva(pgd_t *pgd_base)
 }
 #endif
 
-static __init unsigned long calculate_numa_remap_pages(void)
+/**
+ * init_alloc_remap - Initialize remap allocator for a NUMA node
+ * @nid: NUMA node to initialize remap allocator for
+ *
+ * NUMA nodes may end up without any lowmem.  As allocating pgdat and
+ * memmap on a different node with lowmem is inefficient, a special
+ * remap allocator is implemented which can be used by alloc_remap().
+ *
+ * For each node, the amount of memory which will be necessary for
+ * pgdat and memmap is calculated and two memory areas of the size are
+ * allocated - one in the node and the other in lowmem; then, the area
+ * in the node is remapped to the lowmem area.
+ *
+ * As pgdat and memmap must be allocated in lowmem anyway, this
+ * doesn't waste lowmem address space; however, the actual lowmem
+ * which gets remapped over is wasted.  The amount shouldn't be
+ * problematic on machines this feature will be used.
+ *
+ * Initialization failure isn't fatal.  alloc_remap() is used
+ * opportunistically and the callers will fall back to other memory
+ * allocation mechanisms on failure.
+ */
+static __init void init_alloc_remap(int nid)
 {
-       int nid;
-       unsigned long size, reserve_pages = 0;
+       unsigned long size, pfn;
+       u64 node_pa, remap_pa;
+       void *remap_va;
 
-       for_each_online_node(nid) {
-               u64 node_kva_target;
-               u64 node_kva_final;
-
-               /*
-                * The acpi/srat node info can show hot-add memroy zones
-                * where memory could be added but not currently present.
-                */
-               printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
-                       nid, node_start_pfn[nid], node_end_pfn[nid]);
-               if (node_start_pfn[nid] > max_pfn)
-                       continue;
-               if (!node_end_pfn[nid])
-                       continue;
-               if (node_end_pfn[nid] > max_pfn)
-                       node_end_pfn[nid] = max_pfn;
-
-               /* ensure the remap includes space for the pgdat. */
-               size = node_remap_size[nid] + sizeof(pg_data_t);
-
-               /* convert size to large (pmd size) pages, rounding up */
-               size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
-               /* now the roundup is correct, convert to PAGE_SIZE pages */
-               size = size * PTRS_PER_PTE;
-
-               node_kva_target = round_down(node_end_pfn[nid] - size,
-                                                PTRS_PER_PTE);
-               node_kva_target <<= PAGE_SHIFT;
-               do {
-                       node_kva_final = memblock_find_in_range(node_kva_target,
-                                       ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
-                                               ((u64)size)<<PAGE_SHIFT,
-                                               LARGE_PAGE_BYTES);
-                       node_kva_target -= LARGE_PAGE_BYTES;
-               } while (node_kva_final == MEMBLOCK_ERROR &&
-                        (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
-
-               if (node_kva_final == MEMBLOCK_ERROR)
-                       panic("Can not get kva ram\n");
-
-               node_remap_size[nid] = size;
-               node_remap_offset[nid] = reserve_pages;
-               reserve_pages += size;
-               printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
-                                 " node %d at %llx\n",
-                               size, nid, node_kva_final>>PAGE_SHIFT);
-
-               /*
-                *  prevent kva address below max_low_pfn want it on system
-                *  with less memory later.
-                *  layout will be: KVA address , KVA RAM
-                *
-                *  we are supposed to only record the one less then max_low_pfn
-                *  but we could have some hole in high memory, and it will only
-                *  check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
-                *  to use it as free.
-                *  So memblock_x86_reserve_range here, hope we don't run out of that array
-                */
-               memblock_x86_reserve_range(node_kva_final,
-                             node_kva_final+(((u64)size)<<PAGE_SHIFT),
-                             "KVA RAM");
-
-               node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
-       }
-       printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
-                       reserve_pages);
-       return reserve_pages;
-}
+       /*
+        * The acpi/srat node info can show hot-add memory zones where
+        * memory could be added but not currently present.
+        */
+       printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+              nid, node_start_pfn[nid], node_end_pfn[nid]);
+       if (node_start_pfn[nid] > max_pfn)
+               return;
+       if (!node_end_pfn[nid])
+               return;
+       if (node_end_pfn[nid] > max_pfn)
+               node_end_pfn[nid] = max_pfn;
 
-static void init_remap_allocator(int nid)
-{
-       node_remap_start_vaddr[nid] = pfn_to_kaddr(
-                       kva_start_pfn + node_remap_offset[nid]);
-       node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
-               (node_remap_size[nid] * PAGE_SIZE);
-       node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
-               ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
-       printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
-               (ulong) node_remap_start_vaddr[nid],
-               (ulong) node_remap_end_vaddr[nid]);
+       /* calculate the necessary space aligned to large page size */
+       size = node_memmap_size_bytes(nid, node_start_pfn[nid],
+                                     min(node_end_pfn[nid], max_pfn));
+       size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+       size = ALIGN(size, LARGE_PAGE_BYTES);
+
+       /* allocate node memory and the lowmem remap area */
+       node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT,
+                                        (u64)node_end_pfn[nid] << PAGE_SHIFT,
+                                        size, LARGE_PAGE_BYTES);
+       if (node_pa == MEMBLOCK_ERROR) {
+               pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
+                          size, nid);
+               return;
+       }
+       memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
+
+       remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
+                                         max_low_pfn << PAGE_SHIFT,
+                                         size, LARGE_PAGE_BYTES);
+       if (remap_pa == MEMBLOCK_ERROR) {
+               pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
+                          size, nid);
+               memblock_x86_free_range(node_pa, node_pa + size);
+               return;
+       }
+       memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
+       remap_va = phys_to_virt(remap_pa);
+
+       /* perform actual remap */
+       for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
+               set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
+                           (node_pa >> PAGE_SHIFT) + pfn,
+                           PAGE_KERNEL_LARGE);
+
+       /* initialize remap allocator parameters */
+       node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
+       node_remap_start_vaddr[nid] = remap_va;
+       node_remap_end_vaddr[nid] = remap_va + size;
+       node_remap_alloc_vaddr[nid] = remap_va;
+
+       printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
+              nid, node_pa, node_pa + size, remap_va, remap_va + size);
 }
 
 void __init initmem_init(void)
 {
        int nid;
-       long kva_target_pfn;
-
-       /*
-        * When mapping a NUMA machine we allocate the node_mem_map arrays
-        * from node local memory.  They are then mapped directly into KVA
-        * between zone normal and vmalloc space.  Calculate the size of
-        * this space and use it to adjust the boundary between ZONE_NORMAL
-        * and ZONE_HIGHMEM.
-        */
 
        get_memcfg_numa();
        numa_init_array();
 
-       kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
-
-       kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
-       do {
-               kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
-                                       max_low_pfn<<PAGE_SHIFT,
-                                       kva_pages<<PAGE_SHIFT,
-                                       PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
-               kva_target_pfn -= PTRS_PER_PTE;
-       } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
-
-       if (kva_start_pfn == MEMBLOCK_ERROR)
-               panic("Can not get kva space\n");
-
-       printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
-               kva_start_pfn, max_low_pfn);
-       printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
+       for_each_online_node(nid)
+               init_alloc_remap(nid);
 
-       /* avoid clash with initrd */
-       memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
-                     (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
-                    "KVA PG");
 #ifdef CONFIG_HIGHMEM
        highstart_pfn = highend_pfn = max_pfn;
        if (max_pfn > max_low_pfn)
@@ -409,12 +367,8 @@ void __init initmem_init(void)
 
        printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
                        (ulong) pfn_to_kaddr(max_low_pfn));
-       for_each_online_node(nid) {
-               init_remap_allocator(nid);
-
+       for_each_online_node(nid)
                allocate_pgdat(nid);
-       }
-       remap_numa_kva();
 
        printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
                        (ulong) pfn_to_kaddr(highstart_pfn));
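
For scale, a hedged back-of-the-envelope of the per-node remap size that
init_alloc_remap() computes: the 32-byte struct page and the 2 MiB large page
below are assumptions for a typical 32-bit PAE configuration, not values taken
from this diff.

    #include <stdio.h>

    int main(void)
    {
            unsigned long node_pages   = 262144;           /* 1 GiB of 4 KiB pages */
            unsigned long memmap_bytes = node_pages * 32;  /* ~sizeof(struct page) */
            unsigned long pgdat_bytes  = 4096;             /* ALIGN(sizeof(pg_data_t), PAGE_SIZE), assumed one page */
            unsigned long large_page   = 2UL << 20;        /* PTRS_PER_PTE * PAGE_SIZE with PAE */
            unsigned long size;

            size = memmap_bytes + pgdat_bytes;
            size = (size + large_page - 1) & ~(large_page - 1); /* ALIGN to LARGE_PAGE_BYTES */

            printf("remap area per node: %lu MiB\n", size >> 20); /* prints 10 */
            return 0;
    }
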
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 85b52fc..a96767c 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -28,125 +28,10 @@ EXPORT_SYMBOL(node_data);
 
 nodemask_t numa_nodes_parsed __initdata;
 
-struct memnode memnode;
-
-static unsigned long __initdata nodemap_addr;
-static unsigned long __initdata nodemap_size;
-
 static struct numa_meminfo numa_meminfo __initdata;
-
 static int numa_distance_cnt;
 static u8 *numa_distance;
 
-/*
- * Given a shift value, try to populate memnodemap[]
- * Returns :
- * 1 if OK
- * 0 if memnodmap[] too small (of shift too small)
- * -1 if node overlap or lost ram (shift too big)
- */
-static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
-{
-       unsigned long addr, end;
-       int i, res = -1;
-
-       memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
-       for (i = 0; i < mi->nr_blks; i++) {
-               addr = mi->blk[i].start;
-               end = mi->blk[i].end;
-               if (addr >= end)
-                       continue;
-               if ((end >> shift) >= memnodemapsize)
-                       return 0;
-               do {
-                       if (memnodemap[addr >> shift] != NUMA_NO_NODE)
-                               return -1;
-                       memnodemap[addr >> shift] = mi->blk[i].nid;
-                       addr += (1UL << shift);
-               } while (addr < end);
-               res = 1;
-       }
-       return res;
-}
-
-static int __init allocate_cachealigned_memnodemap(void)
-{
-       unsigned long addr;
-
-       memnodemap = memnode.embedded_map;
-       if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
-               return 0;
-
-       addr = 0x8000;
-       nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
-       nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
-                                     nodemap_size, L1_CACHE_BYTES);
-       if (nodemap_addr == MEMBLOCK_ERROR) {
-               printk(KERN_ERR
-                      "NUMA: Unable to allocate Memory to Node hash map\n");
-               nodemap_addr = nodemap_size = 0;
-               return -1;
-       }
-       memnodemap = phys_to_virt(nodemap_addr);
-       memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
-
-       printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
-              nodemap_addr, nodemap_addr + nodemap_size);
-       return 0;
-}
-
-/*
- * The LSB of all start and end addresses in the node map is the value of the
- * maximum possible shift.
- */
-static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
-{
-       int i, nodes_used = 0;
-       unsigned long start, end;
-       unsigned long bitfield = 0, memtop = 0;
-
-       for (i = 0; i < mi->nr_blks; i++) {
-               start = mi->blk[i].start;
-               end = mi->blk[i].end;
-               if (start >= end)
-                       continue;
-               bitfield |= start;
-               nodes_used++;
-               if (end > memtop)
-                       memtop = end;
-       }
-       if (nodes_used <= 1)
-               i = 63;
-       else
-               i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
-       memnodemapsize = (memtop >> i)+1;
-       return i;
-}
-
-static int __init compute_hash_shift(const struct numa_meminfo *mi)
-{
-       int shift;
-
-       shift = extract_lsb_from_nodes(mi);
-       if (allocate_cachealigned_memnodemap())
-               return -1;
-       printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
-               shift);
-
-       if (populate_memnodemap(mi, shift) != 1) {
-               printk(KERN_INFO "Your memory is not aligned you need to "
-                      "rebuild your kernel with a bigger NODEMAPSIZE "
-                      "shift=%d\n", shift);
-               return -1;
-       }
-       return shift;
-}
-
-int __meminit  __early_pfn_to_nid(unsigned long pfn)
-{
-       return phys_to_nid(pfn << PAGE_SHIFT);
-}
-
 static void * __init early_node_mem(int nodeid, unsigned long start,
                                    unsigned long end, unsigned long size,
                                    unsigned long align)
@@ -270,7 +155,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
        memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
        printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
                nodedata_phys + pgdat_size - 1);
-       nid = phys_to_nid(nodedata_phys);
+       nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT);
        if (nid != nodeid)
                printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);
 
@@ -527,12 +412,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
        if (WARN_ON(nodes_empty(node_possible_map)))
                return -EINVAL;
 
-       memnode_shift = compute_hash_shift(mi);
-       if (memnode_shift < 0) {
-               printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
-               return -EINVAL;
-       }
-
        for (i = 0; i < mi->nr_blks; i++)
                memblock_x86_register_active_regions(mi->blk[i].nid,
                                        mi->blk[i].start >> PAGE_SHIFT,
@@ -626,17 +505,13 @@ static int __init numa_init(int (*init_func)(void))
 
 void __init initmem_init(void)
 {
-       int ret;
-
        if (!numa_off) {
 #ifdef CONFIG_ACPI_NUMA
-               ret = numa_init(x86_acpi_numa_init);
-               if (!ret)
+               if (!numa_init(x86_acpi_numa_init))
                        return;
 #endif
 #ifdef CONFIG_AMD_NUMA
-               ret = numa_init(amd_numa_init);
-               if (!ret)
+               if (!numa_init(amd_numa_init))
                        return;
 #endif
        }
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 364f36b..ae20046 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -278,7 +278,6 @@ int __init get_memcfg_from_srat(void)
                unsigned long end = min(node_end_pfn[nid], max_pfn);
 
                memory_present(nid, start, end);
-               node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
        }
        return 1;
 out_fail:
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index ad35017..605a295 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -710,20 +710,14 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr)
 }
 
 #ifdef CONFIG_X86
-static int acpi_throttling_rdmsr(struct acpi_processor *pr,
-                                       u64 *value)
+static int acpi_throttling_rdmsr(u64 *value)
 {
-       struct cpuinfo_x86 *c;
        u64 msr_high, msr_low;
-       unsigned int cpu;
        u64 msr = 0;
        int ret = -1;
 
-       cpu = pr->id;
-       c = &cpu_data(cpu);
-
-       if ((c->x86_vendor != X86_VENDOR_INTEL) ||
-               !cpu_has(c, X86_FEATURE_ACPI)) {
+       if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
+               !this_cpu_has(X86_FEATURE_ACPI)) {
                printk(KERN_ERR PREFIX
                        "HARDWARE addr space,NOT supported yet\n");
        } else {
@@ -738,18 +732,13 @@ static int acpi_throttling_rdmsr(struct acpi_processor *pr,
        return ret;
 }
 
-static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
+static int acpi_throttling_wrmsr(u64 value)
 {
-       struct cpuinfo_x86 *c;
-       unsigned int cpu;
        int ret = -1;
        u64 msr;
 
-       cpu = pr->id;
-       c = &cpu_data(cpu);
-
-       if ((c->x86_vendor != X86_VENDOR_INTEL) ||
-               !cpu_has(c, X86_FEATURE_ACPI)) {
+       if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
+               !this_cpu_has(X86_FEATURE_ACPI)) {
                printk(KERN_ERR PREFIX
                        "HARDWARE addr space,NOT supported yet\n");
        } else {
@@ -761,15 +750,14 @@ static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
        return ret;
 }
 #else
-static int acpi_throttling_rdmsr(struct acpi_processor *pr,
-                               u64 *value)
+static int acpi_throttling_rdmsr(u64 *value)
 {
        printk(KERN_ERR PREFIX
                "HARDWARE addr space,NOT supported yet\n");
        return -1;
 }
 
-static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
+static int acpi_throttling_wrmsr(u64 value)
 {
        printk(KERN_ERR PREFIX
                "HARDWARE addr space,NOT supported yet\n");
@@ -801,7 +789,7 @@ static int acpi_read_throttling_status(struct acpi_processor *pr,
                ret = 0;
                break;
        case ACPI_ADR_SPACE_FIXED_HARDWARE:
-               ret = acpi_throttling_rdmsr(pr, value);
+               ret = acpi_throttling_rdmsr(value);
                break;
        default:
                printk(KERN_ERR PREFIX "Unknown addr space %d\n",
@@ -834,7 +822,7 @@ static int acpi_write_throttling_state(struct acpi_processor *pr,
                ret = 0;
                break;
        case ACPI_ADR_SPACE_FIXED_HARDWARE:
-               ret = acpi_throttling_wrmsr(pr, value);
+               ret = acpi_throttling_wrmsr(value);
                break;
        default:
                printk(KERN_ERR PREFIX "Unknown addr space %d\n",