Merge branch 'percpu-for-linus' into percpu-for-next
author Tejun Heo <tj@kernel.org>
Fri, 14 Aug 2009 05:41:02 +0000 (14:41 +0900)
committer Tejun Heo <tj@kernel.org>
Fri, 14 Aug 2009 05:45:31 +0000 (14:45 +0900)
Conflicts:
arch/sparc/kernel/smp_64.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/setup_percpu.c
drivers/cpufreq/cpufreq_ondemand.c
mm/percpu.c

Conflicts in the core and arch percpu code mostly come from commit
ed78e1e078dd44249f88b1dd8c76dafb39567161, which replaced many uses of
num_possible_cpus() with nr_cpu_ids.  As the for-next branch has moved
all the first chunk allocators into mm/percpu.c, those changes are
carried over from the arch code into mm/percpu.c.
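
The substitution driving these conflicts sizes per-CPU pointer arrays
by nr_cpu_ids (highest possible CPU id plus one) rather than by
num_possible_cpus(), since such arrays are indexed by CPU id and
cpu_possible_mask may be sparse.  A minimal illustrative sketch
(simplified, not an exact hunk from this merge):

	/* before: counts possible CPUs; too small if cpu_possible_mask is sparse */
	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
	ptrs = alloc_bootmem(ptrs_size);

	/* after: nr_cpu_ids covers the whole CPU-id index range */
	ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
	ptrs = alloc_bootmem(ptrs_size);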

Signed-off-by: Tejun Heo <tj@kernel.org>
22 files changed:
Makefile
arch/mn10300/kernel/vmlinux.lds.S
arch/sparc/kernel/smp_64.c
arch/x86/Kconfig
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/mm/pageattr.c
block/cfq-iosched.c
drivers/cpufreq/cpufreq_conservative.c
drivers/cpufreq/cpufreq_ondemand.c
drivers/xen/events.c
include/asm-generic/vmlinux.lds.h
init/main.c
kernel/module.c
kernel/perf_counter.c
kernel/sched.c
kernel/trace/trace_events.c
mm/page-writeback.c
mm/percpu.c
mm/slub.c

diff --cc Makefile
Simple merge
Simple merge
@@@ -1478,26 -1491,25 +1478,26 @@@ void __init setup_per_cpu_areas(void
        size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
        static struct vm_struct vm;
        unsigned long delta, cpu;
 -      size_t pcpu_unit_size;
 +      size_t size_sum, pcpu_unit_size;
        size_t ptrs_size;
 +      void **ptrs;
  
 -      pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 -                             PERCPU_DYNAMIC_RESERVE);
 -      dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
 +      size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 +                           PERCPU_DYNAMIC_RESERVE);
 +      dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
  
  
-       ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
 -      ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0]));
 -      pcpur_ptrs = alloc_bootmem(ptrs_size);
++      ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
 +      ptrs = alloc_bootmem(ptrs_size);
  
        for_each_possible_cpu(cpu) {
 -              pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
 -                                                   PCPU_CHUNK_SIZE);
 +              ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
 +                                             PCPU_CHUNK_SIZE);
  
 -              free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
 -                           PCPU_CHUNK_SIZE - pcpur_size);
 +              free_bootmem(__pa(ptrs[cpu] + size_sum),
 +                           PCPU_CHUNK_SIZE - size_sum);
  
 -              memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
 +              memcpy(ptrs[cpu], __per_cpu_load, static_size);
        }
  
        /* allocate address and map */
Simple merge
Simple merge
@@@ -1559,8 -1798,9 +1798,9 @@@ void callchain_store(struct perf_callch
                entry->ip[entry->nr++] = ip;
  }
  
 -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
 -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
 +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
 +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+ static DEFINE_PER_CPU(int, in_nmi_frame);
  
  
  static void
@@@ -176,35 -185,130 +176,35 @@@ static ssize_t __init setup_pcpu_lpage(
                return -EINVAL;
        }
  
 -      /*
 -       * Currently supports only single page.  Supporting multiple
 -       * pages won't be too difficult if it ever becomes necessary.
 -       */
 -      pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 -                             PERCPU_DYNAMIC_RESERVE);
 -      if (pcpul_size > PMD_SIZE) {
 -              pr_warning("PERCPU: static data is larger than large page, "
 -                         "can't use large page\n");
 -              return -EINVAL;
 -      }
 -      dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 -
 -      /* allocate pointer array and alloc large pages */
 -      map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
 -      pcpul_map = alloc_bootmem(map_size);
 -
 -      for_each_possible_cpu(cpu) {
 -              pcpul_map[cpu].cpu = cpu;
 -              pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
 -                                                      PMD_SIZE);
 -              if (!pcpul_map[cpu].ptr) {
 -                      pr_warning("PERCPU: failed to allocate large page "
 -                                 "for cpu%u\n", cpu);
 -                      goto enomem;
 -              }
 -
 -              /*
 -               * Only use pcpul_size bytes and give back the rest.
 -               *
 -               * Ingo: The 2MB up-rounding bootmem is needed to make
 -               * sure the partial 2MB page is still fully RAM - it's
 -               * not well-specified to have a PAT-incompatible area
 -               * (unmapped RAM, device memory, etc.) in that hole.
 -               */
 -              free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
 -                           PMD_SIZE - pcpul_size);
 -
 -              memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
 +      /* allocate and build unit_map */
-       unit_map_size = num_possible_cpus() * sizeof(int);
++      unit_map_size = nr_cpu_ids * sizeof(int);
 +      unit_map = alloc_bootmem_nopanic(unit_map_size);
 +      if (!unit_map) {
 +              pr_warning("PERCPU: failed to allocate unit_map\n");
 +              return -ENOMEM;
        }
  
 -      /* allocate address and map */
 -      pcpul_vm.flags = VM_ALLOC;
 -      pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
 -      vm_area_register_early(&pcpul_vm, PMD_SIZE);
 -
 -      for_each_possible_cpu(cpu) {
 -              pmd_t *pmd, pmd_v;
 -
 -              pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
 -                                       cpu * PMD_SIZE);
 -              pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
 -                              PAGE_KERNEL_LARGE);
 -              set_pmd(pmd, pmd_v);
 +      ret = pcpu_lpage_build_unit_map(static_size,
 +                                      PERCPU_FIRST_CHUNK_RESERVE,
 +                                      &dyn_size, &unit_size, PMD_SIZE,
 +                                      unit_map, pcpu_lpage_cpu_distance);
 +      if (ret < 0) {
 +              pr_warning("PERCPU: failed to build unit_map\n");
 +              goto out_free;
        }
 +      nr_units = ret;
  
 -      /* we're ready, commit */
 -      pr_info("PERCPU: Remapped at %p with large pages, static data "
 -              "%zu bytes\n", pcpul_vm.addr, static_size);
 -
 -      ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
 -                                   PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
 -                                   PMD_SIZE, pcpul_vm.addr, NULL);
 -
 -      /* sort pcpul_map array for pcpu_lpage_remapped() */
 -      for (i = 0; i < nr_cpu_ids - 1; i++)
 -              for (j = i + 1; j < nr_cpu_ids; j++)
 -                      if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
 -                              struct pcpul_ent tmp = pcpul_map[i];
 -                              pcpul_map[i] = pcpul_map[j];
 -                              pcpul_map[j] = tmp;
 -                      }
 -
 -      return ret;
 -
 -enomem:
 -      for_each_possible_cpu(cpu)
 -              if (pcpul_map[cpu].ptr)
 -                      free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
 -      free_bootmem(__pa(pcpul_map), map_size);
 -      return -ENOMEM;
 -}
 +      /* do the parameters look okay? */
 +      if (!chosen) {
 +              size_t vm_size = VMALLOC_END - VMALLOC_START;
 +              size_t tot_size = nr_units * unit_size;
  
 -/**
 - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 - * @kaddr: the kernel address in question
 - *
 - * Determine whether @kaddr falls in the pcpul recycled area.  This is
 - * used by pageattr to detect VM aliases and break up the pcpu PMD
 - * mapping such that the same physical page is not mapped under
 - * different attributes.
 - *
 - * The recycled area is always at the tail of a partially used PMD
 - * page.
 - *
 - * RETURNS:
 - * Address of corresponding remapped pcpu address if match is found;
 - * otherwise, NULL.
 - */
 -void *pcpu_lpage_remapped(void *kaddr)
 -{
 -      void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
 -      unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
 -      int left = 0, right = nr_cpu_ids - 1;
 -      int pos;
 -
 -      /* pcpul in use at all? */
 -      if (!pcpul_map)
 -              return NULL;
 -
 -      /* okay, perform binary search */
 -      while (left <= right) {
 -              pos = (left + right) / 2;
 -
 -              if (pcpul_map[pos].ptr < pmd_addr)
 -                      left = pos + 1;
 -              else if (pcpul_map[pos].ptr > pmd_addr)
 -                      right = pos - 1;
 -              else {
 -                      /* it shouldn't be in the area for the first chunk */
 -                      WARN_ON(offset < pcpul_size);
 -
 -                      return pcpul_vm.addr +
 -                              pcpul_map[pos].cpu * PMD_SIZE + offset;
 +              /* don't consume more than 20% of vmalloc area */
 +              if (tot_size > vm_size / 5) {
 +                      pr_info("PERCPU: too large chunk size %zuMB for "
 +                              "large page remap\n", tot_size >> 20);
 +                      ret = -EINVAL;
 +                      goto out_free;
                }
        }
  
Simple merge
Simple merge
Simple merge
@@@ -64,8 -64,14 +64,14 @@@ struct cpu_dbs_info_s 
        unsigned int requested_freq;
        int cpu;
        unsigned int enable:1;
+       /*
+        * percpu mutex that serializes governor limit change with
+        * do_dbs_timer invocation. We do not want do_dbs_timer to run
+        * when user is changing the governor or limits.
+        */
+       struct mutex timer_mutex;
  };
 -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);
  
  static unsigned int dbs_enable;       /* number of CPUs using this policy */
  
@@@ -70,10 -70,15 +70,15 @@@ struct cpu_dbs_info_s 
        unsigned int freq_lo_jiffies;
        unsigned int freq_hi_jiffies;
        int cpu;
-       unsigned int enable:1,
-               sample_type:1;
+       unsigned int sample_type:1;
+       /*
+        * percpu mutex that serializes governor limit change with
+        * do_dbs_timer invocation. We do not want do_dbs_timer to run
+        * when user is changing the governor or limits.
+        */
+       struct mutex timer_mutex;
  };
 -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
  
  static unsigned int dbs_enable;       /* number of CPUs using this policy */
  
@@@ -193,6 -190,13 +191,13 @@@ static unsigned int powersave_bias_targ
        return freq_hi;
  }
  
 -      struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
+ static void ondemand_powersave_bias_init_cpu(int cpu)
+ {
++      struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
+       dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
+       dbs_info->freq_lo = 0;
+ }
  static void ondemand_powersave_bias_init(void)
  {
        int i;
@@@ -569,9 -550,10 +551,10 @@@ static int cpufreq_governor_dbs(struct 
                        return rc;
                }
  
+               dbs_enable++;
                for_each_cpu(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
 -                      j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +                      j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;
  
                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
Simple merge
   *    EXCEPTION_TABLE(...)
   *    NOTES
   *
-  *    __bss_start = .;
-  *    BSS_SECTION(0, 0)
-  *    __bss_stop = .;
+  *    BSS_SECTION(0, 0, 0)
   *    _end = .;
   *
 - *    /DISCARD/ : {
 - *            EXIT_TEXT
 - *            EXIT_DATA
 - *            EXIT_CALL
 - *    }
   *    STABS_DEBUG
   *    DWARF_DEBUG
 + *
 + *    DISCARDS                // must be the last
   * }
   *
   * [__init_begin, __init_end] is the init section that may be freed after init
diff --cc init/main.c
Simple merge
diff --cc kernel/module.c
Simple merge
Simple merge
diff --cc kernel/sched.c
Simple merge
Simple merge
Simple merge
diff --cc mm/percpu.c
@@@ -1003,8 -747,9 +1003,8 @@@ static struct pcpu_chunk *alloc_pcpu_ch
        chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
        chunk->map[chunk->map_used++] = pcpu_unit_size;
 -      chunk->page = chunk->page_ar;
  
-       chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+       chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
        if (!chunk->vm) {
                free_pcpu_chunk(chunk);
                return NULL;
@@@ -1290,59 -1052,24 +1290,59 @@@ size_t __init pcpu_setup_first_chunk(si
        BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
                     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
        BUG_ON(!static_size);
 -      if (unit_size >= 0) {
 -              BUG_ON(unit_size < size_sum);
 -              BUG_ON(unit_size & ~PAGE_MASK);
 -              BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 -      } else
 -              BUG_ON(base_addr);
 -      BUG_ON(base_addr && populate_pte_fn);
 -
 -      if (unit_size >= 0)
 -              pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 -      else
 -              pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 -                                      PFN_UP(size_sum));
 +      BUG_ON(!base_addr);
 +      BUG_ON(unit_size < size_sum);
 +      BUG_ON(unit_size & ~PAGE_MASK);
 +      BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 +
 +      /* determine number of units and verify and initialize pcpu_unit_map */
 +      if (unit_map) {
 +              int first_unit = INT_MAX, last_unit = INT_MIN;
 +
 +              for_each_possible_cpu(cpu) {
 +                      int unit = unit_map[cpu];
 +
 +                      BUG_ON(unit < 0);
 +                      for_each_possible_cpu(tcpu) {
 +                              if (tcpu == cpu)
 +                                      break;
 +                              /* the mapping should be one-to-one */
 +                              BUG_ON(unit_map[tcpu] == unit);
 +                      }
 +
 +                      if (unit < first_unit) {
 +                              pcpu_first_unit_cpu = cpu;
 +                              first_unit = unit;
 +                      }
 +                      if (unit > last_unit) {
 +                              pcpu_last_unit_cpu = cpu;
 +                              last_unit = unit;
 +                      }
 +              }
 +              pcpu_nr_units = last_unit + 1;
 +              pcpu_unit_map = unit_map;
 +      } else {
 +              int *identity_map;
 +
 +              /* #units == #cpus, identity mapped */
-               identity_map = alloc_bootmem(num_possible_cpus() *
++              identity_map = alloc_bootmem(nr_cpu_ids *
 +                                           sizeof(identity_map[0]));
  
-               pcpu_nr_units = num_possible_cpus();
 +              for_each_possible_cpu(cpu)
 +                      identity_map[cpu] = cpu;
 +
 +              pcpu_first_unit_cpu = 0;
 +              pcpu_last_unit_cpu = pcpu_nr_units - 1;
++              pcpu_nr_units = nr_cpu_ids;
 +              pcpu_unit_map = identity_map;
 +      }
 +
 +      /* determine basic parameters */
 +      pcpu_unit_pages = unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 -      pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
 -      pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 -              + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
 +      pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
 +      pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 +              BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
  
        if (dyn_size < 0)
                dyn_size = pcpu_unit_size - static_size - reserved_size;
@@@ -1461,555 -1237,44 +1461,558 @@@ ssize_t __init pcpu_embed_first_chunk(s
        unsigned int cpu;
  
        /* determine parameters and allocate */
 -      pcpue_size = PFN_ALIGN(static_size + reserved_size +
 -                             (dyn_size >= 0 ? dyn_size : 0));
 -      if (dyn_size != 0)
 -              dyn_size = pcpue_size - static_size - reserved_size;
 -
 -      if (unit_size >= 0) {
 -              BUG_ON(unit_size < pcpue_size);
 -              pcpue_unit_size = unit_size;
 -      } else
 -              pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 -
 -      chunk_size = pcpue_unit_size * nr_cpu_ids;
 -
 -      pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 -                                          __pa(MAX_DMA_ADDRESS));
 -      if (!pcpue_ptr) {
 +      size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 +
 +      unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-       chunk_size = unit_size * num_possible_cpus();
++      chunk_size = unit_size * nr_cpu_ids;
 +
 +      base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 +                                     __pa(MAX_DMA_ADDRESS));
 +      if (!base) {
                pr_warning("PERCPU: failed to allocate %zu bytes for "
                           "embedding\n", chunk_size);
                return -ENOMEM;
        }
  
        /* return the leftover and copy */
-       for_each_possible_cpu(cpu) {
+       for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
 -              void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
 +              void *ptr = base + cpu * unit_size;
  
-               free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
-               memcpy(ptr, __per_cpu_load, static_size);
+               if (cpu_possible(cpu)) {
 -                      free_bootmem(__pa(ptr + pcpue_size),
 -                                   pcpue_unit_size - pcpue_size);
++                      free_bootmem(__pa(ptr + size_sum),
++                                   unit_size - size_sum);
+                       memcpy(ptr, __per_cpu_load, static_size);
+               } else
 -                      free_bootmem(__pa(ptr), pcpue_unit_size);
++                      free_bootmem(__pa(ptr), unit_size);
        }
  
        /* we're ready, commit */
        pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 -              pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 +              size_sum >> PAGE_SHIFT, base, static_size);
 +
 +      return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 +                                    unit_size, base, NULL);
 +}
 +
 +/**
 + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 + * @free_fn: function to free percpu page, always called with PAGE_SIZE
 + * @populate_pte_fn: function to populate pte
 + *
 + * This is a helper to ease setting up embedded first percpu chunk and
 + * can be called where pcpu_setup_first_chunk() is expected.
 + *
 + * This is the basic allocator.  Static percpu area is allocated
 + * page-by-page into vmalloc area.
 + *
 + * RETURNS:
 + * The determined pcpu_unit_size which can be used to initialize
 + * percpu access on success, -errno on failure.
 + */
 +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 +                                 pcpu_fc_alloc_fn_t alloc_fn,
 +                                 pcpu_fc_free_fn_t free_fn,
 +                                 pcpu_fc_populate_pte_fn_t populate_pte_fn)
 +{
 +      static struct vm_struct vm;
 +      int unit_pages;
 +      size_t pages_size;
 +      struct page **pages;
 +      unsigned int cpu;
 +      int i, j;
 +      ssize_t ret;
 +
 +      unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
 +                                PCPU_MIN_UNIT_SIZE));
 +
 +      /* unaligned allocations can't be freed, round up to page size */
-       pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
-                              sizeof(pages[0]));
++      pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
 +      pages = alloc_bootmem(pages_size);
 +
 +      /* allocate pages */
 +      j = 0;
 +      for_each_possible_cpu(cpu)
 +              for (i = 0; i < unit_pages; i++) {
 +                      void *ptr;
 +
 +                      ptr = alloc_fn(cpu, PAGE_SIZE);
 +                      if (!ptr) {
 +                              pr_warning("PERCPU: failed to allocate "
 +                                         "4k page for cpu%u\n", cpu);
 +                              goto enomem;
 +                      }
 +                      pages[j++] = virt_to_page(ptr);
 +              }
 +
 +      /* allocate vm area, map the pages and copy static data */
 +      vm.flags = VM_ALLOC;
-       vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
++      vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
 +      vm_area_register_early(&vm, PAGE_SIZE);
 +
 +      for_each_possible_cpu(cpu) {
 +              unsigned long unit_addr = (unsigned long)vm.addr +
 +                      (cpu * unit_pages << PAGE_SHIFT);
 +
 +              for (i = 0; i < unit_pages; i++)
 +                      populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 +
 +              /* pte already populated, the following shouldn't fail */
 +              ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
 +                                     unit_pages);
 +              if (ret < 0)
 +                      panic("failed to map percpu area, err=%zd\n", ret);
 +
 +              /*
 +               * FIXME: Archs with virtual cache should flush local
 +               * cache for the linear mapping here - something
 +               * equivalent to flush_cache_vmap() on the local cpu.
 +               * flush_cache_vmap() can't be used as most supporting
 +               * data structures are not set up yet.
 +               */
 +
 +              /* copy static data */
 +              memcpy((void *)unit_addr, __per_cpu_load, static_size);
 +      }
 +
 +      /* we're ready, commit */
 +      pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
 +              unit_pages, static_size);
 +
 +      ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
 +                                   unit_pages << PAGE_SHIFT, vm.addr, NULL);
 +      goto out_free_ar;
 +
 +enomem:
 +      while (--j >= 0)
 +              free_fn(page_address(pages[j]), PAGE_SIZE);
 +      ret = -ENOMEM;
 +out_free_ar:
 +      free_bootmem(__pa(pages), pages_size);
 +      return ret;
 +}
 +
 +/*
 + * Large page remapping first chunk setup helper
 + */
 +#ifdef CONFIG_NEED_MULTIPLE_NODES
 +
 +/**
 + * pcpu_lpage_build_unit_map - build unit_map for large page remapping
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
 + * @unit_sizep: out parameter for unit size
 + * @unit_map: unit_map to be filled
 + * @cpu_distance_fn: callback to determine distance between cpus
 + *
 + * This function builds cpu -> unit map and determine other parameters
 + * considering needed percpu size, large page size and distances
 + * between CPUs in NUMA.
 + *
 + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
 + * may share units in the same large page.  The returned configuration
 + * is guaranteed to have CPUs on different nodes on different large
 + * pages and >=75% usage of allocated virtual address space.
 + *
 + * RETURNS:
 + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
 + * returns the number of units to be allocated.  -errno on failure.
 + */
 +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
 +                                   ssize_t *dyn_sizep, size_t *unit_sizep,
 +                                   size_t lpage_size, int *unit_map,
 +                                   pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 +{
 +      static int group_map[NR_CPUS] __initdata;
 +      static int group_cnt[NR_CPUS] __initdata;
 +      int group_cnt_max = 0;
 +      size_t size_sum, min_unit_size, alloc_size;
 +      int upa, max_upa, uninitialized_var(best_upa);  /* units_per_alloc */
 +      int last_allocs;
 +      unsigned int cpu, tcpu;
 +      int group, unit;
 +
 +      /*
 +       * Determine min_unit_size, alloc_size and max_upa such that
 +       * alloc_size is multiple of lpage_size and is the smallest
 +       * which can accommodate 4k aligned segments which are equal to
 +       * or larger than min_unit_size.
 +       */
 +      size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
 +      min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
 +
 +      alloc_size = roundup(min_unit_size, lpage_size);
 +      upa = alloc_size / min_unit_size;
 +      while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 +              upa--;
 +      max_upa = upa;
 +
 +      /* group cpus according to their proximity */
 +      for_each_possible_cpu(cpu) {
 +              group = 0;
 +      next_group:
 +              for_each_possible_cpu(tcpu) {
 +                      if (cpu == tcpu)
 +                              break;
 +                      if (group_map[tcpu] == group &&
 +                          (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
 +                           cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
 +                              group++;
 +                              goto next_group;
 +                      }
 +              }
 +              group_map[cpu] = group;
 +              group_cnt[group]++;
 +              group_cnt_max = max(group_cnt_max, group_cnt[group]);
 +      }
 +
 +      /*
 +       * Expand unit size until address space usage goes over 75%
 +       * and then as much as possible without using more address
 +       * space.
 +       */
 +      last_allocs = INT_MAX;
 +      for (upa = max_upa; upa; upa--) {
 +              int allocs = 0, wasted = 0;
 +
 +              if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 +                      continue;
 +
 +              for (group = 0; group_cnt[group]; group++) {
 +                      int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
 +                      allocs += this_allocs;
 +                      wasted += this_allocs * upa - group_cnt[group];
 +              }
 +
 +              /*
 +               * Don't accept if wastage is over 25%.  The
 +               * greater-than comparison ensures upa==1 always
 +               * passes the following check.
 +               */
 +              if (wasted > num_possible_cpus() / 3)
 +                      continue;
 +
 +              /* and then don't consume more memory */
 +              if (allocs > last_allocs)
 +                      break;
 +              last_allocs = allocs;
 +              best_upa = upa;
 +      }
 +      *unit_sizep = alloc_size / best_upa;
  
 -      return pcpu_setup_first_chunk(pcpue_get_page, static_size,
 -                                    reserved_size, dyn_size,
 -                                    pcpue_unit_size, pcpue_ptr, NULL);
 +      /* assign units to cpus accordingly */
 +      unit = 0;
 +      for (group = 0; group_cnt[group]; group++) {
 +              for_each_possible_cpu(cpu)
 +                      if (group_map[cpu] == group)
 +                              unit_map[cpu] = unit++;
 +              unit = roundup(unit, best_upa);
 +      }
 +
 +      return unit;    /* unit contains aligned number of units */
 +}
 +
 +struct pcpul_ent {
 +      void            *ptr;
 +      void            *map_addr;
 +};
 +
 +static size_t pcpul_size;
 +static size_t pcpul_lpage_size;
 +static int pcpul_nr_lpages;
 +static struct pcpul_ent *pcpul_map;
 +
 +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
 +                                   unsigned int *cpup)
 +{
 +      unsigned int cpu;
 +
 +      for_each_possible_cpu(cpu)
 +              if (unit_map[cpu] == unit) {
 +                      if (cpup)
 +                              *cpup = cpu;
 +                      return true;
 +              }
 +
 +      return false;
 +}
 +
 +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
 +                                      size_t reserved_size, size_t dyn_size,
 +                                      size_t unit_size, size_t lpage_size,
 +                                      const int *unit_map, int nr_units)
 +{
 +      int width = 1, v = nr_units;
 +      char empty_str[] = "--------";
 +      int upl, lpl;   /* units per lpage, lpage per line */
 +      unsigned int cpu;
 +      int lpage, unit;
 +
 +      while (v /= 10)
 +              width++;
 +      empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
 +
 +      upl = max_t(int, lpage_size / unit_size, 1);
 +      lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
 +
 +      printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
 +             static_size, reserved_size, dyn_size, unit_size, lpage_size);
 +
 +      for (lpage = 0, unit = 0; unit < nr_units; unit++) {
 +              if (!(unit % upl)) {
 +                      if (!(lpage++ % lpl)) {
 +                              printk("\n");
 +                              printk("%spcpu-lpage: ", lvl);
 +                      } else
 +                              printk("| ");
 +              }
 +              if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
 +                      printk("%0*d ", width, cpu);
 +              else
 +                      printk("%s ", empty_str);
 +      }
 +      printk("\n");
 +}
 +
 +/**
 + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @dyn_size: free size for dynamic allocation in bytes
 + * @unit_size: unit size in bytes
 + * @lpage_size: the size of a large page
 + * @unit_map: cpu -> unit mapping
 + * @nr_units: the number of units
 + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
 + * @free_fn: function to free percpu memory, @size <= lpage_size
 + * @map_fn: function to map percpu lpage, always called with lpage_size
 + *
 + * This allocator uses large page to build and map the first chunk.
 + * Unlike other helpers, the caller should always specify @dyn_size
 + * and @unit_size.  These parameters along with @unit_map and
 + * @nr_units can be determined using pcpu_lpage_build_unit_map().
 + * This two stage initialization is to allow arch code to evaluate the
 + * parameters before committing to it.
 + *
 + * Large pages are allocated as directed by @unit_map and other
 + * parameters and mapped to vmalloc space.  Unused holes are returned
 + * to the page allocator.  Note that these holes end up being actively
 + * mapped twice - once to the physical mapping and to the vmalloc area
 + * for the first percpu chunk.  Depending on architecture, this might
 + * cause problem when changing page attributes of the returned area.
 + * These double mapped areas can be detected using
 + * pcpu_lpage_remapped().
 + *
 + * RETURNS:
 + * The determined pcpu_unit_size which can be used to initialize
 + * percpu access on success, -errno on failure.
 + */
 +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 +                                    size_t dyn_size, size_t unit_size,
 +                                    size_t lpage_size, const int *unit_map,
 +                                    int nr_units,
 +                                    pcpu_fc_alloc_fn_t alloc_fn,
 +                                    pcpu_fc_free_fn_t free_fn,
 +                                    pcpu_fc_map_fn_t map_fn)
 +{
 +      static struct vm_struct vm;
 +      size_t chunk_size = unit_size * nr_units;
 +      size_t map_size;
 +      unsigned int cpu;
 +      ssize_t ret;
 +      int i, j, unit;
 +
 +      pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
 +                           unit_size, lpage_size, unit_map, nr_units);
 +
 +      BUG_ON(chunk_size % lpage_size);
 +
 +      pcpul_size = static_size + reserved_size + dyn_size;
 +      pcpul_lpage_size = lpage_size;
 +      pcpul_nr_lpages = chunk_size / lpage_size;
 +
 +      /* allocate pointer array and alloc large pages */
 +      map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
 +      pcpul_map = alloc_bootmem(map_size);
 +
 +      /* allocate all pages */
 +      for (i = 0; i < pcpul_nr_lpages; i++) {
 +              size_t offset = i * lpage_size;
 +              int first_unit = offset / unit_size;
 +              int last_unit = (offset + lpage_size - 1) / unit_size;
 +              void *ptr;
 +
 +              /* find out which cpu is mapped to this unit */
 +              for (unit = first_unit; unit <= last_unit; unit++)
 +                      if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
 +                              goto found;
 +              continue;
 +      found:
 +              ptr = alloc_fn(cpu, lpage_size);
 +              if (!ptr) {
 +                      pr_warning("PERCPU: failed to allocate large page "
 +                                 "for cpu%u\n", cpu);
 +                      goto enomem;
 +              }
 +
 +              pcpul_map[i].ptr = ptr;
 +      }
 +
 +      /* return unused holes */
 +      for (unit = 0; unit < nr_units; unit++) {
 +              size_t start = unit * unit_size;
 +              size_t end = start + unit_size;
 +              size_t off, next;
 +
 +              /* don't free used part of occupied unit */
 +              if (pcpul_unit_to_cpu(unit, unit_map, NULL))
 +                      start += pcpul_size;
 +
 +              /* unit can span more than one page, punch the holes */
 +              for (off = start; off < end; off = next) {
 +                      void *ptr = pcpul_map[off / lpage_size].ptr;
 +                      next = min(roundup(off + 1, lpage_size), end);
 +                      if (ptr)
 +                              free_fn(ptr + off % lpage_size, next - off);
 +              }
 +      }
 +
 +      /* allocate address, map and copy */
 +      vm.flags = VM_ALLOC;
 +      vm.size = chunk_size;
 +      vm_area_register_early(&vm, unit_size);
 +
 +      for (i = 0; i < pcpul_nr_lpages; i++) {
 +              if (!pcpul_map[i].ptr)
 +                      continue;
 +              pcpul_map[i].map_addr = vm.addr + i * lpage_size;
 +              map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
 +      }
 +
 +      for_each_possible_cpu(cpu)
 +              memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
 +                     static_size);
 +
 +      /* we're ready, commit */
 +      pr_info("PERCPU: Remapped at %p with large pages, static data "
 +              "%zu bytes\n", vm.addr, static_size);
 +
 +      ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 +                                   unit_size, vm.addr, unit_map);
 +
 +      /*
 +       * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
 +       * lpages are pushed to the end and trimmed.
 +       */
 +      for (i = 0; i < pcpul_nr_lpages - 1; i++)
 +              for (j = i + 1; j < pcpul_nr_lpages; j++) {
 +                      struct pcpul_ent tmp;
 +
 +                      if (!pcpul_map[j].ptr)
 +                              continue;
 +                      if (pcpul_map[i].ptr &&
 +                          pcpul_map[i].ptr < pcpul_map[j].ptr)
 +                              continue;
 +
 +                      tmp = pcpul_map[i];
 +                      pcpul_map[i] = pcpul_map[j];
 +                      pcpul_map[j] = tmp;
 +              }
 +
 +      while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
 +              pcpul_nr_lpages--;
 +
 +      return ret;
 +
 +enomem:
 +      for (i = 0; i < pcpul_nr_lpages; i++)
 +              if (pcpul_map[i].ptr)
 +                      free_fn(pcpul_map[i].ptr, lpage_size);
 +      free_bootmem(__pa(pcpul_map), map_size);
 +      return -ENOMEM;
 +}
 +
 +/**
 + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 + * @kaddr: the kernel address in question
 + *
 + * Determine whether @kaddr falls in the pcpul recycled area.  This is
 + * used by pageattr to detect VM aliases and break up the pcpu large
 + * page mapping such that the same physical page is not mapped under
 + * different attributes.
 + *
 + * The recycled area is always at the tail of a partially used large
 + * page.
 + *
 + * RETURNS:
 + * Address of corresponding remapped pcpu address if match is found;
 + * otherwise, NULL.
 + */
 +void *pcpu_lpage_remapped(void *kaddr)
 +{
 +      unsigned long lpage_mask = pcpul_lpage_size - 1;
 +      void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
 +      unsigned long offset = (unsigned long)kaddr & lpage_mask;
 +      int left = 0, right = pcpul_nr_lpages - 1;
 +      int pos;
 +
 +      /* pcpul in use at all? */
 +      if (!pcpul_map)
 +              return NULL;
 +
 +      /* okay, perform binary search */
 +      while (left <= right) {
 +              pos = (left + right) / 2;
 +
 +              if (pcpul_map[pos].ptr < lpage_addr)
 +                      left = pos + 1;
 +              else if (pcpul_map[pos].ptr > lpage_addr)
 +                      right = pos - 1;
 +              else
 +                      return pcpul_map[pos].map_addr + offset;
 +      }
 +
 +      return NULL;
 +}
 +#endif
 +
 +/*
 + * Generic percpu area setup.
 + *
 + * The embedding helper is used because its behavior closely resembles
 + * the original non-dynamic generic percpu area setup.  This is
 + * important because many archs have addressing restrictions and might
 + * fail if the percpu area is located far away from the previous
 + * location.  As an added bonus, in non-NUMA cases, embedding is
 + * generally a good idea TLB-wise because percpu area can piggy back
 + * on the physical linear memory mapping which uses large page
 + * mappings on applicable archs.
 + */
 +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 +EXPORT_SYMBOL(__per_cpu_offset);
 +
 +void __init setup_per_cpu_areas(void)
 +{
 +      size_t static_size = __per_cpu_end - __per_cpu_start;
 +      ssize_t unit_size;
 +      unsigned long delta;
 +      unsigned int cpu;
 +
 +      /*
 +       * Always reserve area for module percpu variables.  That's
 +       * what the legacy allocator did.
 +       */
 +      unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
 +                                         PERCPU_DYNAMIC_RESERVE);
 +      if (unit_size < 0)
 +              panic("Failed to initialized percpu areas.");
 +
 +      delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 +      for_each_possible_cpu(cpu)
 +              __per_cpu_offset[cpu] = delta + cpu * unit_size;
  }
 +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --cc mm/slub.c
Simple merge