Merge branch 'percpu-for-linus' into percpu-for-next
Tejun Heo [Fri, 14 Aug 2009 05:41:02 +0000 (14:41 +0900)]
Conflicts:
arch/sparc/kernel/smp_64.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/setup_percpu.c
drivers/cpufreq/cpufreq_ondemand.c
mm/percpu.c

Conflicts in the core and arch percpu code are mostly from commit
ed78e1e078dd44249f88b1dd8c76dafb39567161, which replaced many uses of
num_possible_cpus() with nr_cpu_ids.  As the for-next branch has moved
all the first chunk allocators into mm/percpu.c, the corresponding
changes are moved from arch code to mm/percpu.c.

Signed-off-by: Tejun Heo <tj@kernel.org>

22 files changed:
1  2 
Makefile
arch/mn10300/kernel/vmlinux.lds.S
arch/sparc/kernel/smp_64.c
arch/x86/Kconfig
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/mm/pageattr.c
block/cfq-iosched.c
drivers/cpufreq/cpufreq_conservative.c
drivers/cpufreq/cpufreq_ondemand.c
drivers/xen/events.c
include/asm-generic/vmlinux.lds.h
init/main.c
kernel/module.c
kernel/perf_counter.c
kernel/sched.c
kernel/trace/trace_events.c
mm/page-writeback.c
mm/percpu.c
mm/slub.c

diff --cc Makefile
Simple merge
Simple merge
@@@ -1478,26 -1491,25 +1478,26 @@@ void __init setup_per_cpu_areas(void
        size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
        static struct vm_struct vm;
        unsigned long delta, cpu;
 -      size_t pcpu_unit_size;
 +      size_t size_sum, pcpu_unit_size;
        size_t ptrs_size;
 +      void **ptrs;
  
 -      pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 -                             PERCPU_DYNAMIC_RESERVE);
 -      dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
 +      size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 +                           PERCPU_DYNAMIC_RESERVE);
 +      dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
  
  
-       ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
 -      ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0]));
 -      pcpur_ptrs = alloc_bootmem(ptrs_size);
++      ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
 +      ptrs = alloc_bootmem(ptrs_size);
  
        for_each_possible_cpu(cpu) {
 -              pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
 -                                                   PCPU_CHUNK_SIZE);
 +              ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
 +                                             PCPU_CHUNK_SIZE);
  
 -              free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
 -                           PCPU_CHUNK_SIZE - pcpur_size);
 +              free_bootmem(__pa(ptrs[cpu] + size_sum),
 +                           PCPU_CHUNK_SIZE - size_sum);
  
 -              memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
 +              memcpy(ptrs[cpu], __per_cpu_load, static_size);
        }
  
        /* allocate address and map */
Simple merge
Simple merge
@@@ -1559,8 -1798,9 +1798,9 @@@ void callchain_store(struct perf_callch
                entry->ip[entry->nr++] = ip;
  }
  
 -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
 -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
 +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
 +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+ static DEFINE_PER_CPU(int, in_nmi_frame);
  
  
  static void
@@@ -176,35 -185,130 +176,35 @@@ static ssize_t __init setup_pcpu_lpage(
                return -EINVAL;
        }
  
 -      /*
 -       * Currently supports only single page.  Supporting multiple
 -       * pages won't be too difficult if it ever becomes necessary.
 -       */
 -      pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 -                             PERCPU_DYNAMIC_RESERVE);
 -      if (pcpul_size > PMD_SIZE) {
 -              pr_warning("PERCPU: static data is larger than large page, "
 -                         "can't use large page\n");
 -              return -EINVAL;
 -      }
 -      dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 -
 -      /* allocate pointer array and alloc large pages */
 -      map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
 -      pcpul_map = alloc_bootmem(map_size);
 -
 -      for_each_possible_cpu(cpu) {
 -              pcpul_map[cpu].cpu = cpu;
 -              pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
 -                                                      PMD_SIZE);
 -              if (!pcpul_map[cpu].ptr) {
 -                      pr_warning("PERCPU: failed to allocate large page "
 -                                 "for cpu%u\n", cpu);
 -                      goto enomem;
 -              }
 -
 -              /*
 -               * Only use pcpul_size bytes and give back the rest.
 -               *
 -               * Ingo: The 2MB up-rounding bootmem is needed to make
 -               * sure the partial 2MB page is still fully RAM - it's
 -               * not well-specified to have a PAT-incompatible area
 -               * (unmapped RAM, device memory, etc.) in that hole.
 -               */
 -              free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
 -                           PMD_SIZE - pcpul_size);
 -
 -              memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
 +      /* allocate and build unit_map */
-       unit_map_size = num_possible_cpus() * sizeof(int);
++      unit_map_size = nr_cpu_ids * sizeof(int);
 +      unit_map = alloc_bootmem_nopanic(unit_map_size);
 +      if (!unit_map) {
 +              pr_warning("PERCPU: failed to allocate unit_map\n");
 +              return -ENOMEM;
        }
  
 -      /* allocate address and map */
 -      pcpul_vm.flags = VM_ALLOC;
 -      pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
 -      vm_area_register_early(&pcpul_vm, PMD_SIZE);
 -
 -      for_each_possible_cpu(cpu) {
 -              pmd_t *pmd, pmd_v;
 -
 -              pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
 -                                       cpu * PMD_SIZE);
 -              pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
 -                              PAGE_KERNEL_LARGE);
 -              set_pmd(pmd, pmd_v);
 +      ret = pcpu_lpage_build_unit_map(static_size,
 +                                      PERCPU_FIRST_CHUNK_RESERVE,
 +                                      &dyn_size, &unit_size, PMD_SIZE,
 +                                      unit_map, pcpu_lpage_cpu_distance);
 +      if (ret < 0) {
 +              pr_warning("PERCPU: failed to build unit_map\n");
 +              goto out_free;
        }
 +      nr_units = ret;
  
 -      /* we're ready, commit */
 -      pr_info("PERCPU: Remapped at %p with large pages, static data "
 -              "%zu bytes\n", pcpul_vm.addr, static_size);
 -
 -      ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
 -                                   PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
 -                                   PMD_SIZE, pcpul_vm.addr, NULL);
 -
 -      /* sort pcpul_map array for pcpu_lpage_remapped() */
 -      for (i = 0; i < nr_cpu_ids - 1; i++)
 -              for (j = i + 1; j < nr_cpu_ids; j++)
 -                      if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
 -                              struct pcpul_ent tmp = pcpul_map[i];
 -                              pcpul_map[i] = pcpul_map[j];
 -                              pcpul_map[j] = tmp;
 -                      }
 -
 -      return ret;
 -
 -enomem:
 -      for_each_possible_cpu(cpu)
 -              if (pcpul_map[cpu].ptr)
 -                      free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
 -      free_bootmem(__pa(pcpul_map), map_size);
 -      return -ENOMEM;
 -}
 +      /* do the parameters look okay? */
 +      if (!chosen) {
 +              size_t vm_size = VMALLOC_END - VMALLOC_START;
 +              size_t tot_size = nr_units * unit_size;
  
 -/**
 - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 - * @kaddr: the kernel address in question
 - *
 - * Determine whether @kaddr falls in the pcpul recycled area.  This is
 - * used by pageattr to detect VM aliases and break up the pcpu PMD
 - * mapping such that the same physical page is not mapped under
 - * different attributes.
 - *
 - * The recycled area is always at the tail of a partially used PMD
 - * page.
 - *
 - * RETURNS:
 - * Address of corresponding remapped pcpu address if match is found;
 - * otherwise, NULL.
 - */
 -void *pcpu_lpage_remapped(void *kaddr)
 -{
 -      void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
 -      unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
 -      int left = 0, right = nr_cpu_ids - 1;
 -      int pos;
 -
 -      /* pcpul in use at all? */
 -      if (!pcpul_map)
 -              return NULL;
 -
 -      /* okay, perform binary search */
 -      while (left <= right) {
 -              pos = (left + right) / 2;
 -
 -              if (pcpul_map[pos].ptr < pmd_addr)
 -                      left = pos + 1;
 -              else if (pcpul_map[pos].ptr > pmd_addr)
 -                      right = pos - 1;
 -              else {
 -                      /* it shouldn't be in the area for the first chunk */
 -                      WARN_ON(offset < pcpul_size);
 -
 -                      return pcpul_vm.addr +
 -                              pcpul_map[pos].cpu * PMD_SIZE + offset;
 +              /* don't consume more than 20% of vmalloc area */
 +              if (tot_size > vm_size / 5) {
 +                      pr_info("PERCPU: too large chunk size %zuMB for "
 +                              "large page remap\n", tot_size >> 20);
 +                      ret = -EINVAL;
 +                      goto out_free;
                }
        }
  
Simple merge
Simple merge
Simple merge
@@@ -64,8 -64,14 +64,14 @@@ struct cpu_dbs_info_s 
        unsigned int requested_freq;
        int cpu;
        unsigned int enable:1;
+       /*
+        * percpu mutex that serializes governor limit change with
+        * do_dbs_timer invocation. We do not want do_dbs_timer to run
+        * when user is changing the governor or limits.
+        */
+       struct mutex timer_mutex;
  };
 -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);
  
  static unsigned int dbs_enable;       /* number of CPUs using this policy */
  
@@@ -70,10 -70,15 +70,15 @@@ struct cpu_dbs_info_s 
        unsigned int freq_lo_jiffies;
        unsigned int freq_hi_jiffies;
        int cpu;
-       unsigned int enable:1,
-               sample_type:1;
+       unsigned int sample_type:1;
+       /*
+        * percpu mutex that serializes governor limit change with
+        * do_dbs_timer invocation. We do not want do_dbs_timer to run
+        * when user is changing the governor or limits.
+        */
+       struct mutex timer_mutex;
  };
 -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
  
  static unsigned int dbs_enable;       /* number of CPUs using this policy */
  
@@@ -193,6 -190,13 +191,13 @@@ static unsigned int powersave_bias_targ
        return freq_hi;
  }
  
+ static void ondemand_powersave_bias_init_cpu(int cpu)
+ {
 -      struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
++      struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
+       dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
+       dbs_info->freq_lo = 0;
+ }
  static void ondemand_powersave_bias_init(void)
  {
        int i;
@@@ -569,9 -550,10 +551,10 @@@ static int cpufreq_governor_dbs(struct 
                        return rc;
                }
  
+               dbs_enable++;
                for_each_cpu(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
 -                      j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +                      j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;
  
                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
Simple merge
   *    EXCEPTION_TABLE(...)
   *    NOTES
   *
-  *    __bss_start = .;
-  *    BSS_SECTION(0, 0)
-  *    __bss_stop = .;
+  *    BSS_SECTION(0, 0, 0)
   *    _end = .;
   *
 - *    /DISCARD/ : {
 - *            EXIT_TEXT
 - *            EXIT_DATA
 - *            EXIT_CALL
 - *    }
   *    STABS_DEBUG
   *    DWARF_DEBUG
 + *
 + *    DISCARDS                // must be the last
   * }
   *
   * [__init_begin, __init_end] is the init section that may be freed after init
diff --cc init/main.c
Simple merge
diff --cc kernel/module.c
Simple merge
Simple merge
diff --cc kernel/sched.c
Simple merge
Simple merge
Simple merge
diff --cc mm/percpu.c
@@@ -1003,8 -747,9 +1003,8 @@@ static struct pcpu_chunk *alloc_pcpu_ch
        chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
        chunk->map[chunk->map_used++] = pcpu_unit_size;
 -      chunk->page = chunk->page_ar;
  
-       chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+       chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
        if (!chunk->vm) {
                free_pcpu_chunk(chunk);
                return NULL;
@@@ -1290,59 -1052,24 +1290,59 @@@ size_t __init pcpu_setup_first_chunk(si
        BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
                     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
        BUG_ON(!static_size);
 -      if (unit_size >= 0) {
 -              BUG_ON(unit_size < size_sum);
 -              BUG_ON(unit_size & ~PAGE_MASK);
 -              BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 -      } else
 -              BUG_ON(base_addr);
 -      BUG_ON(base_addr && populate_pte_fn);
 -
 -      if (unit_size >= 0)
 -              pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 -      else
 -              pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 -                                      PFN_UP(size_sum));
 +      BUG_ON(!base_addr);
 +      BUG_ON(unit_size < size_sum);
 +      BUG_ON(unit_size & ~PAGE_MASK);
 +      BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 +
 +      /* determine number of units and verify and initialize pcpu_unit_map */
 +      if (unit_map) {
 +              int first_unit = INT_MAX, last_unit = INT_MIN;
 +
 +              for_each_possible_cpu(cpu) {
 +                      int unit = unit_map[cpu];
 +
 +                      BUG_ON(unit < 0);
 +                      for_each_possible_cpu(tcpu) {
 +                              if (tcpu == cpu)
 +                                      break;
 +                              /* the mapping should be one-to-one */
 +                              BUG_ON(unit_map[tcpu] == unit);
 +                      }
 +
 +                      if (unit < first_unit) {
 +                              pcpu_first_unit_cpu = cpu;
 +                              first_unit = unit;
 +                      }
 +                      if (unit > last_unit) {
 +                              pcpu_last_unit_cpu = cpu;
 +                              last_unit = unit;
 +                      }
 +              }
 +              pcpu_nr_units = last_unit + 1;
 +              pcpu_unit_map = unit_map;
 +      } else {
 +              int *identity_map;
 +
 +              /* #units == #cpus, identity mapped */
-               identity_map = alloc_bootmem(num_possible_cpus() *
++              identity_map = alloc_bootmem(nr_cpu_ids *
 +                                           sizeof(identity_map[0]));
  
 +              for_each_possible_cpu(cpu)
 +                      identity_map[cpu] = cpu;
 +
 +              pcpu_first_unit_cpu = 0;
 +              pcpu_last_unit_cpu = pcpu_nr_units - 1;
-               pcpu_nr_units = num_possible_cpus();
++              pcpu_nr_units = nr_cpu_ids;
 +              pcpu_unit_map = identity_map;
 +      }
 +
 +      /* determine basic parameters */
 +      pcpu_unit_pages = unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 -      pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
 -      pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 -              + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
 +      pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
 +      pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 +              BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
  
        if (dyn_size < 0)
                dyn_size = pcpu_unit_size - static_size - reserved_size;
@@@ -1461,555 -1237,44 +1461,558 @@@ ssize_t __init pcpu_embed_first_chunk(s
        unsigned int cpu;
  
        /* determine parameters and allocate */
 -      pcpue_size = PFN_ALIGN(static_size + reserved_size +
 -                             (dyn_size >= 0 ? dyn_size : 0));
 -      if (dyn_size != 0)
 -              dyn_size = pcpue_size - static_size - reserved_size;
 -
 -      if (unit_size >= 0) {
 -              BUG_ON(unit_size < pcpue_size);
 -              pcpue_unit_size = unit_size;
 -      } else
 -              pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 -
 -      chunk_size = pcpue_unit_size * nr_cpu_ids;
 -
 -      pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 -                                          __pa(MAX_DMA_ADDRESS));
 -      if (!pcpue_ptr) {
 +      size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 +
 +      unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-       chunk_size = unit_size * num_possible_cpus();
++      chunk_size = unit_size * nr_cpu_ids;
 +
 +      base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 +                                     __pa(MAX_DMA_ADDRESS));
 +      if (!base) {
                pr_warning("PERCPU: failed to allocate %zu bytes for "
                           "embedding\n", chunk_size);
                return -ENOMEM;
        }
  
        /* return the leftover and copy */
-       for_each_possible_cpu(cpu) {
+       for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
 -              void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
 +              void *ptr = base + cpu * unit_size;
  
-               free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
-               memcpy(ptr, __per_cpu_load, static_size);
+               if (cpu_possible(cpu)) {
 -                      free_bootmem(__pa(ptr + pcpue_size),
 -                                   pcpue_unit_size - pcpue_size);
++                      free_bootmem(__pa(ptr + size_sum),
++                                   unit_size - size_sum);
+                       memcpy(ptr, __per_cpu_load, static_size);
+               } else
 -                      free_bootmem(__pa(ptr), pcpue_unit_size);
++                      free_bootmem(__pa(ptr), unit_size);
        }
  
        /* we're ready, commit */
        pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 -              pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 +              size_sum >> PAGE_SHIFT, base, static_size);
 +
 +      return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 +                                    unit_size, base, NULL);
 +}
 +
 +/**
 + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 + * @free_fn: function to free percpu page, always called with PAGE_SIZE
 + * @populate_pte_fn: function to populate pte
 + *
 + * This is a helper to ease setting up page-remapped first percpu chunk
 + * and can be called where pcpu_setup_first_chunk() is expected.
 + *
 + * This is the basic allocator.  Static percpu area is allocated
 + * page-by-page into vmalloc area.
 + *
 + * RETURNS:
 + * The determined pcpu_unit_size which can be used to initialize
 + * percpu access on success, -errno on failure.
 + */
 +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 +                                 pcpu_fc_alloc_fn_t alloc_fn,
 +                                 pcpu_fc_free_fn_t free_fn,
 +                                 pcpu_fc_populate_pte_fn_t populate_pte_fn)
 +{
 +      static struct vm_struct vm;
 +      int unit_pages;
 +      size_t pages_size;
 +      struct page **pages;
 +      unsigned int cpu;
 +      int i, j;
 +      ssize_t ret;
 +
 +      unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
 +                                PCPU_MIN_UNIT_SIZE));
 +
 +      /* unaligned allocations can't be freed, round up to page size */
-       pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
-                              sizeof(pages[0]));
++      pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
 +      pages = alloc_bootmem(pages_size);
 +
 +      /* allocate pages */
 +      j = 0;
 +      for_each_possible_cpu(cpu)
 +              for (i = 0; i < unit_pages; i++) {
 +                      void *ptr;
 +
 +                      ptr = alloc_fn(cpu, PAGE_SIZE);
 +                      if (!ptr) {
 +                              pr_warning("PERCPU: failed to allocate "
 +                                         "4k page for cpu%u\n", cpu);
 +                              goto enomem;
 +                      }
 +                      pages[j++] = virt_to_page(ptr);
 +              }
 +
 +      /* allocate vm area, map the pages and copy static data */
 +      vm.flags = VM_ALLOC;
-       vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
++      vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
 +      vm_area_register_early(&vm, PAGE_SIZE);
 +
 +      for_each_possible_cpu(cpu) {
 +              unsigned long unit_addr = (unsigned long)vm.addr +
 +                      (cpu * unit_pages << PAGE_SHIFT);
 +
 +              for (i = 0; i < unit_pages; i++)
 +                      populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 +
 +              /* pte already populated, the following shouldn't fail */
 +              ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
 +                                     unit_pages);
 +              if (ret < 0)
 +                      panic("failed to map percpu area, err=%zd\n", ret);
 +
 +              /*
 +               * FIXME: Archs with virtual cache should flush local
 +               * cache for the linear mapping here - something
 +               * equivalent to flush_cache_vmap() on the local cpu.
 +               * flush_cache_vmap() can't be used as most supporting
 +               * data structures are not set up yet.
 +               */
 +
 +              /* copy static data */
 +              memcpy((void *)unit_addr, __per_cpu_load, static_size);
 +      }
 +
 +      /* we're ready, commit */
 +      pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
 +              unit_pages, static_size);
 +
 +      ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
 +                                   unit_pages << PAGE_SHIFT, vm.addr, NULL);
 +      goto out_free_ar;
 +
 +enomem:
 +      while (--j >= 0)
 +              free_fn(page_address(pages[j]), PAGE_SIZE);
 +      ret = -ENOMEM;
 +out_free_ar:
 +      free_bootmem(__pa(pages), pages_size);
 +      return ret;
 +}
 +
 +/*
 + * Large page remapping first chunk setup helper
 + */
 +#ifdef CONFIG_NEED_MULTIPLE_NODES
 +
 +/**
 + * pcpu_lpage_build_unit_map - build unit_map for large page remapping
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
 + * @unit_sizep: out parameter for unit size
 + * @unit_map: unit_map to be filled
 + * @cpu_distance_fn: callback to determine distance between cpus
 + *
 + * This function builds cpu -> unit map and determines other parameters
 + * considering needed percpu size, large page size and distances
 + * between CPUs in NUMA.
 + *
 + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
 + * may share units in the same large page.  The returned configuration
 + * is guaranteed to have CPUs on different nodes on different large
 + * pages and >=75% usage of allocated virtual address space.
 + *
 + * RETURNS:
 + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
 + * returns the number of units to be allocated.  -errno on failure.
 + */
 +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
 +                                   ssize_t *dyn_sizep, size_t *unit_sizep,
 +                                   size_t lpage_size, int *unit_map,
 +                                   pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 +{
 +      static int group_map[NR_CPUS] __initdata;
 +      static int group_cnt[NR_CPUS] __initdata;
 +      int group_cnt_max = 0;
 +      size_t size_sum, min_unit_size, alloc_size;
 +      int upa, max_upa, uninitialized_var(best_upa);  /* units_per_alloc */
 +      int last_allocs;
 +      unsigned int cpu, tcpu;
 +      int group, unit;
 +
 +      /*
 +       * Determine min_unit_size, alloc_size and max_upa such that
 +       * alloc_size is multiple of lpage_size and is the smallest
 +       * which can accommodate 4k aligned segments which are equal to
 +       * or larger than min_unit_size.
 +       */
 +      size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
 +      min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
 +
 +      alloc_size = roundup(min_unit_size, lpage_size);
 +      upa = alloc_size / min_unit_size;
 +      while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 +              upa--;
 +      max_upa = upa;
 +
 +      /* group cpus according to their proximity */
 +      for_each_possible_cpu(cpu) {
 +              group = 0;
 +      next_group:
 +              for_each_possible_cpu(tcpu) {
 +                      if (cpu == tcpu)
 +                              break;
 +                      if (group_map[tcpu] == group &&
 +                          (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
 +                           cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
 +                              group++;
 +                              goto next_group;
 +                      }
 +              }
 +              group_map[cpu] = group;
 +              group_cnt[group]++;
 +              group_cnt_max = max(group_cnt_max, group_cnt[group]);
 +      }
 +
 +      /*
 +       * Expand unit size until address space usage goes over 75%
 +       * and then as much as possible without using more address
 +       * space.
 +       */
 +      last_allocs = INT_MAX;
 +      for (upa = max_upa; upa; upa--) {
 +              int allocs = 0, wasted = 0;
 +
 +              if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 +                      continue;
 +
 +              for (group = 0; group_cnt[group]; group++) {
 +                      int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
 +                      allocs += this_allocs;
 +                      wasted += this_allocs * upa - group_cnt[group];
 +              }
 +
 +              /*
 +               * Don't accept if wastage is over 25%.  The
 +               * greater-than comparison ensures upa==1 always
 +               * passes the following check.
 +               */
 +              if (wasted > num_possible_cpus() / 3)
 +                      continue;
 +
 +              /* and then don't consume more memory */
 +              if (allocs > last_allocs)
 +                      break;
 +              last_allocs = allocs;
 +              best_upa = upa;
 +      }
 +      *unit_sizep = alloc_size / best_upa;
  
 -      return pcpu_setup_first_chunk(pcpue_get_page, static_size,
 -                                    reserved_size, dyn_size,
 -                                    pcpue_unit_size, pcpue_ptr, NULL);
 +      /* assign units to cpus accordingly */
 +      unit = 0;
 +      for (group = 0; group_cnt[group]; group++) {
 +              for_each_possible_cpu(cpu)
 +                      if (group_map[cpu] == group)
 +                              unit_map[cpu] = unit++;
 +              unit = roundup(unit, best_upa);
 +      }
 +
 +      return unit;    /* unit contains aligned number of units */
 +}
 +
 +struct pcpul_ent {
 +      void            *ptr;
 +      void            *map_addr;
 +};
 +
 +static size_t pcpul_size;
 +static size_t pcpul_lpage_size;
 +static int pcpul_nr_lpages;
 +static struct pcpul_ent *pcpul_map;
 +
 +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
 +                                   unsigned int *cpup)
 +{
 +      unsigned int cpu;
 +
 +      for_each_possible_cpu(cpu)
 +              if (unit_map[cpu] == unit) {
 +                      if (cpup)
 +                              *cpup = cpu;
 +                      return true;
 +              }
 +
 +      return false;
 +}
 +
 +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
 +                                      size_t reserved_size, size_t dyn_size,
 +                                      size_t unit_size, size_t lpage_size,
 +                                      const int *unit_map, int nr_units)
 +{
 +      int width = 1, v = nr_units;
 +      char empty_str[] = "--------";
 +      int upl, lpl;   /* units per lpage, lpage per line */
 +      unsigned int cpu;
 +      int lpage, unit;
 +
 +      while (v /= 10)
 +              width++;
 +      empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
 +
 +      upl = max_t(int, lpage_size / unit_size, 1);
 +      lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
 +
 +      printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
 +             static_size, reserved_size, dyn_size, unit_size, lpage_size);
 +
 +      for (lpage = 0, unit = 0; unit < nr_units; unit++) {
 +              if (!(unit % upl)) {
 +                      if (!(lpage++ % lpl)) {
 +                              printk("\n");
 +                              printk("%spcpu-lpage: ", lvl);
 +                      } else
 +                              printk("| ");
 +              }
 +              if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
 +                      printk("%0*d ", width, cpu);
 +              else
 +                      printk("%s ", empty_str);
 +      }
 +      printk("\n");
 +}
 +
 +/**
 + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @dyn_size: free size for dynamic allocation in bytes
 + * @unit_size: unit size in bytes
 + * @lpage_size: the size of a large page
 + * @unit_map: cpu -> unit mapping
 + * @nr_units: the number of units
 + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
 + * @free_fn: function to free percpu memory, @size <= lpage_size
 + * @map_fn: function to map percpu lpage, always called with lpage_size
 + *
 + * This allocator uses large page to build and map the first chunk.
 + * Unlike other helpers, the caller should always specify @dyn_size
 + * and @unit_size.  These parameters along with @unit_map and
 + * @nr_units can be determined using pcpu_lpage_build_unit_map().
 + * This two-stage initialization allows arch code to evaluate the
 + * parameters before committing to them.
 + *
 + * Large pages are allocated as directed by @unit_map and other
 + * parameters and mapped to vmalloc space.  Unused holes are returned
 + * to the page allocator.  Note that these holes end up being actively
 + * mapped twice - once in the physical mapping and once in the vmalloc
 + * area for the first percpu chunk.  Depending on architecture, this
 + * might cause problems when changing page attributes of the returned area.
 + * These double mapped areas can be detected using
 + * pcpu_lpage_remapped().
 + *
 + * RETURNS:
 + * The determined pcpu_unit_size which can be used to initialize
 + * percpu access on success, -errno on failure.
 + */
 +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 +                                    size_t dyn_size, size_t unit_size,
 +                                    size_t lpage_size, const int *unit_map,
 +                                    int nr_units,
 +                                    pcpu_fc_alloc_fn_t alloc_fn,
 +                                    pcpu_fc_free_fn_t free_fn,
 +                                    pcpu_fc_map_fn_t map_fn)
 +{
 +      static struct vm_struct vm;
 +      size_t chunk_size = unit_size * nr_units;
 +      size_t map_size;
 +      unsigned int cpu;
 +      ssize_t ret;
 +      int i, j, unit;
 +
 +      pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
 +                           unit_size, lpage_size, unit_map, nr_units);
 +
 +      BUG_ON(chunk_size % lpage_size);
 +
 +      pcpul_size = static_size + reserved_size + dyn_size;
 +      pcpul_lpage_size = lpage_size;
 +      pcpul_nr_lpages = chunk_size / lpage_size;
 +
 +      /* allocate pointer array and alloc large pages */
 +      map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
 +      pcpul_map = alloc_bootmem(map_size);
 +
 +      /* allocate all pages */
 +      for (i = 0; i < pcpul_nr_lpages; i++) {
 +              size_t offset = i * lpage_size;
 +              int first_unit = offset / unit_size;
 +              int last_unit = (offset + lpage_size - 1) / unit_size;
 +              void *ptr;
 +
 +              /* find out which cpu is mapped to this unit */
 +              for (unit = first_unit; unit <= last_unit; unit++)
 +                      if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
 +                              goto found;
 +              continue;
 +      found:
 +              ptr = alloc_fn(cpu, lpage_size);
 +              if (!ptr) {
 +                      pr_warning("PERCPU: failed to allocate large page "
 +                                 "for cpu%u\n", cpu);
 +                      goto enomem;
 +              }
 +
 +              pcpul_map[i].ptr = ptr;
 +      }
 +
 +      /* return unused holes */
 +      for (unit = 0; unit < nr_units; unit++) {
 +              size_t start = unit * unit_size;
 +              size_t end = start + unit_size;
 +              size_t off, next;
 +
 +              /* don't free used part of occupied unit */
 +              if (pcpul_unit_to_cpu(unit, unit_map, NULL))
 +                      start += pcpul_size;
 +
 +              /* unit can span more than one page, punch the holes */
 +              for (off = start; off < end; off = next) {
 +                      void *ptr = pcpul_map[off / lpage_size].ptr;
 +                      next = min(roundup(off + 1, lpage_size), end);
 +                      if (ptr)
 +                              free_fn(ptr + off % lpage_size, next - off);
 +              }
 +      }
 +
 +      /* allocate address, map and copy */
 +      vm.flags = VM_ALLOC;
 +      vm.size = chunk_size;
 +      vm_area_register_early(&vm, unit_size);
 +
 +      for (i = 0; i < pcpul_nr_lpages; i++) {
 +              if (!pcpul_map[i].ptr)
 +                      continue;
 +              pcpul_map[i].map_addr = vm.addr + i * lpage_size;
 +              map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
 +      }
 +
 +      for_each_possible_cpu(cpu)
 +              memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
 +                     static_size);
 +
 +      /* we're ready, commit */
 +      pr_info("PERCPU: Remapped at %p with large pages, static data "
 +              "%zu bytes\n", vm.addr, static_size);
 +
 +      ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 +                                   unit_size, vm.addr, unit_map);
 +
 +      /*
 +       * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
 +       * lpages are pushed to the end and trimmed.
 +       */
 +      for (i = 0; i < pcpul_nr_lpages - 1; i++)
 +              for (j = i + 1; j < pcpul_nr_lpages; j++) {
 +                      struct pcpul_ent tmp;
 +
 +                      if (!pcpul_map[j].ptr)
 +                              continue;
 +                      if (pcpul_map[i].ptr &&
 +                          pcpul_map[i].ptr < pcpul_map[j].ptr)
 +                              continue;
 +
 +                      tmp = pcpul_map[i];
 +                      pcpul_map[i] = pcpul_map[j];
 +                      pcpul_map[j] = tmp;
 +              }
 +
 +      while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
 +              pcpul_nr_lpages--;
 +
 +      return ret;
 +
 +enomem:
 +      for (i = 0; i < pcpul_nr_lpages; i++)
 +              if (pcpul_map[i].ptr)
 +                      free_fn(pcpul_map[i].ptr, lpage_size);
 +      free_bootmem(__pa(pcpul_map), map_size);
 +      return -ENOMEM;
 +}
 +
 +/**
 + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 + * @kaddr: the kernel address in question
 + *
 + * Determine whether @kaddr falls in the pcpul recycled area.  This is
 + * used by pageattr to detect VM aliases and break up the pcpu large
 + * page mapping such that the same physical page is not mapped under
 + * different attributes.
 + *
 + * The recycled area is always at the tail of a partially used large
 + * page.
 + *
 + * RETURNS:
 + * Address of corresponding remapped pcpu address if match is found;
 + * otherwise, NULL.
 + */
 +void *pcpu_lpage_remapped(void *kaddr)
 +{
 +      unsigned long lpage_mask = pcpul_lpage_size - 1;
 +      void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
 +      unsigned long offset = (unsigned long)kaddr & lpage_mask;
 +      int left = 0, right = pcpul_nr_lpages - 1;
 +      int pos;
 +
 +      /* pcpul in use at all? */
 +      if (!pcpul_map)
 +              return NULL;
 +
 +      /* okay, perform binary search */
 +      while (left <= right) {
 +              pos = (left + right) / 2;
 +
 +              if (pcpul_map[pos].ptr < lpage_addr)
 +                      left = pos + 1;
 +              else if (pcpul_map[pos].ptr > lpage_addr)
 +                      right = pos - 1;
 +              else
 +                      return pcpul_map[pos].map_addr + offset;
 +      }
 +
 +      return NULL;
 +}
 +#endif
 +
 +/*
 + * Generic percpu area setup.
 + *
 + * The embedding helper is used because its behavior closely resembles
 + * the original non-dynamic generic percpu area setup.  This is
 + * important because many archs have addressing restrictions and might
 + * fail if the percpu area is located far away from the previous
 + * location.  As an added bonus, in non-NUMA cases, embedding is
 + * generally a good idea TLB-wise because percpu area can piggy back
 + * on the physical linear memory mapping which uses large page
 + * mappings on applicable archs.
 + */
 +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; /* indexed by cpu; filled in by setup_per_cpu_areas() */
 +EXPORT_SYMBOL(__per_cpu_offset);
 +
 +void __init setup_per_cpu_areas(void)
 +{
 +      size_t static_size = __per_cpu_end - __per_cpu_start;
 +      ssize_t unit_size;
 +      unsigned long delta;
 +      unsigned int cpu;
 +
 +      /*
 +       * Always reserve area for module percpu variables.  That's
 +       * what the legacy allocator did.
 +       */
 +      unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
 +                                         PERCPU_DYNAMIC_RESERVE);
 +      if (unit_size < 0)
 +              panic("Failed to initialized percpu areas.");
 +
 +      delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 +      for_each_possible_cpu(cpu)
 +              __per_cpu_offset[cpu] = delta + cpu * unit_size;
  }
 +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --cc mm/slub.c
Simple merge