sparsemem: Put mem map for one node together.
Yinghai Lu [Wed, 10 Feb 2010 09:20:22 +0000 (01:20 -0800)]
Add vmemmap_alloc_block_buf for mem map only.

It will fallback to the old way if it cannot get a block that big.

Before this patch, when a node have 128g ram installed, memmap are
split into two parts or more.
[    0.000000]  [ffffea0000000000-ffffea003fffffff] PMD -> [ffff880100600000-ffff88013e9fffff] on node 1
[    0.000000]  [ffffea0040000000-ffffea006fffffff] PMD -> [ffff88013ec00000-ffff88016ebfffff] on node 1
[    0.000000]  [ffffea0070000000-ffffea007fffffff] PMD -> [ffff882000600000-ffff8820105fffff] on node 0
[    0.000000]  [ffffea0080000000-ffffea00bfffffff] PMD -> [ffff882010800000-ffff8820507fffff] on node 0
[    0.000000]  [ffffea00c0000000-ffffea00dfffffff] PMD -> [ffff882050a00000-ffff8820709fffff] on node 0
[    0.000000]  [ffffea00e0000000-ffffea00ffffffff] PMD -> [ffff884000600000-ffff8840205fffff] on node 2
[    0.000000]  [ffffea0100000000-ffffea013fffffff] PMD -> [ffff884020800000-ffff8840607fffff] on node 2
[    0.000000]  [ffffea0140000000-ffffea014fffffff] PMD -> [ffff884060a00000-ffff8840709fffff] on node 2
[    0.000000]  [ffffea0150000000-ffffea017fffffff] PMD -> [ffff886000600000-ffff8860305fffff] on node 3
[    0.000000]  [ffffea0180000000-ffffea01bfffffff] PMD -> [ffff886030800000-ffff8860707fffff] on node 3
[    0.000000]  [ffffea01c0000000-ffffea01ffffffff] PMD -> [ffff888000600000-ffff8880405fffff] on node 4
[    0.000000]  [ffffea0200000000-ffffea022fffffff] PMD -> [ffff888040800000-ffff8880707fffff] on node 4
[    0.000000]  [ffffea0230000000-ffffea023fffffff] PMD -> [ffff88a000600000-ffff88a0105fffff] on node 5
[    0.000000]  [ffffea0240000000-ffffea027fffffff] PMD -> [ffff88a010800000-ffff88a0507fffff] on node 5
[    0.000000]  [ffffea0280000000-ffffea029fffffff] PMD -> [ffff88a050a00000-ffff88a0709fffff] on node 5
[    0.000000]  [ffffea02a0000000-ffffea02bfffffff] PMD -> [ffff88c000600000-ffff88c0205fffff] on node 6
[    0.000000]  [ffffea02c0000000-ffffea02ffffffff] PMD -> [ffff88c020800000-ffff88c0607fffff] on node 6
[    0.000000]  [ffffea0300000000-ffffea030fffffff] PMD -> [ffff88c060a00000-ffff88c0709fffff] on node 6
[    0.000000]  [ffffea0310000000-ffffea033fffffff] PMD -> [ffff88e000600000-ffff88e0305fffff] on node 7
[    0.000000]  [ffffea0340000000-ffffea037fffffff] PMD -> [ffff88e030800000-ffff88e0707fffff] on node 7

after patch will get
[    0.000000]  [ffffea0000000000-ffffea006fffffff] PMD -> [ffff880100200000-ffff88016e5fffff] on node 0
[    0.000000]  [ffffea0070000000-ffffea00dfffffff] PMD -> [ffff882000200000-ffff8820701fffff] on node 1
[    0.000000]  [ffffea00e0000000-ffffea014fffffff] PMD -> [ffff884000200000-ffff8840701fffff] on node 2
[    0.000000]  [ffffea0150000000-ffffea01bfffffff] PMD -> [ffff886000200000-ffff8860701fffff] on node 3
[    0.000000]  [ffffea01c0000000-ffffea022fffffff] PMD -> [ffff888000200000-ffff8880701fffff] on node 4
[    0.000000]  [ffffea0230000000-ffffea029fffffff] PMD -> [ffff88a000200000-ffff88a0701fffff] on node 5
[    0.000000]  [ffffea02a0000000-ffffea030fffffff] PMD -> [ffff88c000200000-ffff88c0701fffff] on node 6
[    0.000000]  [ffffea0310000000-ffffea037fffffff] PMD -> [ffff88e000200000-ffff88e0701fffff] on node 7

-v2: change buf to vmemmap_buf instead according to Ingo
     also add CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER according to Ingo
-v3: according to Andrew, use sizeof(name) instead of hard coded 15

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1265793639-15071-19-git-send-email-yinghai@kernel.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

arch/x86/mm/init_64.c
include/linux/mm.h
mm/Kconfig
mm/sparse-vmemmap.c
mm/sparse.c

index 53158b7..e9b040e 100644 (file)
@@ -977,7 +977,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
                        if (pmd_none(*pmd)) {
                                pte_t entry;
 
-                               p = vmemmap_alloc_block(PMD_SIZE, node);
+                               p = vmemmap_alloc_block_buf(PMD_SIZE, node);
                                if (!p)
                                        return -ENOMEM;
 
index f2c5b3c..f6002e5 100644 (file)
@@ -1326,12 +1326,19 @@ extern int randomize_va_space;
 const char * arch_vma_name(struct vm_area_struct *vma);
 void print_vma_addr(char *prefix, unsigned long rip);
 
+void sparse_mem_maps_populate_node(struct page **map_map,
+                                  unsigned long pnum_begin,
+                                  unsigned long pnum_end,
+                                  unsigned long map_count,
+                                  int nodeid);
+
 struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
 pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
 void *vmemmap_alloc_block(unsigned long size, int node);
+void *vmemmap_alloc_block_buf(unsigned long size, int node);
 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
 int vmemmap_populate_basepages(struct page *start_page,
                                                unsigned long pages, int node);
index 17b8947..e4a33b9 100644 (file)
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
 config SPARSEMEM_VMEMMAP_ENABLE
        bool
 
+config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+       def_bool y
+       depends on SPARSEMEM && X86_64
+
 config SPARSEMEM_VMEMMAP
        bool "Sparse Memory virtual memmap"
        depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
index 9506c39..392b9bb 100644 (file)
@@ -43,6 +43,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
        return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
 }
 
+static void *vmemmap_buf;
+static void *vmemmap_buf_end;
 
 void * __meminit vmemmap_alloc_block(unsigned long size, int node)
 {
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
                                __pa(MAX_DMA_ADDRESS));
 }
 
+/* need to make sure size is all the same during early stage */
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+{
+       void *ptr;
+
+       if (!vmemmap_buf)
+               return vmemmap_alloc_block(size, node);
+
+       /* take the from buf */
+       ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
+       if (ptr + size > vmemmap_buf_end)
+               return vmemmap_alloc_block(size, node);
+
+       vmemmap_buf = ptr + size;
+
+       return ptr;
+}
+
 void __meminit vmemmap_verify(pte_t *pte, int node,
                                unsigned long start, unsigned long end)
 {
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
        pte_t *pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte)) {
                pte_t entry;
-               void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+               void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
 
        return map;
 }
+
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+                                         unsigned long pnum_begin,
+                                         unsigned long pnum_end,
+                                         unsigned long map_count, int nodeid)
+{
+       unsigned long pnum;
+       unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+       void *vmemmap_buf_start;
+
+       size = ALIGN(size, PMD_SIZE);
+       vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
+                        PMD_SIZE, __pa(MAX_DMA_ADDRESS));
+
+       if (vmemmap_buf_start) {
+               vmemmap_buf = vmemmap_buf_start;
+               vmemmap_buf_end = vmemmap_buf_start + size * map_count;
+       }
+
+       for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+               struct mem_section *ms;
+
+               if (!present_section_nr(pnum))
+                       continue;
+
+               map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+               if (map_map[pnum])
+                       continue;
+               ms = __nr_to_section(pnum);
+               printk(KERN_ERR "%s: sparsemem memory map backing failed "
+                       "some memory will not be available.\n", __func__);
+               ms->section_mem_map = 0;
+       }
+
+       if (vmemmap_buf_start) {
+               /* need to free left buf */
+#ifdef CONFIG_NO_BOOTMEM
+               free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
+               if (vmemmap_buf_start < vmemmap_buf) {
+                       char name[15];
+
+                       snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
+                       reserve_early_without_check(__pa(vmemmap_buf_start),
+                                                   __pa(vmemmap_buf), name);
+               }
+#else
+               free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
+#endif
+               vmemmap_buf = NULL;
+               vmemmap_buf_end = NULL;
+       }
+}
index 0cdaf0b..9b6b93a 100644 (file)
@@ -390,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
                       PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
        return map;
 }
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+                                         unsigned long pnum_begin,
+                                         unsigned long pnum_end,
+                                         unsigned long map_count, int nodeid)
+{
+       void *map;
+       unsigned long pnum;
+       unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+
+       map = alloc_remap(nodeid, size * map_count);
+       if (map) {
+               for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+                       if (!present_section_nr(pnum))
+                               continue;
+                       map_map[pnum] = map;
+                       map += size;
+               }
+               return;
+       }
+
+       size = PAGE_ALIGN(size);
+       map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
+       if (map) {
+               for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+                       if (!present_section_nr(pnum))
+                               continue;
+                       map_map[pnum] = map;
+                       map += size;
+               }
+               return;
+       }
+
+       /* fallback */
+       for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+               struct mem_section *ms;
+
+               if (!present_section_nr(pnum))
+                       continue;
+               map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+               if (map_map[pnum])
+                       continue;
+               ms = __nr_to_section(pnum);
+               printk(KERN_ERR "%s: sparsemem memory map backing failed "
+                       "some memory will not be available.\n", __func__);
+               ms->section_mem_map = 0;
+       }
+}
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
+static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
+                                unsigned long pnum_begin,
+                                unsigned long pnum_end,
+                                unsigned long map_count, int nodeid)
+{
+       sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
+                                        map_count, nodeid);
+}
+
+#ifndef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
 static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
 {
        struct page *map;
@@ -407,6 +464,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
        ms->section_mem_map = 0;
        return NULL;
 }
+#endif
 
 void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
 {
@@ -420,12 +478,14 @@ void __init sparse_init(void)
 {
        unsigned long pnum;
        struct page *map;
+       struct page **map_map;
        unsigned long *usemap;
        unsigned long **usemap_map;
-       int size;
+       int size, size2;
        int nodeid_begin = 0;
        unsigned long pnum_begin = 0;
        unsigned long usemap_count;
+       unsigned long map_count;
 
        /*
         * map is using big page (aka 2M in x86 64 bit)
@@ -478,6 +538,48 @@ void __init sparse_init(void)
        sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
                                         usemap_count, nodeid_begin);
 
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+       size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
+       map_map = alloc_bootmem(size2);
+       if (!map_map)
+               panic("can not allocate map_map\n");
+
+       for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+               struct mem_section *ms;
+
+               if (!present_section_nr(pnum))
+                       continue;
+               ms = __nr_to_section(pnum);
+               nodeid_begin = sparse_early_nid(ms);
+               pnum_begin = pnum;
+               break;
+       }
+       map_count = 1;
+       for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+               struct mem_section *ms;
+               int nodeid;
+
+               if (!present_section_nr(pnum))
+                       continue;
+               ms = __nr_to_section(pnum);
+               nodeid = sparse_early_nid(ms);
+               if (nodeid == nodeid_begin) {
+                       map_count++;
+                       continue;
+               }
+               /* ok, we need to take cake of from pnum_begin to pnum - 1*/
+               sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
+                                                map_count, nodeid_begin);
+               /* new start, update count etc*/
+               nodeid_begin = nodeid;
+               pnum_begin = pnum;
+               map_count = 1;
+       }
+       /* ok, last chunk */
+       sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
+                                        map_count, nodeid_begin);
+#endif
+
        for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
                if (!present_section_nr(pnum))
                        continue;
@@ -486,7 +588,11 @@ void __init sparse_init(void)
                if (!usemap)
                        continue;
 
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+               map = map_map[pnum];
+#else
                map = sparse_early_mem_map_alloc(pnum);
+#endif
                if (!map)
                        continue;
 
@@ -496,6 +602,9 @@ void __init sparse_init(void)
 
        vmemmap_populate_print_last();
 
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+       free_bootmem(__pa(map_map), size2);
+#endif
        free_bootmem(__pa(usemap_map), size);
 }