[IA64] memory-less-nodes repost
[linux-2.6.git] / arch / ia64 / mm / discontig.c
index f3fd528..54136fd 100644 (file)
@@ -44,150 +44,7 @@ struct early_node_data {
 };
 
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
-
-/**
- * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
- *
- * This function will move nodes with only CPUs (no memory)
- * to a node with memory which is at the minimum numa_slit distance.
- * Any reassigments will result in the compression of the nodes
- * and renumbering the nid values where appropriate.
- * The static declarations below are to avoid large stack size which
- * makes the code not re-entrant.
- */
-static void __init reassign_cpu_only_nodes(void)
-{
-       struct node_memblk_s *p;
-       int i, j, k, nnode, nid, cpu, cpunid, pxm;
-       u8 cslit, slit;
-       static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
-       static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
-       static int node_flip[MAX_NUMNODES] __initdata;
-       static int old_nid_map[NR_CPUS] __initdata;
-
-       for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-               if (!test_bit(p->nid, (void *) nodes_with_mem)) {
-                       set_bit(p->nid, (void *) nodes_with_mem);
-                       nnode++;
-               }
-
-       /*
-        * All nids with memory.
-        */
-       if (nnode == num_online_nodes())
-               return;
-
-       /*
-        * Change nids and attempt to migrate CPU-only nodes
-        * to the best numa_slit (closest neighbor) possible.
-        * For reassigned CPU nodes a nid can't be arrived at
-        * until after this loop because the target nid's new
-        * identity might not have been established yet. So
-        * new nid values are fabricated above num_online_nodes() and
-        * mapped back later to their true value.
-        */
-       /* MCD - This code is a bit complicated, but may be unnecessary now.
-        * We can now handle much more interesting node-numbering.
-        * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
-        * and that there be no holes in the numbering 0..numnodes
-        * has become simply 0 <= nid <= MAX_NUMNODES.
-        */
-       nid = 0;
-       for_each_online_node(i)  {
-               if (test_bit(i, (void *) nodes_with_mem)) {
-                       /*
-                        * Save original nid value for numa_slit
-                        * fixup and node_cpuid reassignments.
-                        */
-                       node_flip[nid] = i;
-
-                       if (i == nid) {
-                               nid++;
-                               continue;
-                       }
-
-                       for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-                               if (p->nid == i)
-                                       p->nid = nid;
-
-                       cpunid = nid;
-                       nid++;
-               } else
-                       cpunid = MAX_NUMNODES;
-
-               for (cpu = 0; cpu < NR_CPUS; cpu++)
-                       if (node_cpuid[cpu].nid == i) {
-                               /*
-                                * For nodes not being reassigned just
-                                * fix the cpu's nid and reverse pxm map
-                                */
-                               if (cpunid < MAX_NUMNODES) {
-                                       pxm = nid_to_pxm_map[i];
-                                       pxm_to_nid_map[pxm] =
-                                                 node_cpuid[cpu].nid = cpunid;
-                                       continue;
-                               }
-
-                               /*
-                                * For nodes being reassigned, find best node by
-                                * numa_slit information and then make a temporary
-                                * nid value based on current nid and num_online_nodes().
-                                */
-                               slit = 0xff;
-                               k = 2*num_online_nodes();
-                               for_each_online_node(j) {
-                                       if (i == j)
-                                               continue;
-                                       else if (test_bit(j, (void *) nodes_with_mem)) {
-                                               cslit = numa_slit[i * num_online_nodes() + j];
-                                               if (cslit < slit) {
-                                                       k = num_online_nodes() + j;
-                                                       slit = cslit;
-                                               }
-                                       }
-                               }
-
-                               /* save old nid map so we can update the pxm */
-                               old_nid_map[cpu] = node_cpuid[cpu].nid;
-                               node_cpuid[cpu].nid = k;
-                       }
-       }
-
-       /*
-        * Fixup temporary nid values for CPU-only nodes.
-        */
-       for (cpu = 0; cpu < NR_CPUS; cpu++)
-               if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
-                       pxm = nid_to_pxm_map[old_nid_map[cpu]];
-                       pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
-               } else {
-                       for (i = 0; i < nnode; i++) {
-                               if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
-                                       continue;
-
-                               pxm = nid_to_pxm_map[old_nid_map[cpu]];
-                               pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
-                               break;
-                       }
-               }
-
-       /*
-        * Fix numa_slit by compressing from larger
-        * nid array to reduced nid array.
-        */
-       for (i = 0; i < nnode; i++)
-               for (j = 0; j < nnode; j++)
-                       numa_slit_fix[i * nnode + j] =
-                               numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
-
-       memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
-
-       nodes_clear(node_online_map);
-       for (i = 0; i < nnode; i++)
-               node_set_online(i);
-
-       return;
-}
+static nodemask_t memory_less_mask __initdata;
 
 /*
  * To prevent cache aliasing effects, align per-node structures so that they
@@ -233,46 +90,88 @@ static int __init build_node_maps(unsigned long start, unsigned long len,
 }
 
 /**
- * early_nr_phys_cpus_node - return number of physical cpus on a given node
+ * early_nr_cpus_node - return number of cpus on a given node
  * @node: node to check
  *
- * Count the number of physical cpus on @node.  These are cpus that actually
- * exist.  We can't use nr_cpus_node() yet because
+ * Count the number of cpus on @node.  We can't use nr_cpus_node() yet because
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.
+ * called yet.  Note that node 0 will also count all non-existent cpus.
  */
-static int early_nr_phys_cpus_node(int node)
+static int __init early_nr_cpus_node(int node)
 {
        int cpu, n = 0;
 
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (node == node_cpuid[cpu].nid)
-                       if ((cpu == 0) || node_cpuid[cpu].phys_id)
-                               n++;
+                       n++;
 
        return n;
 }
 
+/**
+ * compute_pernodesize - compute the PAGE_ALIGN()ed size of pernode data
+ * @node: the node id, also the multiplier of the L1_CACHE_BYTES padding.
+ */
+static unsigned long __init compute_pernodesize(int node)
+{
+       unsigned long pernodesize = 0, cpus;
+
+       cpus = early_nr_cpus_node(node);
+       pernodesize += PERCPU_PAGE_SIZE * cpus; /* one per-cpu area per cpu */
+       pernodesize += node * L1_CACHE_BYTES;   /* padding grows with node id */
+       pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+       pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+       pernodesize = PAGE_ALIGN(pernodesize);
+       return pernodesize;
+}
 
 /**
- * early_nr_cpus_node - return number of cpus on a given node
- * @node: node to check
- *
- * Count the number of cpus on @node.  We can't use nr_cpus_node() yet because
- * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.  Note that node 0 will also count all non-existent cpus.
+ * fill_pernode - zero the pernode area, carve it up and set up per-cpu data.
+ * @node: the node id.
+ * @pernode: physical address of pernode data
+ * @pernodesize: size of the pernode data, as given by compute_pernodesize()
  */
-static int early_nr_cpus_node(int node)
+static void __init fill_pernode(int node, unsigned long pernode,
+       unsigned long pernodesize)
 {
-       int cpu, n = 0;
+       void *cpu_data;
+       int cpus = early_nr_cpus_node(node), cpu;
+       struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
-       for (cpu = 0; cpu < NR_CPUS; cpu++)
-               if (node == node_cpuid[cpu].nid)
-                       n++;
+       mem_data[node].pernode_addr = pernode;
+       mem_data[node].pernode_size = pernodesize;
+       memset(__va(pernode), 0, pernodesize);
 
-       return n;
-}
+       cpu_data = (void *)pernode;     /* per-cpu areas come first */
+       pernode += PERCPU_PAGE_SIZE * cpus;
+       pernode += node * L1_CACHE_BYTES;
+
+       mem_data[node].pgdat = __va(pernode);
+       pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+       mem_data[node].node_data = __va(pernode);
+       pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+
+       mem_data[node].pgdat->bdata = bdp;
+       pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); /* not read again below */
+
+       /*
+        * Copy the static per-cpu data into the region we
+        * just set aside and then setup __per_cpu_offset
+        * for each CPU on this node.
+        */
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+               if (node == node_cpuid[cpu].nid) {
+                       memcpy(__va(cpu_data), __phys_per_cpu_start,
+                              __per_cpu_end - __per_cpu_start);
+                       __per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+                               __per_cpu_start;
+                       cpu_data += PERCPU_PAGE_SIZE;
+               }
+       }
 
+       return;
+}
 /**
  * find_pernode_space - allocate memory for memory map and per-node structures
  * @start: physical start of range
@@ -304,9 +203,8 @@ static int early_nr_cpus_node(int node)
 static int __init find_pernode_space(unsigned long start, unsigned long len,
                                     int node)
 {
-       unsigned long epfn, cpu, cpus, phys_cpus;
+       unsigned long epfn;
        unsigned long pernodesize = 0, pernode, pages, mapsize;
-       void *cpu_data;
        struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
        epfn = (start + len) >> PAGE_SHIFT;
@@ -329,49 +227,12 @@ static int __init find_pernode_space(unsigned long start, unsigned long len,
         * Calculate total size needed, incl. what's necessary
         * for good alignment and alias prevention.
         */
-       cpus = early_nr_cpus_node(node);
-       phys_cpus = early_nr_phys_cpus_node(node);
-       pernodesize += PERCPU_PAGE_SIZE * cpus;
-       pernodesize += node * L1_CACHE_BYTES;
-       pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-       pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-       pernodesize = PAGE_ALIGN(pernodesize);
+       pernodesize = compute_pernodesize(node);
        pernode = NODEDATA_ALIGN(start, node);
 
        /* Is this range big enough for what we want to store here? */
-       if (start + len > (pernode + pernodesize + mapsize)) {
-               mem_data[node].pernode_addr = pernode;
-               mem_data[node].pernode_size = pernodesize;
-               memset(__va(pernode), 0, pernodesize);
-
-               cpu_data = (void *)pernode;
-               pernode += PERCPU_PAGE_SIZE * cpus;
-               pernode += node * L1_CACHE_BYTES;
-
-               mem_data[node].pgdat = __va(pernode);
-               pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-               mem_data[node].node_data = __va(pernode);
-               pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-
-               mem_data[node].pgdat->bdata = bdp;
-               pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-               /*
-                * Copy the static per-cpu data into the region we
-                * just set aside and then setup __per_cpu_offset
-                * for each CPU on this node.
-                */
-               for (cpu = 0; cpu < NR_CPUS; cpu++) {
-                       if (node == node_cpuid[cpu].nid) {
-                               memcpy(__va(cpu_data), __phys_per_cpu_start,
-                                      __per_cpu_end - __per_cpu_start);
-                               __per_cpu_offset[cpu] = (char*)__va(cpu_data) -
-                                       __per_cpu_start;
-                               cpu_data += PERCPU_PAGE_SIZE;
-                       }
-               }
-       }
+       if (start + len > (pernode + pernodesize + mapsize))
+               fill_pernode(node, pernode, pernodesize);
 
        return 0;
 }
@@ -411,6 +272,9 @@ static void __init reserve_pernode_space(void)
        for_each_online_node(node) {
                pg_data_t *pdp = mem_data[node].pgdat;
 
+               if (node_isset(node, memory_less_mask))
+                       continue;
+
                bdp = pdp->bdata;
 
                /* First the bootmem_map itself */
@@ -456,6 +320,83 @@ static void __init initialize_pernode_data(void)
 }
 
 /**
+ * memory_less_node_alloc - * attempt to allocate memory on the best NUMA slit
+ *     node but fall back to any other node when __alloc_bootmem_node fails
+ *     for best.
+ * @nid: node id
+ * @pernodesize: size of this node's pernode data
+ * @align: alignment to use for this node's pernode data
+ */
+static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
+       unsigned long align)
+{
+       void *ptr = NULL;
+       u8 best = 0xff;
+       int bestnode = -1, node;
+
+       for_each_online_node(node) {
+               if (node_isset(node, memory_less_mask))
+                       continue;
+               else if (node_distance(nid, node) < best) {
+                       best = node_distance(nid, node);
+                       bestnode = node;
+               }
+       }
+
+       ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat,
+               pernodesize, align, __pa(MAX_DMA_ADDRESS));
+
+       if (!ptr)
+               panic("NO memory for memory less node\n");
+       return ptr;
+}
+
+/**
+ * pgdat_insert - insert the pgdat into global pgdat_list, ordered by node_id
+ * @pgdat: the pgdat for a node.
+ */
+static void __init pgdat_insert(pg_data_t *pgdat)
+{
+       pg_data_t *prev = NULL, *next;
+
+       for_each_pgdat(next)
+               if (pgdat->node_id < next->node_id)
+                       break;  /* first entry with a larger node_id */
+               else
+                       prev = next;
+
+       if (prev) {             /* link in right after prev */
+               prev->pgdat_next = pgdat;
+               pgdat->pgdat_next = next;
+       } else {                /* nothing goes before it: new list head */
+               pgdat->pgdat_next = pgdat_list;
+               pgdat_list = pgdat;
+       }
+
+       return;
+}
+
+/**
+ * memory_less_nodes - allocate and initialize pernode information for the
+ *     CPU only nodes, i.e. every node still set in memory_less_mask.
+ */
+static void __init memory_less_nodes(void)
+{
+       unsigned long pernodesize;
+       void *pernode;
+       int node;
+
+       for_each_node_mask(node, memory_less_mask) {
+               pernodesize = compute_pernodesize(node);
+               pernode = memory_less_node_alloc(node, pernodesize,
+                       (node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024));
+               fill_pernode(node, __pa(pernode), pernodesize); /* wants phys */
+       }
+
+       return;
+}
+
+/**
  * find_memory - walk the EFI memory map and setup the bootmem allocator
  *
  * Called early in boot to setup the bootmem allocator, and to
@@ -472,16 +413,19 @@ void __init find_memory(void)
                node_set_online(0);
        }
 
+       nodes_or(memory_less_mask, memory_less_mask, node_online_map);
        min_low_pfn = -1;
        max_low_pfn = 0;
 
-       if (num_online_nodes() > 1)
-               reassign_cpu_only_nodes();
-
        /* These actually end up getting called by call_pernode_memory() */
        efi_memmap_walk(filter_rsvd_memory, build_node_maps);
        efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
 
+       for_each_online_node(node)
+               if (mem_data[node].bootmem_data.node_low_pfn) {
+                       node_clear(node, memory_less_mask);
+                       mem_data[node].min_pfn = ~0UL;
+               }
        /*
         * Initialize the boot memory maps in reverse order since that's
         * what the bootmem allocator expects
@@ -492,17 +436,14 @@ void __init find_memory(void)
 
                if (!node_online(node))
                        continue;
+               else if (node_isset(node, memory_less_mask))
+                       continue;
 
                bdp = &mem_data[node].bootmem_data;
                pernode = mem_data[node].pernode_addr;
                pernodesize = mem_data[node].pernode_size;
                map = pernode + pernodesize;
 
-               /* Sanity check... */
-               if (!pernode)
-                       panic("pernode space for node %d "
-                             "could not be allocated!", node);
-
                init_bootmem_node(mem_data[node].pgdat,
                                  map>>PAGE_SHIFT,
                                  bdp->node_boot_start>>PAGE_SHIFT,
@@ -512,6 +453,7 @@ void __init find_memory(void)
        efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
 
        reserve_pernode_space();
+       memory_less_nodes();
        initialize_pernode_data();
 
        max_pfn = max_low_pfn;
@@ -680,12 +622,13 @@ void __init paging_init(void)
 
        max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-       /* so min() will work in count_node_pages */
-       for_each_online_node(node)
-               mem_data[node].min_pfn = ~0UL;
-
        efi_memmap_walk(filter_rsvd_memory, count_node_pages);
 
+       vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+       vmem_map = (struct page *) vmalloc_end;
+       efi_memmap_walk(create_mem_map_page_table, NULL);
+       printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+
        for_each_online_node(node) {
                memset(zones_size, 0, sizeof(zones_size));
                memset(zholes_size, 0, sizeof(zholes_size));
@@ -719,15 +662,6 @@ void __init paging_init(void)
                                 mem_data[node].num_dma_physpages);
                }
 
-               if (node == 0) {
-                       vmalloc_end -=
-                               PAGE_ALIGN(max_low_pfn * sizeof(struct page));
-                       vmem_map = (struct page *) vmalloc_end;
-
-                       efi_memmap_walk(create_mem_map_page_table, NULL);
-                       printk("Virtual mem_map starts at 0x%p\n", vmem_map);
-               }
-
                pfn_offset = mem_data[node].min_pfn;
 
                NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
@@ -735,5 +669,11 @@ void __init paging_init(void)
                                    pfn_offset, zholes_size);
        }
 
+       /*
+        * Make memory less nodes become a member of the known nodes.
+        */
+       for_each_node_mask(node, memory_less_mask)
+               pgdat_insert(mem_data[node].pgdat);
+
        zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }