diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb4040a3a2ecf930e013eb9933bc6d34f..01d5a4b3dd0c1dd857f05f474ce096a9a2938001 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,17 +1,21 @@
 /*
- *  linux/mm/bootmem.c
+ *  bootmem - A boot-time physical memory allocator and configurator
  *
  *  Copyright (C) 1999 Ingo Molnar
- *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *                1999 Kanoj Sarcar, SGI
+ *                2008 Johannes Weiner
  *
- *  simple boot-time physical memory area allocator and
- *  free memory collector. It's used to deal with reserved
- *  system memory and memory holes as well.
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
  */
 #include <linux/init.h>
 #include <linux/pfn.h>
+#include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
+#include <linux/kmemleak.h>
+#include <linux/range.h>
+#include <linux/memblock.h>
 
 #include <asm/bug.h>
 #include <asm/io.h>
 
 #include "internal.h"
 
-/*
- * Access to this subsystem has to be serialized externally. (this is
- * true for the boot process anyway)
- */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data = {
+       .bdata = &bootmem_node_data[0]
+};
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-EXPORT_UNUSED_SYMBOL(max_pfn);  /*  June 2006  */
+bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 
-static LIST_HEAD(bdata_list);
-#ifdef CONFIG_CRASH_DUMP
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-#endif
+static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
 
-/* return the number of _pages_ that will be allocated for the boot bitmap */
-unsigned long __init bootmem_bootmap_pages(unsigned long pages)
+static int bootmem_debug;
+
+static int __init bootmem_debug_setup(char *buf)
 {
-       unsigned long mapsize;
+       bootmem_debug = 1;
+       return 0;
+}
+early_param("bootmem_debug", bootmem_debug_setup);
 
-       mapsize = (pages+7)/8;
-       mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
-       mapsize >>= PAGE_SHIFT;
+#define bdebug(fmt, args...) ({                                \
+       if (unlikely(bootmem_debug))                    \
+               printk(KERN_INFO                        \
+                       "bootmem::%s " fmt,             \
+                       __func__, ## args);             \
+})
 
-       return mapsize;
+static unsigned long __init bootmap_bytes(unsigned long pages)
+{
+       unsigned long bytes = (pages + 7) / 8;
+
+       return ALIGN(bytes, sizeof(long));
 }
 
-/*
- * link bdata in order
+/**
+ * bootmem_bootmap_pages - calculate bitmap size in pages
+ * @pages: number of pages the bitmap has to represent
  */
-static void __init link_bootmem(bootmem_data_t *bdata)
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
 {
-       bootmem_data_t *ent;
+       unsigned long bytes = bootmap_bytes(pages);
 
-       if (list_empty(&bdata_list)) {
-               list_add(&bdata->list, &bdata_list);
-               return;
-       }
-       /* insert in order */
-       list_for_each_entry(ent, &bdata_list, list) {
-               if (bdata->node_boot_start < ent->node_boot_start) {
-                       list_add_tail(&bdata->list, &ent->list);
-                       return;
-               }
-       }
-       list_add_tail(&bdata->list, &bdata_list);
+       return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
 }
 
 /*
- * Given an initialised bdata, it returns the size of the boot bitmap
+ * link bdata in order
  */
-static unsigned long __init get_mapsize(bootmem_data_t *bdata)
+static void __init link_bootmem(bootmem_data_t *bdata)
 {
-       unsigned long mapsize;
-       unsigned long start = PFN_DOWN(bdata->node_boot_start);
-       unsigned long end = bdata->node_low_pfn;
+       struct list_head *iter;
+
+       list_for_each(iter, &bdata_list) {
+               bootmem_data_t *ent;
 
-       mapsize = ((end - start) + 7) / 8;
-       return ALIGN(mapsize, sizeof(long));
+               ent = list_entry(iter, bootmem_data_t, list);
+               if (bdata->node_min_pfn < ent->node_min_pfn)
+                       break;
+       }
+       list_add_tail(&bdata->list, iter);
 }
 
 /*
  * Called once to set up the allocator itself.
  */
-static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
+static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
        unsigned long mapstart, unsigned long start, unsigned long end)
 {
-       bootmem_data_t *bdata = pgdat->bdata;
        unsigned long mapsize;
 
+       mminit_validate_memmodel_limits(&start, &end);
        bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
-       bdata->node_boot_start = PFN_PHYS(start);
+       bdata->node_min_pfn = start;
        bdata->node_low_pfn = end;
        link_bootmem(bdata);
 
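The bitmap sizing introduced above (bootmap_bytes() and bootmem_bootmap_pages()) is simply one bit per page, rounded up first to a whole long and then to whole pages. A stand-alone sketch of the same arithmetic, assuming a 4 KiB page size and with ALIGN()/PAGE_ALIGN() re-declared locally rather than taken from the kernel headers:

/* Stand-alone model of bootmap_bytes()/bootmem_bootmap_pages(); PAGE_SHIFT
 * and the ALIGN() helpers are local assumptions, not the kernel headers. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))
#define PAGE_ALIGN(x)	ALIGN(x, PAGE_SIZE)

static unsigned long bootmap_bytes(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;	/* one bit per page */

	return ALIGN(bytes, sizeof(long));	/* whole words for the bitmap walk */
}

static unsigned long bootmem_bootmap_pages(unsigned long pages)
{
	return PAGE_ALIGN(bootmap_bytes(pages)) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long pages = 1UL << 18;	/* 1 GiB worth of 4 KiB pages */

	printf("%lu pages -> %lu bitmap bytes -> %lu bitmap pages\n",
	       pages, bootmap_bytes(pages), bootmem_bootmap_pages(pages));
	return 0;
}
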
@@ -102,334 +107,553 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
         * Initially all pages are reserved - setup_arch() has to
         * register free RAM areas explicitly.
         */
-       mapsize = get_mapsize(bdata);
+       mapsize = bootmap_bytes(end - start);
        memset(bdata->node_bootmem_map, 0xff, mapsize);
 
+       bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
+               bdata - bootmem_node_data, start, mapstart, end, mapsize);
+
        return mapsize;
 }
 
+/**
+ * init_bootmem_node - register a node as boot memory
+ * @pgdat: node to register
+ * @freepfn: pfn where the bitmap for this node is to be placed
+ * @startpfn: first pfn on the node
+ * @endpfn: first pfn after the node
+ *
+ * Returns the number of bytes needed to hold the bitmap for this node.
+ */
+unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
+                               unsigned long startpfn, unsigned long endpfn)
+{
+       return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
+}
+
+/**
+ * init_bootmem - register boot memory
+ * @start: pfn where the bitmap is to be placed
+ * @pages: number of available physical pages
+ *
+ * Returns the number of bytes needed to hold the bitmap.
+ */
+unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+{
+       max_low_pfn = pages;
+       min_low_pfn = start;
+       return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
+}
+
 /*
- * Marks a particular physical memory range as unallocatable. Usable RAM
- * might be used for boot-time allocations - or it might get added
- * to the free page pool later on.
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system.  Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
  */
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
-                                       unsigned long size)
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
 {
-       unsigned long sidx, eidx;
-       unsigned long i;
+       unsigned long cursor, end;
 
-       /*
-        * round up, partially reserved pages are considered
-        * fully reserved.
-        */
-       BUG_ON(!size);
-       BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
-       BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
+       kmemleak_free_part(__va(addr), size);
 
-       sidx = PFN_DOWN(addr - bdata->node_boot_start);
-       eidx = PFN_UP(addr + size - bdata->node_boot_start);
+       cursor = PFN_UP(addr);
+       end = PFN_DOWN(addr + size);
 
-       for (i = sidx; i < eidx; i++)
-               if (test_and_set_bit(i, bdata->node_bootmem_map)) {
-#ifdef CONFIG_DEBUG_BOOTMEM
-                       printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
-#endif
-               }
+       for (; cursor < end; cursor++) {
+               __free_pages_bootmem(pfn_to_page(cursor), 0);
+               totalram_pages++;
+       }
 }
 
-static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
-                                    unsigned long size)
+static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
-       unsigned long sidx, eidx;
-       unsigned long i;
+       int aligned;
+       struct page *page;
+       unsigned long start, end, pages, count = 0;
 
-       /*
-        * round down end of usable mem, partially free pages are
-        * considered reserved.
-        */
-       BUG_ON(!size);
-       BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
+       if (!bdata->node_bootmem_map)
+               return 0;
 
-       if (addr < bdata->last_success)
-               bdata->last_success = addr;
+       start = bdata->node_min_pfn;
+       end = bdata->node_low_pfn;
 
        /*
-        * Round up the beginning of the address.
+        * If the start is aligned to the machine's word size, we might
+        * be able to free pages in bulk at that order.
         */
-       sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
-       eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
+       aligned = !(start & (BITS_PER_LONG - 1));
 
-       for (i = sidx; i < eidx; i++) {
-               if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
-                       BUG();
+       bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
+               bdata - bootmem_node_data, start, end, aligned);
+
+       while (start < end) {
+               unsigned long *map, idx, vec;
+
+               map = bdata->node_bootmem_map;
+               idx = start - bdata->node_min_pfn;
+               vec = ~map[idx / BITS_PER_LONG];
+
+               if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+                       int order = ilog2(BITS_PER_LONG);
+
+                       __free_pages_bootmem(pfn_to_page(start), order);
+                       count += BITS_PER_LONG;
+               } else {
+                       unsigned long off = 0;
+
+                       while (vec && off < BITS_PER_LONG) {
+                               if (vec & 1) {
+                                       page = pfn_to_page(start + off);
+                                       __free_pages_bootmem(page, 0);
+                                       count++;
+                               }
+                               vec >>= 1;
+                               off++;
+                       }
+               }
+               start += BITS_PER_LONG;
        }
+
+       page = virt_to_page(bdata->node_bootmem_map);
+       pages = bdata->node_low_pfn - bdata->node_min_pfn;
+       pages = bootmem_bootmap_pages(pages);
+       count += pages;
+       while (pages--)
+               __free_pages_bootmem(page++, 0);
+
+       bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
+
+       return count;
 }
 
-/*
- * We 'merge' subsequent allocations to save space. We might 'lose'
- * some fraction of a page if allocations cannot be satisfied due to
- * size constraints on boxes where there is physical RAM space
- * fragmentation - in these cases (mostly large memory boxes) this
- * is not a problem.
- *
- * On low memory boxes we get it right in 100% of the cases.
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
  *
- * alignment has to be a power of 2 value.
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+{
+       register_page_bootmem_info_node(pgdat);
+       return free_all_bootmem_core(pgdat->bdata);
+}
+
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
  *
- * NOTE:  This function is _not_ reentrant.
+ * Returns the number of pages actually released.
  */
-void * __init
-__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
-             unsigned long align, unsigned long goal, unsigned long limit)
+unsigned long __init free_all_bootmem(void)
 {
-       unsigned long offset, remaining_size, areasize, preferred;
-       unsigned long i, start = 0, incr, eidx, end_pfn;
-       void *ret;
+       unsigned long total_pages = 0;
+       bootmem_data_t *bdata;
 
-       if (!size) {
-               printk("__alloc_bootmem_core(): zero-sized request\n");
-               BUG();
-       }
-       BUG_ON(align & (align-1));
+       list_for_each_entry(bdata, &bdata_list, list)
+               total_pages += free_all_bootmem_core(bdata);
 
-       if (limit && bdata->node_boot_start >= limit)
-               return NULL;
+       return total_pages;
+}
+
+static void __init __free(bootmem_data_t *bdata,
+                       unsigned long sidx, unsigned long eidx)
+{
+       unsigned long idx;
 
-       end_pfn = bdata->node_low_pfn;
-       limit = PFN_DOWN(limit);
-       if (limit && end_pfn > limit)
-               end_pfn = limit;
+       bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
+               sidx + bdata->node_min_pfn,
+               eidx + bdata->node_min_pfn);
 
-       eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
-       offset = 0;
-       if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
-               offset = align - (bdata->node_boot_start & (align - 1UL));
-       offset = PFN_DOWN(offset);
+       if (bdata->hint_idx > sidx)
+               bdata->hint_idx = sidx;
 
-       /*
-        * We try to allocate bootmem pages above 'goal'
-        * first, then we try to allocate lower pages.
-        */
-       if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
-               preferred = goal - bdata->node_boot_start;
-
-               if (bdata->last_success >= preferred)
-                       if (!limit || (limit && limit > bdata->last_success))
-                               preferred = bdata->last_success;
-       } else
-               preferred = 0;
-
-       preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
-       areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
-       incr = align >> PAGE_SHIFT ? : 1;
-
-restart_scan:
-       for (i = preferred; i < eidx; i += incr) {
-               unsigned long j;
-               i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
-               i = ALIGN(i, incr);
-               if (i >= eidx)
-                       break;
-               if (test_bit(i, bdata->node_bootmem_map))
-                       continue;
-               for (j = i + 1; j < i + areasize; ++j) {
-                       if (j >= eidx)
-                               goto fail_block;
-                       if (test_bit(j, bdata->node_bootmem_map))
-                               goto fail_block;
+       for (idx = sidx; idx < eidx; idx++)
+               if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
+                       BUG();
+}
+
+static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
+                       unsigned long eidx, int flags)
+{
+       unsigned long idx;
+       int exclusive = flags & BOOTMEM_EXCLUSIVE;
+
+       bdebug("nid=%td start=%lx end=%lx flags=%x\n",
+               bdata - bootmem_node_data,
+               sidx + bdata->node_min_pfn,
+               eidx + bdata->node_min_pfn,
+               flags);
+
+       for (idx = sidx; idx < eidx; idx++)
+               if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
+                       if (exclusive) {
+                               __free(bdata, sidx, idx);
+                               return -EBUSY;
+                       }
+                       bdebug("silent double reserve of PFN %lx\n",
+                               idx + bdata->node_min_pfn);
                }
-               start = i;
-               goto found;
-       fail_block:
-               i = ALIGN(j, incr);
-       }
+       return 0;
+}
 
-       if (preferred > offset) {
-               preferred = offset;
-               goto restart_scan;
-       }
-       return NULL;
+static int __init mark_bootmem_node(bootmem_data_t *bdata,
+                               unsigned long start, unsigned long end,
+                               int reserve, int flags)
+{
+       unsigned long sidx, eidx;
 
-found:
-       bdata->last_success = PFN_PHYS(start);
-       BUG_ON(start >= eidx);
+       bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
+               bdata - bootmem_node_data, start, end, reserve, flags);
 
-       /*
-        * Is the next page of the previous allocation-end the start
-        * of this allocation's buffer? If yes then we can 'merge'
-        * the previous partial page with this allocation.
-        */
-       if (align < PAGE_SIZE &&
-           bdata->last_offset && bdata->last_pos+1 == start) {
-               offset = ALIGN(bdata->last_offset, align);
-               BUG_ON(offset > PAGE_SIZE);
-               remaining_size = PAGE_SIZE - offset;
-               if (size < remaining_size) {
-                       areasize = 0;
-                       /* last_pos unchanged */
-                       bdata->last_offset = offset + size;
-                       ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
-                                          offset +
-                                          bdata->node_boot_start);
-               } else {
-                       remaining_size = size - remaining_size;
-                       areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
-                       ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
-                                          offset +
-                                          bdata->node_boot_start);
-                       bdata->last_pos = start + areasize - 1;
-                       bdata->last_offset = remaining_size;
-               }
-               bdata->last_offset &= ~PAGE_MASK;
-       } else {
-               bdata->last_pos = start + areasize - 1;
-               bdata->last_offset = size & ~PAGE_MASK;
-               ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
-       }
+       BUG_ON(start < bdata->node_min_pfn);
+       BUG_ON(end > bdata->node_low_pfn);
 
-       /*
-        * Reserve the area now:
-        */
-       for (i = start; i < start + areasize; i++)
-               if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
-                       BUG();
-       memset(ret, 0, size);
-       return ret;
+       sidx = start - bdata->node_min_pfn;
+       eidx = end - bdata->node_min_pfn;
+
+       if (reserve)
+               return __reserve(bdata, sidx, eidx, flags);
+       else
+               __free(bdata, sidx, eidx);
+       return 0;
 }
 
-static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
+static int __init mark_bootmem(unsigned long start, unsigned long end,
+                               int reserve, int flags)
 {
-       struct page *page;
-       unsigned long pfn;
-       bootmem_data_t *bdata = pgdat->bdata;
-       unsigned long i, count, total = 0;
-       unsigned long idx;
-       unsigned long *map; 
-       int gofast = 0;
-
-       BUG_ON(!bdata->node_bootmem_map);
-
-       count = 0;
-       /* first extant page of the node */
-       pfn = PFN_DOWN(bdata->node_boot_start);
-       idx = bdata->node_low_pfn - pfn;
-       map = bdata->node_bootmem_map;
-       /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
-       if (bdata->node_boot_start == 0 ||
-           ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
-               gofast = 1;
-       for (i = 0; i < idx; ) {
-               unsigned long v = ~map[i / BITS_PER_LONG];
-
-               if (gofast && v == ~0UL) {
-                       int order;
-
-                       page = pfn_to_page(pfn);
-                       count += BITS_PER_LONG;
-                       order = ffs(BITS_PER_LONG) - 1;
-                       __free_pages_bootmem(page, order);
-                       i += BITS_PER_LONG;
-                       page += BITS_PER_LONG;
-               } else if (v) {
-                       unsigned long m;
-
-                       page = pfn_to_page(pfn);
-                       for (m = 1; m && i < idx; m<<=1, page++, i++) {
-                               if (v & m) {
-                                       count++;
-                                       __free_pages_bootmem(page, 0);
-                               }
-                       }
-               } else {
-                       i += BITS_PER_LONG;
+       unsigned long pos;
+       bootmem_data_t *bdata;
+
+       pos = start;
+       list_for_each_entry(bdata, &bdata_list, list) {
+               int err;
+               unsigned long max;
+
+               if (pos < bdata->node_min_pfn ||
+                   pos >= bdata->node_low_pfn) {
+                       BUG_ON(pos != start);
+                       continue;
                }
-               pfn += BITS_PER_LONG;
-       }
-       total += count;
 
-       /*
-        * Now free the allocator bitmap itself, it's not
-        * needed anymore:
-        */
-       page = virt_to_page(bdata->node_bootmem_map);
-       count = 0;
-       idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
-       for (i = 0; i < idx; i++, page++) {
-               __free_pages_bootmem(page, 0);
-               count++;
+               max = min(bdata->node_low_pfn, end);
+
+               err = mark_bootmem_node(bdata, pos, max, reserve, flags);
+               if (reserve && err) {
+                       mark_bootmem(start, pos, 0, 0);
+                       return err;
+               }
+
+               if (max == end)
+                       return 0;
+               pos = bdata->node_low_pfn;
        }
-       total += count;
-       bdata->node_bootmem_map = NULL;
+       BUG();
+}
+
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must reside completely on the specified node.
+ */
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+                             unsigned long size)
+{
+       unsigned long start, end;
+
+       kmemleak_free_part(__va(physaddr), size);
+
+       start = PFN_UP(physaddr);
+       end = PFN_DOWN(physaddr + size);
 
-       return total;
+       mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
 }
 
-unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
-                               unsigned long startpfn, unsigned long endpfn)
+/**
+ * free_bootmem - mark a page range as usable
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+void __init free_bootmem(unsigned long addr, unsigned long size)
 {
-       return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
+       unsigned long start, end;
+
+       kmemleak_free_part(__va(addr), size);
+
+       start = PFN_UP(addr);
+       end = PFN_DOWN(addr + size);
+
+       mark_bootmem(start, end, 0, 0);
 }
 
-void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
-                                unsigned long size)
+/**
+ * reserve_bootmem_node - mark a page range as reserved
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must reside completely on the specified node.
+ */
+int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+                                unsigned long size, int flags)
 {
-       reserve_bootmem_core(pgdat->bdata, physaddr, size);
+       unsigned long start, end;
+
+       start = PFN_DOWN(physaddr);
+       end = PFN_UP(physaddr + size);
+
+       return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
 }
 
-void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
-                             unsigned long size)
+/**
+ * reserve_bootmem - mark a page range as reserved
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+                           int flags)
 {
-       free_bootmem_core(pgdat->bdata, physaddr, size);
+       unsigned long start, end;
+
+       start = PFN_DOWN(addr);
+       end = PFN_UP(addr + size);
+
+       return mark_bootmem(start, end, 1, flags);
 }
 
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+                                  int flags)
 {
-       return free_all_bootmem_core(pgdat);
+       return reserve_bootmem(phys, len, flags);
 }
 
-unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+static unsigned long __init align_idx(struct bootmem_data *bdata,
+                                     unsigned long idx, unsigned long step)
 {
-       max_low_pfn = pages;
-       min_low_pfn = start;
-       return init_bootmem_core(NODE_DATA(0), start, 0, pages);
+       unsigned long base = bdata->node_min_pfn;
+
+       /*
+        * Align the index with respect to the node start so that the
+        * combination of both satisfies the requested alignment.
+        */
+
+       return ALIGN(base + idx, step) - base;
 }
 
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-void __init reserve_bootmem(unsigned long addr, unsigned long size)
+static unsigned long __init align_off(struct bootmem_data *bdata,
+                                     unsigned long off, unsigned long align)
 {
-       reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+       unsigned long base = PFN_PHYS(bdata->node_min_pfn);
+
+       /* Same as align_idx for byte offsets */
+
+       return ALIGN(base + off, align) - base;
 }
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
-void __init free_bootmem(unsigned long addr, unsigned long size)
+static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+                                       unsigned long size, unsigned long align,
+                                       unsigned long goal, unsigned long limit)
 {
-       free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+       unsigned long fallback = 0;
+       unsigned long min, max, start, sidx, midx, step;
+
+       bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+               bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+               align, goal, limit);
+
+       BUG_ON(!size);
+       BUG_ON(align & (align - 1));
+       BUG_ON(limit && goal + size > limit);
+
+       if (!bdata->node_bootmem_map)
+               return NULL;
+
+       min = bdata->node_min_pfn;
+       max = bdata->node_low_pfn;
+
+       goal >>= PAGE_SHIFT;
+       limit >>= PAGE_SHIFT;
+
+       if (limit && max > limit)
+               max = limit;
+       if (max <= min)
+               return NULL;
+
+       step = max(align >> PAGE_SHIFT, 1UL);
+
+       if (goal && min < goal && goal < max)
+               start = ALIGN(goal, step);
+       else
+               start = ALIGN(min, step);
+
+       sidx = start - bdata->node_min_pfn;
+       midx = max - bdata->node_min_pfn;
+
+       if (bdata->hint_idx > sidx) {
+               /*
+                * Handle the valid case of sidx being zero and still
+                * catch the fallback below.
+                */
+               fallback = sidx + 1;
+               sidx = align_idx(bdata, bdata->hint_idx, step);
+       }
+
+       while (1) {
+               int merge;
+               void *region;
+               unsigned long eidx, i, start_off, end_off;
+find_block:
+               sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
+               sidx = align_idx(bdata, sidx, step);
+               eidx = sidx + PFN_UP(size);
+
+               if (sidx >= midx || eidx > midx)
+                       break;
+
+               for (i = sidx; i < eidx; i++)
+                       if (test_bit(i, bdata->node_bootmem_map)) {
+                               sidx = align_idx(bdata, i, step);
+                               if (sidx == i)
+                                       sidx += step;
+                               goto find_block;
+                       }
+
+               if (bdata->last_end_off & (PAGE_SIZE - 1) &&
+                               PFN_DOWN(bdata->last_end_off) + 1 == sidx)
+                       start_off = align_off(bdata, bdata->last_end_off, align);
+               else
+                       start_off = PFN_PHYS(sidx);
+
+               merge = PFN_DOWN(start_off) < sidx;
+               end_off = start_off + size;
+
+               bdata->last_end_off = end_off;
+               bdata->hint_idx = PFN_UP(end_off);
+
+               /*
+                * Reserve the area now:
+                */
+               if (__reserve(bdata, PFN_DOWN(start_off) + merge,
+                               PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
+                       BUG();
+
+               region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
+                               start_off);
+               memset(region, 0, size);
+               /*
+                * The min_count is set to 0 so that bootmem allocated blocks
+                * are never reported as leaks.
+                */
+               kmemleak_alloc(region, size, 0, 0);
+               return region;
+       }
+
+       if (fallback) {
+               sidx = align_idx(bdata, fallback - 1, step);
+               fallback = 0;
+               goto find_block;
+       }
+
+       return NULL;
 }
 
-unsigned long __init free_all_bootmem(void)
+static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
+                                       unsigned long size, unsigned long align,
+                                       unsigned long goal, unsigned long limit)
 {
-       return free_all_bootmem_core(NODE_DATA(0));
+       if (WARN_ON_ONCE(slab_is_available()))
+               return kzalloc(size, GFP_NOWAIT);
+
+#ifdef CONFIG_HAVE_ARCH_BOOTMEM
+       {
+               bootmem_data_t *p_bdata;
+
+               p_bdata = bootmem_arch_preferred_node(bdata, size, align,
+                                                       goal, limit);
+               if (p_bdata)
+                       return alloc_bootmem_core(p_bdata, size, align,
+                                                       goal, limit);
+       }
+#endif
+       return NULL;
 }
 
-void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
-                                     unsigned long goal)
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+                                       unsigned long align,
+                                       unsigned long goal,
+                                       unsigned long limit)
 {
        bootmem_data_t *bdata;
-       void *ptr;
+       void *region;
+
+restart:
+       region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
+       if (region)
+               return region;
 
        list_for_each_entry(bdata, &bdata_list, list) {
-               ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
-               if (ptr)
-                       return ptr;
+               if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
+                       continue;
+               if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
+                       break;
+
+               region = alloc_bootmem_core(bdata, size, align, goal, limit);
+               if (region)
+                       return region;
        }
+
+       if (goal) {
+               goal = 0;
+               goto restart;
+       }
+
        return NULL;
 }
 
-void * __init __alloc_bootmem(unsigned long size, unsigned long align,
-                             unsigned long goal)
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+                                       unsigned long goal)
+{
+       unsigned long limit = 0;
+
+       return ___alloc_bootmem_nopanic(size, align, goal, limit);
+}
+
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+                                       unsigned long goal, unsigned long limit)
 {
-       void *mem = __alloc_bootmem_nopanic(size,align,goal);
+       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
 
        if (mem)
                return mem;
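All bookkeeping in the rewritten allocator comes down to the __reserve()/__free() pair above: one bit per page in the node bitmap, set meaning reserved, with BOOTMEM_EXCLUSIVE rolling back a partially completed reservation. A rough user-space model of that protocol; the plain bit operations and the reserve() helper are local stand-ins for the kernel's test_and_set_bit()/test_and_clear_bit(), not the kernel code itself:

/* Toy model of the bootmem bitmap protocol: one bit per page, set means
 * reserved.  Plain bit operations stand in for the kernel's atomic
 * test_and_set_bit()/test_and_clear_bit(). */
#include <stdio.h>
#include <stdbool.h>

#define NPAGES			64
#define BITS_PER_WORD		(8 * sizeof(unsigned long))
#define BOOTMEM_EXCLUSIVE	1

static unsigned long bitmap[NPAGES / BITS_PER_WORD + 1];

static bool test_and_set(unsigned long idx)
{
	unsigned long mask = 1UL << (idx % BITS_PER_WORD);
	unsigned long *word = &bitmap[idx / BITS_PER_WORD];
	bool was_set = *word & mask;

	*word |= mask;
	return was_set;
}

static void clear_bit_idx(unsigned long idx)
{
	bitmap[idx / BITS_PER_WORD] &= ~(1UL << (idx % BITS_PER_WORD));
}

/* Mirrors __reserve(): claim [sidx, eidx); with BOOTMEM_EXCLUSIVE a double
 * reservation rolls back the bits taken so far and fails. */
static int reserve(unsigned long sidx, unsigned long eidx, int flags)
{
	unsigned long idx;

	for (idx = sidx; idx < eidx; idx++)
		if (test_and_set(idx)) {
			if (flags & BOOTMEM_EXCLUSIVE) {
				while (idx-- > sidx)	/* undo [sidx, idx) */
					clear_bit_idx(idx);
				return -1;
			}
			printf("silent double reserve of page %lu\n", idx);
		}
	return 0;
}

int main(void)
{
	printf("%d\n", reserve(4, 12, 0));			/*  0: range was free */
	printf("%d\n", reserve(8, 16, BOOTMEM_EXCLUSIVE));	/* -1: overlap, rolled back */
	printf("%d\n", reserve(12, 16, BOOTMEM_EXCLUSIVE));	/*  0: disjoint claim */
	return 0;
}
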
@@ -441,47 +665,183 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
        return NULL;
 }
 
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+                             unsigned long goal)
+{
+       unsigned long limit = 0;
+
+       return ___alloc_bootmem(size, align, goal, limit);
+}
+
+static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
+                               unsigned long size, unsigned long align,
+                               unsigned long goal, unsigned long limit)
+{
+       void *ptr;
+
+       ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
+       if (ptr)
+               return ptr;
+
+       ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
+       if (ptr)
+               return ptr;
+
+       return ___alloc_bootmem(size, align, goal, limit);
+}
 
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
 void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
                                   unsigned long align, unsigned long goal)
+{
+       if (WARN_ON_ONCE(slab_is_available()))
+               return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+       return  ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+                                  unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+       unsigned long end_pfn;
+
+       if (WARN_ON_ONCE(slab_is_available()))
+               return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+       /* update goal according ...MAX_DMA32_PFN */
+       end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+       if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+           (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+               void *ptr;
+               unsigned long new_goal;
+
+               new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+               ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+                                                new_goal, 0);
+               if (ptr)
+                       return ptr;
+       }
+#endif
+
+       return __alloc_bootmem_node(pgdat, size, align, goal);
+
+}
+
+#ifdef CONFIG_SPARSEMEM
+/**
+ * alloc_bootmem_section - allocate boot memory from a specific section
+ * @size: size of the request in bytes
+ * @section_nr: sparse map section to allocate from
+ *
+ * Return NULL on failure.
+ */
+void * __init alloc_bootmem_section(unsigned long size,
+                                   unsigned long section_nr)
+{
+       bootmem_data_t *bdata;
+       unsigned long pfn, goal, limit;
+
+       pfn = section_nr_to_pfn(section_nr);
+       goal = pfn << PAGE_SHIFT;
+       limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+       bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
+
+       return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+}
+#endif
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+                                  unsigned long align, unsigned long goal)
 {
        void *ptr;
 
-       ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+       if (WARN_ON_ONCE(slab_is_available()))
+               return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+       ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
        if (ptr)
                return ptr;
 
-       return __alloc_bootmem(size, align, goal);
+       ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+       if (ptr)
+               return ptr;
+
+       return __alloc_bootmem_nopanic(size, align, goal);
 }
 
 #ifndef ARCH_LOW_ADDRESS_LIMIT
 #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
 #endif
 
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
                                  unsigned long goal)
 {
-       bootmem_data_t *bdata;
-       void *ptr;
-
-       list_for_each_entry(bdata, &bdata_list, list) {
-               ptr = __alloc_bootmem_core(bdata, size, align, goal,
-                                               ARCH_LOW_ADDRESS_LIMIT);
-               if (ptr)
-                       return ptr;
-       }
-
-       /*
-        * Whoops, we cannot satisfy the allocation request.
-        */
-       printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
-       panic("Out of low memory");
-       return NULL;
+       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
 }
 
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
 void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
                                       unsigned long align, unsigned long goal)
 {
-       return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
-                                   ARCH_LOW_ADDRESS_LIMIT);
+       if (WARN_ON_ONCE(slab_is_available()))
+               return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+       return ___alloc_bootmem_node(pgdat->bdata, size, align,
+                               goal, ARCH_LOW_ADDRESS_LIMIT);
 }
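
For context, a minimal sketch of how an architecture's early setup code might drive this interface after the rewrite. The function name, the pfn parameters and the use of _text/_end here are hypothetical placeholders; only the bootmem calls themselves (init_bootmem(), free_bootmem(), reserve_bootmem(), __alloc_bootmem(), free_all_bootmem()) are the interface defined in this file and <linux/bootmem.h>:

/* Hypothetical early-setup flow; my_bootmem_setup(), ram_start_pfn and
 * ram_end_pfn are placeholders for whatever an architecture really does. */
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/pfn.h>

extern char _text[], _end[];	/* kernel image bounds from the linker script */

static void *early_table __initdata;

void __init my_bootmem_setup(unsigned long ram_start_pfn,
			     unsigned long ram_end_pfn)
{
	unsigned long bitmap_pfn = PFN_UP(__pa(_end));
	unsigned long bitmap_size;

	/* Place the bitmap just above the kernel image; init_bootmem()
	 * marks every page reserved until free RAM is registered. */
	bitmap_size = init_bootmem(bitmap_pfn, ram_end_pfn);

	/* Register the usable RAM, then re-reserve the kernel image and
	 * the bootmem bitmap itself. */
	free_bootmem(PFN_PHYS(ram_start_pfn),
		     PFN_PHYS(ram_end_pfn - ram_start_pfn));
	reserve_bootmem(__pa(_text), _end - _text, BOOTMEM_DEFAULT);
	reserve_bootmem(PFN_PHYS(bitmap_pfn), bitmap_size, BOOTMEM_DEFAULT);

	/* Boot-time allocations now come straight out of the bitmap ... */
	early_table = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0);

	/* ... until mm_init() later calls free_all_bootmem(), which hands
	 * everything still unreserved, plus the bitmap itself, to the
	 * buddy allocator. */
}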