config: tegra3: enable /dev mount with ACL
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2a362c5..8859578 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -30,6 +30,7 @@
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include <linux/oom.h>
 #include <linux/notifier.h>
 #include <linux/topology.h>
@@ -39,6 +40,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
+#include <linux/vmstat.h>
 #include <linux/mempolicy.h>
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
@@ -53,6 +55,8 @@
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -104,19 +108,38 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  * only be modified with pm_mutex held, unless the suspend/hibernate code is
  * guaranteed not to run in parallel with that modification).
  */
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
 {
        WARN_ON(!mutex_is_locked(&pm_mutex));
-       gfp_allowed_mask = mask;
+       if (saved_gfp_mask) {
+               gfp_allowed_mask = saved_gfp_mask;
+               saved_gfp_mask = 0;
+       }
 }
 
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
 {
-       gfp_t ret = gfp_allowed_mask;
-
        WARN_ON(!mutex_is_locked(&pm_mutex));
-       gfp_allowed_mask &= ~mask;
-       return ret;
+       WARN_ON(saved_gfp_mask);
+       saved_gfp_mask = gfp_allowed_mask;
+       gfp_allowed_mask &= ~GFP_IOFS;
+}
+
+static bool pm_suspending(void)
+{
+       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+               return false;
+       return true;
+}
+
+#else
+
+static bool pm_suspending(void)
+{
+       return false;
 }
 #endif /* CONFIG_PM_SLEEP */
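
The restructured block above replaces the open-coded save/restore of gfp_allowed_mask with pm_restrict_gfp_mask()/pm_restore_gfp_mask(), and derives pm_suspending() from whether GFP_IOFS is still allowed. A minimal userspace sketch of that bookkeeping, with invented flag values (not the kernel's real __GFP_IO/__GFP_FS bits):

#include <assert.h>

#define __GFP_IO 0x40u               /* assumed value, illustration only */
#define __GFP_FS 0x80u               /* assumed value, illustration only */
#define GFP_IOFS (__GFP_IO | __GFP_FS)

static unsigned int gfp_allowed_mask = ~0u;
static unsigned int saved_gfp_mask;

static void pm_restrict_gfp_mask(void)
{
	saved_gfp_mask = gfp_allowed_mask;   /* remember the full mask */
	gfp_allowed_mask &= ~GFP_IOFS;       /* forbid I/O- and fs-backed reclaim */
}

static void pm_restore_gfp_mask(void)
{
	if (saved_gfp_mask) {                /* only if a restrict is pending */
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

static int pm_suspending(void)
{
	/* suspending iff GFP_IOFS has been masked out */
	return (gfp_allowed_mask & GFP_IOFS) != GFP_IOFS;
}

int main(void)
{
	assert(!pm_suspending());
	pm_restrict_gfp_mask();
	assert(pm_suspending());
	pm_restore_gfp_mask();
	assert(!pm_suspending());
	return 0;
}
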
 
@@ -167,6 +190,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
 };
 
 int min_free_kbytes = 1024;
+int min_free_order_shift = 1;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -281,7 +305,7 @@ static void bad_page(struct page *page)
 
        /* Don't complain about poisoned pages */
        if (PageHWPoison(page)) {
-               __ClearPageBuddy(page);
+               reset_page_mapcount(page); /* remove PageBuddy */
                return;
        }
 
@@ -312,7 +336,7 @@ static void bad_page(struct page *page)
        dump_stack();
 out:
        /* Leave bad fields for debug, except PageBuddy could make trouble */
-       __ClearPageBuddy(page);
+       reset_page_mapcount(page); /* remove PageBuddy */
        add_taint(TAINT_BAD_PAGE);
 }
 
@@ -346,12 +370,13 @@ void prep_compound_page(struct page *page, unsigned long order)
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
-
                __SetPageTail(p);
+               set_page_count(p, 0);
                p->first_page = page;
        }
 }
 
+/* update __split_huge_page_refcount if you change this function */
 static int destroy_compound_page(struct page *page, unsigned long order)
 {
        int i;
@@ -421,18 +446,10 @@ static inline void rmv_page_order(struct page *page)
  *
  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-       unsigned long buddy_idx = page_idx ^ (1 << order);
-
-       return page + (buddy_idx - page_idx);
-}
-
 static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
 {
-       return (page_idx & ~(1 << order));
+       return page_idx ^ (1 << order);
 }
 
 /*
@@ -443,8 +460,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
  *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
@@ -477,7 +494,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * as necessary, plus some accounting needed to play nicely with other
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
  * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
@@ -494,6 +511,7 @@ static inline void __free_one_page(struct page *page,
 {
        unsigned long page_idx;
        unsigned long combined_idx;
+       unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;
 
        if (unlikely(PageCompound(page)))
@@ -508,7 +526,8 @@ static inline void __free_one_page(struct page *page,
        VM_BUG_ON(bad_range(zone, page));
 
        while (order < MAX_ORDER-1) {
-               buddy = __page_find_buddy(page, page_idx, order);
+               buddy_idx = __find_buddy_index(page_idx, order);
+               buddy = page + (buddy_idx - page_idx);
                if (!page_is_buddy(page, buddy, order))
                        break;
 
@@ -516,7 +535,7 @@ static inline void __free_one_page(struct page *page,
                list_del(&buddy->lru);
                zone->free_area[order].nr_free--;
                rmv_page_order(buddy);
-               combined_idx = __find_combined_index(page_idx, order);
+               combined_idx = buddy_idx & page_idx;
                page = page + (combined_idx - page_idx);
                page_idx = combined_idx;
                order++;
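
The rewritten merge loop above computes the buddy with an XOR and the merged block's start with an AND: __find_buddy_index() flips the order-th bit of page_idx, and buddy_idx & page_idx clears it again. A standalone sketch of that arithmetic with made-up indexes:

#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);   /* buddy differs only in bit `order` */
}

int main(void)
{
	unsigned long page_idx = 4;         /* hypothetical order-2 block */
	unsigned int order = 2;
	unsigned long buddy_idx = find_buddy_index(page_idx, order);
	unsigned long combined_idx = buddy_idx & page_idx;

	/* prints: buddy_idx=0 combined_idx=0 (merged order-3 block starts at 0) */
	printf("buddy_idx=%lu combined_idx=%lu\n", buddy_idx, combined_idx);
	return 0;
}
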
@@ -531,11 +550,12 @@ static inline void __free_one_page(struct page *page,
         * so it's less likely to be used soon and more likely to be merged
         * as a higher order page
         */
-       if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+       if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
                struct page *higher_page, *higher_buddy;
-               combined_idx = __find_combined_index(page_idx, order);
-               higher_page = page + combined_idx - page_idx;
-               higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+               combined_idx = buddy_idx & page_idx;
+               higher_page = page + (combined_idx - page_idx);
+               buddy_idx = __find_buddy_index(combined_idx, order + 1);
+               higher_buddy = page + (buddy_idx - combined_idx);
                if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                        list_add_tail(&page->lru,
                                &zone->free_area[order].free_list[migratetype]);
@@ -564,7 +584,8 @@ static inline int free_pages_check(struct page *page)
        if (unlikely(page_mapcount(page) |
                (page->mapping != NULL)  |
                (atomic_read(&page->_count) != 0) |
-               (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
+               (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
+               (mem_cgroup_bad_page_check(page)))) {
                bad_page(page);
                return 1;
        }
@@ -613,6 +634,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        list = &pcp->lists[migratetype];
                } while (list_empty(list));
 
+               /* This is the only non-empty list. Free them all. */
+               if (batch_free == MIGRATE_PCPTYPES)
+                       batch_free = to_free;
+
                do {
                        page = list_entry(list->prev, struct page, lru);
                        /* must delete as __free_one_page list manipulates */
@@ -646,13 +671,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
        trace_mm_page_free_direct(page, order);
        kmemcheck_free_shadow(page, order);
 
-       for (i = 0; i < (1 << order); i++) {
-               struct page *pg = page + i;
-
-               if (PageAnon(pg))
-                       pg->mapping = NULL;
-               bad += free_pages_check(pg);
-       }
+       if (PageAnon(page))
+               page->mapping = NULL;
+       for (i = 0; i < (1 << order); i++)
+               bad += free_pages_check(page + i);
        if (bad)
                return false;
 
@@ -752,7 +774,8 @@ static inline int check_new_page(struct page *page)
        if (unlikely(page_mapcount(page) |
                (page->mapping != NULL)  |
                (atomic_read(&page->_count) != 0)  |
-               (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
+               (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
+               (mem_cgroup_bad_page_check(page)))) {
                bad_page(page);
                return 1;
        }
@@ -865,9 +888,8 @@ static int move_freepages(struct zone *zone,
                }
 
                order = page_order(page);
-               list_del(&page->lru);
-               list_add(&page->lru,
-                       &zone->free_area[order].free_list[migratetype]);
+               list_move(&page->lru,
+                         &zone->free_area[order].free_list[migratetype]);
                page += 1 << order;
                pages_moved += 1 << order;
        }
@@ -938,7 +960,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
                         * If breaking a large block of pages, move all free
                         * pages to the preferred allocation list. If falling
                         * back for a reclaimable kernel allocation, be more
-                        * agressive about taking ownership of free pages
+                        * aggressive about taking ownership of free pages
                         */
                        if (unlikely(current_order >= (pageblock_order >> 1)) ||
                                        start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1090,8 +1112,10 @@ static void drain_pages(unsigned int cpu)
                pset = per_cpu_ptr(zone->pageset, cpu);
 
                pcp = &pset->pcp;
-               free_pcppages_bulk(zone, pcp->count, pcp);
-               pcp->count = 0;
+               if (pcp->count) {
+                       free_pcppages_bulk(zone, pcp->count, pcp);
+                       pcp->count = 0;
+               }
                local_irq_restore(flags);
        }
 }
@@ -1333,7 +1357,7 @@ again:
        }
 
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
-       zone_statistics(preferred_zone, zone);
+       zone_statistics(preferred_zone, zone, gfp_flags);
        local_irq_restore(flags);
 
        VM_BUG_ON(bad_range(zone, page));
@@ -1361,21 +1385,12 @@ failed:
 
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
-static struct fail_page_alloc_attr {
+static struct {
        struct fault_attr attr;
 
        u32 ignore_gfp_highmem;
        u32 ignore_gfp_wait;
        u32 min_order;
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-       struct dentry *ignore_gfp_highmem_file;
-       struct dentry *ignore_gfp_wait_file;
-       struct dentry *min_order_file;
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
 } fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_wait = 1,
@@ -1409,36 +1424,27 @@ static int __init fail_page_alloc_debugfs(void)
 {
        mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
-       int err;
-
-       err = init_fault_attr_dentries(&fail_page_alloc.attr,
-                                      "fail_page_alloc");
-       if (err)
-               return err;
-       dir = fail_page_alloc.attr.dentries.dir;
-
-       fail_page_alloc.ignore_gfp_wait_file =
-               debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                                     &fail_page_alloc.ignore_gfp_wait);
-
-       fail_page_alloc.ignore_gfp_highmem_file =
-               debugfs_create_bool("ignore-gfp-highmem", mode, dir,
-                                     &fail_page_alloc.ignore_gfp_highmem);
-       fail_page_alloc.min_order_file =
-               debugfs_create_u32("min-order", mode, dir,
-                                  &fail_page_alloc.min_order);
-
-       if (!fail_page_alloc.ignore_gfp_wait_file ||
-            !fail_page_alloc.ignore_gfp_highmem_file ||
-            !fail_page_alloc.min_order_file) {
-               err = -ENOMEM;
-               debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
-               debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
-               debugfs_remove(fail_page_alloc.min_order_file);
-               cleanup_fault_attr_dentries(&fail_page_alloc.attr);
-       }
 
-       return err;
+       dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+                                       &fail_page_alloc.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
+
+       if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                               &fail_page_alloc.ignore_gfp_wait))
+               goto fail;
+       if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+                               &fail_page_alloc.ignore_gfp_highmem))
+               goto fail;
+       if (!debugfs_create_u32("min-order", mode, dir,
+                               &fail_page_alloc.min_order))
+               goto fail;
+
+       return 0;
+fail:
+       debugfs_remove_recursive(dir);
+
+       return -ENOMEM;
 }
 
 late_initcall(fail_page_alloc_debugfs);
@@ -1455,35 +1461,54 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags, long free_pages)
 {
        /* free_pages may go negative - that's OK */
        long min = mark;
-       long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
        int o;
 
+       free_pages -= (1 << order) - 1;
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
                min -= min / 4;
 
        if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-               return 0;
+               return false;
        for (o = 0; o < order; o++) {
                /* At the next order, this order's pages become unavailable */
                free_pages -= z->free_area[o].nr_free << o;
 
                /* Require fewer higher order pages to be free */
-               min >>= 1;
+               min >>= min_free_order_shift;
 
                if (free_pages <= min)
-                       return 0;
+                       return false;
        }
-       return 1;
+       return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                                               free_pages);
 }
 
 #ifdef CONFIG_NUMA
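
With free_pages now passed in, the same walk serves both the fast zone_watermark_ok() and the drift-aware zone_watermark_ok_safe(). A userspace sketch of the order-aware check with invented per-order free counts (lowmem_reserve omitted), showing how a fragmented zone fails an order-2 request despite plenty of order-0 pages:

#include <stdbool.h>
#include <stdio.h>

static int min_free_order_shift = 1;    /* the new tunable introduced above */

static bool watermark_ok(unsigned int order, long mark, long free_pages,
			 const unsigned long *nr_free)
{
	long min = mark;

	free_pages -= (1L << order) - 1;        /* all but one page of the request */
	if (free_pages <= min)                  /* lowmem_reserve omitted */
		return false;
	for (unsigned int o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;  /* too small for this request */
		min >>= min_free_order_shift;   /* require less at higher orders */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	unsigned long nr_free[3] = { 160, 10, 2 };  /* order-0..2 free blocks */

	/* 200 free pages, but 160 are lone order-0 pages: order-2 fails */
	printf("order-0 ok: %d\n", watermark_ok(0, 128, 200, nr_free));
	printf("order-2 ok: %d\n", watermark_ok(2, 128, 200, nr_free));
	return 0;
}
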
@@ -1588,6 +1613,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
        set_bit(i, zlc->fullzones);
 }
 
+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
+
+       zlc = zonelist->zlcache_ptr;
+       if (!zlc)
+               return;
+
+       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
 #else  /* CONFIG_NUMA */
 
 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1604,6 +1644,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 {
 }
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
 #endif /* CONFIG_NUMA */
 
 /*
@@ -1636,7 +1680,7 @@ zonelist_scan:
                                continue;
                if ((alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed_softwall(zone, gfp_mask))
-                               goto try_next_zone;
+                               continue;
 
                BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1648,17 +1692,36 @@ zonelist_scan:
                                    classzone_idx, alloc_flags))
                                goto try_this_zone;
 
+                       if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+                               /*
+                                * we do zlc_setup if there are multiple nodes
+                                * and before considering the first zone allowed
+                                * by the cpuset.
+                                */
+                               allowednodes = zlc_setup(zonelist, alloc_flags);
+                               zlc_active = 1;
+                               did_zlc_setup = 1;
+                       }
+
                        if (zone_reclaim_mode == 0)
                                goto this_zone_full;
 
+                       /*
+                        * As we may have just activated ZLC, check if the first
+                        * eligible zone has failed zone_reclaim recently.
+                        */
+                       if (NUMA_BUILD && zlc_active &&
+                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
+                               continue;
+
                        ret = zone_reclaim(zone, gfp_mask, order);
                        switch (ret) {
                        case ZONE_RECLAIM_NOSCAN:
                                /* did not scan */
-                               goto try_next_zone;
+                               continue;
                        case ZONE_RECLAIM_FULL:
                                /* scanned but unreclaimable */
-                               goto this_zone_full;
+                               continue;
                        default:
                                /* did we reclaim enough */
                                if (!zone_watermark_ok(zone, order, mark,
@@ -1675,16 +1738,6 @@ try_this_zone:
 this_zone_full:
                if (NUMA_BUILD)
                        zlc_mark_zone_full(zonelist, z);
-try_next_zone:
-               if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
-                       /*
-                        * we do zlc_setup after the first zone is tried but only
-                        * if there are multiple nodes make it worthwhile
-                        */
-                       allowednodes = zlc_setup(zonelist, alloc_flags);
-                       zlc_active = 1;
-                       did_zlc_setup = 1;
-               }
        }
 
        if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1695,6 +1748,59 @@ try_next_zone:
        return page;
 }
 
+/*
+ * Large machines with many possible nodes should not always dump per-node
+ * meminfo in irq context.
+ */
+static inline bool should_suppress_show_mem(void)
+{
+       bool ret = false;
+
+#if NODES_SHIFT > 8
+       ret = in_interrupt();
+#endif
+       return ret;
+}
+
+static DEFINE_RATELIMIT_STATE(nopage_rs,
+               DEFAULT_RATELIMIT_INTERVAL,
+               DEFAULT_RATELIMIT_BURST);
+
+void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+{
+       va_list args;
+       unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+               return;
+
+       /*
+        * This documents exceptions given to allocations in certain
+        * contexts that are allowed to allocate outside current's set
+        * of allowed nodes.
+        */
+       if (!(gfp_mask & __GFP_NOMEMALLOC))
+               if (test_thread_flag(TIF_MEMDIE) ||
+                   (current->flags & (PF_MEMALLOC | PF_EXITING)))
+                       filter &= ~SHOW_MEM_FILTER_NODES;
+       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+               filter &= ~SHOW_MEM_FILTER_NODES;
+
+       if (fmt) {
+               printk(KERN_WARNING);
+               va_start(args, fmt);
+               vprintk(fmt, args);
+               va_end(args);
+       }
+
+       pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
+                  current->comm, order, gfp_mask);
+
+       dump_stack();
+       if (!should_suppress_show_mem())
+               show_mem(filter);
+}
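
warn_alloc_failed() throttles itself through nopage_rs rather than printk_ratelimit(), so all call sites share one interval/burst budget. A userspace sketch of those interval-and-burst semantics (the constants below are invented, not DEFAULT_RATELIMIT_INTERVAL/BURST):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ratelimit_state {
	time_t begin;       /* start of the current window, 0 = unset */
	int interval;       /* window length in seconds */
	int burst;          /* events allowed per window */
	int printed;        /* events admitted in this window */
};

static bool ratelimit(struct ratelimit_state *rs)
{
	time_t now = time(NULL);

	if (!rs->begin || now - rs->begin >= rs->interval) {
		rs->begin = now;                /* open a fresh window */
		rs->printed = 0;
	}
	return rs->printed++ < rs->burst;       /* admit at most `burst` events */
}

int main(void)
{
	struct ratelimit_state rs = { 0, 5, 10, 0 };    /* 10 events per 5s */
	int admitted = 0;

	for (int i = 0; i < 100; i++)
		if (ratelimit(&rs))
			admitted++;
	printf("admitted %d of 100 warnings\n", admitted);  /* prints 10 */
	return 0;
}
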
+
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
                                unsigned long pages_reclaimed)
@@ -1788,15 +1894,18 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress)
+       int migratetype, unsigned long *did_some_progress,
+       bool sync_migration)
 {
        struct page *page;
 
        if (!order || compaction_deferred(preferred_zone))
                return NULL;
 
+       current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-                                                               nodemask);
+                                               nodemask, sync_migration);
+       current->flags &= ~PF_MEMALLOC;
        if (*did_some_progress != COMPACT_SKIPPED) {
 
                /* Page migration frees to the PCP lists but we want merging */
@@ -1832,7 +1941,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress)
+       int migratetype, unsigned long *did_some_progress,
+       bool sync_migration)
 {
        return NULL;
 }
@@ -1847,29 +1957,32 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 {
        struct page *page = NULL;
        struct reclaim_state reclaim_state;
-       struct task_struct *p = current;
        bool drained = false;
 
        cond_resched();
 
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
-       p->flags |= PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       current->reclaim_state = &reclaim_state;
 
        *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
 
-       p->reclaim_state = NULL;
+       current->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();
-       p->flags &= ~PF_MEMALLOC;
+       current->flags &= ~PF_MEMALLOC;
 
        cond_resched();
 
        if (unlikely(!(*did_some_progress)))
                return NULL;
 
+       /* After successful reclaim, reconsider all zones for allocation */
+       if (NUMA_BUILD)
+               zlc_clear_zones_full(zonelist);
+
 retry:
        page = get_page_from_freelist(gfp_mask, nodemask, order,
                                        zonelist, high_zoneidx,
@@ -1907,7 +2020,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
                        preferred_zone, migratetype);
 
                if (!page && gfp_mask & __GFP_NOFAIL)
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
        } while (!page && (gfp_mask & __GFP_NOFAIL));
 
        return page;
@@ -1915,24 +2028,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 
 static inline
 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-                                               enum zone_type high_zoneidx)
+                                               enum zone_type high_zoneidx,
+                                               enum zone_type classzone_idx)
 {
        struct zoneref *z;
        struct zone *zone;
 
        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-               wakeup_kswapd(zone, order);
+               wakeup_kswapd(zone, order, classzone_idx);
 }
 
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-       struct task_struct *p = current;
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
        const gfp_t wait = gfp_mask & __GFP_WAIT;
 
        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
-       BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+       BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
 
        /*
         * The caller may dip into page reserves a bit more if the caller
@@ -1940,21 +2053,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
         * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
         */
-       alloc_flags |= (gfp_mask & __GFP_HIGH);
+       alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
        if (!wait) {
-               alloc_flags |= ALLOC_HARDER;
+               /*
+                * Not worth trying to allocate harder for
+                * __GFP_NOMEMALLOC even if it can't schedule.
+                */
+               if (!(gfp_mask & __GFP_NOMEMALLOC))
+                       alloc_flags |= ALLOC_HARDER;
                /*
                 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
                 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
                 */
                alloc_flags &= ~ALLOC_CPUSET;
-       } else if (unlikely(rt_task(p)) && !in_interrupt())
+       } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
        if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
                if (!in_interrupt() &&
-                   ((p->flags & PF_MEMALLOC) ||
+                   ((current->flags & PF_MEMALLOC) ||
                     unlikely(test_thread_flag(TIF_MEMDIE))))
                        alloc_flags |= ALLOC_NO_WATERMARKS;
        }
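
The branch-free `alloc_flags |= (gfp_mask & __GFP_HIGH)` only works because __GFP_HIGH and ALLOC_HIGH share a bit value, which the BUILD_BUG_ON pins down; the added __force casts merely quiet sparse about mixing the two flag namespaces. A sketch with illustrative constants (not the kernel's):

#include <assert.h>

#define __GFP_HIGH 0x20u    /* illustrative value */
#define ALLOC_HIGH 0x20u    /* must equal __GFP_HIGH for the OR trick */

int main(void)
{
	unsigned int gfp_mask = __GFP_HIGH;         /* e.g. a GFP_ATOMIC caller */
	unsigned int alloc_flags = 0;

	assert(__GFP_HIGH == ALLOC_HIGH);           /* BUILD_BUG_ON, at compile time */
	alloc_flags |= (gfp_mask & __GFP_HIGH);     /* copy the bit, no branch */
	assert(alloc_flags & ALLOC_HIGH);
	return 0;
}
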
@@ -1973,7 +2091,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        int alloc_flags;
        unsigned long pages_reclaimed = 0;
        unsigned long did_some_progress;
-       struct task_struct *p = current;
+       bool sync_migration = false;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -1998,7 +2116,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                goto nopage;
 
 restart:
-       wake_all_kswapd(order, zonelist, high_zoneidx);
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapd(order, zonelist, high_zoneidx,
+                                               zone_idx(preferred_zone));
 
        /*
         * OK, we're below the kswapd watermark and have kicked background
@@ -2007,6 +2127,15 @@ restart:
         */
        alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+       /*
+        * Find the true preferred zone if the allocation is unconstrained by
+        * cpusets.
+        */
+       if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+               first_zones_zonelist(zonelist, high_zoneidx, NULL,
+                                       &preferred_zone);
+
+rebalance:
        /* This is the last chance, in general, before the goto nopage. */
        page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                        high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2014,7 +2143,6 @@ restart:
        if (page)
                goto got_pg;
 
-rebalance:
        /* Allocate without watermarks if the context allows */
        if (alloc_flags & ALLOC_NO_WATERMARKS) {
                page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2029,21 +2157,26 @@ rebalance:
                goto nopage;
 
        /* Avoid recursion of direct reclaim */
-       if (p->flags & PF_MEMALLOC)
+       if (current->flags & PF_MEMALLOC)
                goto nopage;
 
        /* Avoid allocations with no watermarks from looping endlessly */
        if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
                goto nopage;
 
-       /* Try direct compaction */
+       /*
+        * Try direct compaction. The first pass is asynchronous. Subsequent
+        * attempts after direct reclaim are synchronous
+        */
        page = __alloc_pages_direct_compact(gfp_mask, order,
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress);
+                                       migratetype, &did_some_progress,
+                                       sync_migration);
        if (page)
                goto got_pg;
+       sync_migration = true;
 
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2089,24 +2222,40 @@ rebalance:
 
                        goto restart;
                }
+
+               /*
+                * Suspend converts GFP_KERNEL to __GFP_WAIT which can
+                * prevent reclaim making forward progress without
+                * invoking OOM. Bail if we are suspending
+                */
+               if (pm_suspending())
+                       goto nopage;
        }
 
        /* Check if we should retry the allocation */
        pages_reclaimed += did_some_progress;
        if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
+       } else {
+               /*
+                * High-order allocations do not necessarily loop after
+                * direct reclaim and reclaim/compaction depends on compaction
+                * being called after reclaim so call directly if necessary
+                */
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, &did_some_progress,
+                                       sync_migration);
+               if (page)
+                       goto got_pg;
        }
 
 nopage:
-       if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
-               printk(KERN_WARNING "%s: page allocation failure."
-                       " order:%d, mode:0x%x\n",
-                       p->comm, order, gfp_mask);
-               dump_stack();
-               show_mem();
-       }
+       warn_alloc_failed(gfp_mask, order, NULL);
        return page;
 got_pg:
        if (kmemcheck_enabled)
@@ -2146,7 +2295,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
        get_mems_allowed();
        /* The preferred zone is used for statistics later */
-       first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+       first_zones_zonelist(zonelist, high_zoneidx,
+                               nodemask ? : &cpuset_current_mems_allowed,
+                               &preferred_zone);
        if (!preferred_zone) {
                put_mems_allowed();
                return NULL;
@@ -2225,6 +2376,21 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
+static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+{
+       if (addr) {
+               unsigned long alloc_end = addr + (PAGE_SIZE << order);
+               unsigned long used = addr + PAGE_ALIGN(size);
+
+               split_page(virt_to_page((void *)addr), order);
+               while (used < alloc_end) {
+                       free_page(used);
+                       used += PAGE_SIZE;
+               }
+       }
+       return (void *)addr;
+}
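
make_alloc_exact() now carries the split-and-trim logic for both entry points: round the request up to a power-of-two block, split it into single pages, and hand the trailing pages back. A worked trace of the arithmetic, assuming 4KiB pages (PAGE_SIZE and the simplified get_order() below are stand-ins for the kernel's):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* smallest order such that 2^order pages cover `size` bytes */
static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long size = 9 * 1024;          /* 9KiB request */
	unsigned long addr = 0x100000;          /* hypothetical allocation */
	unsigned int order = get_order(size);   /* -> 2, i.e. 4 pages */
	unsigned long alloc_end = addr + (PAGE_SIZE << order);
	unsigned long used = addr + PAGE_ALIGN(size);

	/* prints: order=2, trailing pages freed=1 (12KiB kept of 16KiB) */
	printf("order=%u, trailing pages freed=%lu\n",
	       order, (alloc_end - used) / PAGE_SIZE);
	return 0;
}
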
+
 /**
  * alloc_pages_exact - allocate an exact number physically-contiguous pages.
  * @size: the number of bytes to allocate
@@ -2244,22 +2410,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
        unsigned long addr;
 
        addr = __get_free_pages(gfp_mask, order);
-       if (addr) {
-               unsigned long alloc_end = addr + (PAGE_SIZE << order);
-               unsigned long used = addr + PAGE_ALIGN(size);
-
-               split_page(virt_to_page((void *)addr), order);
-               while (used < alloc_end) {
-                       free_page(used);
-                       used += PAGE_SIZE;
-               }
-       }
-
-       return (void *)addr;
+       return make_alloc_exact(addr, order, size);
 }
 EXPORT_SYMBOL(alloc_pages_exact);
 
 /**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ *                        pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * Like alloc_pages_exact(), but try to allocate on node nid first before falling
+ * back.
+ * Note this is not alloc_pages_exact_node() which allocates on a specific node,
+ * but is not exact.
+ */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+{
+       unsigned order = get_order(size);
+       struct page *p = alloc_pages_node(nid, gfp_mask, order);
+       if (!p)
+               return NULL;
+       return make_alloc_exact((unsigned long)page_address(p), order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact_nid);
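
A hypothetical caller sketch for the new helper (kernel context assumed; the node id and size are invented). The buffer is released with free_pages_exact(), the same as for alloc_pages_exact():

	void *buf = alloc_pages_exact_nid(1, 3 * PAGE_SIZE, GFP_KERNEL);

	if (buf) {
		/* ... use three physically-contiguous pages, node 1 preferred ... */
		free_pages_exact(buf, 3 * PAGE_SIZE);
	}
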
+
+/**
  * free_pages_exact - release memory allocated via alloc_pages_exact()
  * @virt: the value returned by alloc_pages_exact.
  * @size: size of allocation, same value as passed to alloc_pages_exact().
@@ -2353,19 +2530,41 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 }
 #endif
 
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+bool skip_free_areas_node(unsigned int flags, int nid)
+{
+       bool ret = false;
+
+       if (!(flags & SHOW_MEM_FILTER_NODES))
+               goto out;
+
+       get_mems_allowed();
+       ret = !node_isset(nid, cpuset_current_mems_allowed);
+       put_mems_allowed();
+out:
+       return ret;
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
 /*
  * Show free area list (used inside shift_scroll-lock stuff)
  * We also calculate the percentage fragmentation. We do this by counting the
  * memory on each free list with the exception of the first item on the list.
+ * Suppresses nodes that are not allowed by current's cpuset if
+ * SHOW_MEM_FILTER_NODES is passed.
  */
-void show_free_areas(void)
+void show_free_areas(unsigned int filter)
 {
        int cpu;
        struct zone *zone;
 
        for_each_populated_zone(zone) {
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+                       continue;
                show_node(zone);
                printk("%s per-cpu:\n", zone->name);
 
@@ -2407,6 +2606,8 @@ void show_free_areas(void)
        for_each_populated_zone(zone) {
                int i;
 
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+                       continue;
                show_node(zone);
                printk("%s"
                        " free:%lukB"
@@ -2437,7 +2638,7 @@ void show_free_areas(void)
                        " all_unreclaimable? %s"
                        "\n",
                        zone->name,
-                       K(zone_nr_free_pages(zone)),
+                       K(zone_page_state(zone, NR_FREE_PAGES)),
                        K(min_wmark_pages(zone)),
                        K(low_wmark_pages(zone)),
                        K(high_wmark_pages(zone)),
@@ -2474,6 +2675,8 @@ void show_free_areas(void)
        for_each_populated_zone(zone) {
                unsigned long nr[MAX_ORDER], flags, order, total = 0;
 
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+                       continue;
                show_node(zone);
                printk("%s: ", zone->name);
 
@@ -2580,9 +2783,16 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-       if (s)
-               return __parse_numa_zonelist_order(s);
-       return 0;
+       int ret;
+
+       if (!s)
+               return 0;
+
+       ret = __parse_numa_zonelist_order(s);
+       if (ret == 0)
+               strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+       return ret;
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
 
@@ -3008,14 +3218,6 @@ static __init_refok int __build_all_zonelists(void *data)
                build_zonelist_cache(pgdat);
        }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-       /* Setup real pagesets for the new zone */
-       if (data) {
-               struct zone *zone = data;
-               setup_zone_pageset(zone);
-       }
-#endif
-
        /*
         * Initialize the boot_pagesets that are going to be used
         * for bootstrapping processors. The real pagesets for
@@ -3053,7 +3255,7 @@ static __init_refok int __build_all_zonelists(void *data)
  * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  */
-void build_all_zonelists(void *data)
+void __ref build_all_zonelists(void *data)
 {
        set_zonelist_order();
 
@@ -3064,7 +3266,11 @@ void build_all_zonelists(void *data)
        } else {
                /* we have to stop all cpus to guarantee there is no user
                   of zonelist */
-               stop_machine(__build_all_zonelists, data, NULL);
+#ifdef CONFIG_MEMORY_HOTPLUG
+               if (data)
+                       setup_zone_pageset((struct zone *)data);
+#endif
+               stop_machine(__build_all_zonelists, NULL, NULL);
                /* cpuset refresh routine should be here */
        }
        vm_total_pages = nr_free_pagecache_pages();
@@ -3160,6 +3366,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
 /*
+ * Check if a pageblock contains reserved pages
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long pfn;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+               if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+                       return 1;
+       }
+       return 0;
+}
+
+/*
  * Mark a number of pageblocks as MIGRATE_RESERVE. The number
  * of blocks reserved is based on min_wmark_pages(zone). The memory within
  * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3168,14 +3388,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
  */
 static void setup_zone_migrate_reserve(struct zone *zone)
 {
-       unsigned long start_pfn, pfn, end_pfn;
+       unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
        struct page *page;
        unsigned long block_migratetype;
        int reserve;
 
-       /* Get the start pfn, end pfn and the number of blocks to reserve */
+       /*
+        * Get the start pfn, end pfn and the number of blocks to reserve
+        * We have to be careful to be aligned to pageblock_nr_pages to
+        * make sure that we always check pfn_valid for the first page in
+        * the block.
+        */
        start_pfn = zone->zone_start_pfn;
        end_pfn = start_pfn + zone->spanned_pages;
+       start_pfn = roundup(start_pfn, pageblock_nr_pages);
        reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                        pageblock_order;
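
The new roundup() of start_pfn guarantees the reserve scan begins on a pageblock boundary, so pfn_valid() is always tested against the first page of a block. A quick sketch of the roundup arithmetic (pageblock_nr_pages assumed to be 1024, i.e. order-10 blocks on 4KiB pages):

#include <stdio.h>

#define roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long pageblock_nr_pages = 1024;   /* assumed: order-10 blocks */
	unsigned long zone_start_pfn = 0x2fd;      /* hypothetical unaligned zone */

	/* prints 0x400: scanning starts at the next full pageblock */
	printf("start_pfn: %#lx\n", roundup(zone_start_pfn, pageblock_nr_pages));
	return 0;
}
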
 
@@ -3198,7 +3424,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                        continue;
 
                /* Blocks with reserved pages will never free, skip them. */
-               if (PageReserved(page))
+               block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+               if (pageblock_is_reserved(pfn, block_end_pfn))
                        continue;
 
                block_migratetype = get_pageblock_migratetype(page);
@@ -3387,7 +3614,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
                pcp->batch = PAGE_SHIFT * 8;
 }
 
-static __meminit void setup_zone_pageset(struct zone *zone)
+static void setup_zone_pageset(struct zone *zone)
 {
        int cpu;
 
@@ -3437,7 +3664,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 
        if (!slab_is_available()) {
                zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node(pgdat, alloc_size);
+                       alloc_bootmem_node_nopanic(pgdat, alloc_size);
        } else {
                /*
                 * This case means that a zone whose size was 0 gets new memory
@@ -3638,13 +3865,45 @@ void __init free_bootmem_with_active_regions(int nid,
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+       int i;
+
+       for (i = nr_nodemap_entries - 1; i >= 0; i--)
+               if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+                       return i;
+
+       return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns the previous region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+       for (index = index - 1; index >= 0; index--)
+               if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+                       return index;
+
+       return -1;
+}
+
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+       for (i = last_active_region_index_in_nid(nid); i != -1; \
+                               i = previous_active_region_index_in_nid(i, nid))
+
 u64 __init find_memory_core_early(int nid, u64 size, u64 align,
                                        u64 goal, u64 limit)
 {
        int i;
 
        /* Need to go over early_node_map to find out good range for node */
-       for_each_active_range_index_in_nid(i, nid) {
+       for_each_active_range_index_in_nid_reverse(i, nid) {
                u64 addr;
                u64 ei_start, ei_last;
                u64 final_start, final_end;
@@ -3687,34 +3946,6 @@ int __init add_from_early_node_map(struct range *range, int az,
        return nr_range;
 }
 
-#ifdef CONFIG_NO_BOOTMEM
-void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-                                       u64 goal, u64 limit)
-{
-       void *ptr;
-       u64 addr;
-
-       if (limit > memblock.current_limit)
-               limit = memblock.current_limit;
-
-       addr = find_memory_core_early(nid, size, align, goal, limit);
-
-       if (addr == MEMBLOCK_ERROR)
-               return NULL;
-
-       ptr = phys_to_virt(addr);
-       memset(ptr, 0, size);
-       memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
-       /*
-        * The min_count is set to 0 so that bootmem allocated blocks
-        * are never reported as leaks.
-        */
-       kmemleak_alloc(ptr, size, 0, 0);
-       return ptr;
-}
-#endif
-
-
 void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
 {
        int i;
@@ -3795,7 +4026,7 @@ static void __init find_usable_zone_for_movable(void)
 
 /*
  * The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independant of architecture. Unlike the other zones,
+ * because it is sized independent of architecture. Unlike the other zones,
  * the starting point for ZONE_MOVABLE is not fixed. It may be different
  * in each node depending on the size of each node and how evenly kernelcore
  * is distributed. This helper function adjusts the zone ranges
@@ -4010,10 +4241,11 @@ static void __init setup_usemap(struct pglist_data *pgdat,
        unsigned long usemapsize = usemap_size(zonesize);
        zone->pageblock_flags = NULL;
        if (usemapsize)
-               zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+               zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+                                                                  usemapsize);
 }
 #else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
                                struct zone *zone, unsigned long zonesize) {}
 #endif /* CONFIG_SPARSEMEM */
 
@@ -4130,10 +4362,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone->zone_pgdat = pgdat;
 
                zone_pcp_init(zone);
-               for_each_lru(l) {
+               for_each_lru(l)
                        INIT_LIST_HEAD(&zone->lru[l].list);
-                       zone->reclaim_stat.nr_saved_scan[l] = 0;
-               }
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
                zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4176,7 +4406,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                size =  (end - start) * sizeof(struct page);
                map = alloc_remap(pgdat->node_id, size);
                if (!map)
-                       map = alloc_bootmem_node(pgdat, size);
+                       map = alloc_bootmem_node_nopanic(pgdat, size);
                pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
        }
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4398,6 +4628,60 @@ void __init sort_node_map(void)
                        cmp_node_active_region, NULL);
 }
 
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+       unsigned long accl_mask = 0, last_end = 0;
+       int last_nid = -1;
+       int i;
+
+       for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+               int nid = early_node_map[i].nid;
+               unsigned long start = early_node_map[i].start_pfn;
+               unsigned long end = early_node_map[i].end_pfn;
+               unsigned long mask;
+
+               if (!start || last_nid < 0 || last_nid == nid) {
+                       last_nid = nid;
+                       last_end = end;
+                       continue;
+               }
+
+               /*
+                * Start with a mask granular enough to pin-point to the
+                * start pfn and tick off bits one-by-one until it becomes
+                * too coarse to separate the current node from the last.
+                */
+               mask = ~((1 << __ffs(start)) - 1);
+               while (mask && last_end <= (start & (mask << 1)))
+                       mask <<= 1;
+
+               /* accumulate all internode masks */
+               accl_mask |= mask;
+       }
+
+       /* convert mask to number of pages */
+       return ~accl_mask + 1;
+}
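
The mask walk above widens the alignment until it would no longer separate the previous node's end from the current node's start. A userspace sketch of that loop for two hypothetical nodes (node 0 ending at pfn 0x40000, node 1 starting at 0x50000), using GCC's __builtin_ctzl() in place of __ffs():

#include <stdio.h>

int main(void)
{
	unsigned long last_end = 0x40000;   /* node 0 ends here (pfn) */
	unsigned long start = 0x50000;      /* node 1 starts here (pfn) */
	unsigned long mask;

	/* finest mask that pin-points `start`, then coarsen while it
	 * still separates the two nodes */
	mask = ~((1UL << __builtin_ctzl(start)) - 1);
	while (mask && last_end <= (start & (mask << 1)))
		mask <<= 1;

	/* prints 0x40000 pfns, i.e. 1GiB alignment with 4KiB pages */
	printf("alignment: %#lx pfns\n", ~mask + 1);
	return 0;
}
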
+
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {
@@ -4748,15 +5032,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
        dma_reserve = new_dma_reserve;
 }
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
-#ifndef CONFIG_NO_BOOTMEM
- .bdata = &bootmem_node_data[0]
-#endif
- };
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
 void __init free_area_init(unsigned long *zones_size)
 {
        free_area_init_node(0, zones_size,
@@ -4950,7 +5225,7 @@ void setup_per_zone_wmarks(void)
  *    1TB     101        10GB
  *   10TB     320        32GB
  */
-void calculate_zone_inactive_ratio(struct zone *zone)
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
 {
        unsigned int gb, ratio;
 
@@ -4964,7 +5239,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
        zone->inactive_ratio = ratio;
 }
 
-static void __init setup_per_zone_inactive_ratio(void)
+static void __meminit setup_per_zone_inactive_ratio(void)
 {
        struct zone *zone;
 
@@ -4996,7 +5271,7 @@ static void __init setup_per_zone_inactive_ratio(void)
  * 8192MB:     11584k
  * 16384MB:    16384k
  */
-static int __init init_per_zone_wmark_min(void)
+int __meminit init_per_zone_wmark_min(void)
 {
        unsigned long lowmem_kbytes;
 
@@ -5008,6 +5283,7 @@ static int __init init_per_zone_wmark_min(void)
        if (min_free_kbytes > 65536)
                min_free_kbytes = 65536;
        setup_per_zone_wmarks();
+       refresh_zone_stat_thresholds();
        setup_per_zone_lowmem_reserve();
        setup_per_zone_inactive_ratio();
        return 0;
@@ -5297,26 +5573,71 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 * page allocator never allocates memory from an ISOLATE block.
  */
 
+static int
+__count_immobile_pages(struct zone *zone, struct page *page, int count)
+{
+       unsigned long pfn, iter, found;
+       /*
+        * To avoid noisy data, lru_add_drain_all() should be called first.
+        * A ZONE_MOVABLE zone never contains immobile pages.
+        */
+       if (zone_idx(zone) == ZONE_MOVABLE)
+               return true;
+
+       if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+               return true;
+
+       pfn = page_to_pfn(page);
+       for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+               unsigned long check = pfn + iter;
+
+               if (!pfn_valid_within(check))
+                       continue;
+
+               page = pfn_to_page(check);
+               if (!page_count(page)) {
+                       if (PageBuddy(page))
+                               iter += (1 << page_order(page)) - 1;
+                       continue;
+               }
+               if (!PageLRU(page))
+                       found++;
+               /*
+                * If there are RECLAIMABLE pages, we need to check them too.
+                * But for now memory offline itself doesn't call shrink_slab()
+                * and this still needs to be fixed.
+                */
+               /*
+                * If the page is not RAM, page_count() should be 0.
+                * We need no further checks: this is a _used_, non-movable page.
+                *
+                * The problematic thing here is PG_reserved pages. PG_reserved
+                * is set on both memory hole pages and _used_ kernel
+                * pages at boot.
+                */
+               if (found > count)
+                       return false;
+       }
+       return true;
+}
+
+bool is_pageblock_removable_nolock(struct page *page)
+{
+       struct zone *zone = page_zone(page);
+       return __count_immobile_pages(zone, page, 0);
+}
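
One detail worth noting in __count_immobile_pages(): when the scan hits the head of a free buddy block of order k, `iter += (1 << page_order(page)) - 1` skips the whole 2^k run at once. A userspace sketch of that cursor-skipping idiom with an invented layout (order[i] >= 0 marks the head of a free block; -1 means not a head, i.e. a tail page or a page in use):

#include <stdio.h>

int main(void)
{
	/* order-2 block at 0, order-1 at 4, order-0 at 6, used page at 7 */
	int order[8] = { 2, -1, -1, -1, 1, -1, 0, -1 };
	int inspected = 0;

	for (int iter = 0; iter < 8; iter++) {
		inspected++;
		if (order[iter] >= 0)                     /* free buddy block head */
			iter += (1 << order[iter]) - 1;   /* skip its tail pages */
	}
	/* prints 4: indexes 0, 4, 6 and 7 are visited instead of all 8 */
	printf("pages inspected: %d\n", inspected);
	return 0;
}
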
+
 int set_migratetype_isolate(struct page *page)
 {
        struct zone *zone;
-       struct page *curr_page;
-       unsigned long flags, pfn, iter;
-       unsigned long immobile = 0;
+       unsigned long flags, pfn;
        struct memory_isolate_notify arg;
        int notifier_ret;
        int ret = -EBUSY;
-       int zone_idx;
 
        zone = page_zone(page);
-       zone_idx = zone_idx(zone);
 
        spin_lock_irqsave(&zone->lock, flags);
-       if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
-           zone_idx == ZONE_MOVABLE) {
-               ret = 0;
-               goto out;
-       }
 
        pfn = page_to_pfn(page);
        arg.start_pfn = pfn;
@@ -5336,23 +5657,20 @@ int set_migratetype_isolate(struct page *page)
         */
        notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
        notifier_ret = notifier_to_errno(notifier_ret);
-       if (notifier_ret || !arg.pages_found)
+       if (notifier_ret)
                goto out;
-
-       for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
-               if (!pfn_valid_within(pfn))
-                       continue;
-
-               curr_page = pfn_to_page(iter);
-               if (!page_count(curr_page) || PageLRU(curr_page))
-                       continue;
-
-               immobile++;
-       }
-
-       if (arg.pages_found == immobile)
+       /*
+        * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+        * We just check MOVABLE pages.
+        */
+       if (__count_immobile_pages(zone, page, arg.pages_found))
                ret = 0;
 
+       /*
+        * immobile means "not-on-lru" pages. If immobile is larger than
+        * removable-by-driver pages reported by the notifier, we'll fail.
+        */
+
 out:
        if (!ret) {
                set_pageblock_migratetype(page, MIGRATE_ISOLATE);
@@ -5471,7 +5789,6 @@ static struct trace_print_flags pageflag_names[] = {
        {1UL << PG_swapcache,           "swapcache"     },
        {1UL << PG_mappedtodisk,        "mappedtodisk"  },
        {1UL << PG_reclaim,             "reclaim"       },
-       {1UL << PG_buddy,               "buddy"         },
        {1UL << PG_swapbacked,          "swapbacked"    },
        {1UL << PG_unevictable,         "unevictable"   },
 #ifdef CONFIG_MMU
@@ -5519,7 +5836,8 @@ void dump_page(struct page *page)
 {
        printk(KERN_ALERT
               "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-               page, page_count(page), page_mapcount(page),
+               page, atomic_read(&page->_count), page_mapcount(page),
                page->mapping, page->index);
        dump_page_flags(page->flags);
+       mem_cgroup_print_bad_page(page);
 }