]> nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - mm/page_alloc.c
chromeos: config: renormalize cfgs and drop debug_ll
[linux-2.6.git] / mm / page_alloc.c
index 6ce27331834c94ee944a68fed300abc57dd1d850..716061f48fd36258c4380e3f9b9648059a7ae8b5 100644 (file)
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory.  This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
        saved_gfp_mask = gfp_allowed_mask;
        gfp_allowed_mask &= ~GFP_IOFS;
 }
+
+bool pm_suspended_storage(void)
+{
+       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+               return false;
+       return true;
+}
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -176,47 +192,23 @@ static char * const zone_names[MAX_NR_ZONES] = {
 };
 
 int min_free_kbytes = 1024;
+int min_free_order_shift = 1;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-  #ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-    /*
-     * MAX_ACTIVE_REGIONS determines the maximum number of distinct ranges
-     * of memory (RAM) that may be registered with add_active_range().
-     * Ranges passed to add_active_range() will be merged if possible so
-     * the number of times add_active_range() can be called is related to
-     * the number of nodes and the number of holes
-     */
-    #ifdef CONFIG_MAX_ACTIVE_REGIONS
-      /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
-      #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
-    #else
-      #if MAX_NUMNODES >= 32
-        /* If there can be many nodes, allow up to 50 holes per node */
-        #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
-      #else
-        /* By default, allow up to 256 distinct regions */
-        #define MAX_ACTIVE_REGIONS 256
-      #endif
-    #endif
-
-    static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
-    static int __meminitdata nr_nodemap_entries;
-#endif /* !CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
-  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-  static unsigned long __initdata required_kernelcore;
-  static unsigned long __initdata required_movablecore;
-  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
-
-  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
-  int movable_zone;
-  EXPORT_SYMBOL(movable_zone);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __initdata required_kernelcore;
+static unsigned long __initdata required_movablecore;
+static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+
+/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+int movable_zone;
+EXPORT_SYMBOL(movable_zone);
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #if MAX_NUMNODES > 1
 int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -336,8 +328,8 @@ out:
  *
  * The remaining PAGE_SIZE pages are called "tail pages".
  *
- * All pages have PG_compound set.  All pages have their ->private pointing at
- * the head page (even the head page has this).
+ * All pages have PG_compound set.  All tail pages have their ->first_page
+ * pointing at the head page.
  *
  * The first tail page's ->lru.next holds the address of the compound page's
  * put_page() function.  Its ->lru.prev holds the order of allocation.
@@ -359,8 +351,8 @@ void prep_compound_page(struct page *page, unsigned long order)
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
-
                __SetPageTail(p);
+               set_page_count(p, 0);
                p->first_page = page;
        }
 }
@@ -406,6 +398,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
                clear_highpage(page + i);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+       unsigned long res;
+
+       if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
+               printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+               return 0;
+       }
+       _debug_guardpage_minorder = res;
+       printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+       return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+       __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+       __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
 static inline void set_page_order(struct page *page, int order)
 {
        set_page_private(page, order);
@@ -463,6 +486,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
        if (page_zone_id(page) != page_zone_id(buddy))
                return 0;
 
+       if (page_is_guard(buddy) && page_order(buddy) == order) {
+               VM_BUG_ON(page_count(buddy) != 0);
+               return 1;
+       }
+
        if (PageBuddy(buddy) && page_order(buddy) == order) {
                VM_BUG_ON(page_count(buddy) != 0);
                return 1;
@@ -519,11 +547,19 @@ static inline void __free_one_page(struct page *page,
                buddy = page + (buddy_idx - page_idx);
                if (!page_is_buddy(page, buddy, order))
                        break;
-
-               /* Our buddy is free, merge with it and move up one order. */
-               list_del(&buddy->lru);
-               zone->free_area[order].nr_free--;
-               rmv_page_order(buddy);
+               /*
+                * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+                * merge with it and move up one order.
+                */
+               if (page_is_guard(buddy)) {
+                       clear_page_guard_flag(buddy);
+                       set_page_private(page, 0);
+                       __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+               } else {
+                       list_del(&buddy->lru);
+                       zone->free_area[order].nr_free--;
+                       rmv_page_order(buddy);
+               }
                combined_idx = buddy_idx & page_idx;
                page = page + (combined_idx - page_idx);
                page_idx = combined_idx;
@@ -657,7 +693,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
        int i;
        int bad = 0;
 
-       trace_mm_page_free_direct(page, order);
+       trace_mm_page_free(page, order);
        kmemcheck_free_shadow(page, order);
 
        if (PageAnon(page))
@@ -695,32 +731,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
        local_irq_restore(flags);
 }
 
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
-       if (order == 0) {
-               __ClearPageReserved(page);
-               set_page_count(page, 0);
-               set_page_refcounted(page);
-               __free_page(page);
-       } else {
-               int loop;
+       unsigned int nr_pages = 1 << order;
+       unsigned int loop;
 
-               prefetchw(page);
-               for (loop = 0; loop < (1 << order); loop++) {
-                       struct page *p = &page[loop];
-
-                       if (loop + 1 < (1 << order))
-                               prefetchw(p + 1);
-                       __ClearPageReserved(p);
-                       set_page_count(p, 0);
-               }
+       prefetchw(page);
+       for (loop = 0; loop < nr_pages; loop++) {
+               struct page *p = &page[loop];
 
-               set_page_refcounted(page);
-               __free_pages(page, order);
+               if (loop + 1 < nr_pages)
+                       prefetchw(p + 1);
+               __ClearPageReserved(p);
+               set_page_count(p, 0);
        }
+
+       set_page_refcounted(page);
+       __free_pages(page, order);
 }
 
 
@@ -749,6 +776,23 @@ static inline void expand(struct zone *zone, struct page *page,
                high--;
                size >>= 1;
                VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+               if (high < debug_guardpage_minorder()) {
+                       /*
+                        * Mark as guard pages (or page), that will allow to
+                        * merge back to allocator when buddy will be freed.
+                        * Corresponding page table entries will not be touched,
+                        * pages will stay not present in virtual address space
+                        */
+                       INIT_LIST_HEAD(&page[size].lru);
+                       set_page_guard_flag(&page[size]);
+                       set_page_private(&page[size], high);
+                       /* Guard pages are not available for any usage */
+                       __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+                       continue;
+               }
+#endif
                list_add(&page[size].lru, &area->free_list[migratetype]);
                area->nr_free++;
                set_page_order(&page[size], high);
@@ -1118,11 +1162,47 @@ void drain_local_pages(void *arg)
 }
 
 /*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
  */
 void drain_all_pages(void)
 {
-       on_each_cpu(drain_local_pages, NULL, 1);
+       int cpu;
+       struct per_cpu_pageset *pcp;
+       struct zone *zone;
+
+       /*
+        * Allocate in the BSS so we wont require allocation in
+        * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+        */
+       static cpumask_t cpus_with_pcps;
+
+       /*
+        * We don't care about racing with CPU hotplug event
+        * as offline notification will cause the notified
+        * cpu to drain that CPU pcps and on_each_cpu_mask
+        * disables preemption as part of its processing
+        */
+       for_each_online_cpu(cpu) {
+               bool has_pcps = false;
+               for_each_populated_zone(zone) {
+                       pcp = per_cpu_ptr(zone->pageset, cpu);
+                       if (pcp->pcp.count) {
+                               has_pcps = true;
+                               break;
+                       }
+               }
+               if (has_pcps)
+                       cpumask_set_cpu(cpu, &cpus_with_pcps);
+               else
+                       cpumask_clear_cpu(cpu, &cpus_with_pcps);
+       }
+       on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -1213,6 +1293,19 @@ out:
        local_irq_restore(flags);
 }
 
+/*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+       struct page *page, *next;
+
+       list_for_each_entry_safe(page, next, list, lru) {
+               trace_mm_page_free_batched(page, cold);
+               free_hot_cold_page(page, cold);
+       }
+}
+
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
@@ -1411,7 +1504,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 
 static int __init fail_page_alloc_debugfs(void)
 {
-       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+       umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
 
        dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
@@ -1460,7 +1553,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
        long min = mark;
        int o;
 
-       free_pages -= (1 << order) + 1;
+       free_pages -= (1 << order) - 1;
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
@@ -1473,7 +1566,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                free_pages -= z->free_area[o].nr_free << o;
 
                /* Require fewer higher order pages to be free */
-               min >>= 1;
+               min >>= min_free_order_shift;
 
                if (free_pages <= min)
                        return false;
@@ -1670,6 +1763,35 @@ zonelist_scan:
                if ((alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed_softwall(zone, gfp_mask))
                                continue;
+               /*
+                * When allocating a page cache page for writing, we
+                * want to get it from a zone that is within its dirty
+                * limit, such that no single zone holds more than its
+                * proportional share of globally allowed dirty pages.
+                * The dirty limits take into account the zone's
+                * lowmem reserves and high watermark so that kswapd
+                * should be able to balance it without having to
+                * write pages from its LRU list.
+                *
+                * This may look like it could increase pressure on
+                * lower zones by failing allocations in higher zones
+                * before they are full.  But the pages that do spill
+                * over are limited as the lower zones are protected
+                * by this very same mechanism.  It should not become
+                * a practical burden to them.
+                *
+                * XXX: For now, allow allocations to potentially
+                * exceed the per-zone dirty limit in the slowpath
+                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * which is important when on a NUMA setup the allowed
+                * zones are together not big enough to reach the
+                * global limit.  The proper fix for these situations
+                * will require awareness of zones in the
+                * dirty-throttling and the flusher threads.
+                */
+               if ((alloc_flags & ALLOC_WMARK_LOW) &&
+                   (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+                       goto this_zone_full;
 
                BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1759,7 +1881,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
 
-       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+           debug_guardpage_minorder() > 0)
                return;
 
        /*
@@ -1798,12 +1921,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+                               unsigned long did_some_progress,
                                unsigned long pages_reclaimed)
 {
        /* Do not loop if specifically requested */
        if (gfp_mask & __GFP_NORETRY)
                return 0;
 
+       /* Always retry if specifically requested */
+       if (gfp_mask & __GFP_NOFAIL)
+               return 1;
+
+       /*
+        * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+        * making forward progress without invoking OOM. Suspend also disables
+        * storage devices so kswapd will not help. Bail if we are suspending.
+        */
+       if (!did_some_progress && pm_suspended_storage())
+               return 0;
+
        /*
         * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
         * means __GFP_NOFAIL, but that may not be true in other
@@ -1822,13 +1958,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
        if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
                return 1;
 
-       /*
-        * Don't let big-order allocations loop unless the caller
-        * explicitly requests that.
-        */
-       if (gfp_mask & __GFP_NOFAIL)
-               return 1;
-
        return 0;
 }
 
@@ -1876,7 +2005,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                        goto out;
        }
        /* Exhausted what can be done so it's blamo time */
-       out_of_memory(zonelist, gfp_mask, order, nodemask);
+       out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
        clear_zonelist_oom(zonelist, gfp_mask);
@@ -1889,13 +2018,19 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress,
-       bool sync_migration)
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
 {
        struct page *page;
 
-       if (!order || compaction_deferred(preferred_zone))
+       if (!order)
+               return NULL;
+
+       if (compaction_deferred(preferred_zone, order)) {
+               *deferred_compaction = true;
                return NULL;
+       }
 
        current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
@@ -1914,6 +2049,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                if (page) {
                        preferred_zone->compact_considered = 0;
                        preferred_zone->compact_defer_shift = 0;
+                       if (order >= preferred_zone->compact_order_failed)
+                               preferred_zone->compact_order_failed = order + 1;
                        count_vm_event(COMPACTSUCCESS);
                        return page;
                }
@@ -1924,7 +2061,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                 * but not enough to satisfy watermarks.
                 */
                count_vm_event(COMPACTFAIL);
-               defer_compaction(preferred_zone);
+
+               /*
+                * As async compaction considers a subset of pageblocks, only
+                * defer if the failure was a sync compaction failure.
+                */
+               if (sync_migration)
+                       defer_compaction(preferred_zone, order);
 
                cond_resched();
        }
@@ -1936,8 +2079,9 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress,
-       bool sync_migration)
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
 {
        return NULL;
 }
@@ -2087,6 +2231,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        unsigned long pages_reclaimed = 0;
        unsigned long did_some_progress;
        bool sync_migration = false;
+       bool deferred_compaction = false;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -2167,12 +2312,22 @@ rebalance:
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress,
-                                       sync_migration);
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
        if (page)
                goto got_pg;
        sync_migration = true;
 
+       /*
+        * If compaction is deferred for high-order allocations, it is because
+        * sync compaction recently failed. In this is the case and the caller
+        * has requested the system not be heavily disrupted, fail the
+        * allocation now instead of entering direct reclaim
+        */
+       if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+               goto nopage;
+
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
                                        zonelist, high_zoneidx,
@@ -2190,6 +2345,10 @@ rebalance:
                if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                        if (oom_killer_disabled)
                                goto nopage;
+                       /* Coredumps can quickly deplete all memory reserves */
+                       if ((current->flags & PF_DUMPCORE) &&
+                           !(gfp_mask & __GFP_NOFAIL))
+                               goto nopage;
                        page = __alloc_pages_may_oom(gfp_mask, order,
                                        zonelist, high_zoneidx,
                                        nodemask, preferred_zone,
@@ -2221,7 +2380,8 @@ rebalance:
 
        /* Check if we should retry the allocation */
        pages_reclaimed += did_some_progress;
-       if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+       if (should_alloc_retry(gfp_mask, order, did_some_progress,
+                                               pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
                wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
@@ -2235,8 +2395,9 @@ rebalance:
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress,
-                                       sync_migration);
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
                if (page)
                        goto got_pg;
        }
@@ -2260,8 +2421,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct zone *preferred_zone;
-       struct page *page;
+       struct page *page = NULL;
        int migratetype = allocflags_to_migratetype(gfp_mask);
+       unsigned int cpuset_mems_cookie;
 
        gfp_mask &= gfp_allowed_mask;
 
@@ -2280,15 +2442,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
 
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
        /* The preferred zone is used for statistics later */
        first_zones_zonelist(zonelist, high_zoneidx,
                                nodemask ? : &cpuset_current_mems_allowed,
                                &preferred_zone);
-       if (!preferred_zone) {
-               put_mems_allowed();
-               return NULL;
-       }
+       if (!preferred_zone)
+               goto out;
 
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2298,9 +2460,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
-       put_mems_allowed();
 
        trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+       /*
+        * When updating a task's mems_allowed, it is possible to race with
+        * parallel threads in such a way that an allocation can fail while
+        * the mask is being updated. If a page allocation is about to fail,
+        * check if the cpuset changed during allocation and if so, retry.
+        */
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
        return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2331,16 +2503,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-       int i = pagevec_count(pvec);
-
-       while (--i >= 0) {
-               trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
-               free_hot_cold_page(pvec->pages[i], pvec->cold);
-       }
-}
-
 void __free_pages(struct page *page, unsigned int order)
 {
        if (put_page_testzero(page)) {
@@ -2524,13 +2686,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 bool skip_free_areas_node(unsigned int flags, int nid)
 {
        bool ret = false;
+       unsigned int cpuset_mems_cookie;
 
        if (!(flags & SHOW_MEM_FILTER_NODES))
                goto out;
 
-       get_mems_allowed();
-       ret = !node_isset(nid, cpuset_current_mems_allowed);
-       put_mems_allowed();
+       do {
+               cpuset_mems_cookie = get_mems_allowed();
+               ret = !node_isset(nid, cpuset_current_mems_allowed);
+       } while (!put_mems_allowed(cpuset_mems_cookie));
 out:
        return ret;
 }
@@ -3380,9 +3544,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
        unsigned long block_migratetype;
        int reserve;
 
-       /* Get the start pfn, end pfn and the number of blocks to reserve */
+       /*
+        * Get the start pfn, end pfn and the number of blocks to reserve
+        * We have to be careful to be aligned to pageblock_nr_pages to
+        * make sure that we always check pfn_valid for the first page in
+        * the block.
+        */
        start_pfn = zone->zone_start_pfn;
        end_pfn = start_pfn + zone->spanned_pages;
+       start_pfn = roundup(start_pfn, pageblock_nr_pages);
        reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                        pageblock_order;
 
@@ -3404,25 +3574,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                if (page_to_nid(page) != zone_to_nid(zone))
                        continue;
 
-               /* Blocks with reserved pages will never free, skip them. */
-               block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-               if (pageblock_is_reserved(pfn, block_end_pfn))
-                       continue;
-
                block_migratetype = get_pageblock_migratetype(page);
 
-               /* If this block is reserved, account for it */
-               if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-                       reserve--;
-                       continue;
-               }
+               /* Only test what is necessary when the reserves are not met */
+               if (reserve > 0) {
+                       /*
+                        * Blocks with reserved pages will never free, skip
+                        * them.
+                        */
+                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+                       if (pageblock_is_reserved(pfn, block_end_pfn))
+                               continue;
 
-               /* Suitable for reserving if this block is movable */
-               if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-                       set_pageblock_migratetype(page, MIGRATE_RESERVE);
-                       move_freepages_block(zone, page, MIGRATE_RESERVE);
-                       reserve--;
-                       continue;
+                       /* If this block is reserved, account for it */
+                       if (block_migratetype == MIGRATE_RESERVE) {
+                               reserve--;
+                               continue;
+                       }
+
+                       /* Suitable for reserving if this block is movable */
+                       if (block_migratetype == MIGRATE_MOVABLE) {
+                               set_pageblock_migratetype(page,
+                                                       MIGRATE_RESERVE);
+                               move_freepages_block(zone, page,
+                                                       MIGRATE_RESERVE);
+                               reserve--;
+                               continue;
+                       }
                }
 
                /*
@@ -3734,7 +3912,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
        return 0;
 }
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 /*
  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3803,18 +3981,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
        }
 }
 
-int __init add_from_early_node_map(struct range *range, int az,
-                                  int nr_range, int nid)
-{
-       unsigned long start_pfn, end_pfn;
-       int i;
-
-       /* need to go over early_node_map to find out good range for node */
-       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
-               nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
-       return nr_range;
-}
-
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4002,7 +4168,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
        return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
-#else
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                        unsigned long zone_type,
                                        unsigned long *zones_size)
@@ -4020,7 +4186,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
        return zholes_size[zone_type];
 }
 
-#endif
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long *zholes_size)
@@ -4140,7 +4306,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, memmap_pages;
-               enum lru_list l;
+               enum lru_list lru;
 
                size = zone_spanned_pages_in_node(nid, j, zones_size);
                realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4190,8 +4356,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone->zone_pgdat = pgdat;
 
                zone_pcp_init(zone);
-               for_each_lru(l)
-                       INIT_LIST_HEAD(&zone->lru[l].list);
+               for_each_lru(lru)
+                       INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
                zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4243,10 +4409,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
         */
        if (pgdat == NODE_DATA(0)) {
                mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
                        mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
        }
 #endif
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
@@ -4271,7 +4437,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 
 #if MAX_NUMNODES > 1
 /*
@@ -4292,201 +4458,6 @@ static inline void setup_nr_node_ids(void)
 }
 #endif
 
-#ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-/*
- * Common iterator interface used to define for_each_mem_pfn_range().
- */
-void __meminit __next_mem_pfn_range(int *idx, int nid,
-                                   unsigned long *out_start_pfn,
-                                   unsigned long *out_end_pfn, int *out_nid)
-{
-       struct node_active_region *r = NULL;
-
-       while (++*idx < nr_nodemap_entries) {
-               if (nid == MAX_NUMNODES || nid == early_node_map[*idx].nid) {
-                       r = &early_node_map[*idx];
-                       break;
-               }
-       }
-       if (!r) {
-               *idx = -1;
-               return;
-       }
-
-       if (out_start_pfn)
-               *out_start_pfn = r->start_pfn;
-       if (out_end_pfn)
-               *out_end_pfn = r->end_pfn;
-       if (out_nid)
-               *out_nid = r->nid;
-}
-
-/**
- * add_active_range - Register a range of PFNs backed by physical memory
- * @nid: The node ID the range resides on
- * @start_pfn: The start PFN of the available physical memory
- * @end_pfn: The end PFN of the available physical memory
- *
- * These ranges are stored in an early_node_map[] and later used by
- * free_area_init_nodes() to calculate zone sizes and holes. If the
- * range spans a memory hole, it is up to the architecture to ensure
- * the memory is not freed by the bootmem allocator. If possible
- * the range being registered will be merged with existing ranges.
- */
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
-                                               unsigned long end_pfn)
-{
-       int i;
-
-       mminit_dprintk(MMINIT_TRACE, "memory_register",
-                       "Entering add_active_range(%d, %#lx, %#lx) "
-                       "%d entries of %d used\n",
-                       nid, start_pfn, end_pfn,
-                       nr_nodemap_entries, MAX_ACTIVE_REGIONS);
-
-       mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
-
-       /* Merge with existing active regions if possible */
-       for (i = 0; i < nr_nodemap_entries; i++) {
-               if (early_node_map[i].nid != nid)
-                       continue;
-
-               /* Skip if an existing region covers this new one */
-               if (start_pfn >= early_node_map[i].start_pfn &&
-                               end_pfn <= early_node_map[i].end_pfn)
-                       return;
-
-               /* Merge forward if suitable */
-               if (start_pfn <= early_node_map[i].end_pfn &&
-                               end_pfn > early_node_map[i].end_pfn) {
-                       early_node_map[i].end_pfn = end_pfn;
-                       return;
-               }
-
-               /* Merge backward if suitable */
-               if (start_pfn < early_node_map[i].start_pfn &&
-                               end_pfn >= early_node_map[i].start_pfn) {
-                       early_node_map[i].start_pfn = start_pfn;
-                       return;
-               }
-       }
-
-       /* Check that early_node_map is large enough */
-       if (i >= MAX_ACTIVE_REGIONS) {
-               printk(KERN_CRIT "More than %d memory regions, truncating\n",
-                                                       MAX_ACTIVE_REGIONS);
-               return;
-       }
-
-       early_node_map[i].nid = nid;
-       early_node_map[i].start_pfn = start_pfn;
-       early_node_map[i].end_pfn = end_pfn;
-       nr_nodemap_entries = i + 1;
-}
-
-/**
- * remove_active_range - Shrink an existing registered range of PFNs
- * @nid: The node id the range is on that should be shrunk
- * @start_pfn: The new PFN of the range
- * @end_pfn: The new PFN of the range
- *
- * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept near the end physical page range that has already been
- * registered. This function allows an arch to shrink an existing registered
- * range.
- */
-void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
-                               unsigned long end_pfn)
-{
-       unsigned long this_start_pfn, this_end_pfn;
-       int i, j;
-       int removed = 0;
-
-       printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
-                         nid, start_pfn, end_pfn);
-
-       /* Find the old active region end and shrink */
-       for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
-               if (this_start_pfn >= start_pfn && this_end_pfn <= end_pfn) {
-                       /* clear it */
-                       early_node_map[i].start_pfn = 0;
-                       early_node_map[i].end_pfn = 0;
-                       removed = 1;
-                       continue;
-               }
-               if (this_start_pfn < start_pfn && this_end_pfn > start_pfn) {
-                       early_node_map[i].end_pfn = start_pfn;
-                       if (this_end_pfn > end_pfn)
-                               add_active_range(nid, end_pfn, this_end_pfn);
-                       continue;
-               }
-               if (this_start_pfn >= start_pfn && this_end_pfn > end_pfn &&
-                   this_start_pfn < end_pfn) {
-                       early_node_map[i].start_pfn = end_pfn;
-                       continue;
-               }
-       }
-
-       if (!removed)
-               return;
-
-       /* remove the blank ones */
-       for (i = nr_nodemap_entries - 1; i > 0; i--) {
-               if (early_node_map[i].nid != nid)
-                       continue;
-               if (early_node_map[i].end_pfn)
-                       continue;
-               /* we found it, get rid of it */
-               for (j = i; j < nr_nodemap_entries - 1; j++)
-                       memcpy(&early_node_map[j], &early_node_map[j+1],
-                               sizeof(early_node_map[j]));
-               j = nr_nodemap_entries - 1;
-               memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
-               nr_nodemap_entries--;
-       }
-}
-
-/**
- * remove_all_active_ranges - Remove all currently registered regions
- *
- * During discovery, it may be found that a table like SRAT is invalid
- * and an alternative discovery method must be used. This function removes
- * all currently registered regions.
- */
-void __init remove_all_active_ranges(void)
-{
-       memset(early_node_map, 0, sizeof(early_node_map));
-       nr_nodemap_entries = 0;
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
-       struct node_active_region *arange = (struct node_active_region *)a;
-       struct node_active_region *brange = (struct node_active_region *)b;
-
-       /* Done this way to avoid overflows */
-       if (arange->start_pfn > brange->start_pfn)
-               return 1;
-       if (arange->start_pfn < brange->start_pfn)
-               return -1;
-
-       return 0;
-}
-
-/* sort the node_map by start_pfn */
-void __init sort_node_map(void)
-{
-       sort(early_node_map, (size_t)nr_nodemap_entries,
-                       sizeof(struct node_active_region),
-                       cmp_node_active_region, NULL);
-}
-#else /* !CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline void sort_node_map(void)
-{
-}
-#endif
-
 /**
  * node_map_pfn_alignment - determine the maximum internode alignment
  *
@@ -4594,7 +4565,7 @@ static unsigned long __init early_calculate_totalpages(void)
  * memory. When they don't, some nodes will have more kernelcore than
  * others
  */
-static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+static void __init find_zone_movable_pfns_for_nodes(void)
 {
        int i, nid;
        unsigned long usable_startpfn;
@@ -4740,8 +4711,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
 
        for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
                struct zone *zone = &pgdat->node_zones[zone_type];
-               if (zone->present_pages)
+               if (zone->present_pages) {
                        node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+                       break;
+               }
        }
 #endif
 }
@@ -4764,9 +4737,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        unsigned long start_pfn, end_pfn;
        int i, nid;
 
-       /* Sort early_node_map as initialisation assumes it is sorted */
-       sort_node_map();
-
        /* Record where the zone boundaries are */
        memset(arch_zone_lowest_possible_pfn, 0,
                                sizeof(arch_zone_lowest_possible_pfn));
@@ -4787,7 +4757,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
        /* Find the PFNs that ZONE_MOVABLE begins at in each node */
        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
-       find_zone_movable_pfns_for_nodes(zone_movable_pfn);
+       find_zone_movable_pfns_for_nodes();
 
        /* Print out the zone ranges */
        printk("Zone PFN ranges:\n");
@@ -4867,7 +4837,7 @@ static int __init cmdline_parse_movablecore(char *p)
 early_param("kernelcore", cmdline_parse_kernelcore);
 early_param("movablecore", cmdline_parse_movablecore);
 
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
  * set_dma_reserve - set the specified number of pages reserved in the first zone
@@ -4897,6 +4867,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
        int cpu = (unsigned long)hcpu;
 
        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+               lru_add_drain_cpu(cpu);
                drain_pages(cpu);
 
                /*
@@ -4951,8 +4922,19 @@ static void calculate_totalreserve_pages(void)
                        if (max > zone->present_pages)
                                max = zone->present_pages;
                        reserve_pages += max;
+                       /*
+                        * Lowmem reserves are not available to
+                        * GFP_HIGHUSER page cache allocations and
+                        * kswapd tries to balance zones to their high
+                        * watermark.  As a result, neither should be
+                        * regarded as dirtyable memory, to prevent a
+                        * situation where reclaim has to clean pages
+                        * in order to balance the zones.
+                        */
+                       zone->dirty_balance_reserve = max;
                }
        }
+       dirty_balance_reserve = reserve_pages;
        totalreserve_pages = reserve_pages;
 }
 
@@ -5222,7 +5204,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
        int ret;
 
        ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-       if (!write || (ret == -EINVAL))
+       if (!write || (ret < 0))
                return ret;
        for_each_populated_zone(zone) {
                for_each_possible_cpu(cpu) {
@@ -5299,6 +5281,7 @@ void *__init alloc_large_system_hash(const char *tablename,
                max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
                do_div(max, bucketsize);
        }
+       max = min(max, 0x80000000ULL);
 
        if (numentries > max)
                numentries = max;
@@ -5476,7 +5459,25 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 
 bool is_pageblock_removable_nolock(struct page *page)
 {
-       struct zone *zone = page_zone(page);
+       struct zone *zone;
+       unsigned long pfn;
+
+       /*
+        * We have to be careful here because we are iterating over memory
+        * sections which are not zone aware so we might end up outside of
+        * the zone but still within the section.
+        * We have to take care about the node as well. If the node is offline
+        * its NODE_DATA will be NULL - see page_zone.
+        */
+       if (!node_online(page_to_nid(page)))
+               return false;
+
+       zone = page_zone(page);
+       pfn = page_to_pfn(page);
+       if (zone->zone_start_pfn > pfn ||
+                       zone->zone_start_pfn + zone->spanned_pages <= pfn)
+               return false;
+
        return __count_immobile_pages(zone, page, 0);
 }