config: tegra3: enable /dev mount with ACL
[linux-2.6.git] / mm / page_alloc.c
index 01e6b61..8859578 100644 (file)
@@ -127,6 +127,20 @@ void pm_restrict_gfp_mask(void)
        saved_gfp_mask = gfp_allowed_mask;
        gfp_allowed_mask &= ~GFP_IOFS;
 }
+
+static bool pm_suspending(void)
+{
+       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+               return false;
+       return true;
+}
+
+#else
+
+static bool pm_suspending(void)
+{
+       return false;
+}
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -176,6 +190,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
 };
 
 int min_free_kbytes = 1024;
+int min_free_order_shift = 1;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -355,8 +370,8 @@ void prep_compound_page(struct page *page, unsigned long order)
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
-
                __SetPageTail(p);
+               set_page_count(p, 0);
                p->first_page = page;
        }
 }
@@ -1370,21 +1385,12 @@ failed:
 
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
-static struct fail_page_alloc_attr {
+static struct {
        struct fault_attr attr;
 
        u32 ignore_gfp_highmem;
        u32 ignore_gfp_wait;
        u32 min_order;
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-       struct dentry *ignore_gfp_highmem_file;
-       struct dentry *ignore_gfp_wait_file;
-       struct dentry *min_order_file;
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
 } fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_wait = 1,
@@ -1418,36 +1424,27 @@ static int __init fail_page_alloc_debugfs(void)
 {
        mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
-       int err;
-
-       err = init_fault_attr_dentries(&fail_page_alloc.attr,
-                                      "fail_page_alloc");
-       if (err)
-               return err;
-       dir = fail_page_alloc.attr.dentries.dir;
-
-       fail_page_alloc.ignore_gfp_wait_file =
-               debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                                     &fail_page_alloc.ignore_gfp_wait);
-
-       fail_page_alloc.ignore_gfp_highmem_file =
-               debugfs_create_bool("ignore-gfp-highmem", mode, dir,
-                                     &fail_page_alloc.ignore_gfp_highmem);
-       fail_page_alloc.min_order_file =
-               debugfs_create_u32("min-order", mode, dir,
-                                  &fail_page_alloc.min_order);
-
-       if (!fail_page_alloc.ignore_gfp_wait_file ||
-            !fail_page_alloc.ignore_gfp_highmem_file ||
-            !fail_page_alloc.min_order_file) {
-               err = -ENOMEM;
-               debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
-               debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
-               debugfs_remove(fail_page_alloc.min_order_file);
-               cleanup_fault_attr_dentries(&fail_page_alloc.attr);
-       }
 
-       return err;
+       dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+                                       &fail_page_alloc.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
+
+       if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                               &fail_page_alloc.ignore_gfp_wait))
+               goto fail;
+       if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+                               &fail_page_alloc.ignore_gfp_highmem))
+               goto fail;
+       if (!debugfs_create_u32("min-order", mode, dir,
+                               &fail_page_alloc.min_order))
+               goto fail;
+
+       return 0;
+fail:
+       debugfs_remove_recursive(dir);
+
+       return -ENOMEM;
 }
 
 late_initcall(fail_page_alloc_debugfs);
@@ -1487,7 +1484,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                free_pages -= z->free_area[o].nr_free << o;
 
                /* Require fewer higher order pages to be free */
-               min >>= 1;
+               min >>= min_free_order_shift;
 
                if (free_pages <= min)
                        return false;
@@ -1616,6 +1613,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
        set_bit(i, zlc->fullzones);
 }
 
+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
+
+       zlc = zonelist->zlcache_ptr;
+       if (!zlc)
+               return;
+
+       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
 #else  /* CONFIG_NUMA */
 
 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1644,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 {
 }
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
 #endif /* CONFIG_NUMA */
 
 /*
@@ -1664,7 +1680,7 @@ zonelist_scan:
                                continue;
                if ((alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed_softwall(zone, gfp_mask))
-                               goto try_next_zone;
+                               continue;
 
                BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1692,36 @@ zonelist_scan:
                                    classzone_idx, alloc_flags))
                                goto try_this_zone;
 
+                       if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+                               /*
+                                * we do zlc_setup if there are multiple nodes
+                                * and before considering the first zone allowed
+                                * by the cpuset.
+                                */
+                               allowednodes = zlc_setup(zonelist, alloc_flags);
+                               zlc_active = 1;
+                               did_zlc_setup = 1;
+                       }
+
                        if (zone_reclaim_mode == 0)
                                goto this_zone_full;
 
+                       /*
+                        * As we may have just activated ZLC, check if the first
+                        * eligible zone has failed zone_reclaim recently.
+                        */
+                       if (NUMA_BUILD && zlc_active &&
+                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
+                               continue;
+
                        ret = zone_reclaim(zone, gfp_mask, order);
                        switch (ret) {
                        case ZONE_RECLAIM_NOSCAN:
                                /* did not scan */
-                               goto try_next_zone;
+                               continue;
                        case ZONE_RECLAIM_FULL:
                                /* scanned but unreclaimable */
-                               goto this_zone_full;
+                               continue;
                        default:
                                /* did we reclaim enough */
                                if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1738,6 @@ try_this_zone:
 this_zone_full:
                if (NUMA_BUILD)
                        zlc_mark_zone_full(zonelist, z);
-try_next_zone:
-               if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
-                       /*
-                        * we do zlc_setup after the first zone is tried but only
-                        * if there are multiple nodes make it worthwhile
-                        */
-                       allowednodes = zlc_setup(zonelist, alloc_flags);
-                       zlc_active = 1;
-                       did_zlc_setup = 1;
-               }
        }
 
        if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1954,6 +1979,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!(*did_some_progress)))
                return NULL;
 
+       /* After successful reclaim, reconsider all zones for allocation */
+       if (NUMA_BUILD)
+               zlc_clear_zones_full(zonelist);
+
 retry:
        page = get_page_from_freelist(gfp_mask, nodemask, order,
                                        zonelist, high_zoneidx,
@@ -2106,6 +2135,7 @@ restart:
                first_zones_zonelist(zonelist, high_zoneidx, NULL,
                                        &preferred_zone);
 
+rebalance:
        /* This is the last chance, in general, before the goto nopage. */
        page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                        high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2113,7 +2143,6 @@ restart:
        if (page)
                goto got_pg;
 
-rebalance:
        /* Allocate without watermarks if the context allows */
        if (alloc_flags & ALLOC_NO_WATERMARKS) {
                page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2193,6 +2222,14 @@ rebalance:
 
                        goto restart;
                }
+
+               /*
+                * Suspend converts GFP_KERNEL to __GFP_WAIT which can
+                * prevent reclaim making forward progress without
+                * invoking OOM. Bail if we are suspending
+                */
+               if (pm_suspending())
+                       goto nopage;
        }
 
        /* Check if we should retry the allocation */
@@ -3356,9 +3393,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
        unsigned long block_migratetype;
        int reserve;
 
-       /* Get the start pfn, end pfn and the number of blocks to reserve */
+       /*
+        * Get the start pfn, end pfn and the number of blocks to reserve
+        * We have to be careful to be aligned to pageblock_nr_pages to
+        * make sure that we always check pfn_valid for the first page in
+        * the block.
+        */
        start_pfn = zone->zone_start_pfn;
        end_pfn = start_pfn + zone->spanned_pages;
+       start_pfn = roundup(start_pfn, pageblock_nr_pages);
        reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                        pageblock_order;
 
@@ -4319,10 +4362,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone->zone_pgdat = pgdat;
 
                zone_pcp_init(zone);
-               for_each_lru(l) {
+               for_each_lru(l)
                        INIT_LIST_HEAD(&zone->lru[l].list);
-                       zone->reclaim_stat.nr_saved_scan[l] = 0;
-               }
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
                zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4587,6 +4628,60 @@ void __init sort_node_map(void)
                        cmp_node_active_region, NULL);
 }
 
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+       unsigned long accl_mask = 0, last_end = 0;
+       int last_nid = -1;
+       int i;
+
+       for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+               int nid = early_node_map[i].nid;
+               unsigned long start = early_node_map[i].start_pfn;
+               unsigned long end = early_node_map[i].end_pfn;
+               unsigned long mask;
+
+               if (!start || last_nid < 0 || last_nid == nid) {
+                       last_nid = nid;
+                       last_end = end;
+                       continue;
+               }
+
+               /*
+                * Start with a mask granular enough to pin-point to the
+                * start pfn and tick off bits one-by-one until it becomes
+                * too coarse to separate the current node from the last.
+                */
+               mask = ~((1 << __ffs(start)) - 1);
+               while (mask && last_end <= (start & (mask << 1)))
+                       mask <<= 1;
+
+               /* accumulate all internode masks */
+               accl_mask |= mask;
+       }
+
+       /* convert mask to number of pages */
+       return ~accl_mask + 1;
+}
+
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {