sys_swapon: call swap_cgroup_swapon() earlier

[linux-2.6.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index bbd0423f282047900fd76b4c88081de6603eaf16..136a547262a0badb41bb7cce60ab4e7e91dcb301 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -286,7 +286,7 @@ static void bad_page(struct page *page)
  
         /* Don't complain about poisoned pages */
         if (PageHWPoison(page)) {
-               __ClearPageBuddy(page);
+               reset_page_mapcount(page); /* remove PageBuddy */
                 return;
         }
  
@@ -317,7 +317,7 @@ static void bad_page(struct page *page)
         dump_stack();
  out:
         /* Leave bad fields for debug, except PageBuddy could make trouble */
-       __ClearPageBuddy(page);
+       reset_page_mapcount(page); /* remove PageBuddy */
         add_taint(TAINT_BAD_PAGE);
  }
  
@@ -427,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
   *
   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
   */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-       unsigned long buddy_idx = page_idx ^ (1 << order);
-
-       return page + (buddy_idx - page_idx);
-}
-
  static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
  {
-       return (page_idx & ~(1 << order));
+       return page_idx ^ (1 << order);
  }
  
  /*
@@ -449,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
   * (c) a page and its buddy have the same order &&
   * (d) a page and its buddy are in the same zone.
   *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount to
+ * -2. Setting, clearing, and testing that value is serialized by zone->lock.
   *
   * For recording page's order, we use page_private(page).
   */
@@ -483,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
   * order is recorded in page_private(page) field.
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
@@ -500,6 +492,7 @@ static inline void __free_one_page(struct page *page,
  {
         unsigned long page_idx;
         unsigned long combined_idx;
+       unsigned long uninitialized_var(buddy_idx);
         struct page *buddy;
  
         if (unlikely(PageCompound(page)))
@@ -514,7 +507,8 @@ static inline void __free_one_page(struct page *page,
         VM_BUG_ON(bad_range(zone, page));
  
         while (order < MAX_ORDER-1) {
-               buddy = __page_find_buddy(page, page_idx, order);
+               buddy_idx = __find_buddy_index(page_idx, order);
+               buddy = page + (buddy_idx - page_idx);
                 if (!page_is_buddy(page, buddy, order))
                         break;
  
@@ -522,7 +516,7 @@ static inline void __free_one_page(struct page *page,
                 list_del(&buddy->lru);
                 zone->free_area[order].nr_free--;
                 rmv_page_order(buddy);
-               combined_idx = __find_combined_index(page_idx, order);
+               combined_idx = buddy_idx & page_idx;
                 page = page + (combined_idx - page_idx);
                 page_idx = combined_idx;
                 order++;
@@ -539,9 +533,10 @@ static inline void __free_one_page(struct page *page,
          */
         if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
                 struct page *higher_page, *higher_buddy;
-               combined_idx = __find_combined_index(page_idx, order);
-               higher_page = page + combined_idx - page_idx;
-               higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+               combined_idx = buddy_idx & page_idx;
+               higher_page = page + (combined_idx - page_idx);
+               buddy_idx = __find_buddy_index(combined_idx, order + 1);
+               higher_buddy = page + (buddy_idx - combined_idx);
                 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                         list_add_tail(&page->lru,
                                 &zone->free_area[order].free_list[migratetype]);
@@ -619,6 +614,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                         list = &pcp->lists[migratetype];
                 } while (list_empty(list));
  
+               /* This is the only non-empty list. Free them all. */
+               if (batch_free == MIGRATE_PCPTYPES)
+                       batch_free = to_free;
+
                 do {
                         page = list_entry(list->prev, struct page, lru);
                         /* must delete as __free_one_page list manipulates */
@@ -1093,8 +1092,10 @@ static void drain_pages(unsigned int cpu)
                 pset = per_cpu_ptr(zone->pageset, cpu);
  
                 pcp = &pset->pcp;
-               free_pcppages_bulk(zone, pcp->count, pcp);
-               pcp->count = 0;
+               if (pcp->count) {
+                       free_pcppages_bulk(zone, pcp->count, pcp);
+                       pcp->count = 0;
+               }
                 local_irq_restore(flags);
         }
  }
@@ -1336,7 +1337,7 @@ again:
         }
  
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
-       zone_statistics(preferred_zone, zone);
+       zone_statistics(preferred_zone, zone, gfp_flags);
         local_irq_restore(flags);
  
         VM_BUG_ON(bad_range(zone, page));
@@ -1717,6 +1718,20 @@ try_next_zone:
         return page;
  }
  
+/*
+ * Large machines with many possible nodes should not always dump per-node
+ * meminfo in irq context.
+ */
+static inline bool should_suppress_show_mem(void)
+{
+       bool ret = false;
+
+#if NODES_SHIFT > 8
+       ret = in_interrupt();
+#endif
+       return ret;
+}
+
  static inline int
  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
                                 unsigned long pages_reclaimed)
@@ -1814,15 +1829,14 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         bool sync_migration)
  {
         struct page *page;
-       struct task_struct *tsk = current;
  
         if (!order || compaction_deferred(preferred_zone))
                 return NULL;
  
-       tsk->flags |= PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                 nodemask, sync_migration);
-       tsk->flags &= ~PF_MEMALLOC;
+       current->flags &= ~PF_MEMALLOC;
         if (*did_some_progress != COMPACT_SKIPPED) {
  
                 /* Page migration frees to the PCP lists but we want merging */
@@ -1874,23 +1888,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
  {
         struct page *page = NULL;
         struct reclaim_state reclaim_state;
-       struct task_struct *p = current;
         bool drained = false;
  
         cond_resched();
  
         /* We now go into synchronous reclaim */
         cpuset_memory_pressure_bump();
-       p->flags |= PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
         lockdep_set_current_reclaim_state(gfp_mask);
         reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       current->reclaim_state = &reclaim_state;
  
         *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
  
-       p->reclaim_state = NULL;
+       current->reclaim_state = NULL;
         lockdep_clear_current_reclaim_state();
-       p->flags &= ~PF_MEMALLOC;
+       current->flags &= ~PF_MEMALLOC;
  
         cond_resched();
  
@@ -1955,7 +1968,6 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
-       struct task_struct *p = current;
         int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
         const gfp_t wait = gfp_mask & __GFP_WAIT;
  
@@ -1971,18 +1983,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
         if (!wait) {
-               alloc_flags |= ALLOC_HARDER;
+               /*
+                * Not worth trying to allocate harder for
+                * __GFP_NOMEMALLOC even if it can't schedule.
+                */
+               if  (!(gfp_mask & __GFP_NOMEMALLOC))
+                       alloc_flags |= ALLOC_HARDER;
                 /*
                  * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
                  * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
                  */
                 alloc_flags &= ~ALLOC_CPUSET;
-       } else if (unlikely(rt_task(p)) && !in_interrupt())
+       } else if (unlikely(rt_task(current)) && !in_interrupt())
                 alloc_flags |= ALLOC_HARDER;
  
         if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
                 if (!in_interrupt() &&
-                   ((p->flags & PF_MEMALLOC) ||
+                   ((current->flags & PF_MEMALLOC) ||
                      unlikely(test_thread_flag(TIF_MEMDIE))))
                         alloc_flags |= ALLOC_NO_WATERMARKS;
         }
@@ -2001,7 +2018,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         int alloc_flags;
         unsigned long pages_reclaimed = 0;
         unsigned long did_some_progress;
-       struct task_struct *p = current;
         bool sync_migration = false;
  
         /*
@@ -2038,6 +2054,14 @@ restart:
          */
         alloc_flags = gfp_to_alloc_flags(gfp_mask);
  
+       /*
+        * Find the true preferred zone if the allocation is unconstrained by
+        * cpusets.
+        */
+       if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+               first_zones_zonelist(zonelist, high_zoneidx, NULL,
+                                       &preferred_zone);
+
         /* This is the last chance, in general, before the goto nopage. */
         page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                         high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2060,7 +2084,7 @@ rebalance:
                 goto nopage;
  
         /* Avoid recursion of direct reclaim */
-       if (p->flags & PF_MEMALLOC)
+       if (current->flags & PF_MEMALLOC)
                 goto nopage;
  
         /* Avoid allocations with no watermarks from looping endlessly */
@@ -2079,7 +2103,7 @@ rebalance:
                                         sync_migration);
         if (page)
                 goto got_pg;
-       sync_migration = true;
+       sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);
  
         /* Try direct reclaim and then allocating */
         page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2151,11 +2175,25 @@ rebalance:
  
  nopage:
         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
-               printk(KERN_WARNING "%s: page allocation failure."
-                       " order:%d, mode:0x%x\n",
-                       p->comm, order, gfp_mask);
+               unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+               /*
+                * This documents exceptions given to allocations in certain
+                * contexts that are allowed to allocate outside current's set
+                * of allowed nodes.
+                */
+               if (!(gfp_mask & __GFP_NOMEMALLOC))
+                       if (test_thread_flag(TIF_MEMDIE) ||
+                           (current->flags & (PF_MEMALLOC | PF_EXITING)))
+                               filter &= ~SHOW_MEM_FILTER_NODES;
+               if (in_interrupt() || !wait)
+                       filter &= ~SHOW_MEM_FILTER_NODES;
+
+               pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
+                       current->comm, order, gfp_mask);
                 dump_stack();
-               show_mem();
+               if (!should_suppress_show_mem())
+                       __show_mem(filter);
         }
         return page;
  got_pg:
@@ -2196,7 +2234,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  
         get_mems_allowed();
         /* The preferred zone is used for statistics later */
-       first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+       first_zones_zonelist(zonelist, high_zoneidx,
+                               nodemask ? : &cpuset_current_mems_allowed,
+                               &preferred_zone);
         if (!preferred_zone) {
                 put_mems_allowed();
                 return NULL;
@@ -2403,19 +2443,42 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  }
  #endif
  
+/*
+ * Determine whether the zone's node should be displayed or not, depending on
+ * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas().
+ */
+static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
+{
+       bool ret = false;
+
+       if (!(flags & SHOW_MEM_FILTER_NODES))
+               goto out;
+
+       get_mems_allowed();
+       ret = !node_isset(zone->zone_pgdat->node_id,
+                               cpuset_current_mems_allowed);
+       put_mems_allowed();
+out:
+       return ret;
+}
+
  #define K(x) ((x) << (PAGE_SHIFT-10))
  
  /*
   * Show free area list (used inside shift_scroll-lock stuff)
   * We also calculate the percentage fragmentation. We do this by counting the
   * memory on each free list with the exception of the first item on the list.
+ * Suppresses nodes that are not allowed by current's cpuset if
+ * SHOW_MEM_FILTER_NODES is passed.
   */
-void show_free_areas(void)
+void __show_free_areas(unsigned int filter)
  {
         int cpu;
         struct zone *zone;
  
         for_each_populated_zone(zone) {
+               if (skip_free_areas_zone(filter, zone))
+                       continue;
                 show_node(zone);
                 printk("%s per-cpu:\n", zone->name);
  
@@ -2457,6 +2520,8 @@ void show_free_areas(void)
         for_each_populated_zone(zone) {
                 int i;
  
+               if (skip_free_areas_zone(filter, zone))
+                       continue;
                 show_node(zone);
                 printk("%s"
                         " free:%lukB"
@@ -2524,6 +2589,8 @@ void show_free_areas(void)
         for_each_populated_zone(zone) {
                 unsigned long nr[MAX_ORDER], flags, order, total = 0;
  
+               if (skip_free_areas_zone(filter, zone))
+                       continue;
                 show_node(zone);
                 printk("%s: ", zone->name);
  
@@ -2543,6 +2610,11 @@ void show_free_areas(void)
         show_swap_cache_info();
  }
  
+void show_free_areas(void)
+{
+       __show_free_areas(0);
+}
+
  static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  {
         zoneref->zone = zone;
@@ -3691,13 +3763,45 @@ void __init free_bootmem_with_active_regions(int nid,
  }
  
  #ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+       int i;
+
+       for (i = nr_nodemap_entries - 1; i >= 0; i--)
+               if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+                       return i;
+
+       return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns previous region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+       for (index = index - 1; index >= 0; index--)
+               if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+                       return index;
+
+       return -1;
+}
+
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+       for (i = last_active_region_index_in_nid(nid); i != -1; \
+                               i = previous_active_region_index_in_nid(i, nid))
+
  u64 __init find_memory_core_early(int nid, u64 size, u64 align,
                                         u64 goal, u64 limit)
  {
         int i;
  
         /* Need to go over early_node_map to find out good range for node */
-       for_each_active_range_index_in_nid(i, nid) {
+       for_each_active_range_index_in_nid_reverse(i, nid) {
                 u64 addr;
                 u64 ei_start, ei_last;
                 u64 final_start, final_end;
@@ -3740,34 +3844,6 @@ int __init add_from_early_node_map(struct range *range, int az,
         return nr_range;
  }
  
-#ifdef CONFIG_NO_BOOTMEM
-void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-                                       u64 goal, u64 limit)
-{
-       void *ptr;
-       u64 addr;
-
-       if (limit > memblock.current_limit)
-               limit = memblock.current_limit;
-
-       addr = find_memory_core_early(nid, size, align, goal, limit);
-
-       if (addr == MEMBLOCK_ERROR)
-               return NULL;
-
-       ptr = phys_to_virt(addr);
-       memset(ptr, 0, size);
-       memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
-       /*
-        * The min_count is set to 0 so that bootmem allocated blocks
-        * are never reported as leaks.
-        */
-       kmemleak_alloc(ptr, size, 0, 0);
-       return ptr;
-}
-#endif
-
-
  void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
  {
         int i;
@@ -4801,15 +4877,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
         dma_reserve = new_dma_reserve;
  }
  
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
-#ifndef CONFIG_NO_BOOTMEM
- .bdata = &bootmem_node_data[0]
-#endif
- };
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
  void __init free_area_init(unsigned long *zones_size)
  {
         free_area_init_node(0, zones_size,
@@ -5368,10 +5435,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
         for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
                 unsigned long check = pfn + iter;
  
-               if (!pfn_valid_within(check)) {
-                       iter++;
+               if (!pfn_valid_within(check))
                         continue;
-               }
+
                 page = pfn_to_page(check);
                 if (!page_count(page)) {
                         if (PageBuddy(page))
@@ -5569,7 +5635,6 @@ static struct trace_print_flags pageflag_names[] = {
         {1UL << PG_swapcache,           "swapcache"     },
         {1UL << PG_mappedtodisk,        "mappedtodisk"  },
         {1UL << PG_reclaim,             "reclaim"       },
-       {1UL << PG_buddy,               "buddy"         },
         {1UL << PG_swapbacked,          "swapbacked"    },
         {1UL << PG_unevictable,         "unevictable"   },
  #ifdef CONFIG_MMU