Merge remote branch 'origin/android-tegra-nv-3.4' into tot

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3b8ede8..0932dc2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -26,7 +26,6 @@
 #include <linux/buffer_head.h> /* for try_to_release_page(),
                                        buffer_heads_over_limit */
 #include <linux/mm_inline.h>
-#include <linux/pagevec.h>
 #include <linux/backing-dev.h>
 #include <linux/rmap.h>
 #include <linux/topology.h>
@@ -661,7 +660,7 @@ redo:
                 * When racing with an mlock or AS_UNEVICTABLE clearing
                 * (page is unlocked) make sure that if the other thread
                 * does not observe our setting of PG_lru and fails
-                * isolation/check_move_unevictable_page,
+                * isolation/check_move_unevictable_pages,
                 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
                 * the page back to the evictable list.
                 *
@@ -723,7 +722,7 @@ static enum page_references page_check_references(struct page *page,
                return PAGEREF_RECLAIM;
 
        if (referenced_ptes) {
-               if (PageAnon(page))
+               if (PageSwapBacked(page))
                        return PAGEREF_ACTIVATE;
                /*
                 * All mapped pages start out with page table
@@ -1136,25 +1135,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
  * Appropriate locks must be held before calling this function.
  *
  * @nr_to_scan:        The number of pages to look through on the list.
- * @src:       The LRU list to pull pages off.
+ * @mz:                The mem_cgroup_zone to pull pages from.
  * @dst:       The temp list to put pages on to.
- * @scanned:   The number of pages that were scanned.
- * @order:     The caller's attempted allocation order
+ * @nr_scanned:        The number of pages that were scanned.
+ * @sc:                The scan_control struct for this reclaim session
  * @mode:      One of the LRU isolation modes
+ * @active:    True [1] if isolating active pages
  * @file:      True [1] if isolating file [!anon] pages
  *
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
-               struct list_head *src, struct list_head *dst,
-               unsigned long *scanned, int order, isolate_mode_t mode,
-               int file)
+               struct mem_cgroup_zone *mz, struct list_head *dst,
+               unsigned long *nr_scanned, struct scan_control *sc,
+               isolate_mode_t mode, int active, int file)
 {
+       struct lruvec *lruvec;
+       struct list_head *src;
        unsigned long nr_taken = 0;
        unsigned long nr_lumpy_taken = 0;
        unsigned long nr_lumpy_dirty = 0;
        unsigned long nr_lumpy_failed = 0;
        unsigned long scan;
+       int lru = LRU_BASE;
+
+       lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
+       if (active)
+               lru += LRU_ACTIVE;
+       if (file)
+               lru += LRU_FILE;
+       src = &lruvec->lists[lru];
 
        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
                struct page *page;
@@ -1184,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                        BUG();
                }
 
-               if (!order)
+               if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
                        continue;
 
                /*
@@ -1198,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 */
                zone_id = page_zone_id(page);
                page_pfn = page_to_pfn(page);
-               pfn = page_pfn & ~((1 << order) - 1);
-               end_pfn = pfn + (1 << order);
+               pfn = page_pfn & ~((1 << sc->order) - 1);
+               end_pfn = pfn + (1 << sc->order);
                for (; pfn < end_pfn; pfn++) {
                        struct page *cursor_page;
 
@@ -1263,9 +1273,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                        nr_lumpy_failed++;
        }
 
-       *scanned = scan;
+       *nr_scanned = scan;
 
-       trace_mm_vmscan_lru_isolate(order,
+       trace_mm_vmscan_lru_isolate(sc->order,
                        nr_to_scan, scan,
                        nr_taken,
                        nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1273,49 +1283,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        return nr_taken;
 }
 
-static unsigned long isolate_pages(unsigned long nr, struct mem_cgroup_zone *mz,
-                                  struct list_head *dst,
-                                  unsigned long *scanned, int order,
-                                  isolate_mode_t mode, int active, int file)
-{
-       struct lruvec *lruvec;
-       int lru = LRU_BASE;
-
-       lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
-       if (active)
-               lru += LRU_ACTIVE;
-       if (file)
-               lru += LRU_FILE;
-       return isolate_lru_pages(nr, &lruvec->lists[lru], dst,
-                                scanned, order, mode, file);
-}
-
-/*
- * clear_active_flags() is a helper for shrink_active_list(), clearing
- * any active bits from the pages in the list.
- */
-static unsigned long clear_active_flags(struct list_head *page_list,
-                                       unsigned int *count)
-{
-       int nr_active = 0;
-       int lru;
-       struct page *page;
-
-       list_for_each_entry(page, page_list, lru) {
-               int numpages = hpage_nr_pages(page);
-               lru = page_lru_base_type(page);
-               if (PageActive(page)) {
-                       lru += LRU_ACTIVE;
-                       ClearPageActive(page);
-                       nr_active += numpages;
-               }
-               if (count)
-                       count[lru] += numpages;
-       }
-
-       return nr_active;
-}
-
 /**
  * isolate_lru_page - tries to isolate a page from its LRU list
  * @page: page to isolate from its LRU list
@@ -1389,28 +1356,21 @@ static int too_many_isolated(struct zone *zone, int file,
        return isolated > inactive;
 }
 
-/*
- * TODO: Try merging with migrations version of putback_lru_pages
- */
 static noinline_for_stack void
-putback_lru_pages(struct mem_cgroup_zone *mz, struct scan_control *sc,
-                 unsigned long nr_anon, unsigned long nr_file,
-                 struct list_head *page_list)
+putback_inactive_pages(struct mem_cgroup_zone *mz,
+                      struct list_head *page_list)
 {
-       struct page *page;
-       struct pagevec pvec;
-       struct zone *zone = mz->zone;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
-
-       pagevec_init(&pvec, 1);
+       struct zone *zone = mz->zone;
+       LIST_HEAD(pages_to_free);
 
        /*
         * Put back any unfreeable pages.
         */
-       spin_lock(&zone->lru_lock);
        while (!list_empty(page_list)) {
+               struct page *page = lru_to_page(page_list);
                int lru;
-               page = lru_to_page(page_list);
+
                VM_BUG_ON(PageLRU(page));
                list_del(&page->lru);
                if (unlikely(!page_evictable(page, NULL))) {
@@ -1427,32 +1387,53 @@ putback_lru_pages(struct mem_cgroup_zone *mz, struct scan_control *sc,
                        int numpages = hpage_nr_pages(page);
                        reclaim_stat->recent_rotated[file] += numpages;
                }
-               if (!pagevec_add(&pvec, page)) {
-                       spin_unlock_irq(&zone->lru_lock);
-                       __pagevec_release(&pvec);
-                       spin_lock_irq(&zone->lru_lock);
+               if (put_page_testzero(page)) {
+                       __ClearPageLRU(page);
+                       __ClearPageActive(page);
+                       del_page_from_lru_list(zone, page, lru);
+
+                       if (unlikely(PageCompound(page))) {
+                               spin_unlock_irq(&zone->lru_lock);
+                               (*get_compound_page_dtor(page))(page);
+                               spin_lock_irq(&zone->lru_lock);
+                       } else
+                               list_add(&page->lru, &pages_to_free);
                }
        }
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
-       __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
 
-       spin_unlock_irq(&zone->lru_lock);
-       pagevec_release(&pvec);
+       /*
+        * To save our caller's stack, now use input list for pages to free.
+        */
+       list_splice(&pages_to_free, page_list);
 }
 
 static noinline_for_stack void
 update_isolated_counts(struct mem_cgroup_zone *mz,
-                      struct scan_control *sc,
+                      struct list_head *page_list,
                       unsigned long *nr_anon,
-                      unsigned long *nr_file,
-                      struct list_head *isolated_list)
+                      unsigned long *nr_file)
 {
-       unsigned long nr_active;
        struct zone *zone = mz->zone;
        unsigned int count[NR_LRU_LISTS] = { 0, };
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+       unsigned long nr_active = 0;
+       struct page *page;
+       int lru;
 
-       nr_active = clear_active_flags(isolated_list, count);
+       /*
+        * Count pages and clear active flags
+        */
+       list_for_each_entry(page, page_list, lru) {
+               int numpages = hpage_nr_pages(page);
+               lru = page_lru_base_type(page);
+               if (PageActive(page)) {
+                       lru += LRU_ACTIVE;
+                       ClearPageActive(page);
+                       nr_active += numpages;
+               }
+               count[lru] += numpages;
+       }
+
+       preempt_disable();
        __count_vm_events(PGDEACTIVATE, nr_active);
 
        __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1466,11 +1447,10 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
 
        *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
        *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+
        __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
-
-       reclaim_stat->recent_scanned[0] += *nr_anon;
-       reclaim_stat->recent_scanned[1] += *nr_file;
+       preempt_enable();
 }
 
 /*
@@ -1530,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
        unsigned long nr_file;
        unsigned long nr_dirty = 0;
        unsigned long nr_writeback = 0;
-       isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
+       isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
        struct zone *zone = mz->zone;
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
 
        while (unlikely(too_many_isolated(zone, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1543,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 
        set_reclaim_mode(priority, sc, false);
        if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
-               reclaim_mode |= ISOLATE_ACTIVE;
+               isolate_mode |= ISOLATE_ACTIVE;
 
        lru_add_drain();
 
        if (!sc->may_unmap)
-               reclaim_mode |= ISOLATE_UNMAPPED;
+               isolate_mode |= ISOLATE_UNMAPPED;
        if (!sc->may_writepage)
-               reclaim_mode |= ISOLATE_CLEAN;
+               isolate_mode |= ISOLATE_CLEAN;
 
        spin_lock_irq(&zone->lru_lock);
 
-       nr_taken = isolate_pages(nr_to_scan, mz, &page_list,
-                                &nr_scanned, sc->order,
-                                reclaim_mode, 0, file);
+       nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
+                                    sc, isolate_mode, 0, file);
        if (global_reclaim(sc)) {
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
@@ -1566,15 +1546,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
                        __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                               nr_scanned);
        }
+       spin_unlock_irq(&zone->lru_lock);
 
-       if (nr_taken == 0) {
-               spin_unlock_irq(&zone->lru_lock);
+       if (nr_taken == 0)
                return 0;
-       }
-
-       update_isolated_counts(mz, sc, &nr_anon, &nr_file, &page_list);
 
-       spin_unlock_irq(&zone->lru_lock);
+       update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
 
        nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
                                                &nr_dirty, &nr_writeback);
@@ -1586,12 +1563,28 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
                                        priority, &nr_dirty, &nr_writeback);
        }
 
-       local_irq_disable();
-       if (current_is_kswapd())
-               __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
-       __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
+       spin_lock_irq(&zone->lru_lock);
+
+       reclaim_stat->recent_scanned[0] += nr_anon;
+       reclaim_stat->recent_scanned[1] += nr_file;
+
+       if (global_reclaim(sc)) {
+               if (current_is_kswapd())
+                       __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
+                                              nr_reclaimed);
+               else
+                       __count_zone_vm_events(PGSTEAL_DIRECT, zone,
+                                              nr_reclaimed);
+       }
+
+       putback_inactive_pages(mz, &page_list);
 
-       putback_lru_pages(mz, sc, nr_anon, nr_file, &page_list);
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
+       spin_unlock_irq(&zone->lru_lock);
+
+       free_hot_cold_page_list(&page_list, 1);
 
        /*
         * If reclaim is isolating dirty pages under writeback, it implies
@@ -1647,14 +1640,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 
 static void move_active_pages_to_lru(struct zone *zone,
                                     struct list_head *list,
+                                    struct list_head *pages_to_free,
                                     enum lru_list lru)
 {
        unsigned long pgmoved = 0;
-       struct pagevec pvec;
        struct page *page;
 
-       pagevec_init(&pvec, 1);
-
        while (!list_empty(list)) {
                struct lruvec *lruvec;
 
@@ -1667,12 +1658,17 @@ static void move_active_pages_to_lru(struct zone *zone,
                list_move(&page->lru, &lruvec->lists[lru]);
                pgmoved += hpage_nr_pages(page);
 
-               if (!pagevec_add(&pvec, page) || list_empty(list)) {
-                       spin_unlock_irq(&zone->lru_lock);
-                       if (buffer_heads_over_limit)
-                               pagevec_strip(&pvec);
-                       __pagevec_release(&pvec);
-                       spin_lock_irq(&zone->lru_lock);
+               if (put_page_testzero(page)) {
+                       __ClearPageLRU(page);
+                       __ClearPageActive(page);
+                       del_page_from_lru_list(zone, page, lru);
+
+                       if (unlikely(PageCompound(page))) {
+                               spin_unlock_irq(&zone->lru_lock);
+                               (*get_compound_page_dtor(page))(page);
+                               spin_lock_irq(&zone->lru_lock);
+                       } else
+                               list_add(&page->lru, pages_to_free);
                }
        }
        __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1680,13 +1676,13 @@ static void move_active_pages_to_lru(struct zone *zone,
                __count_vm_events(PGDEACTIVATE, pgmoved);
 }
 
-static void shrink_active_list(unsigned long nr_pages,
+static void shrink_active_list(unsigned long nr_to_scan,
                               struct mem_cgroup_zone *mz,
                               struct scan_control *sc,
                               int priority, int file)
 {
        unsigned long nr_taken;
-       unsigned long pgscanned;
+       unsigned long nr_scanned;
        unsigned long vm_flags;
        LIST_HEAD(l_hold);      /* The pages which were snipped off */
        LIST_HEAD(l_active);
@@ -1694,28 +1690,28 @@ static void shrink_active_list(unsigned long nr_pages,
        struct page *page;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
        unsigned long nr_rotated = 0;
-       isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
+       isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
        struct zone *zone = mz->zone;
 
        lru_add_drain();
 
+       reset_reclaim_mode(sc);
+
        if (!sc->may_unmap)
-               reclaim_mode |= ISOLATE_UNMAPPED;
+               isolate_mode |= ISOLATE_UNMAPPED;
        if (!sc->may_writepage)
-               reclaim_mode |= ISOLATE_CLEAN;
+               isolate_mode |= ISOLATE_CLEAN;
 
        spin_lock_irq(&zone->lru_lock);
 
-       nr_taken = isolate_pages(nr_pages, mz, &l_hold,
-                                &pgscanned, sc->order,
-                                reclaim_mode, 1, file);
-
+       nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
+                                    isolate_mode, 1, file);
        if (global_reclaim(sc))
-               zone->pages_scanned += pgscanned;
+               zone->pages_scanned += nr_scanned;
 
        reclaim_stat->recent_scanned[file] += nr_taken;
 
-       __count_zone_vm_events(PGREFILL, zone, pgscanned);
+       __count_zone_vm_events(PGREFILL, zone, nr_scanned);
        if (file)
                __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
        else
@@ -1733,6 +1729,14 @@ static void shrink_active_list(unsigned long nr_pages,
                        continue;
                }
 
+               if (unlikely(buffer_heads_over_limit)) {
+                       if (page_has_private(page) && trylock_page(page)) {
+                               if (page_has_private(page))
+                                       try_to_release_page(page, 0);
+                               unlock_page(page);
+                       }
+               }
+
                if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
                        nr_rotated += hpage_nr_pages(page);
                        /*
@@ -1766,12 +1770,14 @@ static void shrink_active_list(unsigned long nr_pages,
         */
        reclaim_stat->recent_rotated[file] += nr_rotated;
 
-       move_active_pages_to_lru(zone, &l_active,
+       move_active_pages_to_lru(zone, &l_active, &l_hold,
                                                LRU_ACTIVE + file * LRU_FILE);
-       move_active_pages_to_lru(zone, &l_inactive,
+       move_active_pages_to_lru(zone, &l_inactive, &l_hold,
                                                LRU_BASE   + file * LRU_FILE);
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&zone->lru_lock);
+
+       free_hot_cold_page_list(&l_hold, 1);
 }
 
 #ifdef CONFIG_SWAP
@@ -1898,7 +1904,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
        unsigned long ap, fp;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
        u64 fraction[2], denominator;
-       enum lru_list l;
+       enum lru_list lru;
        int noswap = 0;
        bool force_scan = false;
 
@@ -1988,18 +1994,18 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
        fraction[1] = fp;
        denominator = ap + fp + 1;
 out:
-       for_each_evictable_lru(l) {
-               int file = is_file_lru(l);
+       for_each_evictable_lru(lru) {
+               int file = is_file_lru(lru);
                unsigned long scan;
 
-               scan = zone_nr_lru_pages(mz, l);
+               scan = zone_nr_lru_pages(mz, lru);
                if (priority || noswap) {
                        scan >>= priority;
                        if (!scan && force_scan)
                                scan = SWAP_CLUSTER_MAX;
                        scan = div64_u64(scan * fraction[file], denominator);
                }
-               nr[l] = scan;
+               nr[lru] = scan;
        }
 }
 
@@ -2075,7 +2081,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
 {
        unsigned long nr[NR_LRU_LISTS];
        unsigned long nr_to_scan;
-       enum lru_list l;
+       enum lru_list lru;
        unsigned long nr_reclaimed, nr_scanned;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        struct blk_plug plug;
@@ -2088,13 +2094,13 @@ restart:
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
-               for_each_evictable_lru(l) {
-                       if (nr[l]) {
+               for_each_evictable_lru(lru) {
+                       if (nr[lru]) {
                                nr_to_scan = min_t(unsigned long,
-                                                  nr[l], SWAP_CLUSTER_MAX);
-                               nr[l] -= nr_to_scan;
+                                                  nr[lru], SWAP_CLUSTER_MAX);
+                               nr[lru] -= nr_to_scan;
 
-                               nr_reclaimed += shrink_list(l, nr_to_scan,
+                               nr_reclaimed += shrink_list(lru, nr_to_scan,
                                                            mz, sc, priority);
                        }
                }
@@ -2189,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
         * If compaction is deferred, reclaim up to a point where
         * compaction will have a chance of success when re-enabled
         */
-       if (compaction_deferred(zone))
+       if (compaction_deferred(zone, sc->order))
                return watermark_ok;
 
        /* If compaction is not ready to start, keep reclaiming */
@@ -2217,7 +2223,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
  *
  * This function returns true if a zone is being reclaimed for a costly
  * high-order allocation and compaction is ready to begin. This indicates to
- * the caller that it should retry the allocation or fail.
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
  */
 static bool shrink_zones(int priority, struct zonelist *zonelist,
                                        struct scan_control *sc)
@@ -2226,7 +2233,15 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
        struct zone *zone;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
-       bool should_abort_reclaim = false;
+       bool aborted_reclaim = false;
+
+       /*
+        * If the number of buffer_heads in the machine exceeds the maximum
+        * allowed level, force direct reclaim to scan the highmem zone as
+        * highmem pages could be pinning lowmem pages storing buffer_heads
+        */
+       if (buffer_heads_over_limit)
+               sc->gfp_mask |= __GFP_HIGHMEM;
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2248,11 +2263,11 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
                                 * Even though compaction is invoked for any
                                 * non-zero order, only frequent costly order
                                 * reclamation is disruptive enough to become a
-                                * noticable problem, like transparent huge page
-                                * allocations.
+                                * noticeable problem, like transparent huge
+                                * page allocations.
                                 */
                                if (compaction_ready(zone, sc)) {
-                                       should_abort_reclaim = true;
+                                       aborted_reclaim = true;
                                        continue;
                                }
                        }
@@ -2274,7 +2289,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
                shrink_zone(priority, zone, sc);
        }
 
-       return should_abort_reclaim;
+       return aborted_reclaim;
 }
 
 static bool zone_reclaimable(struct zone *zone)
@@ -2328,9 +2343,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        struct zoneref *z;
        struct zone *zone;
        unsigned long writeback_threshold;
-       bool should_abort_reclaim;
+       bool aborted_reclaim;
 
-       get_mems_allowed();
        delayacct_freepages_start();
 
        if (global_reclaim(sc))
@@ -2340,9 +2354,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                sc->nr_scanned = 0;
                if (!priority)
                        disable_swap_token(sc->target_mem_cgroup);
-               should_abort_reclaim = shrink_zones(priority, zonelist, sc);
-               if (should_abort_reclaim)
-                       break;
+               aborted_reclaim = shrink_zones(priority, zonelist, sc);
 
                /*
                 * Don't shrink slabs when reclaiming memory from
@@ -2396,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 out:
        delayacct_freepages_end();
-       put_mems_allowed();
 
        if (sc->nr_reclaimed)
                return sc->nr_reclaimed;
@@ -2409,8 +2420,8 @@ out:
        if (oom_killer_disabled)
                return 0;
 
-       /* Aborting reclaim to try compaction? don't OOM, then */
-       if (should_abort_reclaim)
+       /* Aborted reclaim to try compaction? don't OOM, then */
+       if (aborted_reclaim)
                return 1;
 
        /* top priority shrink_zones still had more to do? don't OOM, then */
@@ -2719,6 +2730,17 @@ loop_again:
                         */
                        age_active_anon(zone, &sc, priority);
 
+                       /*
+                        * If the number of buffer_heads in the machine
+                        * exceeds the maximum allowed level and this node
+                        * has a highmem zone, force kswapd to reclaim from
+                        * it to relieve lowmem pressure.
+                        */
+                       if (buffer_heads_over_limit && is_highmem_idx(i)) {
+                               end_zone = i;
+                               break;
+                       }
+
                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), 0, 0)) {
                                end_zone = i;
@@ -2748,7 +2770,7 @@ loop_again:
                 */
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
-                       int nr_slab;
+                       int nr_slab, testorder;
                        unsigned long balance_gap;
 
                        if (!populated_zone(zone))
@@ -2781,7 +2803,21 @@ loop_again:
                                (zone->present_pages +
                                        KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                                KSWAPD_ZONE_BALANCE_GAP_RATIO);
-                       if (!zone_watermark_ok_safe(zone, order,
+                       /*
+                        * Kswapd reclaims only single pages with compaction
+                        * enabled. Trying too hard to reclaim until contiguous
+                        * free pages have become available can hurt performance
+                        * by evicting too much useful data from memory.
+                        * Do not reclaim more than needed for compaction.
+                        */
+                       testorder = order;
+                       if (COMPACTION_BUILD && order &&
+                                       compaction_suitable(zone, order) !=
+                                               COMPACT_SKIPPED)
+                               testorder = 0;
+
+                       if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
+                                   !zone_watermark_ok_safe(zone, testorder,
                                        high_wmark_pages(zone) + balance_gap,
                                        end_zone, 0)) {
                                shrink_zone(priority, zone, &sc);
@@ -2810,7 +2846,7 @@ loop_again:
                                continue;
                        }
 
-                       if (!zone_watermark_ok_safe(zone, order,
+                       if (!zone_watermark_ok_safe(zone, testorder,
                                        high_wmark_pages(zone), end_zone, 0)) {
                                all_zones_ok = 0;
                                /*
@@ -2898,6 +2934,8 @@ out:
         * and it is potentially going to sleep here.
         */
        if (order) {
+               int zones_need_compaction = 1;
+
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
 
@@ -2907,6 +2945,11 @@ out:
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;
 
+                       /* Would compaction fail due to lack of free memory? */
+                       if (COMPACTION_BUILD &&
+                           compaction_suitable(zone, order) == COMPACT_SKIPPED)
+                               goto loop_again;
+
                        /* Confirm the zone is balanced for order-0 */
                        if (!zone_watermark_ok(zone, 0,
                                        high_wmark_pages(zone), 0, 0)) {
@@ -2914,11 +2957,17 @@ out:
                                goto loop_again;
                        }
 
+                       /* Check if the memory needs to be defragmented. */
+                       if (zone_watermark_ok(zone, order,
+                                   low_wmark_pages(zone), *classzone_idx, 0))
+                               zones_need_compaction = 0;
+
                        /* If balanced, clear the congested flag */
                        zone_clear_flag(zone, ZONE_CONGESTED);
-                       if (i <= *classzone_idx)
-                               balanced += zone->present_pages;
                }
+
+               if (zones_need_compaction)
+                       compact_pgdat(pgdat, order);
        }
 
        /*
@@ -3493,100 +3542,61 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
        return 1;
 }
 
+#ifdef CONFIG_SHMEM
 /**
- * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
- * @page: page to check evictability and move to appropriate lru list
- * @zone: zone page is in
+ * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
+ * @pages:     array of pages to check
+ * @nr_pages:  number of pages to check
  *
- * Checks a page for evictability and moves the page to the appropriate
- * zone lru list.
+ * Checks pages for evictability and moves them to the appropriate lru list.
  *
- * Restrictions: zone->lru_lock must be held, page must be on LRU and must
- * have PageUnevictable set.
+ * This function is only used for SysV IPC SHM_UNLOCK.
  */
-static void check_move_unevictable_page(struct page *page, struct zone *zone)
+void check_move_unevictable_pages(struct page **pages, int nr_pages)
 {
        struct lruvec *lruvec;
+       struct zone *zone = NULL;
+       int pgscanned = 0;
+       int pgrescued = 0;
+       int i;
 
-       VM_BUG_ON(PageActive(page));
-retry:
-       ClearPageUnevictable(page);
-       if (page_evictable(page, NULL)) {
-               enum lru_list l = page_lru_base_type(page);
-
-               __dec_zone_state(zone, NR_UNEVICTABLE);
-               lruvec = mem_cgroup_lru_move_lists(zone, page,
-                                                  LRU_UNEVICTABLE, l);
-               list_move(&page->lru, &lruvec->lists[l]);
-               __inc_zone_state(zone, NR_INACTIVE_ANON + l);
-               __count_vm_event(UNEVICTABLE_PGRESCUED);
-       } else {
-               /*
-                * rotate unevictable list
-                */
-               SetPageUnevictable(page);
-               lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE,
-                                                  LRU_UNEVICTABLE);
-               list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]);
-               if (page_evictable(page, NULL))
-                       goto retry;
-       }
-}
-
-/**
- * scan_mapping_unevictable_pages - scan an address space for evictable pages
- * @mapping: struct address_space to scan for evictable pages
- *
- * Scan all pages in mapping.  Check unevictable pages for
- * evictability and move them to the appropriate zone lru list.
- */
-void scan_mapping_unevictable_pages(struct address_space *mapping)
-{
-       pgoff_t next = 0;
-       pgoff_t end   = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
-                        PAGE_CACHE_SHIFT;
-       struct zone *zone;
-       struct pagevec pvec;
-
-       if (mapping->nrpages == 0)
-               return;
-
-       pagevec_init(&pvec, 0);
-       while (next < end &&
-               pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
-               int i;
-               int pg_scanned = 0;
-
-               zone = NULL;
-
-               for (i = 0; i < pagevec_count(&pvec); i++) {
-                       struct page *page = pvec.pages[i];
-                       pgoff_t page_index = page->index;
-                       struct zone *pagezone = page_zone(page);
+       for (i = 0; i < nr_pages; i++) {
+               struct page *page = pages[i];
+               struct zone *pagezone;
 
-                       pg_scanned++;
-                       if (page_index > next)
-                               next = page_index;
-                       next++;
+               pgscanned++;
+               pagezone = page_zone(page);
+               if (pagezone != zone) {
+                       if (zone)
+                               spin_unlock_irq(&zone->lru_lock);
+                       zone = pagezone;
+                       spin_lock_irq(&zone->lru_lock);
+               }
 
-                       if (pagezone != zone) {
-                               if (zone)
-                                       spin_unlock_irq(&zone->lru_lock);
-                               zone = pagezone;
-                               spin_lock_irq(&zone->lru_lock);
-                       }
+               if (!PageLRU(page) || !PageUnevictable(page))
+                       continue;
 
-                       if (PageLRU(page) && PageUnevictable(page))
-                               check_move_unevictable_page(page, zone);
+               if (page_evictable(page, NULL)) {
+                       enum lru_list lru = page_lru_base_type(page);
+
+                       VM_BUG_ON(PageActive(page));
+                       ClearPageUnevictable(page);
+                       __dec_zone_state(zone, NR_UNEVICTABLE);
+                       lruvec = mem_cgroup_lru_move_lists(zone, page,
+                                               LRU_UNEVICTABLE, lru);
+                       list_move(&page->lru, &lruvec->lists[lru]);
+                       __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+                       pgrescued++;
                }
-               if (zone)
-                       spin_unlock_irq(&zone->lru_lock);
-               pagevec_release(&pvec);
-
-               count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
        }
 
+       if (zone) {
+               __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+               __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
+               spin_unlock_irq(&zone->lru_lock);
+       }
 }
+#endif /* CONFIG_SHMEM */
 
 static void warn_scan_unevictable_pages(void)
 {