mm: vmscan: fix typo in isolating lru pages
[linux-2.6.git] / mm / vmscan.c
index 15e3a29fdb234c20480a6438e8861834346e88fc..26f4a8a4e0c75caebff02e582d8b297f4ff2b35d 100644 (file)
@@ -183,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
  */
 void register_shrinker(struct shrinker *shrinker)
 {
-       shrinker->nr = 0;
+       atomic_long_set(&shrinker->nr_in_batch, 0);
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
        up_write(&shrinker_rwsem);
@@ -247,25 +247,26 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 
        list_for_each_entry(shrinker, &shrinker_list, list) {
                unsigned long long delta;
-               unsigned long total_scan;
-               unsigned long max_pass;
+               long total_scan;
+               long max_pass;
                int shrink_ret = 0;
                long nr;
                long new_nr;
                long batch_size = shrinker->batch ? shrinker->batch
                                                  : SHRINK_BATCH;
 
+               max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+               if (max_pass <= 0)
+                       continue;
+
                /*
                 * copy the current shrinker scan count into a local variable
                 * and zero it so that other concurrent shrinker invocations
                 * don't also do this scanning work.
                 */
-               do {
-                       nr = shrinker->nr;
-               } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+               nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
 
                total_scan = nr;
-               max_pass = do_shrinker_shrink(shrinker, shrink, 0);
                delta = (4 * nr_pages_scanned) / shrinker->seeks;
                delta *= max_pass;
                do_div(delta, lru_pages + 1);
@@ -325,12 +326,11 @@ unsigned long shrink_slab(struct shrink_control *shrink,
                 * manner that handles concurrent updates. If we exhausted the
                 * scan, there is no need to do an update.
                 */
-               do {
-                       nr = shrinker->nr;
-                       new_nr = total_scan + nr;
-                       if (total_scan <= 0)
-                               break;
-               } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+               if (total_scan > 0)
+                       new_nr = atomic_long_add_return(total_scan,
+                                       &shrinker->nr_in_batch);
+               else
+                       new_nr = atomic_long_read(&shrinker->nr_in_batch);
 
                trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
        }
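
The two hunks above replace the open-coded cmpxchg() loops with a single
atomic_long_xchg() to claim the deferred scan count and a single
atomic_long_add_return() to hand any unscanned remainder back. A minimal
userspace sketch of the same claim/deposit pattern, using C11 atomics in
place of the kernel's atomic_long_t (the names are illustrative only):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long nr_in_batch = 128;   /* deferred work left by earlier calls */

    /* Claim all pending work so concurrent callers do not scan it twice. */
    static long claim_batch(void)
    {
            return atomic_exchange(&nr_in_batch, 0);
    }

    /* Hand back whatever was not scanned; a later caller will pick it up. */
    static long deposit_remainder(long total_scan)
    {
            if (total_scan > 0)
                    return atomic_fetch_add(&nr_in_batch, total_scan) + total_scan;
            return atomic_load(&nr_in_batch);
    }

    int main(void)
    {
            long nr = claim_batch();                  /* ~ atomic_long_xchg(..., 0) */
            long new_nr = deposit_remainder(nr - 10); /* pretend 10 objects scanned */
            printf("claimed %ld, left behind %ld\n", nr, new_nr);
            return 0;
    }
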
@@ -633,13 +633,14 @@ redo:
                lru = LRU_UNEVICTABLE;
                add_page_to_unevictable_list(page);
                /*
-                * When racing with an mlock clearing (page is
-                * unlocked), make sure that if the other thread does
-                * not observe our setting of PG_lru and fails
-                * isolation, we see PG_mlocked cleared below and move
+                * When racing with an mlock or AS_UNEVICTABLE clearing
+                * (page is unlocked) make sure that if the other thread
+                * does not observe our setting of PG_lru and fails
+                * isolation/check_move_unevictable_page,
+                * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
                 * the page back to the evictable list.
                 *
-                * The other side is TestClearPageMlocked().
+                * The other side is TestClearPageMlocked() or shmem_lock().
                 */
                smp_mb();
        }
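
The widened comment still documents the same store/load ordering: each side
publishes its own flag change before testing the other side's, so at least
one of the two racing threads must observe the other's update. A hedged
userspace model of that guarantee, with sequentially consistent atomics
standing in for smp_mb() and the page flags reduced to two integers (this is
not the page-flag code itself):

    #include <stdatomic.h>
    #include <pthread.h>
    #include <stdio.h>

    static atomic_int pg_lru;              /* set by the putback side      */
    static atomic_int pg_mlocked = 1;      /* cleared by the munlock side  */

    /* putback_lru_page() side: publish PG_lru, then re-check PG_mlocked. */
    static void *putback_side(void *arg)
    {
            atomic_store(&pg_lru, 1);
            if (!atomic_load(&pg_mlocked))
                    puts("putback: mlock gone, move page back to evictable");
            return NULL;
    }

    /* munlock side: clear PG_mlocked, then try to isolate via PG_lru. */
    static void *munlock_side(void *arg)
    {
            atomic_store(&pg_mlocked, 0);  /* like TestClearPageMlocked()  */
            if (atomic_load(&pg_lru))
                    puts("munlock: saw PG_lru, isolation rescues the page");
            return NULL;
    }

    int main(void)   /* build with -pthread; at least one line always prints */
    {
            pthread_t a, b;
            pthread_create(&a, NULL, putback_side, NULL);
            pthread_create(&b, NULL, munlock_side, NULL);
            pthread_join(a, NULL);
            pthread_join(b, NULL);
            return 0;
    }
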
@@ -714,7 +715,13 @@ static enum page_references page_check_references(struct page *page,
                 */
                SetPageReferenced(page);
 
-               if (referenced_page)
+               if (referenced_page || referenced_ptes > 1)
+                       return PAGEREF_ACTIVATE;
+
+               /*
+                * Activate file-backed executable pages after first usage.
+                */
+               if (vm_flags & VM_EXEC)
                        return PAGEREF_ACTIVATE;
 
                return PAGEREF_KEEP;
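
The new tests change when a referenced page-cache page gets promoted: a page
referenced from more than one pte, or one that already had PG_referenced set,
goes straight back to the active list, and executable file pages are now
activated on first use. A standalone sketch of just that branch (the helper
name and plain parameters are made up for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    enum pageref { PAGEREF_KEEP, PAGEREF_ACTIVATE };

    /* Mirrors only the referenced-file-page branch added above. */
    static enum pageref file_page_decision(int referenced_ptes,
                                           bool referenced_page, bool vm_exec)
    {
            if (referenced_page || referenced_ptes > 1)
                    return PAGEREF_ACTIVATE;   /* hot, or mapped several times */
            if (vm_exec)
                    return PAGEREF_ACTIVATE;   /* executable: promote on first use */
            return PAGEREF_KEEP;               /* give it another lap on inactive */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   file_page_decision(1, false, false),   /* 0: keep     */
                   file_page_decision(2, false, false),   /* 1: activate */
                   file_page_decision(1, false, true));   /* 1: activate */
            return 0;
    }
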
@@ -727,31 +734,15 @@ static enum page_references page_check_references(struct page *page,
        return PAGEREF_RECLAIM;
 }
 
-static noinline_for_stack void free_page_list(struct list_head *free_pages)
-{
-       struct pagevec freed_pvec;
-       struct page *page, *tmp;
-
-       pagevec_init(&freed_pvec, 1);
-
-       list_for_each_entry_safe(page, tmp, free_pages, lru) {
-               list_del(&page->lru);
-               if (!pagevec_add(&freed_pvec, page)) {
-                       __pagevec_free(&freed_pvec);
-                       pagevec_reinit(&freed_pvec);
-               }
-       }
-
-       pagevec_free(&freed_pvec);
-}
-
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
                                      struct zone *zone,
                                      struct scan_control *sc,
-                                     int priority)
+                                     int priority,
+                                     unsigned long *ret_nr_dirty,
+                                     unsigned long *ret_nr_writeback)
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
@@ -759,6 +750,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
        unsigned long nr_dirty = 0;
        unsigned long nr_congested = 0;
        unsigned long nr_reclaimed = 0;
+       unsigned long nr_writeback = 0;
 
        cond_resched();
 
@@ -795,6 +787,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
                if (PageWriteback(page)) {
+                       nr_writeback++;
                        /*
                         * Synchronous reclaim cannot queue pages for
                         * writeback due to the possibility of stack overflow
@@ -862,7 +855,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         */
                        if (page_is_file_cache(page) &&
                                        (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
-                               inc_zone_page_state(page, NR_VMSCAN_WRITE_SKIP);
+                               /*
+                                * Immediately reclaim when written back.
+                                * Similar in principle to deactivate_page()
+                                * except we already have the page isolated
+                                * and know it's dirty
+                                */
+                               inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+                               SetPageReclaim(page);
+
                                goto keep_locked;
                        }
 
@@ -996,10 +997,12 @@ keep_lumpy:
        if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
                zone_set_flag(zone, ZONE_CONGESTED);
 
-       free_page_list(&free_pages);
+       free_hot_cold_page_list(&free_pages, 1);
 
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
+       *ret_nr_dirty += nr_dirty;
+       *ret_nr_writeback += nr_writeback;
        return nr_reclaimed;
 }
 
@@ -1163,14 +1166,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                         * anon page which don't already have a swap slot is
                         * pointless.
                         */
-                       if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+                       if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
                            !PageSwapCache(cursor_page))
                                break;
 
                        if (__isolate_lru_page(cursor_page, mode, file) == 0) {
                                list_move(&cursor_page->lru, dst);
                                mem_cgroup_del_lru(cursor_page);
-                               nr_taken += hpage_nr_pages(page);
+                               nr_taken += hpage_nr_pages(cursor_page);
                                nr_lumpy_taken++;
                                if (PageDirty(cursor_page))
                                        nr_lumpy_dirty++;
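
This hunk is the typo the commit subject refers to: when lumpy reclaim pulls
in a neighbouring transparent huge page, nr_taken has to be credited with the
size of cursor_page, not of the page the scan started from. A toy
illustration of how far the accounting can drift (512 pages per 2MB THP is an
assumption for x86_64):

    #include <stdio.h>

    #define HPAGE_NR 512                    /* pages in one 2MB huge page */

    struct page { int is_thp_head; };

    static int hpage_nr_pages(const struct page *p)
    {
            return p->is_thp_head ? HPAGE_NR : 1;
    }

    int main(void)
    {
            struct page page = { .is_thp_head = 0 };        /* base page scanned  */
            struct page cursor_page = { .is_thp_head = 1 }; /* THP found by lumpy */
            printf("buggy: nr_taken += %d\n", hpage_nr_pages(&page));        /* 1   */
            printf("fixed: nr_taken += %d\n", hpage_nr_pages(&cursor_page)); /* 512 */
            return 0;
    }
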
@@ -1460,6 +1463,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        unsigned long nr_taken;
        unsigned long nr_anon;
        unsigned long nr_file;
+       unsigned long nr_dirty = 0;
+       unsigned long nr_writeback = 0;
        isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
 
        while (unlikely(too_many_isolated(zone, file, sc))) {
@@ -1512,12 +1517,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
        spin_unlock_irq(&zone->lru_lock);
 
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority);
+       nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+                                               &nr_dirty, &nr_writeback);
 
        /* Check if we should synchronously wait for writeback */
        if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                set_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, zone, sc, priority);
+               nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+                                       priority, &nr_dirty, &nr_writeback);
        }
 
        local_irq_disable();
@@ -1527,6 +1534,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
        putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
 
+       /*
+        * If reclaim is isolating dirty pages under writeback, it implies
+        * that the long-lived page allocation rate is exceeding the page
+        * laundering rate. Either the global limits are not being effective
+        * at throttling processes due to the page distribution throughout
+        * zones or there is heavy usage of a slow backing device. The
+        * only option is to throttle from reclaim context which is not ideal
+        * as there is no guarantee the dirtying process is throttled in the
+        * same way balance_dirty_pages() manages.
+        *
+        * This scales the number of dirty pages that must be under writeback
+        * before throttling depending on priority. It is a simple backoff
+        * function that has the most effect in the range DEF_PRIORITY to
+        * DEF_PRIORITY-2, which is the priority range in which reclaim
+        * is considered to be in trouble.
+        *
+        * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
+        * DEF_PRIORITY-1  50% must be PageWriteback
+        * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
+        * ...
+        * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+        *                     isolated page is PageWriteback
+        */
+       if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+               wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
                zone_idx(zone),
                nr_scanned, nr_reclaimed,
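
The throttle test nr_writeback >= (nr_taken >> (DEF_PRIORITY - priority))
implements the backoff table in the comment. A quick check of the arithmetic
for a full batch of SWAP_CLUSTER_MAX (32) isolated pages, with DEF_PRIORITY
at its usual value of 12:

    #include <stdio.h>

    #define DEF_PRIORITY     12
    #define SWAP_CLUSTER_MAX 32

    int main(void)
    {
            unsigned long nr_taken = SWAP_CLUSTER_MAX;
            int priority;

            for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--) {
                    unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);
                    printf("priority %2d: throttle once %2lu of %lu pages are in writeback\n",
                           priority, threshold, nr_taken);
            }
            /* Prints 32, 16, 8, 4, 2, 1, 0: by DEF_PRIORITY-6 the threshold is 0,  */
            /* so together with the nr_writeback != 0 test a single page suffices.  */
            return 0;
    }
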
@@ -1722,7 +1755,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
        if (scanning_global_lru(sc))
                low = inactive_anon_is_low_global(zone);
        else
-               low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+               low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
        return low;
 }
 #else
@@ -1765,7 +1798,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
        if (scanning_global_lru(sc))
                low = inactive_file_is_low_global(zone);
        else
-               low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+               low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
        return low;
 }
 
@@ -1967,8 +2000,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
-       inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
-                               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       if (nr_swap_pages > 0)
+               inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
        if (sc->nr_reclaimed < pages_for_compaction &&
                        inactive_lru_pages > pages_for_compaction)
                return true;
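
With the change above, inactive anonymous pages only count toward the
"enough left to keep reclaiming" test when swap is available, since without
swap they cannot be reclaimed at all. Rough numbers for a THP-sized request
(order 9 for a 2MB huge page is an assumption):

    #include <stdbool.h>
    #include <stdio.h>

    static bool keep_reclaiming(int order, unsigned long nr_reclaimed,
                                unsigned long inactive_file,
                                unsigned long inactive_anon, long nr_swap_pages)
    {
            unsigned long pages_for_compaction = 2UL << order;  /* 1024 for order 9 */
            unsigned long inactive_lru_pages = inactive_file;

            if (nr_swap_pages > 0)          /* anon is only reclaimable with swap */
                    inactive_lru_pages += inactive_anon;

            return nr_reclaimed < pages_for_compaction &&
                   inactive_lru_pages > pages_for_compaction;
    }

    int main(void)
    {
            /* 800 file + 600 anon inactive pages, 200 reclaimed so far, order 9 */
            printf("with swap:    %d\n", keep_reclaiming(9, 200, 800, 600, 1000)); /* 1 */
            printf("without swap: %d\n", keep_reclaiming(9, 200, 800, 600, 0));    /* 0 */
            return 0;
    }
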
@@ -2058,14 +2092,19 @@ restart:
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is either ready to begin or deferred.
+ * This indicates to the caller that it should retry the allocation or fail.
  */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
                                        struct scan_control *sc)
 {
        struct zoneref *z;
        struct zone *zone;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
+       bool should_abort_reclaim = false;
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2080,6 +2119,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
                                continue;
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;       /* Let kswapd poll it */
+                       if (COMPACTION_BUILD) {
+                               /*
+                                * If we already have plenty of memory free for
+                                * compaction in this zone, don't free any more.
+                                * Even though compaction is invoked for any
+                                * non-zero order, only frequent costly order
+                                * reclamation is disruptive enough to become a
+                                * noticeable problem, like transparent huge page
+                                * allocations.
+                                */
+                               if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+                                       (compaction_suitable(zone, sc->order) ||
+                                        compaction_deferred(zone))) {
+                                       should_abort_reclaim = true;
+                                       continue;
+                               }
+                       }
                        /*
                         * This steals pages from memory cgroups over softlimit
                         * and returns the number of reclaimed pages and
@@ -2097,6 +2153,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 
                shrink_zone(priority, zone, sc);
        }
+
+       return should_abort_reclaim;
 }
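
For costly orders the new branch skips zones where compaction could already
run (or has been deferred) and remembers that fact so do_try_to_free_pages()
can stop reclaiming early. The decision itself, compressed into a helper with
PAGE_ALLOC_COSTLY_ORDER at its usual value of 3 and the compaction predicates
reduced to booleans for the sketch:

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_ALLOC_COSTLY_ORDER 3

    /* Skip the zone (and later abort reclaim) when compaction can take over. */
    static bool skip_zone_for_compaction(int order, bool suitable, bool deferred)
    {
            return order > PAGE_ALLOC_COSTLY_ORDER && (suitable || deferred);
    }

    int main(void)
    {
            printf("order 9, suitable: %d\n", skip_zone_for_compaction(9, true, false));  /* 1 */
            printf("order 9, neither:  %d\n", skip_zone_for_compaction(9, false, false)); /* 0 */
            printf("order 2, suitable: %d\n", skip_zone_for_compaction(2, true, false));  /* 0 */
            return 0;
    }
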
 
 static bool zone_reclaimable(struct zone *zone)
@@ -2161,7 +2219,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                sc->nr_scanned = 0;
                if (!priority)
                        disable_swap_token(sc->mem_cgroup);
-               shrink_zones(priority, zonelist, sc);
+               if (shrink_zones(priority, zonelist, sc))
+                       break;
+
                /*
                 * Don't shrink slabs when reclaiming memory from
                 * over limit cgroups
@@ -2195,7 +2255,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                 */
                writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
                if (total_scanned > writeback_threshold) {
-                       wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+                       wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+                                               WB_REASON_TRY_TO_FREE_PAGES);
                        sc->may_writepage = 1;
                }
 
@@ -2704,6 +2765,8 @@ out:
 
                        /* If balanced, clear the congested flag */
                        zone_clear_flag(zone, ZONE_CONGESTED);
+                       if (i <= *classzone_idx)
+                               balanced += zone->present_pages;
                }
        }
 
@@ -2777,7 +2840,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
        unsigned long order, new_order;
+       unsigned balanced_order;
        int classzone_idx, new_classzone_idx;
+       int balanced_classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
 
@@ -2808,7 +2873,9 @@ static int kswapd(void *p)
        set_freezable();
 
        order = new_order = 0;
+       balanced_order = 0;
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+       balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
                int ret;
 
@@ -2817,7 +2884,8 @@ static int kswapd(void *p)
                 * new request of a similar or harder type will succeed soon
                 * so consider going to sleep on the basis we reclaimed at
                 */
-               if (classzone_idx >= new_classzone_idx && order == new_order) {
+               if (balanced_classzone_idx >= new_classzone_idx &&
+                                       balanced_order == new_order) {
                        new_order = pgdat->kswapd_max_order;
                        new_classzone_idx = pgdat->classzone_idx;
                        pgdat->kswapd_max_order =  0;
@@ -2832,9 +2900,12 @@ static int kswapd(void *p)
                        order = new_order;
                        classzone_idx = new_classzone_idx;
                } else {
-                       kswapd_try_to_sleep(pgdat, order, classzone_idx);
+                       kswapd_try_to_sleep(pgdat, balanced_order,
+                                               balanced_classzone_idx);
                        order = pgdat->kswapd_max_order;
                        classzone_idx = pgdat->classzone_idx;
+                       new_order = order;
+                       new_classzone_idx = classzone_idx;
                        pgdat->kswapd_max_order = 0;
                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                }
@@ -2849,7 +2920,9 @@ static int kswapd(void *p)
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       order = balance_pgdat(pgdat, order, &classzone_idx);
+                       balanced_classzone_idx = classzone_idx;
+                       balanced_order = balance_pgdat(pgdat, order,
+                                               &balanced_classzone_idx);
                }
        }
        return 0;
@@ -3361,66 +3434,13 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
 
 }
 
-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable.  Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them.  Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
+static void warn_scan_unevictable_pages(void)
 {
-       struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
-       unsigned long scan;
-       unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
-       while (nr_to_scan > 0) {
-               unsigned long batch_size = min(nr_to_scan,
-                                               SCAN_UNEVICTABLE_BATCH_SIZE);
-
-               spin_lock_irq(&zone->lru_lock);
-               for (scan = 0;  scan < batch_size; scan++) {
-                       struct page *page = lru_to_page(l_unevictable);
-
-                       if (!trylock_page(page))
-                               continue;
-
-                       prefetchw_prev_lru_page(page, l_unevictable, flags);
-
-                       if (likely(PageLRU(page) && PageUnevictable(page)))
-                               check_move_unevictable_page(page, zone);
-
-                       unlock_page(page);
-               }
-               spin_unlock_irq(&zone->lru_lock);
-
-               nr_to_scan -= batch_size;
-       }
-}
-
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer:  scan all zones' unevictable LRU lists to check for
- * pages that have become evictable.  Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system.  As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
-{
-       struct zone *zone;
-
-       for_each_zone(zone) {
-               scan_zone_unevictable_pages(zone);
-       }
+       printk_once(KERN_WARNING
+                   "%s: The scan_unevictable_pages sysctl/node-interface has been "
+                   "disabled for lack of a legitimate use case.  If you have "
+                   "one, please send an email to linux-mm@kvack.org.\n",
+                   current->comm);
 }
 
 /*
@@ -3433,11 +3453,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
                           void __user *buffer,
                           size_t *length, loff_t *ppos)
 {
+       warn_scan_unevictable_pages();
        proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
-       if (write && *(unsigned long *)table->data)
-               scan_all_zones_unevictable_pages();
-
        scan_unevictable_pages = 0;
        return 0;
 }
@@ -3448,45 +3465,34 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
  * a specified node's per zone unevictable lists for evictable pages.
  */
 
-static ssize_t read_scan_unevictable_node(struct sys_device *dev,
-                                         struct sysdev_attribute *attr,
+static ssize_t read_scan_unevictable_node(struct device *dev,
+                                         struct device_attribute *attr,
                                          char *buf)
 {
+       warn_scan_unevictable_pages();
        return sprintf(buf, "0\n");     /* always zero; should fit... */
 }
 
-static ssize_t write_scan_unevictable_node(struct sys_device *dev,
-                                          struct sysdev_attribute *attr,
+static ssize_t write_scan_unevictable_node(struct device *dev,
+                                          struct device_attribute *attr,
                                        const char *buf, size_t count)
 {
-       struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
-       struct zone *zone;
-       unsigned long res;
-       unsigned long req = strict_strtoul(buf, 10, &res);
-
-       if (!req)
-               return 1;       /* zero is no-op */
-
-       for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-               if (!populated_zone(zone))
-                       continue;
-               scan_zone_unevictable_pages(zone);
-       }
+       warn_scan_unevictable_pages();
        return 1;
 }
 
 
-static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
                        read_scan_unevictable_node,
                        write_scan_unevictable_node);
 
 int scan_unevictable_register_node(struct node *node)
 {
-       return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
+       return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
 }
 
 void scan_unevictable_unregister_node(struct node *node)
 {
-       sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
+       device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
 }
 #endif