mm: do batched scans for mem_cgroup
Wu Fengguang [Tue, 22 Sep 2009 00:03:11 +0000 (17:03 -0700)]
For mem_cgroup, shrink_zone() may call shrink_list() with nr_to_scan=1, in
which case shrink_list() _still_ calls isolate_pages() with the much
larger SWAP_CLUSTER_MAX.  It effectively scales up the inactive list scan
rate by up to 32 times.

For example, with 16k inactive pages and DEF_PRIORITY=12, (16k >> 12)=4.
So when shrink_zone() expects to scan 4 pages in the active/inactive list,
the active list will be scanned 4 pages, while the inactive list will be
(over) scanned SWAP_CLUSTER_MAX=32 pages in effect.  And that could break
the balance between the two lists.

It can further impact the scan of anon active list, due to the anon
active/inactive ratio rebalance logic in balance_pgdat()/shrink_zone():

inactive anon list over scanned => inactive_anon_is_low() == TRUE
                                => shrink_active_list()
                                => active anon list over scanned

So the end result may be

- anon inactive  => over scanned
- anon active    => over scanned (maybe not as much)
- file inactive  => over scanned
- file active    => under scanned (relatively)

The accesses to nr_saved_scan are not lock protected and so not 100%
accurate, however we can tolerate small errors and the resulted small
imbalanced scan rates between zones.

Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

include/linux/mmzone.h
mm/page_alloc.c
mm/vmscan.c

index 9c50309..c188ea6 100644 (file)
@@ -273,6 +273,11 @@ struct zone_reclaim_stat {
         */
        unsigned long           recent_rotated[2];
        unsigned long           recent_scanned[2];
+
+       /*
+        * accumulated for batching
+        */
+       unsigned long           nr_saved_scan[NR_LRU_LISTS];
 };
 
 struct zone {
@@ -327,7 +332,6 @@ struct zone {
        spinlock_t              lru_lock;       
        struct zone_lru {
                struct list_head list;
-               unsigned long nr_saved_scan;    /* accumulated for batching */
        } lru[NR_LRU_LISTS];
 
        struct zone_reclaim_stat reclaim_stat;
index 770f011..84d9da1 100644 (file)
@@ -3809,7 +3809,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone_pcp_init(zone);
                for_each_lru(l) {
                        INIT_LIST_HEAD(&zone->lru[l].list);
-                       zone->lru[l].nr_saved_scan = 0;
+                       zone->reclaim_stat.nr_saved_scan[l] = 0;
                }
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
index 5432c23..0e7f5e4 100644 (file)
@@ -1586,6 +1586,7 @@ static void shrink_zone(int priority, struct zone *zone,
        enum lru_list l;
        unsigned long nr_reclaimed = sc->nr_reclaimed;
        unsigned long swap_cluster_max = sc->swap_cluster_max;
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        int noswap = 0;
 
        /* If we have no swap space, do not bother scanning anon pages. */
@@ -1605,12 +1606,9 @@ static void shrink_zone(int priority, struct zone *zone,
                        scan >>= priority;
                        scan = (scan * percent[file]) / 100;
                }
-               if (scanning_global_lru(sc))
-                       nr[l] = nr_scan_try_batch(scan,
-                                                 &zone->lru[l].nr_saved_scan,
-                                                 swap_cluster_max);
-               else
-                       nr[l] = scan;
+               nr[l] = nr_scan_try_batch(scan,
+                                         &reclaim_stat->nr_saved_scan[l],
+                                         swap_cluster_max);
        }
 
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2220,6 +2218,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 {
        struct zone *zone;
        unsigned long nr_reclaimed = 0;
+       struct zone_reclaim_stat *reclaim_stat;
 
        for_each_populated_zone(zone) {
                enum lru_list l;
@@ -2236,11 +2235,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
                                                l == LRU_ACTIVE_FILE))
                                continue;
 
-                       zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
-                       if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
+                       reclaim_stat = get_reclaim_stat(zone, sc);
+                       reclaim_stat->nr_saved_scan[l] +=
+                                               (lru_pages >> prio) + 1;
+                       if (reclaim_stat->nr_saved_scan[l]
+                                               >= nr_pages || pass > 3) {
                                unsigned long nr_to_scan;
 
-                               zone->lru[l].nr_saved_scan = 0;
+                               reclaim_stat->nr_saved_scan[l] = 0;
                                nr_to_scan = min(nr_pages, lru_pages);
                                nr_reclaimed += shrink_list(l, nr_to_scan, zone,
                                                                sc, prio);