tmpfs: convert shmem_truncate_range to radix-swap
[linux-2.6.git] / mm / memcontrol.c
index c39a177..5f84d23 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
        MEM_CGROUP_TARGET_SOFTLIMIT,
+       MEM_CGROUP_TARGET_NUMAINFO,
        MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
        long count[MEM_CGROUP_STAT_NSTATS];
@@ -202,6 +205,50 @@ struct mem_cgroup_eventfd_list {
 static void mem_cgroup_threshold(struct mem_cgroup *mem);
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
 
+enum {
+       SCAN_BY_LIMIT,
+       SCAN_BY_SYSTEM,
+       NR_SCAN_CONTEXT,
+       SCAN_BY_SHRINK, /* not recorded now */
+};
+
+enum {
+       SCAN,
+       SCAN_ANON,
+       SCAN_FILE,
+       ROTATE,
+       ROTATE_ANON,
+       ROTATE_FILE,
+       FREED,
+       FREED_ANON,
+       FREED_FILE,
+       ELAPSED,
+       NR_SCANSTATS,
+};
+
+struct scanstat {
+       spinlock_t      lock;
+       unsigned long   stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
+       unsigned long   rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
+};
+
+const char *scanstat_string[NR_SCANSTATS] = {
+       "scanned_pages",
+       "scanned_anon_pages",
+       "scanned_file_pages",
+       "rotated_pages",
+       "rotated_anon_pages",
+       "rotated_file_pages",
+       "freed_pages",
+       "freed_anon_pages",
+       "freed_file_pages",
+       "elapsed_ns",
+};
+#define SCANSTAT_WORD_LIMIT    "_by_limit"
+#define SCANSTAT_WORD_SYSTEM   "_by_system"
+#define SCANSTAT_WORD_HIERARCHY        "_under_hierarchy"
+
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -236,16 +283,20 @@ struct mem_cgroup {
        int last_scanned_node;
 #if MAX_NUMNODES > 1
        nodemask_t      scan_nodes;
-       unsigned long   next_scan_node_update;
+       atomic_t        numainfo_events;
+       atomic_t        numainfo_updating;
 #endif
        /*
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;
-       atomic_t        oom_lock;
+
+       bool            oom_lock;
+       atomic_t        under_oom;
+
        atomic_t        refcnt;
 
-       unsigned int    swappiness;
+       int     swappiness;
        /* OOM-Killer disable */
        int             oom_kill_disable;
 
@@ -263,7 +314,8 @@ struct mem_cgroup {
 
        /* For oom notifier event fd */
        struct list_head oom_notify;
-
+       /* For recording LRU-scan statistics */
+       struct scanstat scanstat;
        /*
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
@@ -576,15 +628,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
        return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-       long ret;
-
-       ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-       ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-       return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
                                         bool charge)
 {
@@ -641,27 +684,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        preempt_enable();
 }
 
-static unsigned long
-mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+unsigned long
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
+                       unsigned int lru_mask)
 {
        struct mem_cgroup_per_zone *mz;
+       enum lru_list l;
+       unsigned long ret = 0;
+
+       mz = mem_cgroup_zoneinfo(mem, nid, zid);
+
+       for_each_lru(l) {
+               if (BIT(l) & lru_mask)
+                       ret += MEM_CGROUP_ZSTAT(mz, l);
+       }
+       return ret;
+}
+
+static unsigned long
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
+                       int nid, unsigned int lru_mask)
+{
        u64 total = 0;
        int zid;
 
-       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-               mz = mem_cgroup_zoneinfo(mem, nid, zid);
-               total += MEM_CGROUP_ZSTAT(mz, idx);
-       }
+       for (zid = 0; zid < MAX_NR_ZONES; zid++)
+               total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
+
        return total;
 }
-static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
-                                       enum lru_list idx)
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
+                       unsigned int lru_mask)
 {
        int nid;
        u64 total = 0;
 
-       for_each_online_node(nid)
-               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
+       for_each_node_state(nid, N_HIGH_MEMORY)
+               total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
        return total;
 }
 
@@ -688,6 +748,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
        case MEM_CGROUP_TARGET_SOFTLIMIT:
                next = val + SOFTLIMIT_EVENTS_TARGET;
                break;
+       case MEM_CGROUP_TARGET_NUMAINFO:
+               next = val + NUMAINFO_EVENTS_TARGET;
+               break;
        default:
                return;
        }
@@ -706,11 +769,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
                mem_cgroup_threshold(mem);
                __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
                if (unlikely(__memcg_event_check(mem,
-                       MEM_CGROUP_TARGET_SOFTLIMIT))){
+                            MEM_CGROUP_TARGET_SOFTLIMIT))) {
                        mem_cgroup_update_tree(mem, page);
                        __mem_cgroup_target_update(mem,
-                               MEM_CGROUP_TARGET_SOFTLIMIT);
+                                                  MEM_CGROUP_TARGET_SOFTLIMIT);
+               }
+#if MAX_NUMNODES > 1
+               if (unlikely(__memcg_event_check(mem,
+                       MEM_CGROUP_TARGET_NUMAINFO))) {
+                       atomic_inc(&mem->numainfo_events);
+                       __mem_cgroup_target_update(mem,
+                               MEM_CGROUP_TARGET_NUMAINFO);
                }
+#endif
        }
 }
 
@@ -1037,6 +1108,21 @@ void mem_cgroup_move_lists(struct page *page,
        mem_cgroup_add_lru_list(page, to);
 }
 
+/*
+ * Checks whether given mem is same or in the root_mem's
+ * hierarchy subtree
+ */
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
+               struct mem_cgroup *mem)
+{
+       if (root_mem != mem) {
+               return (root_mem->use_hierarchy &&
+                       css_is_ancestor(&mem->css, &root_mem->css));
+       }
+
+       return true;
+}
+
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
        int ret;
@@ -1056,10 +1142,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
         * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
         * hierarchy(even if use_hierarchy is disabled in "mem").
         */
-       if (mem->use_hierarchy)
-               ret = css_is_ancestor(&curr->css, &mem->css);
-       else
-               ret = (curr == mem);
+       ret = mem_cgroup_same_or_subtree(mem, curr);
        css_put(&curr->css);
        return ret;
 }
@@ -1071,8 +1154,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
        unsigned long gb;
        unsigned long inactive_ratio;
 
-       inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
-       active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
+       inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
+       active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
 
        gb = (inactive + active) >> (30 - PAGE_SHIFT);
        if (gb)
@@ -1111,110 +1194,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
        unsigned long active;
        unsigned long inactive;
 
-       inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
-       active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
+       inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
+       active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
 
        return (active > inactive);
 }
 
-unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
-                                               struct zone *zone,
-                                               enum lru_list lru)
-{
-       int nid = zone_to_nid(zone);
-       int zid = zone_idx(zone);
-       struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
-       return MEM_CGROUP_ZSTAT(mz, lru);
-}
-
-#ifdef CONFIG_NUMA
-static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-       unsigned long ret;
-
-       ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
-               mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
-
-       return ret;
-}
-
-static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
-{
-       u64 total = 0;
-       int nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY)
-               total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
-
-       return total;
-}
-
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-       unsigned long ret;
-
-       ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-               mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-       return ret;
-}
-
-static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
-{
-       u64 total = 0;
-       int nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY)
-               total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
-
-       return total;
-}
-
-static unsigned long
-mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
-{
-       return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
-}
-
-static unsigned long
-mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
-{
-       u64 total = 0;
-       int nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY)
-               total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
-
-       return total;
-}
-
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-       enum lru_list l;
-       u64 total = 0;
-
-       for_each_lru(l)
-               total += mem_cgroup_get_zonestat_node(memcg, nid, l);
-
-       return total;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
-{
-       u64 total = 0;
-       int nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY)
-               total += mem_cgroup_node_nr_lru_pages(memcg, nid);
-
-       return total;
-}
-#endif /* CONFIG_NUMA */
-
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
                                                      struct zone *zone)
 {
@@ -1324,7 +1309,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
        return margin >> PAGE_SHIFT;
 }
 
-static unsigned int get_swappiness(struct mem_cgroup *memcg)
+int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
        struct cgroup *cgrp = memcg->css.cgroup;
 
@@ -1396,10 +1381,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
        to = mc.to;
        if (!from)
                goto unlock;
-       if (from == mem || to == mem
-           || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
-           || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
-               ret = true;
+
+       ret = mem_cgroup_same_or_subtree(mem, from)
+               || mem_cgroup_same_or_subtree(mem, to);
 unlock:
        spin_unlock(&mc.lock);
        return ret;
@@ -1558,6 +1542,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
        return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+               int nid, bool noswap)
+{
+       if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
+               return true;
+       if (noswap || !total_swap_pages)
+               return false;
+       if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
+               return true;
+       return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1575,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
        int nid;
-
-       if (time_after(mem->next_scan_node_update, jiffies))
+       /*
+        * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+        * pagein/pageout changes since the last update.
+        */
+       if (!atomic_read(&mem->numainfo_events))
+               return;
+       if (atomic_inc_return(&mem->numainfo_updating) > 1)
                return;
 
-       mem->next_scan_node_update = jiffies + 10*HZ;
        /* make a nodemask where this memcg uses memory from */
        mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
        for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-                       continue;
-
-               if (total_swap_pages &&
-                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-                       continue;
-               node_clear(nid, mem->scan_nodes);
+               if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+                       node_clear(nid, mem->scan_nodes);
        }
+
+       atomic_set(&mem->numainfo_events, 0);
+       atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1626,13 +1632,91 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
        return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+       int nid;
+
+       /*
+        * quick check...making use of scan_node.
+        * We can skip unused nodes.
+        */
+       if (!nodes_empty(mem->scan_nodes)) {
+               for (nid = first_node(mem->scan_nodes);
+                    nid < MAX_NUMNODES;
+                    nid = next_node(nid, mem->scan_nodes)) {
+
+                       if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+                               return true;
+               }
+       }
+       /*
+        * Check rest of nodes.
+        */
+       for_each_node_state(nid, N_HIGH_MEMORY) {
+               if (node_isset(nid, mem->scan_nodes))
+                       continue;
+               if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+                       return true;
+       }
+       return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
        return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+       return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
+static void __mem_cgroup_record_scanstat(unsigned long *stats,
+                          struct memcg_scanrecord *rec)
+{
+
+       stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
+       stats[SCAN_ANON] += rec->nr_scanned[0];
+       stats[SCAN_FILE] += rec->nr_scanned[1];
+
+       stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
+       stats[ROTATE_ANON] += rec->nr_rotated[0];
+       stats[ROTATE_FILE] += rec->nr_rotated[1];
+
+       stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
+       stats[FREED_ANON] += rec->nr_freed[0];
+       stats[FREED_FILE] += rec->nr_freed[1];
+
+       stats[ELAPSED] += rec->elapsed;
+}
+
+static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
+{
+       struct mem_cgroup *mem;
+       int context = rec->context;
+
+       if (context >= NR_SCAN_CONTEXT)
+               return;
+
+       mem = rec->mem;
+       spin_lock(&mem->scanstat.lock);
+       __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
+       spin_unlock(&mem->scanstat.lock);
+
+       mem = rec->root;
+       spin_lock(&mem->scanstat.lock);
+       __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
+       spin_unlock(&mem->scanstat.lock);
+}
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1657,20 +1741,36 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
        bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
        bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
        bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+       struct memcg_scanrecord rec;
        unsigned long excess;
-       unsigned long nr_scanned;
+       unsigned long scanned;
 
        excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
        /* If memsw_is_minimum==1, swap-out is of-no-use. */
-       if (!check_soft && root_mem->memsw_is_minimum)
+       if (!check_soft && !shrink && root_mem->memsw_is_minimum)
                noswap = true;
 
+       if (shrink)
+               rec.context = SCAN_BY_SHRINK;
+       else if (check_soft)
+               rec.context = SCAN_BY_SYSTEM;
+       else
+               rec.context = SCAN_BY_LIMIT;
+
+       rec.root = root_mem;
+
        while (1) {
                victim = mem_cgroup_select_victim(root_mem);
                if (victim == root_mem) {
                        loop++;
-                       if (loop >= 1)
+                       /*
+                        * We are not draining per cpu cached charges during
+                        * soft limit reclaim  because global reclaim doesn't
+                        * care about charges. It tries to free some memory and
+                        * charges will not give any.
+                        */
+                       if (!check_soft && loop >= 1)
                                drain_all_stock_async(root_mem);
                        if (loop >= 2) {
                                /*
@@ -1695,20 +1795,28 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                                }
                        }
                }
-               if (!mem_cgroup_local_usage(victim)) {
+               if (!mem_cgroup_reclaimable(victim, noswap)) {
                        /* this cgroup's local usage == 0 */
                        css_put(&victim->css);
                        continue;
                }
+               rec.mem = victim;
+               rec.nr_scanned[0] = 0;
+               rec.nr_scanned[1] = 0;
+               rec.nr_rotated[0] = 0;
+               rec.nr_rotated[1] = 0;
+               rec.nr_freed[0] = 0;
+               rec.nr_freed[1] = 0;
+               rec.elapsed = 0;
                /* we use swappiness of local cgroup */
                if (check_soft) {
                        ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-                               noswap, get_swappiness(victim), zone,
-                               &nr_scanned);
-                       *total_scanned += nr_scanned;
+                               noswap, zone, &rec, &scanned);
+                       *total_scanned += scanned;
                } else
                        ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
-                                               noswap, get_swappiness(victim));
+                                               noswap, &rec);
+               mem_cgroup_record_scanstat(&rec);
                css_put(&victim->css);
                /*
                 * At shrinking usage, we can't check we should stop here or
@@ -1730,38 +1838,84 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
+ * Has to be called with memcg_oom_lock
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 {
-       int x, lock_count = 0;
-       struct mem_cgroup *iter;
+       int lock_count = -1;
+       struct mem_cgroup *iter, *failed = NULL;
+       bool cond = true;
 
-       for_each_mem_cgroup_tree(iter, mem) {
-               x = atomic_inc_return(&iter->oom_lock);
-               lock_count = max(x, lock_count);
+       for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+               bool locked = iter->oom_lock;
+
+               iter->oom_lock = true;
+               if (lock_count == -1)
+                       lock_count = iter->oom_lock;
+               else if (lock_count != locked) {
+                       /*
+                        * this subtree of our hierarchy is already locked
+                        * so we cannot give a lock.
+                        */
+                       lock_count = 0;
+                       failed = iter;
+                       cond = false;
+               }
        }
 
-       if (lock_count == 1)
-               return true;
-       return false;
+       if (!failed)
+               goto done;
+
+       /*
+        * OK, we failed to lock the whole subtree so we have to clean up
+        * what we set up to the failing subtree
+        */
+       cond = true;
+       for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+               if (iter == failed) {
+                       cond = false;
+                       continue;
+               }
+               iter->oom_lock = false;
+       }
+done:
+       return lock_count;
 }
 
+/*
+ * Has to be called with memcg_oom_lock
+ */
 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
        struct mem_cgroup *iter;
 
+       for_each_mem_cgroup_tree(iter, mem)
+               iter->oom_lock = false;
+       return 0;
+}
+
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
+{
+       struct mem_cgroup *iter;
+
+       for_each_mem_cgroup_tree(iter, mem)
+               atomic_inc(&iter->under_oom);
+}
+
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
+{
+       struct mem_cgroup *iter;
+
        /*
         * When a new child is created while the hierarchy is under oom,
         * mem_cgroup_oom_lock() may not be called. We have to use
         * atomic_add_unless() here.
         */
        for_each_mem_cgroup_tree(iter, mem)
-               atomic_add_unless(&iter->oom_lock, -1, 0);
-       return 0;
+               atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
-
-static DEFINE_MUTEX(memcg_oom_mutex);
+static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
@@ -1772,25 +1926,20 @@ struct oom_wait_info {
 static int memcg_oom_wake_function(wait_queue_t *wait,
        unsigned mode, int sync, void *arg)
 {
-       struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
+       struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
+                         *oom_wait_mem;
        struct oom_wait_info *oom_wait_info;
 
        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+       oom_wait_mem = oom_wait_info->mem;
 
-       if (oom_wait_info->mem == wake_mem)
-               goto wakeup;
-       /* if no hierarchy, no match */
-       if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
-               return 0;
        /*
         * Both of oom_wait_info->mem and wake_mem are stable under us.
         * Then we can use css_is_ancestor without taking care of RCU.
         */
-       if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
-           !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
+       if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
+                       && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
                return 0;
-
-wakeup:
        return autoremove_wake_function(wait, mode, sync, arg);
 }
 
@@ -1802,7 +1951,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 
 static void memcg_oom_recover(struct mem_cgroup *mem)
 {
-       if (mem && atomic_read(&mem->oom_lock))
+       if (mem && atomic_read(&mem->under_oom))
                memcg_wakeup_oom(mem);
 }
 
@@ -1820,8 +1969,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
        owait.wait.private = current;
        INIT_LIST_HEAD(&owait.wait.task_list);
        need_to_kill = true;
+       mem_cgroup_mark_under_oom(mem);
+
        /* At first, try to OOM lock hierarchy under mem.*/
-       mutex_lock(&memcg_oom_mutex);
+       spin_lock(&memcg_oom_lock);
        locked = mem_cgroup_oom_lock(mem);
        /*
         * Even if signal_pending(), we can't quit charge() loop without
@@ -1833,7 +1984,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
                need_to_kill = false;
        if (locked)
                mem_cgroup_oom_notify(mem);
-       mutex_unlock(&memcg_oom_mutex);
+       spin_unlock(&memcg_oom_lock);
 
        if (need_to_kill) {
                finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1842,10 +1993,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
                schedule();
                finish_wait(&memcg_oom_waitq, &owait.wait);
        }
-       mutex_lock(&memcg_oom_mutex);
-       mem_cgroup_oom_unlock(mem);
+       spin_lock(&memcg_oom_lock);
+       if (locked)
+               mem_cgroup_oom_unlock(mem);
        memcg_wakeup_oom(mem);
-       mutex_unlock(&memcg_oom_mutex);
+       spin_unlock(&memcg_oom_lock);
+
+       mem_cgroup_unmark_under_oom(mem);
 
        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
                return false;
@@ -1938,7 +2092,6 @@ struct memcg_stock_pcp {
 #define FLUSHING_CACHED_CHARGE (0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -2006,19 +2159,14 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
 }
 
 /*
- * Tries to drain stocked charges in other cpus. This function is asynchronous
- * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back to res_counter later but cannot wait for
- * it.
+ * Drains all per-CPU charge caches for given root_mem resp. subtree
+ * of the hierarchy under it. sync flag says whether we should block
+ * until the work is done.
  */
-static void drain_all_stock_async(struct mem_cgroup *root_mem)
+static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
 {
        int cpu, curcpu;
-       /*
-        * If someone calls draining, avoid adding more kworker runs.
-        */
-       if (!mutex_trylock(&percpu_charge_mutex))
-               return;
+
        /* Notify other cpus that system-wide "drain" is running */
        get_online_cpus();
        /*
@@ -2032,34 +2180,48 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem)
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                struct mem_cgroup *mem;
 
-               if (cpu == curcpu)
-                       continue;
-
                mem = stock->cached;
-               if (!mem)
+               if (!mem || !stock->nr_pages)
                        continue;
-               if (mem != root_mem) {
-                       if (!root_mem->use_hierarchy)
-                               continue;
-                       /* check whether "mem" is under tree of "root_mem" */
-                       if (!css_is_ancestor(&mem->css, &root_mem->css))
-                               continue;
+               if (!mem_cgroup_same_or_subtree(root_mem, mem))
+                       continue;
+               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+                       if (cpu == curcpu)
+                               drain_local_stock(&stock->work);
+                       else
+                               schedule_work_on(cpu, &stock->work);
                }
-               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
-                       schedule_work_on(cpu, &stock->work);
        }
+
+       if (!sync)
+               goto out;
+
+       for_each_online_cpu(cpu) {
+               struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+               if (mem_cgroup_same_or_subtree(root_mem, stock->cached) &&
+                               test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+                       flush_work(&stock->work);
+       }
+out:
        put_online_cpus();
-       mutex_unlock(&percpu_charge_mutex);
-       /* We don't wait for flush_work */
+}
+
+/*
+ * Tries to drain stocked charges in other cpus. This function is asynchronous
+ * and just put a work per cpu for draining localy on each cpu. Caller can
+ * expects some charges will be back to res_counter later but cannot wait for
+ * it.
+ */
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
+{
+       drain_all_stock(root_mem, false);
 }
 
 /* This is a synchronous drain interface. */
-static void drain_all_stock_sync(void)
+static void drain_all_stock_sync(struct mem_cgroup *root_mem)
 {
        /* called when force_empty is called */
-       mutex_lock(&percpu_charge_mutex);
-       schedule_on_each_cpu(drain_local_stock);
-       mutex_unlock(&percpu_charge_mutex);
+       drain_all_stock(root_mem, true);
 }
 
 /*
@@ -3707,7 +3869,7 @@ move_account:
                        goto out;
                /* This is for making all *used* pages to be on LRU. */
                lru_add_drain_all();
-               drain_all_stock_sync();
+               drain_all_stock_sync(mem);
                ret = 0;
                mem_cgroup_start_move(mem);
                for_each_node_state(node, N_HIGH_MEMORY) {
@@ -3746,14 +3908,18 @@ try_to_free:
        /* try to free all pages in this cgroup */
        shrink = 1;
        while (nr_retries && mem->res.usage > 0) {
+               struct memcg_scanrecord rec;
                int progress;
 
                if (signal_pending(current)) {
                        ret = -EINTR;
                        goto out;
                }
+               rec.context = SCAN_BY_SHRINK;
+               rec.mem = mem;
+               rec.root = mem;
                progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
-                                               false, get_swappiness(mem));
+                                               false, &rec);
                if (!progress) {
                        nr_retries--;
                        /* maybe some writeback is necessary */
@@ -4079,15 +4245,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
        s->stat[MCS_PGMAJFAULT] += val;
 
        /* per zone stat */
-       val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
        s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
-       val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
        s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
-       val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
        s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
-       val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
        s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
-       val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
+       val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
        s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
 }
 
@@ -4109,35 +4275,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
        struct cgroup *cont = m->private;
        struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
 
-       total_nr = mem_cgroup_nr_lru_pages(mem_cont);
+       total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
        seq_printf(m, "total=%lu", total_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
+               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
+       file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
        seq_printf(m, "file=%lu", file_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
+               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+                               LRU_ALL_FILE);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
+       anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
        seq_printf(m, "anon=%lu", anon_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
+               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+                               LRU_ALL_ANON);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
+       unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
        seq_printf(m, "unevictable=%lu", unevictable_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
-                                                                       nid);
+               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+                               BIT(LRU_UNEVICTABLE));
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
@@ -4215,7 +4383,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 
-       return get_swappiness(memcg);
+       return mem_cgroup_swappiness(memcg);
 }
 
 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
@@ -4505,15 +4673,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
        if (!event)
                return -ENOMEM;
 
-       mutex_lock(&memcg_oom_mutex);
+       spin_lock(&memcg_oom_lock);
 
        event->eventfd = eventfd;
        list_add(&event->list, &memcg->oom_notify);
 
        /* already in OOM ? */
-       if (atomic_read(&memcg->oom_lock))
+       if (atomic_read(&memcg->under_oom))
                eventfd_signal(eventfd, 1);
-       mutex_unlock(&memcg_oom_mutex);
+       spin_unlock(&memcg_oom_lock);
 
        return 0;
 }
@@ -4527,7 +4695,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 
        BUG_ON(type != _OOM_TYPE);
 
-       mutex_lock(&memcg_oom_mutex);
+       spin_lock(&memcg_oom_lock);
 
        list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
                if (ev->eventfd == eventfd) {
@@ -4536,7 +4704,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
                }
        }
 
-       mutex_unlock(&memcg_oom_mutex);
+       spin_unlock(&memcg_oom_lock);
 }
 
 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -4546,7 +4714,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 
        cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
 
-       if (atomic_read(&mem->oom_lock))
+       if (atomic_read(&mem->under_oom))
                cb->fill(cb, "under_oom", 1);
        else
                cb->fill(cb, "under_oom", 0);
@@ -4595,6 +4763,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 }
 #endif /* CONFIG_NUMA */
 
+static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
+                               struct cftype *cft,
+                               struct cgroup_map_cb *cb)
+{
+       struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+       char string[64];
+       int i;
+
+       for (i = 0; i < NR_SCANSTATS; i++) {
+               strcpy(string, scanstat_string[i]);
+               strcat(string, SCANSTAT_WORD_LIMIT);
+               cb->fill(cb, string,  mem->scanstat.stats[SCAN_BY_LIMIT][i]);
+       }
+
+       for (i = 0; i < NR_SCANSTATS; i++) {
+               strcpy(string, scanstat_string[i]);
+               strcat(string, SCANSTAT_WORD_SYSTEM);
+               cb->fill(cb, string,  mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
+       }
+
+       for (i = 0; i < NR_SCANSTATS; i++) {
+               strcpy(string, scanstat_string[i]);
+               strcat(string, SCANSTAT_WORD_LIMIT);
+               strcat(string, SCANSTAT_WORD_HIERARCHY);
+               cb->fill(cb, string,  mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
+       }
+       for (i = 0; i < NR_SCANSTATS; i++) {
+               strcpy(string, scanstat_string[i]);
+               strcat(string, SCANSTAT_WORD_SYSTEM);
+               strcat(string, SCANSTAT_WORD_HIERARCHY);
+               cb->fill(cb, string,  mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
+       }
+       return 0;
+}
+
+static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
+                               unsigned int event)
+{
+       struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+       spin_lock(&mem->scanstat.lock);
+       memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
+       memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
+       spin_unlock(&mem->scanstat.lock);
+       return 0;
+}
+
+
 static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage_in_bytes",
@@ -4665,6 +4881,11 @@ static struct cftype mem_cgroup_files[] = {
                .mode = S_IRUGO,
        },
 #endif
+       {
+               .name = "vmscan_stat",
+               .read_map = mem_cgroup_vmscan_stat_read,
+               .trigger = mem_cgroup_reset_vmscan_stat,
+       },
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4924,10 +5145,11 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        INIT_LIST_HEAD(&mem->oom_notify);
 
        if (parent)
-               mem->swappiness = get_swappiness(parent);
+               mem->swappiness = mem_cgroup_swappiness(parent);
        atomic_set(&mem->refcnt, 1);
        mem->move_charge_at_immigrate = 0;
        mutex_init(&mem->thresholds_lock);
+       spin_lock_init(&mem->scanstat.lock);
        return &mem->css;
 free_out:
        __mem_cgroup_free(mem);