memcg: fix refcnt going negative
[linux-2.6.git] / mm / memcontrol.c
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  */
19
20 #include <linux/res_counter.h>
21 #include <linux/memcontrol.h>
22 #include <linux/cgroup.h>
23 #include <linux/mm.h>
24 #include <linux/pagemap.h>
25 #include <linux/smp.h>
26 #include <linux/page-flags.h>
27 #include <linux/backing-dev.h>
28 #include <linux/bit_spinlock.h>
29 #include <linux/rcupdate.h>
30 #include <linux/limits.h>
31 #include <linux/mutex.h>
32 #include <linux/rbtree.h>
33 #include <linux/slab.h>
34 #include <linux/swap.h>
35 #include <linux/spinlock.h>
36 #include <linux/fs.h>
37 #include <linux/seq_file.h>
38 #include <linux/vmalloc.h>
39 #include <linux/mm_inline.h>
40 #include <linux/page_cgroup.h>
41 #include "internal.h"
42
43 #include <asm/uaccess.h>
44
45 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
46 #define MEM_CGROUP_RECLAIM_RETRIES      5
47 struct mem_cgroup *root_mem_cgroup __read_mostly; /* compared against in mem_cgroup_is_root() */
48
49 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
50 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
51 int do_swap_account __read_mostly;
52 static int really_do_swap_account __initdata = 1; /* remembers the boot option */
53 #else
54 #define do_swap_account         (0) /* swap accounting not configured in */
55 #endif
56
57 static DEFINE_MUTEX(memcg_tasklist);    /* can be held under cgroup_mutex */
58 #define SOFTLIMIT_EVENTS_THRESH (1000) /* per-CPU event count tested by mem_cgroup_soft_limit_check() */
59
60 /*
61  * Statistics for memory cgroup: indexes into mem_cgroup_stat_cpu.count[].
62  */
63 enum mem_cgroup_stat_index {
64         /*
65          * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
66          */
67         MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
68         MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
69         MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
70         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
71         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
72         MEM_CGROUP_STAT_EVENTS, /* pagein + pageout events; internal, zeroed by the soft limit check */
73         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
74
75         MEM_CGROUP_STAT_NSTATS, /* number of counters; sizes count[] */
76 };
77
78 struct mem_cgroup_stat_cpu { /* one CPU's counters, indexed by enum mem_cgroup_stat_index */
79         s64 count[MEM_CGROUP_STAT_NSTATS];
80 } ____cacheline_aligned_in_smp;
81
82 struct mem_cgroup_stat { /* container of the per-CPU counter blocks */
83         struct mem_cgroup_stat_cpu cpustat[0]; /* zero-length trailing array, indexed by CPU id */
84 };
85
86 static inline void
87 __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88                                 enum mem_cgroup_stat_index idx)
89 {
90         stat->count[idx] = 0;
91 }
92
93 static inline s64
94 __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95                                 enum mem_cgroup_stat_index idx)
96 {
97         return stat->count[idx];
98 }
99
100 /*
101  * For accounting under irq disable, no need for increment preempt count.
102  */
103 static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
104                 enum mem_cgroup_stat_index idx, int val)
105 {
106         stat->count[idx] += val;
107 }
108
109 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
110                 enum mem_cgroup_stat_index idx)
111 {
112         int cpu;
113         s64 ret = 0;
114         for_each_possible_cpu(cpu)
115                 ret += stat->cpustat[cpu].count[idx];
116         return ret;
117 }
118
119 static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
120 {
121         s64 ret;
122
123         ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
124         ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
125         return ret;
126 }
127
128 /*
129  * per-zone information in memory controller.
130  */
131 struct mem_cgroup_per_zone {
132         /*
133          * Per-cgroup LRU lists and page counts, one per LRU type. No lock lives here; the swapcache helpers in this file take zone->lru_lock.
134          */
135         struct list_head        lists[NR_LRU_LISTS];
136         unsigned long           count[NR_LRU_LISTS];
137
138         struct zone_reclaim_stat reclaim_stat;
139         struct rb_node          tree_node;      /* RB tree node */
140         unsigned long long      usage_in_excess;/* Set to the value by which */
141                                                 /* the soft limit is exceeded*/
142         bool                    on_tree;        /* true while tree_node is linked in a soft limit tree */
143         struct mem_cgroup       *mem;           /* Back pointer, we cannot */
144                                                 /* use container_of        */
145 };
146 /* Accessor for the per-LRU page counter count[idx] of a mem_cgroup_per_zone */
147 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
148
149 struct mem_cgroup_per_node { /* per-zone data of one node, indexed by zone id */
150         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
151 };
152
153 struct mem_cgroup_lru_info { /* per-node pointers, indexed by node id */
154         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
155 };
156
157 /*
158  * Cgroups above their soft limits are maintained in a RB-Tree, independent of
159  * their hierarchy representation. There is one tree per (node, zone) pair.
160  */
161
162 struct mem_cgroup_tree_per_zone {
163         struct rb_root rb_root; /* nodes are mem_cgroup_per_zone.tree_node, keyed by usage_in_excess */
164         spinlock_t lock; /* held around rb_root insert/erase by the helpers in this file */
165 };
166
167 struct mem_cgroup_tree_per_node {
168         struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
169 };
170
171 struct mem_cgroup_tree {
172         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
173 };
174
175 static struct mem_cgroup_tree soft_limit_tree __read_mostly; /* the single global soft limit tree set */
176
177 /*
178  * The memory controller data structure. The memory controller controls both
179  * page cache and RSS per cgroup. We would eventually like to provide
180  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
181  * to help the administrator determine what knobs to tune.
182  *
183  * TODO: Add a water mark for the memory controller. Reclaim will begin when
184  * we hit the water mark. Maybe even add a low water mark, such that
185  * no reclaim occurs from a cgroup at its low water mark; this is
186  * a feature that will be implemented much later in the future.
187  */
188 struct mem_cgroup {
189         struct cgroup_subsys_state css;
190         /*
191          * the counter to account for memory usage
192          */
193         struct res_counter res;
194         /*
195          * the counter to account for mem+swap usage.
196          */
197         struct res_counter memsw;
198         /*
199          * Per cgroup active and inactive list, similar to the
200          * per zone LRU lists.
201          */
202         struct mem_cgroup_lru_info info;
203
204         /*
205          * Protects reclaim-related members; prev_priority is read and written under it.
206          */
207         spinlock_t reclaim_param_lock;
208
209         int     prev_priority;  /* for recording reclaim priority */
210
211         /*
212          * While reclaiming in a hierarchy, we cache the last child we
213          * reclaimed from.
214          */
215         int last_scanned_child;
216         /*
217          * Should the accounting and control be hierarchical, per subtree?
218          */
219         bool use_hierarchy;
220         unsigned long   last_oom_jiffies;
221         atomic_t        refcnt;
222
223         unsigned int    swappiness;
224
225         /* set when res.limit == memsw.limit */
226         bool            memsw_is_minimum;
227
228         /*
229          * statistics. This must be placed at the end of memcg, because
230          * struct mem_cgroup_stat ends in a zero-length per-CPU array.
231          */
232         struct mem_cgroup_stat stat;
233 };
233
234 /*
235  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
236  * limit reclaim to prevent infinite loops, if they ever occur.
237  */
238 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            (100)
239 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
240
241 enum charge_type {
242         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
243         MEM_CGROUP_CHARGE_TYPE_MAPPED,
244         MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
245         MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
246         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
247         MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
248         NR_CHARGE_TYPE,                 /* number of charge types; keep last */
249 };
250
251 /* Flag masks built from the PCG_* bit numbers; local to this file, for readability. */
252 #define PCGF_CACHE      (1UL << PCG_CACHE)
253 #define PCGF_USED       (1UL << PCG_USED)
254 #define PCGF_LOCK       (1UL << PCG_LOCK)
255 /* Not used, but added here for completeness */
256 #define PCGF_ACCT       (1UL << PCG_ACCT)
257
258 /* For encoding the cft->private value of a file: type in the upper 16 bits, attribute in the lower 16. */
259 #define _MEM                    (0)
260 #define _MEMSWAP                (1)
261 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
262 #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
263 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
264
265 /*
266  * Reclaim flags for mem_cgroup_hierarchical_reclaim: each *_BIT is a bit
267  * number and the matching name without _BIT is its mask.
268  */
268 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT   0x0
269 #define MEM_CGROUP_RECLAIM_NOSWAP       (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
270 #define MEM_CGROUP_RECLAIM_SHRINK_BIT   0x1
271 #define MEM_CGROUP_RECLAIM_SHRINK       (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
272 #define MEM_CGROUP_RECLAIM_SOFT_BIT     0x2
273 #define MEM_CGROUP_RECLAIM_SOFT         (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
274
275 static void mem_cgroup_get(struct mem_cgroup *mem);
276 static void mem_cgroup_put(struct mem_cgroup *mem);
277 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
278
279 static struct mem_cgroup_per_zone *
280 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
281 {
282         return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283 }
284
285 static struct mem_cgroup_per_zone *
286 page_cgroup_zoneinfo(struct page_cgroup *pc)
287 {
288         struct mem_cgroup *mem = pc->mem_cgroup;
289         int nid = page_cgroup_nid(pc);
290         int zid = page_cgroup_zid(pc);
291
292         if (!mem)
293                 return NULL;
294
295         return mem_cgroup_zoneinfo(mem, nid, zid);
296 }
297
298 static struct mem_cgroup_tree_per_zone *
299 soft_limit_tree_node_zone(int nid, int zid)
300 {
301         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
302 }
303
304 static struct mem_cgroup_tree_per_zone *
305 soft_limit_tree_from_page(struct page *page)
306 {
307         int nid = page_to_nid(page);
308         int zid = page_zonenum(page);
309
310         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
311 }
312
313 static void
314 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
315                                 struct mem_cgroup_per_zone *mz,
316                                 struct mem_cgroup_tree_per_zone *mctz)
317 {
318         struct rb_node **p = &mctz->rb_root.rb_node;
319         struct rb_node *parent = NULL;
320         struct mem_cgroup_per_zone *mz_node;
321
322         if (mz->on_tree)
323                 return;
324
325         mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
326         while (*p) {
327                 parent = *p;
328                 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
329                                         tree_node);
330                 if (mz->usage_in_excess < mz_node->usage_in_excess)
331                         p = &(*p)->rb_left;
332                 /*
333                  * We can't avoid mem cgroups that are over their soft
334                  * limit by the same amount
335                  */
336                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
337                         p = &(*p)->rb_right;
338         }
339         rb_link_node(&mz->tree_node, parent, p);
340         rb_insert_color(&mz->tree_node, &mctz->rb_root);
341         mz->on_tree = true;
342 }
343
344 static void
345 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
346                                 struct mem_cgroup_per_zone *mz,
347                                 struct mem_cgroup_tree_per_zone *mctz)
348 {
349         if (!mz->on_tree)
350                 return;
351         rb_erase(&mz->tree_node, &mctz->rb_root);
352         mz->on_tree = false;
353 }
354
355 static void
356 mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
357                                 struct mem_cgroup_per_zone *mz,
358                                 struct mem_cgroup_tree_per_zone *mctz)
359 {
360         spin_lock(&mctz->lock);
361         __mem_cgroup_insert_exceeded(mem, mz, mctz);
362         spin_unlock(&mctz->lock);
363 }
364
365 static void
366 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
367                                 struct mem_cgroup_per_zone *mz,
368                                 struct mem_cgroup_tree_per_zone *mctz)
369 {
370         spin_lock(&mctz->lock);
371         __mem_cgroup_remove_exceeded(mem, mz, mctz);
372         spin_unlock(&mctz->lock);
373 }
374
375 static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
376 {
377         bool ret = false;
378         int cpu;
379         s64 val;
380         struct mem_cgroup_stat_cpu *cpustat;
381
382         cpu = get_cpu();
383         cpustat = &mem->stat.cpustat[cpu];
384         val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
385         if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
386                 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
387                 ret = true;
388         }
389         put_cpu();
390         return ret;
391 }
392
393 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
394 {
395         unsigned long long prev_usage_in_excess, new_usage_in_excess;
396         bool updated_tree = false;
397         struct mem_cgroup_per_zone *mz;
398         struct mem_cgroup_tree_per_zone *mctz;
399
400         mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
401         mctz = soft_limit_tree_from_page(page);
402
403         /*
404          * We do updates in lazy mode, mem's are removed
405          * lazily from the per-zone, per-node rb tree
406          */
407         prev_usage_in_excess = mz->usage_in_excess;
408
409         new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
410         if (prev_usage_in_excess) {
411                 mem_cgroup_remove_exceeded(mem, mz, mctz);
412                 updated_tree = true;
413         }
414         if (!new_usage_in_excess)
415                 goto done;
416         mem_cgroup_insert_exceeded(mem, mz, mctz);
417
418 done:
419         if (updated_tree) {
420                 spin_lock(&mctz->lock);
421                 mz->usage_in_excess = new_usage_in_excess;
422                 spin_unlock(&mctz->lock);
423         }
424 }
425
426 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
427 {
428         int node, zone;
429         struct mem_cgroup_per_zone *mz;
430         struct mem_cgroup_tree_per_zone *mctz;
431
432         for_each_node_state(node, N_POSSIBLE) {
433                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
434                         mz = mem_cgroup_zoneinfo(mem, node, zone);
435                         mctz = soft_limit_tree_node_zone(node, zone);
436                         mem_cgroup_remove_exceeded(mem, mz, mctz);
437                 }
438         }
439 }
440
441 static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
442 {
443         return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
444 }
445
446 static struct mem_cgroup_per_zone *
447 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
448 {
449         struct rb_node *rightmost = NULL;
450         struct mem_cgroup_per_zone *mz;
451
452 retry:
453         mz = NULL;
454         rightmost = rb_last(&mctz->rb_root);
455         if (!rightmost)
456                 goto done;              /* Nothing to reclaim from */
457
458         mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
459         /*
460          * Remove the node now but someone else can add it back,
461          * we will to add it back at the end of reclaim to its correct
462          * position in the tree.
463          */
464         __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
465         if (!res_counter_soft_limit_excess(&mz->mem->res) ||
466                 !css_tryget(&mz->mem->css))
467                 goto retry;
468 done:
469         return mz;
470 }
471
472 static struct mem_cgroup_per_zone *
473 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
474 {
475         struct mem_cgroup_per_zone *mz;
476
477         spin_lock(&mctz->lock);
478         mz = __mem_cgroup_largest_soft_limit_node(mctz);
479         spin_unlock(&mctz->lock);
480         return mz;
481 }
482
483 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
484                                          bool charge)
485 {
486         int val = (charge) ? 1 : -1;
487         struct mem_cgroup_stat *stat = &mem->stat;
488         struct mem_cgroup_stat_cpu *cpustat;
489         int cpu = get_cpu();
490
491         cpustat = &stat->cpustat[cpu];
492         __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
493         put_cpu();
494 }
495
496 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
497                                          struct page_cgroup *pc,
498                                          bool charge)
499 {
500         int val = (charge) ? 1 : -1;
501         struct mem_cgroup_stat *stat = &mem->stat;
502         struct mem_cgroup_stat_cpu *cpustat;
503         int cpu = get_cpu();
504
505         cpustat = &stat->cpustat[cpu];
506         if (PageCgroupCache(pc))
507                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
508         else
509                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
510
511         if (charge)
512                 __mem_cgroup_stat_add_safe(cpustat,
513                                 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
514         else
515                 __mem_cgroup_stat_add_safe(cpustat,
516                                 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
517         __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
518         put_cpu();
519 }
520
521 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
522                                         enum lru_list idx)
523 {
524         int nid, zid;
525         struct mem_cgroup_per_zone *mz;
526         u64 total = 0;
527
528         for_each_online_node(nid)
529                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
530                         mz = mem_cgroup_zoneinfo(mem, nid, zid);
531                         total += MEM_CGROUP_ZSTAT(mz, idx);
532                 }
533         return total;
534 }
535
536 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
537 {
538         return container_of(cgroup_subsys_state(cont,
539                                 mem_cgroup_subsys_id), struct mem_cgroup,
540                                 css);
541 }
542
543 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
544 {
545         /*
546          * mm_update_next_owner() may clear mm->owner to NULL
547          * if it races with swapoff, page migration, etc.
548          * So this can be called with p == NULL.
549          */
550         if (unlikely(!p))
551                 return NULL;
552
553         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
554                                 struct mem_cgroup, css);
555 }
556
557 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
558 {
559         struct mem_cgroup *mem = NULL;
560
561         if (!mm)
562                 return NULL;
563         /*
564          * Because we have no locks, mm->owner's may be being moved to other
565          * cgroup. We use css_tryget() here even if this looks
566          * pessimistic (rather than adding locks here).
567          */
568         rcu_read_lock();
569         do {
570                 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
571                 if (unlikely(!mem))
572                         break;
573         } while (!css_tryget(&mem->css));
574         rcu_read_unlock();
575         return mem;
576 }
577
578 /*
579  * Call callback function against all cgroup under hierarchy tree.
580  */
581 static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
582                           int (*func)(struct mem_cgroup *, void *))
583 {
584         int found, ret, nextid;
585         struct cgroup_subsys_state *css;
586         struct mem_cgroup *mem;
587
588         if (!root->use_hierarchy)
589                 return (*func)(root, data);
590
591         nextid = 1;
592         do {
593                 ret = 0;
594                 mem = NULL;
595
596                 rcu_read_lock();
597                 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
598                                    &found);
599                 if (css && css_tryget(css))
600                         mem = container_of(css, struct mem_cgroup, css);
601                 rcu_read_unlock();
602
603                 if (mem) {
604                         ret = (*func)(mem, data);
605                         css_put(&mem->css);
606                 }
607                 nextid = found + 1;
608         } while (!ret && css);
609
610         return ret;
611 }
612
613 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
614 {
615         return (mem == root_mem_cgroup);
616 }
617
618 /*
619  * Following LRU functions are allowed to be used without PCG_LOCK.
620  * Operations are called by routine of global LRU independently from memcg.
621  * What we have to take care of here is the validity of pc->mem_cgroup.
622  *
623  * Changes to pc->mem_cgroup happens when
624  * 1. charge
625  * 2. moving account
626  * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
627  * It is added to LRU before charge.
628  * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
629  * When moving account, the page is not on LRU. It's isolated.
630  */
631
632 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
633 {
634         struct page_cgroup *pc;
635         struct mem_cgroup_per_zone *mz;
636
637         if (mem_cgroup_disabled())
638                 return;
639         pc = lookup_page_cgroup(page);
640         /* can happen while we handle swapcache. */
641         if (!TestClearPageCgroupAcctLRU(pc))
642                 return;
643         VM_BUG_ON(!pc->mem_cgroup);
644         /*
645          * We don't check PCG_USED bit. It's cleared when the "page" is finally
646          * removed from global LRU.
647          */
648         mz = page_cgroup_zoneinfo(pc);
649         MEM_CGROUP_ZSTAT(mz, lru) -= 1;
650         if (mem_cgroup_is_root(pc->mem_cgroup))
651                 return;
652         VM_BUG_ON(list_empty(&pc->lru));
653         list_del_init(&pc->lru);
654         return;
655 }
656
657 void mem_cgroup_del_lru(struct page *page)
658 {
659         mem_cgroup_del_lru_list(page, page_lru(page));
660 }
661
662 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
663 {
664         struct mem_cgroup_per_zone *mz;
665         struct page_cgroup *pc;
666
667         if (mem_cgroup_disabled())
668                 return;
669
670         pc = lookup_page_cgroup(page);
671         /*
672          * Used bit is set without atomic ops but after smp_wmb().
673          * For making pc->mem_cgroup visible, insert smp_rmb() here.
674          */
675         smp_rmb();
676         /* unused or root page is not rotated. */
677         if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
678                 return;
679         mz = page_cgroup_zoneinfo(pc);
680         list_move(&pc->lru, &mz->lists[lru]);
681 }
682
683 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
684 {
685         struct page_cgroup *pc;
686         struct mem_cgroup_per_zone *mz;
687
688         if (mem_cgroup_disabled())
689                 return;
690         pc = lookup_page_cgroup(page);
691         VM_BUG_ON(PageCgroupAcctLRU(pc));
692         /*
693          * Used bit is set without atomic ops but after smp_wmb().
694          * For making pc->mem_cgroup visible, insert smp_rmb() here.
695          */
696         smp_rmb();
697         if (!PageCgroupUsed(pc))
698                 return;
699
700         mz = page_cgroup_zoneinfo(pc);
701         MEM_CGROUP_ZSTAT(mz, lru) += 1;
702         SetPageCgroupAcctLRU(pc);
703         if (mem_cgroup_is_root(pc->mem_cgroup))
704                 return;
705         list_add(&pc->lru, &mz->lists[lru]);
706 }
707
708 /*
709  * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
710  * lru because the page may.be reused after it's fully uncharged (because of
711  * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
712  * it again. This function is only used to charge SwapCache. It's done under
713  * lock_page and expected that zone->lru_lock is never held.
714  */
715 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
716 {
717         unsigned long flags;
718         struct zone *zone = page_zone(page);
719         struct page_cgroup *pc = lookup_page_cgroup(page);
720
721         spin_lock_irqsave(&zone->lru_lock, flags);
722         /*
723          * Forget old LRU when this page_cgroup is *not* used. This Used bit
724          * is guarded by lock_page() because the page is SwapCache.
725          */
726         if (!PageCgroupUsed(pc))
727                 mem_cgroup_del_lru_list(page, page_lru(page));
728         spin_unlock_irqrestore(&zone->lru_lock, flags);
729 }
730
731 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
732 {
733         unsigned long flags;
734         struct zone *zone = page_zone(page);
735         struct page_cgroup *pc = lookup_page_cgroup(page);
736
737         spin_lock_irqsave(&zone->lru_lock, flags);
738         /* link when the page is linked to LRU but page_cgroup isn't */
739         if (PageLRU(page) && !PageCgroupAcctLRU(pc))
740                 mem_cgroup_add_lru_list(page, page_lru(page));
741         spin_unlock_irqrestore(&zone->lru_lock, flags);
742 }
743
744
745 void mem_cgroup_move_lists(struct page *page,
746                            enum lru_list from, enum lru_list to)
747 {
748         if (mem_cgroup_disabled())
749                 return;
750         mem_cgroup_del_lru_list(page, from);
751         mem_cgroup_add_lru_list(page, to);
752 }
753
754 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
755 {
756         int ret;
757         struct mem_cgroup *curr = NULL;
758
759         task_lock(task);
760         rcu_read_lock();
761         curr = try_get_mem_cgroup_from_mm(task->mm);
762         rcu_read_unlock();
763         task_unlock(task);
764         if (!curr)
765                 return 0;
766         if (curr->use_hierarchy)
767                 ret = css_is_ancestor(&curr->css, &mem->css);
768         else
769                 ret = (curr == mem);
770         css_put(&curr->css);
771         return ret;
772 }
773
774 /*
775  * prev_priority control...this will be used in memory reclaim path.
776  */
777 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
778 {
779         int prev_priority;
780
781         spin_lock(&mem->reclaim_param_lock);
782         prev_priority = mem->prev_priority;
783         spin_unlock(&mem->reclaim_param_lock);
784
785         return prev_priority;
786 }
787
788 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
789 {
790         spin_lock(&mem->reclaim_param_lock);
791         if (priority < mem->prev_priority)
792                 mem->prev_priority = priority;
793         spin_unlock(&mem->reclaim_param_lock);
794 }
795
796 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
797 {
798         spin_lock(&mem->reclaim_param_lock);
799         mem->prev_priority = priority;
800         spin_unlock(&mem->reclaim_param_lock);
801 }
802
803 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
804 {
805         unsigned long active;
806         unsigned long inactive;
807         unsigned long gb;
808         unsigned long inactive_ratio;
809
810         inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
811         active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
812
813         gb = (inactive + active) >> (30 - PAGE_SHIFT);
814         if (gb)
815                 inactive_ratio = int_sqrt(10 * gb);
816         else
817                 inactive_ratio = 1;
818
819         if (present_pages) {
820                 present_pages[0] = inactive;
821                 present_pages[1] = active;
822         }
823
824         return inactive_ratio;
825 }
826
/*
 * Return 1 when @memcg's inactive anon list is too small relative to its
 * active anon list, i.e. inactive * ratio < active with the ratio coming
 * from calc_inactive_ratio(); return 0 otherwise.
 */
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long counts[2];
	unsigned long ratio;
	unsigned long nr_inactive, nr_active;

	ratio = calc_inactive_ratio(memcg, counts);
	nr_inactive = counts[0];
	nr_active = counts[1];

	return (nr_inactive * ratio < nr_active) ? 1 : 0;
}
844
845 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
846 {
847         unsigned long active;
848         unsigned long inactive;
849
850         inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
851         active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
852
853         return (active > inactive);
854 }
855
856 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
857                                        struct zone *zone,
858                                        enum lru_list lru)
859 {
860         int nid = zone->zone_pgdat->node_id;
861         int zid = zone_idx(zone);
862         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
863
864         return MEM_CGROUP_ZSTAT(mz, lru);
865 }
866
867 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
868                                                       struct zone *zone)
869 {
870         int nid = zone->zone_pgdat->node_id;
871         int zid = zone_idx(zone);
872         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
873
874         return &mz->reclaim_stat;
875 }
876
877 struct zone_reclaim_stat *
878 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
879 {
880         struct page_cgroup *pc;
881         struct mem_cgroup_per_zone *mz;
882
883         if (mem_cgroup_disabled())
884                 return NULL;
885
886         pc = lookup_page_cgroup(page);
887         /*
888          * Used bit is set without atomic ops but after smp_wmb().
889          * For making pc->mem_cgroup visible, insert smp_rmb() here.
890          */
891         smp_rmb();
892         if (!PageCgroupUsed(pc))
893                 return NULL;
894
895         mz = page_cgroup_zoneinfo(pc);
896         if (!mz)
897                 return NULL;
898
899         return &mz->reclaim_stat;
900 }
901
902 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
903                                         struct list_head *dst,
904                                         unsigned long *scanned, int order,
905                                         int mode, struct zone *z,
906                                         struct mem_cgroup *mem_cont,
907                                         int active, int file)
908 {
909         unsigned long nr_taken = 0;
910         struct page *page;
911         unsigned long scan;
912         LIST_HEAD(pc_list);
913         struct list_head *src;
914         struct page_cgroup *pc, *tmp;
915         int nid = z->zone_pgdat->node_id;
916         int zid = zone_idx(z);
917         struct mem_cgroup_per_zone *mz;
918         int lru = LRU_FILE * file + active;
919         int ret;
920
921         BUG_ON(!mem_cont);
922         mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
923         src = &mz->lists[lru];
924
925         scan = 0;
926         list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
927                 if (scan >= nr_to_scan)
928                         break;
929
930                 page = pc->page;
931                 if (unlikely(!PageCgroupUsed(pc)))
932                         continue;
933                 if (unlikely(!PageLRU(page)))
934                         continue;
935
936                 scan++;
937                 ret = __isolate_lru_page(page, mode, file);
938                 switch (ret) {
939                 case 0:
940                         list_move(&page->lru, dst);
941                         mem_cgroup_del_lru(page);
942                         nr_taken++;
943                         break;
944                 case -EBUSY:
945                         /* we don't affect global LRU but rotate in our LRU */
946                         mem_cgroup_rotate_lru_list(page, page_lru(page));
947                         break;
948                 default:
949                         break;
950                 }
951         }
952
953         *scanned = scan;
954         return nr_taken;
955 }
956
/*
 * Map a pointer to a res_counter embedded in a struct mem_cgroup back to the
 * enclosing mem_cgroup. @member names the embedded field (this file uses
 * "res" and "memsw").
 */
#define mem_cgroup_from_res_counter(counter, member)    \
        container_of(counter, struct mem_cgroup, member)
959
960 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
961 {
962         if (do_swap_account) {
963                 if (res_counter_check_under_limit(&mem->res) &&
964                         res_counter_check_under_limit(&mem->memsw))
965                         return true;
966         } else
967                 if (res_counter_check_under_limit(&mem->res))
968                         return true;
969         return false;
970 }
971
972 static unsigned int get_swappiness(struct mem_cgroup *memcg)
973 {
974         struct cgroup *cgrp = memcg->css.cgroup;
975         unsigned int swappiness;
976
977         /* root ? */
978         if (cgrp->parent == NULL)
979                 return vm_swappiness;
980
981         spin_lock(&memcg->reclaim_param_lock);
982         swappiness = memcg->swappiness;
983         spin_unlock(&memcg->reclaim_param_lock);
984
985         return swappiness;
986 }
987
/*
 * Tree-walk callback: bump the int counter that @data points to by one.
 * @mem is not inspected. Always returns 0.
 */
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
        int *counter = data;

        *counter += 1;
        return 0;
}
994
995 /**
996  * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode.
997  * @memcg: The memory cgroup that went over limit
998  * @p: Task that is going to be killed
999  *
1000  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1001  * enabled
1002  */
1003 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1004 {
1005         struct cgroup *task_cgrp;
1006         struct cgroup *mem_cgrp;
1007         /*
1008          * Need a buffer in BSS, can't rely on allocations. The code relies
1009          * on the assumption that OOM is serialized for memory controller.
1010          * If this assumption is broken, revisit this code.
1011          */
1012         static char memcg_name[PATH_MAX];
1013         int ret;
1014
1015         if (!memcg)
1016                 return;
1017
1018
1019         rcu_read_lock();
1020
1021         mem_cgrp = memcg->css.cgroup;
1022         task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1023
1024         ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1025         if (ret < 0) {
1026                 /*
1027                  * Unfortunately, we are unable to convert to a useful name
1028                  * But we'll still print out the usage information
1029                  */
1030                 rcu_read_unlock();
1031                 goto done;
1032         }
1033         rcu_read_unlock();
1034
1035         printk(KERN_INFO "Task in %s killed", memcg_name);
1036
1037         rcu_read_lock();
1038         ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1039         if (ret < 0) {
1040                 rcu_read_unlock();
1041                 goto done;
1042         }
1043         rcu_read_unlock();
1044
1045         /*
1046          * Continues from above, so we don't need an KERN_ level
1047          */
1048         printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1049 done:
1050
1051         printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1052                 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1053                 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1054                 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1055         printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1056                 "failcnt %llu\n",
1057                 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1058                 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1059                 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1060 }
1061
1062 /*
1063  * This function returns the number of memcg under hierarchy tree. Returns
1064  * 1(self count) if no children.
1065  */
1066 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1067 {
1068         int num = 0;
1069         mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1070         return num;
1071 }
1072
1073 /*
1074  * Visit the first child (need not be the first child as per the ordering
1075  * of the cgroup list, since we track last_scanned_child) of @mem and use
1076  * that to reclaim free pages from.
1077  */
1078 static struct mem_cgroup *
1079 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1080 {
1081         struct mem_cgroup *ret = NULL;
1082         struct cgroup_subsys_state *css;
1083         int nextid, found;
1084
1085         if (!root_mem->use_hierarchy) {
1086                 css_get(&root_mem->css);
1087                 ret = root_mem;
1088         }
1089
1090         while (!ret) {
1091                 rcu_read_lock();
1092                 nextid = root_mem->last_scanned_child + 1;
1093                 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1094                                    &found);
1095                 if (css && css_tryget(css))
1096                         ret = container_of(css, struct mem_cgroup, css);
1097
1098                 rcu_read_unlock();
1099                 /* Updates scanning parameter */
1100                 spin_lock(&root_mem->reclaim_param_lock);
1101                 if (!css) {
1102                         /* this means start scan from ID:1 */
1103                         root_mem->last_scanned_child = 0;
1104                 } else
1105                         root_mem->last_scanned_child = found;
1106                 spin_unlock(&root_mem->reclaim_param_lock);
1107         }
1108
1109         return ret;
1110 }
1111
/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * @reclaim_options is a mask of MEM_CGROUP_RECLAIM_NOSWAP, _SHRINK and _SOFT.
 * @zone is only used for the soft-limit (_SOFT) path.
 *
 * If shrink==true, this returns immediately after the first reclaim attempt
 * (with that attempt's result) to avoid freeing too much.
 *
 * Return value otherwise:
 *  - soft-limit mode: the accumulated reclaim total, as soon as root_mem is
 *    under its soft limit or the loop gives up;
 *  - normal mode: 1 + total as soon as root_mem is under its limit (so the
 *    result is non-zero even when total is 0), or total when the loop gives
 *    up.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                                                struct zone *zone,
                                                gfp_t gfp_mask,
                                                unsigned long reclaim_options)
{
        struct mem_cgroup *victim;
        int ret, total = 0;
        int loop = 0;
        bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
        bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
        bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
        unsigned long excess = mem_cgroup_get_excess(root_mem);

        /* If memsw_is_minimum==1, swap-out is of no use. */
        if (root_mem->memsw_is_minimum)
                noswap = true;

        while (1) {
                /* The victim comes back with a css reference held. */
                victim = mem_cgroup_select_victim(root_mem);
                if (victim == root_mem) {
                        loop++;
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
                                 * anything, it might be because there are
                                 * no reclaimable pages under this hierarchy
                                 */
                                if (!check_soft || !total) {
                                        css_put(&victim->css);
                                        break;
                                }
                                /*
                                 * We want to do more targeted reclaim.
                                 * excess >> 2 is chosen so that we neither
                                 * reclaim too much, nor so little that we
                                 * keep coming back to reclaim from this
                                 * cgroup
                                 */
                                if (total >= (excess >> 2) ||
                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
                                        css_put(&victim->css);
                                        break;
                                }
                        }
                }
                if (!mem_cgroup_local_usage(&victim->stat)) {
                        /* this cgroup's local usage == 0 */
                        css_put(&victim->css);
                        continue;
                }
                /* we use swappiness of local cgroup */
                if (check_soft)
                        ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
                                noswap, get_swappiness(victim), zone,
                                zone->zone_pgdat->node_id);
                else
                        ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
                                                noswap, get_swappiness(victim));
                /* Done with the victim: drop the reference from selection. */
                css_put(&victim->css);
                /*
                 * When shrinking usage, we can't tell whether we should stop
                 * here or reclaim more; that depends on the caller.
                 * last_scanned_child is enough to keep fairness under the
                 * tree.
                 */
                if (shrink)
                        return ret;
                total += ret;
                if (check_soft) {
                        if (res_counter_check_under_soft_limit(&root_mem->res))
                                return total;
                } else if (mem_cgroup_check_under_limit(root_mem))
                        return 1 + total;
        }
        return total;
}
1198
1199 bool mem_cgroup_oom_called(struct task_struct *task)
1200 {
1201         bool ret = false;
1202         struct mem_cgroup *mem;
1203         struct mm_struct *mm;
1204
1205         rcu_read_lock();
1206         mm = task->mm;
1207         if (!mm)
1208                 mm = &init_mm;
1209         mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
1210         if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
1211                 ret = true;
1212         rcu_read_unlock();
1213         return ret;
1214 }
1215
/*
 * Tree-walk callback: stamp @mem with the current jiffies as the time of its
 * last OOM. @data is unused. Always returns 0.
 */
static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
{
        mem->last_oom_jiffies = jiffies;
        return 0;
}
1221
/*
 * Record the current time as the last OOM time by running
 * record_last_oom_cb() through mem_cgroup_walk_tree() starting at @mem.
 */
static void record_last_oom(struct mem_cgroup *mem)
{
        mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
}
1226
1227 /*
1228  * Currently used to update mapped file statistics, but the routine can be
1229  * generalized to update other statistics as well.
1230  */
1231 void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
1232 {
1233         struct mem_cgroup *mem;
1234         struct mem_cgroup_stat *stat;
1235         struct mem_cgroup_stat_cpu *cpustat;
1236         int cpu;
1237         struct page_cgroup *pc;
1238
1239         if (!page_is_file_cache(page))
1240                 return;
1241
1242         pc = lookup_page_cgroup(page);
1243         if (unlikely(!pc))
1244                 return;
1245
1246         lock_page_cgroup(pc);
1247         mem = pc->mem_cgroup;
1248         if (!mem)
1249                 goto done;
1250
1251         if (!PageCgroupUsed(pc))
1252                 goto done;
1253
1254         /*
1255          * Preemption is already disabled, we don't need get_cpu()
1256          */
1257         cpu = smp_processor_id();
1258         stat = &mem->stat;
1259         cpustat = &stat->cpustat[cpu];
1260
1261         __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
1262 done:
1263         unlock_page_cgroup(pc);
1264 }
1265
/*
 * Try to charge PAGE_SIZE to a memory cgroup, reclaiming if necessary.
 *
 * @mm:       mm whose memcg is charged when *@memcg is NULL on entry.
 * @gfp_mask: without __GFP_WAIT no reclaim is attempted and a failed charge
 *            returns -ENOMEM right away.
 * @memcg:    in/out. If *@memcg is non-NULL on entry that memcg is charged
 *            (and an extra css reference is taken on it); otherwise the
 *            memcg is looked up from @mm and stored here. Set to NULL when
 *            the current thread has TIF_MEMDIE, in which case nothing is
 *            accounted.
 * @oom:      Unlike the exported interface, the "oom" parameter is added.
 *            If oom==true, the oom-killer can be invoked once the retries
 *            are exhausted.
 * @page:     passed to mem_cgroup_update_tree() for soft-limit tracking.
 *
 * Returns 0 on success (including the cases where nothing was charged
 * because *@memcg ended up NULL or the memcg is the root cgroup), or
 * -ENOMEM when the charge failed. On the -ENOMEM path the css reference on
 * the memcg is dropped again before returning; on success it is kept.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
                        gfp_t gfp_mask, struct mem_cgroup **memcg,
                        bool oom, struct page *page)
{
        struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct res_counter *fail_res, *soft_fail_res = NULL;

        if (unlikely(test_thread_flag(TIF_MEMDIE))) {
                /* Don't account this! */
                *memcg = NULL;
                return 0;
        }

        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
        mem = *memcg;
        if (likely(!mem)) {
                /* presumably returns with a css reference held - confirm */
                mem = try_get_mem_cgroup_from_mm(mm);
                *memcg = mem;
        } else {
                css_get(&mem->css);
        }
        if (unlikely(!mem))
                return 0;

        VM_BUG_ON(css_is_removed(&mem->css));

        while (1) {
                int ret = 0;
                unsigned long flags = 0;

                /* The root cgroup is not charged against the res counters. */
                if (mem_cgroup_is_root(mem))
                        goto done;
                ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
                                                &soft_fail_res);
                if (likely(!ret)) {
                        if (!do_swap_account)
                                break;
                        ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
                                                        &fail_res, NULL);
                        if (likely(!ret))
                                break;
                        /* mem+swap counter fails */
                        res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
                        flags |= MEM_CGROUP_RECLAIM_NOSWAP;
                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
                                                                        memsw);
                } else
                        /* mem counter fails */
                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
                                                                        res);

                if (!(gfp_mask & __GFP_WAIT))
                        goto nomem;

                /* A non-zero result means it is worth retrying the charge. */
                ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
                                                gfp_mask, flags);
                if (ret)
                        continue;

                /*
                 * try_to_free_mem_cgroup_pages() might not give us a full
                 * picture of reclaim. Some pages are reclaimed and might be
                 * moved to swap cache or just unmapped from the cgroup.
                 * Check the limit again to see if the reclaim reduced the
                 * current usage of the cgroup before giving up
                 *
                 */
                if (mem_cgroup_check_under_limit(mem_over_limit))
                        continue;

                if (!nr_retries--) {
                        if (oom) {
                                mutex_lock(&memcg_tasklist);
                                mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
                                mutex_unlock(&memcg_tasklist);
                                record_last_oom(mem_over_limit);
                        }
                        goto nomem;
                }
        }
        /*
         * Insert just the ancestor, we should trickle down to the correct
         * cgroup for reclaim, since the other nodes will be below their
         * soft limit
         */
        if (soft_fail_res) {
                mem_over_soft_limit =
                        mem_cgroup_from_res_counter(soft_fail_res, res);
                if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
                        mem_cgroup_update_tree(mem_over_soft_limit, page);
        }
done:
        return 0;
nomem:
        /* Failed: give back the css reference taken at the top. */
        css_put(&mem->css);
        return -ENOMEM;
}
1373
1374 /*
1375  * A helper function to get mem_cgroup from ID. must be called under
1376  * rcu_read_lock(). The caller must check css_is_removed() or some if
1377  * it's concern. (dropping refcnt from swap can be called against removed
1378  * memcg.)
1379  */
1380 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1381 {
1382         struct cgroup_subsys_state *css;
1383
1384         /* ID 0 is unused ID */
1385         if (!id)
1386                 return NULL;
1387         css = css_lookup(&mem_cgroup_subsys, id);
1388         if (!css)
1389                 return NULL;
1390         return container_of(css, struct mem_cgroup, css);
1391 }
1392
1393 static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
1394 {
1395         struct mem_cgroup *mem;
1396         struct page_cgroup *pc;
1397         unsigned short id;
1398         swp_entry_t ent;
1399
1400         VM_BUG_ON(!PageLocked(page));
1401
1402         if (!PageSwapCache(page))
1403                 return NULL;
1404
1405         pc = lookup_page_cgroup(page);
1406         lock_page_cgroup(pc);
1407         if (PageCgroupUsed(pc)) {
1408                 mem = pc->mem_cgroup;
1409                 if (mem && !css_tryget(&mem->css))
1410                         mem = NULL;
1411         } else {
1412                 ent.val = page_private(page);
1413                 id = lookup_swap_cgroup(ent);
1414                 rcu_read_lock();
1415                 mem = mem_cgroup_lookup(id);
1416                 if (mem && !css_tryget(&mem->css))
1417                         mem = NULL;
1418                 rcu_read_unlock();
1419         }
1420         unlock_page_cgroup(pc);
1421         return mem;
1422 }
1423
/*
 * Commit a charge obtained by __mem_cgroup_try_charge() and put the
 * page_cgroup into the USED state. If it is already USED, undo the charge
 * (res and, with swap accounting, memsw; skipped for the root cgroup), drop
 * the css reference and return.
 *
 * @mem:   memcg that was charged; may be NULL, in which case nothing is done.
 * @pc:    page_cgroup to bind to @mem.
 * @ctype: CACHE and SHMEM set the page_cgroup Cache flag, MAPPED clears it;
 *         other types set neither the Cache nor the Used flag here.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
                                     struct page_cgroup *pc,
                                     enum charge_type ctype)
{
        /* try_charge() can return NULL to *memcg, taking care of it. */
        if (!mem)
                return;

        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
                unlock_page_cgroup(pc);
                if (!mem_cgroup_is_root(mem)) {
                        res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
                        if (do_swap_account)
                                res_counter_uncharge(&mem->memsw, PAGE_SIZE,
                                                        NULL);
                }
                css_put(&mem->css);
                return;
        }

        pc->mem_cgroup = mem;
        /*
         * We access a page_cgroup asynchronously without lock_page_cgroup().
         * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
         * is accessed after testing USED bit. To make pc->mem_cgroup visible
         * before USED bit, we need memory barrier here.
         * See mem_cgroup_add_lru_list(), etc.
         */
        smp_wmb();
        switch (ctype) {
        case MEM_CGROUP_CHARGE_TYPE_CACHE:
        case MEM_CGROUP_CHARGE_TYPE_SHMEM:
                SetPageCgroupCache(pc);
                SetPageCgroupUsed(pc);
                break;
        case MEM_CGROUP_CHARGE_TYPE_MAPPED:
                ClearPageCgroupCache(pc);
                SetPageCgroupUsed(pc);
                break;
        default:
                break;
        }

        mem_cgroup_charge_statistics(mem, pc, true);

        unlock_page_cgroup(pc);
}
1477
/**
 * mem_cgroup_move_account - move account of the page
 * @pc: page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
 *
 * returns 0 at success,
 * returns -EBUSY when lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 *
 * On success the css reference held on @from is dropped and a new css
 * reference is taken on @to.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
        struct mem_cgroup *from, struct mem_cgroup *to)
{
        struct mem_cgroup_per_zone *from_mz, *to_mz;
        int nid, zid;
        int ret = -EBUSY;
        struct page *page;
        int cpu;
        struct mem_cgroup_stat *stat;
        struct mem_cgroup_stat_cpu *cpustat;

        VM_BUG_ON(from == to);
        VM_BUG_ON(PageLRU(pc->page));

        /* NOTE(review): from_mz/to_mz are computed but never read below. */
        nid = page_cgroup_nid(pc);
        zid = page_cgroup_zid(pc);
        from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
        to_mz =  mem_cgroup_zoneinfo(to, nid, zid);

        if (!trylock_page_cgroup(pc))
                return ret;

        /* "pc" is unstable: not charged, or no longer charged to @from. */
        if (!PageCgroupUsed(pc))
                goto out;

        if (pc->mem_cgroup != from)
                goto out;

        if (!mem_cgroup_is_root(from))
                res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
        mem_cgroup_charge_statistics(from, pc, false);

        page = pc->page;
        if (page_is_file_cache(page) && page_mapped(page)) {
                cpu = smp_processor_id();
                /* Update mapped_file data for mem_cgroup "from" */
                stat = &from->stat;
                cpustat = &stat->cpustat[cpu];
                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
                                                -1);

                /* Update mapped_file data for mem_cgroup "to" */
                stat = &to->stat;
                cpustat = &stat->cpustat[cpu];
                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
                                                1);
        }

        if (do_swap_account && !mem_cgroup_is_root(from))
                res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
        css_put(&from->css);

        css_get(&to->css);
        pc->mem_cgroup = to;
        mem_cgroup_charge_statistics(to, pc, true);
        ret = 0;
out:
        unlock_page_cgroup(pc);
        /*
         * We charge against "to" which may not have any tasks. Then, "to"
         * can be under rmdir(). But in the current implementation, the
         * caller of this function is just force_empty() and it's guaranteed
         * that "to" is never removed. So, we don't check rmdir status here.
         */
        return ret;
}
1560
/*
 * Move the charge of @pc's page from @child to its parent memcg.
 *
 * The parent is charged first via __mem_cgroup_try_charge() (without
 * invoking the OOM killer), then the page is pinned, isolated from the LRU
 * and re-accounted with mem_cgroup_move_account().
 *
 * Returns 0 on success (or when try_charge left the parent NULL), -EINVAL
 * when @child is the root cgroup, -EBUSY when the page reference cannot be
 * taken, or the error from try_charge / isolate_lru_page() /
 * mem_cgroup_move_account(). On failure after charging, the parent's charge
 * is undone.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
                                  struct mem_cgroup *child,
                                  gfp_t gfp_mask)
{
        struct page *page = pc->page;
        struct cgroup *cg = child->css.cgroup;
        struct cgroup *pcg = cg->parent;
        struct mem_cgroup *parent;
        int ret;

        /* Is ROOT ? */
        if (!pcg)
                return -EINVAL;


        parent = mem_cgroup_from_cont(pcg);


        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
        if (ret || !parent)
                return ret;

        if (!get_page_unless_zero(page)) {
                ret = -EBUSY;
                goto uncharge;
        }

        ret = isolate_lru_page(page);

        if (ret)
                goto cancel;

        ret = mem_cgroup_move_account(pc, child, parent);

        putback_lru_page(page);
        if (!ret) {
                put_page(page);
                /*
                 * drop extra refcnt by try_charge(); move_account() took its
                 * own reference on the parent
                 */
                css_put(&parent->css);
                return 0;
        }

cancel:
        put_page(page);
uncharge:
        /* drop extra refcnt by try_charge() */
        css_put(&parent->css);
        /* uncharge if move fails */
        if (!mem_cgroup_is_root(parent)) {
                res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
                if (do_swap_account)
                        res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
        }
        return ret;
}
1620
1621 /*
1622  * Charge the memory controller for page usage.
1623  * Return
1624  * 0 if the charge was successful
1625  * < 0 if the cgroup is over its limit
1626  */
1627 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1628                                 gfp_t gfp_mask, enum charge_type ctype,
1629                                 struct mem_cgroup *memcg)
1630 {
1631         struct mem_cgroup *mem;
1632         struct page_cgroup *pc;
1633         int ret;
1634
1635         pc = lookup_page_cgroup(page);
1636         /* can happen at boot */
1637         if (unlikely(!pc))
1638                 return 0;
1639         prefetchw(pc);
1640
1641         mem = memcg;
1642         ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1643         if (ret || !mem)
1644                 return ret;
1645
1646         __mem_cgroup_commit_charge(mem, pc, ctype);
1647         return 0;
1648 }
1649
1650 int mem_cgroup_newpage_charge(struct page *page,
1651                               struct mm_struct *mm, gfp_t gfp_mask)
1652 {
1653         if (mem_cgroup_disabled())
1654                 return 0;
1655         if (PageCompound(page))
1656                 return 0;
1657         /*
1658          * If already mapped, we don't have to account.
1659          * If page cache, page->mapping has address_space.
1660          * But page->mapping may have out-of-use anon_vma pointer,
1661          * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
1662          * is NULL.
1663          */
1664         if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1665                 return 0;
1666         if (unlikely(!mm))
1667                 mm = &init_mm;
1668         return mem_cgroup_charge_common(page, mm, gfp_mask,
1669                                 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1670 }
1671
/*
 * Forward declaration: mem_cgroup_cache_charge() calls this before its
 * definition further down in the file.
 */
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
                                        enum charge_type ctype);
1675
1676 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1677                                 gfp_t gfp_mask)
1678 {
1679         struct mem_cgroup *mem = NULL;
1680         int ret;
1681
1682         if (mem_cgroup_disabled())
1683                 return 0;
1684         if (PageCompound(page))
1685                 return 0;
1686         /*
1687          * Corner case handling. This is called from add_to_page_cache()
1688          * in usual. But some FS (shmem) precharges this page before calling it
1689          * and call add_to_page_cache() with GFP_NOWAIT.
1690          *
1691          * For GFP_NOWAIT case, the page may be pre-charged before calling
1692          * add_to_page_cache(). (See shmem.c) check it here and avoid to call
1693          * charge twice. (It works but has to pay a bit larger cost.)
1694          * And when the page is SwapCache, it should take swap information
1695          * into account. This is under lock_page() now.
1696          */
1697         if (!(gfp_mask & __GFP_WAIT)) {
1698                 struct page_cgroup *pc;
1699
1700
1701                 pc = lookup_page_cgroup(page);
1702                 if (!pc)
1703                         return 0;
1704                 lock_page_cgroup(pc);
1705                 if (PageCgroupUsed(pc)) {
1706                         unlock_page_cgroup(pc);
1707                         return 0;
1708                 }
1709                 unlock_page_cgroup(pc);
1710         }
1711
1712         if (unlikely(!mm && !mem))
1713                 mm = &init_mm;
1714
1715         if (page_is_file_cache(page))
1716                 return mem_cgroup_charge_common(page, mm, gfp_mask,
1717                                 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1718
1719         /* shmem */
1720         if (PageSwapCache(page)) {
1721                 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1722                 if (!ret)
1723                         __mem_cgroup_commit_charge_swapin(page, mem,
1724                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
1725         } else
1726                 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1727                                         MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1728
1729         return ret;
1730 }
1731
1732 /*
1733  * While swap-in, try_charge -> commit or cancel, the page is locked.
1734  * And when try_charge() successfully returns, one refcnt to memcg without
1735  * struct page_cgroup is aquired. This refcnt will be cumsumed by
1736  * "commit()" or removed by "cancel()"
1737  */
1738 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1739                                  struct page *page,
1740                                  gfp_t mask, struct mem_cgroup **ptr)
1741 {
1742         struct mem_cgroup *mem;
1743         int ret;
1744
1745         if (mem_cgroup_disabled())
1746                 return 0;
1747
1748         if (!do_swap_account)
1749                 goto charge_cur_mm;
1750         /*
1751          * A racing thread's fault, or swapoff, may have already updated
1752          * the pte, and even removed page from swap cache: return success
1753          * to go on to do_swap_page()'s pte_same() test, which should fail.
1754          */
1755         if (!PageSwapCache(page))
1756                 return 0;
1757         mem = try_get_mem_cgroup_from_swapcache(page);
1758         if (!mem)
1759                 goto charge_cur_mm;
1760         *ptr = mem;
1761         ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1762         /* drop extra refcnt from tryget */
1763         css_put(&mem->css);
1764         return ret;
1765 charge_cur_mm:
1766         if (unlikely(!mm))
1767                 mm = &init_mm;
1768         return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1769 }
1770
1771 static void
1772 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1773                                         enum charge_type ctype)
1774 {
1775         struct page_cgroup *pc;
1776
1777         if (mem_cgroup_disabled())
1778                 return;
1779         if (!ptr)
1780                 return;
1781         cgroup_exclude_rmdir(&ptr->css);
1782         pc = lookup_page_cgroup(page);
1783         mem_cgroup_lru_del_before_commit_swapcache(page);
1784         __mem_cgroup_commit_charge(ptr, pc, ctype);
1785         mem_cgroup_lru_add_after_commit_swapcache(page);
1786         /*
1787          * Now swap is on-memory. This means this page may be
1788          * counted both as mem and swap....double count.
1789          * Fix it by uncharging from memsw. Basically, this SwapCache is stable
1790          * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
1791          * may call delete_from_swap_cache() before reach here.
1792          */
1793         if (do_swap_account && PageSwapCache(page)) {
1794                 swp_entry_t ent = {.val = page_private(page)};
1795                 unsigned short id;
1796                 struct mem_cgroup *memcg;
1797
1798                 id = swap_cgroup_record(ent, 0);
1799                 rcu_read_lock();
1800                 memcg = mem_cgroup_lookup(id);
1801                 if (memcg) {
1802                         /*
1803                          * This recorded memcg can be obsolete one. So, avoid
1804                          * calling css_tryget
1805                          */
1806                         if (!mem_cgroup_is_root(memcg))
1807                                 res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
1808                                                         NULL);
1809                         mem_cgroup_swap_statistics(memcg, false);
1810                         mem_cgroup_put(memcg);
1811                 }
1812                 rcu_read_unlock();
1813         }
1814         /*
1815          * At swapin, we may charge account against cgroup which has no tasks.
1816          * So, rmdir()->pre_destroy() can be called while we do this charge.
1817          * In that case, we need to call pre_destroy() again. check it here.
1818          */
1819         cgroup_release_and_wakeup_rmdir(&ptr->css);
1820 }
1821
1822 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1823 {
1824         __mem_cgroup_commit_charge_swapin(page, ptr,
1825                                         MEM_CGROUP_CHARGE_TYPE_MAPPED);
1826 }
1827
1828 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1829 {
1830         if (mem_cgroup_disabled())
1831                 return;
1832         if (!mem)
1833                 return;
1834         if (!mem_cgroup_is_root(mem)) {
1835                 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1836                 if (do_swap_account)
1837                         res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1838         }
1839         css_put(&mem->css);
1840 }
1841
1842
1843 /*
1844  * uncharge if !page_mapped(page)
1845  */
1846 static struct mem_cgroup *
1847 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1848 {
1849         struct page_cgroup *pc;
1850         struct mem_cgroup *mem = NULL;
1851         struct mem_cgroup_per_zone *mz;
1852         bool soft_limit_excess = false;
1853
1854         if (mem_cgroup_disabled())
1855                 return NULL;
1856
1857         if (PageSwapCache(page))
1858                 return NULL;
1859
1860         /*
1861          * Check if our page_cgroup is valid
1862          */
1863         pc = lookup_page_cgroup(page);
1864         if (unlikely(!pc || !PageCgroupUsed(pc)))
1865                 return NULL;
1866
1867         lock_page_cgroup(pc);
1868
1869         mem = pc->mem_cgroup;
1870
1871         if (!PageCgroupUsed(pc))
1872                 goto unlock_out;
1873
1874         switch (ctype) {
1875         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1876         case MEM_CGROUP_CHARGE_TYPE_DROP:
1877                 if (page_mapped(page))
1878                         goto unlock_out;
1879                 break;
1880         case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1881                 if (!PageAnon(page)) {  /* Shared memory */
1882                         if (page->mapping && !page_is_file_cache(page))
1883                                 goto unlock_out;
1884                 } else if (page_mapped(page)) /* Anon */
1885                                 goto unlock_out;
1886                 break;
1887         default:
1888                 break;
1889         }
1890
1891         if (!mem_cgroup_is_root(mem)) {
1892                 res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
1893                 if (do_swap_account &&
1894                                 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1895                         res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1896         }
1897         if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1898                 mem_cgroup_swap_statistics(mem, true);
1899         mem_cgroup_charge_statistics(mem, pc, false);
1900
1901         ClearPageCgroupUsed(pc);
1902         /*
1903          * pc->mem_cgroup is not cleared here. It will be accessed when it's
1904          * freed from LRU. This is safe because uncharged page is expected not
1905          * to be reused (freed soon). Exception is SwapCache, it's handled by
1906          * special functions.
1907          */
1908
1909         mz = page_cgroup_zoneinfo(pc);
1910         unlock_page_cgroup(pc);
1911
1912         if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
1913                 mem_cgroup_update_tree(mem, page);
1914         /* at swapout, this memcg will be accessed to record to swap */
1915         if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1916                 css_put(&mem->css);
1917
1918         return mem;
1919
1920 unlock_out:
1921         unlock_page_cgroup(pc);
1922         return NULL;
1923 }
1924
1925 void mem_cgroup_uncharge_page(struct page *page)
1926 {
1927         /* early check. */
1928         if (page_mapped(page))
1929                 return;
1930         if (page->mapping && !PageAnon(page))
1931                 return;
1932         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1933 }
1934
1935 void mem_cgroup_uncharge_cache_page(struct page *page)
1936 {
1937         VM_BUG_ON(page_mapped(page));
1938         VM_BUG_ON(page->mapping);
1939         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1940 }
1941
#ifdef CONFIG_SWAP
/*
 * Called after __delete_from_swap_cache() to drop the accounting of "page".
 * On a real swap-out (@swapout true) the owning memcg is recorded in the
 * swap_cgroup entry of "ent"; otherwise the page was swap cache whose swap
 * is unused and it is simply dropped.
 */
void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
{
	struct mem_cgroup *memcg;
	int ctype;

	ctype = swapout ? MEM_CGROUP_CHARGE_TYPE_SWAPOUT
			: MEM_CGROUP_CHARGE_TYPE_DROP;

	memcg = __mem_cgroup_uncharge_common(page, ctype);

	/* nothing more to do unless a swap-out was actually uncharged */
	if (!swapout || !memcg)
		return;

	/* record memcg information */
	if (do_swap_account) {
		swap_cgroup_record(ent, css_id(&memcg->css));
		mem_cgroup_get(memcg);
	}

	/*
	 * __mem_cgroup_uncharge_common() skips its css_put() for the
	 * SWAPOUT type so that memcg stays usable above; drop it now.
	 */
	css_put(&memcg->css);
}
#endif
1967
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * Called from swap_entry_free(): clear the swap_cgroup record of "ent" and
 * uncharge the "memsw" account of the memcg that was recorded there.
 */
void mem_cgroup_uncharge_swap(swp_entry_t ent)
{
	struct mem_cgroup *owner;
	unsigned short owner_id;

	if (!do_swap_account)
		return;

	/* read the recorded owner id and reset the record to 0 */
	owner_id = swap_cgroup_record(ent, 0);

	rcu_read_lock();
	owner = mem_cgroup_lookup(owner_id);
	if (!owner)
		goto out;

	/*
	 * The swap entry is being freed, so give the charge back.  The memcg
	 * found here may be an obsolete one; css_tryget is avoided.
	 */
	if (!mem_cgroup_is_root(owner))
		res_counter_uncharge(&owner->memsw, PAGE_SIZE, NULL);
	mem_cgroup_swap_statistics(owner, false);
	mem_cgroup_put(owner);
out:
	rcu_read_unlock();
}
#endif
1997
1998 /*
1999  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2000  * page belongs to.
2001  */
2002 int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2003 {
2004         struct page_cgroup *pc;
2005         struct mem_cgroup *mem = NULL;
2006         int ret = 0;
2007
2008         if (mem_cgroup_disabled())
2009                 return 0;
2010
2011         pc = lookup_page_cgroup(page);
2012         lock_page_cgroup(pc);
2013         if (PageCgroupUsed(pc)) {
2014                 mem = pc->mem_cgroup;
2015                 css_get(&mem->css);
2016         }
2017         unlock_page_cgroup(pc);
2018
2019         if (mem) {
2020                 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2021                                                 page);
2022                 css_put(&mem->css);
2023         }
2024         *ptr = mem;
2025         return ret;
2026 }
2027
2028 /* remove redundant charge if migration failed*/
2029 void mem_cgroup_end_migration(struct mem_cgroup *mem,
2030                 struct page *oldpage, struct page *newpage)
2031 {
2032         struct page *target, *unused;
2033         struct page_cgroup *pc;
2034         enum charge_type ctype;
2035
2036         if (!mem)
2037                 return;
2038         cgroup_exclude_rmdir(&mem->css);
2039         /* at migration success, oldpage->mapping is NULL. */
2040         if (oldpage->mapping) {
2041                 target = oldpage;
2042                 unused = NULL;
2043         } else {
2044                 target = newpage;
2045                 unused = oldpage;
2046         }
2047
2048         if (PageAnon(target))
2049                 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2050         else if (page_is_file_cache(target))
2051                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2052         else
2053                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2054
2055         /* unused page is not on radix-tree now. */
2056         if (unused)
2057                 __mem_cgroup_uncharge_common(unused, ctype);
2058
2059         pc = lookup_page_cgroup(target);
2060         /*
2061          * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
2062          * So, double-counting is effectively avoided.
2063          */
2064         __mem_cgroup_commit_charge(mem, pc, ctype);
2065
2066         /*
2067          * Both of oldpage and newpage are still under lock_page().
2068          * Then, we don't have to care about race in radix-tree.
2069          * But we have to be careful that this page is unmapped or not.
2070          *
2071          * There is a case for !page_mapped(). At the start of
2072          * migration, oldpage was mapped. But now, it's zapped.
2073          * But we know *target* page is not freed/reused under us.
2074          * mem_cgroup_uncharge_page() does all necessary checks.
2075          */
2076         if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2077                 mem_cgroup_uncharge_page(target);
2078         /*
2079          * At migration, we may charge account against cgroup which has no tasks
2080          * So, rmdir()->pre_destroy() can be called while we do this charge.
2081          * In that case, we need to call pre_destroy() again. check it here.
2082          */
2083         cgroup_release_and_wakeup_rmdir(&mem->css);
2084 }
2085
2086 /*
2087  * A call to try to shrink memory usage on charge failure at shmem's swapin.
2088  * Calling hierarchical_reclaim is not enough because we should update
2089  * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2090  * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
2091  * not from the memcg which this page would be charged to.
2092  * try_charge_swapin does all of these works properly.
2093  */
2094 int mem_cgroup_shmem_charge_fallback(struct page *page,
2095                             struct mm_struct *mm,
2096                             gfp_t gfp_mask)
2097 {
2098         struct mem_cgroup *mem = NULL;
2099         int ret;
2100
2101         if (mem_cgroup_disabled())
2102                 return 0;
2103
2104         ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2105         if (!ret)
2106                 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
2107
2108         return ret;
2109 }
2110
/*
 * Held around the limit comparison and res_counter_set_limit() call in
 * mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit(), so the
 * res and memsw limits are checked and updated as one step.
 */
static DEFINE_MUTEX(set_limit_mutex);
2112
2113 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2114                                 unsigned long long val)
2115 {
2116         int retry_count;
2117         int progress;
2118         u64 memswlimit;
2119         int ret = 0;
2120         int children = mem_cgroup_count_children(memcg);
2121         u64 curusage, oldusage;
2122
2123         /*
2124          * For keeping hierarchical_reclaim simple, how long we should retry
2125          * is depends on callers. We set our retry-count to be function
2126          * of # of children which we should visit in this loop.
2127          */
2128         retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2129
2130         oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2131
2132         while (retry_count) {
2133                 if (signal_pending(current)) {
2134                         ret = -EINTR;
2135                         break;
2136                 }
2137                 /*
2138                  * Rather than hide all in some function, I do this in
2139                  * open coded manner. You see what this really does.
2140                  * We have to guarantee mem->res.limit < mem->memsw.limit.
2141                  */
2142                 mutex_lock(&set_limit_mutex);
2143                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2144                 if (memswlimit < val) {
2145                         ret = -EINVAL;
2146                         mutex_unlock(&set_limit_mutex);
2147                         break;
2148                 }
2149                 ret = res_counter_set_limit(&memcg->res, val);
2150                 if (!ret) {
2151                         if (memswlimit == val)
2152                                 memcg->memsw_is_minimum = true;
2153                         else
2154                                 memcg->memsw_is_minimum = false;
2155                 }
2156                 mutex_unlock(&set_limit_mutex);
2157
2158                 if (!ret)
2159                         break;
2160
2161                 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
2162                                                 GFP_KERNEL,
2163                                                 MEM_CGROUP_RECLAIM_SHRINK);
2164                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2165                 /* Usage is reduced ? */
2166                 if (curusage >= oldusage)
2167                         retry_count--;
2168                 else
2169                         oldusage = curusage;
2170         }
2171
2172         return ret;
2173 }
2174
2175 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2176                                         unsigned long long val)
2177 {
2178         int retry_count;
2179         u64 memlimit, oldusage, curusage;
2180         int children = mem_cgroup_count_children(memcg);
2181         int ret = -EBUSY;
2182
2183         /* see mem_cgroup_resize_res_limit */
2184         retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2185         oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2186         while (retry_count) {
2187                 if (signal_pending(current)) {
2188                         ret = -EINTR;
2189                         break;
2190                 }
2191                 /*
2192                  * Rather than hide all in some function, I do this in
2193                  * open coded manner. You see what this really does.
2194                  * We have to guarantee mem->res.limit < mem->memsw.limit.
2195                  */
2196                 mutex_lock(&set_limit_mutex);
2197                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2198                 if (memlimit > val) {
2199                         ret = -EINVAL;
2200                         mutex_unlock(&set_limit_mutex);
2201                         break;
2202                 }
2203                 ret = res_counter_set_limit(&memcg->memsw, val);
2204                 if (!ret) {
2205                         if (memlimit == val)
2206                                 memcg->memsw_is_minimum = true;
2207                         else
2208                                 memcg->memsw_is_minimum = false;
2209                 }
2210                 mutex_unlock(&set_limit_mutex);
2211
2212                 if (!ret)
2213                         break;
2214
2215                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2216                                                 MEM_CGROUP_RECLAIM_NOSWAP |
2217                                                 MEM_CGROUP_RECLAIM_SHRINK);
2218                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2219                 /* Usage is reduced ? */
2220                 if (curusage >= oldusage)
2221                         retry_count--;
2222                 else
2223                         oldusage = curusage;
2224         }
2225         return ret;
2226 }
2227
2228 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2229                                                 gfp_t gfp_mask, int nid,
2230                                                 int zid)
2231 {
2232         unsigned long nr_reclaimed = 0;
2233         struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2234         unsigned long reclaimed;
2235         int loop = 0;
2236         struct mem_cgroup_tree_per_zone *mctz;
2237
2238         if (order > 0)
2239                 return 0;
2240
2241         mctz = soft_limit_tree_node_zone(nid, zid);
2242         /*
2243          * This loop can run a while, specially if mem_cgroup's continuously
2244          * keep exceeding their soft limit and putting the system under
2245          * pressure
2246          */
2247         do {
2248                 if (next_mz)
2249                         mz = next_mz;
2250                 else
2251                         mz = mem_cgroup_largest_soft_limit_node(mctz);
2252                 if (!mz)
2253                         break;
2254
2255                 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2256                                                 gfp_mask,
2257                                                 MEM_CGROUP_RECLAIM_SOFT);
2258                 nr_reclaimed += reclaimed;
2259                 spin_lock(&mctz->lock);
2260
2261                 /*
2262                  * If we failed to reclaim anything from this memory cgroup
2263                  * it is time to move on to the next cgroup
2264                  */
2265                 next_mz = NULL;
2266                 if (!reclaimed) {
2267                         do {
2268                                 /*
2269                                  * Loop until we find yet another one.
2270                                  *
2271                                  * By the time we get the soft_limit lock
2272                                  * again, someone might have aded the
2273                                  * group back on the RB tree. Iterate to
2274                                  * make sure we get a different mem.
2275                                  * mem_cgroup_largest_soft_limit_node returns
2276                                  * NULL if no other cgroup is present on
2277                                  * the tree
2278                                  */
2279                                 next_mz =
2280                                 __mem_cgroup_largest_soft_limit_node(mctz);
2281                                 if (next_mz == mz) {
2282                                         css_put(&next_mz->mem->css);
2283                                         next_mz = NULL;
2284                                 } else /* next_mz == NULL or other memcg */
2285                                         break;
2286                         } while (1);
2287                 }
2288                 mz->usage_in_excess =
2289                         res_counter_soft_limit_excess(&mz->mem->res);
2290                 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2291                 /*
2292                  * One school of thought says that we should not add
2293                  * back the node to the tree if reclaim returns 0.
2294                  * But our reclaim could return 0, simply because due
2295                  * to priority we are exposing a smaller subset of
2296                  * memory to reclaim from. Consider this as a longer
2297                  * term TODO.
2298                  */
2299                 if (mz->usage_in_excess)
2300                         __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
2301                 spin_unlock(&mctz->lock);
2302                 css_put(&mz->mem->css);
2303                 loop++;
2304                 /*
2305                  * Could not reclaim anything and there are no more
2306                  * mem cgroups to try or we seem to be looping without
2307                  * reclaiming anything.
2308                  */
2309                 if (!nr_reclaimed &&
2310                         (next_mz == NULL ||
2311                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2312                         break;
2313         } while (!nr_reclaimed);
2314         if (next_mz)
2315                 css_put(&next_mz->mem->css);
2316         return nr_reclaimed;
2317 }
2318
2319 /*
2320  * This routine traverse page_cgroup in given list and drop them all.
2321  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
2322  */
2323 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2324                                 int node, int zid, enum lru_list lru)
2325 {
2326         struct zone *zone;
2327         struct mem_cgroup_per_zone *mz;
2328         struct page_cgroup *pc, *busy;
2329         unsigned long flags, loop;
2330         struct list_head *list;
2331         int ret = 0;
2332
2333         zone = &NODE_DATA(node)->node_zones[zid];
2334         mz = mem_cgroup_zoneinfo(mem, node, zid);
2335         list = &mz->lists[lru];
2336
2337         loop = MEM_CGROUP_ZSTAT(mz, lru);
2338         /* give some margin against EBUSY etc...*/
2339         loop += 256;
2340         busy = NULL;
2341         while (loop--) {
2342                 ret = 0;
2343                 spin_lock_irqsave(&zone->lru_lock, flags);
2344                 if (list_empty(list)) {
2345                         spin_unlock_irqrestore(&zone->lru_lock, flags);
2346                         break;
2347                 }
2348                 pc = list_entry(list->prev, struct page_cgroup, lru);
2349                 if (busy == pc) {
2350                         list_move(&pc->lru, list);
2351                         busy = 0;
2352                         spin_unlock_irqrestore(&zone->lru_lock, flags);
2353                         continue;
2354                 }
2355                 spin_unlock_irqrestore(&zone->lru_lock, flags);
2356
2357                 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
2358                 if (ret == -ENOMEM)
2359                         break;
2360
2361                 if (ret == -EBUSY || ret == -EINVAL) {
2362                         /* found lock contention or "pc" is obsolete. */
2363                         busy = pc;
2364                         cond_resched();
2365                 } else
2366                         busy = NULL;
2367         }
2368
2369         if (!ret && !list_empty(list))
2370                 return -EBUSY;
2371         return ret;
2372 }
2373
2374 /*
2375  * make mem_cgroup's charge to be 0 if there is no task.
2376  * This enables deleting this mem_cgroup.
2377  */
2378 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2379 {
2380         int ret;
2381         int node, zid, shrink;
2382         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2383         struct cgroup *cgrp = mem->css.cgroup;
2384
2385         css_get(&mem->css);
2386
2387         shrink = 0;
2388         /* should free all ? */
2389         if (free_all)
2390                 goto try_to_free;
2391 move_account:
2392         while (mem->res.usage > 0) {
2393                 ret = -EBUSY;
2394                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2395                         goto out;
2396                 ret = -EINTR;
2397                 if (signal_pending(current))
2398                         goto out;
2399                 /* This is for making all *used* pages to be on LRU. */
2400                 lru_add_drain_all();
2401                 ret = 0;
2402                 for_each_node_state(node, N_HIGH_MEMORY) {
2403                         for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
2404                                 enum lru_list l;
2405                                 for_each_lru(l) {
2406                                         ret = mem_cgroup_force_empty_list(mem,
2407                                                         node, zid, l);
2408                                         if (ret)
2409                                                 break;
2410                                 }
2411                         }
2412                         if (ret)
2413                                 break;
2414                 }
2415                 /* it seems parent cgroup doesn't have enough mem */
2416                 if (ret == -ENOMEM)
2417                         goto try_to_free;
2418                 cond_resched();
2419         }
2420         ret = 0;
2421 out:
2422         css_put(&mem->css);
2423         return ret;
2424
2425 try_to_free:
2426         /* returns EBUSY if there is a task or if we come here twice. */
2427         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
2428                 ret = -EBUSY;
2429                 goto out;
2430         }
2431         /* we call try-to-free pages for make this cgroup empty */
2432         lru_add_drain_all();
2433         /* try to free all pages in this cgroup */
2434         shrink = 1;
2435         while (nr_retries && mem->res.usage > 0) {
2436                 int progress;
2437
2438                 if (signal_pending(current)) {
2439                         ret = -EINTR;
2440                         goto out;
2441                 }
2442                 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
2443                                                 false, get_swappiness(mem));
2444                 if (!progress) {
2445                         nr_retries--;
2446                         /* maybe some writeback is necessary */
2447                         congestion_wait(BLK_RW_ASYNC, HZ/10);
2448                 }
2449
2450         }
2451         lru_add_drain();
2452         /* try move_account...there may be some *locked* pages. */
2453         if (mem->res.usage)
2454                 goto move_account;
2455         ret = 0;
2456         goto out;
2457 }
2458
2459 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
2460 {
2461         return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
2462 }
2463
2464
2465 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
2466 {
2467         return mem_cgroup_from_cont(cont)->use_hierarchy;
2468 }
2469
2470 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2471                                         u64 val)
2472 {
2473         int retval = 0;
2474         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2475         struct cgroup *parent = cont->parent;
2476         struct mem_cgroup *parent_mem = NULL;
2477
2478         if (parent)
2479                 parent_mem = mem_cgroup_from_cont(parent);
2480
2481         cgroup_lock();
2482         /*
2483          * If parent's use_hiearchy is set, we can't make any modifications
2484          * in the child subtrees. If it is unset, then the change can
2485          * occur, provided the current cgroup has no children.
2486          *
2487          * For the root cgroup, parent_mem is NULL, we allow value to be
2488          * set if there are no children.
2489          */
2490         if ((!parent_mem || !parent_mem->use_hierarchy) &&
2491                                 (val == 1 || val == 0)) {
2492                 if (list_empty(&cont->children))
2493                         mem->use_hierarchy = val;
2494                 else
2495                         retval = -EBUSY;
2496         } else
2497                 retval = -EINVAL;
2498         cgroup_unlock();
2499
2500         return retval;
2501 }
2502
/*
 * Accumulator passed as the opaque data pointer to mem_cgroup_get_idx_stat()
 * when mem_cgroup_get_recursive_idx_stat() walks the tree.
 */
struct mem_cgroup_idx_data {
        s64 val;        /* running sum of the statistic read so far */
        enum mem_cgroup_stat_index idx; /* which statistic to read */
};
2507
2508 static int
2509 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2510 {
2511         struct mem_cgroup_idx_data *d = data;
2512         d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2513         return 0;
2514 }
2515
2516 static void
2517 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2518                                 enum mem_cgroup_stat_index idx, s64 *val)
2519 {
2520         struct mem_cgroup_idx_data d;
2521         d.idx = idx;
2522         d.val = 0;
2523         mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2524         *val = d.val;
2525 }
2526
2527 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2528 {
2529         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2530         u64 idx_val, val;
2531         int type, name;
2532
2533         type = MEMFILE_TYPE(cft->private);
2534         name = MEMFILE_ATTR(cft->private);
2535         switch (type) {
2536         case _MEM:
2537                 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2538                         mem_cgroup_get_recursive_idx_stat(mem,
2539                                 MEM_CGROUP_STAT_CACHE, &idx_val);
2540                         val = idx_val;
2541                         mem_cgroup_get_recursive_idx_stat(mem,
2542                                 MEM_CGROUP_STAT_RSS, &idx_val);
2543                         val += idx_val;
2544                         val <<= PAGE_SHIFT;
2545                 } else
2546                         val = res_counter_read_u64(&mem->res, name);
2547                 break;
2548         case _MEMSWAP:
2549                 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2550                         mem_cgroup_get_recursive_idx_stat(mem,
2551                                 MEM_CGROUP_STAT_CACHE, &idx_val);
2552                         val = idx_val;
2553                         mem_cgroup_get_recursive_idx_stat(mem,
2554                                 MEM_CGROUP_STAT_RSS, &idx_val);
2555                         val += idx_val;
2556                         mem_cgroup_get_recursive_idx_stat(mem,
2557                                 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2558                         val <<= PAGE_SHIFT;
2559                 } else
2560                         val = res_counter_read_u64(&mem->memsw, name);
2561                 break;
2562         default:
2563                 BUG();
2564                 break;
2565         }
2566         return val;
2567 }
2568 /*
2569  * The user of this function is...
2570  * RES_LIMIT.
2571  */
2572 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2573                             const char *buffer)
2574 {
2575         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2576         int type, name;
2577         unsigned long long val;
2578         int ret;
2579
2580         type = MEMFILE_TYPE(cft->private);
2581         name = MEMFILE_ATTR(cft->private);
2582         switch (name) {
2583         case RES_LIMIT:
2584                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2585                         ret = -EINVAL;
2586                         break;
2587                 }
2588                 /* This function does all necessary parse...reuse it */
2589                 ret = res_counter_memparse_write_strategy(buffer, &val);
2590                 if (ret)
2591                         break;
2592                 if (type == _MEM)
2593                         ret = mem_cgroup_resize_limit(memcg, val);
2594                 else
2595                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
2596                 break;
2597         case RES_SOFT_LIMIT:
2598                 ret = res_counter_memparse_write_strategy(buffer, &val);
2599                 if (ret)
2600                         break;
2601                 /*
2602                  * For memsw, soft limits are hard to implement in terms
2603                  * of semantics, for now, we support soft limits for
2604                  * control without swap
2605                  */
2606                 if (type == _MEM)
2607                         ret = res_counter_set_soft_limit(&memcg->res, val);
2608                 else
2609                         ret = -EINVAL;
2610                 break;
2611         default:
2612                 ret = -EINVAL; /* should be BUG() ? */
2613                 break;
2614         }
2615         return ret;
2616 }
2617
2618 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
2619                 unsigned long long *mem_limit, unsigned long long *memsw_limit)
2620 {
2621         struct cgroup *cgroup;
2622         unsigned long long min_limit, min_memsw_limit, tmp;
2623
2624         min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2625         min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2626         cgroup = memcg->css.cgroup;
2627         if (!memcg->use_hierarchy)
2628                 goto out;
2629
2630         while (cgroup->parent) {
2631                 cgroup = cgroup->parent;
2632                 memcg = mem_cgroup_from_cont(cgroup);
2633                 if (!memcg->use_hierarchy)
2634                         break;
2635                 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
2636                 min_limit = min(min_limit, tmp);
2637                 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2638                 min_memsw_limit = min(min_memsw_limit, tmp);
2639         }
2640 out:
2641         *mem_limit = min_limit;
2642         *memsw_limit = min_memsw_limit;
2643         return;
2644 }
2645
2646 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2647 {
2648         struct mem_cgroup *mem;
2649         int type, name;
2650
2651         mem = mem_cgroup_from_cont(cont);
2652         type = MEMFILE_TYPE(event);
2653         name = MEMFILE_ATTR(event);
2654         switch (name) {
2655         case RES_MAX_USAGE:
2656                 if (type == _MEM)
2657                         res_counter_reset_max(&mem->res);
2658                 else
2659                         res_counter_reset_max(&mem->memsw);
2660                 break;
2661         case RES_FAILCNT:
2662                 if (type == _MEM)
2663                         res_counter_reset_failcnt(&mem->res);
2664                 else
2665                         res_counter_reset_failcnt(&mem->memsw);
2666                 break;
2667         }
2668
2669         return 0;
2670 }
2671
2672
/*
 * Slots of the statistics array filled when reading memory.stat.
 * The order must match the entries of memcg_stat_strings[].
 */
enum {
	/* taken from the per-cpu statistics */
	MCS_CACHE,
	MCS_RSS,
	MCS_MAPPED_FILE,
	MCS_PGPGIN,
	MCS_PGPGOUT,
	MCS_SWAP,
	/* taken from the per-zone LRU statistics */
	MCS_INACTIVE_ANON,
	MCS_ACTIVE_ANON,
	MCS_INACTIVE_FILE,
	MCS_ACTIVE_FILE,
	MCS_UNEVICTABLE,
	/* number of slots; keep last */
	NR_MCS_STAT,
};
2688
2689 struct mcs_total_stat {
2690         s64 stat[NR_MCS_STAT];
2691 };
2692
2693 struct {
2694         char *local_name;
2695         char *total_name;
2696 } memcg_stat_strings[NR_MCS_STAT] = {
2697         {"cache", "total_cache"},
2698         {"rss", "total_rss"},
2699         {"mapped_file", "total_mapped_file"},
2700         {"pgpgin", "total_pgpgin"},
2701         {"pgpgout", "total_pgpgout"},
2702         {"swap", "total_swap"},
2703         {"inactive_anon", "total_inactive_anon"},
2704         {"active_anon", "total_active_anon"},
2705         {"inactive_file", "total_inactive_file"},
2706         {"active_file", "total_active_file"},
2707         {"unevictable", "total_unevictable"}
2708 };
2709
2710
2711 static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2712 {
2713         struct mcs_total_stat *s = data;
2714         s64 val;
2715
2716         /* per cpu stat */
2717         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2718         s->stat[MCS_CACHE] += val * PAGE_SIZE;
2719         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2720         s->stat[MCS_RSS] += val * PAGE_SIZE;
2721         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
2722         s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
2723         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2724         s->stat[MCS_PGPGIN] += val;
2725         val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2726         s->stat[MCS_PGPGOUT] += val;
2727         if (do_swap_account) {
2728                 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2729                 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2730         }
2731
2732         /* per zone stat */
2733         val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2734         s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2735         val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2736         s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2737         val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2738         s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2739         val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2740         s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2741         val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2742         s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2743         return 0;
2744 }
2745
2746 static void
2747 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
2748 {
2749         mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
2750 }
2751
2752 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2753                                  struct cgroup_map_cb *cb)
2754 {
2755         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
2756         struct mcs_total_stat mystat;
2757         int i;
2758
2759         memset(&mystat, 0, sizeof(mystat));
2760         mem_cgroup_get_local_stat(mem_cont, &mystat);
2761
2762         for (i = 0; i < NR_MCS_STAT; i++) {
2763                 if (i == MCS_SWAP && !do_swap_account)
2764                         continue;
2765                 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2766         }
2767
2768         /* Hierarchical information */
2769         {
2770                 unsigned long long limit, memsw_limit;
2771                 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
2772                 cb->fill(cb, "hierarchical_memory_limit", limit);
2773                 if (do_swap_account)
2774                         cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
2775         }
2776
2777         memset(&mystat, 0, sizeof(mystat));
2778         mem_cgroup_get_total_stat(mem_cont, &mystat);
2779         for (i = 0; i < NR_MCS_STAT; i++) {
2780                 if (i == MCS_SWAP && !do_swap_account)
2781                         continue;
2782                 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2783         }
2784
2785 #ifdef CONFIG_DEBUG_VM
2786         cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
2787
2788         {
2789                 int nid, zid;
2790                 struct mem_cgroup_per_zone *mz;
2791                 unsigned long recent_rotated[2] = {0, 0};
2792                 unsigned long recent_scanned[2] = {0, 0};
2793
2794                 for_each_online_node(nid)
2795                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2796                                 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
2797
2798                                 recent_rotated[0] +=
2799                                         mz->reclaim_stat.recent_rotated[0];
2800                                 recent_rotated[1] +=
2801                                         mz->reclaim_stat.recent_rotated[1];
2802                                 recent_scanned[0] +=
2803                                         mz->reclaim_stat.recent_scanned[0];
2804                                 recent_scanned[1] +=
2805                                         mz->reclaim_stat.recent_scanned[1];
2806                         }
2807                 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
2808                 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
2809                 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
2810                 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
2811         }
2812 #endif
2813
2814         return 0;
2815 }
2816
2817 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
2818 {
2819         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
2820
2821         return get_swappiness(memcg);
2822 }
2823
2824 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
2825                                        u64 val)
2826 {
2827         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
2828         struct mem_cgroup *parent;
2829
2830         if (val > 100)
2831                 return -EINVAL;
2832
2833         if (cgrp->parent == NULL)
2834                 return -EINVAL;
2835
2836         parent = mem_cgroup_from_cont(cgrp->parent);
2837
2838         cgroup_lock();
2839
2840         /* If under hierarchy, only empty-root can set this value */
2841         if ((parent->use_hierarchy) ||
2842             (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
2843                 cgroup_unlock();
2844                 return -EINVAL;
2845         }
2846
2847         spin_lock(&memcg->reclaim_param_lock);
2848         memcg->swappiness = val;
2849         spin_unlock(&memcg->reclaim_param_lock);
2850
2851         cgroup_unlock();
2852
2853         return 0;
2854 }
2855
2856
2857 static struct cftype mem_cgroup_files[] = {
2858         {
2859                 .name = "usage_in_bytes",
2860                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2861                 .read_u64 = mem_cgroup_read,
2862         },
2863         {
2864                 .name = "max_usage_in_bytes",
2865                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2866                 .trigger = mem_cgroup_reset,
2867                 .read_u64 = mem_cgroup_read,
2868         },
2869         {
2870                 .name = "limit_in_bytes",
2871                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2872                 .write_string = mem_cgroup_write,
2873                 .read_u64 = mem_cgroup_read,
2874         },
2875         {
2876                 .name = "soft_limit_in_bytes",
2877                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2878                 .write_string = mem_cgroup_write,
2879                 .read_u64 = mem_cgroup_read,
2880         },
2881         {
2882                 .name = "failcnt",
2883                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2884                 .trigger = mem_cgroup_reset,
2885                 .read_u64 = mem_cgroup_read,
2886         },
2887         {
2888                 .name = "stat",
2889                 .read_map = mem_control_stat_show,
2890         },
2891         {
2892                 .name = "force_empty",
2893                 .trigger = mem_cgroup_force_empty_write,
2894         },
2895         {
2896                 .name = "use_hierarchy",
2897                 .write_u64 = mem_cgroup_hierarchy_write,
2898                 .read_u64 = mem_cgroup_hierarchy_read,
2899         },
2900         {
2901                 .name = "swappiness",
2902                 .read_u64 = mem_cgroup_swappiness_read,
2903                 .write_u64 = mem_cgroup_swappiness_write,
2904         },
2905 };
2906
2907 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2908 static struct cftype memsw_cgroup_files[] = {
2909         {
2910                 .name = "memsw.usage_in_bytes",
2911                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2912                 .read_u64 = mem_cgroup_read,
2913         },
2914         {
2915                 .name = "memsw.max_usage_in_bytes",
2916                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2917                 .trigger = mem_cgroup_reset,
2918                 .read_u64 = mem_cgroup_read,
2919         },
2920         {
2921                 .name = "memsw.limit_in_bytes",
2922                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2923                 .write_string = mem_cgroup_write,
2924                 .read_u64 = mem_cgroup_read,
2925         },
2926         {
2927                 .name = "memsw.failcnt",
2928                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2929                 .trigger = mem_cgroup_reset,
2930                 .read_u64 = mem_cgroup_read,
2931         },
2932 };
2933
2934 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2935 {
2936         if (!do_swap_account)
2937                 return 0;
2938         return cgroup_add_files(cont, ss, memsw_cgroup_files,
2939                                 ARRAY_SIZE(memsw_cgroup_files));
2940 };
2941 #else
/* Swap accounting is compiled out: there are no memsw.* files to add. */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
2946 #endif
2947
2948 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2949 {
2950         struct mem_cgroup_per_node *pn;
2951         struct mem_cgroup_per_zone *mz;
2952         enum lru_list l;
2953         int zone, tmp = node;
2954         /*
2955          * This routine is called against possible nodes.
2956          * But it's BUG to call kmalloc() against offline node.
2957          *
2958          * TODO: this routine can waste much memory for nodes which will
2959          *       never be onlined. It's better to use memory hotplug callback
2960          *       function.
2961          */
2962         if (!node_state(node, N_NORMAL_MEMORY))
2963                 tmp = -1;
2964         pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
2965         if (!pn)
2966                 return 1;
2967
2968         mem->info.nodeinfo[node] = pn;
2969         memset(pn, 0, sizeof(*pn));
2970
2971         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
2972                 mz = &pn->zoneinfo[zone];
2973                 for_each_lru(l)
2974                         INIT_LIST_HEAD(&mz->lists[l]);
2975                 mz->usage_in_excess = 0;
2976                 mz->on_tree = false;
2977                 mz->mem = mem;
2978         }
2979         return 0;
2980 }
2981
2982 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2983 {
2984         kfree(mem->info.nodeinfo[node]);
2985 }
2986
2987 static int mem_cgroup_size(void)
2988 {
2989         int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2990         return sizeof(struct mem_cgroup) + cpustat_size;
2991 }
2992
2993 static struct mem_cgroup *mem_cgroup_alloc(void)
2994 {
2995         struct mem_cgroup *mem;
2996         int size = mem_cgroup_size();
2997
2998         if (size < PAGE_SIZE)
2999                 mem = kmalloc(size, GFP_KERNEL);
3000         else
3001                 mem = vmalloc(size);
3002
3003         if (mem)
3004                 memset(mem, 0, size);
3005         return mem;
3006 }
3007
/*
 * When a mem_cgroup is destroyed, references to it from swap_cgroup may
 * still remain (scanning all of them at force_empty would be too costly).
 *
 * Instead of clearing every such reference at force_empty, we count the
 * references held by swap_cgroup and free the mem_cgroup only when that
 * count drops to 0.
 *
 * Removal of the cgroup itself succeeds regardless of references from swap.
 */
3018
3019 static void __mem_cgroup_free(struct mem_cgroup *mem)
3020 {
3021         int node;
3022
3023         mem_cgroup_remove_from_trees(mem);
3024         free_css_id(&mem_cgroup_subsys, &mem->css);
3025
3026         for_each_node_state(node, N_POSSIBLE)
3027                 free_mem_cgroup_per_zone_info(mem, node);
3028
3029         if (mem_cgroup_size() < PAGE_SIZE)
3030                 kfree(mem);
3031         else
3032                 vfree(mem);
3033 }
3034
3035 static void mem_cgroup_get(struct mem_cgroup *mem)
3036 {
3037         atomic_inc(&mem->refcnt);
3038 }
3039
3040 static void mem_cgroup_put(struct mem_cgroup *mem)
3041 {
3042         if (atomic_dec_and_test(&mem->refcnt)) {
3043                 struct mem_cgroup *parent = parent_mem_cgroup(mem);
3044                 __mem_cgroup_free(mem);
3045                 if (parent)
3046                         mem_cgroup_put(parent);
3047         }
3048 }
3049
3050 /*
3051  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
3052  */
3053 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
3054 {
3055         if (!mem->res.parent)
3056                 return NULL;
3057         return mem_cgroup_from_res_counter(mem->res.parent, res);
3058 }
3059
3060 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3061 static void __init enable_swap_cgroup(void)
3062 {
3063         if (!mem_cgroup_disabled() && really_do_swap_account)
3064                 do_swap_account = 1;
3065 }
3066 #else
3067 static void __init enable_swap_cgroup(void)
3068 {
3069 }
3070 #endif
3071
3072 static int mem_cgroup_soft_limit_tree_init(void)
3073 {
3074         struct mem_cgroup_tree_per_node *rtpn;
3075         struct mem_cgroup_tree_per_zone *rtpz;
3076         int tmp, node, zone;
3077
3078         for_each_node_state(node, N_POSSIBLE) {
3079                 tmp = node;
3080                 if (!node_state(node, N_NORMAL_MEMORY))
3081                         tmp = -1;
3082                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3083                 if (!rtpn)
3084                         return 1;
3085
3086                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3087
3088                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3089                         rtpz = &rtpn->rb_tree_per_zone[zone];
3090                         rtpz->rb_root = RB_ROOT;
3091                         spin_lock_init(&rtpz->lock);
3092                 }
3093         }
3094         return 0;
3095 }
3096
3097 static struct cgroup_subsys_state * __ref
3098 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3099 {
3100         struct mem_cgroup *mem, *parent;
3101         long error = -ENOMEM;
3102         int node;
3103
3104         mem = mem_cgroup_alloc();
3105         if (!mem)
3106                 return ERR_PTR(error);
3107
3108         for_each_node_state(node, N_POSSIBLE)
3109                 if (alloc_mem_cgroup_per_zone_info(mem, node))
3110                         goto free_out;
3111
3112         /* root ? */
3113         if (cont->parent == NULL) {
3114                 enable_swap_cgroup();
3115                 parent = NULL;
3116                 root_mem_cgroup = mem;
3117                 if (mem_cgroup_soft_limit_tree_init())
3118                         goto free_out;
3119
3120         } else {
3121                 parent = mem_cgroup_from_cont(cont->parent);
3122                 mem->use_hierarchy = parent->use_hierarchy;
3123         }
3124
3125         if (parent && parent->use_hierarchy) {
3126                 res_counter_init(&mem->res, &parent->res);
3127                 res_counter_init(&mem->memsw, &parent->memsw);
3128                 /*
3129                  * We increment refcnt of the parent to ensure that we can
3130                  * safely access it on res_counter_charge/uncharge.
3131                  * This refcnt will be decremented when freeing this
3132                  * mem_cgroup(see mem_cgroup_put).
3133                  */
3134                 mem_cgroup_get(parent);
3135         } else {
3136                 res_counter_init(&mem->res, NULL);
3137                 res_counter_init(&mem->memsw, NULL);
3138         }
3139         mem->last_scanned_child = 0;
3140         spin_lock_init(&mem->reclaim_param_lock);
3141
3142         if (parent)
3143                 mem->swappiness = get_swappiness(parent);
3144         atomic_set(&mem->refcnt, 1);
3145         return &mem->css;
3146 free_out:
3147         __mem_cgroup_free(mem);
3148         root_mem_cgroup = NULL;
3149         return ERR_PTR(error);
3150 }
3151
3152 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
3153                                         struct cgroup *cont)
3154 {
3155         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3156
3157         return mem_cgroup_force_empty(mem, false);
3158 }
3159
/*
 * cgroup "destroy" callback: drop the reference set up in
 * mem_cgroup_create(). The group is freed once no references remain.
 */
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_put(mem_cgroup_from_cont(cont));
}
3167
3168 static int mem_cgroup_populate(struct cgroup_subsys *ss,
3169                                 struct cgroup *cont)
3170 {
3171         int ret;
3172
3173         ret = cgroup_add_files(cont, ss, mem_cgroup_files,
3174                                 ARRAY_SIZE(mem_cgroup_files));
3175
3176         if (!ret)
3177                 ret = register_memsw_files(cont, ss);
3178         return ret;
3179 }
3180
3181 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3182                                 struct cgroup *cont,
3183                                 struct cgroup *old_cont,
3184                                 struct task_struct *p,
3185                                 bool threadgroup)
3186 {
3187         mutex_lock(&memcg_tasklist);
3188         /*
3189          * FIXME: It's better to move charges of this process from old
3190          * memcg to new memcg. But it's just on TODO-List now.
3191          */
3192         mutex_unlock(&memcg_tasklist);
3193 }
3194
3195 struct cgroup_subsys mem_cgroup_subsys = {
3196         .name = "memory",
3197         .subsys_id = mem_cgroup_subsys_id,
3198         .create = mem_cgroup_create,
3199         .pre_destroy = mem_cgroup_pre_destroy,
3200         .destroy = mem_cgroup_destroy,
3201         .populate = mem_cgroup_populate,
3202         .attach = mem_cgroup_move_task,
3203         .early_init = 0,
3204         .use_id = 1,
3205 };
3206
3207 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3208
3209 static int __init disable_swap_account(char *s)
3210 {
3211         really_do_swap_account = 0;
3212         return 1;
3213 }
3214 __setup("noswapaccount", disable_swap_account);
3215 #endif