memcg: avoid lock in updating file_mapped (Was fix race in file_mapped accouting...
[linux-2.6.git] / mm / memcontrol.c
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * Memory thresholds
10  * Copyright (C) 2009 Nokia Corporation
11  * Author: Kirill A. Shutemov
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  */
23
24 #include <linux/res_counter.h>
25 #include <linux/memcontrol.h>
26 #include <linux/cgroup.h>
27 #include <linux/mm.h>
28 #include <linux/hugetlb.h>
29 #include <linux/pagemap.h>
30 #include <linux/smp.h>
31 #include <linux/page-flags.h>
32 #include <linux/backing-dev.h>
33 #include <linux/bit_spinlock.h>
34 #include <linux/rcupdate.h>
35 #include <linux/limits.h>
36 #include <linux/mutex.h>
37 #include <linux/rbtree.h>
38 #include <linux/slab.h>
39 #include <linux/swap.h>
40 #include <linux/swapops.h>
41 #include <linux/spinlock.h>
42 #include <linux/eventfd.h>
43 #include <linux/sort.h>
44 #include <linux/fs.h>
45 #include <linux/seq_file.h>
46 #include <linux/vmalloc.h>
47 #include <linux/mm_inline.h>
48 #include <linux/page_cgroup.h>
49 #include <linux/cpu.h>
50 #include <linux/oom.h>
51 #include "internal.h"
52
53 #include <asm/uaccess.h>
54
55 #include <trace/events/vmscan.h>
56
57 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
58 #define MEM_CGROUP_RECLAIM_RETRIES      5
59 struct mem_cgroup *root_mem_cgroup __read_mostly; /* compared against in mem_cgroup_is_root() */
60
61 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
63 int do_swap_account __read_mostly;
64 static int really_do_swap_account __initdata = 1; /* remembers the boot option */
65 #else
66 #define do_swap_account         (0) /* constant 0 when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not set */
67 #endif
68
69 /*
70  * Per memcg event counter is incremented at every pagein/pageout. This counter
71  * is used to trigger some periodic events. This is straightforward and better
72  * than using jiffies etc. to handle periodic memcg events.
73  *
74  * These values are shift counts and will be used as !((event) & ((1 <<(thresh)) - 1))
75  */
76 #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
77 #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
78
79 /*
80  * Statistics for memory cgroup. Each value indexes mem_cgroup_stat_cpu.count[].
81  */
82 enum mem_cgroup_stat_index {
83         /*
84          * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
85          */
86         MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
87         MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
88         MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
89         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
90         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
91         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
92         MEM_CGROUP_EVENTS,      /* incremented at every  pagein/pageout */
93         MEM_CGROUP_ON_MOVE,     /* someone is moving account between groups */
94
95         MEM_CGROUP_STAT_NSTATS, /* number of counters; sizes mem_cgroup_stat_cpu.count[] */
96 };
97
98 struct mem_cgroup_stat_cpu {
99         s64 count[MEM_CGROUP_STAT_NSTATS]; /* one counter per enum mem_cgroup_stat_index value */
100 };
101
102 /*
103  * per-zone information in memory controller.
104  */
105 struct mem_cgroup_per_zone {
106         /*
107          * Per-cgroup LRU lists and their page counts, both indexed by enum lru_list. NOTE(review): no lock is declared in this struct; the swapcache helpers in this file take zone->lru_lock around updates - confirm for other callers.
108          */
109         struct list_head        lists[NR_LRU_LISTS];
110         unsigned long           count[NR_LRU_LISTS];
111
112         struct zone_reclaim_stat reclaim_stat;
113         struct rb_node          tree_node;      /* RB tree node */
114         unsigned long long      usage_in_excess;/* Set to the value by which */
115                                                 /* the soft limit is exceeded*/
116         bool                    on_tree;        /* true while tree_node is linked into a soft limit tree */
117         struct mem_cgroup       *mem;           /* Back pointer, we cannot */
118                                                 /* use container_of        */
119 };
120 /* Access the per-LRU page counter count[idx] of a mem_cgroup_per_zone */
121 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
122
123 struct mem_cgroup_per_node {
124         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; /* indexed by zone id, see mem_cgroup_zoneinfo() */
125 };
126
127 struct mem_cgroup_lru_info {
128         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; /* indexed by node id, see mem_cgroup_zoneinfo() */
129 };
130
131 /*
132  * Cgroups above their soft limits are maintained in a RB-Tree, independent of
133  * their hierarchy representation
134  */
135
136 struct mem_cgroup_tree_per_zone {
137         struct rb_root rb_root; /* links mem_cgroup_per_zone.tree_node, ordered by usage_in_excess */
138         spinlock_t lock;        /* taken around rb_root updates in this file */
139 };
140
141 struct mem_cgroup_tree_per_node {
142         struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; /* indexed by zone id */
143 };
144
145 struct mem_cgroup_tree {
146         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; /* indexed by node id */
147 };
148
149 static struct mem_cgroup_tree soft_limit_tree __read_mostly; /* the single soft limit tree, looked up by (nid, zid) */
150
151 struct mem_cgroup_threshold {
152         struct eventfd_ctx *eventfd; /* NOTE(review): presumably signalled when usage crosses threshold - confirm in mem_cgroup_threshold() */
153         u64 threshold;
154 };
155
156 /* For threshold */
157 struct mem_cgroup_threshold_ary {
158         /* An array index points to threshold just below usage. */
159         int current_threshold;
160         /* Size of entries[] */
161         unsigned int size;
162         /* Array of thresholds */
163         struct mem_cgroup_threshold entries[0];
164 };
165
166 struct mem_cgroup_thresholds { /* struct mem_cgroup holds one for memory usage and one for mem+swap usage */
167         /* Primary thresholds array */
168         struct mem_cgroup_threshold_ary *primary;
169         /*
170          * Spare threshold array.
171          * This is needed to make mem_cgroup_unregister_event() "never fail".
172          * It must be able to store at least primary->size - 1 entries.
173          */
174         struct mem_cgroup_threshold_ary *spare;
175 };
176
177 /* for OOM: one registered eventfd (presumably linked on mem_cgroup.oom_notify - confirm) */
178 struct mem_cgroup_eventfd_list {
179         struct list_head list;
180         struct eventfd_ctx *eventfd;
181 };
182
183 static void mem_cgroup_threshold(struct mem_cgroup *mem);
184 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
185
186 /*
187  * The memory controller data structure. The memory controller controls both
188  * page cache and RSS per cgroup. We would eventually like to provide
189  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
190  * to help the administrator determine what knobs to tune.
191  *
192  * TODO: Add a water mark for the memory controller. Reclaim will begin when
193  * we hit the water mark. May be even add a low water mark, such that
194  * no reclaim occurs from a cgroup at its low water mark, this is
195  * a feature that will be implemented much later in the future.
196  */
197 struct mem_cgroup {
198         struct cgroup_subsys_state css;
199         /*
200          * the counter to account for memory usage
201          */
202         struct res_counter res;
203         /*
204          * the counter to account for mem+swap usage.
205          */
206         struct res_counter memsw;
207         /*
208          * Per cgroup active and inactive list, similar to the
209          * per zone LRU lists.
210          */
211         struct mem_cgroup_lru_info info;
212
213         /*
214           protects the reclaim-related members.
215         */
216         spinlock_t reclaim_param_lock;
217
218         /*
219          * While reclaiming in a hierarchy, we cache the last child we
220          * reclaimed from.
221          */
222         int last_scanned_child;
223         /*
224          * Should the accounting and control be hierarchical, per subtree?
225          */
226         bool use_hierarchy;
227         atomic_t        oom_lock;
228         atomic_t        refcnt;
229
230         unsigned int    swappiness;
231         /* OOM-Killer disable */
232         int             oom_kill_disable;
233
234         /* set when res.limit == memsw.limit */
235         bool            memsw_is_minimum;
236
237         /* protect arrays of thresholds */
238         struct mutex thresholds_lock;
239
240         /* thresholds for memory usage. RCU-protected */
241         struct mem_cgroup_thresholds thresholds;
242
243         /* thresholds for mem+swap usage. RCU-protected */
244         struct mem_cgroup_thresholds memsw_thresholds;
245
246         /* For oom notifier event fd */
247         struct list_head oom_notify;
248
249         /*
250          * Should we move charges of a task when a task is moved into this
251          * mem_cgroup ? And what type of charges should we move ?
252          * Bits are numbered by enum move_type.
253          */
254         unsigned long   move_charge_at_immigrate;
255         /*
256          * percpu counters, indexed by enum mem_cgroup_stat_index.
257          */
258         struct mem_cgroup_stat_cpu *stat;
259 };
259
260 /* Definitions for moving charges at task migration. */
261 /*
262  * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
263  * left-shifted bitmap of these types.
264  */
265 enum move_type {
266         MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
267         MOVE_CHARGE_TYPE_FILE,  /* file page(including tmpfs) and swap of it */
268         NR_MOVE_TYPE,
269 };
270
271 /* "mc" and its members are protected by cgroup_mutex */
272 static struct move_charge_struct {
273         spinlock_t        lock; /* for from, to, moving_task */
274         struct mem_cgroup *from;
275         struct mem_cgroup *to;  /* move_anon()/move_file() read to->move_charge_at_immigrate */
276         unsigned long precharge;
277         unsigned long moved_charge;
278         unsigned long moved_swap;
279         struct task_struct *moving_task;        /* a task moving charges */
280         wait_queue_head_t waitq;                /* a waitq for other context */
281 } mc = {
282         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
283         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
284 };
285
286 static bool move_anon(void)
287 {
288         return test_bit(MOVE_CHARGE_TYPE_ANON,
289                                         &mc.to->move_charge_at_immigrate);
290 }
291
292 static bool move_file(void)
293 {
294         return test_bit(MOVE_CHARGE_TYPE_FILE,
295                                         &mc.to->move_charge_at_immigrate);
296 }
297
298 /*
299  * Upper bounds on loop iterations in mem_cgroup_hierarchical_reclaim(), used
300  * for soft limit reclaim so that it cannot loop forever.
301  */
302 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            (100)
303 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
304
305 enum charge_type { /* kinds of charge/uncharge operations */
306         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
307         MEM_CGROUP_CHARGE_TYPE_MAPPED,
308         MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
309         MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
310         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
311         MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
312         NR_CHARGE_TYPE,                 /* number of charge types */
313 };
314
315 /* Bit masks for the PCG_* flag bit numbers; local to this file, for readability only. */
316 #define PCGF_CACHE      (1UL << PCG_CACHE)
317 #define PCGF_USED       (1UL << PCG_USED)
318 #define PCGF_LOCK       (1UL << PCG_LOCK)
319 /* Not used, but added here for completeness */
320 #define PCGF_ACCT       (1UL << PCG_ACCT)
321
322 /* for encoding cft->private value on file: type in bits 16 and up, attribute in the low 16 bits */
323 #define _MEM                    (0)
324 #define _MEMSWAP                (1)
325 #define _OOM_TYPE               (2)
326 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
327 #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
328 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
329 /* Used for OOM notifier */
330 #define OOM_CONTROL             (0)
331
332 /*
333  * Reclaim flags for mem_cgroup_hierarchical_reclaim. Each flag is defined
334  * as a bit number (*_BIT) and the matching mask (1 << bit).
335  */
336 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT   0x0
337 #define MEM_CGROUP_RECLAIM_NOSWAP       (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
338 #define MEM_CGROUP_RECLAIM_SHRINK_BIT   0x1
339 #define MEM_CGROUP_RECLAIM_SHRINK       (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
340 #define MEM_CGROUP_RECLAIM_SOFT_BIT     0x2
341 #define MEM_CGROUP_RECLAIM_SOFT         (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
341
342 static void mem_cgroup_get(struct mem_cgroup *mem);
343 static void mem_cgroup_put(struct mem_cgroup *mem);
344 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
345 static void drain_all_stock_async(void);
346
347 static struct mem_cgroup_per_zone *
348 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
349 {
350         return &mem->info.nodeinfo[nid]->zoneinfo[zid];
351 }
352
353 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
354 {
355         return &mem->css;
356 }
357
358 static struct mem_cgroup_per_zone *
359 page_cgroup_zoneinfo(struct page_cgroup *pc)
360 {
361         struct mem_cgroup *mem = pc->mem_cgroup;
362         int nid = page_cgroup_nid(pc);
363         int zid = page_cgroup_zid(pc);
364
365         if (!mem)
366                 return NULL;
367
368         return mem_cgroup_zoneinfo(mem, nid, zid);
369 }
370
371 static struct mem_cgroup_tree_per_zone *
372 soft_limit_tree_node_zone(int nid, int zid)
373 {
374         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
375 }
376
/*
 * Return the soft limit tree that covers @page, i.e. the tree for the
 * page's node (page_to_nid) and zone (page_zonenum).
 */
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
        int node = page_to_nid(page);
        int zone = page_zonenum(page);

        return soft_limit_tree_node_zone(node, zone);
}
385
386 static void
387 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
388                                 struct mem_cgroup_per_zone *mz,
389                                 struct mem_cgroup_tree_per_zone *mctz,
390                                 unsigned long long new_usage_in_excess)
391 {
392         struct rb_node **p = &mctz->rb_root.rb_node;
393         struct rb_node *parent = NULL;
394         struct mem_cgroup_per_zone *mz_node;
395
396         if (mz->on_tree)
397                 return;
398
399         mz->usage_in_excess = new_usage_in_excess;
400         if (!mz->usage_in_excess)
401                 return;
402         while (*p) {
403                 parent = *p;
404                 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
405                                         tree_node);
406                 if (mz->usage_in_excess < mz_node->usage_in_excess)
407                         p = &(*p)->rb_left;
408                 /*
409                  * We can't avoid mem cgroups that are over their soft
410                  * limit by the same amount
411                  */
412                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
413                         p = &(*p)->rb_right;
414         }
415         rb_link_node(&mz->tree_node, parent, p);
416         rb_insert_color(&mz->tree_node, &mctz->rb_root);
417         mz->on_tree = true;
418 }
419
420 static void
421 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
422                                 struct mem_cgroup_per_zone *mz,
423                                 struct mem_cgroup_tree_per_zone *mctz)
424 {
425         if (!mz->on_tree)
426                 return;
427         rb_erase(&mz->tree_node, &mctz->rb_root);
428         mz->on_tree = false;
429 }
430
431 static void
432 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
433                                 struct mem_cgroup_per_zone *mz,
434                                 struct mem_cgroup_tree_per_zone *mctz)
435 {
436         spin_lock(&mctz->lock);
437         __mem_cgroup_remove_exceeded(mem, mz, mctz);
438         spin_unlock(&mctz->lock);
439 }
440
441
442 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
443 {
444         unsigned long long excess;
445         struct mem_cgroup_per_zone *mz;
446         struct mem_cgroup_tree_per_zone *mctz;
447         int nid = page_to_nid(page);
448         int zid = page_zonenum(page);
449         mctz = soft_limit_tree_from_page(page);
450
451         /*
452          * Necessary to update all ancestors when hierarchy is used.
453          * because their event counter is not touched.
454          */
455         for (; mem; mem = parent_mem_cgroup(mem)) {
456                 mz = mem_cgroup_zoneinfo(mem, nid, zid);
457                 excess = res_counter_soft_limit_excess(&mem->res);
458                 /*
459                  * We have to update the tree if mz is on RB-tree or
460                  * mem is over its softlimit.
461                  */
462                 if (excess || mz->on_tree) {
463                         spin_lock(&mctz->lock);
464                         /* if on-tree, remove it */
465                         if (mz->on_tree)
466                                 __mem_cgroup_remove_exceeded(mem, mz, mctz);
467                         /*
468                          * Insert again. mz->usage_in_excess will be updated.
469                          * If excess is 0, no tree ops.
470                          */
471                         __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
472                         spin_unlock(&mctz->lock);
473                 }
474         }
475 }
476
477 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
478 {
479         int node, zone;
480         struct mem_cgroup_per_zone *mz;
481         struct mem_cgroup_tree_per_zone *mctz;
482
483         for_each_node_state(node, N_POSSIBLE) {
484                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
485                         mz = mem_cgroup_zoneinfo(mem, node, zone);
486                         mctz = soft_limit_tree_node_zone(node, zone);
487                         mem_cgroup_remove_exceeded(mem, mz, mctz);
488                 }
489         }
490 }
491
492 static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
493 {
494         return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
495 }
496
497 static struct mem_cgroup_per_zone *
498 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
499 {
500         struct rb_node *rightmost = NULL;
501         struct mem_cgroup_per_zone *mz;
502
503 retry:
504         mz = NULL;
505         rightmost = rb_last(&mctz->rb_root);
506         if (!rightmost)
507                 goto done;              /* Nothing to reclaim from */
508
509         mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
510         /*
511          * Remove the node now but someone else can add it back,
512          * we will to add it back at the end of reclaim to its correct
513          * position in the tree.
514          */
515         __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
516         if (!res_counter_soft_limit_excess(&mz->mem->res) ||
517                 !css_tryget(&mz->mem->css))
518                 goto retry;
519 done:
520         return mz;
521 }
522
523 static struct mem_cgroup_per_zone *
524 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
525 {
526         struct mem_cgroup_per_zone *mz;
527
528         spin_lock(&mctz->lock);
529         mz = __mem_cgroup_largest_soft_limit_node(mctz);
530         spin_unlock(&mctz->lock);
531         return mz;
532 }
533
534 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
535                 enum mem_cgroup_stat_index idx)
536 {
537         int cpu;
538         s64 val = 0;
539
540         for_each_possible_cpu(cpu)
541                 val += per_cpu(mem->stat->count[idx], cpu);
542         return val;
543 }
544
545 static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
546 {
547         s64 ret;
548
549         ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
550         ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
551         return ret;
552 }
553
554 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
555                                          bool charge)
556 {
557         int val = (charge) ? 1 : -1;
558         this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
559 }
560
561 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
562                                          struct page_cgroup *pc,
563                                          bool charge)
564 {
565         int val = (charge) ? 1 : -1;
566
567         preempt_disable();
568
569         if (PageCgroupCache(pc))
570                 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
571         else
572                 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
573
574         if (charge)
575                 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
576         else
577                 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
578         __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
579
580         preempt_enable();
581 }
582
583 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
584                                         enum lru_list idx)
585 {
586         int nid, zid;
587         struct mem_cgroup_per_zone *mz;
588         u64 total = 0;
589
590         for_each_online_node(nid)
591                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
592                         mz = mem_cgroup_zoneinfo(mem, nid, zid);
593                         total += MEM_CGROUP_ZSTAT(mz, idx);
594                 }
595         return total;
596 }
597
598 static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
599 {
600         s64 val;
601
602         val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
603
604         return !(val & ((1 << event_mask_shift) - 1));
605 }
606
607 /*
608  * Check events in order.
609  *
610  */
611 static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
612 {
613         /* threshold event is triggered in finer grain than soft limit */
614         if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
615                 mem_cgroup_threshold(mem);
616                 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
617                         mem_cgroup_update_tree(mem, page);
618         }
619 }
620
621 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
622 {
623         return container_of(cgroup_subsys_state(cont,
624                                 mem_cgroup_subsys_id), struct mem_cgroup,
625                                 css);
626 }
627
628 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
629 {
630         /*
631          * mm_update_next_owner() may clear mm->owner to NULL
632          * if it races with swapoff, page migration, etc.
633          * So this can be called with p == NULL.
634          */
635         if (unlikely(!p))
636                 return NULL;
637
638         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
639                                 struct mem_cgroup, css);
640 }
641
642 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
643 {
644         struct mem_cgroup *mem = NULL;
645
646         if (!mm)
647                 return NULL;
648         /*
649          * Because we have no locks, mm->owner's may be being moved to other
650          * cgroup. We use css_tryget() here even if this looks
651          * pessimistic (rather than adding locks here).
652          */
653         rcu_read_lock();
654         do {
655                 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
656                 if (unlikely(!mem))
657                         break;
658         } while (!css_tryget(&mem->css));
659         rcu_read_unlock();
660         return mem;
661 }
662
663 /*
664  * Call callback function against all cgroup under hierarchy tree.
665  */
666 static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
667                           int (*func)(struct mem_cgroup *, void *))
668 {
669         int found, ret, nextid;
670         struct cgroup_subsys_state *css;
671         struct mem_cgroup *mem;
672
673         if (!root->use_hierarchy)
674                 return (*func)(root, data);
675
676         nextid = 1;
677         do {
678                 ret = 0;
679                 mem = NULL;
680
681                 rcu_read_lock();
682                 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
683                                    &found);
684                 if (css && css_tryget(css))
685                         mem = container_of(css, struct mem_cgroup, css);
686                 rcu_read_unlock();
687
688                 if (mem) {
689                         ret = (*func)(mem, data);
690                         css_put(&mem->css);
691                 }
692                 nextid = found + 1;
693         } while (!ret && css);
694
695         return ret;
696 }
697
698 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
699 {
700         return (mem == root_mem_cgroup);
701 }
702
703 /*
704  * Following LRU functions are allowed to be used without PCG_LOCK.
705  * Operations are called by the global LRU routines independently from memcg.
706  * What we have to take care of here is the validity of pc->mem_cgroup.
707  *
708  * Changes to pc->mem_cgroup happen when
709  * 1. charge
710  * 2. moving account
711  * In the typical case, "charge" is done before add-to-lru. The exception is
712  * SwapCache. It is added to LRU before charge.
713  * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
714  * When moving account, the page is not on LRU. It's isolated.
715  */
716
717 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
718 {
719         struct page_cgroup *pc;
720         struct mem_cgroup_per_zone *mz;
721
722         if (mem_cgroup_disabled())
723                 return;
724         pc = lookup_page_cgroup(page);
725         /* can happen while we handle swapcache. */
726         if (!TestClearPageCgroupAcctLRU(pc))
727                 return;
728         VM_BUG_ON(!pc->mem_cgroup);
729         /*
730          * We don't check PCG_USED bit. It's cleared when the "page" is finally
731          * removed from global LRU.
732          */
733         mz = page_cgroup_zoneinfo(pc);
734         MEM_CGROUP_ZSTAT(mz, lru) -= 1;
735         if (mem_cgroup_is_root(pc->mem_cgroup))
736                 return;
737         VM_BUG_ON(list_empty(&pc->lru));
738         list_del_init(&pc->lru);
739         return;
740 }
741
742 void mem_cgroup_del_lru(struct page *page)
743 {
744         mem_cgroup_del_lru_list(page, page_lru(page));
745 }
746
747 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
748 {
749         struct mem_cgroup_per_zone *mz;
750         struct page_cgroup *pc;
751
752         if (mem_cgroup_disabled())
753                 return;
754
755         pc = lookup_page_cgroup(page);
756         /*
757          * Used bit is set without atomic ops but after smp_wmb().
758          * For making pc->mem_cgroup visible, insert smp_rmb() here.
759          */
760         smp_rmb();
761         /* unused or root page is not rotated. */
762         if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
763                 return;
764         mz = page_cgroup_zoneinfo(pc);
765         list_move(&pc->lru, &mz->lists[lru]);
766 }
767
768 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
769 {
770         struct page_cgroup *pc;
771         struct mem_cgroup_per_zone *mz;
772
773         if (mem_cgroup_disabled())
774                 return;
775         pc = lookup_page_cgroup(page);
776         VM_BUG_ON(PageCgroupAcctLRU(pc));
777         /*
778          * Used bit is set without atomic ops but after smp_wmb().
779          * For making pc->mem_cgroup visible, insert smp_rmb() here.
780          */
781         smp_rmb();
782         if (!PageCgroupUsed(pc))
783                 return;
784
785         mz = page_cgroup_zoneinfo(pc);
786         MEM_CGROUP_ZSTAT(mz, lru) += 1;
787         SetPageCgroupAcctLRU(pc);
788         if (mem_cgroup_is_root(pc->mem_cgroup))
789                 return;
790         list_add(&pc->lru, &mz->lists[lru]);
791 }
792
793 /*
794  * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
795  * lru because the page may.be reused after it's fully uncharged (because of
796  * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
797  * it again. This function is only used to charge SwapCache. It's done under
798  * lock_page and expected that zone->lru_lock is never held.
799  */
800 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
801 {
802         unsigned long flags;
803         struct zone *zone = page_zone(page);
804         struct page_cgroup *pc = lookup_page_cgroup(page);
805
806         spin_lock_irqsave(&zone->lru_lock, flags);
807         /*
808          * Forget old LRU when this page_cgroup is *not* used. This Used bit
809          * is guarded by lock_page() because the page is SwapCache.
810          */
811         if (!PageCgroupUsed(pc))
812                 mem_cgroup_del_lru_list(page, page_lru(page));
813         spin_unlock_irqrestore(&zone->lru_lock, flags);
814 }
815
816 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
817 {
818         unsigned long flags;
819         struct zone *zone = page_zone(page);
820         struct page_cgroup *pc = lookup_page_cgroup(page);
821
822         spin_lock_irqsave(&zone->lru_lock, flags);
823         /* link when the page is linked to LRU but page_cgroup isn't */
824         if (PageLRU(page) && !PageCgroupAcctLRU(pc))
825                 mem_cgroup_add_lru_list(page, page_lru(page));
826         spin_unlock_irqrestore(&zone->lru_lock, flags);
827 }
828
829
830 void mem_cgroup_move_lists(struct page *page,
831                            enum lru_list from, enum lru_list to)
832 {
833         if (mem_cgroup_disabled())
834                 return;
835         mem_cgroup_del_lru_list(page, from);
836         mem_cgroup_add_lru_list(page, to);
837 }
838
839 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
840 {
841         int ret;
842         struct mem_cgroup *curr = NULL;
843         struct task_struct *p;
844
845         p = find_lock_task_mm(task);
846         if (!p)
847                 return 0;
848         curr = try_get_mem_cgroup_from_mm(p->mm);
849         task_unlock(p);
850         if (!curr)
851                 return 0;
852         /*
853          * We should check use_hierarchy of "mem" not "curr". Because checking
854          * use_hierarchy of "curr" here make this function true if hierarchy is
855          * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
856          * hierarchy(even if use_hierarchy is disabled in "mem").
857          */
858         if (mem->use_hierarchy)
859                 ret = css_is_ancestor(&curr->css, &mem->css);
860         else
861                 ret = (curr == mem);
862         css_put(&curr->css);
863         return ret;
864 }
865
866 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
867 {
868         unsigned long active;
869         unsigned long inactive;
870         unsigned long gb;
871         unsigned long inactive_ratio;
872
873         inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
874         active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
875
876         gb = (inactive + active) >> (30 - PAGE_SHIFT);
877         if (gb)
878                 inactive_ratio = int_sqrt(10 * gb);
879         else
880                 inactive_ratio = 1;
881
882         if (present_pages) {
883                 present_pages[0] = inactive;
884                 present_pages[1] = active;
885         }
886
887         return inactive_ratio;
888 }
889
/*
 * Returns 1 when the inactive anon list of @memcg, scaled by the ratio from
 * calc_inactive_ratio(), is smaller than the active anon list; 0 otherwise.
 */
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	/* counts[0]: inactive pages, counts[1]: active pages */
	unsigned long counts[2];
	unsigned long ratio;

	ratio = calc_inactive_ratio(memcg, counts);

	return counts[0] * ratio < counts[1];
}
907
908 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
909 {
910         unsigned long active;
911         unsigned long inactive;
912
913         inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
914         active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
915
916         return (active > inactive);
917 }
918
919 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
920                                        struct zone *zone,
921                                        enum lru_list lru)
922 {
923         int nid = zone_to_nid(zone);
924         int zid = zone_idx(zone);
925         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
926
927         return MEM_CGROUP_ZSTAT(mz, lru);
928 }
929
930 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
931                                                       struct zone *zone)
932 {
933         int nid = zone_to_nid(zone);
934         int zid = zone_idx(zone);
935         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
936
937         return &mz->reclaim_stat;
938 }
939
940 struct zone_reclaim_stat *
941 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
942 {
943         struct page_cgroup *pc;
944         struct mem_cgroup_per_zone *mz;
945
946         if (mem_cgroup_disabled())
947                 return NULL;
948
949         pc = lookup_page_cgroup(page);
950         /*
951          * Used bit is set without atomic ops but after smp_wmb().
952          * For making pc->mem_cgroup visible, insert smp_rmb() here.
953          */
954         smp_rmb();
955         if (!PageCgroupUsed(pc))
956                 return NULL;
957
958         mz = page_cgroup_zoneinfo(pc);
959         if (!mz)
960                 return NULL;
961
962         return &mz->reclaim_stat;
963 }
964
965 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
966                                         struct list_head *dst,
967                                         unsigned long *scanned, int order,
968                                         int mode, struct zone *z,
969                                         struct mem_cgroup *mem_cont,
970                                         int active, int file)
971 {
972         unsigned long nr_taken = 0;
973         struct page *page;
974         unsigned long scan;
975         LIST_HEAD(pc_list);
976         struct list_head *src;
977         struct page_cgroup *pc, *tmp;
978         int nid = zone_to_nid(z);
979         int zid = zone_idx(z);
980         struct mem_cgroup_per_zone *mz;
981         int lru = LRU_FILE * file + active;
982         int ret;
983
984         BUG_ON(!mem_cont);
985         mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
986         src = &mz->lists[lru];
987
988         scan = 0;
989         list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
990                 if (scan >= nr_to_scan)
991                         break;
992
993                 page = pc->page;
994                 if (unlikely(!PageCgroupUsed(pc)))
995                         continue;
996                 if (unlikely(!PageLRU(page)))
997                         continue;
998
999                 scan++;
1000                 ret = __isolate_lru_page(page, mode, file);
1001                 switch (ret) {
1002                 case 0:
1003                         list_move(&page->lru, dst);
1004                         mem_cgroup_del_lru(page);
1005                         nr_taken++;
1006                         break;
1007                 case -EBUSY:
1008                         /* we don't affect global LRU but rotate in our LRU */
1009                         mem_cgroup_rotate_lru_list(page, page_lru(page));
1010                         break;
1011                 default:
1012                         break;
1013                 }
1014         }
1015
1016         *scanned = scan;
1017
1018         trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1019                                       0, 0, 0, mode);
1020
1021         return nr_taken;
1022 }
1023
/*
 * Map a res_counter pointer back to the struct mem_cgroup that embeds it.
 * @member names the embedding field (used in this file with "res" and
 * "memsw").
 */
#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)
1026
1027 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1028 {
1029         if (do_swap_account) {
1030                 if (res_counter_check_under_limit(&mem->res) &&
1031                         res_counter_check_under_limit(&mem->memsw))
1032                         return true;
1033         } else
1034                 if (res_counter_check_under_limit(&mem->res))
1035                         return true;
1036         return false;
1037 }
1038
1039 static unsigned int get_swappiness(struct mem_cgroup *memcg)
1040 {
1041         struct cgroup *cgrp = memcg->css.cgroup;
1042         unsigned int swappiness;
1043
1044         /* root ? */
1045         if (cgrp->parent == NULL)
1046                 return vm_swappiness;
1047
1048         spin_lock(&memcg->reclaim_param_lock);
1049         swappiness = memcg->swappiness;
1050         spin_unlock(&memcg->reclaim_param_lock);
1051
1052         return swappiness;
1053 }
1054
1055 static void mem_cgroup_start_move(struct mem_cgroup *mem)
1056 {
1057         int cpu;
1058         /* Because this is for moving account, reuse mc.lock */
1059         spin_lock(&mc.lock);
1060         for_each_possible_cpu(cpu)
1061                 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1062         spin_unlock(&mc.lock);
1063
1064         synchronize_rcu();
1065 }
1066
1067 static void mem_cgroup_end_move(struct mem_cgroup *mem)
1068 {
1069         int cpu;
1070
1071         if (!mem)
1072                 return;
1073         spin_lock(&mc.lock);
1074         for_each_possible_cpu(cpu)
1075                 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1076         spin_unlock(&mc.lock);
1077 }
1078 /*
1079  * 2 routines for checking "mem" is under move_account() or not.
1080  *
1081  * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
1082  *                        for avoiding race in accounting. If true,
1083  *                        pc->mem_cgroup may be overwritten.
1084  *
1085  * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
1086  *                        under hierarchy of moving cgroups. This is for
1087  *                        waiting at high-memory pressure caused by "move".
1088  */
1089
1090 static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1091 {
1092         VM_BUG_ON(!rcu_read_lock_held());
1093         return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1094 }
1095
1096 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1097 {
1098         struct mem_cgroup *from;
1099         struct mem_cgroup *to;
1100         bool ret = false;
1101         /*
1102          * Unlike task_move routines, we access mc.to, mc.from not under
1103          * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1104          */
1105         spin_lock(&mc.lock);
1106         from = mc.from;
1107         to = mc.to;
1108         if (!from)
1109                 goto unlock;
1110         if (from == mem || to == mem
1111             || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1112             || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1113                 ret = true;
1114 unlock:
1115         spin_unlock(&mc.lock);
1116         return ret;
1117 }
1118
1119 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1120 {
1121         if (mc.moving_task && current != mc.moving_task) {
1122                 if (mem_cgroup_under_move(mem)) {
1123                         DEFINE_WAIT(wait);
1124                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1125                         /* moving charge context might have finished. */
1126                         if (mc.moving_task)
1127                                 schedule();
1128                         finish_wait(&mc.waitq, &wait);
1129                         return true;
1130                 }
1131         }
1132         return false;
1133 }
1134
/*
 * Tree-walk callback: increment the int counter that @data points to.
 * @mem is not used. Always returns 0.
 */
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
	int *counter = data;

	*counter += 1;
	return 0;
}
1141
1142 /**
1143  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1144  * @memcg: The memory cgroup that went over limit
1145  * @p: Task that is going to be killed
1146  *
1147  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1148  * enabled
1149  */
1150 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1151 {
1152         struct cgroup *task_cgrp;
1153         struct cgroup *mem_cgrp;
1154         /*
1155          * Need a buffer in BSS, can't rely on allocations. The code relies
1156          * on the assumption that OOM is serialized for memory controller.
1157          * If this assumption is broken, revisit this code.
1158          */
1159         static char memcg_name[PATH_MAX];
1160         int ret;
1161
1162         if (!memcg || !p)
1163                 return;
1164
1165
1166         rcu_read_lock();
1167
1168         mem_cgrp = memcg->css.cgroup;
1169         task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1170
1171         ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1172         if (ret < 0) {
1173                 /*
1174                  * Unfortunately, we are unable to convert to a useful name
1175                  * But we'll still print out the usage information
1176                  */
1177                 rcu_read_unlock();
1178                 goto done;
1179         }
1180         rcu_read_unlock();
1181
1182         printk(KERN_INFO "Task in %s killed", memcg_name);
1183
1184         rcu_read_lock();
1185         ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1186         if (ret < 0) {
1187                 rcu_read_unlock();
1188                 goto done;
1189         }
1190         rcu_read_unlock();
1191
1192         /*
1193          * Continues from above, so we don't need an KERN_ level
1194          */
1195         printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1196 done:
1197
1198         printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1199                 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1200                 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1201                 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1202         printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1203                 "failcnt %llu\n",
1204                 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1205                 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1206                 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1207 }
1208
1209 /*
1210  * This function returns the number of memcg under hierarchy tree. Returns
1211  * 1(self count) if no children.
1212  */
1213 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1214 {
1215         int num = 0;
1216         mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1217         return num;
1218 }
1219
1220 /*
1221  * Return the memory (and swap, if configured) limit for a memcg.
1222  */
1223 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1224 {
1225         u64 limit;
1226         u64 memsw;
1227
1228         limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
1229                         total_swap_pages;
1230         memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1231         /*
1232          * If memsw is finite and limits the amount of swap space available
1233          * to this memcg, return that limit.
1234          */
1235         return min(limit, memsw);
1236 }
1237
1238 /*
1239  * Visit the first child (need not be the first child as per the ordering
1240  * of the cgroup list, since we track last_scanned_child) of @mem and use
1241  * that to reclaim free pages from.
1242  */
1243 static struct mem_cgroup *
1244 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1245 {
1246         struct mem_cgroup *ret = NULL;
1247         struct cgroup_subsys_state *css;
1248         int nextid, found;
1249
1250         if (!root_mem->use_hierarchy) {
1251                 css_get(&root_mem->css);
1252                 ret = root_mem;
1253         }
1254
1255         while (!ret) {
1256                 rcu_read_lock();
1257                 nextid = root_mem->last_scanned_child + 1;
1258                 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1259                                    &found);
1260                 if (css && css_tryget(css))
1261                         ret = container_of(css, struct mem_cgroup, css);
1262
1263                 rcu_read_unlock();
1264                 /* Updates scanning parameter */
1265                 spin_lock(&root_mem->reclaim_param_lock);
1266                 if (!css) {
1267                         /* this means start scan from ID:1 */
1268                         root_mem->last_scanned_child = 0;
1269                 } else
1270                         root_mem->last_scanned_child = found;
1271                 spin_unlock(&root_mem->reclaim_param_lock);
1272         }
1273
1274         return ret;
1275 }
1276
/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * @root_mem:        the original ancestor that we are reclaiming from
 * @zone:            zone handed to mem_cgroup_shrink_node_zone(); only used
 *                   for soft-limit reclaim
 * @gfp_mask:        allocation mask handed to the reclaim routines
 * @reclaim_options: MEM_CGROUP_RECLAIM_{NOSWAP,SHRINK,SOFT} flags
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If MEM_CGROUP_RECLAIM_SHRINK is set, this returns immediately after the
 * first reclaim attempt to avoid freeing too much.
 *
 * Return value: with SHRINK, the result of the single reclaim call. With
 * SOFT, the accumulated total. Otherwise 1 + total as soon as root_mem is
 * back under its limit, or the accumulated total when the walk gives up.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		/* The victim is returned with a css reference held. */
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			/* One full pass over the hierarchy has completed. */
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is neither so large that we
				 * reclaim too much, nor so small that we keep
				 * coming back to reclaim from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(victim)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * When shrinking usage, we can't tell here whether we should
		 * stop or reclaim more; that depends on the caller.
		 * last_scanned_child is enough to keep fairness under the tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}
1364
1365 static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1366 {
1367         int *val = (int *)data;
1368         int x;
1369         /*
1370          * Logically, we can stop scanning immediately when we find
1371          * a memcg is already locked. But condidering unlock ops and
1372          * creation/removal of memcg, scan-all is simple operation.
1373          */
1374         x = atomic_inc_return(&mem->oom_lock);
1375         *val = max(x, *val);
1376         return 0;
1377 }
1378 /*
1379  * Check OOM-Killer is already running under our hierarchy.
1380  * If someone is running, return false.
1381  */
1382 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1383 {
1384         int lock_count = 0;
1385
1386         mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1387
1388         if (lock_count == 1)
1389                 return true;
1390         return false;
1391 }
1392
1393 static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1394 {
1395         /*
1396          * When a new child is created while the hierarchy is under oom,
1397          * mem_cgroup_oom_lock() may not be called. We have to use
1398          * atomic_add_unless() here.
1399          */
1400         atomic_add_unless(&mem->oom_lock, -1, 0);
1401         return 0;
1402 }
1403
1404 static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1405 {
1406         mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1407 }
1408
/* Held by mem_cgroup_handle_oom() around OOM lock/unlock of a hierarchy. */
static DEFINE_MUTEX(memcg_oom_mutex);
/* Tasks waiting in mem_cgroup_handle_oom() are queued here. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/*
 * Wait-queue entry paired with the memcg its owner is waiting on, so that
 * memcg_oom_wake_function() can decide whether a wakeup applies to it.
 */
struct oom_wait_info {
	struct mem_cgroup *mem;		/* memcg the waiter hit OOM in */
	wait_queue_t	wait;		/* entry queued on memcg_oom_waitq */
};
1416
1417 static int memcg_oom_wake_function(wait_queue_t *wait,
1418         unsigned mode, int sync, void *arg)
1419 {
1420         struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1421         struct oom_wait_info *oom_wait_info;
1422
1423         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1424
1425         if (oom_wait_info->mem == wake_mem)
1426                 goto wakeup;
1427         /* if no hierarchy, no match */
1428         if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1429                 return 0;
1430         /*
1431          * Both of oom_wait_info->mem and wake_mem are stable under us.
1432          * Then we can use css_is_ancestor without taking care of RCU.
1433          */
1434         if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1435             !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1436                 return 0;
1437
1438 wakeup:
1439         return autoremove_wake_function(wait, mode, sync, arg);
1440 }
1441
1442 static void memcg_wakeup_oom(struct mem_cgroup *mem)
1443 {
1444         /* for filtering, pass "mem" as argument. */
1445         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1446 }
1447
1448 static void memcg_oom_recover(struct mem_cgroup *mem)
1449 {
1450         if (mem && atomic_read(&mem->oom_lock))
1451                 memcg_wakeup_oom(mem);
1452 }
1453
1454 /*
1455  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1456  */
1457 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1458 {
1459         struct oom_wait_info owait;
1460         bool locked, need_to_kill;
1461
1462         owait.mem = mem;
1463         owait.wait.flags = 0;
1464         owait.wait.func = memcg_oom_wake_function;
1465         owait.wait.private = current;
1466         INIT_LIST_HEAD(&owait.wait.task_list);
1467         need_to_kill = true;
1468         /* At first, try to OOM lock hierarchy under mem.*/
1469         mutex_lock(&memcg_oom_mutex);
1470         locked = mem_cgroup_oom_lock(mem);
1471         /*
1472          * Even if signal_pending(), we can't quit charge() loop without
1473          * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1474          * under OOM is always welcomed, use TASK_KILLABLE here.
1475          */
1476         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1477         if (!locked || mem->oom_kill_disable)
1478                 need_to_kill = false;
1479         if (locked)
1480                 mem_cgroup_oom_notify(mem);
1481         mutex_unlock(&memcg_oom_mutex);
1482
1483         if (need_to_kill) {
1484                 finish_wait(&memcg_oom_waitq, &owait.wait);
1485                 mem_cgroup_out_of_memory(mem, mask);
1486         } else {
1487                 schedule();
1488                 finish_wait(&memcg_oom_waitq, &owait.wait);
1489         }
1490         mutex_lock(&memcg_oom_mutex);
1491         mem_cgroup_oom_unlock(mem);
1492         memcg_wakeup_oom(mem);
1493         mutex_unlock(&memcg_oom_mutex);
1494
1495         if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1496                 return false;
1497         /* Give chance to dying process */
1498         schedule_timeout(1);
1499         return true;
1500 }
1501
1502 /*
1503  * Currently used to update mapped file statistics, but the routine can be
1504  * generalized to update other statistics as well.
1505  *
1506  * Notes: Race condition
1507  *
1508  * We usually use page_cgroup_lock() for accessing page_cgroup member but
1509  * it tends to be costly. But considering some conditions, we doesn't need
1510  * to do so _always_.
1511  *
1512  * Considering "charge", lock_page_cgroup() is not required because all
1513  * file-stat operations happen after a page is attached to radix-tree. There
1514  * are no race with "charge".
1515  *
1516  * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1517  * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
1518  * if there are race with "uncharge". Statistics itself is properly handled
1519  * by flags.
1520  *
1521  * Considering "move", this is an only case we see a race. To make the race
1522  * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
1523  * possibility of race condition. If there is, we take a lock.
1524  */
1525 void mem_cgroup_update_file_mapped(struct page *page, int val)
1526 {
1527         struct mem_cgroup *mem;
1528         struct page_cgroup *pc = lookup_page_cgroup(page);
1529         bool need_unlock = false;
1530
1531         if (unlikely(!pc))
1532                 return;
1533
1534         rcu_read_lock();
1535         mem = pc->mem_cgroup;
1536         if (unlikely(!mem || !PageCgroupUsed(pc)))
1537                 goto out;
1538         /* pc->mem_cgroup is unstable ? */
1539         if (unlikely(mem_cgroup_stealed(mem))) {
1540                 /* take a lock against to access pc->mem_cgroup */
1541                 lock_page_cgroup(pc);
1542                 need_unlock = true;
1543                 mem = pc->mem_cgroup;
1544                 if (!mem || !PageCgroupUsed(pc))
1545                         goto out;
1546         }
1547         if (val > 0) {
1548                 this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1549                 SetPageCgroupFileMapped(pc);
1550         } else {
1551                 this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1552                 if (!page_mapped(page)) /* for race between dec->inc counter */
1553                         ClearPageCgroupFileMapped(pc);
1554         }
1555
1556 out:
1557         if (unlikely(need_unlock))
1558                 unlock_page_cgroup(pc);
1559         rcu_read_unlock();
1560         return;
1561 }
1562
/*
 * Size of the first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: it may be necessary to use bigger numbers on big iron.
 */
#define CHARGE_SIZE	(32 * PAGE_SIZE)
/* Per-cpu cache of charge already taken from a memcg's res_counter. */
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	int charge;		   /* stocked amount, consumed PAGE_SIZE at a time */
	struct work_struct work;   /* queued by drain_all_stock_async() */
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* Non-zero while a system-wide drain is being scheduled. */
static atomic_t memcg_drain_count;
1575
1576 /*
1577  * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
1578  * from local stock and true is returned. If the stock is 0 or charges from a
1579  * cgroup which is not current target, returns false. This stock will be
1580  * refilled.
1581  */
1582 static bool consume_stock(struct mem_cgroup *mem)
1583 {
1584         struct memcg_stock_pcp *stock;
1585         bool ret = true;
1586
1587         stock = &get_cpu_var(memcg_stock);
1588         if (mem == stock->cached && stock->charge)
1589                 stock->charge -= PAGE_SIZE;
1590         else /* need to call res_counter_charge */
1591                 ret = false;
1592         put_cpu_var(memcg_stock);
1593         return ret;
1594 }
1595
1596 /*
1597  * Returns stocks cached in percpu to res_counter and reset cached information.
1598  */
1599 static void drain_stock(struct memcg_stock_pcp *stock)
1600 {
1601         struct mem_cgroup *old = stock->cached;
1602
1603         if (stock->charge) {
1604                 res_counter_uncharge(&old->res, stock->charge);
1605                 if (do_swap_account)
1606                         res_counter_uncharge(&old->memsw, stock->charge);
1607         }
1608         stock->cached = NULL;
1609         stock->charge = 0;
1610 }
1611
1612 /*
1613  * This must be called under preempt disabled or must be called by
1614  * a thread which is pinned to local cpu.
1615  */
1616 static void drain_local_stock(struct work_struct *dummy)
1617 {
1618         struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1619         drain_stock(stock);
1620 }
1621
1622 /*
1623  * Cache charges(val) which is from res_counter, to local per_cpu area.
1624  * This will be consumed by consume_stock() function, later.
1625  */
1626 static void refill_stock(struct mem_cgroup *mem, int val)
1627 {
1628         struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1629
1630         if (stock->cached != mem) { /* reset if necessary */
1631                 drain_stock(stock);
1632                 stock->cached = mem;
1633         }
1634         stock->charge += val;
1635         put_cpu_var(memcg_stock);
1636 }
1637
1638 /*
1639  * Tries to drain stocked charges in other cpus. This function is asynchronous
1640  * and just put a work per cpu for draining localy on each cpu. Caller can
1641  * expects some charges will be back to res_counter later but cannot wait for
1642  * it.
1643  */
1644 static void drain_all_stock_async(void)
1645 {
1646         int cpu;
1647         /* This function is for scheduling "drain" in asynchronous way.
1648          * The result of "drain" is not directly handled by callers. Then,
1649          * if someone is calling drain, we don't have to call drain more.
1650          * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
1651          * there is a race. We just do loose check here.
1652          */
1653         if (atomic_read(&memcg_drain_count))
1654                 return;
1655         /* Notify other cpus that system-wide "drain" is running */
1656         atomic_inc(&memcg_drain_count);
1657         get_online_cpus();
1658         for_each_online_cpu(cpu) {
1659                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1660                 schedule_work_on(cpu, &stock->work);
1661         }
1662         put_online_cpus();
1663         atomic_dec(&memcg_drain_count);
1664         /* We don't wait for flush_work */
1665 }
1666
1667 /* This is a synchronous drain interface. */
1668 static void drain_all_stock_sync(void)
1669 {
1670         /* called when force_empty is called */
1671         atomic_inc(&memcg_drain_count);
1672         schedule_on_each_cpu(drain_local_stock);
1673         atomic_dec(&memcg_drain_count);
1674 }
1675
1676 static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1677                                         unsigned long action,
1678                                         void *hcpu)
1679 {
1680         int cpu = (unsigned long)hcpu;
1681         struct memcg_stock_pcp *stock;
1682
1683         if (action != CPU_DEAD)
1684                 return NOTIFY_OK;
1685         stock = &per_cpu(memcg_stock, cpu);
1686         drain_stock(stock);
1687         return NOTIFY_OK;
1688 }
1689
1690
/*
 * Result codes of __mem_cgroup_do_charge().
 * See __mem_cgroup_try_charge() for details.
 */
enum {
	CHARGE_OK,		/* success */
	CHARGE_RETRY,		/* need to retry but retry is not bad */
	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough resource */
	CHARGE_OOM_DIE,		/* the current task was killed because of OOM */
};
1699
1700 static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1701                                 int csize, bool oom_check)
1702 {
1703         struct mem_cgroup *mem_over_limit;
1704         struct res_counter *fail_res;
1705         unsigned long flags = 0;
1706         int ret;
1707
1708         ret = res_counter_charge(&mem->res, csize, &fail_res);
1709
1710         if (likely(!ret)) {
1711                 if (!do_swap_account)
1712                         return CHARGE_OK;
1713                 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1714                 if (likely(!ret))
1715                         return CHARGE_OK;
1716
1717                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1718                 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1719         } else
1720                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1721
1722         if (csize > PAGE_SIZE) /* change csize and retry */
1723                 return CHARGE_RETRY;
1724
1725         if (!(gfp_mask & __GFP_WAIT))
1726                 return CHARGE_WOULDBLOCK;
1727
1728         ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1729                                         gfp_mask, flags);
1730         /*
1731          * try_to_free_mem_cgroup_pages() might not give us a full
1732          * picture of reclaim. Some pages are reclaimed and might be
1733          * moved to swap cache or just unmapped from the cgroup.
1734          * Check the limit again to see if the reclaim reduced the
1735          * current usage of the cgroup before giving up
1736          */
1737         if (ret || mem_cgroup_check_under_limit(mem_over_limit))
1738                 return CHARGE_RETRY;
1739
1740         /*
1741          * At task move, charge accounts can be doubly counted. So, it's
1742          * better to wait until the end of task_move if something is going on.
1743          */
1744         if (mem_cgroup_wait_acct_move(mem_over_limit))
1745                 return CHARGE_RETRY;
1746
1747         /* If we don't need to call oom-killer at el, return immediately */
1748         if (!oom_check)
1749                 return CHARGE_NOMEM;
1750         /* check OOM */
1751         if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
1752                 return CHARGE_OOM_DIE;
1753
1754         return CHARGE_RETRY;
1755 }
1756
1757 /*
1758  * Unlike exported interface, "oom" parameter is added. if oom==true,
1759  * oom-killer can be invoked.
1760  */
1761 static int __mem_cgroup_try_charge(struct mm_struct *mm,
1762                 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1763 {
1764         int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1765         struct mem_cgroup *mem = NULL;
1766         int ret;
1767         int csize = CHARGE_SIZE;
1768
1769         /*
1770          * Unlike gloval-vm's OOM-kill, we're not in memory shortage
1771          * in system level. So, allow to go ahead dying process in addition to
1772          * MEMDIE process.
1773          */
1774         if (unlikely(test_thread_flag(TIF_MEMDIE)
1775                      || fatal_signal_pending(current)))
1776                 goto bypass;
1777
1778         /*
1779          * We always charge the cgroup the mm_struct belongs to.
1780          * The mm_struct's mem_cgroup changes on task migration if the
1781          * thread group leader migrates. It's possible that mm is not
1782          * set, if so charge the init_mm (happens for pagecache usage).
1783          */
1784         if (!*memcg && !mm)
1785                 goto bypass;
1786 again:
1787         if (*memcg) { /* css should be a valid one */
1788                 mem = *memcg;
1789                 VM_BUG_ON(css_is_removed(&mem->css));
1790                 if (mem_cgroup_is_root(mem))
1791                         goto done;
1792                 if (consume_stock(mem))
1793                         goto done;
1794                 css_get(&mem->css);
1795         } else {
1796                 struct task_struct *p;
1797
1798                 rcu_read_lock();
1799                 p = rcu_dereference(mm->owner);
1800                 VM_BUG_ON(!p);
1801                 /*
1802                  * because we don't have task_lock(), "p" can exit while
1803                  * we're here. In that case, "mem" can point to root
1804                  * cgroup but never be NULL. (and task_struct itself is freed
1805                  * by RCU, cgroup itself is RCU safe.) Then, we have small
1806                  * risk here to get wrong cgroup. But such kind of mis-account
1807                  * by race always happens because we don't have cgroup_mutex().
1808                  * It's overkill and we allow that small race, here.
1809                  */
1810                 mem = mem_cgroup_from_task(p);
1811                 VM_BUG_ON(!mem);
1812                 if (mem_cgroup_is_root(mem)) {
1813                         rcu_read_unlock();
1814                         goto done;
1815                 }
1816                 if (consume_stock(mem)) {
1817                         /*
1818                          * It seems dagerous to access memcg without css_get().
1819                          * But considering how consume_stok works, it's not
1820                          * necessary. If consume_stock success, some charges
1821                          * from this memcg are cached on this cpu. So, we
1822                          * don't need to call css_get()/css_tryget() before
1823                          * calling consume_stock().
1824                          */
1825                         rcu_read_unlock();
1826                         goto done;
1827                 }
1828                 /* after here, we may be blocked. we need to get refcnt */
1829                 if (!css_tryget(&mem->css)) {
1830                         rcu_read_unlock();
1831                         goto again;
1832                 }
1833                 rcu_read_unlock();
1834         }
1835
1836         do {
1837                 bool oom_check;
1838
1839                 /* If killed, bypass charge */
1840                 if (fatal_signal_pending(current)) {
1841                         css_put(&mem->css);
1842                         goto bypass;
1843                 }
1844
1845                 oom_check = false;
1846                 if (oom && !nr_oom_retries) {
1847                         oom_check = true;
1848                         nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1849                 }
1850
1851                 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
1852
1853                 switch (ret) {
1854                 case CHARGE_OK:
1855                         break;
1856                 case CHARGE_RETRY: /* not in OOM situation but retry */
1857                         csize = PAGE_SIZE;
1858                         css_put(&mem->css);
1859                         mem = NULL;
1860                         goto again;
1861                 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
1862                         css_put(&mem->css);
1863                         goto nomem;
1864                 case CHARGE_NOMEM: /* OOM routine works */
1865                         if (!oom) {
1866                                 css_put(&mem->css);
1867                                 goto nomem;
1868                         }
1869                         /* If oom, we never return -ENOMEM */
1870                         nr_oom_retries--;
1871                         break;
1872                 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
1873                         css_put(&mem->css);
1874                         goto bypass;
1875                 }
1876         } while (ret != CHARGE_OK);
1877
1878         if (csize > PAGE_SIZE)
1879                 refill_stock(mem, csize - PAGE_SIZE);
1880         css_put(&mem->css);
1881 done:
1882         *memcg = mem;
1883         return 0;
1884 nomem:
1885         *memcg = NULL;
1886         return -ENOMEM;
1887 bypass:
1888         *memcg = NULL;
1889         return 0;
1890 }
1891
1892 /*
1893  * Somemtimes we have to undo a charge we got by try_charge().
1894  * This function is for that and do uncharge, put css's refcnt.
1895  * gotten by try_charge().
1896  */
1897 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1898                                                         unsigned long count)
1899 {
1900         if (!mem_cgroup_is_root(mem)) {
1901                 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1902                 if (do_swap_account)
1903                         res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1904         }
1905 }
1906
/* Undo a single page's worth of charge against @mem. */
static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
        __mem_cgroup_cancel_charge(mem, 1);
}
1911
1912 /*
1913  * A helper function to get mem_cgroup from ID. must be called under
1914  * rcu_read_lock(). The caller must check css_is_removed() or some if
1915  * it's concern. (dropping refcnt from swap can be called against removed
1916  * memcg.)
1917  */
1918 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1919 {
1920         struct cgroup_subsys_state *css;
1921
1922         /* ID 0 is unused ID */
1923         if (!id)
1924                 return NULL;
1925         css = css_lookup(&mem_cgroup_subsys, id);
1926         if (!css)
1927                 return NULL;
1928         return container_of(css, struct mem_cgroup, css);
1929 }
1930
1931 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1932 {
1933         struct mem_cgroup *mem = NULL;
1934         struct page_cgroup *pc;
1935         unsigned short id;
1936         swp_entry_t ent;
1937
1938         VM_BUG_ON(!PageLocked(page));
1939
1940         pc = lookup_page_cgroup(page);
1941         lock_page_cgroup(pc);
1942         if (PageCgroupUsed(pc)) {
1943                 mem = pc->mem_cgroup;
1944                 if (mem && !css_tryget(&mem->css))
1945                         mem = NULL;
1946         } else if (PageSwapCache(page)) {
1947                 ent.val = page_private(page);
1948                 id = lookup_swap_cgroup(ent);
1949                 rcu_read_lock();
1950                 mem = mem_cgroup_lookup(id);
1951                 if (mem && !css_tryget(&mem->css))
1952                         mem = NULL;
1953                 rcu_read_unlock();
1954         }
1955         unlock_page_cgroup(pc);
1956         return mem;
1957 }
1958
1959 /*
1960  * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
1961  * USED state. If already USED, uncharge and return.
1962  */
1963
1964 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1965                                      struct page_cgroup *pc,
1966                                      enum charge_type ctype)
1967 {
1968         /* try_charge() can return NULL to *memcg, taking care of it. */
1969         if (!mem)
1970                 return;
1971
1972         lock_page_cgroup(pc);
1973         if (unlikely(PageCgroupUsed(pc))) {
1974                 unlock_page_cgroup(pc);
1975                 mem_cgroup_cancel_charge(mem);
1976                 return;
1977         }
1978
1979         pc->mem_cgroup = mem;
1980         /*
1981          * We access a page_cgroup asynchronously without lock_page_cgroup().
1982          * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1983          * is accessed after testing USED bit. To make pc->mem_cgroup visible
1984          * before USED bit, we need memory barrier here.
1985          * See mem_cgroup_add_lru_list(), etc.
1986          */
1987         smp_wmb();
1988         switch (ctype) {
1989         case MEM_CGROUP_CHARGE_TYPE_CACHE:
1990         case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1991                 SetPageCgroupCache(pc);
1992                 SetPageCgroupUsed(pc);
1993                 break;
1994         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1995                 ClearPageCgroupCache(pc);
1996                 SetPageCgroupUsed(pc);
1997                 break;
1998         default:
1999                 break;
2000         }
2001
2002         mem_cgroup_charge_statistics(mem, pc, true);
2003
2004         unlock_page_cgroup(pc);
2005         /*
2006          * "charge_statistics" updated event counter. Then, check it.
2007          * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2008          * if they exceeds softlimit.
2009          */
2010         memcg_check_events(mem, pc->page);
2011 }
2012
/**
 * __mem_cgroup_move_account - move account of the page
 * @pc: page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 * @uncharge: whether we should uncharge @from.
 *
 * The caller must confirm the following:
 * - the page is not on the LRU (isolate_page() is useful.)
 * - the pc is locked, used, and ->mem_cgroup points to @from.
 *
 * This function does neither "charge" nor css_get on the new cgroup. That
 * should be done by the caller (__mem_cgroup_try_charge() would be useful).
 * If @uncharge is true, this function does "uncharge" from the old cgroup;
 * if @uncharge is false it does not, so the caller should do "uncharge".
 */

static void __mem_cgroup_move_account(struct page_cgroup *pc,
        struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
        VM_BUG_ON(from == to);
        VM_BUG_ON(PageLRU(pc->page));
        VM_BUG_ON(!PageCgroupLocked(pc));
        VM_BUG_ON(!PageCgroupUsed(pc));
        VM_BUG_ON(pc->mem_cgroup != from);

        if (PageCgroupFileMapped(pc)) {
                /* Update mapped_file data for mem_cgroup */
                preempt_disable();
                __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                preempt_enable();
        }
        /* take the page out of @from's statistics while pc still points there */
        mem_cgroup_charge_statistics(from, pc, false);
        if (uncharge)
                /* This is not "cancel", but cancel_charge does all we need. */
                mem_cgroup_cancel_charge(from);

        /* caller should have done css_get */
        pc->mem_cgroup = to;
        mem_cgroup_charge_statistics(to, pc, true);
        /*
         * We charge against "to", which may not have any tasks. Then, "to"
         * can be under rmdir(). But in the current implementation, the
         * callers of this function are just force_empty() and move charge,
         * so it's guaranteed that "to" is never removed. So, we don't check
         * rmdir status here.
         */
}
2062
2063 /*
2064  * check whether the @pc is valid for moving account and call
2065  * __mem_cgroup_move_account()
2066  */
2067 static int mem_cgroup_move_account(struct page_cgroup *pc,
2068                 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
2069 {
2070         int ret = -EINVAL;
2071         lock_page_cgroup(pc);
2072         if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2073                 __mem_cgroup_move_account(pc, from, to, uncharge);
2074                 ret = 0;
2075         }
2076         unlock_page_cgroup(pc);
2077         /*
2078          * check events
2079          */
2080         memcg_check_events(to, pc->page);
2081         memcg_check_events(from, pc->page);
2082         return ret;
2083 }
2084
2085 /*
2086  * move charges to its parent.
2087  */
2088
2089 static int mem_cgroup_move_parent(struct page_cgroup *pc,
2090                                   struct mem_cgroup *child,
2091                                   gfp_t gfp_mask)
2092 {
2093         struct page *page = pc->page;
2094         struct cgroup *cg = child->css.cgroup;
2095         struct cgroup *pcg = cg->parent;
2096         struct mem_cgroup *parent;
2097         int ret;
2098
2099         /* Is ROOT ? */
2100         if (!pcg)
2101                 return -EINVAL;
2102
2103         ret = -EBUSY;
2104         if (!get_page_unless_zero(page))
2105                 goto out;
2106         if (isolate_lru_page(page))
2107                 goto put;
2108
2109         parent = mem_cgroup_from_cont(pcg);
2110         ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
2111         if (ret || !parent)
2112                 goto put_back;
2113
2114         ret = mem_cgroup_move_account(pc, child, parent, true);
2115         if (ret)
2116                 mem_cgroup_cancel_charge(parent);
2117 put_back:
2118         putback_lru_page(page);
2119 put:
2120         put_page(page);
2121 out:
2122         return ret;
2123 }
2124
2125 /*
2126  * Charge the memory controller for page usage.
2127  * Return
2128  * 0 if the charge was successful
2129  * < 0 if the cgroup is over its limit
2130  */
2131 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2132                                 gfp_t gfp_mask, enum charge_type ctype)
2133 {
2134         struct mem_cgroup *mem = NULL;
2135         struct page_cgroup *pc;
2136         int ret;
2137
2138         pc = lookup_page_cgroup(page);
2139         /* can happen at boot */
2140         if (unlikely(!pc))
2141                 return 0;
2142         prefetchw(pc);
2143
2144         ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
2145         if (ret || !mem)
2146                 return ret;
2147
2148         __mem_cgroup_commit_charge(mem, pc, ctype);
2149         return 0;
2150 }
2151
2152 int mem_cgroup_newpage_charge(struct page *page,
2153                               struct mm_struct *mm, gfp_t gfp_mask)
2154 {
2155         if (mem_cgroup_disabled())
2156                 return 0;
2157         if (PageCompound(page))
2158                 return 0;
2159         /*
2160          * If already mapped, we don't have to account.
2161          * If page cache, page->mapping has address_space.
2162          * But page->mapping may have out-of-use anon_vma pointer,
2163          * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
2164          * is NULL.
2165          */
2166         if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2167                 return 0;
2168         if (unlikely(!mm))
2169                 mm = &init_mm;
2170         return mem_cgroup_charge_common(page, mm, gfp_mask,
2171                                 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2172 }
2173
2174 static void
2175 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2176                                         enum charge_type ctype);
2177
2178 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2179                                 gfp_t gfp_mask)
2180 {
2181         int ret;
2182
2183         if (mem_cgroup_disabled())
2184                 return 0;
2185         if (PageCompound(page))
2186                 return 0;
2187         /*
2188          * Corner case handling. This is called from add_to_page_cache()
2189          * in usual. But some FS (shmem) precharges this page before calling it
2190          * and call add_to_page_cache() with GFP_NOWAIT.
2191          *
2192          * For GFP_NOWAIT case, the page may be pre-charged before calling
2193          * add_to_page_cache(). (See shmem.c) check it here and avoid to call
2194          * charge twice. (It works but has to pay a bit larger cost.)
2195          * And when the page is SwapCache, it should take swap information
2196          * into account. This is under lock_page() now.
2197          */
2198         if (!(gfp_mask & __GFP_WAIT)) {
2199                 struct page_cgroup *pc;
2200
2201                 pc = lookup_page_cgroup(page);
2202                 if (!pc)
2203                         return 0;
2204                 lock_page_cgroup(pc);
2205                 if (PageCgroupUsed(pc)) {
2206                         unlock_page_cgroup(pc);
2207                         return 0;
2208                 }
2209                 unlock_page_cgroup(pc);
2210         }
2211
2212         if (unlikely(!mm))
2213                 mm = &init_mm;
2214
2215         if (page_is_file_cache(page))
2216                 return mem_cgroup_charge_common(page, mm, gfp_mask,
2217                                 MEM_CGROUP_CHARGE_TYPE_CACHE);
2218
2219         /* shmem */
2220         if (PageSwapCache(page)) {
2221                 struct mem_cgroup *mem = NULL;
2222
2223                 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2224                 if (!ret)
2225                         __mem_cgroup_commit_charge_swapin(page, mem,
2226                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
2227         } else
2228                 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2229                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
2230
2231         return ret;
2232 }
2233
2234 /*
2235  * While swap-in, try_charge -> commit or cancel, the page is locked.
2236  * And when try_charge() successfully returns, one refcnt to memcg without
2237  * struct page_cgroup is acquired. This refcnt will be consumed by
2238  * "commit()" or removed by "cancel()"
2239  */
2240 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2241                                  struct page *page,
2242                                  gfp_t mask, struct mem_cgroup **ptr)
2243 {
2244         struct mem_cgroup *mem;
2245         int ret;
2246
2247         if (mem_cgroup_disabled())
2248                 return 0;
2249
2250         if (!do_swap_account)
2251                 goto charge_cur_mm;
2252         /*
2253          * A racing thread's fault, or swapoff, may have already updated
2254          * the pte, and even removed page from swap cache: in those cases
2255          * do_swap_page()'s pte_same() test will fail; but there's also a
2256          * KSM case which does need to charge the page.
2257          */
2258         if (!PageSwapCache(page))
2259                 goto charge_cur_mm;
2260         mem = try_get_mem_cgroup_from_page(page);
2261         if (!mem)
2262                 goto charge_cur_mm;
2263         *ptr = mem;
2264         ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2265         css_put(&mem->css);
2266         return ret;
2267 charge_cur_mm:
2268         if (unlikely(!mm))
2269                 mm = &init_mm;
2270         return __mem_cgroup_try_charge(mm, mask, ptr, true);
2271 }
2272
2273 static void
2274 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2275                                         enum charge_type ctype)
2276 {
2277         struct page_cgroup *pc;
2278
2279         if (mem_cgroup_disabled())
2280                 return;
2281         if (!ptr)
2282                 return;
2283         cgroup_exclude_rmdir(&ptr->css);
2284         pc = lookup_page_cgroup(page);
2285         mem_cgroup_lru_del_before_commit_swapcache(page);
2286         __mem_cgroup_commit_charge(ptr, pc, ctype);
2287         mem_cgroup_lru_add_after_commit_swapcache(page);
2288         /*
2289          * Now swap is on-memory. This means this page may be
2290          * counted both as mem and swap....double count.
2291          * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2292          * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
2293          * may call delete_from_swap_cache() before reach here.
2294          */
2295         if (do_swap_account && PageSwapCache(page)) {
2296                 swp_entry_t ent = {.val = page_private(page)};
2297                 unsigned short id;
2298                 struct mem_cgroup *memcg;
2299
2300                 id = swap_cgroup_record(ent, 0);
2301                 rcu_read_lock();
2302                 memcg = mem_cgroup_lookup(id);
2303                 if (memcg) {
2304                         /*
2305                          * This recorded memcg can be obsolete one. So, avoid
2306                          * calling css_tryget
2307                          */
2308                         if (!mem_cgroup_is_root(memcg))
2309                                 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2310                         mem_cgroup_swap_statistics(memcg, false);
2311                         mem_cgroup_put(memcg);
2312                 }
2313                 rcu_read_unlock();
2314         }
2315         /*
2316          * At swapin, we may charge account against cgroup which has no tasks.
2317          * So, rmdir()->pre_destroy() can be called while we do this charge.
2318          * In that case, we need to call pre_destroy() again. check it here.
2319          */
2320         cgroup_release_and_wakeup_rmdir(&ptr->css);
2321 }
2322
/*
 * Commit a swap-in charge held in @ptr to @page, using the
 * MEM_CGROUP_CHARGE_TYPE_MAPPED charge type.
 */
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
        __mem_cgroup_commit_charge_swapin(page, ptr,
                                        MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
2328
/*
 * Cancel a swap-in charge that will not be committed. Does nothing when the
 * controller is disabled or @mem is NULL.
 */
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
        if (!mem_cgroup_disabled() && mem)
                mem_cgroup_cancel_charge(mem);
}
2337
2338 static void
2339 __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2340 {
2341         struct memcg_batch_info *batch = NULL;
2342         bool uncharge_memsw = true;
2343         /* If swapout, usage of swap doesn't decrease */
2344         if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2345                 uncharge_memsw = false;
2346
2347         batch = &current->memcg_batch;
2348         /*
2349          * In usual, we do css_get() when we remember memcg pointer.
2350          * But in this case, we keep res->usage until end of a series of
2351          * uncharges. Then, it's ok to ignore memcg's refcnt.
2352          */
2353         if (!batch->memcg)
2354                 batch->memcg = mem;
2355         /*
2356          * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2357          * In those cases, all pages freed continously can be expected to be in
2358          * the same cgroup and we have chance to coalesce uncharges.
2359          * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
2360          * because we want to do uncharge as soon as possible.
2361          */
2362
2363         if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2364                 goto direct_uncharge;
2365
2366         /*
2367          * In typical case, batch->memcg == mem. This means we can
2368          * merge a series of uncharges to an uncharge of res_counter.
2369          * If not, we uncharge res_counter ony by one.
2370          */
2371         if (batch->memcg != mem)
2372                 goto direct_uncharge;
2373         /* remember freed charge and uncharge it later */
2374         batch->bytes += PAGE_SIZE;
2375         if (uncharge_memsw)
2376                 batch->memsw_bytes += PAGE_SIZE;
2377         return;
2378 direct_uncharge:
2379         res_counter_uncharge(&mem->res, PAGE_SIZE);
2380         if (uncharge_memsw)
2381                 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2382         if (unlikely(batch->memcg != mem))
2383                 memcg_oom_recover(mem);
2384         return;
2385 }
2386
/*
 * Uncharge @page if the rules for @ctype allow it (e.g. uncharge if
 * !page_mapped(page) for MAPPED/DROP).
 *
 * Returns the mem_cgroup the page was accounted to when the uncharge was
 * done, and NULL when nothing was uncharged: controller disabled, page in
 * swap cache, page_cgroup missing or not USED, or the @ctype check refused.
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;

        if (mem_cgroup_disabled())
                return NULL;

        if (PageSwapCache(page))
                return NULL;

        /*
         * Check if our page_cgroup is valid
         */
        pc = lookup_page_cgroup(page);
        if (unlikely(!pc || !PageCgroupUsed(pc)))
                return NULL;

        lock_page_cgroup(pc);

        mem = pc->mem_cgroup;

        /* re-check USED now that the page_cgroup lock is held */
        if (!PageCgroupUsed(pc))
                goto unlock_out;

        switch (ctype) {
        case MEM_CGROUP_CHARGE_TYPE_MAPPED:
        case MEM_CGROUP_CHARGE_TYPE_DROP:
                /* See mem_cgroup_prepare_migration() */
                if (page_mapped(page) || PageCgroupMigration(pc))
                        goto unlock_out;
                break;
        case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
                if (!PageAnon(page)) {  /* Shared memory */
                        if (page->mapping && !page_is_file_cache(page))
                                goto unlock_out;
                } else if (page_mapped(page)) /* Anon */
                                goto unlock_out;
                break;
        default:
                break;
        }

        mem_cgroup_charge_statistics(mem, pc, false);

        ClearPageCgroupUsed(pc);
        /*
         * pc->mem_cgroup is not cleared here. It will be accessed when it's
         * freed from LRU. This is safe because an uncharged page is expected
         * not to be reused (freed soon). The exception is SwapCache, which is
         * handled by special functions.
         */

        unlock_page_cgroup(pc);
        /*
         * Even after unlock, we have mem->res.usage here and this memcg
         * will never be freed.
         */
        memcg_check_events(mem, page);
        if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
                mem_cgroup_swap_statistics(mem, true);
                mem_cgroup_get(mem);
        }
        /* the root cgroup is skipped when giving back the charge */
        if (!mem_cgroup_is_root(mem))
                __do_uncharge(mem, ctype);

        return mem;

unlock_out:
        unlock_page_cgroup(pc);
        return NULL;
}
2463
2464 void mem_cgroup_uncharge_page(struct page *page)
2465 {
2466         /* early check. */
2467         if (page_mapped(page))
2468                 return;
2469         if (page->mapping && !PageAnon(page))
2470                 return;
2471         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2472 }
2473
/*
 * Uncharge @page as MEM_CGROUP_CHARGE_TYPE_CACHE. The page must no longer
 * be mapped and must have no ->mapping (both are VM_BUG_ON checks).
 */
void mem_cgroup_uncharge_cache_page(struct page *page)
{
        VM_BUG_ON(page_mapped(page));
        VM_BUG_ON(page->mapping);
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
2480
2481 /*
2482  * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
2483  * In that cases, pages are freed continuously and we can expect pages
2484  * are in the same memcg. All these calls itself limits the number of
2485  * pages freed at once, then uncharge_start/end() is called properly.
2486  * This may be called prural(2) times in a context,
2487  */
2488
2489 void mem_cgroup_uncharge_start(void)
2490 {
2491         current->memcg_batch.do_batch++;
2492         /* We can do nest. */
2493         if (current->memcg_batch.do_batch == 1) {
2494                 current->memcg_batch.memcg = NULL;
2495                 current->memcg_batch.bytes = 0;
2496                 current->memcg_batch.memsw_bytes = 0;
2497         }
2498 }
2499
2500 void mem_cgroup_uncharge_end(void)
2501 {
2502         struct memcg_batch_info *batch = &current->memcg_batch;
2503
2504         if (!batch->do_batch)
2505                 return;
2506
2507         batch->do_batch--;
2508         if (batch->do_batch) /* If stacked, do nothing. */
2509                 return;
2510
2511         if (!batch->memcg)
2512                 return;
2513         /*
2514          * This "batch->memcg" is valid without any css_get/put etc...
2515          * bacause we hide charges behind us.
2516          */
2517         if (batch->bytes)
2518                 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2519         if (batch->memsw_bytes)
2520                 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2521         memcg_oom_recover(batch->memcg);
2522         /* forget this pointer (for sanity check) */
2523         batch->memcg = NULL;
2524 }
2525
2526 #ifdef CONFIG_SWAP
2527 /*
2528  * called after __delete_from_swap_cache() and drop "page" account.
2529  * memcg information is recorded to swap_cgroup of "ent"
2530  */
2531 void
2532 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2533 {
2534         struct mem_cgroup *memcg;
2535         int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2536
2537         if (!swapout) /* this was a swap cache but the swap is unused ! */
2538                 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2539
2540         memcg = __mem_cgroup_uncharge_common(page, ctype);
2541
2542         /*
2543          * record memcg information,  if swapout && memcg != NULL,
2544          * mem_cgroup_get() was called in uncharge().
2545          */
2546         if (do_swap_account && swapout && memcg)
2547                 swap_cgroup_record(ent, css_id(&memcg->css));
2548 }
2549 #endif
2550
2551 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2552 /*
2553  * called from swap_entry_free(). remove record in swap_cgroup and
2554  * uncharge "memsw" account.
2555  */
2556 void mem_cgroup_uncharge_swap(swp_entry_t ent)
2557 {
2558         struct mem_cgroup *memcg;
2559         unsigned short id;
2560
2561         if (!do_swap_account)
2562                 return;
2563
2564         id = swap_cgroup_record(ent, 0);
2565         rcu_read_lock();
2566         memcg = mem_cgroup_lookup(id);
2567         if (memcg) {
2568                 /*
2569                  * We uncharge this because swap is freed.
2570                  * This memcg can be obsolete one. We avoid calling css_tryget
2571                  */
2572                 if (!mem_cgroup_is_root(memcg))
2573                         res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2574                 mem_cgroup_swap_statistics(memcg, false);
2575                 mem_cgroup_put(memcg);
2576         }
2577         rcu_read_unlock();
2578 }
2579
2580 /**
2581  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2582  * @entry: swap entry to be moved
2583  * @from:  mem_cgroup which the entry is moved from
2584  * @to:  mem_cgroup which the entry is moved to
2585  * @need_fixup: whether we should fixup res_counters and refcounts.
2586  *
2587  * It succeeds only when the swap_cgroup's record for this entry is the same
2588  * as the mem_cgroup's id of @from.
2589  *
2590  * Returns 0 on success, -EINVAL on failure.
2591  *
2592  * The caller must have charged to @to, IOW, called res_counter_charge() about
2593  * both res and memsw, and called css_get().
2594  */
2595 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2596                 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2597 {
2598         unsigned short old_id, new_id;
2599
2600         old_id = css_id(&from->css);
2601         new_id = css_id(&to->css);
2602
2603         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2604                 mem_cgroup_swap_statistics(from, false);
2605                 mem_cgroup_swap_statistics(to, true);
2606                 /*
2607                  * This function is only called from task migration context now.
2608                  * It postpones res_counter and refcount handling till the end
2609                  * of task migration(mem_cgroup_clear_mc()) for performance
2610                  * improvement. But we cannot postpone mem_cgroup_get(to)
2611                  * because if the process that has been moved to @to does
2612                  * swap-in, the refcount of @to might be decreased to 0.
2613                  */
2614                 mem_cgroup_get(to);
2615                 if (need_fixup) {
2616                         if (!mem_cgroup_is_root(from))
2617                                 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2618                         mem_cgroup_put(from);
2619                         /*
2620                          * we charged both to->res and to->memsw, so we should
2621                          * uncharge to->res.
2622                          */
2623                         if (!mem_cgroup_is_root(to))
2624                                 res_counter_uncharge(&to->res, PAGE_SIZE);
2625                 }
2626                 return 0;
2627         }
2628         return -EINVAL;
2629 }
2630 #else
2631 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2632                 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2633 {
2634         return -EINVAL;
2635 }
2636 #endif
2637
2638 /*
2639  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2640  * page belongs to.
2641  */
2642 int mem_cgroup_prepare_migration(struct page *page,
2643         struct page *newpage, struct mem_cgroup **ptr)
2644 {
2645         struct page_cgroup *pc;
2646         struct mem_cgroup *mem = NULL;
2647         enum charge_type ctype;
2648         int ret = 0;
2649
2650         if (mem_cgroup_disabled())
2651                 return 0;
2652
2653         pc = lookup_page_cgroup(page);
2654         lock_page_cgroup(pc);
2655         if (PageCgroupUsed(pc)) {
2656                 mem = pc->mem_cgroup;
2657                 css_get(&mem->css);
2658                 /*
2659                  * At migrating an anonymous page, its mapcount goes down
2660                  * to 0 and uncharge() will be called. But, even if it's fully
2661                  * unmapped, migration may fail and this page has to be
2662                  * charged again. We set MIGRATION flag here and delay uncharge
2663                  * until end_migration() is called
2664                  *
2665                  * Corner Case Thinking
2666                  * A)
2667                  * When the old page was mapped as Anon and it's unmap-and-freed
2668                  * while migration was ongoing.
2669                  * If unmap finds the old page, uncharge() of it will be delayed
2670                  * until end_migration(). If unmap finds a new page, it's
2671                  * uncharged when it make mapcount to be 1->0. If unmap code
2672                  * finds swap_migration_entry, the new page will not be mapped
2673                  * and end_migration() will find it(mapcount==0).
2674                  *
2675                  * B)
2676                  * When the old page was mapped but migraion fails, the kernel
2677                  * remaps it. A charge for it is kept by MIGRATION flag even
2678                  * if mapcount goes down to 0. We can do remap successfully
2679                  * without charging it again.
2680                  *
2681                  * C)
2682                  * The "old" page is under lock_page() until the end of
2683                  * migration, so, the old page itself will not be swapped-out.
2684                  * If the new page is swapped out before end_migraton, our
2685                  * hook to usual swap-out path will catch the event.
2686                  */
2687                 if (PageAnon(page))
2688                         SetPageCgroupMigration(pc);
2689         }
2690         unlock_page_cgroup(pc);
2691         /*
2692          * If the page is not charged at this point,
2693          * we return here.
2694          */
2695         if (!mem)
2696                 return 0;
2697
2698         *ptr = mem;
2699         ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2700         css_put(&mem->css);/* drop extra refcnt */
2701         if (ret || *ptr == NULL) {
2702                 if (PageAnon(page)) {
2703                         lock_page_cgroup(pc);
2704                         ClearPageCgroupMigration(pc);
2705                         unlock_page_cgroup(pc);
2706                         /*
2707                          * The old page may be fully unmapped while we kept it.
2708                          */
2709                         mem_cgroup_uncharge_page(page);
2710                 }
2711                 return -ENOMEM;
2712         }
2713         /*
2714          * We charge new page before it's used/mapped. So, even if unlock_page()
2715          * is called before end_migration, we can catch all events on this new
2716          * page. In the case new page is migrated but not remapped, new page's
2717          * mapcount will be finally 0 and we call uncharge in end_migration().
2718          */
2719         pc = lookup_page_cgroup(newpage);
2720         if (PageAnon(page))
2721                 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2722         else if (page_is_file_cache(page))
2723                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2724         else
2725                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2726         __mem_cgroup_commit_charge(mem, pc, ctype);
2727         return ret;
2728 }
2729
2730 /* remove redundant charge if migration failed*/
2731 void mem_cgroup_end_migration(struct mem_cgroup *mem,
2732         struct page *oldpage, struct page *newpage)
2733 {
2734         struct page *used, *unused;
2735         struct page_cgroup *pc;
2736
2737         if (!mem)
2738                 return;
2739         /* blocks rmdir() */
2740         cgroup_exclude_rmdir(&mem->css);
2741         /* at migration success, oldpage->mapping is NULL. */
2742         if (oldpage->mapping) {
2743                 used = oldpage;
2744                 unused = newpage;
2745         } else {
2746                 used = newpage;
2747                 unused = oldpage;
2748         }
2749         /*
2750          * We disallowed uncharge of pages under migration because mapcount
2751          * of the page goes down to zero, temporarly.
2752          * Clear the flag and check the page should be charged.
2753          */
2754         pc = lookup_page_cgroup(oldpage);
2755         lock_page_cgroup(pc);
2756         ClearPageCgroupMigration(pc);
2757         unlock_page_cgroup(pc);
2758
2759         __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2760
2761         /*
2762          * If a page is a file cache, radix-tree replacement is very atomic
2763          * and we can skip this check. When it was an Anon page, its mapcount
2764          * goes down to 0. But because we added MIGRATION flage, it's not
2765          * uncharged yet. There are several case but page->mapcount check
2766          * and USED bit check in mem_cgroup_uncharge_page() will do enough
2767          * check. (see prepare_charge() also)
2768          */
2769         if (PageAnon(used))
2770                 mem_cgroup_uncharge_page(used);
2771         /*
2772          * At migration, we may charge account against cgroup which has no
2773          * tasks.
2774          * So, rmdir()->pre_destroy() can be called while we do this charge.
2775          * In that case, we need to call pre_destroy() again. check it here.
2776          */
2777         cgroup_release_and_wakeup_rmdir(&mem->css);
2778 }
2779
2780 /*
2781  * A call to try to shrink memory usage on charge failure at shmem's swapin.
2782  * Calling hierarchical_reclaim is not enough because we should update
2783  * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2784  * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
2785  * not from the memcg which this page would be charged to.
2786  * try_charge_swapin does all of these works properly.
2787  */
2788 int mem_cgroup_shmem_charge_fallback(struct page *page,
2789                             struct mm_struct *mm,
2790                             gfp_t gfp_mask)
2791 {
2792         struct mem_cgroup *mem = NULL;
2793         int ret;
2794
2795         if (mem_cgroup_disabled())
2796                 return 0;
2797
2798         ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2799         if (!ret)
2800                 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
2801
2802         return ret;
2803 }
2804
2805 static DEFINE_MUTEX(set_limit_mutex);
2806
2807 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2808                                 unsigned long long val)
2809 {
2810         int retry_count;
2811         u64 memswlimit, memlimit;
2812         int ret = 0;
2813         int children = mem_cgroup_count_children(memcg);
2814         u64 curusage, oldusage;
2815         int enlarge;
2816
2817         /*
2818          * For keeping hierarchical_reclaim simple, how long we should retry
2819          * is depends on callers. We set our retry-count to be function
2820          * of # of children which we should visit in this loop.
2821          */
2822         retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2823
2824         oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2825
2826         enlarge = 0;
2827         while (retry_count) {
2828                 if (signal_pending(current)) {
2829                         ret = -EINTR;
2830                         break;
2831                 }
2832                 /*
2833                  * Rather than hide all in some function, I do this in
2834                  * open coded manner. You see what this really does.
2835                  * We have to guarantee mem->res.limit < mem->memsw.limit.
2836                  */
2837                 mutex_lock(&set_limit_mutex);
2838                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2839                 if (memswlimit < val) {
2840                         ret = -EINVAL;
2841                         mutex_unlock(&set_limit_mutex);
2842                         break;
2843                 }
2844
2845                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2846                 if (memlimit < val)
2847                         enlarge = 1;
2848
2849                 ret = res_counter_set_limit(&memcg->res, val);
2850                 if (!ret) {
2851                         if (memswlimit == val)
2852                                 memcg->memsw_is_minimum = true;
2853                         else
2854                                 memcg->memsw_is_minimum = false;
2855                 }
2856                 mutex_unlock(&set_limit_mutex);
2857
2858                 if (!ret)
2859                         break;
2860
2861                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2862                                                 MEM_CGROUP_RECLAIM_SHRINK);
2863                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2864                 /* Usage is reduced ? */
2865                 if (curusage >= oldusage)
2866                         retry_count--;
2867                 else
2868                         oldusage = curusage;
2869         }
2870         if (!ret && enlarge)
2871                 memcg_oom_recover(memcg);
2872
2873         return ret;
2874 }
2875
2876 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2877                                         unsigned long long val)
2878 {
2879         int retry_count;
2880         u64 memlimit, memswlimit, oldusage, curusage;
2881         int children = mem_cgroup_count_children(memcg);
2882         int ret = -EBUSY;
2883         int enlarge = 0;
2884
2885         /* see mem_cgroup_resize_res_limit */
2886         retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2887         oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2888         while (retry_count) {
2889                 if (signal_pending(current)) {
2890                         ret = -EINTR;
2891                         break;
2892                 }
2893                 /*
2894                  * Rather than hide all in some function, I do this in
2895                  * open coded manner. You see what this really does.
2896                  * We have to guarantee mem->res.limit < mem->memsw.limit.
2897                  */
2898                 mutex_lock(&set_limit_mutex);
2899                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2900                 if (memlimit > val) {
2901                         ret = -EINVAL;
2902                         mutex_unlock(&set_limit_mutex);
2903                         break;
2904                 }
2905                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2906                 if (memswlimit < val)
2907                         enlarge = 1;
2908                 ret = res_counter_set_limit(&memcg->memsw, val);
2909                 if (!ret) {
2910                         if (memlimit == val)
2911                                 memcg->memsw_is_minimum = true;
2912                         else
2913                                 memcg->memsw_is_minimum = false;
2914                 }
2915                 mutex_unlock(&set_limit_mutex);
2916
2917                 if (!ret)
2918                         break;
2919
2920                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2921                                                 MEM_CGROUP_RECLAIM_NOSWAP |
2922                                                 MEM_CGROUP_RECLAIM_SHRINK);
2923                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2924                 /* Usage is reduced ? */
2925                 if (curusage >= oldusage)
2926                         retry_count--;
2927                 else
2928                         oldusage = curusage;
2929         }
2930         if (!ret && enlarge)
2931                 memcg_oom_recover(memcg);
2932         return ret;
2933 }
2934
2935 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2936                                             gfp_t gfp_mask)
2937 {
2938         unsigned long nr_reclaimed = 0;
2939         struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2940         unsigned long reclaimed;
2941         int loop = 0;
2942         struct mem_cgroup_tree_per_zone *mctz;
2943         unsigned long long excess;
2944
2945         if (order > 0)
2946                 return 0;
2947
2948         mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
2949         /*
2950          * This loop can run a while, specially if mem_cgroup's continuously
2951          * keep exceeding their soft limit and putting the system under
2952          * pressure
2953          */
2954         do {
2955                 if (next_mz)
2956                         mz = next_mz;
2957                 else
2958                         mz = mem_cgroup_largest_soft_limit_node(mctz);
2959                 if (!mz)
2960                         break;
2961
2962                 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2963                                                 gfp_mask,
2964                                                 MEM_CGROUP_RECLAIM_SOFT);
2965                 nr_reclaimed += reclaimed;
2966                 spin_lock(&mctz->lock);
2967
2968                 /*
2969                  * If we failed to reclaim anything from this memory cgroup
2970                  * it is time to move on to the next cgroup
2971                  */
2972                 next_mz = NULL;
2973                 if (!reclaimed) {
2974                         do {
2975                                 /*
2976                                  * Loop until we find yet another one.
2977                                  *
2978                                  * By the time we get the soft_limit lock
2979                                  * again, someone might have aded the
2980                                  * group back on the RB tree. Iterate to
2981                                  * make sure we get a different mem.
2982                                  * mem_cgroup_largest_soft_limit_node returns
2983                                  * NULL if no other cgroup is present on
2984                                  * the tree
2985                                  */
2986                                 next_mz =
2987                                 __mem_cgroup_largest_soft_limit_node(mctz);
2988                                 if (next_mz == mz) {
2989                                         css_put(&next_mz->mem->css);
2990                                         next_mz = NULL;
2991                                 } else /* next_mz == NULL or other memcg */
2992                                         break;
2993                         } while (1);
2994                 }
2995                 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2996                 excess = res_counter_soft_limit_excess(&mz->mem->res);
2997                 /*
2998                  * One school of thought says that we should not add
2999                  * back the node to the tree if reclaim returns 0.
3000                  * But our reclaim could return 0, simply because due
3001                  * to priority we are exposing a smaller subset of
3002                  * memory to reclaim from. Consider this as a longer
3003                  * term TODO.
3004                  */
3005                 /* If excess == 0, no tree ops */
3006                 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3007                 spin_unlock(&mctz->lock);
3008                 css_put(&mz->mem->css);
3009                 loop++;
3010                 /*
3011                  * Could not reclaim anything and there are no more
3012                  * mem cgroups to try or we seem to be looping without
3013                  * reclaiming anything.
3014                  */
3015                 if (!nr_reclaimed &&
3016                         (next_mz == NULL ||
3017                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3018                         break;
3019         } while (!nr_reclaimed);
3020         if (next_mz)
3021                 css_put(&next_mz->mem->css);
3022         return nr_reclaimed;
3023 }
3024
3025 /*
3026  * This routine traverse page_cgroup in given list and drop them all.
3027  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
3028  */
3029 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3030                                 int node, int zid, enum lru_list lru)
3031 {
3032         struct zone *zone;
3033         struct mem_cgroup_per_zone *mz;
3034         struct page_cgroup *pc, *busy;
3035         unsigned long flags, loop;
3036         struct list_head *list;
3037         int ret = 0;
3038
3039         zone = &NODE_DATA(node)->node_zones[zid];
3040         mz = mem_cgroup_zoneinfo(mem, node, zid);
3041         list = &mz->lists[lru];
3042
3043         loop = MEM_CGROUP_ZSTAT(mz, lru);
3044         /* give some margin against EBUSY etc...*/
3045         loop += 256;
3046         busy = NULL;
3047         while (loop--) {
3048                 ret = 0;
3049                 spin_lock_irqsave(&zone->lru_lock, flags);
3050                 if (list_empty(list)) {
3051                         spin_unlock_irqrestore(&zone->lru_lock, flags);
3052                         break;
3053                 }
3054                 pc = list_entry(list->prev, struct page_cgroup, lru);
3055                 if (busy == pc) {
3056                         list_move(&pc->lru, list);
3057                         busy = NULL;
3058                         spin_unlock_irqrestore(&zone->lru_lock, flags);
3059                         continue;
3060                 }
3061                 spin_unlock_irqrestore(&zone->lru_lock, flags);
3062
3063                 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
3064                 if (ret == -ENOMEM)
3065                         break;
3066
3067                 if (ret == -EBUSY || ret == -EINVAL) {
3068                         /* found lock contention or "pc" is obsolete. */
3069                         busy = pc;
3070                         cond_resched();
3071                 } else
3072                         busy = NULL;
3073         }
3074
3075         if (!ret && !list_empty(list))
3076                 return -EBUSY;
3077         return ret;
3078 }
3079
3080 /*
3081  * make mem_cgroup's charge to be 0 if there is no task.
3082  * This enables deleting this mem_cgroup.
3083  */
3084 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3085 {
3086         int ret;
3087         int node, zid, shrink;
3088         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3089         struct cgroup *cgrp = mem->css.cgroup;
3090
3091         css_get(&mem->css);
3092
3093         shrink = 0;
3094         /* should free all ? */
3095         if (free_all)
3096                 goto try_to_free;
3097 move_account:
3098         do {
3099                 ret = -EBUSY;
3100                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3101                         goto out;
3102                 ret = -EINTR;
3103                 if (signal_pending(current))
3104                         goto out;
3105                 /* This is for making all *used* pages to be on LRU. */
3106                 lru_add_drain_all();
3107                 drain_all_stock_sync();
3108                 ret = 0;
3109                 mem_cgroup_start_move(mem);
3110                 for_each_node_state(node, N_HIGH_MEMORY) {
3111                         for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3112                                 enum lru_list l;
3113                                 for_each_lru(l) {
3114                                         ret = mem_cgroup_force_empty_list(mem,
3115                                                         node, zid, l);
3116                                         if (ret)
3117                                                 break;
3118                                 }
3119                         }
3120                         if (ret)
3121                                 break;
3122                 }
3123                 mem_cgroup_end_move(mem);
3124                 memcg_oom_recover(mem);
3125                 /* it seems parent cgroup doesn't have enough mem */
3126                 if (ret == -ENOMEM)
3127                         goto try_to_free;
3128                 cond_resched();
3129         /* "ret" should also be checked to ensure all lists are empty. */
3130         } while (mem->res.usage > 0 || ret);
3131 out:
3132         css_put(&mem->css);
3133         return ret;
3134
3135 try_to_free:
3136         /* returns EBUSY if there is a task or if we come here twice. */
3137         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3138                 ret = -EBUSY;
3139                 goto out;
3140         }
3141         /* we call try-to-free pages for make this cgroup empty */
3142         lru_add_drain_all();
3143         /* try to free all pages in this cgroup */
3144         shrink = 1;
3145         while (nr_retries && mem->res.usage > 0) {
3146                 int progress;
3147
3148                 if (signal_pending(current)) {
3149                         ret = -EINTR;
3150                         goto out;
3151                 }
3152                 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3153                                                 false, get_swappiness(mem));
3154                 if (!progress) {
3155                         nr_retries--;
3156                         /* maybe some writeback is necessary */
3157                         congestion_wait(BLK_RW_ASYNC, HZ/10);
3158                 }
3159
3160         }
3161         lru_add_drain();
3162         /* try move_account...there may be some *locked* pages. */
3163         goto move_account;
3164 }
3165
3166 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3167 {
3168         return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3169 }
3170
3171
3172 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3173 {
3174         return mem_cgroup_from_cont(cont)->use_hierarchy;
3175 }
3176
3177 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3178                                         u64 val)
3179 {
3180         int retval = 0;
3181         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3182         struct cgroup *parent = cont->parent;
3183         struct mem_cgroup *parent_mem = NULL;
3184
3185         if (parent)
3186                 parent_mem = mem_cgroup_from_cont(parent);
3187
3188         cgroup_lock();
3189         /*
3190          * If parent's use_hierarchy is set, we can't make any modifications
3191          * in the child subtrees. If it is unset, then the change can
3192          * occur, provided the current cgroup has no children.
3193          *
3194          * For the root cgroup, parent_mem is NULL, we allow value to be
3195          * set if there are no children.
3196          */
3197         if ((!parent_mem || !parent_mem->use_hierarchy) &&
3198                                 (val == 1 || val == 0)) {
3199                 if (list_empty(&cont->children))
3200                         mem->use_hierarchy = val;
3201                 else
3202                         retval = -EBUSY;
3203         } else
3204                 retval = -EINVAL;
3205         cgroup_unlock();
3206
3207         return retval;
3208 }
3209
3210 struct mem_cgroup_idx_data {
3211         s64 val;
3212         enum mem_cgroup_stat_index idx;
3213 };
3214
3215 static int
3216 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
3217 {
3218         struct mem_cgroup_idx_data *d = data;
3219         d->val += mem_cgroup_read_stat(mem, d->idx);
3220         return 0;
3221 }
3222
3223 static void
3224 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3225                                 enum mem_cgroup_stat_index idx, s64 *val)
3226 {
3227         struct mem_cgroup_idx_data d;
3228         d.idx = idx;
3229         d.val = 0;
3230         mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3231         *val = d.val;
3232 }
3233
3234 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3235 {
3236         u64 idx_val, val;
3237
3238         if (!mem_cgroup_is_root(mem)) {
3239                 if (!swap)
3240                         return res_counter_read_u64(&mem->res, RES_USAGE);
3241                 else
3242                         return res_counter_read_u64(&mem->memsw, RES_USAGE);
3243         }
3244
3245         mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
3246         val = idx_val;
3247         mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3248         val += idx_val;
3249
3250         if (swap) {
3251                 mem_cgroup_get_recursive_idx_stat(mem,
3252                                 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
3253                 val += idx_val;
3254         }
3255
3256         return val << PAGE_SHIFT;
3257 }
3258
3259 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3260 {
3261         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3262         u64 val;
3263         int type, name;
3264
3265         type = MEMFILE_TYPE(cft->private);
3266         name = MEMFILE_ATTR(cft->private);
3267         switch (type) {
3268         case _MEM:
3269                 if (name == RES_USAGE)
3270                         val = mem_cgroup_usage(mem, false);
3271                 else
3272                         val = res_counter_read_u64(&mem->res, name);
3273                 break;
3274         case _MEMSWAP:
3275                 if (name == RES_USAGE)
3276                         val = mem_cgroup_usage(mem, true);
3277                 else
3278                         val = res_counter_read_u64(&mem->memsw, name);
3279                 break;
3280         default:
3281                 BUG();
3282                 break;
3283         }
3284         return val;
3285 }
3286 /*
3287  * The user of this function is...
3288  * RES_LIMIT.
3289  */
3290 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3291                             const char *buffer)
3292 {
3293         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3294         int type, name;
3295         unsigned long long val;
3296         int ret;
3297
3298         type = MEMFILE_TYPE(cft->private);
3299         name = MEMFILE_ATTR(cft->private);
3300         switch (name) {
3301         case RES_LIMIT:
3302                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3303                         ret = -EINVAL;
3304                         break;
3305                 }
3306                 /* This function does all necessary parse...reuse it */
3307                 ret = res_counter_memparse_write_strategy(buffer, &val);
3308                 if (ret)
3309                         break;
3310                 if (type == _MEM)
3311                         ret = mem_cgroup_resize_limit(memcg, val);
3312                 else
3313                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
3314                 break;
3315         case RES_SOFT_LIMIT:
3316                 ret = res_counter_memparse_write_strategy(buffer, &val);
3317                 if (ret)
3318                         break;
3319                 /*
3320                  * For memsw, soft limits are hard to implement in terms
3321                  * of semantics, for now, we support soft limits for
3322                  * control without swap
3323                  */
3324                 if (type == _MEM)
3325                         ret = res_counter_set_soft_limit(&memcg->res, val);
3326                 else
3327                         ret = -EINVAL;
3328                 break;
3329         default:
3330                 ret = -EINVAL; /* should be BUG() ? */
3331                 break;
3332         }
3333         return ret;
3334 }
3335
3336 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3337                 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3338 {
3339         struct cgroup *cgroup;
3340         unsigned long long min_limit, min_memsw_limit, tmp;
3341
3342         min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3343         min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3344         cgroup = memcg->css.cgroup;
3345         if (!memcg->use_hierarchy)
3346                 goto out;
3347
3348         while (cgroup->parent) {
3349                 cgroup = cgroup->parent;
3350                 memcg = mem_cgroup_from_cont(cgroup);
3351                 if (!memcg->use_hierarchy)
3352                         break;
3353                 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3354                 min_limit = min(min_limit, tmp);
3355                 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3356                 min_memsw_limit = min(min_memsw_limit, tmp);
3357         }
3358 out:
3359         *mem_limit = min_limit;
3360         *memsw_limit = min_memsw_limit;
3361         return;
3362 }
3363
3364 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3365 {
3366         struct mem_cgroup *mem;
3367         int type, name;
3368
3369         mem = mem_cgroup_from_cont(cont);
3370         type = MEMFILE_TYPE(event);
3371         name = MEMFILE_ATTR(event);
3372         switch (name) {
3373         case RES_MAX_USAGE:
3374                 if (type == _MEM)
3375                         res_counter_reset_max(&mem->res);
3376                 else
3377                         res_counter_reset_max(&mem->memsw);
3378                 break;
3379         case RES_FAILCNT:
3380                 if (type == _MEM)
3381                         res_counter_reset_failcnt(&mem->res);
3382                 else
3383                         res_counter_reset_failcnt(&mem->memsw);
3384                 break;
3385         }
3386
3387         return 0;
3388 }
3389
3390 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3391                                         struct cftype *cft)
3392 {
3393         return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3394 }
3395
3396 #ifdef CONFIG_MMU
3397 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3398                                         struct cftype *cft, u64 val)
3399 {
3400         struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3401
3402         if (val >= (1 << NR_MOVE_TYPE))
3403                 return -EINVAL;
3404         /*
3405          * We check this value several times in both in can_attach() and
3406          * attach(), so we need cgroup lock to prevent this value from being
3407          * inconsistent.
3408          */
3409         cgroup_lock();
3410         mem->move_charge_at_immigrate = val;
3411         cgroup_unlock();
3412
3413         return 0;
3414 }
3415 #else
3416 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3417                                         struct cftype *cft, u64 val)
3418 {
3419         return -ENOSYS;
3420 }
3421 #endif
3422
3423
/*
 * Indices into mcs_total_stat.stat[] and memcg_stat_strings[], used when
 * reading statistics. NR_MCS_STAT is the number of entries and must stay
 * last.
 */
enum {
	MCS_CACHE = 0,
	MCS_RSS,
	MCS_FILE_MAPPED,
	MCS_PGPGIN,
	MCS_PGPGOUT,
	MCS_SWAP,
	MCS_INACTIVE_ANON,
	MCS_ACTIVE_ANON,
	MCS_INACTIVE_FILE,
	MCS_ACTIVE_FILE,
	MCS_UNEVICTABLE,
	NR_MCS_STAT,
};
3439
3440 struct mcs_total_stat {
3441         s64 stat[NR_MCS_STAT];
3442 };
3443
3444 struct {
3445         char *local_name;
3446         char *total_name;
3447 } memcg_stat_strings[NR_MCS_STAT] = {
3448         {"cache", "total_cache"},
3449         {"rss", "total_rss"},
3450         {"mapped_file", "total_mapped_file"},
3451         {"pgpgin", "total_pgpgin"},
3452         {"pgpgout", "total_pgpgout"},
3453         {"swap", "total_swap"},
3454         {"inactive_anon", "total_inactive_anon"},
3455         {"active_anon", "total_active_anon"},
3456         {"inactive_file", "total_inactive_file"},
3457         {"active_file", "total_active_file"},
3458         {"unevictable", "total_unevictable"}
3459 };
3460
3461
3462 static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3463 {
3464         struct mcs_total_stat *s = data;
3465         s64 val;
3466
3467         /* per cpu stat */
3468         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3469         s->stat[MCS_CACHE] += val * PAGE_SIZE;
3470         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3471         s->stat[MCS_RSS] += val * PAGE_SIZE;
3472         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3473         s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3474         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3475         s->stat[MCS_PGPGIN] += val;
3476         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3477         s->stat[MCS_PGPGOUT] += val;
3478         if (do_swap_account) {
3479                 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3480                 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3481         }
3482
3483         /* per zone stat */
3484         val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3485         s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3486         val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3487         s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3488         val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3489         s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3490         val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3491         s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3492         val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3493         s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3494         return 0;
3495 }
3496
3497 static void
3498 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3499 {
3500         mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
3501 }
3502
3503 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3504                                  struct cgroup_map_cb *cb)
3505 {
3506         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3507         struct mcs_total_stat mystat;
3508         int i;
3509
3510         memset(&mystat, 0, sizeof(mystat));
3511         mem_cgroup_get_local_stat(mem_cont, &mystat);
3512
3513         for (i = 0; i < NR_MCS_STAT; i++) {
3514                 if (i == MCS_SWAP && !do_swap_account)
3515                         continue;
3516                 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3517         }
3518
3519         /* Hierarchical information */
3520         {
3521                 unsigned long long limit, memsw_limit;
3522                 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3523                 cb->fill(cb, "hierarchical_memory_limit", limit);
3524                 if (do_swap_account)
3525                         cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3526         }
3527
3528         memset(&mystat, 0, sizeof(mystat));
3529         mem_cgroup_get_total_stat(mem_cont, &mystat);
3530         for (i = 0; i < NR_MCS_STAT; i++) {
3531                 if (i == MCS_SWAP && !do_swap_account)
3532                         continue;
3533                 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3534         }
3535
3536 #ifdef CONFIG_DEBUG_VM
3537         cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3538
3539         {
3540                 int nid, zid;
3541                 struct mem_cgroup_per_zone *mz;
3542                 unsigned long recent_rotated[2] = {0, 0};
3543                 unsigned long recent_scanned[2] = {0, 0};
3544
3545                 for_each_online_node(nid)
3546                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3547                                 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3548
3549                                 recent_rotated[0] +=
3550                                         mz->reclaim_stat.recent_rotated[0];
3551                                 recent_rotated[1] +=
3552                                         mz->reclaim_stat.recent_rotated[1];
3553                                 recent_scanned[0] +=
3554                                         mz->reclaim_stat.recent_scanned[0];
3555                                 recent_scanned[1] +=
3556                                         mz->reclaim_stat.recent_scanned[1];
3557                         }
3558                 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3559                 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3560                 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3561                 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3562         }
3563 #endif
3564
3565         return 0;
3566 }
3567
3568 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3569 {
3570         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3571
3572         return get_swappiness(memcg);
3573 }
3574
3575 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3576                                        u64 val)
3577 {
3578         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3579         struct mem_cgroup *parent;
3580
3581         if (val > 100)
3582                 return -EINVAL;
3583
3584         if (cgrp->parent == NULL)
3585                 return -EINVAL;
3586
3587         parent = mem_cgroup_from_cont(cgrp->parent);
3588
3589         cgroup_lock();
3590
3591         /* If under hierarchy, only empty-root can set this value */
3592         if ((parent->use_hierarchy) ||
3593             (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3594                 cgroup_unlock();
3595                 return -EINVAL;
3596         }
3597
3598         spin_lock(&memcg->reclaim_param_lock);
3599         memcg->swappiness = val;
3600         spin_unlock(&memcg->reclaim_param_lock);
3601
3602         cgroup_unlock();
3603
3604         return 0;
3605 }
3606
3607 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3608 {
3609         struct mem_cgroup_threshold_ary *t;
3610         u64 usage;
3611         int i;
3612
3613         rcu_read_lock();
3614         if (!swap)
3615                 t = rcu_dereference(memcg->thresholds.primary);
3616         else
3617                 t = rcu_dereference(memcg->memsw_thresholds.primary);
3618
3619         if (!t)
3620                 goto unlock;
3621
3622         usage = mem_cgroup_usage(memcg, swap);
3623
3624         /*
3625          * current_threshold points to threshold just below usage.
3626          * If it's not true, a threshold was crossed after last
3627          * call of __mem_cgroup_threshold().
3628          */
3629         i = t->current_threshold;
3630
3631         /*
3632          * Iterate backward over array of thresholds starting from
3633          * current_threshold and check if a threshold is crossed.
3634          * If none of thresholds below usage is crossed, we read
3635          * only one element of the array here.
3636          */
3637         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3638                 eventfd_signal(t->entries[i].eventfd, 1);
3639
3640         /* i = current_threshold + 1 */
3641         i++;
3642
3643         /*
3644          * Iterate forward over array of thresholds starting from
3645          * current_threshold+1 and check if a threshold is crossed.
3646          * If none of thresholds above usage is crossed, we read
3647          * only one element of the array here.
3648          */
3649         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3650                 eventfd_signal(t->entries[i].eventfd, 1);
3651
3652         /* Update current_threshold */
3653         t->current_threshold = i - 1;
3654 unlock:
3655         rcu_read_unlock();
3656 }
3657
3658 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3659 {
3660         while (memcg) {
3661                 __mem_cgroup_threshold(memcg, false);
3662                 if (do_swap_account)
3663                         __mem_cgroup_threshold(memcg, true);
3664
3665                 memcg = parent_mem_cgroup(memcg);
3666         }
3667 }
3668
3669 static int compare_thresholds(const void *a, const void *b)
3670 {
3671         const struct mem_cgroup_threshold *_a = a;
3672         const struct mem_cgroup_threshold *_b = b;
3673
3674         return _a->threshold - _b->threshold;
3675 }
3676
3677 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3678 {
3679         struct mem_cgroup_eventfd_list *ev;
3680
3681         list_for_each_entry(ev, &mem->oom_notify, list)
3682                 eventfd_signal(ev->eventfd, 1);
3683         return 0;
3684 }
3685
3686 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3687 {
3688         mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
3689 }
3690
3691 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3692         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3693 {
3694         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3695         struct mem_cgroup_thresholds *thresholds;
3696         struct mem_cgroup_threshold_ary *new;
3697         int type = MEMFILE_TYPE(cft->private);
3698         u64 threshold, usage;
3699         int i, size, ret;
3700
3701         ret = res_counter_memparse_write_strategy(args, &threshold);
3702         if (ret)
3703                 return ret;
3704
3705         mutex_lock(&memcg->thresholds_lock);
3706
3707         if (type == _MEM)
3708                 thresholds = &memcg->thresholds;
3709         else if (type == _MEMSWAP)
3710                 thresholds = &memcg->memsw_thresholds;
3711         else
3712                 BUG();
3713
3714         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3715
3716         /* Check if a threshold crossed before adding a new one */
3717         if (thresholds->primary)
3718                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3719
3720         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3721
3722         /* Allocate memory for new array of thresholds */
3723         new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3724                         GFP_KERNEL);
3725         if (!new) {
3726                 ret = -ENOMEM;
3727                 goto unlock;
3728         }
3729         new->size = size;
3730
3731         /* Copy thresholds (if any) to new array */
3732         if (thresholds->primary) {
3733                 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3734                                 sizeof(struct mem_cgroup_threshold));
3735         }
3736
3737         /* Add new threshold */
3738         new->entries[size - 1].eventfd = eventfd;
3739         new->entries[size - 1].threshold = threshold;
3740
3741         /* Sort thresholds. Registering of new threshold isn't time-critical */
3742         sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3743                         compare_thresholds, NULL);
3744
3745         /* Find current threshold */
3746         new->current_threshold = -1;
3747         for (i = 0; i < size; i++) {
3748                 if (new->entries[i].threshold < usage) {
3749                         /*
3750                          * new->current_threshold will not be used until
3751                          * rcu_assign_pointer(), so it's safe to increment
3752                          * it here.
3753                          */
3754                         ++new->current_threshold;
3755                 }
3756         }
3757
3758         /* Free old spare buffer and save old primary buffer as spare */
3759         kfree(thresholds->spare);
3760         thresholds->spare = thresholds->primary;
3761
3762         rcu_assign_pointer(thresholds->primary, new);
3763
3764         /* To be sure that nobody uses thresholds */
3765         synchronize_rcu();
3766
3767 unlock:
3768         mutex_unlock(&memcg->thresholds_lock);
3769
3770         return ret;
3771 }
3772
3773 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
3774         struct cftype *cft, struct eventfd_ctx *eventfd)
3775 {
3776         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3777         struct mem_cgroup_thresholds *thresholds;
3778         struct mem_cgroup_threshold_ary *new;
3779         int type = MEMFILE_TYPE(cft->private);
3780         u64 usage;
3781         int i, j, size;
3782
3783         mutex_lock(&memcg->thresholds_lock);
3784         if (type == _MEM)
3785                 thresholds = &memcg->thresholds;
3786         else if (type == _MEMSWAP)
3787                 thresholds = &memcg->memsw_thresholds;
3788         else
3789                 BUG();
3790
3791         /*
3792          * Something went wrong if we trying to unregister a threshold
3793          * if we don't have thresholds
3794          */
3795         BUG_ON(!thresholds);
3796
3797         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3798
3799         /* Check if a threshold crossed before removing */
3800         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3801
3802         /* Calculate new number of threshold */
3803         size = 0;
3804         for (i = 0; i < thresholds->primary->size; i++) {
3805                 if (thresholds->primary->entries[i].eventfd != eventfd)
3806                         size++;
3807         }
3808
3809         new = thresholds->spare;
3810
3811         /* Set thresholds array to NULL if we don't have thresholds */
3812         if (!size) {
3813                 kfree(new);
3814                 new = NULL;
3815                 goto swap_buffers;
3816         }
3817
3818         new->size = size;
3819
3820         /* Copy thresholds and find current threshold */
3821         new->current_threshold = -1;
3822         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3823                 if (thresholds->primary->entries[i].eventfd == eventfd)
3824                         continue;
3825
3826                 new->entries[j] = thresholds->primary->entries[i];
3827                 if (new->entries[j].threshold < usage) {
3828                         /*
3829                          * new->current_threshold will not be used
3830                          * until rcu_assign_pointer(), so it's safe to increment
3831                          * it here.
3832                          */
3833                         ++new->current_threshold;
3834                 }
3835                 j++;
3836         }
3837
3838 swap_buffers:
3839         /* Swap primary and spare array */
3840         thresholds->spare = thresholds->primary;
3841         rcu_assign_pointer(thresholds->primary, new);
3842
3843         /* To be sure that nobody uses thresholds */
3844         synchronize_rcu();
3845
3846         mutex_unlock(&memcg->thresholds_lock);
3847 }
3848
3849 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
3850         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3851 {
3852         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3853         struct mem_cgroup_eventfd_list *event;
3854         int type = MEMFILE_TYPE(cft->private);
3855
3856         BUG_ON(type != _OOM_TYPE);
3857         event = kmalloc(sizeof(*event), GFP_KERNEL);
3858         if (!event)
3859                 return -ENOMEM;
3860
3861         mutex_lock(&memcg_oom_mutex);
3862
3863         event->eventfd = eventfd;
3864         list_add(&event->list, &memcg->oom_notify);
3865
3866         /* already in OOM ? */
3867         if (atomic_read(&memcg->oom_lock))
3868                 eventfd_signal(eventfd, 1);
3869         mutex_unlock(&memcg_oom_mutex);
3870
3871         return 0;
3872 }
3873
3874 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
3875         struct cftype *cft, struct eventfd_ctx *eventfd)
3876 {
3877         struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3878         struct mem_cgroup_eventfd_list *ev, *tmp;
3879         int type = MEMFILE_TYPE(cft->private);
3880
3881         BUG_ON(type != _OOM_TYPE);
3882
3883         mutex_lock(&memcg_oom_mutex);
3884
3885         list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
3886                 if (ev->eventfd == eventfd) {
3887                         list_del(&ev->list);
3888                         kfree(ev);
3889                 }
3890         }
3891
3892         mutex_unlock(&memcg_oom_mutex);
3893 }
3894
3895 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3896         struct cftype *cft,  struct cgroup_map_cb *cb)
3897 {
3898         struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3899
3900         cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
3901
3902         if (atomic_read(&mem->oom_lock))
3903                 cb->fill(cb, "under_oom", 1);
3904         else
3905                 cb->fill(cb, "under_oom", 0);
3906         return 0;
3907 }
3908
3909 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3910         struct cftype *cft, u64 val)
3911 {
3912         struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3913         struct mem_cgroup *parent;
3914
3915         /* cannot set to root cgroup and only 0 and 1 are allowed */
3916         if (!cgrp->parent || !((val == 0) || (val == 1)))
3917                 return -EINVAL;
3918
3919         parent = mem_cgroup_from_cont(cgrp->parent);
3920
3921         cgroup_lock();
3922         /* oom-kill-disable is a flag for subhierarchy. */
3923         if ((parent->use_hierarchy) ||
3924             (mem->use_hierarchy && !list_empty(&cgrp->children))) {
3925                 cgroup_unlock();
3926                 return -EINVAL;
3927         }
3928         mem->oom_kill_disable = val;
3929         if (!val)
3930                 memcg_oom_recover(mem);
3931         cgroup_unlock();
3932         return 0;
3933 }
3934
3935 static struct cftype mem_cgroup_files[] = {
3936         {
3937                 .name = "usage_in_bytes",
3938                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3939                 .read_u64 = mem_cgroup_read,
3940                 .register_event = mem_cgroup_usage_register_event,
3941                 .unregister_event = mem_cgroup_usage_unregister_event,
3942         },
3943         {
3944                 .name = "max_usage_in_bytes",
3945                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3946                 .trigger = mem_cgroup_reset,
3947                 .read_u64 = mem_cgroup_read,
3948         },
3949         {
3950                 .name = "limit_in_bytes",
3951                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3952                 .write_string = mem_cgroup_write,
3953                 .read_u64 = mem_cgroup_read,
3954         },
3955         {
3956                 .name = "soft_limit_in_bytes",
3957                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3958                 .write_string = mem_cgroup_write,
3959                 .read_u64 = mem_cgroup_read,
3960         },
3961         {
3962                 .name = "failcnt",
3963                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3964                 .trigger = mem_cgroup_reset,
3965                 .read_u64 = mem_cgroup_read,
3966         },
3967         {
3968                 .name = "stat",
3969                 .read_map = mem_control_stat_show,
3970         },
3971         {
3972                 .name = "force_empty",
3973                 .trigger = mem_cgroup_force_empty_write,
3974         },
3975         {
3976                 .name = "use_hierarchy",
3977                 .write_u64 = mem_cgroup_hierarchy_write,
3978                 .read_u64 = mem_cgroup_hierarchy_read,
3979         },
3980         {
3981                 .name = "swappiness",
3982                 .read_u64 = mem_cgroup_swappiness_read,
3983                 .write_u64 = mem_cgroup_swappiness_write,
3984         },
3985         {
3986                 .name = "move_charge_at_immigrate",
3987                 .read_u64 = mem_cgroup_move_charge_read,
3988                 .write_u64 = mem_cgroup_move_charge_write,
3989         },
3990         {
3991                 .name = "oom_control",
3992                 .read_map = mem_cgroup_oom_control_read,
3993                 .write_u64 = mem_cgroup_oom_control_write,
3994                 .register_event = mem_cgroup_oom_register_event,
3995                 .unregister_event = mem_cgroup_oom_unregister_event,
3996                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3997         },
3998 };
3999
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Control files for the memory+swap counter. */
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};

/*
 * Add the memory+swap files to @cont when swap accounting is enabled.
 * Returns 0 when there is nothing to add, otherwise the result of
 * cgroup_add_files().
 */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
}
#else
/* Swap accounting is not built in: there are no files to register. */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif
4042
4043 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4044 {
4045         struct mem_cgroup_per_node *pn;
4046         struct mem_cgroup_per_zone *mz;
4047         enum lru_list l;
4048         int zone, tmp = node;
4049         /*
4050          * This routine is called against possible nodes.
4051          * But it's BUG to call kmalloc() against offline node.
4052          *
4053          * TODO: this routine can waste much memory for nodes which will
4054          *       never be onlined. It's better to use memory hotplug callback
4055          *       function.
4056          */
4057         if (!node_state(node, N_NORMAL_MEMORY))
4058                 tmp = -1;
4059         pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4060         if (!pn)
4061                 return 1;
4062
4063         mem->info.nodeinfo[node] = pn;
4064         memset(pn, 0, sizeof(*pn));
4065
4066         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4067                 mz = &pn->zoneinfo[zone];
4068                 for_each_lru(l)
4069                         INIT_LIST_HEAD(&mz->lists[l]);
4070                 mz->usage_in_excess = 0;
4071                 mz->on_tree = false;
4072                 mz->mem = mem;
4073         }
4074         return 0;
4075 }
4076
4077 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4078 {
4079         kfree(mem->info.nodeinfo[node]);
4080 }
4081
4082 static struct mem_cgroup *mem_cgroup_alloc(void)
4083 {
4084         struct mem_cgroup *mem;
4085         int size = sizeof(struct mem_cgroup);
4086
4087         /* Can be very big if MAX_NUMNODES is very big */
4088         if (size < PAGE_SIZE)
4089                 mem = kmalloc(size, GFP_KERNEL);
4090         else
4091                 mem = vmalloc(size);
4092
4093         if (!mem)
4094                 return NULL;
4095
4096         memset(mem, 0, size);
4097         mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4098         if (!mem->stat) {
4099                 if (size < PAGE_SIZE)
4100                         kfree(mem);
4101                 else
4102                         vfree(mem);
4103                 mem = NULL;
4104         }
4105         return mem;
4106 }
4107
/*
 * When a mem_cgroup is destroyed, references to it from swap_cgroup may
 * still remain (scanning all of them at force_empty would be too costly).
 *
 * Instead of clearing every reference at force_empty, we keep a count of
 * the references held by swap_cgroup and free the mem_cgroup when that
 * count drops to 0.
 *
 * Removal of the cgroup itself succeeds regardless of references from swap.
 */
4118
4119 static void __mem_cgroup_free(struct mem_cgroup *mem)
4120 {
4121         int node;
4122
4123         mem_cgroup_remove_from_trees(mem);
4124         free_css_id(&mem_cgroup_subsys, &mem->css);
4125
4126         for_each_node_state(node, N_POSSIBLE)
4127                 free_mem_cgroup_per_zone_info(mem, node);
4128
4129         free_percpu(mem->stat);
4130         if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4131                 kfree(mem);
4132         else
4133                 vfree(mem);
4134 }
4135
4136 static void mem_cgroup_get(struct mem_cgroup *mem)
4137 {
4138         atomic_inc(&mem->refcnt);
4139 }
4140
4141 static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
4142 {
4143         if (atomic_sub_and_test(count, &mem->refcnt)) {
4144                 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4145                 __mem_cgroup_free(mem);
4146                 if (parent)
4147                         mem_cgroup_put(parent);
4148         }
4149 }
4150
/* Drop a single reference on @mem; see __mem_cgroup_put(). */
static void mem_cgroup_put(struct mem_cgroup *mem)
{
	const int one_ref = 1;

	__mem_cgroup_put(mem, one_ref);
}
4155
4156 /*
4157  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
4158  */
4159 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4160 {
4161         if (!mem->res.parent)
4162                 return NULL;
4163         return mem_cgroup_from_res_counter(mem->res.parent, res);
4164 }
4165
4166 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4167 static void __init enable_swap_cgroup(void)
4168 {
4169         if (!mem_cgroup_disabled() && really_do_swap_account)
4170                 do_swap_account = 1;
4171 }
4172 #else
4173 static void __init enable_swap_cgroup(void)
4174 {
4175 }
4176 #endif
4177
4178 static int mem_cgroup_soft_limit_tree_init(void)
4179 {
4180         struct mem_cgroup_tree_per_node *rtpn;
4181         struct mem_cgroup_tree_per_zone *rtpz;
4182         int tmp, node, zone;
4183
4184         for_each_node_state(node, N_POSSIBLE) {
4185                 tmp = node;
4186                 if (!node_state(node, N_NORMAL_MEMORY))
4187                         tmp = -1;
4188                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4189                 if (!rtpn)
4190                         return 1;
4191
4192                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4193
4194                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4195                         rtpz = &rtpn->rb_tree_per_zone[zone];
4196                         rtpz->rb_root = RB_ROOT;
4197                         spin_lock_init(&rtpz->lock);
4198                 }
4199         }
4200         return 0;
4201 }
4202
4203 static struct cgroup_subsys_state * __ref
4204 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4205 {
4206         struct mem_cgroup *mem, *parent;
4207         long error = -ENOMEM;
4208         int node;
4209
4210         mem = mem_cgroup_alloc();
4211         if (!mem)
4212                 return ERR_PTR(error);
4213
4214         for_each_node_state(node, N_POSSIBLE)
4215                 if (alloc_mem_cgroup_per_zone_info(mem, node))
4216                         goto free_out;
4217
4218         /* root ? */
4219         if (cont->parent == NULL) {
4220                 int cpu;
4221                 enable_swap_cgroup();
4222                 parent = NULL;
4223                 root_mem_cgroup = mem;
4224                 if (mem_cgroup_soft_limit_tree_init())
4225                         goto free_out;
4226                 for_each_possible_cpu(cpu) {
4227                         struct memcg_stock_pcp *stock =
4228                                                 &per_cpu(memcg_stock, cpu);
4229                         INIT_WORK(&stock->work, drain_local_stock);
4230                 }
4231                 hotcpu_notifier(memcg_stock_cpu_callback, 0);
4232         } else {
4233                 parent = mem_cgroup_from_cont(cont->parent);
4234                 mem->use_hierarchy = parent->use_hierarchy;
4235                 mem->oom_kill_disable = parent->oom_kill_disable;
4236         }
4237
4238         if (parent && parent->use_hierarchy) {
4239                 res_counter_init(&mem->res, &parent->res);
4240                 res_counter_init(&mem->memsw, &parent->memsw);
4241                 /*
4242                  * We increment refcnt of the parent to ensure that we can
4243                  * safely access it on res_counter_charge/uncharge.
4244                  * This refcnt will be decremented when freeing this
4245                  * mem_cgroup(see mem_cgroup_put).
4246                  */
4247                 mem_cgroup_get(parent);
4248         } else {
4249                 res_counter_init(&mem->res, NULL);
4250                 res_counter_init(&mem->memsw, NULL);
4251         }
4252         mem->last_scanned_child = 0;
4253         spin_lock_init(&mem->reclaim_param_lock);
4254         INIT_LIST_HEAD(&mem->oom_notify);
4255
4256         if (parent)
4257                 mem->swappiness = get_swappiness(parent);
4258         atomic_set(&mem->refcnt, 1);
4259         mem->move_charge_at_immigrate = 0;
4260         mutex_init(&mem->thresholds_lock);
4261         return &mem->css;
4262 free_out:
4263         __mem_cgroup_free(mem);
4264         root_mem_cgroup = NULL;
4265         return ERR_PTR(error);
4266 }
4267
4268 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4269                                         struct cgroup *cont)
4270 {
4271         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4272
4273         return mem_cgroup_force_empty(mem, false);
4274 }
4275
/*
 * destroy callback: drop one reference on the mem_cgroup attached to @cont.
 * The structure itself is freed once its refcnt reaches zero.
 */
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_put(mem_cgroup_from_cont(cont));
}
4283
4284 static int mem_cgroup_populate(struct cgroup_subsys *ss,
4285                                 struct cgroup *cont)
4286 {
4287         int ret;
4288
4289         ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4290                                 ARRAY_SIZE(mem_cgroup_files));
4291
4292         if (!ret)
4293                 ret = register_memsw_files(cont, ss);
4294         return ret;
4295 }
4296
4297 #ifdef CONFIG_MMU
4298 /* Handlers for move charge at task migration. */
4299 #define PRECHARGE_COUNT_AT_ONCE 256
4300 static int mem_cgroup_do_precharge(unsigned long count)
4301 {
4302         int ret = 0;
4303         int batch_count = PRECHARGE_COUNT_AT_ONCE;
4304         struct mem_cgroup *mem = mc.to;
4305
4306         if (mem_cgroup_is_root(mem)) {
4307                 mc.precharge += count;
4308                 /* we don't need css_get for root */
4309                 return ret;
4310         }
4311         /* try to charge at once */
4312         if (count > 1) {
4313                 struct res_counter *dummy;
4314                 /*
4315                  * "mem" cannot be under rmdir() because we've already checked
4316                  * by cgroup_lock_live_cgroup() that it is not removed and we
4317                  * are still under the same cgroup_mutex. So we can postpone
4318                  * css_get().
4319                  */
4320                 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4321                         goto one_by_one;
4322                 if (do_swap_account && res_counter_charge(&mem->memsw,
4323                                                 PAGE_SIZE * count, &dummy)) {
4324                         res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4325                         goto one_by_one;
4326                 }
4327                 mc.precharge += count;
4328                 return ret;
4329         }
4330 one_by_one:
4331         /* fall back to one by one charge */
4332         while (count--) {
4333                 if (signal_pending(current)) {
4334                         ret = -EINTR;
4335                         break;
4336                 }
4337                 if (!batch_count--) {
4338                         batch_count = PRECHARGE_COUNT_AT_ONCE;
4339                         cond_resched();
4340                 }
4341                 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
4342                 if (ret || !mem)
4343                         /* mem_cgroup_clear_mc() will do uncharge later */
4344                         return -ENOMEM;
4345                 mc.precharge++;
4346         }
4347         return ret;
4348 }
4349
/**
 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
 * @vma: the vma to which the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry will be stored (can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in target->page
 *     with an extra reference taken (callers must drop it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is stored
 *     in target->ent.
 *
 * Called with pte lock held.
 */
4368 union mc_target {
4369         struct page     *page;
4370         swp_entry_t     ent;
4371 };
4372
/* Classification of a pte as returned by is_target_pte_for_mc(). */
enum mc_target_type {
	MC_TARGET_NONE = 0,	/* not a target for move charge */
	MC_TARGET_PAGE = 1,	/* a page is the target */
	MC_TARGET_SWAP = 2,	/* a swap entry is the target */
};
4378
4379 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4380                                                 unsigned long addr, pte_t ptent)
4381 {
4382         struct page *page = vm_normal_page(vma, addr, ptent);
4383
4384         if (!page || !page_mapped(page))
4385                 return NULL;
4386         if (PageAnon(page)) {
4387                 /* we don't move shared anon */
4388                 if (!move_anon() || page_mapcount(page) > 2)
4389                         return NULL;
4390         } else if (!move_file())
4391                 /* we ignore mapcount for file pages */
4392                 return NULL;
4393         if (!get_page_unless_zero(page))
4394                 return NULL;
4395
4396         return page;
4397 }
4398
4399 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4400                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
4401 {
4402         int usage_count;
4403         struct page *page = NULL;
4404         swp_entry_t ent = pte_to_swp_entry(ptent);
4405
4406         if (!move_anon() || non_swap_entry(ent))
4407                 return NULL;
4408         usage_count = mem_cgroup_count_swap_user(ent, &page);
4409         if (usage_count > 1) { /* we don't move shared anon */
4410                 if (page)
4411                         put_page(page);
4412                 return NULL;
4413         }
4414         if (do_swap_account)
4415                 entry->val = ent.val;
4416
4417         return page;
4418 }
4419
4420 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4421                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
4422 {
4423         struct page *page = NULL;
4424         struct inode *inode;
4425         struct address_space *mapping;
4426         pgoff_t pgoff;
4427
4428         if (!vma->vm_file) /* anonymous vma */
4429                 return NULL;
4430         if (!move_file())
4431                 return NULL;
4432
4433         inode = vma->vm_file->f_path.dentry->d_inode;
4434         mapping = vma->vm_file->f_mapping;
4435         if (pte_none(ptent))
4436                 pgoff = linear_page_index(vma, addr);
4437         else /* pte_file(ptent) is true */
4438                 pgoff = pte_to_pgoff(ptent);
4439
4440         /* page is moved even if it's not RSS of this task(page-faulted). */
4441         if (!mapping_cap_swap_backed(mapping)) { /* normal file */
4442                 page = find_get_page(mapping, pgoff);
4443         } else { /* shmem/tmpfs file. we should take account of swap too. */
4444                 swp_entry_t ent;
4445                 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
4446                 if (do_swap_account)
4447                         entry->val = ent.val;
4448         }
4449
4450         return page;
4451 }
4452
4453 static int is_target_pte_for_mc(struct vm_area_struct *vma,
4454                 unsigned long addr, pte_t ptent, union mc_target *target)
4455 {
4456         struct page *page = NULL;
4457         struct page_cgroup *pc;
4458         int ret = 0;
4459         swp_entry_t ent = { .val = 0 };
4460
4461         if (pte_present(ptent))
4462                 page = mc_handle_present_pte(vma, addr, ptent);
4463         else if (is_swap_pte(ptent))
4464                 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4465         else if (pte_none(ptent) || pte_file(ptent))
4466                 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4467
4468         if (!page && !ent.val)
4469                 return 0;
4470         if (page) {
4471                 pc = lookup_page_cgroup(page);
4472                 /*
4473                  * Do only loose check w/o page_cgroup lock.
4474                  * mem_cgroup_move_account() checks the pc is valid or not under
4475                  * the lock.
4476                  */
4477                 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4478                         ret = MC_TARGET_PAGE;
4479                         if (target)
4480                                 target->page = page;
4481                 }
4482                 if (!ret || !target)
4483                         put_page(page);
4484         }
4485         /* There is a swap entry and a page doesn't exist or isn't charged */
4486         if (ent.val && !ret &&
4487                         css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4488                 ret = MC_TARGET_SWAP;
4489                 if (target)
4490                         target->ent = ent;
4491         }
4492         return ret;
4493 }
4494
4495 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4496                                         unsigned long addr, unsigned long end,
4497                                         struct mm_walk *walk)
4498 {
4499         struct vm_area_struct *vma = walk->private;
4500         pte_t *pte;
4501         spinlock_t *ptl;
4502
4503         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4504         for (; addr != end; pte++, addr += PAGE_SIZE)
4505                 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4506                         mc.precharge++; /* increment precharge temporarily */
4507         pte_unmap_unlock(pte - 1, ptl);
4508         cond_resched();
4509
4510         return 0;
4511 }
4512
4513 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4514 {
4515         unsigned long precharge;
4516         struct vm_area_struct *vma;
4517
4518         down_read(&mm->mmap_sem);
4519         for (vma = mm->mmap; vma; vma = vma->vm_next) {
4520                 struct mm_walk mem_cgroup_count_precharge_walk = {
4521                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
4522                         .mm = mm,
4523                         .private = vma,
4524                 };
4525                 if (is_vm_hugetlb_page(vma))
4526                         continue;
4527                 walk_page_range(vma->vm_start, vma->vm_end,
4528                                         &mem_cgroup_count_precharge_walk);
4529         }
4530         up_read(&mm->mmap_sem);
4531
4532         precharge = mc.precharge;
4533         mc.precharge = 0;
4534
4535         return precharge;
4536 }
4537
/*
 * Count the move charge targets in @mm and precharge that many pages.
 * Returns the result of mem_cgroup_do_precharge().
 */
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long wanted = mem_cgroup_count_precharge(mm);

	return mem_cgroup_do_precharge(wanted);
}
4542
4543 static void mem_cgroup_clear_mc(void)
4544 {
4545         struct mem_cgroup *from = mc.from;
4546         struct mem_cgroup *to = mc.to;
4547
4548         /* we must uncharge all the leftover precharges from mc.to */
4549         if (mc.precharge) {
4550                 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4551                 mc.precharge = 0;
4552         }
4553         /*
4554          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4555          * we must uncharge here.
4556          */
4557         if (mc.moved_charge) {
4558                 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4559                 mc.moved_charge = 0;
4560         }
4561         /* we must fixup refcnts and charges */
4562         if (mc.moved_swap) {
4563                 /* uncharge swap account from the old cgroup */
4564                 if (!mem_cgroup_is_root(mc.from))
4565                         res_counter_uncharge(&mc.from->memsw,
4566                                                 PAGE_SIZE * mc.moved_swap);
4567                 __mem_cgroup_put(mc.from, mc.moved_swap);
4568
4569                 if (!mem_cgroup_is_root(mc.to)) {
4570                         /*
4571                          * we charged both to->res and to->memsw, so we should
4572                          * uncharge to->res.
4573                          */
4574                         res_counter_uncharge(&mc.to->res,
4575                                                 PAGE_SIZE * mc.moved_swap);
4576                 }
4577                 /* we've already done mem_cgroup_get(mc.to) */
4578
4579                 mc.moved_swap = 0;
4580         }
4581         spin_lock(&mc.lock);
4582         mc.from = NULL;
4583         mc.to = NULL;
4584         mc.moving_task = NULL;
4585         spin_unlock(&mc.lock);
4586         mem_cgroup_end_move(from);
4587         memcg_oom_recover(from);
4588         memcg_oom_recover(to);
4589         wake_up_all(&mc.waitq);
4590 }
4591
4592 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4593                                 struct cgroup *cgroup,
4594                                 struct task_struct *p,
4595                                 bool threadgroup)
4596 {
4597         int ret = 0;
4598         struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4599
4600         if (mem->move_charge_at_immigrate) {
4601                 struct mm_struct *mm;
4602                 struct mem_cgroup *from = mem_cgroup_from_task(p);
4603
4604                 VM_BUG_ON(from == mem);
4605
4606                 mm = get_task_mm(p);
4607                 if (!mm)
4608                         return 0;
4609                 /* We move charges only when we move a owner of the mm */
4610                 if (mm->owner == p) {
4611                         VM_BUG_ON(mc.from);
4612                         VM_BUG_ON(mc.to);
4613                         VM_BUG_ON(mc.precharge);
4614                         VM_BUG_ON(mc.moved_charge);
4615                         VM_BUG_ON(mc.moved_swap);
4616                         VM_BUG_ON(mc.moving_task);
4617                         mem_cgroup_start_move(from);
4618                         spin_lock(&mc.lock);
4619                         mc.from = from;
4620                         mc.to = mem;
4621                         mc.precharge = 0;
4622                         mc.moved_charge = 0;
4623                         mc.moved_swap = 0;
4624                         mc.moving_task = current;
4625                         spin_unlock(&mc.lock);
4626
4627                         ret = mem_cgroup_precharge_mc(mm);
4628                         if (ret)
4629                                 mem_cgroup_clear_mc();
4630                 }
4631                 mmput(mm);
4632         }
4633         return ret;
4634 }
4635
4636 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4637                                 struct cgroup *cgroup,
4638                                 struct task_struct *p,
4639                                 bool threadgroup)
4640 {
4641         mem_cgroup_clear_mc();
4642 }
4643
4644 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4645                                 unsigned long addr, unsigned long end,
4646                                 struct mm_walk *walk)
4647 {
4648         int ret = 0;
4649         struct vm_area_struct *vma = walk->private;
4650         pte_t *pte;
4651         spinlock_t *ptl;
4652
4653 retry:
4654         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4655         for (; addr != end; addr += PAGE_SIZE) {
4656                 pte_t ptent = *(pte++);
4657                 union mc_target target;
4658                 int type;
4659                 struct page *page;
4660                 struct page_cgroup *pc;
4661                 swp_entry_t ent;
4662
4663                 if (!mc.precharge)
4664                         break;
4665
4666                 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4667                 switch (type) {
4668                 case MC_TARGET_PAGE:
4669                         page = target.page;
4670                         if (isolate_lru_page(page))
4671                                 goto put;
4672                         pc = lookup_page_cgroup(page);
4673                         if (!mem_cgroup_move_account(pc,
4674                                                 mc.from, mc.to, false)) {
4675                                 mc.precharge--;
4676                                 /* we uncharge from mc.from later. */
4677                                 mc.moved_charge++;
4678                         }
4679                         putback_lru_page(page);
4680 put:                    /* is_target_pte_for_mc() gets the page */
4681                         put_page(page);
4682                         break;
4683                 case MC_TARGET_SWAP:
4684                         ent = target.ent;
4685                         if (!mem_cgroup_move_swap_account(ent,
4686                                                 mc.from, mc.to, false)) {
4687                                 mc.precharge--;
4688                                 /* we fixup refcnts and charges later. */
4689                                 mc.moved_swap++;
4690                         }
4691                         break;
4692                 default:
4693                         break;
4694                 }
4695         }
4696         pte_unmap_unlock(pte - 1, ptl);
4697         cond_resched();
4698
4699         if (addr != end) {
4700                 /*
4701                  * We have consumed all precharges we got in can_attach().
4702                  * We try charge one by one, but don't do any additional
4703                  * charges to mc.to if we have failed in charge once in attach()
4704                  * phase.
4705                  */
4706                 ret = mem_cgroup_do_precharge(1);
4707                 if (!ret)
4708                         goto retry;
4709         }
4710
4711         return ret;
4712 }
4713
4714 static void mem_cgroup_move_charge(struct mm_struct *mm)
4715 {
4716         struct vm_area_struct *vma;
4717
4718         lru_add_drain_all();
4719         down_read(&mm->mmap_sem);
4720         for (vma = mm->mmap; vma; vma = vma->vm_next) {
4721                 int ret;
4722                 struct mm_walk mem_cgroup_move_charge_walk = {
4723                         .pmd_entry = mem_cgroup_move_charge_pte_range,
4724                         .mm = mm,
4725                         .private = vma,
4726                 };
4727                 if (is_vm_hugetlb_page(vma))
4728                         continue;
4729                 ret = walk_page_range(vma->vm_start, vma->vm_end,
4730                                                 &mem_cgroup_move_charge_walk);
4731                 if (ret)
4732                         /*
4733                          * means we have consumed all precharges and failed in
4734                          * doing additional charge. Just abandon here.
4735                          */
4736                         break;
4737         }
4738         up_read(&mm->mmap_sem);
4739 }
4740
4741 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4742                                 struct cgroup *cont,
4743                                 struct cgroup *old_cont,
4744                                 struct task_struct *p,
4745                                 bool threadgroup)
4746 {
4747         struct mm_struct *mm;
4748
4749         if (!mc.to)
4750                 /* no need to move charge */
4751                 return;
4752
4753         mm = get_task_mm(p);
4754         if (mm) {
4755                 mem_cgroup_move_charge(mm);
4756                 mmput(mm);
4757         }
4758         mem_cgroup_clear_mc();
4759 }
4760 #else   /* !CONFIG_MMU */
4761 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4762                                 struct cgroup *cgroup,
4763                                 struct task_struct *p,
4764                                 bool threadgroup)
4765 {
4766         return 0;
4767 }
4768 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4769                                 struct cgroup *cgroup,
4770                                 struct task_struct *p,
4771                                 bool threadgroup)
4772 {
4773 }
4774 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4775                                 struct cgroup *cont,
4776                                 struct cgroup *old_cont,
4777                                 struct task_struct *p,
4778                                 bool threadgroup)
4779 {
4780 }
4781 #endif
4782
4783 struct cgroup_subsys mem_cgroup_subsys = {
4784         .name = "memory",
4785         .subsys_id = mem_cgroup_subsys_id,
4786         .create = mem_cgroup_create,
4787         .pre_destroy = mem_cgroup_pre_destroy,
4788         .destroy = mem_cgroup_destroy,
4789         .populate = mem_cgroup_populate,
4790         .can_attach = mem_cgroup_can_attach,
4791         .cancel_attach = mem_cgroup_cancel_attach,
4792         .attach = mem_cgroup_move_task,
4793         .early_init = 0,
4794         .use_id = 1,
4795 };
4796
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

/*
 * Handler for the "noswapaccount" boot parameter: clears
 * really_do_swap_account so that enable_swap_cgroup() leaves swap
 * accounting off. The argument string is ignored. Always returns 1.
 */
static int __init disable_swap_account(char *s)
{
	really_do_swap_account = 0;

	return 1;
}
__setup("noswapaccount", disable_swap_account);
#endif