memcg: move charges of anonymous swap
Daisuke Nishimura [Wed, 10 Mar 2010 23:22:17 +0000 (15:22 -0800)]
This patch is another core part of this move-charge-at-task-migration
feature.  It enables moving charges of anonymous swaps.

To move the charge of swap, we need to exchange swap_cgroup's record.

In current implementation, swap_cgroup's record is protected by:

  - page lock: if the entry is on swap cache.
  - swap_lock: if the entry is not on swap cache.

This works well in usual swap-in/out activity.

But this behavior makes the feature of moving swap charge check many
conditions in order to exchange swap_cgroup's record safely.

So I changed the modification of swap_cgroup's record (swap_cgroup_record())
to use xchg, and defined a new function to cmpxchg swap_cgroup's record.

This patch also enables moving the charge of non-pte_present but not yet
uncharged swap caches, which can exist on the swap-out path, by getting the
target pages via find_get_page() as do_mincore() does.

[kosaki.motohiro@jp.fujitsu.com: fix ia64 build]
[akpm@linux-foundation.org: fix typos]
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Documentation/cgroups/memory.txt
include/linux/page_cgroup.h
include/linux/swap.h
mm/memcontrol.c
mm/page_cgroup.c
mm/swapfile.c

index e726fb0..1f59a1a 100644 (file)
@@ -420,6 +420,8 @@ NOTE2: It is recommended to set the soft limit always below the hard limit,
 
 Users can move charges associated with a task along with task migration, that
 is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
 
 8.1 Interface
 
index b0e4eb1..30b0813 100644 (file)
@@ -118,6 +118,8 @@ static inline void __init page_cgroup_init_flatmem(void)
 #include <linux/swap.h>
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+                                       unsigned short old, unsigned short new);
 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
 extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
 extern int swap_cgroup_swapon(int type, unsigned long max_pages);
index a2602a8..1f59d93 100644 (file)
@@ -355,6 +355,7 @@ static inline void disable_swap_token(void)
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
+extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep);
 #else
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
@@ -485,6 +486,14 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
 {
 }
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+static inline int
+mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
+{
+       return 0;
+}
+#endif
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
index 589084f..e883198 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
@@ -2270,6 +2271,54 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
        }
        rcu_read_unlock();
 }
+
+/**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from:  mem_cgroup which the entry is moved from
+ * @to:  mem_cgroup which the entry is moved to
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from (checked and replaced by one cmpxchg).
+ *
+ * Returns 0 on success, -EINVAL on failure.
+ *
+ * The caller must have charged to @to, that is, called res_counter_charge()
+ * for both res and memsw, and called css_get().
+ */
+static int mem_cgroup_move_swap_account(swp_entry_t entry,
+                               struct mem_cgroup *from, struct mem_cgroup *to)
+{
+       unsigned short old_id, new_id;
+
+       old_id = css_id(&from->css);
+       new_id = css_id(&to->css);
+
+       if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+               if (!mem_cgroup_is_root(from))
+                       res_counter_uncharge(&from->memsw, PAGE_SIZE);
+               mem_cgroup_swap_statistics(from, false);
+               mem_cgroup_put(from);
+               /*
+                * The caller charged both to->res and to->memsw; only memsw
+                * is moved here (see @from above), so give to->res back.
+                */
+               if (!mem_cgroup_is_root(to))
+                       res_counter_uncharge(&to->res, PAGE_SIZE);
+               mem_cgroup_swap_statistics(to, true);
+               mem_cgroup_get(to);
+               css_put(&to->css);
+
+               return 0;
+       }
+       return -EINVAL;
+}
+#else
+static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
+                               struct mem_cgroup *from, struct mem_cgroup *to)
+{
+       return -EINVAL;
+}
 #endif
 
 /*
@@ -2949,6 +2998,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
        return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
 }
 
+#ifdef CONFIG_MMU
 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
                                        struct cftype *cft, u64 val)
 {
@@ -2967,6 +3017,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 
        return 0;
 }
+#else
+static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
+                                       struct cftype *cft, u64 val)
+{
+       return -ENOSYS;
+}
+#endif
 
 
 /* For read statistics */
@@ -3489,6 +3546,7 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
        return ret;
 }
 
+#ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
 #define PRECHARGE_COUNT_AT_ONCE        256
 static int mem_cgroup_do_precharge(unsigned long count)
@@ -3544,77 +3602,124 @@ one_by_one:
        }
        return ret;
 }
+#else  /* !CONFIG_MMU */
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+                               struct cgroup *cgroup,
+                               struct task_struct *p,
+                               bool threadgroup)
+{
+       return 0;
+}
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+                               struct cgroup *cgroup,
+                               struct task_struct *p,
+                               bool threadgroup)
+{
+}
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+                               struct cgroup *cont,
+                               struct cgroup *old_cont,
+                               struct task_struct *p,
+                               bool threadgroup)
+{
+}
+#endif
 
 /**
  * is_target_pte_for_mc - check a pte whether it is valid for move charge
  * @vma: the vma the pte to be checked belongs
  * @addr: the address corresponding to the pte to be checked
  * @ptent: the pte to be checked
- * @target: the pointer the target page will be stored(can be NULL)
+ * @target: where the target page or swap entry will be stored (can be NULL)
  *
  * Returns
  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
  *     move charge. if @target is not NULL, the page is stored in target->page
  *     with extra refcnt got(Callers should handle it).
+ *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ *     target for charge migration. if @target is not NULL, the entry is stored
+ *     in target->ent.
  *
  * Called with pte lock held.
  */
-/* We add a new member later. */
 union mc_target {
        struct page     *page;
+       swp_entry_t     ent;
 };
 
-/* We add a new type later. */
 enum mc_target_type {
        MC_TARGET_NONE, /* not used */
        MC_TARGET_PAGE,
+       MC_TARGET_SWAP,
 };
 
 static int is_target_pte_for_mc(struct vm_area_struct *vma,
                unsigned long addr, pte_t ptent, union mc_target *target)
 {
-       struct page *page;
+       struct page *page = NULL;
        struct page_cgroup *pc;
        int ret = 0;
+       swp_entry_t ent = { .val = 0 };
+       int usage_count = 0;
        bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
                                        &mc.to->move_charge_at_immigrate);
 
-       if (!pte_present(ptent))
-               return 0;
-
-       page = vm_normal_page(vma, addr, ptent);
-       if (!page || !page_mapped(page))
-               return 0;
-       /*
-        * TODO: We don't move charges of file(including shmem/tmpfs) pages for
-        * now.
-        */
-       if (!move_anon || !PageAnon(page))
-               return 0;
-       /*
-        * TODO: We don't move charges of shared(used by multiple processes)
-        * pages for now.
-        */
-       if (page_mapcount(page) > 1)
-               return 0;
-       if (!get_page_unless_zero(page))
+       if (!pte_present(ptent)) {
+               /* TODO: handle swap of shmem/tmpfs */
+               if (pte_none(ptent) || pte_file(ptent))
+                       return 0;
+               else if (is_swap_pte(ptent)) {
+                       ent = pte_to_swp_entry(ptent);
+                       if (!move_anon || non_swap_entry(ent))
+                               return 0;
+                       usage_count = mem_cgroup_count_swap_user(ent, &page);
+               }
+       } else {
+               page = vm_normal_page(vma, addr, ptent);
+               if (!page || !page_mapped(page))
+                       return 0;
+               /*
+                * TODO: We don't move charges of file(including shmem/tmpfs)
+                * pages for now.
+                */
+               if (!move_anon || !PageAnon(page))
+                       return 0;
+               if (!get_page_unless_zero(page))
+                       return 0;
+               usage_count = page_mapcount(page);
+       }
+       if (usage_count > 1) {
+               /*
+                * TODO: We don't move charges of shared(used by multiple
+                * processes) pages for now.
+                */
+               if (page)
+                       put_page(page);
                return 0;
-
-       pc = lookup_page_cgroup(page);
-       /*
-        * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account()
-        * checks the pc is valid or not under the lock.
-        */
-       if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
-               ret = MC_TARGET_PAGE;
+       }
+       if (page) {
+               pc = lookup_page_cgroup(page);
+               /*
+                * Do only loose check w/o page_cgroup lock.
+                * mem_cgroup_move_account() checks the pc is valid or not under
+                * the lock.
+                */
+               if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+                       ret = MC_TARGET_PAGE;
+                       if (target)
+                               target->page = page;
+               }
+               if (!ret || !target)
+                       put_page(page);
+       }
+       /* no page was selected as the target (!ret): check the swap entry */
+       if (ent.val && do_swap_account && !ret &&
+                       css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
+               ret = MC_TARGET_SWAP;
                if (target)
-                       target->page = page;
+                       target->ent = ent;
        }
-
-       if (!ret || !target)
-               put_page(page);
-
        return ret;
 }
 
@@ -3754,6 +3859,7 @@ retry:
                int type;
                struct page *page;
                struct page_cgroup *pc;
+               swp_entry_t ent;
 
                if (!mc.precharge)
                        break;
@@ -3775,6 +3881,11 @@ retry:
 put:                   /* is_target_pte_for_mc() gets the page */
                        put_page(page);
                        break;
+               case MC_TARGET_SWAP:
+                       ent = target.ent;
+                       if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to))
+                               mc.precharge--;
+                       break;
                default:
                        break;
                }
index 3d535d5..3dd8853 100644 (file)
@@ -335,6 +335,37 @@ not_enough_page:
 }
 
 /**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: id expected to be recorded for @ent now
+ * @new: id to record if the current one is @old
+ *
+ * Returns @old on success, 0 on failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+                                       unsigned short old, unsigned short new)
+{
+       int type = swp_type(ent);
+       unsigned long offset = swp_offset(ent);
+       unsigned long idx = offset / SC_PER_PAGE;
+       unsigned long pos = offset & SC_POS_MASK;
+       struct swap_cgroup_ctrl *ctrl;
+       struct page *mappage;
+       struct swap_cgroup *sc;
+
+       ctrl = &swap_cgroup_ctrl[type];
+
+       mappage = ctrl->map[idx];
+       sc = page_address(mappage);
+       sc += pos;
+       if (cmpxchg(&sc->id, old, new) == old)
+               return old;
+       else
+               return 0;
+}
+
+/**
  * swap_cgroup_record - record mem_cgroup for this swp_entry.
  * @ent: swap entry to be recorded into
  * @mem: mem_cgroup to be recorded
@@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
-       old = sc->id;
-       sc->id = id;
+       old = xchg(&sc->id, id);
 
        return old;
 }
index 84374d8..6cd0a8f 100644 (file)
@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
        return p != NULL;
 }
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_count_swap_user - count the users of a swap entry
+ * @ent: the swap entry to be checked
+ * @pagep: where the swap cache page of the entry (or NULL) is stored
+ *
+ * Returns the number of users of the swap entry. The number is valid only
+ * for swaps of anonymous pages.
+ * If the entry is found on swap cache, the page is stored to @pagep with
+ * its refcount incremented; otherwise NULL is stored to @pagep.
+ */
+int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
+{
+       struct page *page;
+       struct swap_info_struct *p;
+       int count = 0;
+
+       page = find_get_page(&swapper_space, ent.val);
+       if (page)
+               count += page_mapcount(page);
+       p = swap_info_get(ent);
+       if (p) {
+               count += swap_count(p->swap_map[swp_offset(ent)]);
+               spin_unlock(&swap_lock);
+       }
+
+       *pagep = page;
+       return count;
+}
+#endif
+
 #ifdef CONFIG_HIBERNATION
 /*
  * Find the swap type that corresponds to given device (if any).