#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
+#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys;
+static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup.
 */
struct list_head active_list;
struct list_head inactive_list;
+ /*
+ * spin_lock to protect the per cgroup LRU
+ */
+ spinlock_t lru_lock;
+ unsigned long control_type; /* control RSS or RSS+Pagecache */
};
/*
/* mapped and cached states */
};
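+/*
+ * Possible values for mem_cgroup->control_type; selected through the
+ * "control_type" cgroup file (see mem_control_type_write()).
+ */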
+enum {
+ MEM_CGROUP_TYPE_UNSPEC = 0,
+ MEM_CGROUP_TYPE_MAPPED,
+ MEM_CGROUP_TYPE_CACHED,
+ MEM_CGROUP_TYPE_ALL,
+ MEM_CGROUP_TYPE_MAX,
+};
+
+static struct mem_cgroup init_mem_cgroup;
static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
(page->page_cgroup & ~PAGE_CGROUP_LOCK);
}
-void __always_inline lock_page_cgroup(struct page *page)
+static void __always_inline lock_page_cgroup(struct page *page)
{
bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
VM_BUG_ON(!page_cgroup_locked(page));
}
-void __always_inline unlock_page_cgroup(struct page *page)
+static void __always_inline unlock_page_cgroup(struct page *page)
{
bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
/*
+ * Tie a new page_cgroup to a struct page under lock_page_cgroup().
+ * This can fail if the page is already tied to another page_cgroup.
+ * Returns 0 on success, non-zero otherwise.
+ */
+static inline int
+page_cgroup_assign_new_page_cgroup(struct page *page, struct page_cgroup *pc)
+{
+ int ret = 0;
+
+ lock_page_cgroup(page);
+ if (!page_get_page_cgroup(page))
+ page_assign_page_cgroup(page, pc);
+ else /* A page is tied to other pc. */
+ ret = 1;
+ unlock_page_cgroup(page);
+ return ret;
+}
+
+/*
+ * Clear the page->page_cgroup member under lock_page_cgroup().
+ * If the given "pc" value is different from the current page->page_cgroup,
+ * page->page_cgroup is left untouched.
+ * Returns the value of page->page_cgroup at the time the lock was taken.
+ * A caller can detect whether clearing succeeded by checking that
+ * clear_page_cgroup(page, pc) == pc.
+ */
+
+static inline struct page_cgroup *
+clear_page_cgroup(struct page *page, struct page_cgroup *pc)
+{
+ struct page_cgroup *ret;
+ /* lock and clear */
+ lock_page_cgroup(page);
+ ret = page_get_page_cgroup(page);
+ if (likely(ret == pc))
+ page_assign_page_cgroup(page, NULL);
+ unlock_page_cgroup(page);
+ return ret;
+}
+
+
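+/*
+ * Move a page_cgroup between the cgroup's active and inactive LRU lists.
+ * The caller must hold mem_cgroup->lru_lock.
+ */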
+static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+ if (active)
+ list_move(&pc->lru, &pc->mem_cgroup->active_list);
+ else
+ list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
+}
+
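+/*
+ * Returns non-zero if @task's mm currently belongs to the memory cgroup @mem.
+ */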
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
+{
+ int ret;
+
+ task_lock(task);
+ ret = task->mm && mm_cgroup(task->mm) == mem;
+ task_unlock(task);
+ return ret;
+}
+
+/*
+ * This routine assumes that the appropriate zone's lru lock is already held
+ */
+void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+	struct mem_cgroup *mem;
+
+	if (!pc)
+ return;
+
+ mem = pc->mem_cgroup;
+
+ spin_lock(&mem->lru_lock);
+ __mem_cgroup_move_lists(pc, active);
+ spin_unlock(&mem->lru_lock);
+}
+
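+/*
+ * Scan up to @nr_to_scan entries from the tail of the cgroup's active or
+ * inactive list and isolate the pages that belong to zone @z onto @dst.
+ * Returns the number of pages isolated; the number of entries scanned is
+ * reported through @scanned.
+ */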
+unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
+ struct list_head *dst,
+ unsigned long *scanned, int order,
+ int mode, struct zone *z,
+ struct mem_cgroup *mem_cont,
+ int active)
+{
+ unsigned long nr_taken = 0;
+ struct page *page;
+ unsigned long scan;
+ LIST_HEAD(pc_list);
+ struct list_head *src;
+ struct page_cgroup *pc, *tmp;
+
+ if (active)
+ src = &mem_cont->active_list;
+ else
+ src = &mem_cont->inactive_list;
+
+ spin_lock(&mem_cont->lru_lock);
+ scan = 0;
+ list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
+		if (scan++ >= nr_to_scan)
+ break;
+		VM_BUG_ON(!pc);
+		page = pc->page;
+
+ if (unlikely(!PageLRU(page))) {
+ scan--;
+ continue;
+ }
+
+ if (PageActive(page) && !active) {
+ __mem_cgroup_move_lists(pc, true);
+ scan--;
+ continue;
+ }
+ if (!PageActive(page) && active) {
+ __mem_cgroup_move_lists(pc, false);
+ scan--;
+ continue;
+ }
+
+ /*
+ * Reclaim, per zone
+ * TODO: make the active/inactive lists per zone
+ */
+ if (page_zone(page) != z)
+ continue;
+
+ /*
+ * Check if the meta page went away from under us
+ */
+ if (!list_empty(&pc->lru))
+ list_move(&pc->lru, &pc_list);
+ else
+ continue;
+
+ if (__isolate_lru_page(page, mode) == 0) {
+ list_move(&page->lru, dst);
+ nr_taken++;
+ }
+ }
+
+ list_splice(&pc_list, src);
+ spin_unlock(&mem_cont->lru_lock);
+
+ *scanned = scan;
+ return nr_taken;
+}
+
+/*
* Charge the memory controller for page usage.
* Return
* 0 if the charge was successful
* < 0 if the cgroup is over its limit
*/
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
struct mem_cgroup *mem;
- struct page_cgroup *pc, *race_pc;
+ struct page_cgroup *pc;
+ unsigned long flags;
+ unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
/*
* Should page_cgroup's go to their own slab?
* to see if the cgroup page already has a page_cgroup associated
* with it
*/
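+	/*
+	 * Retry until we either take a reference on an existing page_cgroup
+	 * or successfully install a new one.
+	 */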
+retry:
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
/*
* The page_cgroup exists and the page has already been accounted
*/
if (pc) {
- atomic_inc(&pc->ref_cnt);
- goto done;
+ if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
+			/* the page is being uncharged; retry */
+ unlock_page_cgroup(page);
+ cpu_relax();
+ goto retry;
+ } else {
+ unlock_page_cgroup(page);
+ goto done;
+ }
}
unlock_page_cgroup(page);
- pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+ pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
if (pc == NULL)
goto err;
* If we created the page_cgroup, we should free it on exceeding
* the cgroup limit.
*/
- if (res_counter_charge(&mem->res, 1)) {
+ while (res_counter_charge(&mem->res, PAGE_SIZE)) {
+		bool is_atomic = !(gfp_mask & __GFP_WAIT);
+		/*
+		 * We cannot reclaim if the caller cannot sleep
+		 * (e.g. GFP_ATOMIC), fail the charge.
+		 */
+ if (is_atomic)
+ goto noreclaim;
+
+ if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+ continue;
+
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (res_counter_check_under_limit(&mem->res))
+ continue;
+ /*
+		 * Since we control both RSS and the page cache, reclaim can
+		 * look like a zero-sum game: most of what we reclaim is RSS
+		 * pushed into the swap cache, and those pages eventually end
+		 * up back on our list.  Hence we give ourselves a few chances
+		 * before we fail.
+ */
+ else if (nr_retries--) {
+ congestion_wait(WRITE, HZ/10);
+ continue;
+ }
+noreclaim:
css_put(&mem->css);
+ if (!is_atomic)
+ mem_cgroup_out_of_memory(mem, GFP_KERNEL);
goto free_pc;
}
- lock_page_cgroup(page);
- /*
- * Check if somebody else beat us to allocating the page_cgroup
- */
- race_pc = page_get_page_cgroup(page);
- if (race_pc) {
- kfree(pc);
- pc = race_pc;
- atomic_inc(&pc->ref_cnt);
- res_counter_uncharge(&mem->res, 1);
- css_put(&mem->css);
- goto done;
- }
-
atomic_set(&pc->ref_cnt, 1);
pc->mem_cgroup = mem;
pc->page = page;
- page_assign_page_cgroup(page, pc);
+ if (page_cgroup_assign_new_page_cgroup(page, pc)) {
+ /*
+		 * Another charge has already been added to this page.  On
+		 * retry we take lock_page_cgroup(page) again, read
+		 * page->page_cgroup and increment its refcnt, so simply
+		 * retrying is fine.
+ */
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+ kfree(pc);
+ goto retry;
+ }
+
+ spin_lock_irqsave(&mem->lru_lock, flags);
+ list_add(&pc->lru, &mem->active_list);
+ spin_unlock_irqrestore(&mem->lru_lock, flags);
done:
- unlock_page_cgroup(page);
return 0;
free_pc:
kfree(pc);
- return -ENOMEM;
err:
- unlock_page_cgroup(page);
return -ENOMEM;
}
/*
+ * Decide whether cached pages should be charged at all, based on the
+ * cgroup's control_type.
+ */
+int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
+{
+	struct mem_cgroup *mem;
+	unsigned long control_type;
+
+	if (!mm)
+		mm = &init_mm;
+
+	rcu_read_lock();
+	mem = rcu_dereference(mm->mem_cgroup);
+	control_type = mem->control_type;
+	rcu_read_unlock();
+
+	if (control_type == MEM_CGROUP_TYPE_ALL)
+		return mem_cgroup_charge(page, mm, gfp_mask);
+	return 0;
+}
+
+/*
* Uncharging is always a welcome operation, we never complain, simply
* uncharge.
*/
{
struct mem_cgroup *mem;
struct page *page;
+ unsigned long flags;
+ /*
+	 * This also handles the case where a page was never charged at all,
+	 * e.g. because the control_type was switched while it was in use.
+ */
if (!pc)
return;
if (atomic_dec_and_test(&pc->ref_cnt)) {
page = pc->page;
- lock_page_cgroup(page);
- mem = pc->mem_cgroup;
- css_put(&mem->css);
- page_assign_page_cgroup(page, NULL);
- unlock_page_cgroup(page);
- res_counter_uncharge(&mem->res, 1);
- kfree(pc);
+ /*
+ * get page->cgroup and clear it under lock.
+ */
+ if (clear_page_cgroup(page, pc) == pc) {
+ mem = pc->mem_cgroup;
+ css_put(&mem->css);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ spin_lock_irqsave(&mem->lru_lock, flags);
+ list_del_init(&pc->lru);
+ spin_unlock_irqrestore(&mem->lru_lock, flags);
+ kfree(pc);
+ } else {
+ /*
+			 * Note: this will be removed once the force-empty
+			 * patch is applied; just warn for now.
+			 */
+			printk(KERN_ERR "Race in mem_cgroup_uncharge()?\n");
+ dump_stack();
+ }
}
}
+/*
+ * Returns non-zero if the page (about to be migrated) has a valid
+ * page_cgroup attached; the page_cgroup's refcnt is incremented.
+ */
+int mem_cgroup_prepare_migration(struct page *page)
+{
+ struct page_cgroup *pc;
+ int ret = 0;
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ if (pc && atomic_inc_not_zero(&pc->ref_cnt))
+ ret = 1;
+ unlock_page_cgroup(page);
+ return ret;
+}
+
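+/*
+ * Drop the reference taken by mem_cgroup_prepare_migration() once the
+ * migration attempt for @page has completed.
+ */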
+void mem_cgroup_end_migration(struct page *page)
+{
+ struct page_cgroup *pc = page_get_page_cgroup(page);
+ mem_cgroup_uncharge(pc);
+}
+/*
+ * We know both *page* and *newpage* are now not on the LRU and PG_locked.
+ * There is no race with the uncharge() routines because the page_cgroup
+ * for *page* holds an extra reference taken by mem_cgroup_prepare_migration().
+ */
+
+void mem_cgroup_page_migration(struct page *page, struct page *newpage)
+{
+ struct page_cgroup *pc;
+retry:
+ pc = page_get_page_cgroup(page);
+ if (!pc)
+ return;
+ if (clear_page_cgroup(page, pc) != pc)
+ goto retry;
+ pc->page = newpage;
+ lock_page_cgroup(newpage);
+ page_assign_page_cgroup(newpage, pc);
+ unlock_page_cgroup(newpage);
+ return;
+}
+
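+/*
+ * Parse a human-readable size (memparse accepts K/M/G suffixes) and round
+ * it up to a multiple of PAGE_SIZE; used as the write strategy for the
+ * res_counter files.
+ */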
+int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+{
+ *tmp = memparse(buf, &buf);
+ if (*buf != '\0')
+ return -EINVAL;
+
+ /*
+	 * Round the value up to a multiple of PAGE_SIZE.
+ */
+ *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
+ return 0;
+}
+
-static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
-			struct file *file, char __user *userbuf, size_t nbytes,
-			loff_t *ppos)
+static ssize_t mem_cgroup_read(struct cgroup *cont,
+ struct cftype *cft, struct file *file,
+ char __user *userbuf, size_t nbytes, loff_t *ppos)
{
return res_counter_read(&mem_cgroup_from_cont(cont)->res,
- cft->private, userbuf, nbytes, ppos);
+ cft->private, userbuf, nbytes, ppos,
+ NULL);
}
static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
size_t nbytes, loff_t *ppos)
{
return res_counter_write(&mem_cgroup_from_cont(cont)->res,
- cft->private, userbuf, nbytes, ppos);
+ cft->private, userbuf, nbytes, ppos,
+ mem_cgroup_write_strategy);
+}
+
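+/*
+ * Set mem_cgroup->control_type from a decimal value written to the
+ * "control_type" file; only values strictly between MEM_CGROUP_TYPE_UNSPEC
+ * and MEM_CGROUP_TYPE_MAX are accepted.
+ */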
+static ssize_t mem_control_type_write(struct cgroup *cont,
+ struct cftype *cft, struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *pos)
+{
+ int ret;
+ char *buf, *end;
+ unsigned long tmp;
+ struct mem_cgroup *mem;
+
+ mem = mem_cgroup_from_cont(cont);
+ buf = kmalloc(nbytes + 1, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (buf == NULL)
+ goto out;
+
+ buf[nbytes] = 0;
+ ret = -EFAULT;
+ if (copy_from_user(buf, userbuf, nbytes))
+ goto out_free;
+
+ ret = -EINVAL;
+ tmp = simple_strtoul(buf, &end, 10);
+ if (*end != '\0')
+ goto out_free;
+
+ if (tmp <= MEM_CGROUP_TYPE_UNSPEC || tmp >= MEM_CGROUP_TYPE_MAX)
+ goto out_free;
+
+ mem->control_type = tmp;
+ ret = nbytes;
+out_free:
+ kfree(buf);
+out:
+ return ret;
+}
+
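+/*
+ * Report the cgroup's current control_type as a decimal value.
+ */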
+static ssize_t mem_control_type_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct file *file, char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ unsigned long val;
+ char buf[64], *s;
+ struct mem_cgroup *mem;
+
+ mem = mem_cgroup_from_cont(cont);
+ s = buf;
+ val = mem->control_type;
+ s += sprintf(s, "%lu\n", val);
+ return simple_read_from_buffer((void __user *)userbuf, nbytes,
+ ppos, buf, s - buf);
}
static struct cftype mem_cgroup_files[] = {
{
- .name = "usage",
+ .name = "usage_in_bytes",
.private = RES_USAGE,
.read = mem_cgroup_read,
},
{
- .name = "limit",
+ .name = "limit_in_bytes",
.private = RES_LIMIT,
.write = mem_cgroup_write,
.read = mem_cgroup_read,
.private = RES_FAILCNT,
.read = mem_cgroup_read,
},
+ {
+ .name = "control_type",
+ .write = mem_control_type_write,
+ .read = mem_control_type_read,
+ },
};
-static struct mem_cgroup init_mem_cgroup;
res_counter_init(&mem->res);
INIT_LIST_HEAD(&mem->active_list);
INIT_LIST_HEAD(&mem->inactive_list);
+ spin_lock_init(&mem->lru_lock);
+ mem->control_type = MEM_CGROUP_TYPE_ALL;
return &mem->css;
}
ARRAY_SIZE(mem_cgroup_files));
}
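+/*
+ * Called when a task is moved between cgroups: rebind the mm's memory
+ * cgroup to the destination, provided the task is the thread group leader
+ * that owns the mm.
+ */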
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ struct mem_cgroup *mem, *old_mem;
+
+ mm = get_task_mm(p);
+ if (mm == NULL)
+ return;
+
+ mem = mem_cgroup_from_cont(cont);
+ old_mem = mem_cgroup_from_cont(old_cont);
+
+ if (mem == old_mem)
+ goto out;
+
+ /*
+ * Only thread group leaders are allowed to migrate, the mm_struct is
+ * in effect owned by the leader
+ */
+ if (p->tgid != p->pid)
+ goto out;
+
+ css_get(&mem->css);
+ rcu_assign_pointer(mm->mem_cgroup, mem);
+ css_put(&old_mem->css);
+
+out:
+ mmput(mm);
+ return;
+}
+
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,
.create = mem_cgroup_create,
.destroy = mem_cgroup_destroy,
.populate = mem_cgroup_populate,
+ .attach = mem_cgroup_move_task,
.early_init = 1,
};