wake up s_wait_unfrozen when ->freeze_fs fails
[linux-2.6.git] / block / blk-cgroup.c
index 7762987..fa8f263 100644 (file)
@@ -31,9 +31,9 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
                                                  struct cgroup *);
 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
-                             struct task_struct *, bool);
+                             struct cgroup_taskset *);
 static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
-                          struct cgroup *, struct task_struct *, bool);
+                          struct cgroup_taskset *);
 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
 
@@ -114,6 +114,13 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 }
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 
+struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
+{
+       return container_of(task_subsys_state(tsk, blkio_subsys_id),
+                           struct blkio_cgroup, css);
+}
+EXPORT_SYMBOL_GPL(task_blkio_cgroup);
+
 static inline void
 blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
 {
@@ -124,7 +131,54 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
                if (blkiop->plid != blkg->plid)
                        continue;
                if (blkiop->ops.blkio_update_group_weight_fn)
-                       blkiop->ops.blkio_update_group_weight_fn(blkg, weight);
+                       blkiop->ops.blkio_update_group_weight_fn(blkg->key,
+                                                       blkg, weight);
+       }
+}
+
+static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
+                               int fileid)
+{
+       struct blkio_policy_type *blkiop;
+
+       list_for_each_entry(blkiop, &blkio_list, list) {
+
+               /* If this policy does not own the blkg, do not send updates */
+               if (blkiop->plid != blkg->plid)
+                       continue;
+               /* Call the hook matching fileid, if the policy provides it */
+               if (fileid == BLKIO_THROTL_read_bps_device
+                   && blkiop->ops.blkio_update_group_read_bps_fn)
+                       blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
+                                                               blkg, bps);
+
+               if (fileid == BLKIO_THROTL_write_bps_device
+                   && blkiop->ops.blkio_update_group_write_bps_fn)
+                       blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
+                                                               blkg, bps);
+       }
+}
+
+static inline void blkio_update_group_iops(struct blkio_group *blkg,
+                       unsigned int iops, int fileid)
+{
+       struct blkio_policy_type *blkiop;
+
+       list_for_each_entry(blkiop, &blkio_list, list) {
+
+               /* If this policy does not own the blkg, do not send updates */
+               if (blkiop->plid != blkg->plid)
+                       continue;
+               /* Call the hook matching fileid, if the policy provides it */
+               if (fileid == BLKIO_THROTL_read_iops_device
+                   && blkiop->ops.blkio_update_group_read_iops_fn)
+                       blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
+                                                               blkg, iops);
+
+               if (fileid == BLKIO_THROTL_write_iops_device
+                   && blkiop->ops.blkio_update_group_write_iops_fn)
+                       blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
+                                                               blkg,iops);
        }
 }
 
@@ -324,30 +378,47 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 
-void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
+                               unsigned long unaccounted_time)
 {
        unsigned long flags;
 
        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkg->stats.time += time;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       blkg->stats.unaccounted_time += unaccounted_time;
+#endif
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 
+/*
+ * Count one IO of @bytes in the per cpu stats of @blkg. Must be called under
+ * rcu read lock or queue lock to make sure the blkg pointer is valid.
+ */
 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
                                 uint64_t bytes, bool direction, bool sync)
 {
-       struct blkio_group_stats *stats;
+       struct blkio_group_stats_cpu *stats_cpu;
        unsigned long flags;
 
-       spin_lock_irqsave(&blkg->stats_lock, flags);
-       stats = &blkg->stats;
-       stats->sectors += bytes >> 9;
-       blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
-                       sync);
-       blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
-                       direction, sync);
-       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+       /*
+        * Interrupts are disabled to provide mutual exclusion between two
+        * writes on the same cpu. This is probably not needed on 64bit;
+        * that case is not optimized yet.
+        */
+       local_irq_save(flags);
+
+       stats_cpu = this_cpu_ptr(blkg->stats_cpu);
+
+       u64_stats_update_begin(&stats_cpu->syncp);
+       stats_cpu->sectors += bytes >> 9;
+       blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
+                       1, direction, sync);
+       blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
+                       bytes, direction, sync);
+       u64_stats_update_end(&stats_cpu->syncp);
+       local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
 
@@ -370,18 +441,44 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 
+/* Count one merged IO in the per cpu stats of @blkg. */
 void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
                                         bool sync)
 {
+       struct blkio_group_stats_cpu *stats_cpu;
        unsigned long flags;
 
-       spin_lock_irqsave(&blkg->stats_lock, flags);
-       blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
-                       sync);
-       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+       /*
+        * Interrupts are disabled to provide mutual exclusion between two
+        * writes on the same cpu. This is probably not needed on 64bit;
+        * that case is not optimized yet.
+        */
+       local_irq_save(flags);
+
+       stats_cpu = this_cpu_ptr(blkg->stats_cpu);
+
+       u64_stats_update_begin(&stats_cpu->syncp);
+       blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
+                               direction, sync);
+       u64_stats_update_end(&stats_cpu->syncp);
+       local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
+/*
+ * Allocate the per cpu stats for @blkg. Should be called from sleepable
+ * context as alloc_percpu() requires that. Returns 0 or -ENOMEM.
+ */
+int blkio_alloc_blkg_stats(struct blkio_group *blkg)
+{
+       /* Allocate memory for per cpu stats */
+       blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+       if (!blkg->stats_cpu)
+               return -ENOMEM;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
+
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
                struct blkio_group *blkg, void *key, dev_t dev,
                enum blkio_policy_id plid)
@@ -452,6 +549,30 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
 }
 EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
 
+static void blkio_reset_stats_cpu(struct blkio_group *blkg)
+{
+       struct blkio_group_stats_cpu *stats_cpu;
+       int i, j, k;    /* cpu, per cpu stat type, stat sub type */
+       /*
+        * Note: On 64 bit arch this should not be an issue. On 32bit arch
+        * there is a possibility of some inconsistent value being returned,
+        * as a 64bit update on 32bit is non atomic. Taking care of this
+        * corner case makes the code very complicated (sending IPIs to
+        * cpus, taking care of stats of offline cpus etc).
+        *
+        * Resetting stats is more of a debug feature anyway and this is a
+        * corner case, so the code is kept simple until and unless this
+        * becomes a real issue.
+        */
+       for_each_possible_cpu(i) {
+               stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
+               stats_cpu->sectors = 0;
+               for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
+                       for (k = 0; k < BLKIO_STAT_TOTAL; k++)
+                               stats_cpu->stat_arr_cpu[j][k] = 0;
+       }
+}
+
 static int
 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 {
@@ -496,7 +617,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
                }
 #endif
                spin_unlock(&blkg->stats_lock);
+
+               /* Reset Per cpu stats which don't take blkg->stats_lock */
+               blkio_reset_stats_cpu(blkg);
        }
+
        spin_unlock_irq(&blkcg->lock);
        return 0;
 }
@@ -542,6 +667,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
        return val;
 }
 
+/* Sum one per cpu stat of @blkg over all possible cpus. */
+static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
+                       enum stat_type_cpu type, enum stat_sub_type sub_type)
+{
+       int cpu;
+       struct blkio_group_stats_cpu *stats_cpu;
+       u64 val = 0, tval;
+
+       for_each_possible_cpu(cpu) {
+               unsigned int start;
+               stats_cpu  = per_cpu_ptr(blkg->stats_cpu, cpu);
+               /* Re-read while u64_stats_fetch_retry() asks for a retry */
+               do {
+                       start = u64_stats_fetch_begin(&stats_cpu->syncp);
+                       if (type == BLKIO_STAT_CPU_SECTORS)
+                               tval = stats_cpu->sectors;
+                       else
+                               tval = stats_cpu->stat_arr_cpu[type][sub_type];
+               } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
+
+               val += tval;
+       }
+
+       return val;
+}
+
+static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
+               struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
+{
+       uint64_t disk_total, val;
+       char key_str[MAX_KEY_LEN];
+       enum stat_sub_type sub_type;
+
+       if (type == BLKIO_STAT_CPU_SECTORS) {
+               val = blkio_read_stat_cpu(blkg, type, 0);
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
+       }
+       /* Other stats: one key per sub type, then the read + write total */
+       for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+                       sub_type++) {
+               blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+               val = blkio_read_stat_cpu(blkg, type, sub_type);
+               cb->fill(cb, key_str, val);
+       }
+
+       disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
+                       blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
+
+       blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+       cb->fill(cb, key_str, disk_total);
+       return disk_total;
+}
+
 /* This should be called with blkg->stats_lock held */
 static uint64_t blkio_get_stat(struct blkio_group *blkg,
                struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
@@ -553,10 +731,10 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
        if (type == BLKIO_STAT_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.time, cb, dev);
-       if (type == BLKIO_STAT_SECTORS)
-               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-                                       blkg->stats.sectors, cb, dev);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
+       if (type == BLKIO_STAT_UNACCOUNTED_TIME)
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+                                       blkg->stats.unaccounted_time, cb, dev);
        if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
                uint64_t sum = blkg->stats.avg_queue_size_sum;
                uint64_t samples = blkg->stats.avg_queue_size_samples;
@@ -592,26 +770,16 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
        return disk_total;
 }
 
-static int blkio_check_dev_num(dev_t dev)
-{
-       int part = 0;
-       struct gendisk *disk;
-
-       disk = get_gendisk(dev, &part);
-       if (!disk || part)
-               return -ENODEV;
-
-       return 0;
-}
-
 static int blkio_policy_parse_and_set(char *buf,
        struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
 {
+       struct gendisk *disk = NULL;
        char *s[4], *p, *major_s = NULL, *minor_s = NULL;
-       int ret;
-       unsigned long major, minor, temp;
-       int i = 0;
+       unsigned long major, minor;
+       int i = 0, ret = -EINVAL;
+       int part;
        dev_t dev;
+       u64 temp;
 
        memset(s, 0, sizeof(s));
 
@@ -627,77 +795,185 @@ static int blkio_policy_parse_and_set(char *buf,
        }
 
        if (i != 2)
-               return -EINVAL;
+               goto out;
 
        p = strsep(&s[0], ":");
        if (p != NULL)
                major_s = p;
        else
-               return -EINVAL;
+               goto out;
 
        minor_s = s[0];
        if (!minor_s)
-               return -EINVAL;
+               goto out;
 
-       ret = strict_strtoul(major_s, 10, &major);
-       if (ret)
-               return -EINVAL;
+       if (strict_strtoul(major_s, 10, &major))
+               goto out;
 
-       ret = strict_strtoul(minor_s, 10, &minor);
-       if (ret)
-               return -EINVAL;
+       if (strict_strtoul(minor_s, 10, &minor))
+               goto out;
 
        dev = MKDEV(major, minor);
 
-       ret = blkio_check_dev_num(dev);
-       if (ret)
-               return ret;
+       if (strict_strtoull(s[1], 10, &temp))
+               goto out;
 
-       newpn->dev = dev;
+       /* For rule removal, do not check for device presence. */
+       if (temp) {
+               disk = get_gendisk(dev, &part);
+               if (!disk || part) {
+                       ret = -ENODEV;
+                       goto out;
+               }
+       }
 
-       if (s[1] == NULL)
-               return -EINVAL;
+       newpn->dev = dev;
 
        switch (plid) {
        case BLKIO_POLICY_PROP:
-               ret = strict_strtoul(s[1], 10, &temp);
-               if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
-                       temp > BLKIO_WEIGHT_MAX)
-                       return -EINVAL;
+               if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+                    temp > BLKIO_WEIGHT_MAX)
+                       goto out;
 
                newpn->plid = plid;
                newpn->fileid = fileid;
-               newpn->weight = temp;
+               newpn->val.weight = temp;
+               break;
+       case BLKIO_POLICY_THROTL:
+               switch(fileid) {
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+                       newpn->plid = plid;
+                       newpn->fileid = fileid;
+                       newpn->val.bps = temp;
+                       break;
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       if (temp > THROTL_IOPS_MAX)
+                               goto out;
+
+                       newpn->plid = plid;
+                       newpn->fileid = fileid;
+                       newpn->val.iops = (unsigned int)temp;
+                       break;
+               }
                break;
        default:
                BUG();
        }
-
-       return 0;
+       ret = 0;
+out:
+       put_disk(disk);
+       return ret;
 }
 
 unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
                               dev_t dev)
 {
        struct blkio_policy_node *pn;
+       unsigned long flags;
+       unsigned int weight;    /* per-device rule if any, else blkcg->weight */
+
+       spin_lock_irqsave(&blkcg->lock, flags);
 
        pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
                                BLKIO_PROP_weight_device);
        if (pn)
-               return pn->weight;
+               weight = pn->val.weight;
        else
-               return blkcg->weight;
+               weight = blkcg->weight;
+
+       spin_unlock_irqrestore(&blkcg->lock, flags);
+
+       return weight;
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
+uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
+{
+       struct blkio_policy_node *pn;
+       unsigned long flags;
+       uint64_t bps = -1;      /* stays -1 if @dev has no read bps rule */
+
+       spin_lock_irqsave(&blkcg->lock, flags);
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_read_bps_device);
+       if (pn)
+               bps = pn->val.bps;
+       spin_unlock_irqrestore(&blkcg->lock, flags);
+
+       return bps;
+}
+
+uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
+{
+       struct blkio_policy_node *pn;
+       unsigned long flags;
+       uint64_t bps = -1;      /* stays -1 if @dev has no write bps rule */
+
+       spin_lock_irqsave(&blkcg->lock, flags);
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_write_bps_device);
+       if (pn)
+               bps = pn->val.bps;
+       spin_unlock_irqrestore(&blkcg->lock, flags);
+
+       return bps;
+}
+
+unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
+{
+       struct blkio_policy_node *pn;
+       unsigned long flags;
+       unsigned int iops = -1; /* stays -1 if @dev has no read iops rule */
+
+       spin_lock_irqsave(&blkcg->lock, flags);
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_read_iops_device);
+       if (pn)
+               iops = pn->val.iops;
+       spin_unlock_irqrestore(&blkcg->lock, flags);
+
+       return iops;
+}
+
+unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
+{
+       struct blkio_policy_node *pn;
+       unsigned long flags;
+       unsigned int iops = -1; /* stays -1 if @dev has no write iops rule */
+
+       spin_lock_irqsave(&blkcg->lock, flags);
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_write_iops_device);
+       if (pn)
+               iops = pn->val.iops;
+       spin_unlock_irqrestore(&blkcg->lock, flags);
+
+       return iops;
+}
+
 /* Checks whether user asked for deleting a policy rule */
 static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
 {
        switch(pn->plid) {
        case BLKIO_POLICY_PROP:
-               if (pn->weight == 0)
+               if (pn->val.weight == 0)
                        return 1;
                break;
+       case BLKIO_POLICY_THROTL:
+               switch(pn->fileid) {
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+                       if (pn->val.bps == 0)
+                               return 1;
+                       break;
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       if (pn->val.iops == 0)
+                               return 1;
+               }
+               break;
        default:
                BUG();
        }
@@ -710,7 +986,18 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
 {
        switch(oldpn->plid) {
        case BLKIO_POLICY_PROP:
-               oldpn->weight = newpn->weight;
+               oldpn->val.weight = newpn->val.weight;
+               break;
+       case BLKIO_POLICY_THROTL:
+               switch(newpn->fileid) {
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+                       oldpn->val.bps = newpn->val.bps;
+                       break;
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       oldpn->val.iops = newpn->val.iops;
+               }
                break;
        default:
                BUG();
@@ -718,27 +1005,42 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
 }
 
 /*
- * Some rules/values in blkg have changed. Propogate those to respective
+ * Some rules/values in blkg have changed. Propagate those to respective
  * policies.
  */
 static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
                struct blkio_group *blkg, struct blkio_policy_node *pn)
 {
-       unsigned int weight;
+       unsigned int weight, iops;
+       u64 bps;
 
        switch(pn->plid) {
        case BLKIO_POLICY_PROP:
-               weight = pn->weight ? pn->weight :
+               weight = pn->val.weight ? pn->val.weight :
                                blkcg->weight;
                blkio_update_group_weight(blkg, weight);
                break;
+       case BLKIO_POLICY_THROTL:
+               switch(pn->fileid) {
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+                       bps = pn->val.bps ? pn->val.bps : (-1);
+                       blkio_update_group_bps(blkg, bps, pn->fileid);
+                       break;
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       iops = pn->val.iops ? pn->val.iops : (-1);
+                       blkio_update_group_iops(blkg, iops, pn->fileid);
+                       break;
+               }
+               break;
        default:
                BUG();
        }
 }
 
 /*
- * A policy node rule has been updated. Propogate this update to all the
+ * A policy node rule has been updated. Propagate this update to all the
  * block groups which might be affected by this update.
  */
 static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
@@ -801,6 +1103,7 @@ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
 
        if (blkio_delete_rule_command(newpn)) {
                blkio_policy_delete_node(pn);
+               kfree(pn);
                spin_unlock_irq(&blkcg->lock);
                goto update_io_group;
        }
@@ -826,7 +1129,21 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
                case BLKIO_POLICY_PROP:
                        if (pn->fileid == BLKIO_PROP_weight_device)
                                seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
-                                       MINOR(pn->dev), pn->weight);
+                                       MINOR(pn->dev), pn->val.weight);
+                       break;
+               case BLKIO_POLICY_THROTL:
+                       switch(pn->fileid) {
+                       case BLKIO_THROTL_read_bps_device:
+                       case BLKIO_THROTL_write_bps_device:
+                               seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
+                                       MINOR(pn->dev), pn->val.bps);
+                               break;
+                       case BLKIO_THROTL_read_iops_device:
+                       case BLKIO_THROTL_write_iops_device:
+                               seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+                                       MINOR(pn->dev), pn->val.iops);
+                               break;
+                       }
                        break;
                default:
                        BUG();
@@ -869,6 +1186,18 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
                        BUG();
                }
                break;
+       case BLKIO_POLICY_THROTL:
+               switch(name){
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       blkio_read_policy_node_files(cft, blkcg, m);
+                       return 0;
+               default:
+                       BUG();
+               }
+               break;
        default:
                BUG();
        }
@@ -877,8 +1206,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
 }
 
 static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
-               struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
-               bool show_total)
+               struct cftype *cft, struct cgroup_map_cb *cb,
+               enum stat_type type, bool show_total, bool pcpu)
 {
        struct blkio_group *blkg;
        struct hlist_node *n;
@@ -889,10 +1218,15 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
                if (blkg->dev) {
                        if (!cftype_blkg_same_policy(cft, blkg))
                                continue;
-                       spin_lock_irq(&blkg->stats_lock);
-                       cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
-                                               type);
-                       spin_unlock_irq(&blkg->stats_lock);
+                       if (pcpu)
+                               cgroup_total += blkio_get_stat_cpu(blkg, cb,
+                                               blkg->dev, type);
+                       else {
+                               spin_lock_irq(&blkg->stats_lock);
+                               cgroup_total += blkio_get_stat(blkg, cb,
+                                               blkg->dev, type);
+                               spin_unlock_irq(&blkg->stats_lock);
+                       }
                }
        }
        if (show_total)
@@ -916,50 +1250,64 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
                switch(name) {
                case BLKIO_PROP_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_TIME, 0);
+                                               BLKIO_STAT_TIME, 0, 0);
                case BLKIO_PROP_sectors:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SECTORS, 0);
+                                               BLKIO_STAT_CPU_SECTORS, 0, 1);
                case BLKIO_PROP_io_service_bytes:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SERVICE_BYTES, 1);
+                                       BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
                case BLKIO_PROP_io_serviced:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SERVICED, 1);
+                                               BLKIO_STAT_CPU_SERVICED, 1, 1);
                case BLKIO_PROP_io_service_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SERVICE_TIME, 1);
+                                               BLKIO_STAT_SERVICE_TIME, 1, 0);
                case BLKIO_PROP_io_wait_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_WAIT_TIME, 1);
+                                               BLKIO_STAT_WAIT_TIME, 1, 0);
                case BLKIO_PROP_io_merged:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_MERGED, 1);
+                                               BLKIO_STAT_CPU_MERGED, 1, 1);
                case BLKIO_PROP_io_queued:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_QUEUED, 1);
+                                               BLKIO_STAT_QUEUED, 1, 0);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
+               case BLKIO_PROP_unaccounted_time:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                       BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
                case BLKIO_PROP_dequeue:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_DEQUEUE, 0);
+                                               BLKIO_STAT_DEQUEUE, 0, 0);
                case BLKIO_PROP_avg_queue_size:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+                                       BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
                case BLKIO_PROP_group_wait_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_GROUP_WAIT_TIME, 0);
+                                       BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
                case BLKIO_PROP_idle_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_IDLE_TIME, 0);
+                                               BLKIO_STAT_IDLE_TIME, 0, 0);
                case BLKIO_PROP_empty_time:
                        return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_EMPTY_TIME, 0);
+                                               BLKIO_STAT_EMPTY_TIME, 0, 0);
 #endif
                default:
                        BUG();
                }
                break;
-
+       case BLKIO_POLICY_THROTL:
+               switch(name){
+               case BLKIO_THROTL_io_service_bytes:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
+               case BLKIO_THROTL_io_serviced:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_CPU_SERVICED, 1, 1);
+               default:
+                       BUG();
+               }
+               break;
        default:
                BUG();
        }
@@ -1104,6 +1452,56 @@ struct cftype blkio_files[] = {
                .name = "reset_stats",
                .write_u64 = blkiocg_reset_stats,
        },
+#ifdef CONFIG_BLK_DEV_THROTTLING
+       {
+               .name = "throttle.read_bps_device",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_read_bps_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
+               .max_write_len = 256,
+       },
+
+       {
+               .name = "throttle.write_bps_device",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_write_bps_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
+               .max_write_len = 256,
+       },
+
+       {
+               .name = "throttle.read_iops_device",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_read_iops_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
+               .max_write_len = 256,
+       },
+
+       {
+               .name = "throttle.write_iops_device",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_write_iops_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
+               .max_write_len = 256,
+       },
+       {
+               .name = "throttle.io_service_bytes",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_io_service_bytes),
+               .read_map = blkiocg_file_read_map,
+       },
+       {
+               .name = "throttle.io_serviced",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_io_serviced),
+               .read_map = blkiocg_file_read_map,
+       },
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
 #ifdef CONFIG_DEBUG_BLK_CGROUP
        {
                .name = "avg_queue_size",
@@ -1135,6 +1533,12 @@ struct cftype blkio_files[] = {
                                BLKIO_PROP_dequeue),
                .read_map = blkiocg_file_read_map,
        },
+       {
+               .name = "unaccounted_time",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_unaccounted_time),
+               .read_map = blkiocg_file_read_map,
+       },
 #endif
 };
 
@@ -1172,13 +1576,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
                /*
                 * This blkio_group is being unlinked as associated cgroup is
                 * going away. Let all the IO controlling policies know about
-                * this event. Currently this is static call to one io
-                * controlling policy. Once we have more policies in place, we
-                * need some dynamic registration of callback function.
+                * this event.
                 */
                spin_lock(&blkio_list_lock);
-               list_for_each_entry(blkiop, &blkio_list, list)
+               list_for_each_entry(blkiop, &blkio_list, list) {
+                       if (blkiop->plid != blkg->plid)
+                               continue;
                        blkiop->ops.blkio_unlink_group_fn(key, blkg);
+               }
                spin_unlock(&blkio_list_lock);
        } while (1);
 
@@ -1204,10 +1609,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
                goto done;
        }
 
-       /* Currently we do not support hierarchy deeper than two level (0,1) */
-       if (parent != cgroup->top_cgroup)
-               return ERR_PTR(-EINVAL);
-
        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);
@@ -1227,34 +1628,40 @@ done:
  * of the main cic data structures.  For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkiocg_can_attach(struct cgroup_subsys *subsys,
-                               struct cgroup *cgroup, struct task_struct *tsk,
-                               bool threadgroup)
+static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+                             struct cgroup_taskset *tset)
 {
+       struct task_struct *task;
        struct io_context *ioc;
        int ret = 0;
 
        /* task_lock() is needed to avoid races with exit_io_context() */
-       task_lock(tsk);
-       ioc = tsk->io_context;
-       if (ioc && atomic_read(&ioc->nr_tasks) > 1)
-               ret = -EINVAL;
-       task_unlock(tsk);
-
+       cgroup_taskset_for_each(task, cgrp, tset) { /* one check per task */
+               task_lock(task);
+               ioc = task->io_context;
+               if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+                       ret = -EINVAL; /* ioc has more than one task */
+               task_unlock(task);
+               if (ret)
+                       break; /* stop at the first failing task */
+       }
        return ret;
 }
 
-static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
-                               struct cgroup *prev, struct task_struct *tsk,
-                               bool threadgroup)
+static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+                          struct cgroup_taskset *tset)
 {
+       struct task_struct *task;
        struct io_context *ioc;
 
-       task_lock(tsk);
-       ioc = tsk->io_context;
-       if (ioc)
-               ioc->cgroup_changed = 1;
-       task_unlock(tsk);
+       cgroup_taskset_for_each(task, cgrp, tset) {
+               /* ioc allocation may fail; skipping the task loses nothing */
+               ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+               if (ioc) {
+                       ioc_cgroup_changed(ioc);
+                       put_io_context(ioc, NULL); /* presumably drops the ref taken above - confirm */
+               }
+       }
 }
 
 void blkio_policy_register(struct blkio_policy_type *blkiop)