unsigned count;
unsigned total_weight;
u64 min_vdisktime;
+ struct cfq_ttime ttime;
};
-#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
- .count = 0, .min_vdisktime = 0, }
+#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \
+ .ttime = {.last_end_request = jiffies,},}
/*
* Per process-grouping structure
unsigned long slice_end;
long slice_resid;
- /* pending metadata requests */
- int meta_pending;
+ /* pending priority requests */
+ int prio_pending;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
/* io prio of this group */
unsigned short ioprio, org_ioprio;
- unsigned short ioprio_class, org_ioprio_class;
+ unsigned short ioprio_class;
pid_t pid;
/* group service_tree key */
u64 vdisktime;
unsigned int weight;
+ unsigned int new_weight;
+ bool needs_update;
/* number of cfqq currently on this group */
int nr_cfqq;
/*
- * Per group busy queus average. Useful for workload slice calc. We
+ * Per group busy queues average. Useful for workload slice calc. We
* create the array for each prio class but at run time it is used
* only for RT and BE class and slot for IDLE class remains unused.
* This is primarily done to avoid confusion and a gcc warning.
#endif
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
+ struct cfq_ttime ttime;
};
/*
/* List of cfq groups being managed on this device*/
struct hlist_head cfqg_list;
- struct rcu_head rcu;
+
+ /* Number of groups which are on blkcg->blkg_list */
+ unsigned int nr_blkcg_linked_grps;
};
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
- blkg_path(&(cfqq)->cfqg->blkg), ##args);
+ blkg_path(&(cfqq)->cfqg->blkg), ##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
- blkg_path(&(cfqg)->blkg), ##args); \
+ blkg_path(&(cfqg)->blkg), ##args) \
#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
#endif
#define cfq_log(cfqd, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
j++, st = i < IDLE_WORKLOAD ? \
&cfqg->service_trees[i][j]: NULL) \
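+/*
+ * Returns true if the mean think time of this context exceeds the slice we
+ * would spend idling for it, i.e. idling is unlikely to pay off.
+ */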
+static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
+ struct cfq_ttime *ttime, bool group_idle)
+{
+ unsigned long slice;
+ if (!sample_valid(ttime->ttime_samples))
+ return false;
+ if (group_idle)
+ slice = cfqd->cfq_group_idle;
+ else
+ slice = cfqd->cfq_slice_idle;
+ return ttime->ttime_mean > slice;
+}
static inline bool iops_mode(struct cfq_data *cfqd)
{
}
}
-static int cfq_queue_empty(struct request_queue *q)
-{
- struct cfq_data *cfqd = q->elevator->elevator_data;
-
- return !cfqd->rq_queued;
-}
-
/*
* Scale schedule slice based on io priority. Use the sync time slice only
* if a queue is marked sync and has sync io queued. A sync queue with async
if (rq2 == NULL)
return rq1;
- if (rq_is_sync(rq1) && !rq_is_sync(rq2))
- return rq1;
- else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
- return rq2;
- if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
- return rq1;
- else if ((rq2->cmd_flags & REQ_META) &&
- !(rq1->cmd_flags & REQ_META))
- return rq2;
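+ /* prefer sync over async, then REQ_PRIO requests over normal ones */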
+ if (rq_is_sync(rq1) != rq_is_sync(rq2))
+ return rq_is_sync(rq1) ? rq1 : rq2;
+
+ if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
+ return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
s1 = blk_rq_pos(rq1);
s2 = blk_rq_pos(rq2);
}
static void
-cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
+cfq_update_group_weight(struct cfq_group *cfqg)
+{
+ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
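+ /* apply a weight change that was deferred while the group was on the tree */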
+ if (cfqg->needs_update) {
+ cfqg->weight = cfqg->new_weight;
+ cfqg->needs_update = false;
+ }
+}
+
+static void
+cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+{
+ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
+
+ cfq_update_group_weight(cfqg);
+ __cfq_group_service_tree_add(st, cfqg);
+ st->total_weight += cfqg->weight;
+}
+
+static void
+cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_group *__cfqg;
/*
* Currently put the group at the end. Later implement something
* so that groups get lesser vtime based on their weights, so that
- * if group does not loose all if it was not continously backlogged.
+ * a group does not lose everything if it was not continuously backlogged.
*/
n = rb_last(&st->rb);
if (n) {
cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
} else
cfqg->vdisktime = st->min_vdisktime;
+ cfq_group_service_tree_add(st, cfqg);
+}
- __cfq_group_service_tree_add(st, cfqg);
- st->total_weight += cfqg->weight;
+static void
+cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
+{
+ st->total_weight -= cfqg->weight;
+ if (!RB_EMPTY_NODE(&cfqg->rb_node))
+ cfq_rb_erase(&cfqg->rb_node, st);
}
static void
-cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
+cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
return;
cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- st->total_weight -= cfqg->weight;
- if (!RB_EMPTY_NODE(&cfqg->rb_node))
- cfq_rb_erase(&cfqg->rb_node, st);
+ cfq_group_service_tree_del(st, cfqg);
cfqg->saved_workload_slice = 0;
cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}
-static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
+static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
+ unsigned int *unaccounted_time)
{
unsigned int slice_used;
1);
} else {
slice_used = jiffies - cfqq->slice_start;
- if (slice_used > cfqq->allocated_slice)
+ if (slice_used > cfqq->allocated_slice) {
+ *unaccounted_time = slice_used - cfqq->allocated_slice;
slice_used = cfqq->allocated_slice;
+ }
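+ /* time between dispatch_start and the actual slice start is unaccounted too */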
+ if (time_after(cfqq->slice_start, cfqq->dispatch_start))
+ *unaccounted_time += cfqq->slice_start -
+ cfqq->dispatch_start;
}
return slice_used;
struct cfq_queue *cfqq)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
- unsigned int used_sl, charge;
+ unsigned int used_sl, charge, unaccounted_sl = 0;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
BUG_ON(nr_sync < 0);
- used_sl = charge = cfq_cfqq_slice_usage(cfqq);
+ used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
if (iops_mode(cfqd))
charge = cfqq->slice_dispatch;
charge = cfqq->allocated_slice;
/* Can't update vdisktime while group is on service tree */
- cfq_rb_erase(&cfqg->rb_node, st);
+ cfq_group_service_tree_del(st, cfqg);
cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
- __cfq_group_service_tree_add(st, cfqg);
+ /* If a new weight was requested, update now, off tree */
+ cfq_group_service_tree_add(st, cfqg);
/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
st->min_vdisktime);
- cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
- " sect=%u", used_sl, cfqq->slice_dispatch, charge,
- iops_mode(cfqd), cfqq->nr_sectors);
- cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+ cfq_log_cfqq(cfqq->cfqd, cfqq,
+ "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
+ used_sl, cfqq->slice_dispatch, charge,
+ iops_mode(cfqd), cfqq->nr_sectors);
+ cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
+ unaccounted_sl);
cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
}
return NULL;
}
-void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
- unsigned int weight)
+static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
+ unsigned int weight)
{
- cfqg_of_blkg(blkg)->weight = weight;
+ struct cfq_group *cfqg = cfqg_of_blkg(blkg);
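+ /* defer the weight change; it is applied once the group is off its service tree */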
+ cfqg->new_weight = weight;
+ cfqg->needs_update = true;
}
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
+ struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
{
- struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- struct cfq_group *cfqg = NULL;
- void *key = cfqd;
- int i, j;
- struct cfq_rb_root *st;
struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ /*
+ * Add group onto cgroup list. It might happen that bdi->dev is
+ * not initialized yet. Initialize this new group without major
+ * and minor info; this info will be filled in once a new thread
+ * comes for IO.
+ */
+ if (bdi->dev) {
sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfqg->blkg.dev = MKDEV(major, minor);
- goto done;
- }
- if (cfqg || !create)
- goto done;
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+ (void *)cfqd, MKDEV(major, minor));
+ } else
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+ (void *)cfqd, 0);
+
+ cfqd->nr_blkcg_linked_grps++;
+ cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+
+ /* Add group on cfqd list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+/*
+ * Must be called from a sleepable context, without the request queue lock
+ * held: per-cpu stats are allocated dynamically and alloc_percpu() needs
+ * to be called from a sleepable context.
+ */
+static struct cfq_group *cfq_alloc_cfqg(struct cfq_data *cfqd)
+{
+ struct cfq_group *cfqg = NULL;
+ int i, j, ret;
+ struct cfq_rb_root *st;
cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
if (!cfqg)
- goto done;
+ return NULL;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);
+ cfqg->ttime.last_end_request = jiffies;
+
/*
* Take the initial reference that will be released on destroy.
* This can be thought of as a joint reference by cgroup and
*/
cfqg->ref = 1;
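+ /* allocate the per-cpu blkg stats; this may sleep, see the comment above */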
+ ret = blkio_alloc_blkg_stats(&cfqg->blkg);
+ if (ret) {
+ kfree(cfqg);
+ return NULL;
+ }
+
+ return cfqg;
+}
+
+static struct cfq_group *
+cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
+{
+ struct cfq_group *cfqg = NULL;
+ void *key = cfqd;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+
/*
- * Add group onto cgroup list. It might happen that bdi->dev is
- * not initialized yet. Initialize this new group without major
- * and minor info and this info will be filled in once a new thread
- * comes for IO. See code above.
+ * This is the common case when there are no blkio cgroups.
+ * Avoid the lookup in this case.
*/
- if (bdi->dev) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- MKDEV(major, minor));
- } else
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- 0);
-
- cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ if (blkcg == &blkio_root_cgroup)
+ cfqg = &cfqd->root_group;
+ else
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- /* Add group on cfqd list */
- hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ }
-done:
return cfqg;
}
/*
- * Search for the cfq group current task belongs to. If create = 1, then also
- * create the cfq group if it does not exist. request_queue lock must be held.
+ * Search for the cfq group current task belongs to. request_queue lock must
+ * be held.
*/
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
{
- struct cgroup *cgroup;
- struct cfq_group *cfqg = NULL;
+ struct blkio_cgroup *blkcg;
+ struct cfq_group *cfqg = NULL, *__cfqg = NULL;
+ struct request_queue *q = cfqd->queue;
rcu_read_lock();
- cgroup = task_cgroup(current, blkio_subsys_id);
- cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
- if (!cfqg && create)
+ blkcg = task_blkio_cgroup(current);
+ cfqg = cfq_find_cfqg(cfqd, blkcg);
+ if (cfqg) {
+ rcu_read_unlock();
+ return cfqg;
+ }
+
+ /*
+ * Need to allocate a group. Allocating the group also requires allocating
+ * per-cpu stats, which in turn takes a mutex and can block. Hence we
+ * need to drop the rcu lock and queue_lock before we call alloc.
+ *
+ * Not taking any queue reference here and assuming that queue is
+ * around by the time we return. CFQ queue allocation code does
+ * the same. It might be racy though.
+ */
+
+ rcu_read_unlock();
+ spin_unlock_irq(q->queue_lock);
+
+ cfqg = cfq_alloc_cfqg(cfqd);
+
+ spin_lock_irq(q->queue_lock);
+
+ rcu_read_lock();
+ blkcg = task_blkio_cgroup(current);
+
+ /*
+ * If some other thread already allocated the group while we were
+ * not holding the queue lock, free up the group we allocated.
+ */
+ __cfqg = cfq_find_cfqg(cfqd, blkcg);
+
+ if (__cfqg) {
+ kfree(cfqg);
+ rcu_read_unlock();
+ return __cfqg;
+ }
+
+ if (!cfqg)
cfqg = &cfqd->root_group;
+
+ cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
rcu_read_unlock();
return cfqg;
}
return;
for_each_cfqg_st(cfqg, i, j, st)
BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+ free_percpu(cfqg->blkg.stats_cpu);
kfree(cfqg);
}
hlist_del_init(&cfqg->cfqd_node);
+ BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
+ cfqd->nr_blkcg_linked_grps--;
+
/*
* Put the reference taken at the time of creation so that when all
* queues are gone, group can be destroyed.
* it should not be NULL as even if the elevator was exiting, the cgroup deletion
* path got to it first.
*/
-void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
+static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
unsigned long flags;
struct cfq_data *cfqd = key;
}
#else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
{
return &cfqd->root_group;
}
struct cfq_rb_root *service_tree;
int left;
int new_cfqq = 1;
- int group_changed = 0;
service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
cfqq_type(cfqq));
rb_link_node(&cfqq->rb_node, parent, p);
rb_insert_color(&cfqq->rb_node, &service_tree->rb);
service_tree->count++;
- if ((add_front || !new_cfqq) && !group_changed)
+ if (add_front || !new_cfqq)
return;
- cfq_group_service_tree_add(cfqd, cfqq->cfqg);
+ cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
}
static struct cfq_queue *
cfqq->p_root = NULL;
}
- cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
if (cfq_cfqq_sync(cfqq))
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
struct cfq_data *cfqd = cfqq->cfqd;
- struct request *__alias, *prev;
+ struct request *prev;
cfqq->queued[rq_is_sync(rq)]++;
- /*
- * looks a little odd, but the first insert might return an alias.
- * if that happens, put the alias on the dispatch list
- */
- while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
- cfq_dispatch_insert(cfqd->queue, __alias);
+ elv_rb_add(&cfqq->sort_list, rq);
if (!cfq_cfqq_on_rr(cfqq))
cfq_add_cfqq_rr(cfqd, cfqq);
cfqq->cfqd->rq_queued--;
cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
rq_data_dir(rq), rq_is_sync(rq));
- if (rq->cmd_flags & REQ_META) {
- WARN_ON(!cfqq->meta_pending);
- cfqq->meta_pending--;
+ if (rq->cmd_flags & REQ_PRIO) {
+ WARN_ON(!cfqq->prio_pending);
+ cfqq->prio_pending--;
}
}
* Otherwise, we do only if they are the last ones
* in their service tree.
*/
- if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
+ if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
+ !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
return true;
cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
service_tree->count);
* slice, then don't idle. This avoids overrunning the allotted
* time slice.
*/
- if (sample_valid(cic->ttime_samples) &&
- (cfqq->slice_end - jiffies < cic->ttime_mean)) {
- cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
- cic->ttime_mean);
+ if (sample_valid(cic->ttime.ttime_samples) &&
+ (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
+ cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
+ cic->ttime.ttime_mean);
return;
}
WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
- return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
+ return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
}
/*
* this group, wait for requests to complete.
*/
check_group_idle:
- if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
- && cfqq->cfqg->dispatched) {
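+ /* skip group idle if the group's think time already exceeds group_idle */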
+ if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
+ cfqq->cfqg->dispatched &&
+ !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
cfqq = NULL;
goto keep_queue;
}
return false;
/*
- * If there is only one sync queue, and its think time is
- * small, we can ignore async queue here and give the sync
+ * If there is only one sync queue,
+ * we can ignore the async queue here and give the sync
* queue no dispatch limit. The reason is a sync queue can
* preempt async queue, limiting the sync queue doesn't make
* sense. This is useful for aiostress test.
*/
- if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) {
- struct cfq_io_context *cic = RQ_CIC(cfqq->next_rq);
-
- if (sample_valid(cic->ttime_samples) &&
- cic->ttime_mean < cfqd->cfq_slice_idle)
- promote_sync = true;
- }
+ if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
+ promote_sync = true;
/*
* We have other queues, don't allow more IO from this one
}
/*
- * Must always be called with the rcu_read_lock() held
+ * Call func for each cic attached to this ioc.
*/
static void
-__call_for_each_cic(struct io_context *ioc,
- void (*func)(struct io_context *, struct cfq_io_context *))
+call_for_each_cic(struct io_context *ioc,
+ void (*func)(struct io_context *, struct cfq_io_context *))
{
struct cfq_io_context *cic;
struct hlist_node *n;
+ rcu_read_lock();
+
hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
func(ioc, cic);
-}
-/*
- * Call func for each cic attached to this ioc.
- */
-static void
-call_for_each_cic(struct io_context *ioc,
- void (*func)(struct io_context *, struct cfq_io_context *))
-{
- rcu_read_lock();
- __call_for_each_cic(ioc, func);
rcu_read_unlock();
}
* should be ok to iterate over the known list, we will see all cic's
* since no new ones are added.
*/
- __call_for_each_cic(ioc, cic_free_func);
+ call_for_each_cic(ioc, cic_free_func);
}
static void cfq_put_cooperator(struct cfq_queue *cfqq)
smp_wmb();
cic->key = cfqd_dead_key(cfqd);
- if (ioc->ioc_data == cic)
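+ /* if ioc->ioc_data caches this cic, clear it under ioc->lock */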
+ rcu_read_lock();
+ if (rcu_dereference(ioc->ioc_data) == cic) {
+ rcu_read_unlock();
+ spin_lock(&ioc->lock);
rcu_assign_pointer(ioc->ioc_data, NULL);
+ spin_unlock(&ioc->lock);
+ } else
+ rcu_read_unlock();
if (cic->cfqq[BLK_RW_ASYNC]) {
cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
cfqd->queue->node);
if (cic) {
- cic->last_end_request = jiffies;
+ cic->ttime.last_end_request = jiffies;
INIT_LIST_HEAD(&cic->queue_list);
INIT_HLIST_NODE(&cic->cic_list);
cic->dtor = cfq_free_io_context;
* elevate the priority of this queue
*/
cfqq->org_ioprio = cfqq->ioprio;
- cfqq->org_ioprio_class = cfqq->ioprio_class;
cfq_clear_cfqq_prio_changed(cfqq);
}
struct cfq_group *cfqg;
retry:
- cfqg = cfq_get_cfqg(cfqd, 1);
+ cfqg = cfq_get_cfqg(cfqd);
cic = cfq_cic_lookup(cfqd, ioc);
/* cic always exists here */
cfqq = cic_to_cfqq(cic, is_sync);
spin_lock_irqsave(&ioc->lock, flags);
- BUG_ON(ioc->ioc_data == cic);
+ BUG_ON(rcu_dereference_check(ioc->ioc_data,
+ lockdep_is_held(&ioc->lock)) == cic);
radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
hlist_del_rcu(&cic->cic_list);
}
static void
-cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
+__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
{
- unsigned long elapsed = jiffies - cic->last_end_request;
- unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
+ unsigned long elapsed = jiffies - ttime->last_end_request;
+ elapsed = min(elapsed, 2UL * slice_idle);
- cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
- cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
- cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
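+ /* decaying average: keep 7/8 of the old value, fixed point scaled by 256 */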
+ ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
+ ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
+ ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
+}
+
+static void
+cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ struct cfq_io_context *cic)
+{
+ if (cfq_cfqq_sync(cfqq)) {
+ __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
+ __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
+ cfqd->cfq_slice_idle);
+ }
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
+#endif
}
static void
else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
(!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
enable_idle = 0;
- else if (sample_valid(cic->ttime_samples)) {
- if (cic->ttime_mean > cfqd->cfq_slice_idle)
+ else if (sample_valid(cic->ttime.ttime_samples)) {
+ if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
enable_idle = 0;
else
enable_idle = 1;
* So both queues are sync. Let the new request get disk time if
* it's a prio request and the current queue is doing regular IO.
*/
- if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
+ if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
return true;
/*
struct cfq_io_context *cic = RQ_CIC(rq);
cfqd->rq_queued++;
- if (rq->cmd_flags & REQ_META)
- cfqq->meta_pending++;
+ if (rq->cmd_flags & REQ_PRIO)
+ cfqq->prio_pending++;
- cfq_update_io_thinktime(cfqd, cic);
+ cfq_update_io_thinktime(cfqd, cfqq, cic);
cfq_update_io_seektime(cfqd, cfqq, rq);
cfq_update_idle_window(cfqd, cfqq, cic);
cfqd->busy_queues > 1) {
cfq_del_timer(cfqd, cfqq);
cfq_clear_cfqq_wait_request(cfqq);
- __blk_run_queue(cfqd->queue, false);
+ __blk_run_queue(cfqd->queue);
} else {
cfq_blkiocg_update_idle_time_stats(
&cfqq->cfqg->blkg);
* this new queue is RT and the current one is BE
*/
cfq_preempt_queue(cfqd, cfqq);
- __blk_run_queue(cfqd->queue, false);
+ __blk_run_queue(cfqd->queue);
}
}
if (cfqq->cfqg->nr_cfqq > 1)
return false;
+ /* the only queue in the group, but think time is big */
+ if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
+ return false;
+
if (cfq_slice_used(cfqq))
return true;
/* if slice left is less than think time, wait busy */
- if (cic && sample_valid(cic->ttime_samples)
- && (cfqq->slice_end - jiffies < cic->ttime_mean))
+ if (cic && sample_valid(cic->ttime.ttime_samples)
+ && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
return true;
/*
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
if (sync) {
- RQ_CIC(rq)->last_end_request = now;
+ struct cfq_rb_root *service_tree;
+
+ RQ_CIC(rq)->ttime.last_end_request = now;
+
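+ /*
+ * cfqq->service_tree is only valid while the queue is on the rr list;
+ * otherwise look the service tree up again before updating it.
+ */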
+ if (cfq_cfqq_on_rr(cfqq))
+ service_tree = cfqq->service_tree;
+ else
+ service_tree = service_tree_for(cfqq->cfqg,
+ cfqq_prio(cfqq), cfqq_type(cfqq));
+ service_tree->ttime.last_end_request = now;
if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
cfqd->last_delayed_sync = now;
}
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ cfqq->cfqg->ttime.last_end_request = now;
+#endif
+
/*
* If this is the active queue, check if it needs to be expired,
* or if we want to idle in case it has no pending requests.
cfq_schedule_dispatch(cfqd);
}
-/*
- * we temporarily boost lower priority queues if they are holding fs exclusive
- * resources. they are boosted to normal prio (CLASS_BE/4)
- */
-static void cfq_prio_boost(struct cfq_queue *cfqq)
-{
- if (has_fs_excl()) {
- /*
- * boost idle prio on transactions that would lock out other
- * users of the filesystem
- */
- if (cfq_class_idle(cfqq))
- cfqq->ioprio_class = IOPRIO_CLASS_BE;
- if (cfqq->ioprio > IOPRIO_NORM)
- cfqq->ioprio = IOPRIO_NORM;
- } else {
- /*
- * unboost the queue (if needed)
- */
- cfqq->ioprio_class = cfqq->org_ioprio_class;
- cfqq->ioprio = cfqq->org_ioprio;
- }
-}
-
static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
if (cfqq) {
cfq_init_prio_data(cfqq, cic->ioc);
- cfq_prio_boost(cfqq);
return __cfq_may_queue(cfqq);
}
return 0;
queue_fail:
- if (cic)
- put_io_context(cic->ioc);
-
cfq_schedule_dispatch(cfqd);
spin_unlock_irqrestore(q->queue_lock, flags);
cfq_log(cfqd, "set_request fail");
struct request_queue *q = cfqd->queue;
spin_lock_irq(q->queue_lock);
- __blk_run_queue(cfqd->queue, false);
+ __blk_run_queue(cfqd->queue);
spin_unlock_irq(q->queue_lock);
}
cfq_put_queue(cfqd->async_idle_cfqq);
}
-static void cfq_cfqd_free(struct rcu_head *head)
-{
- kfree(container_of(head, struct cfq_data, rcu));
-}
-
static void cfq_exit_queue(struct elevator_queue *e)
{
struct cfq_data *cfqd = e->elevator_data;
struct request_queue *q = cfqd->queue;
+ bool wait = false;
cfq_shutdown_timer_wq(cfqd);
cfq_put_async_queues(cfqd);
cfq_release_cfq_groups(cfqd);
- cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
+
+ /*
+ * If there are groups which we could not unlink from blkcg list,
+ * wait for a rcu period for them to be freed.
+ */
+ if (cfqd->nr_blkcg_linked_grps)
+ wait = true;
spin_unlock_irq(q->queue_lock);
ida_remove(&cic_index_ida, cfqd->cic_index);
spin_unlock(&cic_index_lock);
- /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
- call_rcu(&cfqd->rcu, cfq_cfqd_free);
+ /*
+ * Wait for cfqg->blkg->key accessors to exit their grace periods.
+ * Do this wait only if there are other unlinked groups out
+ * there. This can happen if the cgroup deletion path claimed the
+ * responsibility of cleaning up a group before the queue cleanup code
+ * got to the group.
+ *
+ * Do not call synchronize_rcu() unconditionally as there are drivers
+ * which create/delete request queues hundreds of times during scan/boot
+ * and synchronize_rcu() can take significant time and slow down boot.
+ */
+ if (wait)
+ synchronize_rcu();
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ /* Free up per cpu stats for root group */
+ free_percpu(cfqd->root_group.blkg.stats_cpu);
+#endif
+ kfree(cfqd);
}
static int cfq_alloc_cic_index(void)
return NULL;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
- if (!cfqd)
+ if (!cfqd) {
+ spin_lock(&cic_index_lock);
+ ida_remove(&cic_index_ida, i);
+ spin_unlock(&cic_index_lock);
return NULL;
+ }
/*
* Don't need to take queue_lock in this routine, since we are
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
- * Take a reference to root group which we never drop. This is just
- * to make sure that cfq_put_cfqg() does not try to kfree root group
+ * Set root group reference to 2. One reference will be dropped when
+ * all groups on cfqd->cfqg_list are deleted during queue exit. The
+ * other reference will remain as we don't want to delete this group:
+ * it is statically allocated and gets destroyed when the cfq_data
+ * goes away.
*/
- cfqg->ref = 1;
+ cfqg->ref = 2;
+
+ if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
+ kfree(cfqg);
+ kfree(cfqd);
+ return NULL;
+ }
+
rcu_read_lock();
+
cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
(void *)cfqd, 0);
rcu_read_unlock();
+ cfqd->nr_blkcg_linked_grps++;
+
+ /* Add group on cfqd->cfqg_list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
#endif
/*
* Not strictly needed (since RB_ROOT just clears the node and we
.elevator_add_req_fn = cfq_insert_request,
.elevator_activate_req_fn = cfq_activate_request,
.elevator_deactivate_req_fn = cfq_deactivate_request,
- .elevator_queue_empty_fn = cfq_queue_empty,
.elevator_completed_req_fn = cfq_completed_request,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,